From 294a9f40927b6cb4ddd77b956f785c4e53ebc083 Mon Sep 17 00:00:00 2001
From: Brad Hilton <brad.hilton.nw@gmail.com>
Date: Thu, 18 Sep 2025 00:23:04 +0000
Subject: [PATCH] fix: Update model configuration and server logic

- Changed the base model in `yes-no-maybe.ipynb` to `Qwen/Qwen3-30B-A3B-Instruct`.
- Disabled loading in 4-bit mode in `get_model_config.py` for better compatibility.
- Adjusted target modules for Qwen3 MoE models to avoid unsupported LoRA weights.
- Enhanced the `openai_server_task` in `server.py` to ensure missing fields in LoRA requests are handled gracefully.
---
 dev/yes-no-maybe.ipynb          | 284 ++++++++++++++++++++++++++++----
 src/art/dev/get_model_config.py |  20 ++-
 src/art/vllm/server.py          |  26 +--
 3 files changed, 281 insertions(+), 49 deletions(-)
diff --git a/dev/yes-no-maybe.ipynb b/dev/yes-no-maybe.ipynb
index f9d7a6d8..4b4df24f 100644
--- a/dev/yes-no-maybe.ipynb
+++ b/dev/yes-no-maybe.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,9 +12,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style>\n",
+       ".cell-output-ipywidget-background {\n",
+       "    background-color: transparent !important;\n",
+       "}\n",
+       ":root {\n",
+       "    --jp-widgets-color: var(--vscode-editor-foreground);\n",
+       "    --jp-widgets-font-size: var(--vscode-editor-font-size);\n",
+       "}  \n",
+       "</style>\n"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "%%html\n",
     "<style>\n",
@@ -30,48 +51,71 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
+     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Skipping tuning as there is no suitable data. This can happen when all the trajectories in the same group have the same reward and thus no advantage to train on.\n",
-      "Advanced step from 264 to 265 (no training occurred)\n"
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mbradhilton\u001b[0m to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
      ]
     },
     {
      "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "96f344d734164144b12b3480cefc39a4",
-       "version_major": 2,
-       "version_minor": 0
-      },
+      "text/html": [
+       "Tracking run with wandb version 0.21.0"
+      ],
       "text/plain": [
-       "gather:   0%|          | 0/1152 [00:00<?, ?it/s]"
+       "<IPython.core.display.HTML object>"
       ]
      },
      "metadata": {},
      "output_type": "display_data"
     },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Skipping tuning as there is no suitable data. This can happen when all the trajectories in the same group have the same reward and thus no advantage to train on.\n",
-      "Advanced step from 265 to 266 (no training occurred)\n"
-     ]
+     "data": {
+      "text/html": [
+       "Run data is saved locally in <code>/home/sky/sky_workdir/dev/wandb/run-20250917_234909-009</code>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
      "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6bd3b26ae80d4d61b20a5335693d5b89",
-       "version_major": 2,
-       "version_minor": 0
-      },
+      "text/html": [
+       "Resuming run <strong><a href='https://wandb.ai/bradhilton/yes-no-maybe/runs/009' target=\"_blank\">009</a></strong> to <a href='https://wandb.ai/bradhilton/yes-no-maybe' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
+      ],
       "text/plain": [
-       "gather:   0%|          | 0/1152 [00:00<?, ?it/s]"
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View project at <a href='https://wandb.ai/bradhilton/yes-no-maybe' target=\"_blank\">https://wandb.ai/bradhilton/yes-no-maybe</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View run at <a href='https://wandb.ai/bradhilton/yes-no-maybe/runs/009' target=\"_blank\">https://wandb.ai/bradhilton/yes-no-maybe/runs/009</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
       ]
      },
      "metadata": {},
@@ -81,14 +125,115 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Skipping tuning as there is no suitable data. This can happen when all the trajectories in the same group have the same reward and thus no advantage to train on.\n",
-      "Advanced step from 266 to 267 (no training occurred)\n"
+      "INFO 09-17 23:49:13 [__init__.py:235] Automatically detected platform cuda.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/sky/sky_workdir/src/art/__init__.py:12: UserWarning: WARNING: Unsloth should be imported before transformers, peft to ensure all optimizations are applied. Your code may run slower or encounter memory issues without these optimizations.\n",
+      "\n",
+      "Please restructure your imports with 'import unsloth' at the top of your file.\n",
+      "  import unsloth  # type: ignore # noqa: F401\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.\n",
+      "INFO 09-17 23:49:19 [__init__.py:235] Automatically detected platform cuda.\n",
+      "🦥 Unsloth Zoo will now patch everything to make training faster!\n",
+      "==((====))==  Unsloth 2025.9.6: Fast Qwen3_Moe patching. Transformers: 4.53.2. vLLM: 0.10.0.\n",
+      "   \\\\   /|    NVIDIA H200. Num GPUs = 1. Max memory: 139.811 GB. Platform: Linux.\n",
+      "O^O/ \\_/ \\    Torch: 2.7.1+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.1\n",
+      "\\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31. FA2 = False]\n",
+      " \"-____-\"     Free license: http://github.com/unslothai/unsloth\n",
+      "Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!\n",
+      "Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.\n",
+      "Unsloth: Patching vLLM v1 graph capture\n",
+      "Unsloth: Patching vLLM v0 graph capture\n",
+      "Unsloth: vLLM loading unsloth/Qwen3-30B-A3B-Instruct-2507 with actual GPU utilization = 78.66%\n",
+      "Unsloth: Your GPU has CUDA compute capability 9.0 with VRAM = 139.81 GB.\n",
+      "Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 32768. Num Sequences = 400.\n",
+      "Unsloth: vLLM's KV Cache can use up to 104.57 GB. Also swap space = 6 GB.\n",
+      "Unsloth: Not an error, but `device` is not supported in vLLM. Skipping.\n",
+      "INFO 09-17 23:49:31 [config.py:1604] Using max model len 32768\n",
+      "INFO 09-17 23:49:31 [config.py:2434] Chunked prefill is enabled with max_num_batched_tokens=2048.\n",
+      "INFO 09-17 23:49:31 [llm_engine.py:228] Initializing a V0 LLM engine (v0.10.0) with config: model='unsloth/Qwen3-30B-A3B-Instruct-2507', speculative_config=None, tokenizer='unsloth/Qwen3-30B-A3B-Instruct-2507', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=unsloth/Qwen3-30B-A3B-Instruct-2507, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={\"level\":0,\"debug_dump_path\":\"\",\"cache_dir\":\"\",\"backend\":\"inductor\",\"custom_ops\":[],\"splitting_ops\":[],\"use_inductor\":true,\"compile_sizes\":[],\"inductor_compile_config\":{\"epilogue_fusion\":true,\"max_autotune\":false,\"shape_padding\":true,\"trace.enabled\":false,\"triton.cudagraphs\":true,\"debug\":false,\"dce\":true,\"memory_planning\":true,\"coordinate_descent_tuning\":false,\"trace.graph_diagram\":false,\"compile_threads\":32,\"group_fusion\":true,\"disable_progress\":false,\"verbose_progress\":true,\"triton.multi_kernel\":0,\"triton.use_block_ptr\":true,\"triton.enable_persistent_tma_matmul\":true,\"triton.autotune_at_compile_time\":false,\"triton.cooperative_reductions\":false,\"cuda.compile_opt_level\":\"-O2\",\"cuda.enable_cuda_lto\":true,\"combo_kernels\":false,\"benchmark_combo_kernel\":true,\"combo_kernel_foreach_dynamic_shapes\":true,\"enable_auto_functionalized_v2\":false},\"inductor_passes\":{},\"use_cudagraph\":true,\"cudagraph_num_of_warmups\":1,\"cudagraph_capture_sizes\":[400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],\"cudagraph_copy_inputs\":false,\"full_cuda_graph\":false,\"max_capture_size\":400,\"local_cache_dir\":null}, use_cached_outputs=False, \n",
+      "INFO 09-17 23:49:32 [cuda.py:398] Using Flash Attention backend.\n",
+      "INFO 09-17 23:49:32 [parallel_state.py:1102] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0\n",
+      "INFO 09-17 23:49:32 [model_runner.py:1083] Starting to load model unsloth/Qwen3-30B-A3B-Instruct-2507...\n",
+      "INFO 09-17 23:49:33 [weight_utils.py:296] Using model weights format ['*.safetensors']\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading safetensors checkpoint shards:   0% Completed | 0/16 [00:00<?, ?it/s]\n",
+      "Loading safetensors checkpoint shards:   6% Completed | 1/16 [00:00<00:10,  1.40it/s]\n",
+      "Loading safetensors checkpoint shards:  12% Completed | 2/16 [00:01<00:10,  1.31it/s]\n",
+      "Loading safetensors checkpoint shards:  19% Completed | 3/16 [00:02<00:10,  1.29it/s]\n",
+      "Loading safetensors checkpoint shards:  25% Completed | 4/16 [00:03<00:09,  1.30it/s]\n",
+      "Loading safetensors checkpoint shards:  31% Completed | 5/16 [00:03<00:08,  1.30it/s]\n",
+      "Loading safetensors checkpoint shards:  38% Completed | 6/16 [00:04<00:07,  1.30it/s]\n",
+      "Loading safetensors checkpoint shards:  44% Completed | 7/16 [00:05<00:06,  1.30it/s]\n",
+      "Loading safetensors checkpoint shards:  50% Completed | 8/16 [00:06<00:06,  1.30it/s]\n",
+      "Loading safetensors checkpoint shards:  56% Completed | 9/16 [00:06<00:05,  1.29it/s]\n",
+      "Loading safetensors checkpoint shards:  62% Completed | 10/16 [00:07<00:04,  1.29it/s]\n",
+      "Loading safetensors checkpoint shards:  69% Completed | 11/16 [00:08<00:03,  1.29it/s]\n",
+      "Loading safetensors checkpoint shards:  75% Completed | 12/16 [00:09<00:03,  1.29it/s]\n",
+      "Loading safetensors checkpoint shards:  81% Completed | 13/16 [00:10<00:02,  1.29it/s]\n",
+      "Loading safetensors checkpoint shards:  88% Completed | 14/16 [00:10<00:01,  1.29it/s]\n",
+      "Loading safetensors checkpoint shards:  94% Completed | 15/16 [00:11<00:00,  1.30it/s]\n",
+      "Loading safetensors checkpoint shards: 100% Completed | 16/16 [00:11<00:00,  1.64it/s]\n",
+      "Loading safetensors checkpoint shards: 100% Completed | 16/16 [00:11<00:00,  1.36it/s]\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO 09-17 23:49:45 [default_loader.py:262] Loading weights took 11.81 seconds\n",
+      "INFO 09-17 23:49:45 [punica_selector.py:19] Using PunicaWrapperGPU.\n",
+      "WARNING 09-17 23:49:45 [models.py:52] For MoE models, vLLM currently does not support fused MoE LoRA inference. Please ensure that the loaded LoRA model does not contain expert weights.\n",
+      "INFO 09-17 23:49:46 [model_runner.py:1115] Model loading took 56.9483 GiB and 12.457003 seconds\n",
+      "INFO 09-17 23:49:46 [fused_moe.py:688] Using configuration from /home/sky/sky_workdir/.venv/lib/python3.10/site-packages/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json for MoE layer.\n",
+      "INFO 09-17 23:49:47 [worker.py:295] Memory profiling takes 0.94 seconds\n",
+      "INFO 09-17 23:49:47 [worker.py:295] the current vLLM instance can use total_gpu_memory (139.81GiB) x gpu_memory_utilization (0.79) = 109.98GiB\n",
+      "INFO 09-17 23:49:47 [worker.py:295] model weights take 56.95GiB; non_torch_memory takes 0.16GiB; PyTorch activation peak memory takes 2.19GiB; the rest of the memory reserved for KV Cache is 50.68GiB.\n",
+      "INFO 09-17 23:49:47 [executor_base.py:113] # cuda blocks: 34597, # CPU blocks: 4096\n",
+      "INFO 09-17 23:49:47 [executor_base.py:118] Maximum concurrency for 32768 tokens per request: 16.89x\n",
+      "INFO 09-17 23:49:49 [vllm_utils.py:721] Unsloth: Running patched vLLM v0 `capture_model`.\n",
+      "INFO 09-17 23:49:49 [model_runner.py:1385] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Capturing CUDA graph shapes: 100%|██████████| 53/53 [00:09<00:00,  5.86it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO 09-17 23:49:58 [model_runner.py:1537] Graph capturing finished in 9 secs, took 1.34 GiB\n",
+      "INFO 09-17 23:49:58 [vllm_utils.py:728] Unsloth: Patched vLLM v0 graph capture finished in 9 secs.\n",
+      "INFO 09-17 23:49:59 [llm_engine.py:424] init engine (profile, create kv cache, warmup model) took 13.06 seconds\n",
+      "Unsloth: Just some info: will skip parsing ['q_norm', 'layer_norm2', 'pre_feedforward_layernorm', 'post_attention_layernorm', 'input_layernorm', 'post_layernorm', 'norm1', 'k_norm', 'post_feedforward_layernorm', 'layer_norm1', 'norm2']\n",
+      "Unsloth: Just some info: will skip parsing ['cross_attn_post_attention_layernorm', 'q_norm', 'layer_norm2', 'pre_feedforward_layernorm', 'post_attention_layernorm', 'input_layernorm', 'post_layernorm', 'norm1', 'k_norm', 'cross_attn_input_layernorm', 'post_feedforward_layernorm', 'layer_norm1', 'norm2']\n",
+      "Unsloth: Making `model.base_model.model.model` require gradients\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "174a88b44d0b4f53a8910ba4ff463215",
+       "model_id": "c6d8312013204ffa86d2f907b70dcdd2",
        "version_major": 2,
        "version_minor": 0
       },
@@ -99,27 +244,100 @@
      "metadata": {},
      "output_type": "display_data"
     },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Tried to log to step 0 that is less than the current step 1000. Steps must be monotonically increasing, so this data will be ignored. See https://wandb.me/define-metric to log data out of order.\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Skipping tuning as there is no suitable data. This can happen when all the trajectories in the same group have the same reward and thus no advantage to train on.\n",
-      "Advanced step from 267 to 268 (no training occurred)\n"
+      "Packed 832 trajectories into 2 sequences of length 2048\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "89e8164861604905a115fe6f53d26c35",
+       "model_id": "5b8b599a109e442781cc056ae2a371db",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "gather:   0%|          | 0/1152 [00:00<?, ?it/s]"
+       "train:   0%|          | 0/2 [00:00<?, ?it/s]"
       ]
      },
      "metadata": {},
      "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1\n",
+      "   \\\\   /|    Num examples = 10,000,000 | Num Epochs = 3 | Total steps = 30,000,000\n",
+      "O^O/ \\_/ \\    Batch size per device = 2 | Gradient accumulation steps = 1\n",
+      "\\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2\n",
+      " \"-____-\"     Trainable parameters = 6,684,672 of 1,549,357,056 (0.43% trained)\n"
+     ]
+    },
+    {
+     "ename": "RuntimeError",
+     "evalue": "mat1 and mat2 shapes cannot be multiplied (1024x2048 and 1x128)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[3], line 76\u001b[0m\n\u001b[1;32m     68\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28;01mawait\u001b[39;00m model\u001b[38;5;241m.\u001b[39mget_step(), \u001b[38;5;241m1_000\u001b[39m):\n\u001b[1;32m     69\u001b[0m     train_groups \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m art\u001b[38;5;241m.\u001b[39mgather_trajectory_groups(\n\u001b[1;32m     70\u001b[0m         (\n\u001b[1;32m     71\u001b[0m             art\u001b[38;5;241m.\u001b[39mTrajectoryGroup(rollout(openai_client, prompt) \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;241m32\u001b[39m))\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     74\u001b[0m         pbar_desc\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgather\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m     75\u001b[0m     )\n\u001b[0;32m---> 76\u001b[0m     \u001b[38;5;28;01mawait\u001b[39;00m model\u001b[38;5;241m.\u001b[39mtrain(\n\u001b[1;32m     77\u001b[0m         train_groups,\n\u001b[1;32m     78\u001b[0m         config\u001b[38;5;241m=\u001b[39mart\u001b[38;5;241m.\u001b[39mTrainConfig(learning_rate\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1e-4\u001b[39m),\n\u001b[1;32m     79\u001b[0m         \u001b[38;5;66;03m# _config=art.dev.TrainConfig(\u001b[39;00m\n\u001b[1;32m     80\u001b[0m         \u001b[38;5;66;03m#     precalculate_logprobs=True,\u001b[39;00m\n\u001b[1;32m     81\u001b[0m         \u001b[38;5;66;03m# ),\u001b[39;00m\n\u001b[1;32m     82\u001b[0m     )\n",
+      "File \u001b[0;32m~/sky_workdir/src/art/model.py:368\u001b[0m, in \u001b[0;36mTrainableModel.train\u001b[0;34m(self, trajectory_groups, config, _config, verbose)\u001b[0m\n\u001b[1;32m    352\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mtrain\u001b[39m(\n\u001b[1;32m    353\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m    354\u001b[0m     trajectory_groups: Iterable[TrajectoryGroup],\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    357\u001b[0m     verbose: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m    358\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    359\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    360\u001b[0m \u001b[38;5;124;03m    Reinforce fine-tune the model with a batch of trajectory groups.\u001b[39;00m\n\u001b[1;32m    361\u001b[0m \n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    366\u001b[0m \u001b[38;5;124;03m            not yet part of the public API. Use at your own risk.\u001b[39;00m\n\u001b[1;32m    367\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m--> 368\u001b[0m     \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbackend()\u001b[38;5;241m.\u001b[39m_train_model(\n\u001b[1;32m    369\u001b[0m         \u001b[38;5;28mself\u001b[39m, \u001b[38;5;28mlist\u001b[39m(trajectory_groups), config, _config \u001b[38;5;129;01mor\u001b[39;00m {}, verbose\n\u001b[1;32m    370\u001b[0m     ):\n\u001b[1;32m    371\u001b[0m         \u001b[38;5;28;01mpass\u001b[39;00m\n",
+      "File \u001b[0;32m~/sky_workdir/src/art/local/backend.py:483\u001b[0m, in \u001b[0;36mLocalBackend._train_model\u001b[0;34m(self, model, trajectory_groups, config, dev_config, verbose)\u001b[0m\n\u001b[1;32m    481\u001b[0m     estimated_gradient_steps \u001b[38;5;241m=\u001b[39m math\u001b[38;5;241m.\u001b[39mceil(estimated_gradient_steps \u001b[38;5;241m/\u001b[39m dp)\n\u001b[1;32m    482\u001b[0m pbar \u001b[38;5;241m=\u001b[39m tqdm\u001b[38;5;241m.\u001b[39mtqdm(total\u001b[38;5;241m=\u001b[39mestimated_gradient_steps, desc\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrain\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 483\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mfor\u001b[39;00m result \u001b[38;5;129;01min\u001b[39;00m service\u001b[38;5;241m.\u001b[39mtrain(\n\u001b[1;32m    484\u001b[0m     disk_packed_tensors, config, dev_config, verbose\n\u001b[1;32m    485\u001b[0m ):\n\u001b[1;32m    486\u001b[0m     num_gradient_steps \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(\n\u001b[1;32m    487\u001b[0m         result\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_gradient_steps\u001b[39m\u001b[38;5;124m\"\u001b[39m, estimated_gradient_steps)\n\u001b[1;32m    488\u001b[0m     )\n\u001b[1;32m    489\u001b[0m     \u001b[38;5;28;01massert\u001b[39;00m num_gradient_steps \u001b[38;5;241m==\u001b[39m estimated_gradient_steps, (\n\u001b[1;32m    490\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_gradient_steps \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnum_gradient_steps\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m != estimated_gradient_steps \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mestimated_gradient_steps\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    491\u001b[0m     )\n",
+      "File \u001b[0;32m~/sky_workdir/src/mp_actors/move.py:175\u001b[0m, in \u001b[0;36mProxy.__getattr__.<locals>.async_gen_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    173\u001b[0m     send_value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    174\u001b[0m     \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 175\u001b[0m         send_value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m get_response(\n\u001b[1;32m    176\u001b[0m             args, kwargs, \u001b[38;5;28mid\u001b[39m, send_value\n\u001b[1;32m    177\u001b[0m         )\n\u001b[1;32m    178\u001b[0m         args, kwargs \u001b[38;5;241m=\u001b[39m (), {}\n\u001b[1;32m    179\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopAsyncIteration\u001b[39;00m:\n",
+      "File \u001b[0;32m~/sky_workdir/src/mp_actors/move.py:161\u001b[0m, in \u001b[0;36mProxy.__getattr__.<locals>.get_response\u001b[0;34m(args, kwargs, id, send_value)\u001b[0m\n\u001b[1;32m    156\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_requests\u001b[38;5;241m.\u001b[39mput_nowait(request)\n\u001b[1;32m    157\u001b[0m done, _ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m asyncio\u001b[38;5;241m.\u001b[39mwait(\n\u001b[1;32m    158\u001b[0m     [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_futures[request\u001b[38;5;241m.\u001b[39mid], \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_process_future],\n\u001b[1;32m    159\u001b[0m     return_when\u001b[38;5;241m=\u001b[39masyncio\u001b[38;5;241m.\u001b[39mFIRST_COMPLETED,\n\u001b[1;32m    160\u001b[0m )\n\u001b[0;32m--> 161\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mdone\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpop\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/miniconda3/lib/python3.10/asyncio/futures.py:201\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    199\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__log_traceback \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m    200\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 201\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception\u001b[38;5;241m.\u001b[39mwith_traceback(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception_tb)\n\u001b[1;32m    202\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_result\n",
+      "File \u001b[0;32m~/sky_workdir/src/mp_actors/move.py:284\u001b[0m, in \u001b[0;36m_handle_request\u001b[0;34m()\u001b[0m\n\u001b[1;32m    280\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m request\u001b[38;5;241m.\u001b[39mid \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m generators:\n\u001b[1;32m    281\u001b[0m         generators[request\u001b[38;5;241m.\u001b[39mid] \u001b[38;5;241m=\u001b[39m result_or_callable(\n\u001b[1;32m    282\u001b[0m             \u001b[38;5;241m*\u001b[39mrequest\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mrequest\u001b[38;5;241m.\u001b[39mkwargs\n\u001b[1;32m    283\u001b[0m         )\n\u001b[0;32m--> 284\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m generators[request\u001b[38;5;241m.\u001b[39mid]\u001b[38;5;241m.\u001b[39masend(request\u001b[38;5;241m.\u001b[39msend_value)\n\u001b[1;32m    285\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mcallable\u001b[39m(result_or_callable):\n\u001b[1;32m    286\u001b[0m     result_or_coro \u001b[38;5;241m=\u001b[39m result_or_callable(\u001b[38;5;241m*\u001b[39mrequest\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mrequest\u001b[38;5;241m.\u001b[39mkwargs)\n",
+      "File \u001b[0;32m~/sky_workdir/src/art/unsloth/service.py:168\u001b[0m, in \u001b[0;36mtrain\u001b[0;34m()\u001b[0m\n\u001b[1;32m    164\u001b[0m     \u001b[38;5;28mprint\u001b[39m(\n\u001b[1;32m    165\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDone waiting for a result from the queue or for the training task to, presumably, raise an exception\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    166\u001b[0m     )\n\u001b[1;32m    167\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m task \u001b[38;5;129;01min\u001b[39;00m done:\n\u001b[0;32m--> 168\u001b[0m     result \u001b[38;5;241m=\u001b[39m task\u001b[38;5;241m.\u001b[39mresult()\n\u001b[1;32m    169\u001b[0m     \u001b[38;5;66;03m# If `result` is `None`, the training task finished somehow.\u001b[39;00m\n\u001b[1;32m    170\u001b[0m     \u001b[38;5;28;01massert\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m, (\n\u001b[1;32m    171\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe training task should never finish.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    172\u001b[0m     )\n",
+      "File \u001b[0;32m~/miniconda3/lib/python3.10/asyncio/futures.py:201\u001b[0m, in \u001b[0;36mresult\u001b[0;34m()\u001b[0m\n\u001b[1;32m    199\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__log_traceback \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m    200\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 201\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception\u001b[38;5;241m.\u001b[39mwith_traceback(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception_tb)\n\u001b[1;32m    202\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_result\n",
+      "File \u001b[0;32m~/miniconda3/lib/python3.10/asyncio/tasks.py:232\u001b[0m, in \u001b[0;36m__step\u001b[0;34m()\u001b[0m\n\u001b[1;32m    228\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m    229\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m exc \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    230\u001b[0m         \u001b[38;5;66;03m# We use the `send` method directly, because coroutines\u001b[39;00m\n\u001b[1;32m    231\u001b[0m         \u001b[38;5;66;03m# don't have `__iter__` and `__next__` methods.\u001b[39;00m\n\u001b[0;32m--> 232\u001b[0m         result \u001b[38;5;241m=\u001b[39m coro\u001b[38;5;241m.\u001b[39msend(\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m    233\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    234\u001b[0m         result \u001b[38;5;241m=\u001b[39m coro\u001b[38;5;241m.\u001b[39mthrow(exc)\n",
+      "File \u001b[0;32m~/sky_workdir/src/art/unsloth/train.py:40\u001b[0m, in \u001b[0;36mtrain\u001b[0;34m()\u001b[0m\n\u001b[1;32m     38\u001b[0m     trainer\u001b[38;5;241m.\u001b[39m_metrics \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrain\u001b[39m\u001b[38;5;124m\"\u001b[39m: defaultdict(\u001b[38;5;28mlist\u001b[39m)}\n\u001b[1;32m     39\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 40\u001b[0m     trainer\u001b[38;5;241m.\u001b[39mtrain()\n\u001b[1;32m     41\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m     42\u001b[0m     trainer\u001b[38;5;241m.\u001b[39mcompute_loss \u001b[38;5;241m=\u001b[39m _compute_loss\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/transformers/trainer.py:2206\u001b[0m, in \u001b[0;36mtrain\u001b[0;34m()\u001b[0m\n\u001b[1;32m   2204\u001b[0m         hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m   2205\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 2206\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m inner_training_loop(\n\u001b[1;32m   2207\u001b[0m         args\u001b[38;5;241m=\u001b[39margs,\n\u001b[1;32m   2208\u001b[0m         resume_from_checkpoint\u001b[38;5;241m=\u001b[39mresume_from_checkpoint,\n\u001b[1;32m   2209\u001b[0m         trial\u001b[38;5;241m=\u001b[39mtrial,\n\u001b[1;32m   2210\u001b[0m         ignore_keys_for_eval\u001b[38;5;241m=\u001b[39mignore_keys_for_eval,\n\u001b[1;32m   2211\u001b[0m     )\n",
+      "File \u001b[0;32m<string>:321\u001b[0m, in \u001b[0;36m_fast_inner_training_loop\u001b[0;34m()\u001b[0m\n",
+      "File \u001b[0;32m<string>:34\u001b[0m, in \u001b[0;36m_unsloth_training_step\u001b[0;34m()\u001b[0m\n",
+      "File \u001b[0;32m~/sky_workdir/src/art/unsloth/train.py:112\u001b[0m, in \u001b[0;36mcompute_loss\u001b[0;34m()\u001b[0m\n\u001b[1;32m    108\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m seq_len \u001b[38;5;241m%\u001b[39m chunk_size \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m, (\n\u001b[1;32m    109\u001b[0m     \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSequence length (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mseq_len\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m) must be evenly divisible by chunk size (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mchunk_size\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    110\u001b[0m )\n\u001b[1;32m    111\u001b[0m os\u001b[38;5;241m.\u001b[39menviron[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUNSLOTH_RETURN_HIDDEN_STATES\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m--> 112\u001b[0m new_logprobs, entropies \u001b[38;5;241m=\u001b[39m calculate_logprobs(\n\u001b[1;32m    113\u001b[0m     dtype_for_autocasting,\n\u001b[1;32m    114\u001b[0m     trainer,\n\u001b[1;32m    115\u001b[0m     inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtokens\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m    116\u001b[0m     attn_bias,\n\u001b[1;32m    117\u001b[0m     next_input_ids,\n\u001b[1;32m    118\u001b[0m     lm_head_t,\n\u001b[1;32m    119\u001b[0m     chunk_size\u001b[38;5;241m=\u001b[39mchunk_size,\n\u001b[1;32m    120\u001b[0m     inference_mode\u001b[38;5;241m=\u001b[39mreturn_new_logprobs,\n\u001b[1;32m    121\u001b[0m     no_grad\u001b[38;5;241m=\u001b[39mreturn_new_logprobs,\n\u001b[1;32m    122\u001b[0m     reference_logprobs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m    123\u001b[0m )\n\u001b[1;32m    124\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m return_new_logprobs:\n\u001b[1;32m    125\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mnn\u001b[38;5;241m.\u001b[39mfunctional\u001b[38;5;241m.\u001b[39mpad(new_logprobs[:, :\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m], (\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m0\u001b[39m), value\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.0\u001b[39m)\n",
+      "File \u001b[0;32m~/sky_workdir/src/art/unsloth/train.py:321\u001b[0m, in \u001b[0;36mcalculate_logprobs\u001b[0;34m()\u001b[0m\n\u001b[1;32m    295\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mcalculate_logprobs\u001b[39m(\n\u001b[1;32m    296\u001b[0m     dtype_for_autocast: torch\u001b[38;5;241m.\u001b[39mdtype,\n\u001b[1;32m    297\u001b[0m     trainer: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGRPOTrainer\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    307\u001b[0m     torch\u001b[38;5;241m.\u001b[39mTensor, torch\u001b[38;5;241m.\u001b[39mTensor\n\u001b[1;32m    308\u001b[0m ]:  \u001b[38;5;66;03m# Returns (log_probs, entropy) both shape [B, S]\u001b[39;00m\n\u001b[1;32m    309\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m (\n\u001b[1;32m    310\u001b[0m         torch\u001b[38;5;241m.\u001b[39minference_mode() \u001b[38;5;28;01mif\u001b[39;00m inference_mode \u001b[38;5;28;01melse\u001b[39;00m nullcontext(),\n\u001b[1;32m    311\u001b[0m         torch\u001b[38;5;241m.\u001b[39mno_grad() \u001b[38;5;28;01mif\u001b[39;00m no_grad \u001b[38;5;28;01melse\u001b[39;00m nullcontext(),\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    319\u001b[0m         torch\u001b[38;5;241m.\u001b[39mamp\u001b[38;5;241m.\u001b[39mautocast_mode\u001b[38;5;241m.\u001b[39mautocast(device_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m\"\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mdtype_for_autocast),\n\u001b[1;32m    320\u001b[0m     ):\n\u001b[0;32m--> 321\u001b[0m         hidden_states \u001b[38;5;241m=\u001b[39m trainer\u001b[38;5;241m.\u001b[39mmodel(  \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m    322\u001b[0m             input_ids\u001b[38;5;241m=\u001b[39minput_ids, causal_mask\u001b[38;5;241m=\u001b[39mcausal_mask\n\u001b[1;32m    323\u001b[0m         )\u001b[38;5;241m.\u001b[39mlogits  \u001b[38;5;66;03m# Shape [B, S, H]\u001b[39;00m\n\u001b[1;32m    324\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m _calculate_logprobs(lm_head_t, hidden_states, next_input_ids, chunk_size)\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1751\u001b[0m, in \u001b[0;36m_wrapped_call_impl\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1749\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1750\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1751\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1762\u001b[0m, in \u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1757\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1758\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1760\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1761\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1762\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m   1764\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1765\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/accelerate/utils/operations.py:818\u001b[0m, in \u001b[0;36mforward\u001b[0;34m()\u001b[0m\n\u001b[1;32m    817\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 818\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m model_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/accelerate/utils/operations.py:806\u001b[0m, in \u001b[0;36m__call__\u001b[0;34m()\u001b[0m\n\u001b[1;32m    805\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 806\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m convert_to_fp32(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs))\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/amp/autocast_mode.py:44\u001b[0m, in \u001b[0;36mdecorate_autocast\u001b[0;34m()\u001b[0m\n\u001b[1;32m     41\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m     42\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdecorate_autocast\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m     43\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m autocast_instance:\n\u001b[0;32m---> 44\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/peft/peft_model.py:881\u001b[0m, in \u001b[0;36mforward\u001b[0;34m()\u001b[0m\n\u001b[1;32m    879\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_enable_peft_forward_hooks(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m    880\u001b[0m     kwargs \u001b[38;5;241m=\u001b[39m {k: v \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m kwargs\u001b[38;5;241m.\u001b[39mitems() \u001b[38;5;28;01mif\u001b[39;00m k \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mspecial_peft_forward_args}\n\u001b[0;32m--> 881\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_base_model()(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1751\u001b[0m, in \u001b[0;36m_wrapped_call_impl\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1749\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1750\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1751\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1762\u001b[0m, in \u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1757\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1758\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1760\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1761\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1762\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m   1764\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1765\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
+      "File \u001b[0;32m~/sky_workdir/dev/unsloth_compiled_cache/unsloth_compiled_module_qwen3_moe.py:747\u001b[0m, in \u001b[0;36mforward\u001b[0;34m()\u001b[0m\n\u001b[1;32m    731\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m    732\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m    733\u001b[0m     input_ids: Optional[torch\u001b[38;5;241m.\u001b[39mLongTensor] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    745\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Unpack[KwargsForCausalLM],\n\u001b[1;32m    746\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m MoeCausalLMOutputWithPast:\n\u001b[0;32m--> 747\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m Qwen3MoeForCausalLM_forward(\u001b[38;5;28mself\u001b[39m, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, output_router_logits, cache_position, logits_to_keep, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/transformers/utils/generic.py:943\u001b[0m, in \u001b[0;36mwrapper\u001b[0;34m()\u001b[0m\n\u001b[1;32m    940\u001b[0m     set_attribute_for_modules(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_is_top_level_module\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m    942\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 943\u001b[0m     output \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m    944\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m is_requested_to_return_tuple \u001b[38;5;129;01mor\u001b[39;00m (is_configured_to_return_tuple \u001b[38;5;129;01mand\u001b[39;00m is_top_level_module):\n\u001b[1;32m    945\u001b[0m         output \u001b[38;5;241m=\u001b[39m output\u001b[38;5;241m.\u001b[39mto_tuple()\n",
+      "File \u001b[0;32m~/sky_workdir/dev/unsloth_compiled_cache/unsloth_compiled_module_qwen3_moe.py:546\u001b[0m, in \u001b[0;36mQwen3MoeForCausalLM_forward\u001b[0;34m()\u001b[0m\n\u001b[1;32m    541\u001b[0m output_hidden_states \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m    542\u001b[0m     output_hidden_states \u001b[38;5;28;01mif\u001b[39;00m output_hidden_states \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39moutput_hidden_states\n\u001b[1;32m    543\u001b[0m )\n\u001b[1;32m    545\u001b[0m \u001b[38;5;66;03m# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)\u001b[39;00m\n\u001b[0;32m--> 546\u001b[0m outputs: MoeModelOutputWithPast \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel(\n\u001b[1;32m    547\u001b[0m     input_ids\u001b[38;5;241m=\u001b[39minput_ids,\n\u001b[1;32m    548\u001b[0m     attention_mask\u001b[38;5;241m=\u001b[39mattention_mask,\n\u001b[1;32m    549\u001b[0m     position_ids\u001b[38;5;241m=\u001b[39mposition_ids,\n\u001b[1;32m    550\u001b[0m     past_key_values\u001b[38;5;241m=\u001b[39mpast_key_values,\n\u001b[1;32m    551\u001b[0m     inputs_embeds\u001b[38;5;241m=\u001b[39minputs_embeds,\n\u001b[1;32m    552\u001b[0m     use_cache\u001b[38;5;241m=\u001b[39muse_cache,\n\u001b[1;32m    553\u001b[0m     output_attentions\u001b[38;5;241m=\u001b[39moutput_attentions,\n\u001b[1;32m    554\u001b[0m     output_hidden_states\u001b[38;5;241m=\u001b[39moutput_hidden_states,\n\u001b[1;32m    555\u001b[0m     output_router_logits\u001b[38;5;241m=\u001b[39moutput_router_logits,\n\u001b[1;32m    556\u001b[0m     cache_position\u001b[38;5;241m=\u001b[39mcache_position,\n\u001b[1;32m    557\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m    558\u001b[0m )\n\u001b[1;32m    560\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m outputs\u001b[38;5;241m.\u001b[39mlast_hidden_state\n\u001b[1;32m    561\u001b[0m \u001b[38;5;66;03m# Only compute necessary logits, and do not upcast them to float if we are not computing the loss\u001b[39;00m\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1751\u001b[0m, in \u001b[0;36m_wrapped_call_impl\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1749\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1750\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1751\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1762\u001b[0m, in \u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1757\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1758\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1760\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1761\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1762\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m   1764\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1765\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/transformers/utils/generic.py:943\u001b[0m, in \u001b[0;36mwrapper\u001b[0;34m()\u001b[0m\n\u001b[1;32m    940\u001b[0m     set_attribute_for_modules(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_is_top_level_module\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m    942\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 943\u001b[0m     output \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m    944\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m is_requested_to_return_tuple \u001b[38;5;129;01mor\u001b[39;00m (is_configured_to_return_tuple \u001b[38;5;129;01mand\u001b[39;00m is_top_level_module):\n\u001b[1;32m    945\u001b[0m         output \u001b[38;5;241m=\u001b[39m output\u001b[38;5;241m.\u001b[39mto_tuple()\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/transformers/models/qwen3_moe/modeling_qwen3_moe.py:547\u001b[0m, in \u001b[0;36mforward\u001b[0;34m()\u001b[0m\n\u001b[1;32m    544\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m output_hidden_states:\n\u001b[1;32m    545\u001b[0m     all_hidden_states \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m (hidden_states,)\n\u001b[0;32m--> 547\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m decoder_layer(\n\u001b[1;32m    548\u001b[0m     hidden_states,\n\u001b[1;32m    549\u001b[0m     attention_mask\u001b[38;5;241m=\u001b[39mcausal_mask,\n\u001b[1;32m    550\u001b[0m     position_ids\u001b[38;5;241m=\u001b[39mposition_ids,\n\u001b[1;32m    551\u001b[0m     past_key_value\u001b[38;5;241m=\u001b[39mpast_key_values,\n\u001b[1;32m    552\u001b[0m     output_attentions\u001b[38;5;241m=\u001b[39moutput_attentions,\n\u001b[1;32m    553\u001b[0m     output_router_logits\u001b[38;5;241m=\u001b[39moutput_router_logits,\n\u001b[1;32m    554\u001b[0m     use_cache\u001b[38;5;241m=\u001b[39muse_cache,\n\u001b[1;32m    555\u001b[0m     cache_position\u001b[38;5;241m=\u001b[39mcache_position,\n\u001b[1;32m    556\u001b[0m     position_embeddings\u001b[38;5;241m=\u001b[39mposition_embeddings,\n\u001b[1;32m    557\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mflash_attn_kwargs,\n\u001b[1;32m    558\u001b[0m )\n\u001b[1;32m    560\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m    562\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m output_attentions:\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/transformers/modeling_layers.py:82\u001b[0m, in \u001b[0;36m__call__\u001b[0;34m()\u001b[0m\n\u001b[1;32m     79\u001b[0m         message \u001b[38;5;241m=\u001b[39m message\u001b[38;5;241m.\u001b[39mrstrip(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m,\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     80\u001b[0m         logger\u001b[38;5;241m.\u001b[39mwarning(message)\n\u001b[0;32m---> 82\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(partial(\u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs), \u001b[38;5;241m*\u001b[39margs)\n\u001b[1;32m     83\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/_compile.py:51\u001b[0m, in \u001b[0;36minner\u001b[0;34m()\u001b[0m\n\u001b[1;32m     48\u001b[0m     disable_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39m_dynamo\u001b[38;5;241m.\u001b[39mdisable(fn, recursive)\n\u001b[1;32m     49\u001b[0m     fn\u001b[38;5;241m.\u001b[39m__dynamo_disable \u001b[38;5;241m=\u001b[39m disable_fn  \u001b[38;5;66;03m# type: ignore[attr-defined]\u001b[39;00m\n\u001b[0;32m---> 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m disable_fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838\u001b[0m, in \u001b[0;36m_fn\u001b[0;34m()\u001b[0m\n\u001b[1;32m    836\u001b[0m _maybe_set_eval_frame(_callback_from_stance(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback))\n\u001b[1;32m    837\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 838\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m    839\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m    840\u001b[0m     set_eval_frame(\u001b[38;5;28;01mNone\u001b[39;00m)\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/utils/checkpoint.py:488\u001b[0m, in \u001b[0;36mcheckpoint\u001b[0;34m()\u001b[0m\n\u001b[1;32m    483\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m context_fn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m noop_context_fn \u001b[38;5;129;01mor\u001b[39;00m debug \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m:\n\u001b[1;32m    484\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m    485\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPassing `context_fn` or `debug` is only supported when \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    486\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muse_reentrant=False.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    487\u001b[0m         )\n\u001b[0;32m--> 488\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m CheckpointFunction\u001b[38;5;241m.\u001b[39mapply(function, preserve, \u001b[38;5;241m*\u001b[39margs)\n\u001b[1;32m    489\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    490\u001b[0m     gen \u001b[38;5;241m=\u001b[39m _checkpoint_without_reentrant_generator(\n\u001b[1;32m    491\u001b[0m         function, preserve, context_fn, determinism_check, debug, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs\n\u001b[1;32m    492\u001b[0m     )\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/autograd/function.py:575\u001b[0m, in \u001b[0;36mapply\u001b[0;34m()\u001b[0m\n\u001b[1;32m    572\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m torch\u001b[38;5;241m.\u001b[39m_C\u001b[38;5;241m.\u001b[39m_are_functorch_transforms_active():\n\u001b[1;32m    573\u001b[0m     \u001b[38;5;66;03m# See NOTE: [functorch vjp and autograd interaction]\u001b[39;00m\n\u001b[1;32m    574\u001b[0m     args \u001b[38;5;241m=\u001b[39m _functorch\u001b[38;5;241m.\u001b[39mutils\u001b[38;5;241m.\u001b[39munwrap_dead_wrappers(args)\n\u001b[0;32m--> 575\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m    577\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_setup_ctx_defined:\n\u001b[1;32m    578\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m    579\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIn order to use an autograd.Function with functorch transforms \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    580\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m(vmap, grad, jvp, jacrev, ...), it must override the setup_context \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    581\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstaticmethod. For more details, please see \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    582\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttps://pytorch.org/docs/main/notes/extending.func.html\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    583\u001b[0m     )\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/unsloth_zoo/gradient_checkpointing.py:477\u001b[0m, in \u001b[0;36mforward\u001b[0;34m()\u001b[0m\n\u001b[1;32m    474\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ctx\u001b[38;5;241m.\u001b[39m_requires_gradient: ctx\u001b[38;5;241m.\u001b[39msave_for_backward(\u001b[38;5;241m*\u001b[39mtensor_inputs)\n\u001b[1;32m    476\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m--> 477\u001b[0m     outputs \u001b[38;5;241m=\u001b[39m run_function(\u001b[38;5;241m*\u001b[39margs)\n\u001b[1;32m    479\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_gpu_buffer: MAIN_STREAM\u001b[38;5;241m.\u001b[39mwait_stream(EXTRA_STREAM)\n\u001b[1;32m    480\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m outputs\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1751\u001b[0m, in \u001b[0;36m_wrapped_call_impl\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1749\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1750\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1751\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1762\u001b[0m, in \u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1757\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1758\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1760\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1761\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1762\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m   1764\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1765\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/transformers/models/qwen3_moe/modeling_qwen3_moe.py:367\u001b[0m, in \u001b[0;36mforward\u001b[0;34m()\u001b[0m\n\u001b[1;32m    364\u001b[0m residual \u001b[38;5;241m=\u001b[39m hidden_states\n\u001b[1;32m    365\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpost_attention_layernorm(hidden_states)\n\u001b[0;32m--> 367\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmlp(hidden_states)\n\u001b[1;32m    368\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(hidden_states, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[1;32m    369\u001b[0m     hidden_states, router_logits \u001b[38;5;241m=\u001b[39m hidden_states\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1751\u001b[0m, in \u001b[0;36m_wrapped_call_impl\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1749\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1750\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1751\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1762\u001b[0m, in \u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1757\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1758\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1760\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1761\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1762\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m   1764\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1765\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/transformers/models/qwen3_moe/modeling_qwen3_moe.py:233\u001b[0m, in \u001b[0;36mforward\u001b[0;34m()\u001b[0m\n\u001b[1;32m    231\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m hidden_states\u001b[38;5;241m.\u001b[39mview(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, hidden_dim)\n\u001b[1;32m    232\u001b[0m \u001b[38;5;66;03m# router_logits: (batch * sequence_length, n_experts)\u001b[39;00m\n\u001b[0;32m--> 233\u001b[0m router_logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgate(hidden_states)\n\u001b[1;32m    235\u001b[0m routing_weights \u001b[38;5;241m=\u001b[39m F\u001b[38;5;241m.\u001b[39msoftmax(router_logits, dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat)\n\u001b[1;32m    236\u001b[0m routing_weights, selected_experts \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtopk(routing_weights, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtop_k, dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1751\u001b[0m, in \u001b[0;36m_wrapped_call_impl\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1749\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1750\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1751\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1762\u001b[0m, in \u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1757\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1758\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1760\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1761\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1762\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m   1764\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1765\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
+      "File \u001b[0;32m~/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/linear.py:125\u001b[0m, in \u001b[0;36mforward\u001b[0;34m()\u001b[0m\n\u001b[1;32m    124\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 125\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m F\u001b[38;5;241m.\u001b[39mlinear(\u001b[38;5;28minput\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mweight, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbias)\n",
+      "\u001b[0;31mRuntimeError\u001b[0m: mat1 and mat2 shapes cannot be multiplied (1024x2048 and 1x128)"
+     ]
     }
    ],
    "source": [
@@ -135,7 +353,7 @@
     "model = art.TrainableModel(\n",
     "    name=\"009\",\n",
     "    project=\"yes-no-maybe\",\n",
-    "    base_model=\"Qwen/Qwen2.5-7B-Instruct\",\n",
+    "    base_model=\"Qwen/Qwen3-30B-A3B-Instruct-2507\",\n",
     "    # _internal_config=art.dev.InternalModelConfig(\n",
     "    #     _decouple_vllm_and_unsloth=True,\n",
     "    #     engine_args=art.dev.EngineArgs(gpu_memory_utilization=0.7),\n",
diff --git a/src/art/dev/get_model_config.py b/src/art/dev/get_model_config.py
index 7499839d..3b9405c0 100644
--- a/src/art/dev/get_model_config.py
+++ b/src/art/dev/get_model_config.py
@@ -20,7 +20,7 @@ def get_model_config(
         enable_prefix_caching=True,
         fast_inference=True,
         gpu_memory_utilization=(0.79 if enable_sleep_mode else 0.55),
-        load_in_4bit=True,
+        load_in_4bit=False,
         max_lora_rank=8,
         max_seq_length=32768,
         model_name=base_model,
@@ -33,6 +33,11 @@ def get_model_config(
         init_args.pop("gpu_memory_utilization")
         init_args.pop("max_lora_rank")
         init_args.pop("use_async")
+    # Qwen3 MoE models: avoid Unsloth empty-model trick that sets hidden_size=1,
+    # which breaks MoE gate dimensions during training. Disable fast_inference.
+    base_lower = base_model.lower()
+    if ("qwen3" in base_lower) and ("a3b" in base_lower or "moe" in base_lower):
+        init_args["fast_inference"] = False
     engine_args = EngineArgs(
         disable_log_requests=True,
         enable_sleep_mode=enable_sleep_mode,
@@ -57,12 +62,19 @@ def get_model_config(
             "k_proj",
             "v_proj",
             "o_proj",
-            "gate_proj",
-            "up_proj",
-            "down_proj",
         ],
         use_gradient_checkpointing="unsloth",
     )
+    # For Qwen3 MoE (A3B) models, avoid targeting MLP expert modules which
+    # produce expert-specific LoRA weights unsupported by vLLM. Restrict to
+    # attention projections only.
+    if ("qwen3" in base_lower) and ("a3b" in base_lower or "moe" in base_lower):
+        peft_args["target_modules"] = [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "o_proj",
+        ]
     peft_args.update(config.get("peft_args", {}))
     trainer_args = TrainerArgs(
         adam_beta1=0.9,
diff --git a/src/art/vllm/server.py b/src/art/vllm/server.py
index d3559fe5..94ce3b3d 100644
--- a/src/art/vllm/server.py
+++ b/src/art/vllm/server.py
@@ -47,20 +47,22 @@ async def openai_server_task(
     patch_tool_parser_manager()
     set_vllm_log_file(config.get("log_file", "vllm.log"))
 
-    # Patch engine.add_lora; hopefully temporary
-    add_lora = engine.add_lora
+    # Patch engine.add_lora to ensure missing fields exist without wrapping
+    _orig_add_lora = engine.add_lora
 
     async def _add_lora(lora_request) -> None:
-        class LoRARequest:
-            def __getattr__(self, name: str) -> Any:
-                if name == "lora_tensors" and not hasattr(lora_request, name):
-                    return None
-                return getattr(lora_request, name)
-
-            def __setattr__(self, name: str, value: Any) -> None:
-                setattr(lora_request, name, value)
-
-        await add_lora(LoRARequest())  # type: ignore
+        # Some versions expect these attributes; set defaults if absent.
+        if not hasattr(lora_request, "lora_tensors"):
+            try:
+                setattr(lora_request, "lora_tensors", None)
+            except Exception:
+                pass
+        if not hasattr(lora_request, "lora_embeddings"):
+            try:
+                setattr(lora_request, "lora_embeddings", None)
+            except Exception:
+                pass
+        await _orig_add_lora(lora_request)  # type: ignore
 
     engine.add_lora = _add_lora