vllm-project · dragondream-chen · Dec 18, 2025
diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -288,7 +288,7 @@ def dummy_run(self,
         positions = self.positions[:num_tokens]
         previous_hidden_states = self.hidden_states[:num_tokens]
         for i in range(self.num_speculative_tokens):
-            if i > 0 and not in_graph_capturing and aclgraph_runtime_mode == CUDAGraphMode.FULL:
+            if i > 0 and in_graph_capturing and aclgraph_runtime_mode == CUDAGraphMode.FULL:
                 aclgraph_runtime_mode = CUDAGraphMode.NONE
             with set_ascend_forward_context(
                     attn_metadata,

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
@@ -2154,6 +2154,8 @@ def dummy_drafter_compute_logits(hidden_states):
                 dummy_compute_logits(hidden_states)
 
             if self.drafter:
+                # `in_graph_capturing` indicates whether the main model is in graph capturing.
+                # The value is only used in `mtp_proposer.py` currently and defaults to False.
                 self.drafter.dummy_run(
                     num_tokens=num_tokens_padded,
                     with_prefill=with_prefill,
@@ -2162,7 +2164,7 @@ def dummy_drafter_compute_logits(hidden_states):
                     aclgraph_runtime_mode=aclgraph_runtime_mode,
                     batch_descriptor=batch_descriptor,
                     dummy_compute_logits=dummy_drafter_compute_logits,
-                    in_graph_capturing=not force_attention)
+                    in_graph_capturing=force_attention)
             if is_profile and self.dynamic_eplb:
                 self.model.clear_all_moe_loads()
             if not is_profile and self.dynamic_eplb: