[Bugfix] add force_attention comment

dragondream-chen · yiz-liu · commit f3bf1921fd32 · 2025-12-17T16:24:37.000+08:00
Signed-off-by: chenmenglong <chenmenglong1@huawei.com>
diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -288,7 +288,7 @@ def dummy_run(self,
         positions = self.positions[:num_tokens]
         previous_hidden_states = self.hidden_states[:num_tokens]
         for i in range(self.num_speculative_tokens):
-            if i > 0 and not in_graph_capturing and aclgraph_runtime_mode == CUDAGraphMode.FULL:
+            if i > 0 and in_graph_capturing and aclgraph_runtime_mode == CUDAGraphMode.FULL:
                 aclgraph_runtime_mode = CUDAGraphMode.NONE
             with set_ascend_forward_context(
                     attn_metadata,
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
@@ -2148,6 +2148,8 @@ def dummy_drafter_compute_logits(hidden_states):
                 dummy_compute_logits(hidden_states)
 
             if self.drafter:
+                # `in_graph_capturing` indicates whether the main model is in the graph-capturing phase.
+                # The value is currently only used in `mtp_proposer.py` and defaults to False.
                 self.drafter.dummy_run(
                     num_tokens=num_tokens_padded,
                     with_prefill=with_prefill,
@@ -2156,7 +2158,7 @@ def dummy_drafter_compute_logits(hidden_states):
                     aclgraph_runtime_mode=aclgraph_runtime_mode,
                     batch_descriptor=batch_descriptor,
                     dummy_compute_logits=dummy_drafter_compute_logits,
-                    in_graph_capturing=not force_attention)
+                    in_graph_capturing=force_attention)
             if self.in_profile_run and self.dynamic_eplb:
                 self.model.clear_all_moe_loads()
             if not self.in_profile_run and self.dynamic_eplb: