Commit 165259d
refactor: Defer and centralize ACL graph parameter initialization

Moves the setting of ACL graph parameters (`set_graph_params` and `set_mtp_graph_params`) from the initial setup of the model runner and proposer to a later point in the initialization lifecycle. This change ensures that the graph parameters are configured using the definitive `aclgraph_batch_sizes`, which are only determined just before profiling preparation. It also centralizes this configuration logic within the `NPUModelRunner`.

Signed-off-by: Yizhou Liu <[email protected]>
1 parent: 310e960
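In effect, the two calls move out of `load_model` (in both the model runner and the MTP proposer) and into `NPUModelRunner.initialize_aclgraph_capture`, as the diffs below show. A runnable toy sketch of the new order; everything here is a stand-in, not the real vllm-ascend classes, and only the call order and the set-once guard mirror the commit:

# Toy stand-ins; only the call order and the guard behavior mirror the commit.
from typing import Optional

_graph_params: Optional[list[int]] = None


def set_graph_params(aclgraph_capture_sizes: list[int]) -> None:
    global _graph_params
    if _graph_params is not None:
        raise ValueError("Graph parameters have already been set!")
    _graph_params = aclgraph_capture_sizes


class Runner:
    def load_model(self) -> None:
        # Before the commit, set_graph_params() ran here, using the
        # preliminary cudagraph_capture_sizes. After the commit it does not.
        pass

    def initialize_aclgraph_capture(self) -> None:
        # The definitive sizes are only known at this point.
        self.aclgraph_batch_sizes = [1, 2, 4, 8]  # hypothetical values
        set_graph_params(self.aclgraph_batch_sizes)


runner = Runner()
runner.load_model()
runner.initialize_aclgraph_capture()
print(_graph_params)  # [1, 2, 4, 8]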

3 files changed: +9 -6 lines changed

vllm_ascend/compilation/acl_graph.py (2 additions & 2 deletions)

@@ -427,7 +427,7 @@ class GraphParams:
 _graph_params: Optional[GraphParams] = None
 
 
-def set_graph_params(aclgraph_capture_sizes: set[int]):
+def set_graph_params(aclgraph_capture_sizes: list[int]):
     global _graph_params
     if _graph_params is not None:
         raise ValueError("Graph parameters have already been set!")
@@ -456,7 +456,7 @@ def get_graph_params():
 _mtp_graph_params: Optional[GraphParams] = None
 
 
-def set_mtp_graph_params(aclgraph_capture_sizes: set[int]):
+def set_mtp_graph_params(aclgraph_capture_sizes: list[int]):
     global _mtp_graph_params
     if _mtp_graph_params is not None:
         raise ValueError("MTPGraph parameters have already been set!")
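Both setters follow the same set-once contract, and the second hunk header shows the matching `get_graph_params()` accessor. A minimal usage sketch of that contract; the sizes are hypothetical, while the error message comes directly from the diff:

from vllm_ascend.compilation.acl_graph import (get_graph_params,
                                               set_graph_params)

set_graph_params([1, 2, 4, 8])         # first call wins; hypothetical sizes
assert get_graph_params() is not None  # later readers fetch the same object

try:
    set_graph_params([1, 2, 4, 8])     # second call is rejected
except ValueError as err:
    print(err)  # -> Graph parameters have already been set!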

vllm_ascend/spec_decode/mtp_proposer.py (0 additions & 3 deletions)

@@ -31,7 +31,6 @@
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
-                                               set_mtp_graph_params,
                                                update_mla_attn_params)
 from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType
 from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable,
@@ -213,8 +212,6 @@ def load_model(self, model) -> None:
         if self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs(
         ):
             self.update_stream: torch.npu.Stream = torch.npu.Stream()
-            set_mtp_graph_params(
-                self.vllm_config.compilation_config.cudagraph_capture_sizes)
             self.model = ACLGraphWrapper(self.model,
                                          self.vllm_config,
                                          runtime_mode=CUDAGraphMode.FULL)

vllm_ascend/worker/model_runner_v1.py (7 additions & 1 deletion)

@@ -123,6 +123,7 @@
 # yapf: disable
 from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
                                                set_graph_params,
+                                               set_mtp_graph_params,
                                                update_attn_dcp_pcp_params,
                                                update_attn_params,
                                                update_mla_attn_dcp_pcp_params,
@@ -3368,7 +3369,6 @@ def load_model(self) -> None:
         # wrap the model with full graph wrapper if needed.
         if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
             self.update_stream: torch.npu.Stream = torch.npu.Stream()
-            set_graph_params(self.compilation_config.cudagraph_capture_sizes)
             self.model = ACLGraphWrapper(self.model,
                                          self.vllm_config,
                                          runtime_mode=CUDAGraphMode.FULL)
@@ -4087,6 +4087,12 @@ def initialize_aclgraph_capture(self) -> None:
         self.aclgraph_batch_sizes = (capture_sizes
                                      if capture_sizes is not None else [])
 
+        # NOTE: Since aclgraph_batch_sizes cannot be determined until here,
+        # we set the graph params right before initializing the keys.
+        set_graph_params(self.aclgraph_batch_sizes)
+        if self.speculative_config:
+            set_mtp_graph_params(self.aclgraph_batch_sizes)
+
         self.aclgraph_dispatcher.initialize_cudagraph_keys(
             self.compilation_config.cudagraph_mode,
             self.uniform_decode_query_len)
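The deferral also turns the set-once guard into an ordering check: if `load_model` still populated the params, `initialize_aclgraph_capture` would now raise. A hypothetical test-style sketch of the intended order; it assumes `get_graph_params()` returns `None` until the setter runs, consistent with the module default `_graph_params: Optional[GraphParams] = None` shown in the first diff:

# Hypothetical ordering check; constructing a real NPUModelRunner is omitted.
runner.load_model()
assert get_graph_params() is None        # no longer populated at load time

runner.initialize_aclgraph_capture()
assert get_graph_params() is not None    # set with the definitive batch sizes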
