Commit fb032fb

whx-sjtu committed
add graph mode check
Signed-off-by: whx-sjtu <[email protected]>
1 parent 540ca57 commit fb032fb

File tree

3 files changed: +23 -6 lines

    vllm_ascend/attention/attention_v1.py
    vllm_ascend/attention/utils.py
    vllm_ascend/compilation/acl_graph.py

vllm_ascend/attention/attention_v1.py

Lines changed: 3 additions & 4 deletions
@@ -34,7 +34,8 @@
 from vllm.v1.kv_cache_interface import AttentionSpec

 from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
-                                         split_decodes_and_prefills)
+                                         split_decodes_and_prefills,
+                                         using_paged_attention)
 from vllm_ascend.compilation.acl_graph import (get_graph_params,
                                                update_graph_params_workspaces)
 from vllm_ascend.utils import (AscendDeviceType, get_ascend_device_type,

@@ -763,9 +764,7 @@ def forward_impl(
                 attn_metadata, output)
         else:
             num_tokens = query.shape[0]
-            if get_current_vllm_config(
-            ).speculative_config is None and attn_metadata.attn_state == AscendAttentionState.DecodeOnly and num_tokens in get_ascend_config(
-            ).pa_shape_list:
+            if using_paged_attention(attn_metadata.attn_state, num_tokens):
                 output = self.full_graph_attention_with_pa(
                     query, attn_metadata, output)
             else:
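Note: besides extracting the inline condition into a helper, the new predicate is stricter. The removed check only required that no speculative config was set, that the batch was DecodeOnly, and that num_tokens appeared in pa_shape_list; using_paged_attention() (defined in utils.py below) additionally requires the compilation config's cudagraph_mode to be CUDAGraphMode.FULL_DECODE_ONLY, which is the graph-mode check named in the commit message.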

vllm_ascend/attention/utils.py

Lines changed: 16 additions & 0 deletions
@@ -1,13 +1,29 @@
 from dataclasses import dataclass
+from functools import lru_cache
 from typing import Any, List, Optional

 import torch
 import torch.nn.functional as F
+from vllm.config import get_current_vllm_config
 from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                           has_kv_transfer_group,
                                           is_v1_kv_transfer_group)
 from vllm.forward_context import ForwardContext, get_forward_context

+from vllm_ascend.utils import get_ascend_config
+
+
+@lru_cache
+def using_paged_attention(attn_state, runtime_shape: int) -> bool:
+    vllm_config = get_current_vllm_config()
+    if vllm_config.speculative_config is not None:
+        return False
+    from vllm.config.compilation import CUDAGraphMode
+
+    from vllm_ascend.attention.attention_v1 import AscendAttentionState
+    return vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY and attn_state == AscendAttentionState.DecodeOnly and runtime_shape in get_ascend_config(
+    ).pa_shape_list
+

 @dataclass
 # class AscendCommonLongSequenceMetadata:
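For readers who want to poke at the new predicate in isolation, below is a minimal, runnable sketch of its decision logic with the vLLM and Ascend config objects replaced by hypothetical stand-ins. CUDAGraphMode, AscendAttentionState, and FakeConfig here are illustrative substitutes only; the real helper reads get_current_vllm_config() and get_ascend_config() instead.

    # Self-contained sketch of the check added in this commit. All classes
    # below are stand-ins for illustration, not the real vLLM/Ascend types.
    from dataclasses import dataclass, field
    from enum import Enum, auto
    from typing import List, Optional


    class CUDAGraphMode(Enum):          # stand-in for vllm.config.compilation.CUDAGraphMode
        NONE = auto()
        FULL_DECODE_ONLY = auto()


    class AscendAttentionState(Enum):   # stand-in for the Ascend attention state enum
        PrefillNoCache = auto()
        DecodeOnly = auto()


    @dataclass
    class FakeConfig:                   # stand-in for the merged vLLM/Ascend config
        speculative_config: Optional[object] = None
        cudagraph_mode: CUDAGraphMode = CUDAGraphMode.FULL_DECODE_ONLY
        pa_shape_list: List[int] = field(default_factory=lambda: [1, 8, 16])


    def using_paged_attention_sketch(cfg: FakeConfig,
                                     attn_state: AscendAttentionState,
                                     runtime_shape: int) -> bool:
        # Mirrors the new helper: no speculative decoding, full-decode-only
        # graph mode, a decode-only batch, and a shape captured for paged attention.
        if cfg.speculative_config is not None:
            return False
        return (cfg.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY
                and attn_state == AscendAttentionState.DecodeOnly
                and runtime_shape in cfg.pa_shape_list)


    cfg = FakeConfig()
    print(using_paged_attention_sketch(cfg, AscendAttentionState.DecodeOnly, 8))      # True
    print(using_paged_attention_sketch(cfg, AscendAttentionState.DecodeOnly, 7))      # False: shape not captured
    print(using_paged_attention_sketch(cfg, AscendAttentionState.PrefillNoCache, 8))  # False: not a decode-only batch

The real helper is additionally wrapped in functools.lru_cache, so for a given (attn_state, runtime_shape) pair the config lookups run once and later calls on the decode hot path return the memoized result.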

vllm_ascend/compilation/acl_graph.py

Lines changed: 4 additions & 2 deletions
@@ -19,7 +19,8 @@
 from vllm.logger import logger
 from vllm.platforms import current_platform

-from ..attention.utils import PAGED_ATTENTION_LIST
+from vllm_ascend.attention.utils import using_paged_attention
+
 from ..utils import weak_ref_tensors


@@ -296,7 +297,8 @@ def _update_attn_fia_params(update_stream, forward_context, runtime_shape):


 def update_attn_params(update_stream, forward_context, runtime_shape):
-    if runtime_shape in PAGED_ATTENTION_LIST:
+    if using_paged_attention(forward_context.attn_metadata.attn_state,
+                             runtime_shape):
         _update_attn_pa_params(update_stream, forward_context, runtime_shape)
     else:
         _update_attn_fia_params(update_stream, forward_context, runtime_shape)
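With this change the replay-time parameter update and the capture-time dispatch share one predicate: update_attn_params() now takes the paged-attention branch only when using_paged_attention() holds for the current attn_state and runtime_shape, rather than whenever runtime_shape appears in the removed module-level PAGED_ATTENTION_LIST, keeping it consistent with the path chosen in attention_v1.py.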

0 commit comments