Skip to content

Commit e49d895

Browse files
author
wangxiaoxin-sherie
committed
Add FULL_DECODE_ONLY aclgraph e2e test for Qwen3-Next TP4; drop stale FIXME in update_attn_dcp_pcp_params
1 parent 63db91a commit e49d895

File tree

2 files changed

+15
-2
lines changed

2 files changed

+15
-2
lines changed

tests/e2e/multicard/test_qwen3_next.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,18 @@ def test_models_distributed_Qwen3_NEXT_TP4():
3636
distributed_executor_backend="mp",
3737
enforce_eager=True) as vllm_model:
3838
vllm_model.generate_greedy(example_prompts, max_tokens)
39+
40+
def test_models_distributed_Qwen3_NEXT_TP4_FULL_DECODE_ONLY():
    """E2E: Qwen3-Next 80B on 4 cards with graph capture in decode-only mode.

    Unlike the eager TP4 test above, this run sets enforce_eager=False and
    requests FULL_DECODE_ONLY graph mode with an explicit list of capture
    batch sizes, then checks that greedy generation completes.
    """
    prompts = ["Hello, my name is"] * 4
    max_tokens = 5
    # Capture sizes are chosen to cover the batch sizes the decode path
    # actually sees in this test — presumably matched to the runner's
    # padding behavior; confirm against vLLM's cudagraph capture logic.
    compilation_config = {
        "cudagraph_mode": "FULL_DECODE_ONLY",
        "cudagraph_capture_sizes": [1, 8, 24, 48, 60],
    }
    runner = VllmRunner(
        "Qwen/Qwen3-Next-80B-A3B-Instruct",
        tensor_parallel_size=4,
        max_model_len=4096,
        gpu_memory_utilization=0.8,
        distributed_executor_backend="mp",
        enforce_eager=False,
        compilation_config=compilation_config,
    )
    with runner as vllm_model:
        vllm_model.generate_greedy(prompts, max_tokens)

vllm_ascend/compilation/acl_graph.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,8 +290,6 @@ def update_mla_attn_params(update_stream, forward_context, runtime_shape,
290290

291291
def update_attn_dcp_pcp_params(update_stream, forward_context, runtime_shape):
292292
graph_params = get_graph_params()
293-
# FIXME: Behold! We are using a temporary hack here to update the args
294-
# for each layer's attention op in the graph.
295293
with torch.npu.stream(update_stream):
296294
for key, param, handle, event in zip(
297295
forward_context.attn_metadata,

0 commit comments

Comments
 (0)