Skip to content

Commit e49d895

Browse files
author
wangxiaoxin-sherie
committed
Add FULL_DECODE_ONLY aclgraph e2e test for Qwen3-Next TP4; drop stale FIXME in update_attn_dcp_pcp_params
1 parent 63db91a commit e49d895

File tree

2 files changed

+15
-2
lines changed

2 files changed

+15
-2
lines changed

tests/e2e/multicard/test_qwen3_next.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,18 @@ def test_models_distributed_Qwen3_NEXT_TP4():
3636
distributed_executor_backend="mp",
3737
enforce_eager=True) as vllm_model:
3838
vllm_model.generate_greedy(example_prompts, max_tokens)
39+
40+
def test_models_distributed_Qwen3_NEXT_TP4_FULL_DECODE_ONLY():
    """E2E: Qwen3-Next 80B on 4 cards with graph capture in decode-only mode.

    Unlike the eager TP4 test above, this run sets enforce_eager=False and
    requests FULL_DECODE_ONLY graph mode with an explicit list of capture
    batch sizes, then checks that greedy generation completes.
    """
    prompts = ["Hello, my name is"] * 4
    max_tokens = 5
    # Capture sizes are chosen to cover the batch sizes the decode path
    # actually sees in this test — presumably matched to the runner's
    # padding behavior; confirm against vLLM's cudagraph capture logic.
    compilation_config = {
        "cudagraph_mode": "FULL_DECODE_ONLY",
        "cudagraph_capture_sizes": [1, 8, 24, 48, 60],
    }
    runner = VllmRunner(
        "Qwen/Qwen3-Next-80B-A3B-Instruct",
        tensor_parallel_size=4,
        max_model_len=4096,
        gpu_memory_utilization=0.8,
        distributed_executor_backend="mp",
        enforce_eager=False,
        compilation_config=compilation_config,
    )
    with runner as vllm_model:
        vllm_model.generate_greedy(prompts, max_tokens)

vllm_ascend/compilation/acl_graph.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,8 +290,6 @@ def update_mla_attn_params(update_stream, forward_context, runtime_shape,
290290

291291
def update_attn_dcp_pcp_params(update_stream, forward_context, runtime_shape):
292292
graph_params = get_graph_params()
293-
# FIXME: Behold! We are using a temporary hack here to update the args
294-
# for each layer's attention op in the graph.
295293
with torch.npu.stream(update_stream):
296294
for key, param, handle, event in zip(
297295
forward_context.attn_metadata,

0 commit comments

Comments
 (0)