
Commit bfafe30

[CI] refactor e2e tests (#4799)
### What this PR does / why we need it?

This PR updates the CI configuration and adjusts a set of end-to-end (e2e) tests under tests/e2e/multicard, in order to refactor the test suite and ensure compatibility with the current codebase and CI workflows.

1. tests/e2e/multicard/test_prefix_caching.py: change the model to Qwen3-8B and rename the test case
2. tests/e2e/multicard/test_quantization.py: rename the test case
3. tests/e2e/multicard/test_qwen3_moe.py: remove a duplicate test and rename the test cases
4. tests/e2e/multicard/test_qwen3_next.py: rename the test cases, switch the W8A8 pruning model to the W8A8 model, and remove the eager parameter
5. tests/e2e/multicard/test_shared_expert_dp.py: rename the test case and remove the eager parameter
6. tests/e2e/multicard/test_single_request_aclgraph.py: rename the test case and change Qwen3-30B to Qwen3-0.6B
7. tests/e2e/multicard/test_torchair_graph_mode.py: delete the torchair test cases

- vLLM version: v0.12.0
- vLLM main: vllm-project/vllm@ad32e3e

Signed-off-by: hfadzxy <[email protected]>
1 parent a6ef3ac commit bfafe30
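
For orientation, the renamed multicard tests all share the same offline-inference shape: open a `VllmRunner` from `tests.e2e.conftest` with a tensor-parallel size and greedily generate a few tokens for a handful of prompts. Below is a minimal sketch of that shape; the model name and keyword arguments are inferred from the diffs that follow rather than copied from any single test.

```python
# Hedged sketch of the common e2e test pattern touched by this PR. VllmRunner
# and generate_greedy come from the repo's test helpers; the model name and
# keyword arguments here are illustrative, inferred from the diffs below.
from tests.e2e.conftest import VllmRunner


def test_example_model_distributed_tp2():
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    # Two cards via tensor parallelism, matching the multicard CI jobs.
    with VllmRunner(
            "Qwen/Qwen3-0.6B",
            tensor_parallel_size=2,
            gpu_memory_utilization=0.7,
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
```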

File tree

8 files changed: +30, -66 lines changed

.github/workflows/_e2e_test.yaml

Lines changed: 2 additions & 3 deletions

@@ -197,8 +197,8 @@ jobs:
   pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_flashcomm_v1
   pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight

-  pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
   pytest -sv tests/e2e/multicard/test_prefix_caching.py
+  pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
   pytest -sv tests/e2e/multicard/test_qwen3_moe.py
   pytest -sv tests/e2e/multicard/test_offline_weight_load.py

@@ -266,9 +266,8 @@ jobs:
   pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
   pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
   pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Kimi_K2_Thinking_W4A16
-  # pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_TP2_WITH_EP
-  # pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_W8A8_WITH_EP
   pytest -sv tests/e2e/multicard/test_data_parallel_tp2.py
+
 - name: Install Ascend toolkit & triton_ascend (for Qwen3-Next-80B-A3B-Instruct)
   shell: bash -l {0}
   run: |

tests/e2e/multicard/test_pipeline_parallel.py

Lines changed: 1 addition & 1 deletion

@@ -44,4 +44,4 @@ def test_models_pp2(model: str, tp_size: int, pp_size: int,
                 pipeline_parallel_size=pp_size,
                 distributed_executor_backend=distributed_executor_backend,
                 gpu_memory_utilization=0.7) as vllm_model:
-        vllm_model.generate_greedy(prompts, 64)
+        vllm_model.generate_greedy(prompts, 64)

tests/e2e/multicard/test_prefix_caching.py

Lines changed: 3 additions & 5 deletions

@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Compare the with and without prefix caching on V1 scheduler or AscendScheduler."""
+"""Compare the with and without prefix caching."""

 import pytest

@@ -9,7 +9,7 @@

 MODELS = [
     # for MHA
-    "Qwen/Qwen3-8B-Base",
+    "Qwen/Qwen3-8B",
     # for MLA
     "deepseek-ai/DeepSeek-V2-Lite-Chat"
 ]

@@ -60,9 +60,8 @@

 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [50])
-def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
+def test_models_prefix_cache_tp2(model: str, max_tokens: int) -> None:
     with VllmRunner(model,
-                    enforce_eager=False,
                     max_model_len=2048,
                     tensor_parallel_size=2,
                     gpu_memory_utilization=0.7) as vllm_model:

@@ -71,7 +70,6 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:

     with VllmRunner(model,
                     enable_prefix_caching=False,
-                    enforce_eager=False,
                     max_model_len=2048,
                     tensor_parallel_size=2,
                     gpu_memory_utilization=0.7) as vllm_model:
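
The renamed `test_models_prefix_cache_tp2` runs the same prompts with prefix caching enabled (the default) and disabled, then compares the generations. A hedged sketch of that comparison, reusing the `VllmRunner` arguments visible in the diff; the prompts and the final assertion are illustrative, not the file's actual check.

```python
# Illustrative comparison of greedy outputs with and without prefix caching.
# VllmRunner arguments mirror the diff above; the equality assertion is a
# stand-in for whatever comparison helper the real test uses.
from tests.e2e.conftest import VllmRunner

PROMPTS = ["The capital of France is"] * 4  # placeholder prompts


def run_prefix_cache_comparison(model: str = "Qwen/Qwen3-8B",
                                max_tokens: int = 50) -> None:
    with VllmRunner(model,
                    max_model_len=2048,
                    tensor_parallel_size=2,
                    gpu_memory_utilization=0.7) as vllm_model:
        with_cache = vllm_model.generate_greedy(PROMPTS, max_tokens)

    with VllmRunner(model,
                    enable_prefix_caching=False,
                    max_model_len=2048,
                    tensor_parallel_size=2,
                    gpu_memory_utilization=0.7) as vllm_model:
        without_cache = vllm_model.generate_greedy(PROMPTS, max_tokens)

    # Greedy decoding should not change when prefixes are served from cache.
    assert with_cache == without_cache
```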

tests/e2e/multicard/test_quantization.py

Lines changed: 7 additions & 6 deletions

@@ -25,16 +25,17 @@
 from tests.e2e.conftest import VllmRunner


-def test_models_distributed_quantized_W8A8():
+def test_qwen2_5_w8a8_external_quantized_tp2():
     example_prompts = [
         "The president of the United States is",
     ]
     max_tokens = 5
-    with VllmRunner(snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
-                    tensor_parallel_size=2,
-                    max_model_len=4096,
-                    gpu_memory_utilization=0.8,
-                    enforce_eager=False) as vllm_model:
+    with VllmRunner(
+            snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
+            tensor_parallel_size=2,
+            max_model_len=4096,
+            gpu_memory_utilization=0.8,
+    ) as vllm_model:
         vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     golden_results = [
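
The hunk ends right before `golden_results`: the quantization test pins a short greedy continuation against fixed expected text. A hedged sketch of that golden-output pattern follows; the golden string is a placeholder, and the `modelscope` import is an assumption about which `snapshot_download` the repo uses.

```python
# Sketch of the golden-output check; the golden string is a placeholder and
# the snapshot_download import is assumed (modelscope), not read from the file.
from modelscope import snapshot_download

from tests.e2e.conftest import VllmRunner


def run_w8a8_golden_check() -> None:
    example_prompts = ["The president of the United States is"]
    max_tokens = 5
    with VllmRunner(
            snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
            tensor_parallel_size=2,
            max_model_len=4096,
            gpu_memory_utilization=0.8,
    ) as vllm_model:
        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    golden_results = ["<expected five-token continuation>"]  # placeholder
    for (_, generated_text), golden in zip(vllm_output, golden_results):
        # generate_greedy is assumed to return (token_ids, text) pairs, as in
        # vLLM's own test runners; the real test compares exact strings.
        print(f"generated={generated_text!r} expected={golden!r}")
```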

tests/e2e/multicard/test_qwen3_moe.py

Lines changed: 4 additions & 36 deletions

@@ -29,21 +29,8 @@
 from tests.e2e.conftest import VllmRunner


-def test_models_distributed_Qwen3_MOE_TP2():
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    max_tokens = 5
-    with VllmRunner(
-            "Qwen/Qwen3-30B-A3B",
-            tensor_parallel_size=2,
-            distributed_executor_backend="mp",
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
-def test_models_distributed_Qwen3_MOE_TP2_WITH_EP():
+def test_qwen3_moe_distributed_mp_tp2_ep():
     example_prompts = [
         "Hello, my name is",
     ]

@@ -53,12 +40,11 @@ def test_models_distributed_Qwen3_MOE_TP2_WITH_EP():
             tensor_parallel_size=2,
             enable_expert_parallel=True,
             distributed_executor_backend="mp",
-            enforce_eager=False,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)


-def test_models_distributed_Qwen3_MOE_W8A8():
+def test_qwen3_moe_w8a8_distributed_tp2():
     example_prompts = [
         "Hello, my name is",
     ]

@@ -73,7 +59,7 @@ def test_models_distributed_Qwen3_MOE_W8A8():


 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
-def test_models_distributed_Qwen3_MOE_W8A8_WITH_EP():
+def test_qwen3_moe_w8a8_distributed_tp2_ep():
     example_prompts = [
         "Hello, my name is",
     ]

@@ -88,7 +74,7 @@ def test_models_distributed_Qwen3_MOE_W8A8_WITH_EP():
         vllm_model.generate_greedy(example_prompts, max_tokens)


-def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH_AIV():
+def test_qwen3_moe_distributed_aiv_tp2():
     os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
     example_prompts = [
         "Hello, my name is",

@@ -99,23 +85,5 @@ def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH_AIV():
             "Qwen/Qwen3-30B-A3B",
             dtype=dtype,
             tensor_parallel_size=2,
-            enforce_eager=False,
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
-def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH():
-    if 'HCCL_OP_EXPANSION_MODE' in os.environ:
-        del os.environ['HCCL_OP_EXPANSION_MODE']
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    dtype = "auto"
-    max_tokens = 5
-    with VllmRunner(
-            "Qwen/Qwen3-30B-A3B",
-            dtype=dtype,
-            tensor_parallel_size=2,
-            enforce_eager=False,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
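
The surviving MoE tests keep the pattern of pinning `HCCL_BUFFSIZE` for the expert-parallel variants while dropping `enforce_eager`. A sketch of that pattern, assembled from the hunks above (decorator, environment key, and `VllmRunner` arguments are as shown in the diff).

```python
# Expert-parallel MoE test shape from the diff: HCCL_BUFFSIZE is patched into
# the environment only for the duration of the test, and graph mode is left at
# its default now that enforce_eager has been removed.
import os
from unittest.mock import patch

from tests.e2e.conftest import VllmRunner


@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
def test_qwen3_moe_distributed_mp_tp2_ep_sketch():
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
            "Qwen/Qwen3-30B-A3B",
            tensor_parallel_size=2,
            enable_expert_parallel=True,
            distributed_executor_backend="mp",
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
```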

tests/e2e/multicard/test_qwen3_next.py

Lines changed: 7 additions & 10 deletions

@@ -29,7 +29,7 @@
 from tests.e2e.conftest import VllmRunner


-def test_models_distributed_Qwen3_NEXT_TP4():
+def test_qwen3_next_distributed_mp_tp4():
     example_prompts = [
         "Hello, my name is",
     ] * 4

@@ -38,13 +38,12 @@ def test_models_distributed_Qwen3_NEXT_TP4():
                     tensor_parallel_size=4,
                     max_model_len=4096,
                     gpu_memory_utilization=0.8,
-                    distributed_executor_backend="mp",
-                    enforce_eager=True) as vllm_model:
+                    distributed_executor_backend="mp") as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
         del vllm_model


-def test_models_distributed_Qwen3_NEXT_TP4_FULL_DECODE_ONLY():
+def test_qwen3_next_distributed_mp_full_decode_only_tp4():
     example_prompts = [
         "Hello, my name is",
     ] * 4

@@ -54,7 +53,6 @@
             max_model_len=4096,
             gpu_memory_utilization=0.8,
             distributed_executor_backend="mp",
-            enforce_eager=False,
             compilation_config={
                 "cudagraph_mode": "FULL_DECODE_ONLY",
                 "cudagraph_capture_sizes": [1, 8, 24, 48, 60]

@@ -64,7 +62,7 @@


 # TODO: Fix the accuary of batch chunked prefill
-def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
+def test_qwen3_next_distributed_mp_eager_mtp_similarity_tp4():
     example_prompts = ["Hello, my name is"]
     max_tokens = 20


@@ -110,16 +108,15 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():

 # TODO: will conduct accuracy verification after the subsequent version becomes stable
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
-def test_models_distributed_Qwen3_NEXT_W8A8DYNAMIC_WITH_EP():
+def test_qwen3_next_w8a8dynamic_distributed_tp4_ep():
     example_prompts = [
         "Hello, my name is",
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download(
-                "vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8-Pruning"),
+            snapshot_download("vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8"),
             max_model_len=4096,
-            tensor_parallel_size=2,
+            tensor_parallel_size=4,
             gpu_memory_utilization=0.4,
             max_num_seqs=1,
             enable_expert_parallel=True,
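
With `enforce_eager` removed, the decode-only Qwen3-Next test steers graph capture entirely through `compilation_config`. A hedged sketch of that configuration follows, with values taken from the hunks above; the model identifier is an assumption for illustration.

```python
# Graph-capture configuration from the diff: ACL graph capture is limited to
# decode ("FULL_DECODE_ONLY") and to a fixed set of batch sizes. The model id
# below is assumed for illustration only.
from tests.e2e.conftest import VllmRunner


def run_qwen3_next_full_decode_only_tp4() -> None:
    example_prompts = ["Hello, my name is"] * 4
    max_tokens = 5
    with VllmRunner(
            "Qwen/Qwen3-Next-80B-A3B-Instruct",  # assumed model id
            tensor_parallel_size=4,
            max_model_len=4096,
            gpu_memory_utilization=0.8,
            distributed_executor_backend="mp",
            compilation_config={
                "cudagraph_mode": "FULL_DECODE_ONLY",
                "cudagraph_capture_sizes": [1, 8, 24, 48, 60],
            },
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
```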

tests/e2e/multicard/test_shared_expert_dp.py

Lines changed: 3 additions & 3 deletions

@@ -7,13 +7,13 @@
 from tests.e2e.model_utils import check_outputs_equal

 MODELS = [
-    "vllm-ascend/DeepSeek-V2-Lite",
+    "deepseek-ai/DeepSeek-V2-Lite",
 ]
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"


 @pytest.mark.parametrize("model", MODELS)
-def test_models_with_enable_shared_expert_dp(model: str) -> None:
+def test_deepseek_v2_lite_enable_shared_expert_dp_tp2(model: str) -> None:

     if 'HCCL_OP_EXPANSION_MODE' in os.environ:
         del os.environ['HCCL_OP_EXPANSION_MODE']

@@ -51,7 +51,7 @@ def test_models_with_enable_shared_expert_dp(model: str) -> None:
         model,
         max_model_len=1024,
         tensor_parallel_size=2,
-        enforce_eager=False,
+        enable_expert_parallel=True,
         compilation_config={
             "cudagraph_capture_sizes": [1, 4, 8, 16],
             "cudagraph_mode": "FULL_DECODE_ONLY",

tests/e2e/multicard/test_single_request_aclgraph.py

Lines changed: 3 additions & 2 deletions

@@ -23,7 +23,7 @@

 from tests.e2e.conftest import RemoteOpenAIServer

-MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
+MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]

 DATA_PARALLELS = [2]

@@ -39,7 +39,8 @@
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dp_size", DATA_PARALLELS)
-async def test_single_request_aclgraph(model: str, dp_size: int) -> None:
+async def test_models_single_request_aclgraph_dp2(model: str,
+                                                  dp_size: int) -> None:
     port = get_open_port()
     env_dict = {
         "TASK_QUEUE_ENABLE": "1",
