
Commit 2bc1df6

wangxiyuan authored and mercykid committed
[CI] drop ascend scheduler test (vllm-project#4582)
Let's drop the ascend scheduler test first to ensure all functions work without it.

- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2

Signed-off-by: wangxiyuan <[email protected]>
Signed-off-by: Che Ruan <[email protected]>
1 parent bf27046 commit 2bc1df6

28 files changed: +53 −376 lines
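For context, the pattern removed across these diffs is the Ascend scheduler being switched on through VllmRunner's additional_config. The sketch below is a minimal illustration reconstructed from the deleted test lines; the import path, model name, and prompts are assumptions for the example only and are not part of this commit.

    # Minimal sketch of the configuration this commit drops from the e2e tests.
    # The additional_config payload mirrors the deleted lines in
    # tests/e2e/multicard/test_expert_parallel.py; the import path, model name,
    # and prompts below are illustrative assumptions.
    from tests.e2e.conftest import VllmRunner  # assumed import path

    example_prompts = ["Hello, my name is"]
    max_tokens = 5

    with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",  # illustrative model
                    tensor_parallel_size=2,
                    additional_config={"ascend_scheduler_config": {
                        "enabled": True  # the knob being removed
                    }},
                    enforce_eager=False) as vllm_model:
        output = vllm_model.generate_greedy(example_prompts, max_tokens)

After this change the same tests construct VllmRunner without the ascend_scheduler_config entry and fall back to vLLM's default scheduler.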

.github/workflows/_e2e_test.yaml

Lines changed: 0 additions & 1 deletion
@@ -94,7 +94,6 @@ jobs:
   pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
   pytest -sv tests/e2e/singlecard/test_bge_model.py
   pytest -sv tests/e2e/singlecard/test_camem.py
-  pytest -sv tests/e2e/singlecard/test_chunked.py
   pytest -sv tests/e2e/singlecard/test_embedding.py
   # pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
   pytest -sv tests/e2e/singlecard/test_guided_decoding.py

tests/e2e/310p/test_offline_inference_parallel_310p.py

Lines changed: 0 additions & 3 deletions
@@ -29,9 +29,6 @@
     "additional_config": {
         "torchair_graph_config": {
             "enabled": True
-        },
-        "ascend_scheduler_config": {
-            "enabled": True,
         }
     }
 }]

tests/e2e/multicard/test_expert_parallel.py

Lines changed: 6 additions & 15 deletions
@@ -15,23 +15,14 @@ def test_e2e_ep_correctness(model_name):
     max_tokens = 5

     # FIXME: Really strange that chunked prefill might lead to different results, investigate further
-    with VllmRunner(
-            model_name,
-            tensor_parallel_size=2,
-            additional_config={"ascend_scheduler_config": {
-                "enabled": True
-            }},
-            enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name, tensor_parallel_size=2,
+                    enforce_eager=False) as vllm_model:
         tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(
-            model_name,
-            tensor_parallel_size=2,
-            enable_expert_parallel=True,
-            additional_config={"ascend_scheduler_config": {
-                "enabled": True
-            }},
-            enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name,
+                    tensor_parallel_size=2,
+                    enable_expert_parallel=True,
+                    enforce_eager=False) as vllm_model:
         ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     check_outputs_equal(

tests/e2e/multicard/test_fused_moe_allgather_ep.py

Lines changed: 2 additions & 14 deletions
@@ -49,13 +49,7 @@ def test_generate_with_allgather():
             tensor_parallel_size=2,
             max_model_len=1024,
             dtype="auto",
-            enable_expert_parallel=True,
-            additional_config={
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                    "chunked_prefill_enabled": False,
-                },
-            }) as vllm_model:
+            enable_expert_parallel=True) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)


@@ -76,11 +70,5 @@ def test_generate_with_alltoall():
             tensor_parallel_size=2,
             max_model_len=1024,
             dtype="auto",
-            enable_expert_parallel=True,
-            additional_config={
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                    "chunked_prefill_enabled": False,
-                },
-            }) as vllm_model:
+            enable_expert_parallel=True) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)

tests/e2e/multicard/test_offline_inference_distributed.py

Lines changed: 3 additions & 11 deletions
@@ -82,9 +82,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
                     "enabled": True,
                 },
                 "enable_multistream_moe": True,
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                },
                 "refresh": True,
             },
     ) as vllm_model:
@@ -154,14 +151,9 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
             quantization="ascend",
             enforce_eager=True,
             enable_expert_parallel=True,
-            additional_config={
-                "torchair_graph_config": {
-                    "enabled": False,
-                },
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                }
-            },
+            additional_config={"torchair_graph_config": {
+                "enabled": False,
+            }},
     ) as vllm_model:
         vllm_model.generate_greedy(prompts, max_tokens)

tests/e2e/multicard/test_prefix_caching.py

Lines changed: 0 additions & 64 deletions
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
         name_0="vllm_output",
         name_1="prefix_cache_output",
     )
-
-
-@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("max_tokens", [50])
-def test_prefix_cache_with_ascend_scheduler(model: str,
-                                            max_tokens: int) -> None:
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                            'enable_prefix_caching': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        prefix_cache_output = vllm_model.generate_greedy(
-            INPUT_PROMPTS, max_tokens)
-
-    # TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
-    # Disable it now. Fix it or drop the ascend scheduler in the future.
-    # with VllmRunner(model,
-    #                 additional_config={
-    #                     'ascend_scheduler_config': {
-    #                         'enabled': True,
-    #                         'enable_prefix_caching': True,
-    #                         "enable_chunked_prefill": True,
-    #                     },
-    #                 },
-    #                 enforce_eager=True,
-    #                 max_model_len=2048,
-    #                 tensor_parallel_size=2,
-    #                 gpu_memory_utilization=0.7) as vllm_model:
-    #     chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
-    #         INPUT_PROMPTS, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=vllm_output,
-        outputs_1_lst=prefix_cache_output,
-        name_0="vllm_output",
-        name_1="prefix_cache_output",
-    )
-
-    # check_outputs_equal(
-    #     outputs_0_lst=chunk_prefill_prefix_cache_output,
-    #     outputs_1_lst=prefix_cache_output,
-    #     name_0="chunk_prefill_prefix_cache_output",
-    #     name_1="prefix_cache_output",
-    # )

tests/e2e/multicard/test_qwen3_next.py

Lines changed: 0 additions & 6 deletions
@@ -89,12 +89,6 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
         gpu_memory_utilization=0.8,
         distributed_executor_backend="mp",
         enforce_eager=True,
-        additional_config={
-            "ascend_scheduler_config": {
-                "enabled": True,
-                "enable_chunked_prefill": False
-            }
-        },
         speculative_config={
             "method": "qwen3_next_mtp",
             "num_speculative_tokens": 1

tests/e2e/multicard/test_torchair_graph_mode.py

Lines changed: 0 additions & 12 deletions
@@ -44,9 +44,6 @@ def _deepseek_torchair_test_fixture(
     kwargs = {}
     if not use_v1_schduler:
         kwargs = {
-            "ascend_scheduler_config": {
-                "enabled": True,
-            },
             "refresh": True,
         }
     additional_config.update(**kwargs)
@@ -121,9 +118,6 @@ def _pangu_torchair_test_fixture(

     # torchair is only work without chunked-prefill now
     kwargs = {
-        "ascend_scheduler_config": {
-            "enabled": True,
-        },
         "refresh": True,
     }
     additional_config.update(**kwargs)
@@ -186,9 +180,6 @@ def _qwen_torchair_test_fixture(
         "torchair_graph_config": {
             "enabled": False,
         },
-        "ascend_scheduler_config": {
-            "enabled": True,
-        },
         "refresh": True,
     }

@@ -245,9 +236,6 @@ def _deepseek_v2_lite_torchair_test_fixure(
     kwargs = {}
     if not use_v1_schduler:
         kwargs = {
-            "ascend_scheduler_config": {
-                "enable": True,
-            },
             "refresh": True,
         }
     additional_config.update(**kwargs)

tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py

Lines changed: 1 addition & 5 deletions
@@ -73,11 +73,7 @@ async def test_models(model: str, mode: str) -> None:
         "VLLM_RPC_TIMEOUT": "3600000",
         "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
     }
-    additional_config: dict[str, Any] = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
-    }
+    additional_config: dict[str, Any] = {}
     speculative_config = {
         "num_speculative_tokens": 2,
         "method": "deepseek_mtp"

tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py

Lines changed: 0 additions & 3 deletions
@@ -74,9 +74,6 @@ async def test_models(model: str) -> None:
         "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
     }
     additional_config = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
         "torchair_graph_config": {
             "enabled": True,
             "enable_multistream_moe": False,
