Commit 39fabee

[CI] drop ascend scheduler test
Signed-off-by: wangxiyuan <[email protected]>
1 parent e8e20c0 commit 39fabee

27 files changed: +53 additions, -375 deletions
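
The change is mechanical across the touched tests: wherever a test passed an "ascend_scheduler_config" block to VllmRunner via additional_config, the block is dropped and the call keeps only its remaining arguments. Below is a minimal before/after sketch of the pattern, modeled on the expert-parallel hunk further down; the import path of the VllmRunner helper is an assumption, not something this diff shows.

# Illustrative sketch only. VllmRunner's import location is assumed (hypothetical);
# the argument shapes are taken from the hunks in this commit.
from tests.e2e.conftest import VllmRunner  # hypothetical import path


def run_before(model_name, example_prompts, max_tokens):
    # Pre-commit pattern: the test opts into the ascend scheduler explicitly.
    with VllmRunner(
            model_name,
            tensor_parallel_size=2,
            additional_config={"ascend_scheduler_config": {
                "enabled": True
            }},
            enforce_eager=False) as vllm_model:
        return vllm_model.generate_greedy(example_prompts, max_tokens)


def run_after(model_name, example_prompts, max_tokens):
    # Post-commit pattern: no additional_config, so the default vLLM scheduler is used.
    with VllmRunner(model_name, tensor_parallel_size=2,
                    enforce_eager=False) as vllm_model:
        return vllm_model.generate_greedy(example_prompts, max_tokens)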

tests/e2e/310p/test_offline_inference_parallel_310p.py

Lines changed: 0 additions & 3 deletions
@@ -29,9 +29,6 @@
     "additional_config": {
         "torchair_graph_config": {
             "enabled": True
-        },
-        "ascend_scheduler_config": {
-            "enabled": True,
         }
     }
 }]

tests/e2e/multicard/test_expert_parallel.py

Lines changed: 6 additions & 15 deletions
@@ -15,23 +15,14 @@ def test_e2e_ep_correctness(model_name):
     max_tokens = 5

     # FIXME: Really strange that chunked prefill might lead to different results, investigate further
-    with VllmRunner(
-            model_name,
-            tensor_parallel_size=2,
-            additional_config={"ascend_scheduler_config": {
-                "enabled": True
-            }},
-            enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name, tensor_parallel_size=2,
+                    enforce_eager=False) as vllm_model:
         tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(
-            model_name,
-            tensor_parallel_size=2,
-            enable_expert_parallel=True,
-            additional_config={"ascend_scheduler_config": {
-                "enabled": True
-            }},
-            enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name,
+                    tensor_parallel_size=2,
+                    enable_expert_parallel=True,
+                    enforce_eager=False) as vllm_model:
         ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     check_outputs_equal(

tests/e2e/multicard/test_fused_moe_allgather_ep.py

Lines changed: 2 additions & 14 deletions
@@ -49,13 +49,7 @@ def test_generate_with_allgather():
             tensor_parallel_size=2,
             max_model_len=1024,
             dtype="auto",
-            enable_expert_parallel=True,
-            additional_config={
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                    "chunked_prefill_enabled": False,
-                },
-            }) as vllm_model:
+            enable_expert_parallel=True) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)


@@ -76,11 +70,5 @@ def test_generate_with_alltoall():
             tensor_parallel_size=2,
             max_model_len=1024,
             dtype="auto",
-            enable_expert_parallel=True,
-            additional_config={
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                    "chunked_prefill_enabled": False,
-                },
-            }) as vllm_model:
+            enable_expert_parallel=True) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)

tests/e2e/multicard/test_offline_inference_distributed.py

Lines changed: 3 additions & 11 deletions
@@ -82,9 +82,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
                     "enabled": True,
                 },
                 "enable_multistream_moe": True,
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                },
                 "refresh": True,
             },
     ) as vllm_model:
@@ -154,14 +151,9 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
             quantization="ascend",
             enforce_eager=True,
             enable_expert_parallel=True,
-            additional_config={
-                "torchair_graph_config": {
-                    "enabled": False,
-                },
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                }
-            },
+            additional_config={"torchair_graph_config": {
+                "enabled": False,
+            }},
     ) as vllm_model:
         vllm_model.generate_greedy(prompts, max_tokens)

tests/e2e/multicard/test_prefix_caching.py

Lines changed: 0 additions & 64 deletions
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
         name_0="vllm_output",
         name_1="prefix_cache_output",
     )
-
-
-@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("max_tokens", [50])
-def test_prefix_cache_with_ascend_scheduler(model: str,
-                                            max_tokens: int) -> None:
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                            'enable_prefix_caching': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        prefix_cache_output = vllm_model.generate_greedy(
-            INPUT_PROMPTS, max_tokens)
-
-    # TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
-    # Disable it now. Fix it or drop the ascend scheduler in the future.
-    # with VllmRunner(model,
-    #                 additional_config={
-    #                     'ascend_scheduler_config': {
-    #                         'enabled': True,
-    #                         'enable_prefix_caching': True,
-    #                         "enable_chunked_prefill": True,
-    #                     },
-    #                 },
-    #                 enforce_eager=True,
-    #                 max_model_len=2048,
-    #                 tensor_parallel_size=2,
-    #                 gpu_memory_utilization=0.7) as vllm_model:
-    #     chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
-    #         INPUT_PROMPTS, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=vllm_output,
-        outputs_1_lst=prefix_cache_output,
-        name_0="vllm_output",
-        name_1="prefix_cache_output",
-    )
-
-    # check_outputs_equal(
-    #     outputs_0_lst=chunk_prefill_prefix_cache_output,
-    #     outputs_1_lst=prefix_cache_output,
-    #     name_0="chunk_prefill_prefix_cache_output",
-    #     name_1="prefix_cache_output",
-    # )

tests/e2e/multicard/test_qwen3_next.py

Lines changed: 0 additions & 6 deletions
@@ -89,12 +89,6 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
             gpu_memory_utilization=0.8,
             distributed_executor_backend="mp",
             enforce_eager=True,
-            additional_config={
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                    "enable_chunked_prefill": False
-                }
-            },
             speculative_config={
                 "method": "qwen3_next_mtp",
                 "num_speculative_tokens": 1

tests/e2e/multicard/test_torchair_graph_mode.py

Lines changed: 0 additions & 12 deletions
@@ -44,9 +44,6 @@ def _deepseek_torchair_test_fixture(
     kwargs = {}
     if not use_v1_schduler:
         kwargs = {
-            "ascend_scheduler_config": {
-                "enabled": True,
-            },
             "refresh": True,
         }
     additional_config.update(**kwargs)
@@ -120,9 +117,6 @@ def _pangu_torchair_test_fixture(

     # torchair is only work without chunked-prefill now
     kwargs = {
-        "ascend_scheduler_config": {
-            "enabled": True,
-        },
         "refresh": True,
     }
     additional_config.update(**kwargs)
@@ -185,9 +179,6 @@ def _qwen_torchair_test_fixture(
         "torchair_graph_config": {
             "enabled": False,
         },
-        "ascend_scheduler_config": {
-            "enabled": True,
-        },
         "refresh": True,
     }

@@ -244,9 +235,6 @@ def _deepseek_v2_lite_torchair_test_fixure(
     kwargs = {}
     if not use_v1_schduler:
         kwargs = {
-            "ascend_scheduler_config": {
-                "enable": True,
-            },
             "refresh": True,
         }
     additional_config.update(**kwargs)

tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py

Lines changed: 1 addition & 5 deletions
@@ -73,11 +73,7 @@ async def test_models(model: str, mode: str) -> None:
         "VLLM_RPC_TIMEOUT": "3600000",
         "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
     }
-    additional_config: dict[str, Any] = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
-    }
+    additional_config: dict[str, Any] = {}
     speculative_config = {
         "num_speculative_tokens": 2,
         "method": "deepseek_mtp"

tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py

Lines changed: 0 additions & 3 deletions
@@ -74,9 +74,6 @@ async def test_models(model: str) -> None:
         "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
     }
     additional_config = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
         "torchair_graph_config": {
             "enabled": True,
             "enable_multistream_moe": False,

tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py

Lines changed: 1 addition & 6 deletions
@@ -68,12 +68,7 @@
 async def test_models(model: str) -> None:
     port = get_open_port()
     env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
-    additional_config = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
-        "enable_weight_nz_layout": True
-    }
+    additional_config = {"enable_weight_nz_layout": True}
     server_args = [
         "--quantization", "ascend", "--reasoning-parser", "qwen3",
         "--tensor-parallel-size", "4", "--port",
