1 change: 0 additions & 1 deletion .github/workflows/_e2e_test.yaml
@@ -94,7 +94,6 @@ jobs:
pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
pytest -sv tests/e2e/singlecard/test_bge_model.py
pytest -sv tests/e2e/singlecard/test_camem.py
- pytest -sv tests/e2e/singlecard/test_chunked.py
pytest -sv tests/e2e/singlecard/test_embedding.py
# pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
3 changes: 0 additions & 3 deletions tests/e2e/310p/test_offline_inference_parallel_310p.py
@@ -29,9 +29,6 @@
"additional_config": {
"torchair_graph_config": {
"enabled": True
- },
- "ascend_scheduler_config": {
- "enabled": True,
}
}
}]
21 changes: 6 additions & 15 deletions tests/e2e/multicard/test_expert_parallel.py
@@ -15,23 +15,14 @@ def test_e2e_ep_correctness(model_name):
max_tokens = 5

# FIXME: Really strange that chunked prefill might lead to different results, investigate further
- with VllmRunner(
- model_name,
- tensor_parallel_size=2,
- additional_config={"ascend_scheduler_config": {
- "enabled": True
- }},
- enforce_eager=False) as vllm_model:
+ with VllmRunner(model_name, tensor_parallel_size=2,
+ enforce_eager=False) as vllm_model:
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

- with VllmRunner(
- model_name,
- tensor_parallel_size=2,
- enable_expert_parallel=True,
- additional_config={"ascend_scheduler_config": {
- "enabled": True
- }},
- enforce_eager=False) as vllm_model:
+ with VllmRunner(model_name,
+ tensor_parallel_size=2,
+ enable_expert_parallel=True,
+ enforce_eager=False) as vllm_model:
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

check_outputs_equal(
16 changes: 2 additions & 14 deletions tests/e2e/multicard/test_fused_moe_allgather_ep.py
@@ -49,13 +49,7 @@ def test_generate_with_allgather():
tensor_parallel_size=2,
max_model_len=1024,
dtype="auto",
- enable_expert_parallel=True,
- additional_config={
- "ascend_scheduler_config": {
- "enabled": True,
- "chunked_prefill_enabled": False,
- },
- }) as vllm_model:
+ enable_expert_parallel=True) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)


@@ -76,11 +70,5 @@ def test_generate_with_alltoall():
tensor_parallel_size=2,
max_model_len=1024,
dtype="auto",
- enable_expert_parallel=True,
- additional_config={
- "ascend_scheduler_config": {
- "enabled": True,
- "chunked_prefill_enabled": False,
- },
- }) as vllm_model:
+ enable_expert_parallel=True) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
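The pattern above repeats throughout the rest of this diff: the ascend_scheduler_config block is dropped from additional_config, presumably leaving the tests on vLLM's default scheduler with chunked prefill. A rough before/after sketch of these runner calls follows; the model name, prompts, and conftest import path are illustrative assumptions, not taken from this diff.

# Minimal sketch only; model name, prompts, and import path are assumed, not from the diff.
from tests.e2e.conftest import VllmRunner  # assumed location of the e2e test helper

example_prompts = ["Hello, my name is"]
max_tokens = 5

# Before this change: the Ascend scheduler was opted into explicitly.
with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
                tensor_parallel_size=2,
                additional_config={"ascend_scheduler_config": {"enabled": True}},
                enforce_eager=False) as vllm_model:
    old_output = vllm_model.generate_greedy(example_prompts, max_tokens)

# After this change: no scheduler entry in additional_config; the default scheduler is used.
with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
                tensor_parallel_size=2,
                enforce_eager=False) as vllm_model:
    new_output = vllm_model.generate_greedy(example_prompts, max_tokens)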
14 changes: 3 additions & 11 deletions tests/e2e/multicard/test_offline_inference_distributed.py
@@ -82,9 +82,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
"enabled": True,
},
"enable_multistream_moe": True,
- "ascend_scheduler_config": {
- "enabled": True,
- },
"refresh": True,
},
) as vllm_model:
@@ -154,14 +151,9 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
quantization="ascend",
enforce_eager=True,
enable_expert_parallel=True,
- additional_config={
- "torchair_graph_config": {
- "enabled": False,
- },
- "ascend_scheduler_config": {
- "enabled": True,
- }
- },
+ additional_config={"torchair_graph_config": {
+ "enabled": False,
+ }},
) as vllm_model:
vllm_model.generate_greedy(prompts, max_tokens)

64 changes: 0 additions & 64 deletions tests/e2e/multicard/test_prefix_caching.py
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
name_0="vllm_output",
name_1="prefix_cache_output",
)


- @pytest.mark.skip(reason="Fix me, the accuracy is not correct")
- @pytest.mark.parametrize("model", MODELS)
- @pytest.mark.parametrize("max_tokens", [50])
- def test_prefix_cache_with_ascend_scheduler(model: str,
- max_tokens: int) -> None:
-
- with VllmRunner(model,
- additional_config={
- 'ascend_scheduler_config': {
- 'enabled': True,
- },
- },
- enforce_eager=False,
- max_model_len=2048,
- tensor_parallel_size=2,
- gpu_memory_utilization=0.7) as vllm_model:
- vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
-
- with VllmRunner(model,
- additional_config={
- 'ascend_scheduler_config': {
- 'enabled': True,
- 'enable_prefix_caching': True,
- },
- },
- enforce_eager=False,
- max_model_len=2048,
- tensor_parallel_size=2,
- gpu_memory_utilization=0.7) as vllm_model:
- prefix_cache_output = vllm_model.generate_greedy(
- INPUT_PROMPTS, max_tokens)
-
- # TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
- # Disable it now. Fix it or drop the ascend scheduler in the future.
- # with VllmRunner(model,
- # additional_config={
- # 'ascend_scheduler_config': {
- # 'enabled': True,
- # 'enable_prefix_caching': True,
- # "enable_chunked_prefill": True,
- # },
- # },
- # enforce_eager=True,
- # max_model_len=2048,
- # tensor_parallel_size=2,
- # gpu_memory_utilization=0.7) as vllm_model:
- # chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
- # INPUT_PROMPTS, max_tokens)
-
- check_outputs_equal(
- outputs_0_lst=vllm_output,
- outputs_1_lst=prefix_cache_output,
- name_0="vllm_output",
- name_1="prefix_cache_output",
- )
-
- # check_outputs_equal(
- # outputs_0_lst=chunk_prefill_prefix_cache_output,
- # outputs_1_lst=prefix_cache_output,
- # name_0="chunk_prefill_prefix_cache_output",
- # name_1="prefix_cache_output",
- # )
6 changes: 0 additions & 6 deletions tests/e2e/multicard/test_qwen3_next.py
@@ -89,12 +89,6 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
gpu_memory_utilization=0.8,
distributed_executor_backend="mp",
enforce_eager=True,
- additional_config={
- "ascend_scheduler_config": {
- "enabled": True,
- "enable_chunked_prefill": False
- }
- },
speculative_config={
"method": "qwen3_next_mtp",
"num_speculative_tokens": 1
12 changes: 0 additions & 12 deletions tests/e2e/multicard/test_torchair_graph_mode.py
@@ -44,9 +44,6 @@ def _deepseek_torchair_test_fixture(
kwargs = {}
if not use_v1_schduler:
kwargs = {
- "ascend_scheduler_config": {
- "enabled": True,
- },
"refresh": True,
}
additional_config.update(**kwargs)
@@ -120,9 +117,6 @@ def _pangu_torchair_test_fixture(

# torchair is only work without chunked-prefill now
kwargs = {
- "ascend_scheduler_config": {
- "enabled": True,
- },
"refresh": True,
}
additional_config.update(**kwargs)
@@ -185,9 +179,6 @@ def _qwen_torchair_test_fixture(
"torchair_graph_config": {
"enabled": False,
},
- "ascend_scheduler_config": {
- "enabled": True,
- },
"refresh": True,
}

@@ -244,9 +235,6 @@ def _deepseek_v2_lite_torchair_test_fixure(
kwargs = {}
if not use_v1_schduler:
kwargs = {
- "ascend_scheduler_config": {
- "enable": True,
- },
"refresh": True,
}
additional_config.update(**kwargs)
Changes to an additional test file (file path not shown in this view)
@@ -73,11 +73,7 @@ async def test_models(model: str, mode: str) -> None:
"VLLM_RPC_TIMEOUT": "3600000",
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
}
- additional_config: dict[str, Any] = {
- "ascend_scheduler_config": {
- "enabled": False
- },
- }
+ additional_config: dict[str, Any] = {}
speculative_config = {
"num_speculative_tokens": 2,
"method": "deepseek_mtp"
Changes to an additional test file (file path not shown in this view)
@@ -74,9 +74,6 @@ async def test_models(model: str) -> None:
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
}
additional_config = {
- "ascend_scheduler_config": {
- "enabled": False
- },
"torchair_graph_config": {
"enabled": True,
"enable_multistream_moe": False,
Changes to an additional test file (file path not shown in this view)
@@ -68,12 +68,7 @@
async def test_models(model: str) -> None:
port = get_open_port()
env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
- additional_config = {
- "ascend_scheduler_config": {
- "enabled": False
- },
- "enable_weight_nz_layout": True
- }
+ additional_config = {"enable_weight_nz_layout": True}
server_args = [
"--quantization", "ascend", "--reasoning-parser", "qwen3",
"--tensor-parallel-size", "4", "--port",
Changes to an additional test file (file path not shown in this view)
@@ -83,8 +83,7 @@ async def test_models(model: str, tp_size: int) -> None:
"0.9", "--block-size", "128", "--max-num-seqs", "256",
"--enforce-eager", "--max-model-len", "35840",
"--max-num-batched-tokens", "35840", "--additional-config",
- '{"ascend_scheduler_config":{"enabled":true},"enable_weight_nz_layout":true}',
- "--compilation-config",
+ '{"enable_weight_nz_layout":true}', "--compilation-config",
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
]
with RemoteOpenAIServer(model,
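In the server-launched nightly tests the same entry disappears from the JSON string passed to --additional-config. A rough sketch of how those arguments are assembled after this change, using only flags and values that appear elsewhere in this diff (the exact sizes vary per test and are illustrative here):

import json

# Illustrative values only; the real tests pick their own model-specific sizes.
additional_config = {"enable_weight_nz_layout": True}
compilation_config = {
    "cudagraph_mode": "FULL_DECODE_ONLY",
    "cudagraph_capture_sizes": [1, 8, 24, 48, 60],
}

server_args = [
    "--tensor-parallel-size", "4",
    "--max-num-batched-tokens", "35840",
    # ascend_scheduler_config is no longer part of this JSON blob.
    "--additional-config", json.dumps(additional_config),
    "--compilation-config", json.dumps(compilation_config),
]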
10 changes: 1 addition & 9 deletions tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
@@ -33,7 +33,6 @@
"single",
"aclgraph",
"aclgraph_mlapo",
- "no_chunkprefill",
]

prompts = [
@@ -82,9 +81,6 @@ async def test_models(model: str, mode: str) -> None:
"method": "deepseek_mtp"
}
additional_config = {
- "ascend_scheduler_config": {
- "enabled": False
- },
"torchair_graph_config": {
"enabled": True,
"enable_multistream_moe": False,
@@ -112,10 +108,6 @@ async def test_models(model: str, mode: str) -> None:
if mode == "aclgraph_mlapo":
env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
additional_config["torchair_graph_config"] = {"enabled": False}
- if mode == "no_chunkprefill":
- additional_config["ascend_scheduler_config"] = {"enabled": True}
- i = server_args.index("--max-num-batched-tokens") + 1
- server_args[i] = "36864"
server_args.extend(["--additional-config", json.dumps(additional_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
@@ -134,7 +126,7 @@ async def test_models(model: str, mode: str) -> None:
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
print(choices)
- if mode in ["single", "no_chunkprefill"]:
+ if mode in ["single"]:
return
# aisbench test
run_aisbench_cases(model,
3 changes: 0 additions & 3 deletions tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
@@ -71,9 +71,6 @@ async def test_models(model: str) -> None:
"cudagraph_mode": "FULL_DECODE_ONLY"
}
additional_config: dict[str, Any] = {
- "ascend_scheduler_config": {
- "enabled": False
- },
"torchair_graph_config": {
"enabled": True
},
3 changes: 1 addition & 2 deletions tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
@@ -92,8 +92,7 @@ async def test_models(model: str, tp_size: int, dp_size: int,
"--gpu-memory-utilization",
"0.9",
"--additional-config",
- '{"ascend_scheduler_config":{"enabled":true},'
- '"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}',
+ '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}',
]
if full_graph:
server_args += [
5 changes: 2 additions & 3 deletions tests/e2e/nightly/models/test_qwen2_5_vl_32b.py
@@ -85,9 +85,8 @@ async def test_models(model: str, tp_size: int) -> None:
str(tp_size), "--port",
str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
"40000", "--max-num-seqs", "400", "--trust-remote-code",
- "--gpu-memory-utilization", "0.8", "--additional-config",
- '{"ascend_scheduler_config":{"enabled":false}}',
- "--compilation_config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
+ "--gpu-memory-utilization", "0.8", "--compilation_config",
+ '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
6 changes: 1 addition & 5 deletions tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
@@ -60,11 +60,7 @@ async def test_models(model: str) -> None:
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
}
- additional_config: dict[str, Any] = {
- "ascend_scheduler_config": {
- "enabled": False
- },
- }
+ additional_config: dict[str, Any] = {}
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
server_args = [
"--quantization", "ascend", "--async-scheduling",
6 changes: 0 additions & 6 deletions tests/e2e/nightly/models/test_qwen3_235b_w8a8.py
@@ -63,11 +63,6 @@ async def test_models(model: str, mode: str) -> None:
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
}
- additional_config: dict[str, Any] = {
- "ascend_scheduler_config": {
- "enabled": False
- },
- }
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
server_args = [
"--quantization", "ascend", "--async-scheduling",
@@ -82,7 +77,6 @@ async def test_models(model: str, mode: str) -> None:
server_args.extend(
["--compilation-config",
json.dumps(compilation_config)])
- server_args.extend(["--additional-config", json.dumps(additional_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
2 changes: 0 additions & 2 deletions tests/e2e/nightly/models/test_qwq_32b.py
@@ -93,8 +93,6 @@ async def test_models(model: str, mode: str, tp_size: int) -> None:
server_args.remove(
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
)
- server_args.append("--additional-config")
- server_args.append('{"ascend_scheduler_config":{"enabled":true}}')
server_args.append("--enforce-eager")
request_keyword_args: dict[str, Any] = {
**api_keyword_args,