1 change: 0 additions & 1 deletion .github/workflows/_e2e_test.yaml
@@ -94,7 +94,6 @@ jobs:
pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
pytest -sv tests/e2e/singlecard/test_bge_model.py
pytest -sv tests/e2e/singlecard/test_camem.py
- pytest -sv tests/e2e/singlecard/test_chunked.py
pytest -sv tests/e2e/singlecard/test_embedding.py
# pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
3 changes: 0 additions & 3 deletions tests/e2e/310p/test_offline_inference_parallel_310p.py
@@ -29,9 +29,6 @@
"additional_config": {
"torchair_graph_config": {
"enabled": True
- },
- "ascend_scheduler_config": {
- "enabled": True,
}
}
}]
21 changes: 6 additions & 15 deletions tests/e2e/multicard/test_expert_parallel.py
@@ -15,23 +15,14 @@ def test_e2e_ep_correctness(model_name):
max_tokens = 5

# FIXME: Really strange that chunked prefill might lead to different results, investigate further
- with VllmRunner(
- model_name,
- tensor_parallel_size=2,
- additional_config={"ascend_scheduler_config": {
- "enabled": True
- }},
- enforce_eager=False) as vllm_model:
+ with VllmRunner(model_name, tensor_parallel_size=2,
+ enforce_eager=False) as vllm_model:
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

- with VllmRunner(
- model_name,
- tensor_parallel_size=2,
- enable_expert_parallel=True,
- additional_config={"ascend_scheduler_config": {
- "enabled": True
- }},
- enforce_eager=False) as vllm_model:
+ with VllmRunner(model_name,
+ tensor_parallel_size=2,
+ enable_expert_parallel=True,
+ enforce_eager=False) as vllm_model:
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

check_outputs_equal(
16 changes: 2 additions & 14 deletions tests/e2e/multicard/test_fused_moe_allgather_ep.py
@@ -49,13 +49,7 @@ def test_generate_with_allgather():
tensor_parallel_size=2,
max_model_len=1024,
dtype="auto",
- enable_expert_parallel=True,
- additional_config={
- "ascend_scheduler_config": {
- "enabled": True,
- "chunked_prefill_enabled": False,
- },
- }) as vllm_model:
+ enable_expert_parallel=True) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)


@@ -76,11 +70,5 @@ def test_generate_with_alltoall():
tensor_parallel_size=2,
max_model_len=1024,
dtype="auto",
- enable_expert_parallel=True,
- additional_config={
- "ascend_scheduler_config": {
- "enabled": True,
- "chunked_prefill_enabled": False,
- },
- }) as vllm_model:
+ enable_expert_parallel=True) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
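The pattern above repeats throughout the rest of this diff: the ascend_scheduler_config block is dropped from additional_config, presumably leaving the tests on vLLM's default scheduler with chunked prefill. A rough before/after sketch of these runner calls follows; the model name, prompts, and conftest import path are illustrative assumptions, not taken from this diff.

# Minimal sketch only; model name, prompts, and import path are assumed, not from the diff.
from tests.e2e.conftest import VllmRunner  # assumed location of the e2e test helper

example_prompts = ["Hello, my name is"]
max_tokens = 5

# Before this change: the Ascend scheduler was opted into explicitly.
with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
                tensor_parallel_size=2,
                additional_config={"ascend_scheduler_config": {"enabled": True}},
                enforce_eager=False) as vllm_model:
    old_output = vllm_model.generate_greedy(example_prompts, max_tokens)

# After this change: no scheduler entry in additional_config; the default scheduler is used.
with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
                tensor_parallel_size=2,
                enforce_eager=False) as vllm_model:
    new_output = vllm_model.generate_greedy(example_prompts, max_tokens)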
14 changes: 3 additions & 11 deletions tests/e2e/multicard/test_offline_inference_distributed.py
@@ -82,9 +82,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
"enabled": True,
},
"enable_multistream_moe": True,
- "ascend_scheduler_config": {
- "enabled": True,
- },
"refresh": True,
},
) as vllm_model:
@@ -154,14 +151,9 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
quantization="ascend",
enforce_eager=True,
enable_expert_parallel=True,
- additional_config={
- "torchair_graph_config": {
- "enabled": False,
- },
- "ascend_scheduler_config": {
- "enabled": True,
- }
- },
+ additional_config={"torchair_graph_config": {
+ "enabled": False,
+ }},
) as vllm_model:
vllm_model.generate_greedy(prompts, max_tokens)

64 changes: 0 additions & 64 deletions tests/e2e/multicard/test_prefix_caching.py
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
name_0="vllm_output",
name_1="prefix_cache_output",
)


- @pytest.mark.skip(reason="Fix me, the accuracy is not correct")
- @pytest.mark.parametrize("model", MODELS)
- @pytest.mark.parametrize("max_tokens", [50])
- def test_prefix_cache_with_ascend_scheduler(model: str,
- max_tokens: int) -> None:
-
- with VllmRunner(model,
- additional_config={
- 'ascend_scheduler_config': {
- 'enabled': True,
- },
- },
- enforce_eager=False,
- max_model_len=2048,
- tensor_parallel_size=2,
- gpu_memory_utilization=0.7) as vllm_model:
- vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
-
- with VllmRunner(model,
- additional_config={
- 'ascend_scheduler_config': {
- 'enabled': True,
- 'enable_prefix_caching': True,
- },
- },
- enforce_eager=False,
- max_model_len=2048,
- tensor_parallel_size=2,
- gpu_memory_utilization=0.7) as vllm_model:
- prefix_cache_output = vllm_model.generate_greedy(
- INPUT_PROMPTS, max_tokens)
-
- # TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
- # Disable it now. Fix it or drop the ascend scheduler in the future.
- # with VllmRunner(model,
- # additional_config={
- # 'ascend_scheduler_config': {
- # 'enabled': True,
- # 'enable_prefix_caching': True,
- # "enable_chunked_prefill": True,
- # },
- # },
- # enforce_eager=True,
- # max_model_len=2048,
- # tensor_parallel_size=2,
- # gpu_memory_utilization=0.7) as vllm_model:
- # chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
- # INPUT_PROMPTS, max_tokens)
-
- check_outputs_equal(
- outputs_0_lst=vllm_output,
- outputs_1_lst=prefix_cache_output,
- name_0="vllm_output",
- name_1="prefix_cache_output",
- )
-
- # check_outputs_equal(
- # outputs_0_lst=chunk_prefill_prefix_cache_output,
- # outputs_1_lst=prefix_cache_output,
- # name_0="chunk_prefill_prefix_cache_output",
- # name_1="prefix_cache_output",
- # )
6 changes: 0 additions & 6 deletions tests/e2e/multicard/test_qwen3_next.py
@@ -89,12 +89,6 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
gpu_memory_utilization=0.8,
distributed_executor_backend="mp",
enforce_eager=True,
- additional_config={
- "ascend_scheduler_config": {
- "enabled": True,
- "enable_chunked_prefill": False
- }
- },
speculative_config={
"method": "qwen3_next_mtp",
"num_speculative_tokens": 1
12 changes: 0 additions & 12 deletions tests/e2e/multicard/test_torchair_graph_mode.py
@@ -44,9 +44,6 @@ def _deepseek_torchair_test_fixture(
kwargs = {}
if not use_v1_schduler:
kwargs = {
- "ascend_scheduler_config": {
- "enabled": True,
- },
"refresh": True,
}
additional_config.update(**kwargs)
@@ -120,9 +117,6 @@ def _pangu_torchair_test_fixture(

# torchair is only work without chunked-prefill now
kwargs = {
- "ascend_scheduler_config": {
- "enabled": True,
- },
"refresh": True,
}
additional_config.update(**kwargs)
@@ -185,9 +179,6 @@ def _qwen_torchair_test_fixture(
"torchair_graph_config": {
"enabled": False,
},
- "ascend_scheduler_config": {
- "enabled": True,
- },
"refresh": True,
}

@@ -244,9 +235,6 @@ def _deepseek_v2_lite_torchair_test_fixure(
kwargs = {}
if not use_v1_schduler:
kwargs = {
- "ascend_scheduler_config": {
- "enable": True,
- },
"refresh": True,
}
additional_config.update(**kwargs)
Changes to an additional test file (file path not shown in this view)
@@ -73,11 +73,7 @@ async def test_models(model: str, mode: str) -> None:
"VLLM_RPC_TIMEOUT": "3600000",
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
}
- additional_config: dict[str, Any] = {
- "ascend_scheduler_config": {
- "enabled": False
- },
- }
+ additional_config: dict[str, Any] = {}
speculative_config = {
"num_speculative_tokens": 2,
"method": "deepseek_mtp"
Changes to an additional test file (file path not shown in this view)
@@ -74,9 +74,6 @@ async def test_models(model: str) -> None:
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
}
additional_config = {
- "ascend_scheduler_config": {
- "enabled": False
- },
"torchair_graph_config": {
"enabled": True,
"enable_multistream_moe": False,
Changes to an additional test file (file path not shown in this view)
@@ -68,12 +68,7 @@
async def test_models(model: str) -> None:
port = get_open_port()
env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
- additional_config = {
- "ascend_scheduler_config": {
- "enabled": False
- },
- "enable_weight_nz_layout": True
- }
+ additional_config = {"enable_weight_nz_layout": True}
server_args = [
"--quantization", "ascend", "--reasoning-parser", "qwen3",
"--tensor-parallel-size", "4", "--port",
Changes to an additional test file (file path not shown in this view)
@@ -83,8 +83,7 @@ async def test_models(model: str, tp_size: int) -> None:
"0.9", "--block-size", "128", "--max-num-seqs", "256",
"--enforce-eager", "--max-model-len", "35840",
"--max-num-batched-tokens", "35840", "--additional-config",
- '{"ascend_scheduler_config":{"enabled":true},"enable_weight_nz_layout":true}',
- "--compilation-config",
+ '{"enable_weight_nz_layout":true}', "--compilation-config",
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
]
with RemoteOpenAIServer(model,
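In the server-launched nightly tests the same entry disappears from the JSON string passed to --additional-config. A rough sketch of how those arguments are assembled after this change, using only flags and values that appear elsewhere in this diff (the exact sizes vary per test and are illustrative here):

import json

# Illustrative values only; the real tests pick their own model-specific sizes.
additional_config = {"enable_weight_nz_layout": True}
compilation_config = {
    "cudagraph_mode": "FULL_DECODE_ONLY",
    "cudagraph_capture_sizes": [1, 8, 24, 48, 60],
}

server_args = [
    "--tensor-parallel-size", "4",
    "--max-num-batched-tokens", "35840",
    # ascend_scheduler_config is no longer part of this JSON blob.
    "--additional-config", json.dumps(additional_config),
    "--compilation-config", json.dumps(compilation_config),
]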
10 changes: 1 addition & 9 deletions tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
@@ -33,7 +33,6 @@
"single",
"aclgraph",
"aclgraph_mlapo",
- "no_chunkprefill",
]

prompts = [
@@ -82,9 +81,6 @@ async def test_models(model: str, mode: str) -> None:
"method": "deepseek_mtp"
}
additional_config = {
- "ascend_scheduler_config": {
- "enabled": False
- },
"torchair_graph_config": {
"enabled": True,
"enable_multistream_moe": False,
@@ -112,10 +108,6 @@ async def test_models(model: str, mode: str) -> None:
if mode == "aclgraph_mlapo":
env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
additional_config["torchair_graph_config"] = {"enabled": False}
- if mode == "no_chunkprefill":
- additional_config["ascend_scheduler_config"] = {"enabled": True}
- i = server_args.index("--max-num-batched-tokens") + 1
- server_args[i] = "36864"
server_args.extend(["--additional-config", json.dumps(additional_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
@@ -134,7 +126,7 @@ async def test_models(model: str, mode: str) -> None:
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
print(choices)
- if mode in ["single", "no_chunkprefill"]:
+ if mode in ["single"]:
return
# aisbench test
run_aisbench_cases(model,
3 changes: 0 additions & 3 deletions tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
@@ -71,9 +71,6 @@ async def test_models(model: str) -> None:
"cudagraph_mode": "FULL_DECODE_ONLY"
}
additional_config: dict[str, Any] = {
- "ascend_scheduler_config": {
- "enabled": False
- },
"torchair_graph_config": {
"enabled": True
},
3 changes: 1 addition & 2 deletions tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
@@ -92,8 +92,7 @@ async def test_models(model: str, tp_size: int, dp_size: int,
"--gpu-memory-utilization",
"0.9",
"--additional-config",
- '{"ascend_scheduler_config":{"enabled":true},'
- '"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}',
+ '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}',
]
if full_graph:
server_args += [
5 changes: 2 additions & 3 deletions tests/e2e/nightly/models/test_qwen2_5_vl_32b.py
@@ -85,9 +85,8 @@ async def test_models(model: str, tp_size: int) -> None:
str(tp_size), "--port",
str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
"40000", "--max-num-seqs", "400", "--trust-remote-code",
- "--gpu-memory-utilization", "0.8", "--additional-config",
- '{"ascend_scheduler_config":{"enabled":false}}',
- "--compilation_config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
+ "--gpu-memory-utilization", "0.8", "--compilation_config",
+ '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
6 changes: 1 addition & 5 deletions tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
@@ -60,11 +60,7 @@ async def test_models(model: str) -> None:
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
}
- additional_config: dict[str, Any] = {
- "ascend_scheduler_config": {
- "enabled": False
- },
- }
+ additional_config: dict[str, Any] = {}
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
server_args = [
"--quantization", "ascend", "--async-scheduling",
6 changes: 0 additions & 6 deletions tests/e2e/nightly/models/test_qwen3_235b_w8a8.py
@@ -63,11 +63,6 @@ async def test_models(model: str, mode: str) -> None:
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
}
- additional_config: dict[str, Any] = {
- "ascend_scheduler_config": {
- "enabled": False
- },
- }
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
server_args = [
"--quantization", "ascend", "--async-scheduling",
@@ -82,7 +77,6 @@ async def test_models(model: str, mode: str) -> None:
server_args.extend(
["--compilation-config",
json.dumps(compilation_config)])
- server_args.extend(["--additional-config", json.dumps(additional_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
2 changes: 0 additions & 2 deletions tests/e2e/nightly/models/test_qwq_32b.py
@@ -93,8 +93,6 @@ async def test_models(model: str, mode: str, tp_size: int) -> None:
server_args.remove(
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
)
- server_args.append("--additional-config")
- server_args.append('{"ascend_scheduler_config":{"enabled":true}}')
server_args.append("--enforce-eager")
request_keyword_args: dict[str, Any] = {
**api_keyword_args,