#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2025 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import contextlib
import gc
import math
import multiprocessing
import os
import sys
from time import sleep
from unittest.mock import patch

import pytest
import torch
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import (  # noqa: E402
    destroy_distributed_environment, destroy_model_parallel)

MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [4])
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
def test_aclgraph_capture_replay_dp2(
    model: str,
    max_tokens: int,
) -> None:
    # Use vLLM's default worker start method so the NPUGraph patches and shared
    # counters set up in dp_rank_main are inherited by the engine workers.
    if 'VLLM_WORKER_MULTIPROC_METHOD' in os.environ:
        del os.environ["VLLM_WORKER_MULTIPROC_METHOD"]
    # HCCL_OP_EXPANSION_MODE determines how max_num_batch_sizes is computed,
    # so unset it here to make the default formula below apply.
    if 'HCCL_OP_EXPANSION_MODE' in os.environ:
        del os.environ['HCCL_OP_EXPANSION_MODE']
    os.environ["HCCL_NPU_SOCKET_PORT_RANGE"] = 'auto'
    os.environ["HCCL_HOST_SOCKET_PORT_RANGE"] = 'auto'
    dp_size = 2
    tp_size = 1
    replay_counter = multiprocessing.Value("i", 0)
    capture_counter = multiprocessing.Value("i", 0)
    num_hidden_layers_shared = multiprocessing.Value("i", -1)
    num_execute_model_shared = multiprocessing.Value("i", 0)
    dp_master_ip = "127.0.0.1"
    dp_master_port = 11011

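    # Worker body for one data-parallel rank: configure the vLLM DP
    # environment, patch torch.npu.NPUGraph to count graph captures and
    # replays, run generation on this rank's slice of the prompts, then
    # clean up.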
    def dp_rank_main(global_dp_rank: int, local_dp_rank: int):
        os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
        os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
        os.environ["VLLM_DP_SIZE"] = str(dp_size)
        os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip
        os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port)

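        # Wrap NPUGraph.__init__ and NPUGraph.replay so every graph capture
        # (graph object creation) and replay is tallied in the shared counters.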
        original_replay = torch.npu.NPUGraph.replay

        def replay_wrapper(self):
            with replay_counter.get_lock():
                replay_counter.value += 1
            return original_replay(self)

        original_init = torch.npu.NPUGraph.__init__

        def init_wrapper(self, *args, **kwargs):
            with capture_counter.get_lock():
                capture_counter.value += 1
            return original_init(self, *args, **kwargs)

        with patch.object(torch.npu.NPUGraph, "replay", replay_wrapper), \
                patch.object(torch.npu.NPUGraph, "__init__", init_wrapper):
            prompts = [
                "Hello, my name is", "The president of the United States is",
                "The capital of France is", "The future of AI is"
            ]
            chunk_size = len(prompts) // dp_size
            start = global_dp_rank * chunk_size
            end = start + chunk_size if global_dp_rank < dp_size - 1 else len(
                prompts)
            my_prompts = prompts[start:end]
            sampling_params = SamplingParams(max_tokens=max_tokens,
                                             temperature=0.0)

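            # Trace hook that counts calls to execute_dummy_batch in
            # worker_v1.py; these idle-rank dummy steps also replay the
            # captured graphs and are folded into the expected replay count.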
            def trace_calls(frame, event, arg):
                if event == 'call':
                    code = frame.f_code
                    func_name = code.co_name
                    file_name = code.co_filename
                    if func_name == 'execute_dummy_batch' and 'worker_v1.py' in file_name:
                        with num_execute_model_shared.get_lock():
                            num_execute_model_shared.value += 1
                return trace_calls

            sys.settrace(trace_calls)
            if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
                llm = LLM(
                    model=model,
                    quantization="ascend",
                    tensor_parallel_size=tp_size,
                    trust_remote_code=True,
                )
            else:
                llm = LLM(
                    model=model,
                    tensor_parallel_size=tp_size,
                    trust_remote_code=True,
                )
            num_hidden_layers_shared.value = llm.llm_engine.model_config.hf_config.num_hidden_layers
            _ = llm.generate(my_prompts, sampling_params)
            sys.settrace(None)

            # Give engines time to pause their processing loops before exiting.
            sleep(5)
            del llm
            cleanup_env_and_memory()

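    # Launch one process per data-parallel rank and wait for each to finish.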
    processes = []
    for local_dp_rank in range(dp_size):
        global_dp_rank = local_dp_rank
        p = multiprocessing.Process(target=dp_rank_main,
                                    args=(global_dp_rank, local_dp_rank))
        p.start()
        processes.append(p)

    for p in processes:
        p.join(timeout=900)
        if p.exitcode != 0:
            if p.exitcode is None:
                p.kill()
                raise RuntimeError(f"Process {p.pid} timed out")
            else:
                raise RuntimeError(
                    f"Process failed with exit code {p.exitcode}")

    actual_capture = capture_counter.value
    actual_replay = replay_counter.value
    num_hidden_layers = num_hidden_layers_shared.value
    num_execute_model = num_execute_model_shared.value

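    # Expected captures: one ACL graph per hidden layer plus one, each captured
    # for max_num_batch_sizes batch sizes on every DP rank; the batch-size
    # limit follows the formula that applies when HCCL_OP_EXPANSION_MODE is
    # unset.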
    num_acl_graphs = num_hidden_layers + 1
    num_comm_groups = sum(size > 1 for size in [
        dp_size,
        tp_size,
    ])
    max_num_batch_sizes = math.floor(
        (1800 - num_comm_groups * 40) / num_acl_graphs /
        (1 + num_comm_groups * 2))
    expected_total_capture = max_num_batch_sizes * num_acl_graphs * dp_size
    assert actual_capture == expected_total_capture, (
        f"Capture count mismatch. Expected: {expected_total_capture}, Got: {actual_capture}"
    )

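    # Expected replays: every inference step replays all ACL graphs on each DP
    # rank, plus the extra replays triggered by the dummy batches counted by
    # the trace hook.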
    num_inference_steps = max_tokens + 1  # first token + max_tokens
    expected_total_replay = num_acl_graphs * num_inference_steps * dp_size + num_execute_model * num_acl_graphs
    assert actual_replay == expected_total_replay, (
        f"Replay count mismatch. Expected: {expected_total_replay}, Got: {actual_replay}"
    )
    # Restore the environment for subsequent tests.
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = 'spawn'
    del os.environ["HCCL_NPU_SOCKET_PORT_RANGE"]
    del os.environ["HCCL_HOST_SOCKET_PORT_RANGE"]


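# Tear down vLLM's distributed state and free NPU memory inside each worker
# process before it exits.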
def cleanup_env_and_memory():
    destroy_model_parallel()
    destroy_distributed_environment()
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
    torch.npu.empty_cache()
    torch.npu.reset_peak_memory_stats()