vllm-project
diff --git a/‎.github/actionlint.yaml‎
Lines changed: 1 addition & 0 deletions b/‎.github/actionlint.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/multi_node_test.yaml‎
Lines changed: 109 additions & 0 deletions b/‎.github/workflows/multi_node_test.yaml‎
Lines changed: 109 additions & 0 deletions
diff --git a/‎tests/e2e/conftest.py‎
Lines changed: 187 additions & 1 deletion b/‎tests/e2e/conftest.py‎
Lines changed: 187 additions & 1 deletion
diff --git a/‎tests/e2e/multi_node/__init__.py‎ b/‎tests/e2e/multi_node/__init__.py‎
diff --git a/‎tests/e2e/multi_node/config/__init__.py‎ b/‎tests/e2e/multi_node/config/__init__.py‎
diff --git a/‎tests/e2e/multi_node/config/config.json‎
Lines changed: 41 additions & 0 deletions b/‎tests/e2e/multi_node/config/config.json‎
Lines changed: 41 additions & 0 deletions
@@ -18,3 +18,4 @@ self-hosted-runner:
     - linux-amd64-cpu-0
     - linux-amd64-cpu-8
     - linux-amd64-cpu-16
+    - linux-aarch64-a3-0
@@ -0,0 +1,109 @@
+name: 'e2e test / multi-dp'
+
+on:
+    schedule:
+      - cron: "0 */4 * * *"
+    workflow_dispatch:
+
+# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
+# declared as "shell: bash -el {0}" on steps that need to be properly activated.
+# It's used to activate ascend-toolkit environment variables.
+defaults:
+  run:
+    shell: bash -el {0}
+
+# Only cancel in-progress runs of the same workflow on the same ref
+# (the concurrency group below is keyed on workflow name + ref).
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  e2e:
+    # This is a runner with no NPU for k8s controller
+    runs-on: linux-aarch64-a3-0
+    container:
+      image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+      env:
+        KUBECONFIG: /tmp/kubeconfig
+        KUBECTL: /root/.cache/.kube/kubectl
+        NAMESPACE: vllm-project
+        LEADER_POD: vllm-0
+    steps:
+        - name: Install system dependencies
+          run: |
+           # configure apt and pip source
+           sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+           pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+           apt-get update -y && apt-get install -y git curl
+
+           # Build the basic-auth header for the git proxy.
+           # -w0 disables base64 line wrapping (default 76 columns); a wrapped
+           # value would put a newline in the middle of the HTTP header.
+           TOKEN=$(echo -n "x-access-token:${{ secrets.ADMIN_PTA }}" | base64 -w0)
+           git config --global http.https://gh-proxy.test.osinfra.cn/.extraheader "AUTHORIZATION: basic $TOKEN"
+
+        - name: Install kubectl
+          run: |
+            # The kubectl binary is expected to already exist at $KUBECTL;
+            # this only copies it onto PATH with root ownership and mode 0755.
+            install -o root -g root -m 0755 "$KUBECTL" /usr/local/bin/kubectl
+
+            # get kubeconfig from secret
+            # The file holds cluster credentials, so create it owner-only
+            # (umask is scoped to the subshell and does not leak to later commands).
+            (umask 077 && echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > "$KUBECONFIG")
+
+        # Fetch the repository so the multi-node scripts and manifest used below exist.
+        - name: Checkout code
+          uses: actions/checkout@v4
+
+        # Copy run.sh under /root/.cache; `install -D` creates missing parent directories.
+        - name: Prepare scripts
+          run: |
+            # prepare for lws entrypoint scripts
+            install -D tests/e2e/multi_node/scripts/run.sh /root/.cache/tests/run.sh
+
+        # Create the resources described in lws.yaml (deleted again in "Post process").
+        - name: Launch cluster
+          run: |
+            kubectl apply -f tests/e2e/multi_node/scripts/lws.yaml
+          
+        - name: Waiting for pod ready
+          run: |
+            echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..."
+
+            # Upper bound so a pod that never becomes ready cannot hold the
+            # runner indefinitely. Override with POD_READY_TIMEOUT_SECONDS.
+            TIMEOUT_SECONDS=${POD_READY_TIMEOUT_SECONDS:-3600}
+            DEADLINE=$(( $(date +%s) + TIMEOUT_SECONDS ))
+
+            while true; do
+              # get pod status
+              # '|| true': the default shell runs with errexit, and right after
+              # 'kubectl apply' the pod may not exist yet, which makes
+              # 'kubectl get' fail; that must mean "not ready", not "abort".
+              READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' || true)
+
+              # The jsonpath yields one word per container ("true", "true true", ...).
+              # Ready means: at least one container reported and none is false.
+              if [[ -n "$READY_STATUS" && "$READY_STATUS" != *false* ]]; then
+                echo "✅ Pod [$LEADER_POD] is Ready!"
+                break
+              fi
+
+              if (( $(date +%s) >= DEADLINE )); then
+                echo "❌ Pod [$LEADER_POD] not ready after ${TIMEOUT_SECONDS}s, giving up"
+                kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
+                exit 1
+              fi
+
+              echo "Pod [$LEADER_POD] not ready, waiting..."
+              sleep 3
+            done
+
+        - name: Stream logs and monitor pod health
+          run: |
+            set -euo pipefail
+
+            # The monitor below runs in a background subshell, so its 'exit 1'
+            # cannot fail this step directly. It records failure through this
+            # marker file, which the main shell checks once log streaming ends.
+            FAIL_MARKER="$(mktemp -u)"
+
+            echo "🚀 Start streaming logs for Pod [$LEADER_POD] ..."
+            kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" &
+            LOG_PID=$!
+
+            echo "Start monitoring Pod [$LEADER_POD] status ..."
+            while true; do
+              # A failed query (e.g. transient API error) is retried instead of
+              # letting errexit silently terminate the monitor.
+              if ! STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}'); then
+                echo "Could not query Pod [$LEADER_POD] status, retrying..."
+                sleep 5
+                continue
+              fi
+              if [[ "$STATUS" != "Running" && "$STATUS" != "Succeeded" ]]; then
+                # Record the failure first so it is visible even if the main
+                # shell stops this monitor while diagnostics are being dumped.
+                touch "$FAIL_MARKER"
+                echo "❌ Pod [$LEADER_POD] exited abnormally with status: $STATUS"
+                kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
+                kubectl logs "$LEADER_POD" -n "$NAMESPACE" --previous --all-containers || true
+                kill $LOG_PID || true
+                exit 1
+              fi
+              sleep 5
+            done &
+
+            MONITOR_PID=$!
+            wait "$LOG_PID" || true
+            kill "$MONITOR_PID" 2>/dev/null || true
+            wait "$MONITOR_PID" 2>/dev/null || true
+
+            # Final verdict: fail if the monitor flagged the pod, or if the pod
+            # ended in phase Failed before the monitor's next poll.
+            FINAL_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' || true)
+            if [[ -e "$FAIL_MARKER" || "$FINAL_STATUS" == "Failed" ]]; then
+              rm -f "$FAIL_MARKER"
+              echo "❌ Pod [$LEADER_POD] did not finish cleanly (final status: ${FINAL_STATUS:-unknown})"
+              exit 1
+            fi
+
+        - name: Post process
+          if: always()
+          run: |
+            # Diagnostics only: under errexit a failure here must not skip
+            # the cleanup below and leave the cluster resources behind.
+            kubectl get pods -n "$NAMESPACE" || true
+            # --ignore-not-found: this step always runs, including when the
+            # resources were never (fully) created by "Launch cluster".
+            kubectl delete -f tests/e2e/multi_node/scripts/lws.yaml --ignore-not-found
@@ -19,11 +19,18 @@
 
 import contextlib
 import gc
+import json
 import os
+import subprocess
+import sys
+import time
 from typing import Any, List, Optional, Tuple, TypeVar, Union
 
+import httpx
 import numpy as np
+import openai
 import pytest
+import requests
 import torch
 from modelscope import snapshot_download  # type: ignore[import-untyped]
 from PIL import Image
@@ -33,9 +40,14 @@
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 from vllm import LLM, SamplingParams
 from vllm.config.model import TaskOption, _get_and_verify_dtype
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.entrypoints.cli.serve import ServeSubcommand
 from vllm.inputs import TextPrompt
+from vllm.model_executor.model_loader import get_model_loader
 from vllm.outputs import RequestOutput
+from vllm.platforms import current_platform
 from vllm.transformers_utils.utils import maybe_model_redirect
+from vllm.utils import FlexibleArgumentParser, get_open_port
 
 from tests.e2e.model_utils import (TokensTextLogprobs,
                                    TokensTextLogprobsPromptLogprobs)
@@ -76,6 +88,181 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
     torch.npu.reset_peak_memory_stats()
 
 
+class RemoteOpenAIServer:
+    DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
+
+    def _start_server(self, model: str, vllm_serve_args: list[str],
+                      env_dict: Optional[dict[str, str]]) -> None:
+        """Subclasses override this method to customize server process launch
+        """
+        env = os.environ.copy()
+        # the current process might initialize npu,
+        # to be safe, we should use spawn method
+        env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+        if env_dict is not None:
+            env.update(env_dict)
+        self.proc: subprocess.Popen = subprocess.Popen(
+            ["vllm", "serve", model, *vllm_serve_args],
+            env=env,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+        )
+
+    def __init__(self,
+                 model: str,
+                 server_host: str,
+                 server_port: int,
+                 vllm_serve_args: list[str],
+                 *,
+                 env_dict: Optional[dict[str, str]] = None,
+                 seed: Optional[int] = 0,
+                 auto_port: bool = True,
+                 max_wait_seconds: Optional[float] = None,
+                 override_hf_configs: Optional[dict[str, Any]] = None) -> None:
+        if auto_port:
+            if "-p" in vllm_serve_args or "--port" in vllm_serve_args:
+                raise ValueError("You have manually specified the port "
+                                 "when `auto_port=True`.")
+
+            # No need for a port if using unix sockets
+            if "--uds" not in vllm_serve_args:
+                # Don't mutate the input args
+                vllm_serve_args = vllm_serve_args + [
+                    "--port", str(get_open_port())
+                ]
+        if seed is not None:
+            if "--seed" in vllm_serve_args:
+                raise ValueError("You have manually specified the seed "
+                                 f"when `seed={seed}`.")
+
+            vllm_serve_args = vllm_serve_args + ["--seed", str(seed)]
+
+        if override_hf_configs is not None:
+            vllm_serve_args = vllm_serve_args + [
+                "--hf-overrides",
+                json.dumps(override_hf_configs)
+            ]
+
+        parser = FlexibleArgumentParser(
+            description="vLLM's remote OpenAI server.")
+        subparsers = parser.add_subparsers(required=False, dest="subparser")
+        parser = ServeSubcommand().subparser_init(subparsers)
+        args = parser.parse_args([*vllm_serve_args])
+        self.uds = args.uds
+        if args.uds:
+            self.host = None
+            self.port = None
+        else:
+            self.host = str(server_host)
+            self.port = int(server_port)
+
+        self.show_hidden_metrics = \
+            args.show_hidden_metrics_for_version is not None
+
+        # download the model before starting the server to avoid timeout
+        is_local = os.path.isdir(model)
+        if not is_local:
+            engine_args = AsyncEngineArgs.from_cli_args(args)
+            model_config = engine_args.create_model_config()
+            load_config = engine_args.create_load_config()
+
+            model_loader = get_model_loader(load_config)
+            model_loader.download_model(model_config)
+
+        self._start_server(model, vllm_serve_args, env_dict)
+        max_wait_seconds = max_wait_seconds or 7200
+        self._wait_for_server(url=self.url_for("health"),
+                              timeout=max_wait_seconds)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.proc.terminate()
+        try:
+            self.proc.wait(8)
+        except subprocess.TimeoutExpired:
+            # force kill if needed
+            self.proc.kill()
+
+    def _poll(self) -> Optional[int]:
+        """Subclasses override this method to customize process polling"""
+        return self.proc.poll()
+
+    def hang_until_terminated(self) -> None:
+        """
+        Wait until the server process terminates.
+        This is for headless mode, where the api server
+        process only exists in the leader node.
+        """
+        if self.uds:
+            client = httpx.Client(transport=httpx.HTTPTransport(uds=self.uds))
+        else:
+            client = requests
+
+        try:
+            while True:
+                try:
+                    resp = client.get(self.url_for("health"), timeout=5)
+                    if resp.status_code != 200:
+                        break
+                    time.sleep(5)
+                except Exception:
+                    break
+        finally:
+            if isinstance(client, httpx.Client):
+                client.close()
+
+    def _wait_for_server(self, *, url: str, timeout: float):
+        # run health check
+        start = time.time()
+        client = (httpx.Client(transport=httpx.HTTPTransport(
+            uds=self.uds)) if self.uds else requests)
+        while True:
+            try:
+                if client.get(url).status_code == 200:
+                    break
+            except Exception:
+                # this exception can only be raised by requests.get,
+                # which means the server is not ready yet.
+                # the stack trace is not useful, so we suppress it
+                # by using `raise from None`.
+                result = self._poll()
+                if result is not None and result != 0:
+                    raise RuntimeError("Server exited unexpectedly.") from None
+
+                time.sleep(1)
+                if time.time() - start > timeout:
+                    raise RuntimeError(
+                        "Server failed to start in time.") from None
+
+    @property
+    def url_root(self) -> str:
+        return (f"http://{self.uds.split('/')[-1]}"
+                if self.uds else f"http://{self.host}:{self.port}")
+
+    def url_for(self, *parts: str) -> str:
+        return self.url_root + "/" + "/".join(parts)
+
+    def get_client(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
+        return openai.OpenAI(
+            base_url=self.url_for("v1"),
+            api_key=self.DUMMY_API_KEY,
+            max_retries=0,
+            **kwargs,
+        )
+
+    def get_async_client(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
+        return openai.AsyncOpenAI(base_url=self.url_for("v1"),
+                                  api_key=self.DUMMY_API_KEY,
+                                  max_retries=0,
+                                  **kwargs)
+
+
 class VllmRunner:
 
     def __init__(
@@ -289,7 +476,6 @@ def __exit__(self, exc_type, exc_value, traceback):
 class HfRunner:
 
     def get_default_device(self):
-        from vllm.platforms import current_platform
 
         return ("cpu"
                 if current_platform.is_cpu() else current_platform.device_type)
 
@@ -0,0 +1,41 @@
+[
+    {
+        "test_name": "test_deepseek_v3",
+        "disaggregate_prefill": false,
+        "enable_multithread_load": false,
+        "num_nodes": 2,
+        "server_parameters": {
+            "leader_config": {
+                "model": "vllm-ascend/DeepSeek-V3-W8A8",
+                "additional_config": {
+                    "ascend_scheduler_config": {
+                        "enabled": true
+                    },
+                    "torchair_graph_config": {
+                        "enabled": true
+                    }
+                }
+            },
+            "worker_config": {
+                "model": "vllm-ascend/DeepSeek-V3-W8A8",
+                "additional_config": {
+                    "ascend_scheduler_config": {
+                        "enabled": true
+                    },
+                    "torchair_graph_config": {
+                        "enabled": true
+                    }
+                }
+            }
+        },
+        "client_parameters": {
+            "model": "vllm-ascend/DeepSeek-V3-W8A8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "/root/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "request_rate": 1
+        },
+        "accuracy_parameters": {}
+    }
+]