Commit 9eb1036

[1/N][CI] Add multi node test (#3359)
### What this PR does / why we need it?
This PR adds a multi-node test. As a first step, it adds a `deepseek-v3` DP+TP+EP test.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: wangli <[email protected]>
1 parent 82b6c84 commit 9eb1036

File tree

11 files changed: +897 -1 lines changed


.github/actionlint.yaml

Lines changed: 1 addition & 0 deletions

@@ -18,3 +18,4 @@ self-hosted-runner:
   - linux-amd64-cpu-0
   - linux-amd64-cpu-8
   - linux-amd64-cpu-16
+  - linux-aarch64-a3-0

Lines changed: 109 additions & 0 deletions

name: 'e2e test / multi-dp'

on:
  schedule:
    - cron: "0 */4 * * *"
  workflow_dispatch:

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}

# only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  e2e:
    # This is a runner with no NPU for the k8s controller
    runs-on: linux-aarch64-a3-0
    container:
      image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
    env:
      KUBECONFIG: /tmp/kubeconfig
      KUBECTL: /root/.cache/.kube/kubectl
      NAMESPACE: vllm-project
      LEADER_POD: vllm-0
    steps:
      - name: Install system dependencies
        run: |
          # configure apt and pip source
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

          apt-get update -y && apt-get install -y git curl

          TOKEN=`echo -n "x-access-token:${{ secrets.ADMIN_PTA }}" | base64`
          git config --global http.https://gh-proxy.test.osinfra.cn/.extraheader "AUTHORIZATION: basic $TOKEN"

      - name: Install kubectl
        run: |
          install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl

          # get kubeconfig from secret
          echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG

      - name: Checkout code
        uses: actions/checkout@v4

      - name: Prepare scripts
        run: |
          # prepare the lws entrypoint scripts
          install -D tests/e2e/multi_node/scripts/run.sh /root/.cache/tests/run.sh

      - name: Launch cluster
        run: |
          kubectl apply -f tests/e2e/multi_node/scripts/lws.yaml

      - name: Wait for pod ready
        run: |
          echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to become Ready..."

          while true; do
            # get pod status
            READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}')

            if [[ "$READY_STATUS" == "true" ]]; then
              echo "✅ Pod [$LEADER_POD] is Ready!"
              break
            else
              echo "Pod [$LEADER_POD] not ready, waiting..."
              sleep 3
            fi
          done

      - name: Stream logs and monitor pod health
        run: |
          set -euo pipefail

          echo "🚀 Start streaming logs for Pod [$LEADER_POD] ..."
          kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" &
          LOG_PID=$!

          echo "Start monitoring Pod [$LEADER_POD] status ..."
          while true; do
            STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}')
            if [[ "$STATUS" != "Running" && "$STATUS" != "Succeeded" ]]; then
              echo "❌ Pod [$LEADER_POD] exited abnormally with status: $STATUS"
              kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
              kubectl logs "$LEADER_POD" -n "$NAMESPACE" --previous --all-containers || true
              kill $LOG_PID || true
              exit 1
            fi
            sleep 5
          done &

          MONITOR_PID=$!
          wait $LOG_PID || true
          kill $MONITOR_PID || true

      - name: Post process
        if: always()
        run: |
          kubectl get pods -n $NAMESPACE
          kubectl delete -f tests/e2e/multi_node/scripts/lws.yaml
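
The readiness check in the "Wait for pod ready" step can also be reproduced outside of CI when debugging the cluster locally. Below is a minimal Python sketch of the same polling logic, assuming `kubectl` is on `PATH` and a kubeconfig is already configured; the default pod and namespace names mirror the workflow's `LEADER_POD` and `NAMESPACE` values, and the helper name is illustrative, not part of this PR.

```python
# Local-debugging sketch of the workflow's readiness poll (not part of this PR).
# Assumes `kubectl` is installed and a kubeconfig is already configured.
import subprocess
import time


def wait_for_pod_ready(pod: str = "vllm-0",
                       namespace: str = "vllm-project",
                       interval: float = 3.0,
                       timeout: float = 1800.0) -> None:
    """Poll containerStatuses until every container in the pod reports ready."""
    deadline = time.time() + timeout
    jsonpath = "{.status.containerStatuses[*].ready}"
    while time.time() < deadline:
        result = subprocess.run(
            ["kubectl", "get", "pod", pod, "-n", namespace,
             "-o", f"jsonpath={jsonpath}"],
            capture_output=True, text=True)
        # Output is e.g. "true true" once all containers are ready; empty while pending.
        statuses = result.stdout.split()
        if statuses and all(s == "true" for s in statuses):
            print(f"Pod [{pod}] is Ready")
            return
        print(f"Pod [{pod}] not ready, waiting...")
        time.sleep(interval)
    raise TimeoutError(f"Pod [{pod}] did not become Ready within {timeout}s")
```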

tests/e2e/conftest.py

Lines changed: 187 additions & 1 deletion

@@ -19,11 +19,18 @@

 import contextlib
 import gc
+import json
 import os
+import subprocess
+import sys
+import time
 from typing import Any, List, Optional, Tuple, TypeVar, Union

+import httpx
 import numpy as np
+import openai
 import pytest
+import requests
 import torch
 from modelscope import snapshot_download  # type: ignore[import-untyped]
 from PIL import Image
@@ -33,9 +40,14 @@
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 from vllm import LLM, SamplingParams
 from vllm.config.model import TaskOption, _get_and_verify_dtype
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.entrypoints.cli.serve import ServeSubcommand
 from vllm.inputs import TextPrompt
+from vllm.model_executor.model_loader import get_model_loader
 from vllm.outputs import RequestOutput
+from vllm.platforms import current_platform
 from vllm.transformers_utils.utils import maybe_model_redirect
+from vllm.utils import FlexibleArgumentParser, get_open_port

 from tests.e2e.model_utils import (TokensTextLogprobs,
                                    TokensTextLogprobsPromptLogprobs)
@@ -76,6 +88,181 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
     torch.npu.reset_peak_memory_stats()


+class RemoteOpenAIServer:
+    DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
+
+    def _start_server(self, model: str, vllm_serve_args: list[str],
+                      env_dict: Optional[dict[str, str]]) -> None:
+        """Subclasses override this method to customize server process launch
+        """
+        env = os.environ.copy()
+        # the current process might initialize npu,
+        # to be safe, we should use spawn method
+        env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+        if env_dict is not None:
+            env.update(env_dict)
+        self.proc: subprocess.Popen = subprocess.Popen(
+            ["vllm", "serve", model, *vllm_serve_args],
+            env=env,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+        )
+
+    def __init__(self,
+                 model: str,
+                 server_host: str,
+                 server_port: int,
+                 vllm_serve_args: list[str],
+                 *,
+                 env_dict: Optional[dict[str, str]] = None,
+                 seed: Optional[int] = 0,
+                 auto_port: bool = True,
+                 max_wait_seconds: Optional[float] = None,
+                 override_hf_configs: Optional[dict[str, Any]] = None) -> None:
+        if auto_port:
+            if "-p" in vllm_serve_args or "--port" in vllm_serve_args:
+                raise ValueError("You have manually specified the port "
+                                 "when `auto_port=True`.")
+
+            # No need for a port if using unix sockets
+            if "--uds" not in vllm_serve_args:
+                # Don't mutate the input args
+                vllm_serve_args = vllm_serve_args + [
+                    "--port", str(get_open_port())
+                ]
+        if seed is not None:
+            if "--seed" in vllm_serve_args:
+                raise ValueError("You have manually specified the seed "
+                                 f"when `seed={seed}`.")
+
+            vllm_serve_args = vllm_serve_args + ["--seed", str(seed)]
+
+        if override_hf_configs is not None:
+            vllm_serve_args = vllm_serve_args + [
+                "--hf-overrides",
+                json.dumps(override_hf_configs)
+            ]
+
+        parser = FlexibleArgumentParser(
+            description="vLLM's remote OpenAI server.")
+        subparsers = parser.add_subparsers(required=False, dest="subparser")
+        parser = ServeSubcommand().subparser_init(subparsers)
+        args = parser.parse_args([*vllm_serve_args])
+        self.uds = args.uds
+        if args.uds:
+            self.host = None
+            self.port = None
+        else:
+            self.host = str(server_host)
+            self.port = int(server_port)
+
+        self.show_hidden_metrics = \
+            args.show_hidden_metrics_for_version is not None
+
+        # download the model before starting the server to avoid timeout
+        is_local = os.path.isdir(model)
+        if not is_local:
+            engine_args = AsyncEngineArgs.from_cli_args(args)
+            model_config = engine_args.create_model_config()
+            load_config = engine_args.create_load_config()
+
+            model_loader = get_model_loader(load_config)
+            model_loader.download_model(model_config)
+
+        self._start_server(model, vllm_serve_args, env_dict)
+        max_wait_seconds = max_wait_seconds or 7200
+        self._wait_for_server(url=self.url_for("health"),
+                              timeout=max_wait_seconds)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.proc.terminate()
+        try:
+            self.proc.wait(8)
+        except subprocess.TimeoutExpired:
+            # force kill if needed
+            self.proc.kill()
+
+    def _poll(self) -> Optional[int]:
+        """Subclasses override this method to customize process polling"""
+        return self.proc.poll()
+
+    def hang_until_terminated(self) -> None:
+        """
+        Wait until the server process terminates.
+        This is for headless mode, where the api server
+        process only exists in the leader node.
+        """
+        if self.uds:
+            client = httpx.Client(transport=httpx.HTTPTransport(uds=self.uds))
+        else:
+            client = requests
+
+        try:
+            while True:
+                try:
+                    resp = client.get(self.url_for("health"), timeout=5)
+                    if resp.status_code != 200:
+                        break
+                    time.sleep(5)
+                except Exception:
+                    break
+        finally:
+            if isinstance(client, httpx.Client):
+                client.close()
+
+    def _wait_for_server(self, *, url: str, timeout: float):
+        # run health check
+        start = time.time()
+        client = (httpx.Client(transport=httpx.HTTPTransport(
+            uds=self.uds)) if self.uds else requests)
+        while True:
+            try:
+                if client.get(url).status_code == 200:
+                    break
+            except Exception:
+                # this exception can only be raised by requests.get,
+                # which means the server is not ready yet.
+                # the stack trace is not useful, so we suppress it
+                # by using `raise from None`.
+                result = self._poll()
+                if result is not None and result != 0:
+                    raise RuntimeError("Server exited unexpectedly.") from None
+
+                time.sleep(1)
+                if time.time() - start > timeout:
+                    raise RuntimeError(
+                        "Server failed to start in time.") from None
+
+    @property
+    def url_root(self) -> str:
+        return (f"http://{self.uds.split('/')[-1]}"
+                if self.uds else f"http://{self.host}:{self.port}")
+
+    def url_for(self, *parts: str) -> str:
+        return self.url_root + "/" + "/".join(parts)
+
+    def get_client(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
+        return openai.OpenAI(
+            base_url=self.url_for("v1"),
+            api_key=self.DUMMY_API_KEY,
+            max_retries=0,
+            **kwargs,
+        )
+
+    def get_async_client(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
+        return openai.AsyncOpenAI(base_url=self.url_for("v1"),
+                                  api_key=self.DUMMY_API_KEY,
+                                  max_retries=0,
+                                  **kwargs)
+
+
 class VllmRunner:

     def __init__(
@@ -289,7 +476,6 @@ def __exit__(self, exc_type, exc_value, traceback):
 class HfRunner:

     def get_default_device(self):
-        from vllm.platforms import current_platform

         return ("cpu"
                 if current_platform.is_cpu() else current_platform.device_type)
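
For context, a multi-node test could drive the new `RemoteOpenAIServer` helper roughly as sketched below. This is an illustrative sketch only: the serve arguments, host, and port are placeholders rather than values taken from this PR, while the model name reuses the one from the multi-node config.

```python
# Illustrative usage sketch of RemoteOpenAIServer (not part of this PR).
# The serve arguments, host, and port below are placeholders.
from tests.e2e.conftest import RemoteOpenAIServer

model = "vllm-ascend/DeepSeek-V3-W8A8"
serve_args = ["--port", "8080", "--tensor-parallel-size", "8"]

with RemoteOpenAIServer(model,
                        server_host="127.0.0.1",
                        server_port=8080,
                        vllm_serve_args=serve_args,
                        auto_port=False) as server:
    client = server.get_client()  # openai.OpenAI pointed at the server's /v1 endpoint
    completion = client.completions.create(model=model,
                                           prompt="Hello, my name is",
                                           max_tokens=16)
    print(completion.choices[0].text)
```

On worker nodes running headless, `hang_until_terminated()` keeps the process alive until the leader's API server stops answering health checks.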

tests/e2e/multi_node/__init__.py

Whitespace-only changes.

tests/e2e/multi_node/config/__init__.py

Whitespace-only changes.
Lines changed: 41 additions & 0 deletions

[
  {
    "test_name": "test_deepseek_v3",
    "disaggregate_prefill": false,
    "enable_multithread_load": false,
    "num_nodes": 2,
    "server_parameters": {
      "leader_config": {
        "model": "vllm-ascend/DeepSeek-V3-W8A8",
        "additional_config": {
          "ascend_scheduler_config": {
            "enabled": true
          },
          "torchair_graph_config": {
            "enabled": true
          }
        }
      },
      "worker_config": {
        "model": "vllm-ascend/DeepSeek-V3-W8A8",
        "additional_config": {
          "ascend_scheduler_config": {
            "enabled": true
          },
          "torchair_graph_config": {
            "enabled": true
          }
        }
      }
    },
    "client_parameters": {
      "model": "vllm-ascend/DeepSeek-V3-W8A8",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "/root/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "request_rate": 1
    },
    "accuracy_parameters": {}
  }
]
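
A test harness might consume this config along the lines of the sketch below; the config file name used here is an assumption, since the diff above does not show the file path.

```python
# Sketch of loading the multi-node test config (illustrative; the file name
# "multi_node_config.json" is an assumption, as the diff does not show it).
import json
from pathlib import Path

config_path = Path("tests/e2e/multi_node/config/multi_node_config.json")
test_cases = json.loads(config_path.read_text())

for case in test_cases:
    leader = case["server_parameters"]["leader_config"]
    client = case["client_parameters"]
    print(f'{case["test_name"]}: {case["num_nodes"]} nodes, '
          f'model={leader["model"]}, '
          f'{client["num_prompts"]} prompts at request_rate={client["request_rate"]}')
```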
