Commit 0c735be

Merge branch 'main' into fix/validate-tool-requests-29432
Signed-off-by: majiayu000 <[email protected]>
2 parents: cde6a6a + 97f2f16

File tree

3 files changed (+74 -4 lines)

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8040}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 4 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  # Build a filesystem-safe tag from the model name.
  TAG=$(echo "$MODEL" | tr '/: \n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
63+
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
64+
python3 - <<PY
65+
import json; acc=json.load(open('${OUT}'))['accuracy']
66+
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
67+
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
68+
PY
69+
70+
cleanup
71+
SERVER_PID=
72+
sleep 1
73+
PORT=$((PORT+1))
74+
done
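
For reference, a minimal invocation of this script might look like the following. The filename run_gsm8k_eval.sh is hypothetical (the committed path is not visible in this view), and the arguments shown are simply the script's defaults made explicit.

# Hypothetical script name; args: accuracy threshold, question count, starting port.
OUT_DIR=/tmp/vllm-scheduled ./run_gsm8k_eval.sh 0.25 1319 8040
# One result file per backend is written to ${OUT_DIR}/<model_tag>_<backend>.json;
# set -e aborts the run with a non-zero exit if any backend's accuracy
# falls below the threshold.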

.buildkite/test-amd.yaml

Lines changed: 0 additions & 1 deletion
@@ -1629,7 +1629,6 @@ steps:
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_4
   # grade: Blocking
-  gpu: h100
   optional: true
   num_gpus: 4
   working_dir: "/vllm-workspace"

vllm/distributed/eplb/rebalance_execute.py

Lines changed: 0 additions & 3 deletions
@@ -322,9 +322,6 @@ async def transfer_layer(
     num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
     assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
     assert num_physical_experts == ep_size * num_local_physical_experts
-    # A buffer to hold the expert weights in one layer during the exchange.
-    # NOTE: Currently we assume the same weights across different layers
-    # have the same shape.

     is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
         num_local_experts=num_local_physical_experts,
