|
#!/usr/bin/env bash
# Serves a model with `vllm serve` once per all2all backend, runs the GSM8K
# eval script against each server and fails if accuracy is below THRESHOLD.
#
# Environment:
#   OUT_DIR  directory for the per-backend result JSON files
#            (default: /tmp/vllm-scheduled)
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8040}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}

# Fail fast on malformed arguments: these values are later handed to Python
# and to shell arithmetic, and a typo should not be discovered only after the
# model server has been started.
number_re='^[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$'
integer_re='^[0-9]+$'
if ! [[ "${THRESHOLD}" =~ ${number_re} ]]; then
  echo "THRESHOLD must be a number, got '${THRESHOLD}'" >&2
  exit 2
fi
if ! [[ "${NUM_Q}" =~ ${integer_re} ]]; then
  echo "NUM_QUESTIONS must be a non-negative integer, got '${NUM_Q}'" >&2
  exit 2
fi
if ! [[ "${PORT}" =~ ${integer_re} ]]; then
  echo "START_PORT must be a non-negative integer, got '${PORT}'" >&2
  exit 2
fi
unset number_re integer_re

mkdir -p "${OUT_DIR}"
| 10 | + |
# Block until the server on 127.0.0.1:PORT answers GET /health successfully.
#
# Arguments:
#   $1 - port to poll
#   $2 - (optional) PID of the server process; when given, polling stops as
#        soon as that process no longer exists instead of running into the
#        timeout
#   $3 - (optional) overall timeout passed to timeout(1), default 600 seconds
# Returns:
#   0 once /health responds, 1 if the server process is gone,
#   124 (from timeout) if the deadline is reached.
wait_for_server() {
  local port=$1
  local server_pid=${2:-}
  local limit=${3:-600}
  # The port and PID are passed as positional parameters ($1/$2 of the inner
  # shell) rather than spliced into the script text.
  timeout "${limit}" bash -c '
    until curl -sf "http://127.0.0.1:$1/health" > /dev/null; do
      if [[ -n "$2" ]] && ! kill -0 "$2" 2> /dev/null; then
        echo "server (pid $2) exited before /health responded" >&2
        exit 1
      fi
      sleep 1
    done' wait_for_server "${port}" "${server_pid}"
}
| 18 | + |
# Model under test.
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

# Choose the all2all backends to exercise. The platform counts as ROCm when
# any of these holds: rocm-smi is on PATH, /opt/rocm exists, or ROCM_PATH is
# set to a non-empty value. The check below is the negation of that.
if ! command -v rocm-smi > /dev/null 2>&1 \
  && [[ ! -d /opt/rocm && -z "${ROCM_PATH:-}" ]]; then
  # No sign of ROCm: CUDA (or other) platform.
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
else
  # ROCm platform.
  BACKENDS=("allgather_reducescatter")
  # MOE padding is turned off on ROCm because it causes eplb to fail.
  export VLLM_ROCM_MOE_PADDING=0
fi
| 31 | + |
# Stop the background server recorded in SERVER_PID, if it is still running.
#
# Globals:
#   SERVER_PID (read) - PID of the server; may be unset or empty
# Behavior:
#   Sends SIGTERM, polls for up to 20 * 0.5 s for the process to go away,
#   sends SIGKILL only if it is still alive after that, then reaps it.
# Returns:
#   Always 0; every step is best-effort so it is safe inside the EXIT trap.
cleanup() {
  local pid=${SERVER_PID:-}
  if [[ -z "${pid}" ]] || ! kill -0 "${pid}" 2> /dev/null; then
    return 0
  fi

  kill "${pid}" 2> /dev/null || true
  local _
  for _ in {1..20}; do
    kill -0 "${pid}" 2> /dev/null || break
    sleep 0.5
  done

  # Escalate only when the process survived the grace period; signalling a
  # PID that has already exited risks hitting an unrelated, recycled PID.
  if kill -0 "${pid}" 2> /dev/null; then
    kill -9 "${pid}" 2> /dev/null || true
  fi

  # Reap the child so it is fully gone before the caller continues. The
  # status is ignored: it is non-zero for a signalled process, and also when
  # the PID is not a child of this shell.
  wait "${pid}" 2> /dev/null || true
}
trap cleanup EXIT
| 43 | + |
# Print a file-name-safe tag for a model id: every "/", ":" and whitespace
# character is replaced with "_".
#
# Arguments: $1 - model id
# Outputs:   the tag on stdout
model_tag() {
  local unsafe='[/:[:space:]]'
  printf '%s\n' "${1//${unsafe}/_}"
}

# Read the 'accuracy' field from a results JSON file, print it, and fail when
# it is below the threshold.
#
# Arguments: $1 - results JSON path, $2 - minimum accuracy,
#            $3 - model id, $4 - backend name (both only used in messages)
# Returns:   python3's exit status (non-zero when the assertion fails)
check_accuracy() {
  # Values travel via argv and the heredoc delimiter is quoted, so nothing
  # from the shell is interpolated into the Python source.
  python3 - "$1" "$2" "$3" "$4" <<'PY'
import json
import sys

results, threshold, model, backend = sys.argv[1:5]
with open(results) as fh:
    acc = json.load(fh)['accuracy']
print(f"{model} {backend}: accuracy {acc:.3f}")
assert acc >= float(threshold), f"{model} {backend} accuracy {acc}"
PY
}

# MODEL does not change between iterations, so derive the tag once.
TAG=$(model_tag "$MODEL")

# One server + eval run per backend; each run gets its own port.
for BACK in "${BACKENDS[@]}"; do
  # Launch the server in the background with the backend under test
  # selected through VLLM_ALL2ALL_BACKEND.
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND="$BACK" \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 4 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
    --port "$PORT" &
  SERVER_PID=$!
  # The PID is passed so the wait can stop early if the server dies; a
  # wait_for_server that only reads its first argument simply ignores it.
  wait_for_server "$PORT" "$SERVER_PID"

  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  # NOTE(review): the eval script path is relative, so this presumably has to
  # run from the repository root - confirm.
  python3 tests/evals/gsm8k/gsm8k_eval.py \
    --host http://127.0.0.1 \
    --port "$PORT" \
    --num-questions "${NUM_Q}" \
    --save-results "${OUT}"
  check_accuracy "${OUT}" "${THRESHOLD}" "${MODEL}" "${BACK}"

  # Stop this server and clear SERVER_PID so the EXIT trap does not act on a
  # stale PID, then move to the next port for the next backend.
  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
0 commit comments