
Commit 6c7c895

rebase main
Signed-off-by: daishixun <[email protected]>
2 parents 59c9b0e + 178ca16

File tree: 426 files changed, +35156 additions, -10973 deletions


.github/Dockerfile.buildwheel

Lines changed: 3 additions & 1 deletion
@@ -15,13 +15,15 @@
 # This file is a part of the vllm-ascend project.
 #
 ARG PY_VERSION=3.11
-FROM quay.io/ascend/manylinux:8.3.rc1-910b-manylinux_2_28-py${PY_VERSION}
+FROM quay.io/ascend/manylinux:8.3.rc2-910b-manylinux_2_28-py${PY_VERSION}

 ARG COMPILE_CUSTOM_KERNELS=1
+ARG SOC_VERSION="ascend910b1"

 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
 ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
+ENV SOC_VERSION=$SOC_VERSION
 RUN yum update -y && \
     yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
     rm -rf /var/cache/yum
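
The new SOC_VERSION build argument is exported into the build environment so the target SoC can be chosen when the wheel image is built. A minimal local invocation sketch, assuming the image is built from the repository root with Docker; the tag name is illustrative and the values shown are simply the defaults from the Dockerfile above:

    docker build \
      -f .github/Dockerfile.buildwheel \
      --build-arg PY_VERSION=3.11 \
      --build-arg SOC_VERSION=ascend910b1 \
      --build-arg COMPILE_CUSTOM_KERNELS=1 \
      -t vllm-ascend-wheel-builder:local .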

.github/workflows/_e2e_nightly_multi_node.yaml

Lines changed: 85 additions & 15 deletions
@@ -15,7 +15,7 @@ on:
         required: false
         type: string
         description: base image for pods
-        default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11"
+        default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11"
       config_file_path:
         required: true
         type: string
@@ -32,7 +32,7 @@ on:
         description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
       vllm_version:
         required: false
-        default: "v0.11.0"
+        default: "v0.12.0"
         type: string
         description: vllm version to use
       vllm_ascend_remote_url:
@@ -60,7 +60,7 @@ defaults:
 # only cancel in-progress runs of the same workflow
 # and ignore the lint / 8 cards test type
 concurrency:
-  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.config_file_path }}
+  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}
   cancel-in-progress: true

 jobs:
@@ -69,7 +69,7 @@ jobs:
     # This is the runner with no NPU for k8s controller
     runs-on: ${{ inputs.runner }}
     container:
-      image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11
+      image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
       env:
         KUBECONFIG: /tmp/kubeconfig
         KUBECTL: /root/.cache/.kube/kubectl
@@ -106,7 +106,7 @@ jobs:
           echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG

       - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6.0.0

       - name: Prepare scripts
         run: |
@@ -115,8 +115,39 @@

       - name: Clear resources
         run: |
-          # pre clear the crd resources created by lws
-          kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found
+          set -euo pipefail
+
+          CRD_NAME="${CRD_NAME:-vllm}"
+          TIMEOUT=${TIMEOUT:-120}
+          SLEEP_INTERVAL=2
+
+          echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..."
+          kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found
+
+          echo "Waiting for all pods starting with 'vllm' to be deleted..."
+          START_TIME=$(date +%s)
+
+          while true; do
+            NOW=$(date +%s)
+            ELAPSED=$((NOW - START_TIME))
+
+            if [[ $ELAPSED -ge $TIMEOUT ]]; then
+              echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
+              kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true
+              exit 1
+            fi
+
+            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true)
+
+            if [[ -z "$PODS_EXIST" ]]; then
+              echo "All vllm pods deleted."
+              break
+            else
+              echo "Waiting for pods to be deleted: $PODS_EXIST"
+              sleep $SLEEP_INTERVAL
+            fi
+          done
+
       - name: Launch cluster
         id: launcher
         run: |
@@ -164,19 +195,58 @@

       - name: Waiting for pod ready
         run: |
-          echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..."
+          POD_PREFIX="${POD_PREFIX:-vllm-0}"
+          SIZE="${{ inputs.size }}"
+          TIMEOUT=1200 # default timeout 20 minutes
+
+          echo "Waiting for Pods in namespace [$NAMESPACE] to become Running and Ready (timeout ${TIMEOUT}s)..."
+
+          START_TIME=$(date +%s)

           while true; do
-            # get pod status
-            READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}')
+            NOW=$(date +%s)
+            ELAPSED=$((NOW - START_TIME))
+            if [[ $ELAPSED -ge $TIMEOUT ]]; then
+              echo "Timeout reached after ${ELAPSED}s"
+              echo "Dumping pod status for debugging:"
+              kubectl get pods -n "$NAMESPACE"
+              kubectl describe pod "$LEADER_POD" -n "$NAMESPACE"
+              exit 1
+            fi
+
+            # 1) check follower pods
+            ALL_FOLLOWERS_READY=true
+            for ((i=1; i<SIZE; i++)); do
+              POD="${POD_PREFIX}-${i}"
+              PHASE=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
+              READY=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)
+
+              echo "Follower [$POD] phase=$PHASE ready=$READY"

-            if [[ "$READY_STATUS" == "true" ]]; then
-              echo "Pod [$LEADER_POD] is Ready!"
+              if [[ "$PHASE" != "Running" || "$READY" != "true" ]]; then
+                echo "Follower [$POD] not Ready yet..."
+                ALL_FOLLOWERS_READY=false
+                break
+              fi
+            done
+
+            # 2) check leader pod
+            LEADER_PHASE=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
+            LEADER_READY=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)
+
+            echo "Leader [$LEADER_POD] phase=$LEADER_PHASE ready=$LEADER_READY"
+
+            if [[ "$LEADER_PHASE" != "Running" || "$LEADER_READY" != "true" ]]; then
+              echo "Leader not Ready yet..."
+              ALL_FOLLOWERS_READY=false
+            fi
+
+            if [[ "$ALL_FOLLOWERS_READY" == "true" ]]; then
+              echo "All follower pods and leader pod are Running and Ready — continuing."
               break
-            else
-              echo "Pod [$LEADER_POD] not ready, waiting..."
-              sleep 3
             fi
+
+            sleep 2
           done

       - name: Stream logs
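
For debugging a run outside CI, the same kubectl probes that the workflow loops over can be issued by hand. A short sketch, assuming kubectl access to the target cluster and that NAMESPACE and LEADER_POD are exported in the shell (the workflow provides both from its own environment):

    # Phase and container readiness of the leader pod, as polled by the wait loop
    kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}'
    kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}'

    # Any vllm-prefixed pods still present after the "Clear resources" teardown
    kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | grep '^vllm' || true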

.github/workflows/_e2e_nightly_single_node.yaml

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ on:
       image:
         required: false
         type: string
-        default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11"
+        default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11"
       tests:
         required: true
         type: string

.github/workflows/_e2e_nightly_single_node_models.yaml

Lines changed: 14 additions & 10 deletions
@@ -59,7 +59,7 @@ jobs:
     name: ${{inputs.model_list}} accuracy test
     runs-on: ${{ inputs.runner }}
     container:
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
+      image: "${{ inputs.image }}"
       env:
         VLLM_USE_MODELSCOPE: True
         GHA_VLLM_ASCEND_VERSION: ${{ inputs.vllm-ascend }}
@@ -78,15 +78,15 @@
          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/

       - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6.0.0

       - name: Install system dependencies
         run: |
           apt-get -y install `cat packages.txt`
           apt-get -y install gcc g++ cmake libnuma-dev

       - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6.0.0
         with:
           repository: vllm-project/vllm
           ref: ${{ inputs.vllm }}
@@ -108,11 +108,14 @@
         if: ${{ inputs.runner == 'linux-aarch64-a2-4' && contains(inputs.model_list, 'Qwen3-Next-80B-A3B-Instruct') }}
         shell: bash -l {0}
         run: |
-          wget -q https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/Ascend-BiSheng-toolkit_aarch64.run -O /tmp/Ascend-BiSheng-toolkit_aarch64.run
-          chmod a+x /tmp/Ascend-BiSheng-toolkit_aarch64.run
-          /tmp/Ascend-BiSheng-toolkit_aarch64.run --install
-          . /usr/local/Ascend/8.3.RC1/bisheng_toolkit/set_env.sh
-          python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev20250914-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl"
+          . /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
+          python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl"
+
+      - name: Install tensorflow (for Molmo-7B-D-0924)
+        if: ${{ inputs.runner == 'linux-aarch64-a2-1' && contains(inputs.model_list, 'Molmo-7B-D-0924') }}
+        shell: bash -l {0}
+        run: |
+          pip install tensorflow --no-cache-dir

       - name: Resolve vllm-ascend version
         run: |
@@ -132,7 +135,7 @@
           echo "GHA_VLLM_ASCEND_VERSION=$RESOLVED_VERSION" >> $GITHUB_ENV

       - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6.0.0
         with:
           repository: vllm-project/vllm-ascend
           path: ./vllm-ascend
@@ -175,6 +178,7 @@
         id: report
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
+          HF_DATASETS_OFFLINE: True
           VLLM_USE_MODELSCOPE: True
           VLLM_CI_RUNNER: ${{ inputs.runner }}
           VLLM_VERSION: ${{ env.GHA_VLLM_VERSION }}
@@ -225,4 +229,4 @@
           path: ./benchmarks/accuracy/
           if-no-files-found: warn
           retention-days: 90
-        overwrite: true
+        overwrite: true
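
The BiSheng step above now sources an already-installed toolkit environment instead of downloading and running an installer, and it pins a newer triton_ascend wheel. A rough local reproduction sketch, assuming an aarch64 Python 3.11 environment and the CANN 8.3.RC2 toolkit installed at the path used in the workflow (the pip show name is inferred from the wheel filename):

    set -e
    . /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
    python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl"
    python3 -m pip show triton-ascend   # confirm the wheel was registered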

0 commit comments
