
Commit 93b3c28

ci: Fixes for pre-built wheels (#214)
* build: Allow NGC builds
* reduce grid
* update grid
* fix
* upgrade cuda action
* remove test
* py3.8
* fix
* exclude
* fix
* torch-version
* py3.8/torch2.1/cuda12.3
* Update publish.yml
* fix grid
* fix
* cuda11.8
* no hopper for 118
* fix
* fix

Signed-off-by: oliver könig <[email protected]>
1 parent f8f4114 commit 93b3c28

File tree

8 files changed: +337 / -34 lines


.github/scripts/build.sh

Lines changed: 31 additions & 0 deletions

#!/bin/bash

set -eoxu pipefail

# We want setuptools >= 49.6.0 otherwise we can't compile the extension if system CUDA version is 11.7 and pytorch cuda version is 11.6
# https://github.com/pytorch/pytorch/blob/664058fa83f1d8eede5d66418abff6e20bd76ca8/torch/utils/cpp_extension.py#L810
# However this still fails so I am using a newer version of setuptools
pip install setuptools==68.0.0
pip install ninja packaging wheel
export PATH=/usr/local/cuda/bin:/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Limit MAX_JOBS otherwise the github runner goes OOM
export MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "129" ] && echo 1 || echo 2)
export NVCC_THREADS=2

export TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 9.0+PTX"
export DG_USE_LOCAL_VERSION=${DG_USE_LOCAL_VERSION:-0}

# 5h timeout since GH allows max 6h and we want some buffer
EXIT_CODE=0
timeout 5h python setup.py bdist_wheel --dist-dir=dist || EXIT_CODE=$?

if [ $EXIT_CODE -eq 0 ]; then
  tmpname=cu${WHEEL_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi$CXX11_ABI
  wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
  ls dist/*whl | xargs -I {} mv {} dist/${wheel_name}
  echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
fi

echo $EXIT_CODE
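The only non-obvious line above is the rename: sed replaces the second hyphen of the wheel filename with "+<tag>-", which splices the CUDA version, torch version, and C++11 ABI flag into the wheel's local version. A minimal sketch of that transformation, not part of the commit, using a hypothetical wheel filename and tag:

# Illustration only: how the sed rename in build.sh behaves on a hypothetical wheel filename.
tmpname=cu12torch2.8cxx11abiTRUE
echo "deep_gemm-2.2.5-cp310-cp310-linux_x86_64.whl" | sed "s/-/+$tmpname-/2"
# -> deep_gemm-2.2.5+cu12torch2.8cxx11abiTRUE-cp310-cp310-linux_x86_64.whl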
Lines changed: 65 additions & 0 deletions

#!/bin/bash

# Configuration
BASE_IMAGE="nvcr.io/nvidia/pytorch"
TAG_SUFFIX="-py3"
MONTHS_TO_CHECK=7  # Check current month and previous 6 months (total 7)

# Initialize an array to store existing tags
EXISTING_TAGS=()

echo "Checking for existence of the last ${MONTHS_TO_CHECK} NGC PyTorch images: ${BASE_IMAGE}:YY.MM${TAG_SUFFIX}"
echo "---------------------------------------------------------------------"

# Loop through the last N months
for i in $(seq 0 $((MONTHS_TO_CHECK - 1))); do
  # Calculate Year and Month for the tag
  CURRENT_YEAR=$(date +%Y)
  CURRENT_MONTH=$(date +%m)

  # Calculate target month and year
  TARGET_DATE=$(date -d "$CURRENT_YEAR-$CURRENT_MONTH-01 -$i months" +%y.%m)

  # Construct the full image tag and the tag-only string
  IMAGE_TAG="${TARGET_DATE}${TAG_SUFFIX}"
  FULL_IMAGE="${BASE_IMAGE}:${IMAGE_TAG}"

  echo "Checking: ${FULL_IMAGE}"

  # Use 'docker manifest inspect' to check for image existence without pulling.
  if docker manifest inspect "${FULL_IMAGE}" > /dev/null 2>&1; then
    echo "✅ EXISTS: Found."
    # Add the tag-only string to the array
    EXISTING_TAGS+=("nvcr.io/nvidia/pytorch:${IMAGE_TAG}")
  else
    echo "❌ MISSING: Not found."
  fi
done

echo "---------------------------------------------------------------------"

## JSON Output Generation
# This uses the collected array to build a JSON string.

# 1. Convert the shell array to a newline-separated string.
TAGS_NL_SEP=$(printf "%s\n" "${EXISTING_TAGS[@]}")

# 2. Use jq to read the newline-separated list and format it into a JSON array.
#    . | split("\n") | .[:-1] reads the input, splits it by newline, and removes the trailing empty element.
if command -v jq &> /dev/null; then
  JSON_STRING=$(echo -e "${TAGS_NL_SEP}" | jq -R -s 'split("\n") | .[:-1]')

  echo "Generated JSON String of Existing Tags:"
  echo "${JSON_STRING}"

  # Optional: Save the JSON string to a variable for further use
  # echo "JSON_STRING is now available in the shell if you source this script."
else
  echo "WARNING: 'jq' is not installed. Cannot format output as JSON."
  echo "Found Tags: ${EXISTING_TAGS[*]}"
fi

echo "---"
echo "Check complete."

echo "${JSON_STRING}" > ngc_images.json
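For reference, the jq step turns whatever tags were found into a JSON array that a later job could consume (the script writes it to ngc_images.json). A minimal sketch of that formatting step, not part of the commit, using two hypothetical tags (only 25.06-py3 appears elsewhere in this change):

# Illustration only: what the jq formatting produces for two hypothetical existing tags.
printf "%s\n" \
  "nvcr.io/nvidia/pytorch:25.06-py3" \
  "nvcr.io/nvidia/pytorch:25.05-py3" \
  | jq -R -s 'split("\n") | .[:-1]'
# Output (jq pretty-prints by default):
# [
#   "nvcr.io/nvidia/pytorch:25.06-py3",
#   "nvcr.io/nvidia/pytorch:25.05-py3"
# ]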

.github/scripts/test.sh

Lines changed: 6 additions & 0 deletions

#!/bin/bash

set -exou pipefail

pip install dist/*.whl
python -c "import deep_gemm; print(deep_gemm.__version__)"

.github/workflows/_build.yml

Lines changed: 19 additions & 29 deletions

@@ -53,6 +53,11 @@ jobs:
           ref: ${{ inputs.release-version }}
           submodules: recursive
 
+      - name: Checkout build scripts
+        uses: actions/checkout@v4
+        with:
+          path: build-scripts/
+
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
@@ -82,7 +87,7 @@ jobs:
 
       - name: Install CUDA ${{ inputs.cuda-version }}
         if: ${{ inputs.cuda-version != 'cpu' }}
-        uses: Jimver/[email protected]
+        uses: Jimver/[email protected]
         id: cuda-toolkit
         with:
           cuda: ${{ inputs.cuda-version }}
@@ -109,8 +114,8 @@ jobs:
           # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
           # This code is ugly, maybe there's a better way to do this.
           export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
-            minv = {'2.4': 118, '2.5': 118, '2.6': 118, '2.7': 118, '2.8': 126}[env['MATRIX_TORCH_VERSION']]; \
-            maxv = {'2.4': 124, '2.5': 124, '2.6': 126, '2.7': 128, '2.8': 129}[env['MATRIX_TORCH_VERSION']]; \
+            minv = {'2.1': 121, '2.4': 118, '2.5': 118, '2.6': 118, '2.7': 118, '2.8': 126}[env['MATRIX_TORCH_VERSION']]; \
+            maxv = {'2.1': 121, '2.4': 124, '2.5': 124, '2.6': 126, '2.7': 128, '2.8': 129}[env['MATRIX_TORCH_VERSION']]; \
             print(minv if int(env['MATRIX_CUDA_VERSION']) < 120 else maxv)" \
           )
           if [[ ${{ inputs.torch-version }} == *"dev"* ]]; then
@@ -156,39 +161,24 @@ jobs:
 
       - name: Build wheel
         id: build_wheel
+        env:
+          CXX11_ABI: ${{ inputs.cxx11_abi }}
+          MATRIX_TORCH_VERSION: ${{ env.MATRIX_TORCH_VERSION}}
+          WHEEL_CUDA_VERSION: ${{ env.WHEEL_CUDA_VERSION }}
+          MATRIX_PYTHON_VERSION: ${{ env.MATRIX_PYTHON_VERSION }}
+          DG_USE_LOCAL_VERSION: ${{ inputs.use-local-version && '1' || '0' }}
         run: |
-          # We want setuptools >= 49.6.0 otherwise we can't compile the extension if system CUDA version is 11.7 and pytorch cuda version is 11.6
-          # https://github.com/pytorch/pytorch/blob/664058fa83f1d8eede5d66418abff6e20bd76ca8/torch/utils/cpp_extension.py#L810
-          # However this still fails so I'm using a newer version of setuptools
-          pip install setuptools==75.8.0
-          pip install ninja packaging wheel
-          export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
-          export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-          # Limit MAX_JOBS otherwise the github runner goes OOM
-          # nvcc 11.8 can compile with 2 jobs, but nvcc 12.3 goes OOM
-
-          export MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "129" ] && echo 1 || echo 2)
-          export NVCC_THREADS=2
-          export TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 9.0+PTX"
-          export DG_USE_LOCAL_VERSION=${{ inputs.use-local-version && '1' || '0' }}
-
-          # 5h timeout since GH allows max 6h and we want some buffer
-          EXIT_CODE=0
-          timeout 5h python setup.py bdist_wheel --dist-dir=dist || EXIT_CODE=$?
-
-          if [ $EXIT_CODE -eq 0 ]; then
-            tmpname=cu${WHEEL_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ inputs.cxx11_abi }}
-            wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
-            ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
-            echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
-          fi
+          EXIT_CODE=$(bash build-scripts/.github/scripts/build.sh | tail -n 1)
 
           # Store exit code in GitHub env for later steps
           echo "build_exit_code=$EXIT_CODE" | tee -a "$GITHUB_OUTPUT"
 
-          # Do not fail the job if timeout killed the build
           exit $EXIT_CODE
 
+      - name: Log Built Wheels
+        run: |
+          ls dist
+
       - name: Log build logs after timeout
         if: always() && steps.build_wheel.outputs.build_exit_code == 124
         run: |
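The inline Python in the hunk above maps a (torch, CUDA) matrix entry to the CUDA version the wheel is tagged against: if the requested MATRIX_CUDA_VERSION is below 12.0 it uses the minimum supported CUDA for that torch release (minv), otherwise the maximum (maxv). A minimal sketch of that lookup, runnable outside the workflow; the two matrix values are hypothetical picks from the tables above:

# Illustration only: the minv/maxv lookup from the hunk above, run standalone.
MATRIX_TORCH_VERSION=2.1 MATRIX_CUDA_VERSION=121 python -c "from os import environ as env; \
  minv = {'2.1': 121, '2.4': 118, '2.5': 118, '2.6': 118, '2.7': 118, '2.8': 126}[env['MATRIX_TORCH_VERSION']]; \
  maxv = {'2.1': 121, '2.4': 124, '2.5': 124, '2.6': 126, '2.7': 128, '2.8': 129}[env['MATRIX_TORCH_VERSION']]; \
  print(minv if int(env['MATRIX_CUDA_VERSION']) < 120 else maxv)"
# Prints 121: torch 2.1 is pinned to cu121, so both bounds collapse to 121.
# With MATRIX_TORCH_VERSION=2.8 and MATRIX_CUDA_VERSION=118 it prints 126 (minv), since 118 < 120.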
Lines changed: 139 additions & 0 deletions

name: ~Build wheel template

on:
  workflow_call:
    inputs:
      runs-on:
        description: "The runner to use for the build"
        required: true
        type: string
      container-image:
        description: "Container image"
        required: true
        type: string
      upload-to-release:
        description: "Upload wheel to this release"
        required: false
        type: boolean
        default: false
      release-version:
        description: "Upload wheel to this release"
        required: false
        type: string

defaults:
  run:
    shell: bash -x -e -u -o pipefail {0}

jobs:
  build-wheel:
    runs-on: ${{ inputs.runs-on }}
    name: Build wheel (${{ inputs.container-image }})
    steps:
      - name: Move /var/lib/docker/
        run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"

      - name: Maximize build space
        uses: easimon/maximize-build-space@master
        with:
          root-reserve-mb: 5120
          temp-reserve-mb: 32
          swap-size-mb: 10240
          remove-dotnet: "true"
          remove-android: "true"
          remove-haskell: "true"
          remove-codeql: "true"
          build-mount-path: "/var/lib/docker/"

      - name: Restore /var/lib/docker/
        run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"

      - name: Checkout source
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.release-version }}
          submodules: recursive

      - name: Checkout build scripts
        uses: actions/checkout@v4
        with:
          path: build-scripts/

      - name: Build
        run: |
          echo "Free space:"
          df -h

      - name: Pull the container
        run: docker pull ${{ inputs.container-image }}

      - name: Set CUDA and PyTorch versions
        run: |
          cat <<'EOF' >> script.sh
          #!/bin/bash

          set -eoxu pipefail

          echo "MATRIX_CUDA_VERSION=$(echo $CUDA_VERSION | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
          echo "MATRIX_TORCH_VERSION=$NVIDIA_PYTORCH_VERSION" >> $GITHUB_ENV
          echo "WHEEL_CUDA_VERSION=$(echo $CUDA_VERSION | awk -F \. {'print $1'})" >> $GITHUB_ENV
          echo "MATRIX_PYTHON_VERSION=$(python -c "import sys; print('{}.{}'.format(sys.version_info[0], sys.version_info[1]))" | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
          echo "CXX11_ABI=$(python -c 'import torch; print(str(torch._C._GLIBCXX_USE_CXX11_ABI).upper())')" >> $GITHUB_ENV

          cat $GITHUB_ENV
          EOF

          docker run \
            --rm \
            --shm-size=64g \
            --workdir /workspace \
            --volume $(pwd):/workspace \
            --volume $GITHUB_ENV:$GITHUB_ENV \
            -e GITHUB_ENV=$GITHUB_ENV \
            ${{ inputs.container-image }} bash /workspace/script.sh

      - name: Build wheel
        id: build_wheel
        env:
          CXX11_ABI: ${{ env.CXX11_ABI }}
          MATRIX_TORCH_VERSION: ${{ env.MATRIX_TORCH_VERSION}}
          WHEEL_CUDA_VERSION: ${{ env.WHEEL_CUDA_VERSION }}
          MATRIX_PYTHON_VERSION: ${{ env.MATRIX_PYTHON_VERSION }}
        run: |
          EXIT_CODE=$(docker run \
            --rm \
            --shm-size=64g \
            --workdir /workspace \
            --volume $(pwd):/workspace \
            --volume $GITHUB_ENV:$GITHUB_ENV \
            -e PIP_CONSTRAINT= \
            -e GITHUB_ENV=$GITHUB_ENV \
            -e CXX11_ABI=$CXX11_ABI \
            -e MATRIX_TORCH_VERSION=$MATRIX_TORCH_VERSION \
            -e WHEEL_CUDA_VERSION=$WHEEL_CUDA_VERSION \
            -e MATRIX_PYTHON_VERSION=$MATRIX_PYTHON_VERSION \
            ${{ inputs.container-image }} bash /workspace/build-scripts/.github/scripts/build.sh | tail -n 1)

      - name: Log Built Wheels
        run: |
          ls dist

      - name: Get Release with tag
        id: get_current_release
        uses: joutvhu/get-release@v1
        with:
          tag_name: ${{ inputs.release-version }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload Release Asset
        id: upload_release_asset
        if: inputs.upload-to-release
        uses: actions/upload-release-asset@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          upload_url: ${{ steps.get_current_release.outputs.upload_url }}
          asset_path: ./dist/${{env.wheel_name}}
          asset_name: ${{env.wheel_name}}
          asset_content_type: application/*
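Both this containerized step and the bare-runner step in _build.yml capture the result the same way: build.sh prints the build's exit code as its final stdout line instead of failing outright, and the caller recovers it with tail -n 1. A minimal sketch of that hand-off, not part of the commit, using a hypothetical stand-in script:

# Illustration only: the exit-code hand-off the "Build wheel" steps rely on.
cat > /tmp/fake_build.sh <<'EOF'
#!/bin/bash
EXIT_CODE=0
false || EXIT_CODE=$?   # stand-in for a failed or timed-out setup.py build
echo "doing other cleanup work..."
echo $EXIT_CODE          # last stdout line carries the real result
EOF
EXIT_CODE=$(bash /tmp/fake_build.sh | tail -n 1)
echo "captured exit code: $EXIT_CODE"   # -> captured exit code: 1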
Lines changed: 34 additions & 0 deletions

name: Build wheels in a container

on:
  workflow_dispatch:
    inputs:
      runs-on:
        description: "The runner to use for the build"
        required: true
        type: string
        default: ubuntu-22.04
      container-image:
        description: "Container image"
        required: true
        type: string
      upload-to-release:
        description: "Upload wheel to this release"
        required: false
        type: boolean
        default: false
      release-version:
        description: "Upload wheel to this release"
        required: false
        type: string

  push:

jobs:
  build-wheels:
    uses: ./.github/workflows/_build_in_container.yml
    with:
      runs-on: ${{ inputs.runs-on || 'ubuntu-22.04' }}
      container-image: ${{ inputs.container-image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
      upload-to-release: ${{ inputs.upload-to-release || false }}
      release-version: ${{ inputs.release-version || 'v2.2.5' }}
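Because every input is exposed through workflow_dispatch (with the || fallbacks covering the plain push trigger, where inputs is empty), the build can also be kicked off by hand. A hedged sketch using the GitHub CLI, not part of the commit, assuming gh is authenticated against this repository; the values shown are just the defaults from above:

# Illustration only: dispatching this workflow manually with the GitHub CLI.
gh workflow run "Build wheels in a container" \
  -f runs-on=ubuntu-22.04 \
  -f container-image=nvcr.io/nvidia/pytorch:25.06-py3 \
  -f upload-to-release=true \
  -f release-version=v2.2.5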
