
Commit 325ce59

Fix torch nightly
logs .. ... ...

1 parent 938a816 commit 325ce59

1 file changed, 40 insertions(+), 20 deletions(-)


docker/Dockerfile.nightly_torch

@@ -228,58 +228,78 @@ COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128
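
Note: torch_build_versions.txt is copied from the base stage (see the hunk header above) and carries the exact nightly pins, so this stage reinstalls the very same builds. The version strings below are hypothetical, purely to illustrate the xargs expansion:

    # Hypothetical contents of torch_build_versions.txt (illustrative only):
    #   torch==2.9.0.dev20250101+cu128
    #   torchvision==0.24.0.dev20250101+cu128
    # $(cat torch_build_versions.txt | xargs) flattens the file into one
    # argument list, so the step above effectively runs:
    #   uv pip install --system torch==2.9.0.dev20250101+cu128 \
    #       torchvision==0.24.0.dev20250101+cu128 \
    #       --index-url https://download.pytorch.org/whl/nightly/cu128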

-# install the vllm wheel
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm-dist \
-    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system vllm-dist/*.whl --verbose
+# Install apache-tvm-ffi first (required by flashinfer)
+# TODO: remove apache-tvm-ffi once FlashInfer is fixed: https://github.com/flashinfer-ai/flashinfer/issues/1962
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system --prerelease=allow apache-tvm-ffi==0.1.0b15
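
Note: since this workaround must be in place before the flashinfer build later in the file, a build-time assertion can confirm the exact beta resolved. A minimal sketch, not part of the commit:

    # Sketch only: fail the image build early if the pin did not resolve as expected.
    RUN python3 -c "import importlib.metadata as md; v = md.version('apache-tvm-ffi'); assert v == '0.1.0b15', f'unexpected apache-tvm-ffi: {v}'; print('apache-tvm-ffi', v)"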
 
-# install xformers again for the new environment
+# install xformers for the new environment
 RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \
     --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose
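
Note: the bind mount is this Dockerfile's recurring pattern for consuming wheels built in earlier stages: the base stage's xformers-dist directory is mounted only for the duration of this RUN, so the wheel never becomes an image layer. The heavier alternative, shown for contrast only (a sketch, not in the commit):

    # COPY persists the wheel into the image's layer history before installing it.
    COPY --from=base /workspace/xformers-dist /vllm-workspace/xformers-dist
    RUN uv pip install --system /vllm-workspace/xformers-dist/*.whl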

+# Install common dependencies (needed by the flashinfer.aot build process)
+COPY requirements/common.txt requirements/common.txt
+COPY use_existing_torch.py use_existing_torch.py
+COPY pyproject.toml pyproject.toml
+RUN python3 use_existing_torch.py
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -r requirements/common.txt
+
 ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
+# FlashInfer requires a space-separated arch list (unlike torch's semicolon-separated format)
+# CUDA 12.8 doesn't support sm_120 (the cvt.e2m1x2 instruction is not available)
+# Excluding only 12.0 - sm_100a (Blackwell) compiles successfully
+ARG flashinfer_cuda_arch_list='8.0 8.6 8.9 9.0a 10.0a'
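
Note: the hand-written arch list is deliberate. A mechanical conversion from torch's semicolon format, sketched below, would produce '8.0 8.6 8.9 9.0', losing the arch-specific '9.0a' variant and the added '10.0a' (Blackwell) target:

    # Sketch only: naive conversion of the torch list; NOT what the commit uses.
    flashinfer_cuda_arch_list="$(printf '%s' "${torch_cuda_arch_list}" | tr ';' ' ')"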
 
 # install packages needed to build flashinfer
 # see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
 RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.post1
 
-
 # build flashinfer from source for torch nightly (takes around 10 mins)
+# IMPORTANT: build flashinfer BEFORE installing the vllm wheel, so uv doesn't download
+# a prebuilt flashinfer that's incompatible with torch nightly
 # release version: v0.4.1
 # todo(elainewy): cache the flashinfer build result for faster builds
 ENV CCACHE_DIR=/root/.cache/ccache
+# Build and install flashinfer in one step (the wheel must exist when it is installed)
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
     echo "git clone flashinfer..." \
     && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
-    && git checkout v0.4.1\
+    && git checkout v0.4.1 \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
     && rm -rf build \
     && export TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} \
-    && FLASHINFER_ENABLE_AOT=1 python3 setup.py bdist_wheel --dist-dir=../flashinfer-dist --verbose \
+    && export FLASHINFER_CUDA_ARCH_LIST="${flashinfer_cuda_arch_list}" \
+    && export FLASHINFER_EXTRA_CUDAFLAGS="--ptxas-options=-w,--allow-expensive-optimizations=true" \
+    && echo "Building FlashInfer with TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}" \
+    && echo "Building FlashInfer with FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST}" \
+    && echo "Building FlashInfer with FLASHINFER_EXTRA_CUDAFLAGS=${FLASHINFER_EXTRA_CUDAFLAGS}" \
+    && echo "Running AOT compilation..." \
+    && python3 -m flashinfer.aot \
+    && echo "AOT compilation completed successfully!" \
+    && echo "Building wheel with python build module..." \
+    && python3 -m build --no-isolation --wheel --outdir /tmp/flashinfer-dist . \
+    && echo "Wheel build completed! Contents:" \
+    && ls -lah /tmp/flashinfer-dist/ \
+    && echo "Installing flashinfer wheel..." \
+    && uv pip install --system --prerelease=allow /tmp/flashinfer-dist/*.whl --verbose \
     && cd .. \
     && rm -rf flashinfer
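
Note: because the wheel is built, installed, and its source tree deleted inside a single RUN, an incompatible AOT build would otherwise surface only at container runtime. A possible follow-up layer to catch it at build time, assuming flashinfer exposes a __version__ attribute (a sketch, not part of the commit):

    # Sketch only: smoke-test the freshly built flashinfer against torch nightly.
    RUN python3 -c "import torch, flashinfer; print('torch', torch.__version__); print('flashinfer', getattr(flashinfer, '__version__', 'unknown'))"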
 
-# install flashinfer
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-dist/*.whl --verbose
-
-# install common packages
-COPY requirements/common.txt requirements/common.txt
-COPY use_existing_torch.py use_existing_torch.py
-COPY pyproject.toml pyproject.toml
+# install the vllm wheel (flashinfer is already installed, so uv won't download it)
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm-dist \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system --prerelease=allow vllm-dist/*.whl --verbose
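
Note: --prerelease=allow is needed here because vllm's requirements must be satisfied by packages already installed as prereleases (torch nightly, the apache-tvm-ffi beta), which uv's resolver rejects by default. If the policy should apply to every uv invocation instead, a sketch of that variant, assuming UV_PRERELEASE is uv's env-var equivalent of the flag:

    # Sketch only: env-based form; note ENV persists for all later steps,
    # which is why the commit scopes the flag per command instead.
    ENV UV_PRERELEASE=allow
    RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm-dist \
        uv pip install --system vllm-dist/*.whl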
 
+# Copy examples and benchmarks (common.txt was already installed earlier)
 COPY examples examples
 COPY benchmarks benchmarks
 COPY ./vllm/collect_env.py .
 
-RUN python3 use_existing_torch.py
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/common.txt
-
 ################### VLLM INSTALLED IMAGE ####################
 
 