65 changes: 43 additions & 22 deletions docker/Dockerfile.nightly_torch
@@ -228,58 +228,79 @@ COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128
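
For context, torch_build_versions.txt is captured in the base stage so this image installs exactly the same nightly builds. A minimal sketch of what the $(cat torch_build_versions.txt | xargs) expansion produces, assuming one pinned requirement per line (the version strings here are made up):

    # Hypothetical file contents:
    #   torch==2.9.0.dev20250101+cu128
    #   torchvision==0.24.0.dev20250101+cu128
    #   torchaudio==2.9.0.dev20250101+cu128
    # xargs joins the lines, so the RUN step effectively executes:
    uv pip install --system \
        torch==2.9.0.dev20250101+cu128 \
        torchvision==0.24.0.dev20250101+cu128 \
        torchaudio==2.9.0.dev20250101+cu128 \
        --index-url https://download.pytorch.org/whl/nightly/cu128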

-# install the vllm wheel
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm-dist \
-    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system vllm-dist/*.whl --verbose
+# Install apache-tvm-ffi first (required by flashinfer)
+# TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system --prerelease=allow apache-tvm-ffi==0.1.0b15

-# install xformers again for the new environment
+# install xformers for the new environment
 RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \
     --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose

+# Install common dependencies (needed by flashinfer.aot build process)
+COPY requirements/common.txt requirements/common.txt
+COPY use_existing_torch.py use_existing_torch.py
+COPY pyproject.toml pyproject.toml
+RUN python3 use_existing_torch.py
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -r requirements/common.txt
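
use_existing_torch.py runs before the requirements install so that pinned torch entries don't clobber the preinstalled nightly wheels. Roughly, its effect is equivalent to this sketch (illustrative only, not the actual script):

    # Drop torch-family pins from a requirements file so uv keeps the
    # already-installed nightly builds (hypothetical one-liner equivalent).
    grep -vE '^(torch|torchvision|torchaudio)([=<>!~ ]|$)' requirements/common.txt > /tmp/common.txt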

 ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
+# FlashInfer requires space-separated arch list (different from torch's semicolon format)
+# CUDA 12.8 doesn't support sm_120 (cvt.e2m1x2 instruction not available)
+# Excluding only 12.0; sm_100a (Blackwell) compiles successfully
+ARG flashinfer_cuda_arch_list='8.0 8.6 8.9 9.0a 10.0a'
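
The two ARG lines encode overlapping architecture sets in different separator formats. If the base of the FlashInfer list were derived from torch's rather than maintained by hand, a bash sketch could look like this (hypothetical; the 9.0a/10.0a suffixes and the sm_120 exclusion would still need manual handling):

    torch_cuda_arch_list='8.0;8.6;8.9;9.0'
    # bash parameter expansion: replace every ';' with a space
    flashinfer_base="${torch_cuda_arch_list//;/ }"
    echo "$flashinfer_base"   # prints: 8.0 8.6 8.9 9.0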

-# install package for build flashinfer
+# install packages for building flashinfer
+# Versions must match FlashInfer's pyproject.toml build requirements
 # see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
-RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.post1
+RUN pip install 'setuptools>=77' 'packaging>=24' ninja==1.11.1.3 build==1.2.2.post1

 # build flashinfer for torch nightly from source around 10 mins
+# IMPORTANT: Build flashinfer BEFORE installing vllm wheel, so uv doesn't download
+# a prebuilt flashinfer that's incompatible with torch nightly
 # release version: v0.4.1
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
+# Build and install flashinfer in one step (wheel must exist when installing)
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
     echo "git clone flashinfer..." \
     && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
-    && git checkout v0.4.1\
+    && git checkout v0.4.1 \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
     && rm -rf build \
     && export TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} \
-    && FLASHINFER_ENABLE_AOT=1 python3 setup.py bdist_wheel --dist-dir=../flashinfer-dist --verbose \
+    && export FLASHINFER_CUDA_ARCH_LIST="${flashinfer_cuda_arch_list}" \
+    && export FLASHINFER_EXTRA_CUDAFLAGS="--ptxas-options=-w,--allow-expensive-optimizations=true" \
+    && echo "Building FlashInfer with TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}" \
+    && echo "Building FlashInfer with FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST}" \
+    && echo "Building FlashInfer with FLASHINFER_EXTRA_CUDAFLAGS=${FLASHINFER_EXTRA_CUDAFLAGS}" \
+    && echo "Running AOT compilation..." \
+    && python3 -m flashinfer.aot \
+    && echo "AOT compilation completed successfully!" \
+    && echo "Building wheel with python build module..." \
+    && python3 -m build --no-isolation --wheel --outdir /tmp/flashinfer-dist . \
+    && echo "Wheel build completed! Contents:" \
+    && ls -lah /tmp/flashinfer-dist/ \
+    && echo "Installing flashinfer wheel..." \
+    && uv pip install --system --prerelease=allow /tmp/flashinfer-dist/*.whl --verbose \
     && cd .. \
     && rm -rf flashinfer
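
A sanity check one could append after this step (not part of the PR): import the freshly built wheel against the nightly torch so an ABI mismatch surfaces at build time rather than at container runtime. This assumes flashinfer exposes __version__, as recent releases do:

    # Hypothetical verification step after the AOT build.
    python3 -c "import torch, flashinfer; print(torch.__version__, flashinfer.__version__)"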

-# install flashinfer
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-dist/*.whl --verbose
-
-# install common packages
-COPY requirements/common.txt requirements/common.txt
-COPY use_existing_torch.py use_existing_torch.py
-COPY pyproject.toml pyproject.toml
+# install the vllm wheel (flashinfer already installed, so uv won't download it)
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm-dist \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system --prerelease=allow vllm-dist/*.whl --verbose
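
To confirm that installing the vllm wheel reused the locally built FlashInfer instead of pulling one from an index, the resolved distribution could be inspected (assuming the distribution name flashinfer-python, as in recent FlashInfer releases):

    # Hypothetical check: Version/Location should match the v0.4.1
    # source build from the previous step.
    uv pip show flashinfer-python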

+# Copy examples and benchmarks (common.txt already installed earlier)
 COPY examples examples
 COPY benchmarks benchmarks
 COPY ./vllm/collect_env.py .

-RUN python3 use_existing_torch.py
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/common.txt
-
 ################### VLLM INSTALLED IMAGE ####################

