@@ -228,58 +228,78 @@ COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128

-# install the vllm wheel
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm-dist \
-    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system vllm-dist/*.whl --verbose
+# Install apache-tvm-ffi first (required by flashinfer)
+# TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system --prerelease=allow apache-tvm-ffi==0.1.0b15

-# install xformers again for the new environment
+# install xformers for the new environment
 RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \
     --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose

+# Install common dependencies (needed by flashinfer.aot build process)
+COPY requirements/common.txt requirements/common.txt
+COPY use_existing_torch.py use_existing_torch.py
+COPY pyproject.toml pyproject.toml
+RUN python3 use_existing_torch.py
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -r requirements/common.txt
+
 ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
+# FlashInfer requires space-separated arch list (different from torch's semicolon format)
+# CUDA 12.8 doesn't support sm_120 (cvt.e2m1x2 instruction not available)
+# Excluding only 12.0 - sm_100a (Blackwell) compiles successfully
+ARG flashinfer_cuda_arch_list='8.0 8.6 8.9 9.0a 10.0a'

 # install package for build flashinfer
 # see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
 RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.post1

-
 # build flashinfer for torch nightly from source around 10 mins
+# IMPORTANT: Build flashinfer BEFORE installing vllm wheel, so uv doesn't download
+# a prebuilt flashinfer that's incompatible with torch nightly
 # release version: v0.4.1
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
+# Build and install flashinfer in one step (wheel must exist when installing)
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
     echo "git clone flashinfer..." \
     && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
-    && git checkout v0.4.1\
+    && git checkout v0.4.1 \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
     && rm -rf build \
     && export TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} \
-    && FLASHINFER_ENABLE_AOT=1 python3 setup.py bdist_wheel --dist-dir=../flashinfer-dist --verbose \
+    && export FLASHINFER_CUDA_ARCH_LIST="${flashinfer_cuda_arch_list}" \
+    && export FLASHINFER_EXTRA_CUDAFLAGS="--ptxas-options=-w,--allow-expensive-optimizations=true" \
+    && echo "Building FlashInfer with TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}" \
+    && echo "Building FlashInfer with FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST}" \
+    && echo "Building FlashInfer with FLASHINFER_EXTRA_CUDAFLAGS=${FLASHINFER_EXTRA_CUDAFLAGS}" \
+    && echo "Running AOT compilation..." \
+    && python3 -m flashinfer.aot \
+    && echo "AOT compilation completed successfully!" \
+    && echo "Building wheel with python build module..." \
+    && python3 -m build --no-isolation --wheel --outdir /tmp/flashinfer-dist . \
+    && echo "Wheel build completed! Contents:" \
+    && ls -lah /tmp/flashinfer-dist/ \
+    && echo "Installing flashinfer wheel..." \
+    && uv pip install --system --prerelease=allow /tmp/flashinfer-dist/*.whl --verbose \
     && cd .. \
     && rm -rf flashinfer

-# install flashinfer
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-dist/*.whl --verbose
-
-# install common packages
-COPY requirements/common.txt requirements/common.txt
-COPY use_existing_torch.py use_existing_torch.py
-COPY pyproject.toml pyproject.toml
+# install the vllm wheel (flashinfer already installed, so uv won't download it)
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm-dist \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system --prerelease=allow vllm-dist/*.whl --verbose

+# Copy examples and benchmarks (common.txt already installed earlier)
 COPY examples examples
 COPY benchmarks benchmarks
 COPY ./vllm/collect_env.py .

-RUN python3 use_existing_torch.py
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/common.txt
-
 ################### VLLM INSTALLED IMAGE ####################

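For context on the new flashinfer_cuda_arch_list ARG in the hunk above: torch's TORCH_CUDA_ARCH_LIST uses semicolons while FlashInfer expects a space-separated list, and the value in the diff is additionally hand-edited (the a suffixes for the arch-specific sm_90a/sm_100a targets, and 12.0 dropped because CUDA 12.8 lacks cvt.e2m1x2). A minimal shell sketch of only the mechanical separator conversion, assuming the same arch values as above:

    # hypothetical helper, not part of this Dockerfile: swap semicolons for spaces
    torch_cuda_arch_list='8.0;8.6;8.9;9.0'
    flashinfer_cuda_arch_list="${torch_cuda_arch_list//;/ }"   # -> '8.0 8.6 8.9 9.0'
    echo "${flashinfer_cuda_arch_list}"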
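Because the flashinfer wheel is now built and installed before the vllm wheel (so resolving vllm's dependencies cannot pull in a prebuilt flashinfer compiled against stable torch), one way to confirm the ordering worked is to import both packages in the finished image. This is an illustrative smoke test, not part of the change; the image tag is a placeholder:

    # hypothetical check of the final image
    docker run --rm <vllm-nightly-image> \
        python3 -c 'import torch, flashinfer; print(torch.__version__, torch.version.cuda, flashinfer.__version__)'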