Commit acab8b5

Merge pull request #324 from docker/feat/vllm-prebuilt-wheels

Use vllm official wheels

2 parents cb40663 + f9cb21c

File tree: 3 files changed (+76, −4 lines)

.github/workflows/release.yml (3 additions, 1 deletion)

```diff
@@ -136,12 +136,14 @@ jobs:
         with:
           file: Dockerfile
           target: final-vllm
-          platforms: linux/amd64
+          platforms: linux/amd64, linux/arm64
           build-args: |
             "LLAMA_SERVER_VERSION=${{ inputs.llamaServerVersion }}"
             "LLAMA_SERVER_VARIANT=cuda"
             "BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04"
             "VLLM_VERSION=${{ inputs.vllmVersion }}"
+            "VLLM_CUDA_VERSION=cu129"
+            "VLLM_PYTHON_TAG=cp38-abi3"
           push: true
           sbom: true
           provenance: mode=max
```

Dockerfile (12 additions, 3 deletions)

```diff
@@ -79,7 +79,10 @@ ENTRYPOINT ["/app/model-runner"]
 # --- vLLM variant ---
 FROM llamacpp AS vllm
 
-ARG VLLM_VERSION
+ARG VLLM_VERSION=0.11.0
+ARG VLLM_CUDA_VERSION=cu129
+ARG VLLM_PYTHON_TAG=cp38-abi3
+ARG TARGETARCH
 
 USER root
 
@@ -89,10 +92,16 @@ RUN mkdir -p /opt/vllm-env && chown -R modelrunner:modelrunner /opt/vllm-env
 
 USER modelrunner
 
-# Install uv and vLLM as modelrunner user
+# Install uv and vLLM wheel as modelrunner user
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
     && ~/.local/bin/uv venv --python /usr/bin/python3 /opt/vllm-env \
-    && ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "vllm==${VLLM_VERSION}"
+    && if [ "$TARGETARCH" = "amd64" ]; then \
+         WHEEL_ARCH="manylinux1_x86_64"; \
+       else \
+         WHEEL_ARCH="manylinux2014_aarch64"; \
+       fi \
+    && WHEEL_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}%2B${VLLM_CUDA_VERSION}-${VLLM_PYTHON_TAG}-${WHEEL_ARCH}.whl" \
+    && ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "$WHEEL_URL"
 
 RUN /opt/vllm-env/bin/python -c "import vllm; print(vllm.__version__)" > /opt/vllm-env/version
```
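The wheel URL assembled in the Dockerfile follows the standard wheel filename pattern (`{name}-{version}+{local}-{python tag}-{platform tag}.whl`), with the `+` before the CUDA suffix percent-encoded as `%2B` for the download URL. A minimal Python sketch of the same selection logic (the function name and arch mapping here are illustrative, mirroring the Dockerfile's `TARGETARCH` branch):

```python
# Sketch of the wheel-URL construction performed in the Dockerfile's RUN step.
# Mirrors the if/else on TARGETARCH: amd64 gets manylinux1_x86_64,
# everything else falls through to manylinux2014_aarch64.

WHEEL_ARCH = {
    "amd64": "manylinux1_x86_64",      # linux/amd64
    "arm64": "manylinux2014_aarch64",  # linux/arm64
}

def vllm_wheel_url(version="0.11.0", cuda="cu129",
                   py_tag="cp38-abi3", targetarch="amd64"):
    # Default to the aarch64 wheel for any non-amd64 arch, as the Dockerfile does.
    arch = WHEEL_ARCH.get(targetarch, "manylinux2014_aarch64")
    # The "+" in the local version (e.g. 0.11.0+cu129) is encoded as %2B in the URL.
    return (
        "https://github.com/vllm-project/vllm/releases/download/"
        f"v{version}/vllm-{version}%2B{cuda}-{py_tag}-{arch}.whl"
    )
```

For example, `vllm_wheel_url(targetarch="arm64")` yields the `manylinux2014_aarch64` wheel URL for vLLM 0.11.0.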

README.md (61 additions, 0 deletions)

````diff
@@ -228,6 +228,67 @@ Available variants:
 
 The binary path in the image follows this pattern: `/com.docker.llama-server.native.linux.${LLAMA_SERVER_VARIANT}.${TARGETARCH}`
 
+### vLLM integration
+
+The Docker image also supports vLLM as an alternative inference backend.
+
+#### Building the vLLM variant
+
+To build a Docker image with vLLM support:
+
+```sh
+# Build with default settings (vLLM 0.11.0)
+make docker-build DOCKER_TARGET=final-vllm BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04 LLAMA_SERVER_VARIANT=cuda
+
+# Build for a specific architecture
+docker buildx build \
+  --platform linux/amd64 \
+  --target final-vllm \
+  --build-arg BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04 \
+  --build-arg LLAMA_SERVER_VARIANT=cuda \
+  --build-arg VLLM_VERSION=0.11.0 \
+  -t docker/model-runner:vllm .
+```
+
+#### Build Arguments
+
+The vLLM variant supports the following build arguments:
+
+- **VLLM_VERSION**: The vLLM version to install (default: `0.11.0`)
+- **VLLM_CUDA_VERSION**: The CUDA version suffix for the wheel (default: `cu129`)
+- **VLLM_PYTHON_TAG**: The Python compatibility tag (default: `cp38-abi3`, compatible with Python 3.8+)
+
+#### Multi-Architecture Support
+
+The vLLM variant supports both x86_64 (amd64) and aarch64 (arm64) architectures. The build process automatically selects the appropriate prebuilt wheel:
+
+- **linux/amd64**: uses `manylinux1_x86_64` wheels
+- **linux/arm64**: uses `manylinux2014_aarch64` wheels
+
+To build for multiple architectures:
+
+```sh
+docker buildx build \
+  --platform linux/amd64,linux/arm64 \
+  --target final-vllm \
+  --build-arg BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04 \
+  --build-arg LLAMA_SERVER_VARIANT=cuda \
+  -t docker/model-runner:vllm .
+```
+
+#### Updating to a New vLLM Version
+
+To update to a new vLLM version:
+
+```sh
+docker buildx build \
+  --target final-vllm \
+  --build-arg VLLM_VERSION=0.11.1 \
+  -t docker/model-runner:vllm-0.11.1 .
+```
+
+The vLLM wheels are sourced from the official vLLM GitHub releases at `https://github.com/vllm-project/vllm/releases`, which provide prebuilt wheels for each release version.
+
 ## API Examples
 
 The Model Runner exposes a REST API that can be accessed via TCP port. You can interact with it using curl commands.
````
