Commit 0c8aacc (parent: d03aac3)

use vllm official wheels

File tree

3 files changed: +97 −4 lines

.github/workflows/release.yml

Lines changed: 9 additions & 1 deletion

```diff
@@ -24,6 +24,11 @@ on:
       required: false
       type: string
       default: "0.11.0"
+    vllmCommitSha:
+      description: 'vLLM commit SHA (from git rev-list -n 1 v{version})'
+      required: false
+      type: string
+      default: "b8b302cde434df8c9289a2b465406b47ebab1c2d"
 
 jobs:
   test:
@@ -124,12 +129,15 @@ jobs:
       with:
         file: Dockerfile
         target: final-vllm
-        platforms: linux/amd64
+        platforms: linux/amd64, linux/arm64
         build-args: |
           "LLAMA_SERVER_VERSION=${{ inputs.llamaServerVersion }}"
           "LLAMA_SERVER_VARIANT=cuda"
           "BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04"
           "VLLM_VERSION=${{ inputs.vllmVersion }}"
+          "VLLM_COMMIT_SHA=${{ inputs.vllmCommitSha }}"
+          "VLLM_CUDA_VERSION=cu129"
+          "VLLM_PYTHON_TAG=cp38-abi3"
         push: true
         sbom: true
         provenance: mode=max
```
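The `vllmCommitSha` workflow input is a free-form string with no validation in the workflow itself. A minimal pre-flight check a dispatcher could run before triggering the workflow might look like this (the helper name is hypothetical, not part of this repository):

```python
import re

# A full git commit SHA is exactly 40 lowercase hex characters; the
# wheel URL built from this input will 404 for anything else.
FULL_SHA = re.compile(r"^[0-9a-f]{40}$")

def is_full_commit_sha(value: str) -> bool:
    """Return True if value looks like a full 40-character commit SHA."""
    return bool(FULL_SHA.fullmatch(value))
```

Note that an abbreviated SHA (as printed by `git log --oneline`) would be rejected; `git rev-list -n 1 v{version}` already prints the full form.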

Dockerfile

Lines changed: 13 additions & 3 deletions

```diff
@@ -79,7 +79,11 @@ ENTRYPOINT ["/app/model-runner"]
 # --- vLLM variant ---
 FROM llamacpp AS vllm
 
-ARG VLLM_VERSION
+ARG VLLM_VERSION=0.11.0
+ARG VLLM_COMMIT_SHA=b8b302cde434df8c9289a2b465406b47ebab1c2d
+ARG VLLM_CUDA_VERSION=cu129
+ARG VLLM_PYTHON_TAG=cp38-abi3
+ARG TARGETARCH
 
 USER root
 
@@ -89,10 +93,16 @@ RUN mkdir -p /opt/vllm-env && chown -R modelrunner:modelrunner /opt/vllm-env
 
 USER modelrunner
 
-# Install uv and vLLM as modelrunner user
+# Install uv and vLLM wheel as modelrunner user
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
     && ~/.local/bin/uv venv --python /usr/bin/python3 /opt/vllm-env \
-    && ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "vllm==${VLLM_VERSION}"
+    && if [ "$TARGETARCH" = "amd64" ]; then \
+        WHEEL_ARCH="manylinux1_x86_64"; \
+    else \
+        WHEEL_ARCH="manylinux2014_aarch64"; \
+    fi \
+    && WHEEL_URL="https://wheels.vllm.ai/${VLLM_COMMIT_SHA}/vllm-${VLLM_VERSION}%2B${VLLM_CUDA_VERSION}-${VLLM_PYTHON_TAG}-${WHEEL_ARCH}.whl" \
+    && ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "$WHEEL_URL"
 
 RUN /opt/vllm-env/bin/python -c "import vllm; print(vllm.__version__)" > /opt/vllm-env/version
```

README.md

Lines changed: 75 additions & 0 deletions

````diff
@@ -228,6 +228,81 @@ Available variants:
 
 The binary path in the image follows this pattern: `/com.docker.llama-server.native.linux.${LLAMA_SERVER_VARIANT}.${TARGETARCH}`
 
+### vLLM integration
+
+The Docker image also supports vLLM as an alternative inference backend.
+
+#### Building the vLLM variant
+
+To build a Docker image with vLLM support:
+
+```sh
+# Build with default settings (vLLM 0.11.0)
+make docker-build DOCKER_TARGET=final-vllm BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04 LLAMA_SERVER_VARIANT=cuda
+
+# Build for specific architecture
+docker buildx build \
+  --platform linux/amd64 \
+  --target final-vllm \
+  --build-arg BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04 \
+  --build-arg LLAMA_SERVER_VARIANT=cuda \
+  --build-arg VLLM_VERSION=0.11.0 \
+  --build-arg VLLM_COMMIT_SHA=b8b302cde434df8c9289a2b465406b47ebab1c2d \
+  -t docker/model-runner:vllm .
+```
+
+#### Build Arguments
+
+The vLLM variant supports the following build arguments:
+
+- **VLLM_VERSION**: The vLLM version to install (default: `0.11.0`)
+- **VLLM_COMMIT_SHA**: The git commit SHA corresponding to the vLLM version (default: `b8b302cde434df8c9289a2b465406b47ebab1c2d` for v0.11.0)
+- **VLLM_CUDA_VERSION**: The CUDA version suffix for the wheel (default: `cu129`)
+- **VLLM_PYTHON_TAG**: The Python compatibility tag (default: `cp38-abi3`, compatible with Python 3.8+)
+
+#### Multi-Architecture Support
+
+The vLLM variant supports both x86_64 (amd64) and aarch64 (arm64) architectures. The build process automatically selects the appropriate prebuilt wheel:
+
+- **linux/amd64**: Uses `manylinux1_x86_64` wheels
+- **linux/arm64**: Uses `manylinux2014_aarch64` wheels
+
+To build for multiple architectures:
+
+```sh
+docker buildx build \
+  --platform linux/amd64,linux/arm64 \
+  --target final-vllm \
+  --build-arg BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04 \
+  --build-arg LLAMA_SERVER_VARIANT=cuda \
+  -t docker/model-runner:vllm .
+```
+
+#### Updating to a New vLLM Version
+
+To update to a new vLLM version, you need to:
+
+1. **Find the commit SHA for the version:**
+   ```sh
+   # Clone the vLLM repository (if not already cloned)
+   git clone https://github.com/vllm-project/vllm.git
+   cd vllm
+
+   # Get the commit SHA for a specific version
+   git rev-list -n 1 v0.11.1
+   ```
+
+2. **Build with the new version:**
+   ```sh
+   docker buildx build \
+     --target final-vllm \
+     --build-arg VLLM_VERSION=0.11.1 \
+     --build-arg VLLM_COMMIT_SHA=<commit-sha-from-step-1> \
+     -t docker/model-runner:vllm-0.11.1 .
+   ```
+
+The vLLM wheels are sourced from the official vLLM wheel repository at `https://wheels.vllm.ai/{commit_sha}/vllm/`, which provides prebuilt wheels for every commit.
+
 ## API Examples
 
 The Model Runner exposes a REST API that can be accessed via TCP port. You can interact with it using curl commands.
````
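One detail worth noting about the build arguments above: `VLLM_PYTHON_TAG=cp38-abi3` actually bundles two PEP 427 wheel-filename components, the Python tag (`cp38`) and the ABI tag (`abi3`). A small sketch that splits a wheel filename into its components (helper name is illustrative, not part of this repository):

```python
def parse_wheel_filename(filename: str) -> dict:
    """Split a simple wheel filename into its PEP 427 components.

    Assumes the basic {name}-{version}-{python}-{abi}-{platform}.whl
    shape with no optional build tag, which holds for the vLLM wheels
    used here.
    """
    stem = filename.removesuffix(".whl")
    name, version, python_tag, abi_tag, platform_tag = stem.split("-")
    return {
        "name": name,
        "version": version,          # may carry a +cu129 local segment
        "python_tag": python_tag,    # cp38
        "abi_tag": abi_tag,          # abi3: stable ABI, hence Python 3.8+
        "platform_tag": platform_tag,
    }
```

The `abi3` stable-ABI tag is what lets a single wheel cover every CPython from 3.8 upward, as the Build Arguments list states.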
