
Commit 0f6d078

Andrew Xia committed: Merge branch 'main' into reasoning-item-input
2 parents: 8b6b5e3 + be263f7

102 files changed: +2289 −1285 lines


.buildkite/test-pipeline.yaml

Lines changed: 8 additions & 6 deletions
@@ -478,10 +478,11 @@ steps:
   - vllm/
   - tests/compile
   commands:
+  # fp8 kv scales not supported on sm89, tested on Blackwell instead
   - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
   # Limit to no custom ops to reduce running time
   # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 - label: Cudagraph test
   timeout_in_minutes: 20
@@ -925,7 +926,7 @@ steps:
   - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
   - pytest -v -s tests/kernels/moe/test_flashinfer.py

-- label: Blackwell Fusion Tests # 30 min
+- label: Blackwell Fusion and Compile Tests # 30 min
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
   gpu: b200
@@ -946,7 +947,9 @@ steps:
   - pytest -v -s tests/compile/test_fusion_all_reduce.py
   # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
   # Wrap with quotes to escape yaml
-  - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+  - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+  - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile

 - label: Blackwell Fusion E2E Tests # 30 min
   timeout_in_minutes: 40
@@ -969,8 +972,6 @@ steps:
   - nvidia-smi
   # Run all e2e fusion tests
   - pytest -v -s tests/compile/test_fusions_e2e.py
-  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-  - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile

 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
@@ -1266,7 +1267,8 @@ steps:
   - pytest -v -s tests/compile/test_async_tp.py
   - pytest -v -s tests/compile/test_sequence_parallelism.py
   - pytest -v -s tests/compile/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+  - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
+  - pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/v1/distributed/test_dbo.py
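Pytest's -k flag takes a boolean expression over test names (and/or/not plus substring matching), which is why the updated commands select with "not ..." rather than a literal "-" prefix; as the comment in the pipeline notes, starting the -k string with "-" also collides with YAML and CLI parsing. A minimal sketch of the selection semantics, using hypothetical test names and paths:

```python
# Sketch of pytest -k selection semantics (hypothetical names/paths).
# "TRITON and not quant_fp8" keeps tests whose names contain "TRITON"
# and filters out those whose names also contain "quant_fp8".
import pytest

if __name__ == "__main__":
    # Equivalent to: pytest -v -k "TRITON and not quant_fp8" tests/compile/
    raise SystemExit(
        pytest.main(["-v", "-k", "TRITON and not quant_fp8", "tests/compile/"])
    )
```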

.github/workflows/macos-smoke-test.yml

Lines changed: 4 additions & 1 deletion
@@ -11,9 +11,12 @@ jobs:
   steps:
   - uses: actions/checkout@v4

-  - uses: astral-sh/setup-uv@v4
+  - uses: astral-sh/setup-uv@v7
     with:
       enable-cache: true
+      cache-dependency-glob: |
+        requirements/**/*.txt
+        pyproject.toml
       python-version: '3.12'

   - name: Install dependencies
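The new cache-dependency-glob entries key setup-uv's cache on the dependency manifests, so the cache is rebuilt only when one of those files changes. A rough illustration of which files the patterns cover (pathlib stands in for the action's internal hashing, which is not shown here):

```python
# Rough illustration (not the action's code): files matched by the globs
# "requirements/**/*.txt" and "pyproject.toml" relative to the repo root.
from pathlib import Path

repo = Path(".")
dependency_files = sorted(repo.glob("requirements/**/*.txt"))
dependency_files.append(repo / "pyproject.toml")
for path in dependency_files:
    print(path)
```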

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -861,7 +861,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()

   # Hadacore kernels
-  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
   if(HADACORE_ARCHS)
     set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
     set_gencode_flags_for_srcs(
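The "+PTX" suffix embeds forward-compatible PTX alongside the compiled SASS, letting GPUs without a prebuilt binary (such as the dropped 8.9 target) JIT-compile the kernel at load time. A quick way to see the distinction in a PyTorch build (illustrative; the exact entries depend on how the wheel was built):

```python
# Illustrative: list the CUDA targets a PyTorch build ships. Entries like
# "sm_80" are prebuilt SASS; "compute_XX" entries are embedded PTX that
# newer GPUs can JIT-compile, which is what a "+PTX" arch flag produces.
import torch

print(torch.cuda.get_arch_list())  # e.g. ['sm_80', 'sm_90', 'compute_90']
```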

csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu

Lines changed: 1 addition & 1 deletion
@@ -802,7 +802,7 @@ torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace) {
   });

   if (numel % 256 != 0) {
-    out = out.index({torch::indexing::Slice(0, numel / had_size)});
+    out = out.narrow(0, 0, numel / had_size);
   }

   if (inplace && out.data_ptr() != x.data_ptr()) {
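Tensor narrow(dim, start, length) returns the same no-copy view as the basic slice it replaces, so the behavior of hadacore_transform is unchanged; the narrow call simply avoids the generic TensorIndex machinery. An illustrative Python equivalent (not vLLM code):

```python
# Illustrative (not vLLM code): narrow(0, 0, n) yields the same view over
# the first n rows as basic slicing; neither copies data.
import torch

x = torch.arange(12).reshape(6, 2)
n = 4
sliced = x[:n]                # old pattern: index with a slice
narrowed = x.narrow(0, 0, n)  # new pattern: narrow along dim 0
assert torch.equal(sliced, narrowed)
assert narrowed.data_ptr() == x.data_ptr()  # both are views of x
```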

docker/Dockerfile.rocm

Lines changed: 1 addition & 4 deletions
@@ -17,10 +17,7 @@ RUN python3 -m pip install --upgrade pip
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"

 # Install UV
-RUN curl -LsSf https://astral.sh/uv/install.sh | sh
-
-# Activate virtual environment and add uv to PATH
-ENV PATH="/root/.local/bin:$PATH"
+RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694

docker/Dockerfile.rocm_base

Lines changed: 5 additions & 2 deletions
@@ -1,4 +1,4 @@
-ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
+ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete
 ARG TRITON_BRANCH="57c693b6"
 ARG TRITON_REPO="https://github.com/ROCm/triton.git"
 ARG PYTORCH_BRANCH="1c57644d"
@@ -7,7 +7,7 @@ ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="0e60e394"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="9716b1b8"
+ARG AITER_BRANCH="59bd8ff2"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"

 FROM ${BASE_IMAGE} AS base
@@ -19,6 +19,9 @@ ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx11
 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
 ENV AITER_ROCM_ARCH=gfx942;gfx950

+# Required for RCCL in ROCm7.1
+ENV HSA_NO_SCRATCH_RECLAIM=1
+
 ARG PYTHON_VERSION=3.12

 RUN mkdir -p /app

docs/README.md

Lines changed: 2 additions & 2 deletions
@@ -30,8 +30,8 @@ Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at
 Where to get started with vLLM depends on the type of user. If you are looking to:

 - Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md)
-- Build applications with vLLM, we recommend starting with the [User Guide](./usage)
-- Build vLLM, we recommend starting with [Developer Guide](./contributing)
+- Build applications with vLLM, we recommend starting with the [User Guide](./usage/README.md)
+- Build vLLM, we recommend starting with [Developer Guide](./contributing/README.md)

 For information about the development of vLLM, see:
docs/cli/bench/latency.md

Lines changed: 2 additions & 2 deletions
@@ -4,6 +4,6 @@

 --8<-- "docs/cli/json_tip.inc.md"

-## Options
+## Arguments

---8<-- "docs/argparse/bench_latency.md"
+--8<-- "docs/argparse/bench_latency.inc.md"

docs/cli/bench/serve.md

Lines changed: 2 additions & 2 deletions
@@ -4,6 +4,6 @@

 --8<-- "docs/cli/json_tip.inc.md"

-## Options
+## Arguments

---8<-- "docs/argparse/bench_serve.md"
+--8<-- "docs/argparse/bench_serve.inc.md"

docs/cli/bench/sweep/plot.md

Lines changed: 2 additions & 2 deletions
@@ -4,6 +4,6 @@

 --8<-- "docs/cli/json_tip.inc.md"

-## Options
+## Arguments

---8<-- "docs/argparse/bench_sweep_plot.md"
+--8<-- "docs/argparse/bench_sweep_plot.inc.md"
