
Commit 0f6d078

Andrew Xia committed: Merge branch 'main' into reasoning-item-input
2 parents: 8b6b5e3 + be263f7

102 files changed: +2289 −1285 lines


.buildkite/test-pipeline.yaml

Lines changed: 8 additions & 6 deletions
@@ -478,10 +478,11 @@ steps:
   - vllm/
   - tests/compile
   commands:
+  # fp8 kv scales not supported on sm89, tested on Blackwell instead
   - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
   # Limit to no custom ops to reduce running time
   # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 - label: Cudagraph test
   timeout_in_minutes: 20
@@ -925,7 +926,7 @@ steps:
   - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
   - pytest -v -s tests/kernels/moe/test_flashinfer.py

-- label: Blackwell Fusion Tests # 30 min
+- label: Blackwell Fusion and Compile Tests # 30 min
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
   gpu: b200
@@ -946,7 +947,9 @@ steps:
   - pytest -v -s tests/compile/test_fusion_all_reduce.py
   # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
   # Wrap with quotes to escape yaml
-  - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+  - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+  - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile

 - label: Blackwell Fusion E2E Tests # 30 min
   timeout_in_minutes: 40
@@ -969,8 +972,6 @@ steps:
   - nvidia-smi
   # Run all e2e fusion tests
   - pytest -v -s tests/compile/test_fusions_e2e.py
-  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-  - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile

 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
@@ -1266,7 +1267,8 @@ steps:
   - pytest -v -s tests/compile/test_async_tp.py
   - pytest -v -s tests/compile/test_sequence_parallelism.py
   - pytest -v -s tests/compile/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+  - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
+  - pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/v1/distributed/test_dbo.py
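Pytest's -k flag takes a boolean expression over test names (and/or/not plus substring matching), which is why the updated commands select with "not ..." rather than a literal "-" prefix; as the comment in the pipeline notes, starting the -k string with "-" also collides with YAML and CLI parsing. A minimal sketch of the selection semantics, using hypothetical test names and paths:

```python
# Sketch of pytest -k selection semantics (hypothetical names/paths).
# "TRITON and not quant_fp8" keeps tests whose names contain "TRITON"
# and filters out those whose names also contain "quant_fp8".
import pytest

if __name__ == "__main__":
    # Equivalent to: pytest -v -k "TRITON and not quant_fp8" tests/compile/
    raise SystemExit(
        pytest.main(["-v", "-k", "TRITON and not quant_fp8", "tests/compile/"])
    )
```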

.github/workflows/macos-smoke-test.yml

Lines changed: 4 additions & 1 deletion
@@ -11,9 +11,12 @@ jobs:
   steps:
   - uses: actions/checkout@v4

-  - uses: astral-sh/setup-uv@v4
+  - uses: astral-sh/setup-uv@v7
     with:
       enable-cache: true
+      cache-dependency-glob: |
+        requirements/**/*.txt
+        pyproject.toml
       python-version: '3.12'

   - name: Install dependencies
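The new cache-dependency-glob entries key setup-uv's cache on the dependency manifests, so the cache is rebuilt only when one of those files changes. A rough illustration of which files the patterns cover (pathlib stands in for the action's internal hashing, which is not shown here):

```python
# Rough illustration (not the action's code): files matched by the globs
# "requirements/**/*.txt" and "pyproject.toml" relative to the repo root.
from pathlib import Path

repo = Path(".")
dependency_files = sorted(repo.glob("requirements/**/*.txt"))
dependency_files.append(repo / "pyproject.toml")
for path in dependency_files:
    print(path)
```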

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -861,7 +861,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()

   # Hadacore kernels
-  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
   if(HADACORE_ARCHS)
     set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
     set_gencode_flags_for_srcs(
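The "+PTX" suffix embeds forward-compatible PTX alongside the compiled SASS, letting GPUs without a prebuilt binary (such as the dropped 8.9 target) JIT-compile the kernel at load time. A quick way to see the distinction in a PyTorch build (illustrative; the exact entries depend on how the wheel was built):

```python
# Illustrative: list the CUDA targets a PyTorch build ships. Entries like
# "sm_80" are prebuilt SASS; "compute_XX" entries are embedded PTX that
# newer GPUs can JIT-compile, which is what a "+PTX" arch flag produces.
import torch

print(torch.cuda.get_arch_list())  # e.g. ['sm_80', 'sm_90', 'compute_90']
```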

csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu

Lines changed: 1 addition & 1 deletion
@@ -802,7 +802,7 @@ torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace) {
   });

   if (numel % 256 != 0) {
-    out = out.index({torch::indexing::Slice(0, numel / had_size)});
+    out = out.narrow(0, 0, numel / had_size);
   }

   if (inplace && out.data_ptr() != x.data_ptr()) {
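Tensor narrow(dim, start, length) returns the same no-copy view as the basic slice it replaces, so the behavior of hadacore_transform is unchanged; the narrow call simply avoids the generic TensorIndex machinery. An illustrative Python equivalent (not vLLM code):

```python
# Illustrative (not vLLM code): narrow(0, 0, n) yields the same view over
# the first n rows as basic slicing; neither copies data.
import torch

x = torch.arange(12).reshape(6, 2)
n = 4
sliced = x[:n]                # old pattern: index with a slice
narrowed = x.narrow(0, 0, n)  # new pattern: narrow along dim 0
assert torch.equal(sliced, narrowed)
assert narrowed.data_ptr() == x.data_ptr()  # both are views of x
```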

docker/Dockerfile.rocm

Lines changed: 1 addition & 4 deletions
@@ -17,10 +17,7 @@ RUN python3 -m pip install --upgrade pip
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"

 # Install UV
-RUN curl -LsSf https://astral.sh/uv/install.sh | sh
-
-# Activate virtual environment and add uv to PATH
-ENV PATH="/root/.local/bin:$PATH"
+RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694

docker/Dockerfile.rocm_base

Lines changed: 5 additions & 2 deletions
@@ -1,4 +1,4 @@
-ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
+ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete
 ARG TRITON_BRANCH="57c693b6"
 ARG TRITON_REPO="https://github.com/ROCm/triton.git"
 ARG PYTORCH_BRANCH="1c57644d"
@@ -7,7 +7,7 @@ ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="0e60e394"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="9716b1b8"
+ARG AITER_BRANCH="59bd8ff2"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"

 FROM ${BASE_IMAGE} AS base
@@ -19,6 +19,9 @@ ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx11
 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
 ENV AITER_ROCM_ARCH=gfx942;gfx950

+# Required for RCCL in ROCm7.1
+ENV HSA_NO_SCRATCH_RECLAIM=1
+
 ARG PYTHON_VERSION=3.12

 RUN mkdir -p /app

docs/README.md

Lines changed: 2 additions & 2 deletions
@@ -30,8 +30,8 @@ Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at
 Where to get started with vLLM depends on the type of user. If you are looking to:

 - Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md)
-- Build applications with vLLM, we recommend starting with the [User Guide](./usage)
-- Build vLLM, we recommend starting with [Developer Guide](./contributing)
+- Build applications with vLLM, we recommend starting with the [User Guide](./usage/README.md)
+- Build vLLM, we recommend starting with [Developer Guide](./contributing/README.md)

 For information about the development of vLLM, see:
docs/cli/bench/latency.md

Lines changed: 2 additions & 2 deletions
@@ -4,6 +4,6 @@

 --8<-- "docs/cli/json_tip.inc.md"

-## Options
+## Arguments

---8<-- "docs/argparse/bench_latency.md"
+--8<-- "docs/argparse/bench_latency.inc.md"

docs/cli/bench/serve.md

Lines changed: 2 additions & 2 deletions
@@ -4,6 +4,6 @@

 --8<-- "docs/cli/json_tip.inc.md"

-## Options
+## Arguments

---8<-- "docs/argparse/bench_serve.md"
+--8<-- "docs/argparse/bench_serve.inc.md"

docs/cli/bench/sweep/plot.md

Lines changed: 2 additions & 2 deletions
@@ -4,6 +4,6 @@

 --8<-- "docs/cli/json_tip.inc.md"

-## Options
+## Arguments

---8<-- "docs/argparse/bench_sweep_plot.md"
+--8<-- "docs/argparse/bench_sweep_plot.inc.md"
