Commit a5cac32

Merge branch 'main' into 096_CTMethod
2 parents: 26a08e6 + c0dfc89

File tree: 5 files changed (+131, -56 lines)

.buildkite/test-amd.yaml

Lines changed: 80 additions & 26 deletions
@@ -39,9 +39,9 @@ steps:
   # if this test fails, it means the nightly torch version is not compatible with some
   # of the dependencies. Please check the error message and add the package to whitelist
   # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
   soft_fail: true
   source_file_dependencies:
   - requirements/nightly_torch_test.txt
@@ -50,9 +50,9 @@ steps:

 - label: Async Engine, Inputs, Utils, Worker Test # 10min
   timeout_in_minutes: 15
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
   source_file_dependencies:
   - vllm/
   - tests/multimodal
@@ -63,9 +63,9 @@ steps:

 - label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
   timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
   source_file_dependencies:
   - vllm/
   - tests/test_inputs.py
@@ -115,9 +115,9 @@ steps:
   - pytest -v -s basic_correctness/test_cpu_offload.py

 - label: Entrypoints Unit Tests # 5min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
   timeout_in_minutes: 10
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -214,6 +214,7 @@ steps:
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@@ -252,9 +253,9 @@ steps:
   - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

 - label: EPLB Algorithm Test # 5min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
   timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
@@ -341,9 +342,9 @@ steps:

 - label: V1 Test entrypoints # 35min
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
   source_file_dependencies:
   - vllm/
   - tests/v1
@@ -391,6 +392,20 @@ steps:
   commands:
   - pytest -v -s v1/attention

+- label: Batch Invariance Tests (H100) # 10min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+  - vllm/
+  - tests/v1/determinism/
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pip install pytest-timeout pytest-forked
+  - pytest -v -s v1/determinism/test_batch_invariance.py
+  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
 - label: V1 Test attention (B200) # 10min
   timeout_in_minutes: 30
   gpu: b200
@@ -401,9 +416,9 @@ steps:
   - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this

 - label: V1 Test others (CPU) # 5 mins
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
   source_file_dependencies:
   - vllm/
   - tests/v1
@@ -495,7 +510,7 @@ steps:

 - label: PyTorch Compilation Unit Tests # 15min
   timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   torch_nightly: true
@@ -512,7 +527,7 @@ steps:

 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   torch_nightly: true
@@ -568,7 +583,7 @@ steps:

 - label: Kernels Attention Test %N # 23min
   timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_8
   # grade: Blocking
   source_file_dependencies:
@@ -595,7 +610,7 @@ steps:

 - label: Kernels MoE Test %N # 40min
   timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_8
   # grade: Blocking
   source_file_dependencies:
@@ -622,6 +637,26 @@ steps:
   commands:
   - pytest -v -s kernels/mamba

+- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
+  # Not replicating for CUTLAS & CuTe
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+  - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+  - pytest -v -s kernels/moe/test_deepgemm.py
+  - pytest -v -s kernels/moe/test_batched_deepgemm.py
+  - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
 - label: Model Executor Test # 23min
   timeout_in_minutes: 35
   torch_nightly: true
@@ -1055,6 +1090,7 @@ steps:
   - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
   - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
+  - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

 - label: Blackwell Fusion and Compile Tests # 30 min
   timeout_in_minutes: 40
@@ -1064,11 +1100,19 @@ steps:
   - csrc/quantization/fp4/
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
   - vllm/compilation/
   # can affect pattern matching
   - vllm/model_executor/layers/layernorm.py
   - vllm/model_executor/layers/activation.py
   - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - vllm/model_executor/layers/fused_moe/layer.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
   commands:
   - nvidia-smi
   - pytest -v -s tests/compile/test_fusion_attn.py
@@ -1079,7 +1123,7 @@ steps:
   # Wrap with quotes to escape yaml
   - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
   # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-  - pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile
+  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

 - label: Blackwell Fusion E2E Tests # 30 min
   timeout_in_minutes: 40
@@ -1101,7 +1145,7 @@ steps:
   commands:
   - nvidia-smi
   # Run all e2e fusion tests
-  - pytest -v -s tests/compile/test_fusions_e2e.py
+  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

 - label: ROCm GPT-OSS Eval
   timeout_in_minutes: 60
@@ -1216,6 +1260,7 @@ steps:
   - tests/v1/worker/test_worker_memory_snapshot.py
   commands:
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
@@ -1251,7 +1296,7 @@ steps:

 - label: Plugin Tests (2 GPUs) # 40min
   timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_2
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
@@ -1327,7 +1372,7 @@ steps:

 - label: Weight Loading Multiple GPU Test # 33min
   timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_2
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
@@ -1432,7 +1477,7 @@ steps:
   - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
   #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
   - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-  - pytest -v -s tests/compile/distributed/test_sequence_parallel.py
+  - pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/v1/distributed/test_dbo.py
@@ -1464,7 +1509,7 @@ steps:
   - bash .buildkite/scripts/run-prime-rl-test.sh

 - label: DeepSeek V2-Lite Accuracy
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
   # grade: Blocking
   timeout_in_minutes: 60
@@ -1475,8 +1520,8 @@ steps:
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

-- label: Qwen3-30B-A3B-FP8-block Accuracy
-  mirror_hardwares: [amdexperimental]
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
   # grade: Blocking
   timeout_in_minutes: 60
@@ -1486,3 +1531,12 @@ steps:
   working_dir: "/vllm-workspace"
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
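
Most edits to this file follow a single recurring pattern: widening mirror_hardwares so a step is also mirrored to the amdproduction and/or amdtentative pools, and activating the previously commented-out "# grade: Blocking" as "grade: Blocking". For orientation, here is a minimal sketch of the step shape these fields belong to, in the same style as the file above; the label, dependency paths, and command are hypothetical, and the annotations are a reading of this diff rather than documented semantics:

- label: Example Kernel Test # 10min       # hypothetical step name
  timeout_in_minutes: 15                   # per-step timeout
  # AMD hardware pools this step is mirrored onto:
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1                      # MI325 agent pool that runs the mirror
  grade: Blocking                          # once uncommented, a failure appears to block the run
  source_file_dependencies:                # step runs only when one of these paths changes
  - vllm/
  - tests/example/                         # hypothetical path
  commands:
  - pytest -v -s example                   # hypothetical command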

csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu

Lines changed: 26 additions & 13 deletions
@@ -15,6 +15,8 @@
  */

 #include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
+#include "cutlass_extensions/common.hpp"

 #if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
 void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
@@ -32,23 +34,34 @@ void cutlass_scaled_fp4_mm_sm120a(torch::Tensor& D, torch::Tensor const& A,
                                   torch::Tensor const& alpha);
 #endif

-void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A,
-                           torch::Tensor const& B, torch::Tensor const& A_sf,
-                           torch::Tensor const& B_sf,
-                           torch::Tensor const& alpha) {
-#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
-  return cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha);
-#elif defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
-  return cutlass_scaled_fp4_mm_sm120a(D, A, B, A_sf, B_sf, alpha);
+void cutlass_scaled_fp4_mm(torch::Tensor& D, const torch::Tensor& A,
+                           const torch::Tensor& B, const torch::Tensor& A_sf,
+                           const torch::Tensor& B_sf,
+                           const torch::Tensor& alpha) {
+  // Make sure we're on A's device.
+  const c10::cuda::OptionalCUDAGuard device_guard(device_of(A));
+  const int32_t sm = get_sm_version_num();
+
+#if defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100
+  if (sm >= 100 && sm < 120) {
+    cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha);
+    return;
+  }
+#endif
+
+#if defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120
+  if (sm >= 120 && sm < 130) {
+    cutlass_scaled_fp4_mm_sm120a(D, A, B, A_sf, B_sf, alpha);
+    return;
+  }
 #endif
-  TORCH_CHECK_NOT_IMPLEMENTED(false,
-                              "No compiled nvfp4 mm kernel, vLLM should "
-                              "be compiled using CUDA 12.8 and target "
-                              "compute capability 100 or above.");
+
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 mm kernel for SM ", sm,
+                              ". Recompile with CUDA >= 12.8 and CC >= 100.");
 }

 bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability) {
   int runtimeVersion;
   cudaRuntimeGetVersion(&runtimeVersion);
   return cuda_device_capability >= 100 && runtimeVersion >= 12080;
-}
+}
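
The substantive change here swaps a compile-time #if/#elif chain, which could only ever reach one kernel per build, for runtime dispatch keyed on the device's SM version: a binary compiled with both SM100 and SM120 kernels now picks the right one per device, a mismatched device falls through to the error instead of being routed into the wrong kernel, and the error message reports the SM it actually saw. A minimal self-contained sketch of that pattern follows; the build flags, get_sm_version_num() stub, and kernel bodies are stand-ins (the real helper lives in cutlass_extensions/common.hpp), not vLLM's implementations:

#include <cstdint>
#include <cstdio>

// Stand-ins for the build flags and kernels; in vLLM these come from the
// build system and the SM100/SM120 CUTLASS kernels.
#define ENABLE_NVFP4_SM100 1
#define ENABLE_NVFP4_SM120 1
static void mm_sm100a() { std::puts("dispatched to SM100 kernel"); }
static void mm_sm120a() { std::puts("dispatched to SM120 kernel"); }

// Stand-in for get_sm_version_num() from cutlass_extensions/common.hpp.
static int32_t get_sm_version_num() { return 100; }  // pretend we see SM 10.0

void scaled_fp4_mm_dispatch() {
  const int32_t sm = get_sm_version_num();
  // Each branch is compiled in only when its kernel was built; choosing
  // among the compiled-in kernels happens at runtime, per device.
#if defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100
  if (sm >= 100 && sm < 120) { mm_sm100a(); return; }
#endif
#if defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120
  if (sm >= 120 && sm < 130) { mm_sm120a(); return; }
#endif
  std::fprintf(stderr, "no compiled nvfp4 mm kernel for SM %d\n", sm);
}

int main() { scaled_fp4_mm_dispatch(); }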

tests/v1/e2e/test_spec_decode.py

Lines changed: 2 additions & 2 deletions
@@ -191,8 +191,8 @@ def test_suffix_decoding_acceptance(
     # Expect the acceptance rate to improve.
     assert first_accept_rate < last_accept_rate

-    # Heuristic: expect at least 85% acceptance rate at the end.
-    assert last_accept_rate > 0.85
+    # Heuristic: expect at least 82.5% acceptance rate at the end.
+    assert last_accept_rate > 0.825

     del spec_llm
     torch.cuda.empty_cache()

vllm/v1/core/sched/async_scheduler.py

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ def _update_after_schedule(
         # in this scheduling step.
         request.num_output_placeholders += 1 + cur_num_spec_tokens
         # Add placeholders for the new tokens in spec_token_ids.
-        # Wwe will update the actual spec token ids in the worker process.
+        # We will update the actual spec token ids in the worker process.
         request.spec_token_ids = [-1] * self.num_spec_tokens

         scheduler_output.pending_structured_output_tokens = (
