@@ -478,10 +478,11 @@ steps:
478478 - vllm/
479479 - tests/compile
480480 commands :
481+ # fp8 kv scales not supported on sm89, tested on Blackwell instead
481482 - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
482483 # Limit to no custom ops to reduce running time
483484 # Wrap with quotes to escape yaml and avoid starting -k string with a -
484- - " pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and - quant_fp8'"
485+ - " pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
485486
486487- label : Cudagraph test
487488 timeout_in_minutes : 20
@@ -925,7 +926,7 @@ steps:
925926 - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
926927 - pytest -v -s tests/kernels/moe/test_flashinfer.py
927928
928- - label : Blackwell Fusion Tests # 30 min
929+ - label : Blackwell Fusion and Compile Tests # 30 min
929930 timeout_in_minutes : 40
930931 working_dir : " /vllm-workspace/"
931932 gpu : b200
@@ -946,7 +947,9 @@ steps:
946947 - pytest -v -s tests/compile/test_fusion_all_reduce.py
947948 # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
948949 # Wrap with quotes to escape yaml
949- - " pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
950+ - " pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
951+ # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
952+ - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
950953
951954- label : Blackwell Fusion E2E Tests # 30 min
952955 timeout_in_minutes : 40
@@ -969,8 +972,6 @@ steps:
969972 - nvidia-smi
970973 # Run all e2e fusion tests
971974 - pytest -v -s tests/compile/test_fusions_e2e.py
972- # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
973- - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
974975
975976- label : Blackwell GPT-OSS Eval
976977 timeout_in_minutes : 60
@@ -1266,7 +1267,8 @@ steps:
12661267 - pytest -v -s tests/compile/test_async_tp.py
12671268 - pytest -v -s tests/compile/test_sequence_parallelism.py
12681269 - pytest -v -s tests/compile/test_fusion_all_reduce.py
1269- - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
1270+ - " pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
1271+ - pytest -v -s tests/distributed/test_sequence_parallel.py
12701272 - pytest -v -s tests/distributed/test_context_parallel.py
12711273 - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
12721274 - pytest -v -s tests/v1/distributed/test_dbo.py