diff --git a/tests/quantization/test_blackwell_moe.py b/tests/quantization/test_blackwell_moe.py
index 3cae6f46147b..8dd4551ff4b9 100644
--- a/tests/quantization/test_blackwell_moe.py
+++ b/tests/quantization/test_blackwell_moe.py
@@ -172,21 +172,9 @@ def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch
     can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)


-def test_gptoss_dp2_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
-    monkeypatch.setenv("VLLM_ALL2ALL_BACKEND", "deepep_high_throughput")
-    can_initialize(
-        "openai/gpt-oss-20b",
-        extra_args=["--data-parallel-size", "2", "--enable-expert-parallel"],
-        hf_overrides=HF_OVERRIDE_TEXT,
-    )
-
-
-def test_gptoss_dp2_mxfp4bf16_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
-    monkeypatch.setenv("VLLM_ALL2ALL_BACKEND", "deepep_high_throughput")
+def test_gptoss_eager(monkeypatch: pytest.MonkeyPatch):
     can_initialize(
         "openai/gpt-oss-20b",
-        extra_args=["--data-parallel-size", "2", "--enable-expert-parallel"],
         hf_overrides=HF_OVERRIDE_TEXT,
+        extra_args=["--enforce-eager"],
     )
diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
index e305483eb17d..132d35e65aba 100644
--- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
@@ -127,10 +127,17 @@ def apply(
             "routing_method_type": 1,
             "do_finalize": True,
             "output": output,
-            "tune_max_num_tokens": self.max_capture_size,
+            "tune_max_num_tokens": max(self.max_capture_size, 1),
         }

         from flashinfer import trtllm_fp4_block_scale_routed_moe

-        trtllm_fp4_block_scale_routed_moe(**kwargs)
+        from vllm.utils.flashinfer import autotune
+
+        with autotune(False):
+            # Enable autotune when
+            # https://github.com/flashinfer-ai/flashinfer/issues/2023 is
+            # resolved.
+            trtllm_fp4_block_scale_routed_moe(**kwargs)
+
         return output
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 597ee1b6bafe..bf34ec0f3899 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -1047,7 +1047,7 @@ def apply(
                 None,
                 1 if renormalize else 0,  # routing_method_type, renormalize
                 True,  # do finalize
-                tune_max_num_tokens=self.max_capture_size,
+                tune_max_num_tokens=max(self.max_capture_size, 1),
             )[0]
             return trtllm_gen_output
         elif (
@@ -1122,7 +1122,7 @@ def apply(
                 tp_rank=self.moe.tp_rank,
                 ep_size=self.moe.ep_size,
                 ep_rank=self.moe.ep_rank,
-                tune_max_num_tokens=self.max_capture_size,
+                tune_max_num_tokens=max(self.max_capture_size, 1),
                 **extra_kwargs,
             )

diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py
index ffa3bc8f021e..28792338f036 100644
--- a/vllm/model_executor/warmup/kernel_warmup.py
+++ b/vllm/model_executor/warmup/kernel_warmup.py
@@ -11,7 +11,6 @@
 import torch

 import vllm.envs as envs
-from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup
 from vllm.platforms import current_platform
@@ -25,26 +24,6 @@
 logger = init_logger(__name__)


-def flashinfer_autotune_supported(vllm_config: VllmConfig) -> bool:
-    """
-    Record known issues with vllm + flashinfer autotune here. Return True if
-    and only if flashinfer autotune will run through without issues.
-    """
-    is_tp_or_dp = (vllm_config.parallel_config.data_parallel_size > 1) or (
-        vllm_config.parallel_config.tensor_parallel_size > 1
-    )
-    is_fi_mxfp4_backend = (
-        envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
-        or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
-        or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS
-    ) or (
-        current_platform.is_cuda() and current_platform.is_device_capability(100)
-    )  # on >=sm100, default mxfp4 backend is flashinfer
-    is_eager = vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
-
-    return not (is_tp_or_dp and is_fi_mxfp4_backend and is_eager)
-
-
 def kernel_warmup(worker: "Worker"):
     # Deep GEMM warmup
     do_deep_gemm_warmup = (
@@ -58,11 +37,7 @@ def kernel_warmup(worker: "Worker"):
         deep_gemm_warmup(model, max_tokens)

     # FlashInfer autotune for Hopper (SM 9.0) and Blackwell (SM 10.0) GPUs
-    if (
-        has_flashinfer()
-        and current_platform.has_device_capability(90)
-        and flashinfer_autotune_supported(worker.vllm_config)
-    ):
+    if has_flashinfer() and current_platform.has_device_capability(90):
         flashinfer_autotune(worker.model_runner)

     # FlashInfer attention warmup
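For context on the two recurring changes above (clamping `tune_max_num_tokens` and wrapping the kernel call in `autotune(False)`), here is a minimal, self-contained sketch of the pattern. It assumes that an `--enforce-eager` run reports a `max_capture_size` of 0, which is what the `max(..., 1)` clamp appears to guard against, and it uses a toy `autotune` context manager plus a hypothetical `fake_routed_moe` stand-in rather than the real `vllm.utils.flashinfer.autotune` helper or the FlashInfer kernel.

```python
import contextlib

# Illustrative module-level flag; the real helper in vllm.utils.flashinfer
# toggles FlashInfer's tuner rather than a plain boolean.
_autotune_enabled = True


@contextlib.contextmanager
def autotune(enable: bool = True):
    """Enable or disable tuning for calls made inside the `with` block."""
    global _autotune_enabled
    previous, _autotune_enabled = _autotune_enabled, enable
    try:
        yield
    finally:
        _autotune_enabled = previous


def fake_routed_moe(*, tune_max_num_tokens: int, **_kwargs) -> str:
    """Hypothetical stand-in for the MoE kernel; only checks the tuning budget."""
    # A tuner benchmarks candidate configs for batches up to this size,
    # so a budget of 0 tokens is degenerate, hence the max(..., 1) clamp.
    assert tune_max_num_tokens >= 1, "tuning budget must cover at least 1 token"
    return "tuned" if _autotune_enabled else "default config"


max_capture_size = 0  # assumed value under --enforce-eager (no CUDA graphs captured)
kwargs = {"tune_max_num_tokens": max(max_capture_size, 1)}

with autotune(False):  # keep the default config until tuning is re-enabled
    print(fake_routed_moe(**kwargs))  # -> "default config"
```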