16 changes: 2 additions & 14 deletions tests/quantization/test_blackwell_moe.py
@@ -172,21 +172,9 @@ def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch
     can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
 
 
-def test_gptoss_dp2_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
-    monkeypatch.setenv("VLLM_ALL2ALL_BACKEND", "deepep_high_throughput")
-    can_initialize(
-        "openai/gpt-oss-20b",
-        extra_args=["--data-parallel-size", "2", "--enable-expert-parallel"],
-        hf_overrides=HF_OVERRIDE_TEXT,
-    )
-
-
-def test_gptoss_dp2_mxfp4bf16_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
-    monkeypatch.setenv("VLLM_ALL2ALL_BACKEND", "deepep_high_throughput")
+def test_gptoss_eager(monkeypatch: pytest.MonkeyPatch):
     can_initialize(
         "openai/gpt-oss-20b",
-        extra_args=["--data-parallel-size", "2", "--enable-expert-parallel"],
         hf_overrides=HF_OVERRIDE_TEXT,
+        extra_args=["--enforce-eager"],
     )
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/fused_moe/trtllm_moe.py
@@ -127,7 +127,7 @@ def apply(
     "routing_method_type": 1,
     "do_finalize": True,
     "output": output,
-    "tune_max_num_tokens": self.max_capture_size,
+    "tune_max_num_tokens": max(self.max_capture_size, 1),
Contributor Author:
Why were we setting this to self.max_capture_size? Shouldn't we set this to max_num_batched_tokens at least?
Just curious. cc @pavanimajety @nvpohanh
Member:
Ohh, I see, very interesting. Yes, I have the same question: why not use the max batch size, since we will want to autotune not only for cudagraphs but for prefill as well.
Contributor:
@nvjullin Could you review this PR and comment on this? Thanks!
Contributor:
It comes from PR #23608. After a quick look at flashinfer, I believe this parameter is needed because autotuning on a dummy input won't exercise the maximum number of tokens at each EP rank. I agree that max_num_batched_tokens makes more sense (a sketch of this alternative follows the hunk below).
 }
 
 from flashinfer import trtllm_fp4_block_scale_routed_moe
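For context on the clamp and the alternative raised in the inline discussion above: under --enforce-eager no CUDA graphs are captured, so a capture-size-derived value is presumably 0, which would hand the FlashInfer autotuner an empty token budget. Below is a minimal sketch of the two options, under stated assumptions: `pick_tune_max_num_tokens` is a hypothetical helper (not part of this module), and the batch budget is assumed to come from something like the scheduler's max_num_batched_tokens.

```python
# Hypothetical helper sketching the review discussion; not the PR's actual code.
def pick_tune_max_num_tokens(
    max_capture_size: int,
    max_num_batched_tokens: int | None = None,
) -> int:
    """Choose the token count handed to FlashInfer autotuning.

    - Current PR behaviour: clamp the cudagraph capture size to >= 1 so that
      eager mode (where the capture size is assumed to be 0) still autotunes
      on a non-empty dummy batch.
    - Alternative raised in review: tune for the scheduler's full token
      budget, which would also cover prefill-sized batches.
    """
    if max_num_batched_tokens is not None:
        return max(max_num_batched_tokens, 1)
    return max(max_capture_size, 1)


# Eager mode with no cudagraph captures: clamp keeps the budget positive.
assert pick_tune_max_num_tokens(0) == 1
# Tuning against an assumed scheduler budget of 8192 batched tokens.
assert pick_tune_max_num_tokens(0, max_num_batched_tokens=8192) == 8192
```

Either way, the clamp to 1 is what appears to let the new test_gptoss_eager path initialize without a zero-token tuning run.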
4 changes: 2 additions & 2 deletions vllm/model_executor/layers/quantization/mxfp4.py
@@ -1047,7 +1047,7 @@ def apply(
     None,
     1 if renormalize else 0,  # routing_method_type, renormalize
     True,  # do finalize
-    tune_max_num_tokens=self.max_capture_size,
+    tune_max_num_tokens=max(self.max_capture_size, 1),
 )[0]
 return trtllm_gen_output
 elif (
@@ -1122,7 +1122,7 @@ def apply(
     tp_rank=self.moe.tp_rank,
     ep_size=self.moe.ep_size,
     ep_rank=self.moe.ep_rank,
-    tune_max_num_tokens=self.max_capture_size,
+    tune_max_num_tokens=max(self.max_capture_size, 1),
     **extra_kwargs,
 )
 
27 changes: 1 addition & 26 deletions vllm/model_executor/warmup/kernel_warmup.py
@@ -11,7 +11,6 @@
 import torch
 
 import vllm.envs as envs
-from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup
 from vllm.platforms import current_platform
@@ -25,26 +24,6 @@
 logger = init_logger(__name__)
 
 
-def flashinfer_autotune_supported(vllm_config: VllmConfig) -> bool:
-    """
-    Record known issues with vllm + flashinfer autotune here. Return True if
-    and only if flashinfer autotune will run through without issues.
-    """
-    is_tp_or_dp = (vllm_config.parallel_config.data_parallel_size > 1) or (
-        vllm_config.parallel_config.tensor_parallel_size > 1
-    )
-    is_fi_mxfp4_backend = (
-        envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
-        or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
-        or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS
-    ) or (
-        current_platform.is_cuda() and current_platform.is_device_capability(100)
-    )  # on >=sm100, default mxfp4 backend is flashinfer
-    is_eager = vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
-
-    return not (is_tp_or_dp and is_fi_mxfp4_backend and is_eager)
-
-
 def kernel_warmup(worker: "Worker"):
     # Deep GEMM warmup
     do_deep_gemm_warmup = (
@@ -58,11 +37,7 @@ def kernel_warmup(worker: "Worker"):
         deep_gemm_warmup(model, max_tokens)
 
     # FlashInfer autotune for Hopper (SM 9.0) and Blackwell (SM 10.0) GPUs
-    if (
-        has_flashinfer()
-        and current_platform.has_device_capability(90)
-        and flashinfer_autotune_supported(worker.vllm_config)
-    ):
+    if has_flashinfer() and current_platform.has_device_capability(90):
         flashinfer_autotune(worker.model_runner)
 
     # FlashInfer attention warmup