118 commits
eb34c8d
add support methods to abstract
MatthewBonanni Oct 8, 2025
87edf38
remove is_attn_backend_supported
MatthewBonanni Oct 8, 2025
fc493ae
all backends are V1 now
MatthewBonanni Oct 8, 2025
9618979
use backend_to_class_str
MatthewBonanni Oct 8, 2025
8aeb461
add MLA backend support details
MatthewBonanni Oct 8, 2025
eb8426f
use backend_to_class_str
MatthewBonanni Oct 8, 2025
aba576c
add support details for standard attention backends
MatthewBonanni Oct 8, 2025
ff18a9a
update cuda logic
MatthewBonanni Oct 8, 2025
eaed800
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 8, 2025
9687c99
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 8, 2025
df49484
fix pre-commit
MatthewBonanni Oct 8, 2025
ff5ad7c
fix argument mismatch
MatthewBonanni Oct 8, 2025
712ae59
fix pre-commit
MatthewBonanni Oct 8, 2025
97e1a2c
use block size literals
MatthewBonanni Oct 8, 2025
8f86714
replace backend_name_to_enum with direct calls
MatthewBonanni Oct 9, 2025
50596d8
use DeviceCapability objects
MatthewBonanni Oct 9, 2025
03f6963
update max
MatthewBonanni Oct 9, 2025
3bee84e
Fix block size adjustment
MatthewBonanni Oct 9, 2025
a716f3a
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 10, 2025
15234bb
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 13, 2025
2433669
split priorities by capability, update flashinfer min capability
MatthewBonanni Oct 14, 2025
a3617d7
change to typing imports
MatthewBonanni Oct 15, 2025
81d1b7b
backends specify their required kv cache layout
MatthewBonanni Oct 15, 2025
adaf53b
flashinfer supports up to 12.1
MatthewBonanni Oct 15, 2025
d1f1362
is_mla is false in base class
MatthewBonanni Oct 15, 2025
abb8375
triton supports fp8
MatthewBonanni Oct 15, 2025
85d8719
use CacheDType
MatthewBonanni Oct 15, 2025
1ef0417
add todo
MatthewBonanni Oct 15, 2025
a2c902f
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 15, 2025
16f9373
is_quantized_kv_cache use CacheDType
MatthewBonanni Oct 15, 2025
8474a14
fix supports_sink
MatthewBonanni Oct 15, 2025
62e6290
fix priority list
MatthewBonanni Oct 15, 2025
22dd1b8
fix FA block sizes
MatthewBonanni Oct 16, 2025
4bf076d
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 16, 2025
121d442
fix import failure
MatthewBonanni Oct 16, 2025
963cc9f
fix import error
MatthewBonanni Oct 16, 2025
778cd98
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 20, 2025
de3f302
fix import error
MatthewBonanni Oct 20, 2025
bc10bee
fix import
MatthewBonanni Oct 20, 2025
05aab3e
fix type error
MatthewBonanni Oct 21, 2025
7936c47
add flashmla support test
MatthewBonanni Oct 21, 2025
4f0f955
clean up head size validation
MatthewBonanni Oct 21, 2025
feded36
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 21, 2025
d8b8043
use KVCacheLayoutType
MatthewBonanni Oct 21, 2025
a3ccbba
move selector layout change to same place as block size change
MatthewBonanni Oct 21, 2025
3285c2c
MLA only supports head size 576
MatthewBonanni Oct 21, 2025
6eab504
fix kv_cache_dtype support logic
MatthewBonanni Oct 21, 2025
5523dac
fix test
MatthewBonanni Oct 21, 2025
58fc888
skip FA MLA if test is run on hardware where it's not supported
MatthewBonanni Oct 21, 2025
17fd954
fix test
MatthewBonanni Oct 21, 2025
2b23712
fix pre-commit
MatthewBonanni Oct 21, 2025
68a63b7
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 22, 2025
fc1d3f3
fix head size
MatthewBonanni Oct 22, 2025
ecdef49
fix pre-commit
MatthewBonanni Oct 22, 2025
9008e56
flashinfer_mla only support blackwell (only uses TRTLLM kernels)
MatthewBonanni Oct 22, 2025
b756ceb
compute capability checks
MatthewBonanni Oct 22, 2025
afccece
remove reference to backend_name_to_enum
MatthewBonanni Oct 22, 2025
33cb1ef
fix default block size
MatthewBonanni Oct 22, 2025
3f5439e
improve logs
MatthewBonanni Oct 23, 2025
75fce85
fix block size support
MatthewBonanni Oct 23, 2025
ba51339
fix getting priority list
MatthewBonanni Oct 23, 2025
d49fbf9
remove redundant block size methods
MatthewBonanni Oct 24, 2025
dd31329
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 24, 2025
b18a193
fix import
MatthewBonanni Oct 24, 2025
0e0cb6d
raise error instead of implicitly changing backend
MatthewBonanni Oct 24, 2025
1a7b366
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 27, 2025
f147663
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 27, 2025
1eefe90
don't ignore block size
MatthewBonanni Oct 27, 2025
97bee04
move block_size update back to check_and_update_config
MatthewBonanni Oct 27, 2025
0812fac
fix import
MatthewBonanni Oct 27, 2025
ec39247
address missing case
MatthewBonanni Oct 27, 2025
e6497dd
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 28, 2025
860bfdb
fix flashmla_sparse support
MatthewBonanni Oct 28, 2025
df1cd64
fix hybrid models
MatthewBonanni Oct 28, 2025
758b3a5
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 29, 2025
01b43ff
return only mla or non-mla priorities
MatthewBonanni Oct 29, 2025
ee894ea
cleanup
MatthewBonanni Oct 29, 2025
842e89b
skip test on hopper
MatthewBonanni Oct 29, 2025
bd190e7
temp: apply fixes for test
MatthewBonanni Oct 29, 2025
5bf94f6
Revert "skip test on hopper"
MatthewBonanni Oct 29, 2025
7e34939
revert to old check_and_update_config block_size logic
MatthewBonanni Oct 29, 2025
3b1e92f
Revert "temp: apply fixes for test"
MatthewBonanni Oct 29, 2025
54dffe2
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 30, 2025
db6cc0f
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 30, 2025
d34eb77
add test_attention_selector to Blackwell Tests
MatthewBonanni Oct 31, 2025
48290ee
rename _Backend to AttentionBackendEnum, add class methods
MatthewBonanni Oct 31, 2025
1c71eab
get rid of get_min_compute_capability and get_max_compute_capability
MatthewBonanni Oct 31, 2025
6e9d1f1
fix pre-commit
MatthewBonanni Oct 31, 2025
d3cdda7
change methods to properties
MatthewBonanni Oct 31, 2025
925069c
device_capability not None
MatthewBonanni Oct 31, 2025
a0b56c5
query device_capability inside get_required_kv_cache_layout
MatthewBonanni Oct 31, 2025
fff453a
Update vllm/attention/backends/abstract.py
MatthewBonanni Oct 31, 2025
95aae78
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 31, 2025
530f356
class_path always None in decorator
MatthewBonanni Oct 31, 2025
933ee5f
type hint for value
MatthewBonanni Oct 31, 2025
255edc9
restore comment
MatthewBonanni Oct 31, 2025
6af36aa
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 31, 2025
c9d62f8
fix docs
MatthewBonanni Oct 31, 2025
93a0770
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 31, 2025
f6a5a32
add FLASHMLA_SPARSE to priority list
MatthewBonanni Oct 31, 2025
bc91050
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Oct 31, 2025
0435eca
fix test
MatthewBonanni Nov 2, 2025
a098d82
fix flashmla_sparse
MatthewBonanni Nov 2, 2025
d8215e0
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Nov 2, 2025
4452f5f
fix pre-commit
MatthewBonanni Nov 2, 2025
5d9627c
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Nov 3, 2025
bc2ba2d
fix pre-commit
MatthewBonanni Nov 3, 2025
367d673
mock SM90 for initialization test in CI
MatthewBonanni Nov 3, 2025
cd24063
mock supports_compute_capability instead of get_device_capability
MatthewBonanni Nov 3, 2025
da3a9cd
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Nov 3, 2025
4d1102a
skip test
MatthewBonanni Nov 4, 2025
09058fc
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Nov 4, 2025
15a19d3
fix path
MatthewBonanni Nov 4, 2025
6515e4c
_get_backend_priorities return sorted list instead of dict
MatthewBonanni Nov 4, 2025
0b841ff
fix merge miss
MatthewBonanni Nov 4, 2025
69bb887
remove get_default_block_size
MatthewBonanni Nov 4, 2025
2f956f9
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Nov 4, 2025
556fc3d
Merge branch 'main' into backend_selection_refactor
MatthewBonanni Nov 4, 2025
1 change: 1 addition & 0 deletions tests/compile/test_fusion_attn.py
@@ -314,6 +314,7 @@ def test_attention_quant_pattern(
custom_ops_list = custom_ops.split(",") if custom_ops else []

device = torch.device("cuda:0")
torch.set_default_dtype(dtype)
torch.manual_seed(42)

vllm_config = VllmConfig(
75 changes: 45 additions & 30 deletions tests/kernels/attention/test_attention_selector.py
@@ -127,12 +127,13 @@ def test_env(

elif device == "cuda":
with patch("vllm.platforms.current_platform", CudaPlatform()):
capability = torch.cuda.get_device_capability()
if use_mla:
# CUDA MLA backend logic:
# - CUTLASS_MLA: only supported with block_size == 128
# and Blackwell GPUs (SM 10.0), V1 only
# and Blackwell GPUs (SM 10.x), V1 only
# - FLASHINFER_MLA: only supported on Blackwell GPUs
# (SM 10.0+), V1 only
# (SM 10.x), V1 only
# - FLASHMLA: only supported with block_size == 64
# - FLASH_ATTN_MLA: V1 only
# - TRITON_MLA: fallback for other cases
@@ -141,58 +142,72 @@
if block_size != 128:
# CUTLASS_MLA only supports block_size == 128
pytest.skip("CUTLASS_MLA only supports block_size 128")
else:
backend = get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
)
expected = "CUTLASS_MLA"
assert backend.get_name() == expected
if capability[0] != 10:
pytest.skip("CUTLASS MLA is not supported on this platform")
backend = get_attn_backend(
576, torch.float16, None, block_size, use_mla=use_mla
)
expected = "CUTLASS_MLA"
assert backend.get_name() == expected
elif name == "FLASHINFER_MLA":
if capability[0] != 10:
pytest.skip(
"FlashInfer MLA is not supported on this platform"
)
if block_size not in [32, 64]:
# FlashInfer MLA only supports block_size 32 or 64
pytest.skip(
"FlashInfer MLA only supports block_size 32 or 64"
)
else:
backend = get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
)
expected = "FLASHINFER_MLA"
assert backend.get_name() == expected
backend = get_attn_backend(
576, torch.float16, None, block_size, use_mla=use_mla
)
expected = "FLASHINFER_MLA"
assert backend.get_name() == expected
elif name == "FLASHMLA":
if block_size != 64:
# FlashMLA only supports block_size == 64
pytest.skip("FlashMLA only supports block_size 64")
else:
from vllm.v1.attention.backends.mla.flashmla import (
is_flashmla_dense_supported,
)
from vllm.v1.attention.backends.mla.flashmla import (
is_flashmla_dense_supported,
)

is_supported, _ = is_flashmla_dense_supported()
if not is_supported:
pytest.skip("FlashMLA not supported on this platform")
else:
backend = get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
)
expected = name
assert backend.get_name() == expected
is_supported, _ = is_flashmla_dense_supported()
if not is_supported:
pytest.skip("FlashMLA not supported on this platform")
backend = get_attn_backend(
576,
torch.float16,
None,
block_size,
use_mla=use_mla,
)
expected = name
assert backend.get_name() == expected
elif name == "FLASH_ATTN_MLA":
from vllm.attention.utils.fa_utils import (
flash_attn_supports_mla,
)

if not flash_attn_supports_mla():
pytest.skip(
"FlashAttention MLA not supported on this platform"
)
backend = get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
576, torch.float16, None, block_size, use_mla=use_mla
)
expected = "FLASH_ATTN_MLA"
assert backend.get_name() == expected
else:
# TRITON_MLA or other fallback
backend = get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
576, torch.float16, None, block_size, use_mla=use_mla
)
expected = "TRITON_MLA"
assert backend.get_name() == expected
elif name == "FLASHINFER":
backend = get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
64, torch.float16, None, block_size, use_mla=use_mla
)
expected = "FLASHINFER"
assert backend.get_name() == expected
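Note (reviewer sketch, not part of the diff): the MLA branches above now pass head size 576 to get_attn_backend because the MLA backends only accept the combined latent-plus-rope head size (512 + 64 = 576), while the non-MLA FLASHINFER case is queried with a regular head size. A minimal standalone call, assuming the same selector import this test uses:

    import torch
    from vllm.attention.selector import get_attn_backend

    # 576 = 512 latent dims + 64 rope dims; MLA backends reject other head sizes.
    backend = get_attn_backend(576, torch.float16, None, 64, use_mla=True)
    print(backend.get_name())  # e.g. "FLASHMLA" with block_size 64 on Hopper
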
21 changes: 14 additions & 7 deletions tests/v1/worker/test_gpu_model_runner.py
@@ -425,13 +425,20 @@ def test_kv_cache_stride_order(monkeypatch, model_runner):
# This test checks if GPUModelRunner initializes correctly when an attention
# backend enforces a non-default KV cache stride order.
n_heads = model_runner.model_config.get_num_kv_heads(model_runner.parallel_config)
expected_kv_cache_shape = [
2,
NUM_BLOCKS,
BLOCK_SIZE,
n_heads,
model_runner.model_config.get_head_size(),
]
head_size = model_runner.model_config.get_head_size()

# Get the expected shape from the backend's get_kv_cache_shape method
# to ensure compatibility with different backends (triton vs flexattention)
attn_backend = None
for attn_group in model_runner._attn_group_iterator():
attn_backend = attn_group.backend
break

assert attn_backend is not None, "No attention backend found"
expected_kv_cache_shape = list(
attn_backend.get_kv_cache_shape(NUM_BLOCKS, BLOCK_SIZE, n_heads, head_size)
)

# TODO mla test
default_stride = tuple(range(5))
# Permutation that gets you back to expected kv shape
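Editor's sketch (hypothetical subclass, not in the diff): the test above now derives the expected shape from the backend because a backend may override get_kv_cache_stride_order() to request a permuted physical layout while get_kv_cache_shape() still describes the logical shape the runner allocates. Roughly:

    from vllm.attention.backends.abstract import AttentionBackend

    class PermutedLayoutBackend(AttentionBackend):  # illustrative only
        @staticmethod
        def get_kv_cache_stride_order() -> tuple[int, ...]:
            # Physically store num_kv_heads before block_size, i.e. permute the
            # logical (2, num_blocks, block_size, num_kv_heads, head_size) cache.
            return (0, 1, 3, 2, 4)
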
172 changes: 169 additions & 3 deletions vllm/attention/backends/abstract.py
@@ -2,13 +2,18 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from abc import ABC, abstractmethod
from typing import Generic, Protocol, TypeVar
from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast

import torch

from vllm.model_executor.layers.linear import ColumnParallelLinear
from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey

if TYPE_CHECKING:
from vllm.config.cache import BlockSize, CacheDType
from vllm.platforms.interface import DeviceCapability
from vllm.v1.attention.backends.utils import KVCacheLayoutType


class AttentionType:
"""
@@ -88,6 +93,167 @@ def get_kv_cache_stride_order() -> tuple[int, ...]:
def full_cls_name(cls) -> tuple[str, str]:
return (cls.__module__, cls.__qualname__)

@classmethod
def get_supported_head_sizes(cls) -> list[int]:
return []

@classmethod
def supports_head_size(cls, head_size: int) -> bool:
supported_head_sizes = cls.get_supported_head_sizes()
return (not supported_head_sizes) or head_size in supported_head_sizes

@classmethod
def get_supported_dtypes(cls) -> list[torch.dtype]:
return [torch.float16, torch.bfloat16]

@classmethod
def supports_dtype(cls, dtype: torch.dtype) -> bool:
supported_dtypes = cls.get_supported_dtypes()
return (not supported_dtypes) or dtype in supported_dtypes

@classmethod
def get_supported_kv_cache_dtypes(cls) -> list["CacheDType"]:
return ["auto"]

@classmethod
def supports_kv_cache_dtype(cls, kv_cache_dtype: "CacheDType | None") -> bool:
if kv_cache_dtype is None:
return True
supported_kv_cache_dtypes = cls.get_supported_kv_cache_dtypes()
return (not supported_kv_cache_dtypes) or (
kv_cache_dtype in supported_kv_cache_dtypes
)

@classmethod
def get_supported_block_sizes(cls) -> list["BlockSize"]:
return []

@classmethod
def supports_block_size(cls, block_size: "BlockSize | None") -> bool:
from vllm.config.cache import BlockSize

if block_size is None:
return True
try:
block_size_literal = cast(BlockSize, block_size)
except ValueError:
return False
supported_block_sizes = cls.get_supported_block_sizes()
return (
not supported_block_sizes
) or block_size_literal in supported_block_sizes

@classmethod
def get_default_block_size(cls) -> "BlockSize":
supported_block_sizes = cls.get_supported_block_sizes()
if not supported_block_sizes:
raise ValueError(
f"Fallback failed, no explicitly supported block sizes for "
f"backend {cls.get_name()}"
)
return supported_block_sizes[0]

@classmethod
def is_mla(cls) -> bool:
return False

@classmethod
def supports_sink(cls) -> bool:
return False

@classmethod
def is_sparse(cls) -> bool:
return False

@classmethod
def get_min_compute_capability(cls) -> "DeviceCapability | None":
return None

@classmethod
def get_max_compute_capability(cls) -> "DeviceCapability | None":
return None

@classmethod
def supports_compute_capability(cls, capability: "DeviceCapability") -> bool:
min_capability = cls.get_min_compute_capability()
max_capability = cls.get_max_compute_capability()
return ((min_capability is None) or (capability >= min_capability)) and (
(max_capability is None) or (capability <= max_capability)
)

@classmethod
def supports_combination(
cls,
head_size: int,
dtype: torch.dtype,
kv_cache_dtype: "CacheDType | None",
block_size: int | None,
use_mla: bool,
has_sink: bool,
use_sparse: bool,
device_capability: "DeviceCapability",
) -> str | None:
return None

@classmethod
def validate_configuration(
cls,
head_size: int,
dtype: torch.dtype,
kv_cache_dtype: "CacheDType | None",
block_size: int | None,
use_mla: bool,
has_sink: bool,
use_sparse: bool,
device_capability: "DeviceCapability",
) -> list[str]:
invalid_reasons = []
if not cls.supports_head_size(head_size):
invalid_reasons.append("head_size not supported")
if not cls.supports_dtype(dtype):
invalid_reasons.append("dtype not supported")
if not cls.supports_kv_cache_dtype(kv_cache_dtype):
invalid_reasons.append("kv_cache_dtype not supported")
if not cls.supports_block_size(block_size):
invalid_reasons.append("block_size not supported")
if use_mla != cls.is_mla():
if use_mla:
invalid_reasons.append("MLA not supported")
else:
invalid_reasons.append("non-MLA not supported")
if has_sink and not cls.supports_sink():
invalid_reasons.append("sink setting not supported")
if use_sparse != cls.is_sparse():
if use_sparse:
invalid_reasons.append("sparse not supported")
else:
invalid_reasons.append("non-sparse not supported")
if not cls.supports_compute_capability(device_capability):
invalid_reasons.append("compute capability not supported")
combination_reason = cls.supports_combination(
head_size,
dtype,
kv_cache_dtype,
block_size,
use_mla,
has_sink,
use_sparse,
device_capability,
)
if combination_reason is not None:
invalid_reasons.append(combination_reason)
return invalid_reasons

@classmethod
def get_required_kv_cache_layout(
cls, capability: "DeviceCapability"
) -> "KVCacheLayoutType | None":
"""
Some backends require a specific kv cache layout.
This function returns the required layout if any.
"""
return None


class AttentionMetadata:
pass
@@ -160,8 +326,8 @@ def __init__(
) -> None:
raise NotImplementedError

@staticmethod
def get_supported_kernel_block_size() -> list[int | MultipleOf]:
@classmethod
def get_supported_kernel_block_size(cls) -> list[int | MultipleOf]:
# TODO: implement this function for all backends.
return [MultipleOf(1)]

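Reviewer sketch (hypothetical backend and values, not part of the diff): with the new classmethods, a concrete backend declares its support matrix declaratively, and the selector can collect every reason a configuration is rejected via validate_configuration(); get_required_kv_cache_layout() is consulted the same way when a backend needs a specific KV cache layout. Assuming the import paths referenced in this file:

    import torch
    from vllm.attention.backends.abstract import AttentionBackend
    from vllm.platforms.interface import DeviceCapability

    class FakeFlashBackend(AttentionBackend):  # hypothetical, for illustration
        @staticmethod
        def get_name() -> str:
            return "FAKE_FLASH"

        @classmethod
        def get_supported_head_sizes(cls) -> list[int]:
            return [64, 128, 256]

        @classmethod
        def get_min_compute_capability(cls) -> DeviceCapability | None:
            return DeviceCapability(8, 0)  # Ampere or newer

        @classmethod
        def supports_sink(cls) -> bool:
            return True

    # An empty list means the combination is supported; otherwise each entry
    # names one rejected constraint (head size, dtype, block size, capability, ...).
    reasons = FakeFlashBackend.validate_configuration(
        head_size=128,
        dtype=torch.bfloat16,
        kv_cache_dtype=None,
        block_size=16,
        use_mla=False,
        has_sink=True,
        use_sparse=False,
        device_capability=DeviceCapability(9, 0),
    )
    assert reasons == []
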
19 changes: 5 additions & 14 deletions vllm/attention/backends/registry.py
@@ -3,9 +3,13 @@
"""Attention backend registry"""

import enum
from typing import TYPE_CHECKING

from vllm.utils.import_utils import resolve_obj_by_qualname

if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionBackend


class _Backend(enum.Enum):
FLASH_ATTN = enum.auto()
@@ -84,7 +88,7 @@ def backend_to_class_str(backend: _Backend) -> str:
return BACKEND_MAP[backend]


def backend_to_class(backend: _Backend) -> type:
def backend_to_class(backend: _Backend) -> "type[AttentionBackend]":
"""Get the backend class.

Args:
@@ -95,16 +99,3 @@ def backend_to_class(backend: _Backend) -> type:
"""
backend_class_name = backend_to_class_str(backend)
return resolve_obj_by_qualname(backend_class_name)


def backend_name_to_enum(backend_name: str) -> _Backend | None:
"""
Convert a string backend name to a _Backend enum value.

Returns:
_Backend: enum value if backend_name is a valid in-tree type
None: otherwise it's an invalid in-tree type or an out-of-tree platform
is loaded.
"""
assert backend_name is not None
return _Backend[backend_name] if backend_name in _Backend.__members__ else None
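
Editor's sketch (not part of the diff): with backend_name_to_enum removed, callers index the enum directly and resolve the class through backend_to_class, which now carries a type[AttentionBackend] hint. Assuming the registry module path shown above:

    from vllm.attention.backends.registry import _Backend, backend_to_class

    name = "FLASH_ATTN"  # example member; unknown names simply miss the lookup
    backend_enum = _Backend[name] if name in _Backend.__members__ else None
    if backend_enum is not None:
        backend_cls = backend_to_class(backend_enum)  # type[AttentionBackend]
        print(backend_cls.get_name())

A later commit in this PR (48290ee) renames _Backend to AttentionBackendEnum, so the enum name here follows the diff as shown.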