Enhance MLA Reduce for Prefill (#1562)

ruanjm · web-flow · commit c42c246b40bc · 2025-12-05T16:52:53.000+08:00
* Divide the workload of batches whose seqlen > 1 across multiple work groups if there are enough CUs.

* format

* Add simple pipeline

* Fix issues raised by Copilot

* Fix trivial issues and add nhead=1, hdim=128 cases.

* simplify template args

* add more group cases

* Remove unnecessary memory check.
diff --git a/aiter/mla.py b/aiter/mla.py
@@ -317,6 +317,7 @@ def mla_decode_fwd(
             reduce_indptr,
             reduce_final_map,
             reduce_partial_map,
+            max_seqlen_q,
             o,
             final_lse,
         )
diff --git a/aiter/ops/attention.py b/aiter/ops/attention.py
@@ -176,6 +176,7 @@ def pa_reduce_v1(
     reduce_indptr: torch.Tensor,
     reduce_final_map: Optional[torch.Tensor],
     reduce_partial_map: torch.Tensor,
+    max_seqlen_q: int,
     final_output: torch.Tensor,
     final_lse: Optional[torch.Tensor] = None,
 ) -> None:
@@ -185,6 +186,7 @@ def pa_reduce_v1(
         reduce_indptr,
         reduce_final_map,
         reduce_partial_map,
+        max_seqlen_q,
         final_output,
         final_lse,
     )
@@ -252,6 +254,7 @@ def pa_persistent_fwd(
         reduce_indptr,
         reduce_final_map,
         reduce_partial_map,
+        max_qlen,
         output,
         final_lse,
     )
@@ -757,6 +760,7 @@ def mla_reduce_v1(
     reduce_indptr: torch.Tensor,
     reduce_final_map: Optional[torch.Tensor],
     reduce_partial_map: torch.Tensor,
+    max_seqlen_q: int,
     final_output: torch.Tensor,
     final_lse: Optional[torch.Tensor] = None,
 ) -> None: ...
diff --git a/csrc/include/mla.h b/csrc/include/mla.h
@@ -68,6 +68,7 @@ void mla_reduce_v1(const torch::Tensor& partial_output,
                    const torch::Tensor& reduce_indptr,
                    const std::optional<torch::Tensor>& reduce_final_map,
                    const torch::Tensor& reduce_partial_map,
+                   const int max_seqlen_q,
                    torch::Tensor& final_output,
                    std::optional<torch::Tensor>& final_lse);
 
diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp
@@ -1406,51 +1406,51 @@ namespace py = pybind11;
           py::arg("stride0"),      \
           py::arg("stride1"));
 
-#define MLA_METADATA_PYBIND                                                       \
-    m.def("get_mla_metadata_v1",                                                  \
-          &get_mla_metadata_v1,                                                   \
-          "get_mla_metadata_v1",                                                  \
-          py::arg("seqlens_qo_indptr"),                                           \
-          py::arg("seqlens_kv_indptr"),                                           \
-          py::arg("num_heads_per_head_k"),                                        \
-          py::arg("num_heads_k"),                                                 \
-          py::arg("is_causal"),                                                   \
-          py::arg("work_metadata_ptrs"),                                          \
-          py::arg("work_info_set"),                                               \
-          py::arg("work_indptr"),                                                 \
-          py::arg("reduce_indptr"),                                               \
-          py::arg("reduce_final_map"),                                            \
-          py::arg("reduce_partial_map"),                                          \
-          py::arg("kv_granularity")      = 16,                                    \
-          py::arg("max_seqlen_qo")       = -1,                                    \
-          py::arg("uni_seqlen_qo")       = -1,                                    \
-          py::arg("fast_mode")           = true,                                  \
-          py::arg("topk")                = -1,                                    \
-          py::arg("max_split_per_batch") = -1,                                    \
-          py::arg("dtype_q")             = std::nullopt,                          \
-          py::arg("dtype_kv")            = std::nullopt);                         \
+#define MLA_METADATA_PYBIND                              \
+    m.def("get_mla_metadata_v1",                         \
+          &get_mla_metadata_v1,                          \
+          "get_mla_metadata_v1",                         \
+          py::arg("seqlens_qo_indptr"),                  \
+          py::arg("seqlens_kv_indptr"),                  \
+          py::arg("num_heads_per_head_k"),               \
+          py::arg("num_heads_k"),                        \
+          py::arg("is_causal"),                          \
+          py::arg("work_metadata_ptrs"),                 \
+          py::arg("work_info_set"),                      \
+          py::arg("work_indptr"),                        \
+          py::arg("reduce_indptr"),                      \
+          py::arg("reduce_final_map"),                   \
+          py::arg("reduce_partial_map"),                 \
+          py::arg("kv_granularity")      = 16,           \
+          py::arg("max_seqlen_qo")       = -1,           \
+          py::arg("uni_seqlen_qo")       = -1,           \
+          py::arg("fast_mode")           = true,         \
+          py::arg("topk")                = -1,           \
+          py::arg("max_split_per_batch") = -1,           \
+          py::arg("dtype_q")             = std::nullopt, \
+          py::arg("dtype_kv")            = std::nullopt);           \
     m.def("get_mla_metadata_v1_no_redundant", &get_mla_metadata_v1_no_redundant);
 
-#define PA_METADATA_PYBIND                                                        \
-    m.def("get_pa_metadata_v1",                                                   \
-          &get_pa_metadata_v1,                                                    \
-          "get_pa_metadata_v1",                                                   \
-          py::arg("seqlens_qo_indptr"),                                           \
-          py::arg("pages_kv_indptr"),                                             \
-          py::arg("num_heads_per_head_k"),                                        \
-          py::arg("num_heads_k"),                                                 \
-          py::arg("is_causal"),                                                   \
-          py::arg("work_metadata_ptrs"),                                          \
-          py::arg("work_indptr"),                                                 \
-          py::arg("work_info"),                                                   \
-          py::arg("reduce_indptr"),                                               \
-          py::arg("reduce_final_map"),                                            \
-          py::arg("reduce_partial_map"),                                          \
-          py::arg("kv_granularity")      = 16,                                    \
-          py::arg("max_seqlen_qo")       = -1,                                    \
-          py::arg("uni_seqlen_qo")       = -1,                                    \
-          py::arg("fast_mode")           = true,                                  \
-          py::arg("topk")                = -1,                                    \
+#define PA_METADATA_PYBIND                       \
+    m.def("get_pa_metadata_v1",                  \
+          &get_pa_metadata_v1,                   \
+          "get_pa_metadata_v1",                  \
+          py::arg("seqlens_qo_indptr"),          \
+          py::arg("pages_kv_indptr"),            \
+          py::arg("num_heads_per_head_k"),       \
+          py::arg("num_heads_k"),                \
+          py::arg("is_causal"),                  \
+          py::arg("work_metadata_ptrs"),         \
+          py::arg("work_indptr"),                \
+          py::arg("work_info"),                  \
+          py::arg("reduce_indptr"),              \
+          py::arg("reduce_final_map"),           \
+          py::arg("reduce_partial_map"),         \
+          py::arg("kv_granularity")      = 16,   \
+          py::arg("max_seqlen_qo")       = -1,   \
+          py::arg("uni_seqlen_qo")       = -1,   \
+          py::arg("fast_mode")           = true, \
+          py::arg("topk")                = -1,   \
           py::arg("max_split_per_batch") = -1);
 
 #define MLA_REDUCE_PYBIND                \
@@ -1462,13 +1462,14 @@ namespace py = pybind11;
           py::arg("reduce_indptr"),      \
           py::arg("reduce_final_map"),   \
           py::arg("reduce_partial_map"), \
+          py::arg("max_seqlen_q"),       \
           py::arg("final_output"),       \
           py::arg("final_lse") = std::nullopt);
 
-#define TOPK_PLAIN_PYBIND           \
-    m.def("topk_plain",             \
-          &topk_plain,              \
-          py::arg("values"),        \
-          py::arg("topk_ids"),      \
-          py::arg("topk"),          \
+#define TOPK_PLAIN_PYBIND      \
+    m.def("topk_plain",        \
+          &topk_plain,         \
+          py::arg("values"),   \
+          py::arg("topk_ids"), \
+          py::arg("topk"),     \
           py::arg("largest"));
diff --git a/csrc/kernels/mla/reduce.cu b/csrc/kernels/mla/reduce.cu

Original file line number	Diff line number	Diff line change
`@@ -317,6 +317,7 @@ def mla_decode_fwd(`
`317`	`317`	`reduce_indptr,`
`318`	`318`	`reduce_final_map,`
`319`	`319`	`reduce_partial_map,`
	`320`	`+ max_seqlen_q,`
`320`	`321`	`o,`
`321`	`322`	`final_lse,`
`322`	`323`	`)`