enable xqa fp8 output (#2081)

qsang-nv · web-flow · commit 53a6da4788ff · 2025-11-12T06:25:18.000-08:00
diff --git a/csrc/flashinfer_xqa_binding.cu b/csrc/flashinfer_xqa_binding.cu
@@ -27,10 +27,7 @@ TVM_FFI_DLL_EXPORT_TYPED_FUNC(xqa_wrapper_mla, xqa_wrapper_mla);
 #else
 
 void xqa_wrapper(bool run_sm90_fp8_mha, int64_t multiProcessorCount, int64_t nbKHeads,
-                 int64_t slidingWinSize, double qScale, TensorView output,
-#if LOW_PREC_OUTPUT
-                 TensorView rcpOutScale,
-#endif
+                 int64_t slidingWinSize, double qScale, TensorView output, double rcpOutScale,
                  TensorView q, tvm::ffi::Optional<TensorView> attentionSinks, TensorView kCacheVLLM,
                  TensorView vCacheVLLM, TensorView kvCachePageList, int64_t maxSeqLen,
                  TensorView seqLen, int64_t batchSize, double kvCacheScale,
diff --git a/csrc/xqa/mha.cu b/csrc/xqa/mha.cu
@@ -1281,7 +1281,7 @@ CUBIN_EXPORT __global__
         float qScale,
         OutputHead* __restrict__ const output,  // [nbReq][beamWidth][nbQHeads]
 #if LOW_PREC_OUTPUT
-        float const* rcpOutScale,
+        float rcpOutScale,
 #endif
         // NOTE: the input is actually Q buffer when integrated to TRT-LLM.
         IOHead const* __restrict__ const q,  // [nbReq][beamWidth][nbQHeads],
@@ -2165,7 +2165,7 @@ CUBIN_EXPORT __global__
       }
       ThrdRegRowMax const rcpRowSum = __frcp_rn(globalRowSum);
 #if LOW_PREC_OUTPUT
-      voScale *= rcpOutScale[0];
+      voScale *= rcpOutScale;
 #endif
       rescaleAcc(warp, acc, fullRescaleMask, rcpRowSum * ThrdRegRowMax::filled(voScale));
     }
@@ -2396,7 +2396,7 @@ CUBIN_EXPORT __global__ __launch_bounds__(256, nbCtaPerSM) void kernel_mha(
     float qScale,
     OutputHead* __restrict__ const output,  // [nbReq][beamWidth][nbQHeads]
 #if LOW_PREC_OUTPUT
-    float const* rcpOutScale,
+    float rcpOutScale,
 #endif
     IOHead const* __restrict__ const q,  // [nbReq][beamWidth][nbQHeads],
 #if SPEC_DEC
@@ -2447,7 +2447,7 @@ void launchMHA(
 #endif
     float qScale, OutputHead* output,
 #if LOW_PREC_OUTPUT
-    float const* rcpOutScale,
+    float rcpOutScale,
 #endif
 #if USE_INPUT_KV
     InputHead const* qkv,
@@ -2563,7 +2563,7 @@ static uint32_t const hostSmemSize = configureKernel();
 void launchMHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads, uint32_t slidingWinSize,
                          float qScale, OutputHead* output,
 #if LOW_PREC_OUTPUT
-                         float const* rcpOutScale,
+                         float rcpOutScale,
 #endif
                          InputHead const* q, float const* attentionSinks, GMemCacheHead* kCacheVLLM,
                          GMemCacheHead* vCacheVLLM, KVCachePageIndex const* kvCachePageList,
diff --git a/csrc/xqa/mha.h b/csrc/xqa/mha.h
@@ -95,7 +95,7 @@ void launchMHA(
 #endif
     float qScale, OutputHead* output,
 #if LOW_PREC_OUTPUT
-    float const* rcpOutScale,
+    float rcpOutScale,
 #endif
 #if USE_INPUT_KV
     InputHead const* qkv,
@@ -125,7 +125,7 @@ void launchMHA(
 void launchMHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads, uint32_t slidingWinSize,
                          float qScale, OutputHead* output,
 #if LOW_PREC_OUTPUT
-                         float const* rcpOutScale,
+                         float rcpOutScale,
 #endif
                          InputHead const* q, float const* attentionSinks, GMemCacheHead* kCacheVLLM,
                          GMemCacheHead* vCacheVLLM, KVCachePageIndex const* kvCachePageList,
@@ -145,7 +145,7 @@ void launchHopperF8MHA(
 #endif
     float qScale, OutputHead* output,
 #if LOW_PREC_OUTPUT
-    float const* rcpOutScale,
+    float rcpOutScale,
 #endif
 #if USE_INPUT_KV
     InputHead const* qkv,
@@ -174,7 +174,7 @@ void launchHopperF8MHA(
 void launchHopperF8MHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads,
                                  uint32_t slidingWinSize, float qScale, OutputHead* output,
 #if LOW_PREC_OUTPUT
-                                 float const* rcpOutScale,
+                                 float rcpOutScale,
 #endif
                                  InputHead const* q, float const* attentionSinks,
                                  GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
diff --git a/csrc/xqa/mha_sm90.cu b/csrc/xqa/mha_sm90.cu
@@ -610,7 +610,7 @@ __launch_bounds__(128 * 3)
         float const qScale,
         OutputHead* __restrict__ const output,  // [nbReq][beamWidth][nbQHeads]
 #if LOW_PREC_OUTPUT
-        float const* const rcpOutScale,
+        float rcpOutScale,
 #endif
 #if USE_INPUT_KV
         IOHead const* __restrict__ const qkv,  // [nbReq][beamWidth][nbQHeads+nbKHeads+nbVHeads],
@@ -957,7 +957,7 @@ __launch_bounds__(128 * 3)
 
     constexpr float xScale = 1.f / kE4M3_MAX;
 #if LOW_PREC_OUTPUT
-    float const oScale = rcpOutScale[0];
+    float const oScale = rcpOutScale;
 #else
     constexpr float oScale = 1.F;
 #endif
@@ -2910,7 +2910,7 @@ void launchHopperF8MHA(
 #endif
     float qScale, OutputHead* output,
 #if LOW_PREC_OUTPUT
-    float const* rcpOutScale,
+    float rcpOutScale,
 #endif
 #if USE_INPUT_KV
     InputHead const* qkv,
@@ -3037,7 +3037,7 @@ static uint32_t const hostSmemSize = configureKernel();
 void launchHopperF8MHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads,
                                  uint32_t slidingWinSize, float qScale, OutputHead* output,
 #if LOW_PREC_OUTPUT
-                                 float const* rcpOutScale,
+                                 float rcpOutScale,
 #endif
                                  InputHead const* q, float const* attentionSinks,
                                  GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
diff --git a/csrc/xqa/xqa_wrapper.cu b/csrc/xqa/xqa_wrapper.cu
@@ -45,10 +45,7 @@ void xqa_wrapper_mla(int64_t multiProcessorCount, double qScale, TensorView outp
 #else
 
 void xqa_wrapper(bool run_sm90_fp8_mha, int64_t multiProcessorCount, int64_t nbKHeads,
-                 int64_t slidingWinSize, double qScale, TensorView output,
-#if LOW_PREC_OUTPUT
-                 TensorView rcpOutScale,
-#endif
+                 int64_t slidingWinSize, double qScale, TensorView output, double rcpOutScale,
                  TensorView q, Optional<TensorView> attentionSinks, TensorView kCacheVLLM,
                  TensorView vCacheVLLM, TensorView kvCachePageList, int64_t maxSeqLen,
                  TensorView seqLen, int64_t batchSize, double kvCacheScale,
@@ -70,7 +67,7 @@ void xqa_wrapper(bool run_sm90_fp8_mha, int64_t multiProcessorCount, int64_t nbK
   mha_func(multiProcessorCount, nbKHeads, slidingWinSize, qScale,
            reinterpret_cast<OutputHead*>(output.data_ptr()),
 #if LOW_PREC_OUTPUT
-           reinterpret_cast<float const*>(rcpOutScale.data_ptr()),
+           rcpOutScale,
 #endif
            reinterpret_cast<InputHead const*>(q.data_ptr()), attentionSinksPtr,
            reinterpret_cast<GMemCacheHead*>(kCacheVLLM.data_ptr()),
diff --git a/flashinfer/aot.py b/flashinfer/aot.py
@@ -404,6 +404,7 @@ def gen_xqa(
             head_dim=head_size,
             head_group_ratio=head_grp_size,
             use_sliding_window=use_sliding_window,
+            output_dtype=input_type,
         )
 
     if has_sm120 or has_sm121:
diff --git a/flashinfer/decode.py b/flashinfer/decode.py
@@ -2077,6 +2077,7 @@ def trtllm_batch_decode_with_kv_cache(
     enable_pdl: Optional[bool] = None,
     backend: str = "auto",
     q_len_per_req: Optional[int] = 1,
+    o_scale: Optional[float] = 1.0,
 ) -> Union[torch.Tensor, FP4Tensor]:
     """
     Parameters
@@ -2142,6 +2143,9 @@ def trtllm_batch_decode_with_kv_cache(
         For sm_100 and sm_103 (blackwell architecture), ``auto`` will choose ``trtllm-gen`` backend.
         For sm_90 (hopper architecture) and sm_120 (blackwell architecture), ``auto`` will choose ``xqa`` backend.
 
+    o_scale : Optional[float] = 1.0
+        output scale factor for xqa fp8 output.
+
     Returns
     -------
     out : Union[torch.Tensor, FP4Tensor]
@@ -2196,6 +2200,7 @@ def trtllm_batch_decode_with_kv_cache(
             kv_layout=kv_layout,
             enable_pdl=enable_pdl,
             q_len_per_req=q_len_per_req,
+            o_scale=o_scale,
         )
     elif backend == "trtllm-gen":
         # Convert NHD layout to HND if necessary (transpose only changes stride, not data)
@@ -2340,6 +2345,7 @@ def xqa_batch_decode_with_kv_cache(
     kv_layout: str = "NHD",
     enable_pdl: bool = None,
     q_len_per_req: Optional[int] = 1,
+    o_scale: Optional[float] = 1.0,
 ) -> torch.Tensor:
     """
     Parameters
@@ -2388,6 +2394,9 @@ def xqa_batch_decode_with_kv_cache(
         Whether to enable Programmatic Dependent Launch (PDL). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#programmatic-dependent-launch-and-synchronization
         Only supported for >= sm90, and currently only for FA2, CUDA core, and trtllm-gen decode.
 
+    o_scale : Optional[float] = 1.0
+        Output scale factor for fp8 output. Must be non-zero: its reciprocal is passed to the kernel as ``rcp_out_scale``.
+
     Returns
     -------
     out : torch.Tensor
@@ -2434,7 +2443,7 @@ def xqa_batch_decode_with_kv_cache(
     workspace_u8 = workspace_buffer.view(torch.uint8)
     semaphore = workspace_u8[: 8 * 1024 * 1024]  # reserve 8MB for semaphore
     scratch = workspace_u8[8 * 1024 * 1024 :]
-    kv_scale_value = bmm2_scale
+    kv_scale_value = bmm2_scale * o_scale
     q_scale_value = bmm1_scale / kv_scale_value * (head_dim**0.5)
 
     query_new = query.unsqueeze(1)
@@ -2464,6 +2473,7 @@ def xqa_batch_decode_with_kv_cache(
         kv_layout=kv_layout,
         sm_count=sm_count,
         enable_pdl=enable_pdl,
+        rcp_out_scale=1.0 / o_scale,
     )
 
     return out
diff --git a/flashinfer/jit/xqa.py b/flashinfer/jit/xqa.py
@@ -28,7 +28,6 @@
     "-DBEAM_WIDTH=1",
     "-DUSE_INPUT_KV=0",
     "-DUSE_CUSTOM_BARRIER=1",
-    "-DLOW_PREC_OUTPUT=0",
     "-DSPEC_DEC=0",
 ]
 
@@ -40,6 +39,7 @@ def gen_xqa_module(
     head_dim: int,
     head_group_ratio: int,
     use_sliding_window: bool,
+    output_dtype: torch.dtype,
 ) -> JitSpec:
     if input_dtype == torch.float16:
         flag_input_dtype = ["-DINPUT_FP16=1", "-DDTYPE=__half"]
@@ -76,6 +76,11 @@ def gen_xqa_module(
     else:
         flag_sliding_window = ["-DSLIDING_WINDOW=0"]
 
+    if output_dtype == torch.float8_e4m3fn:
+        flag_low_prec_output = ["-DLOW_PREC_OUTPUT=1"]
+    else:
+        flag_low_prec_output = ["-DLOW_PREC_OUTPUT=0"]
+
     compilation_context = CompilationContext()
     nvcc_flags = compilation_context.get_nvcc_flags_list(
         supported_major_versions=[9, 10, 11, 12]
@@ -85,7 +90,7 @@ def gen_xqa_module(
     flag_mla_wrapper = ["-DMLA_WRAPPER=0"]
 
     return gen_jit_spec(
-        f"xqa_input_{filename_safe_dtype_map[input_dtype]}_kv_cache_{filename_safe_dtype_map[kv_cache_dtype]}_page_size_{page_size}_head_dim_{head_dim}_head_group_ratio_{head_group_ratio}_use_sliding_window_{use_sliding_window}",
+        f"xqa_input_{filename_safe_dtype_map[input_dtype]}_kv_cache_{filename_safe_dtype_map[kv_cache_dtype]}_output_{filename_safe_dtype_map[output_dtype]}_page_size_{page_size}_head_dim_{head_dim}_head_group_ratio_{head_group_ratio}_use_sliding_window_{use_sliding_window}",
         [
             jit_env.FLASHINFER_CSRC_DIR / "xqa/mha.cu",
             jit_env.FLASHINFER_CSRC_DIR / "xqa/mha_sm90.cu",
@@ -101,6 +106,7 @@ def gen_xqa_module(
         + flag_kv_cache_dtype
         + flag_head_group_ratio
         + flag_sliding_window
+        + flag_low_prec_output
         + flag_mla_wrapper,
         extra_ldflags=["-lcuda"],  # Add CUDA Driver API library
     )
diff --git a/flashinfer/xqa.py b/flashinfer/xqa.py
@@ -38,6 +38,7 @@ def get_xqa_module(
     head_dim: int,
     head_group_ratio: int,
     use_sliding_window: bool,
+    output_dtype: torch.dtype,
 ):
     module = gen_xqa_module(
         input_dtype,
@@ -46,10 +47,11 @@ def get_xqa_module(
         head_dim,
         head_group_ratio,
         use_sliding_window,
+        output_dtype,
     ).build_and_load()
 
     @register_custom_op(
-        f"flashinfer::xqa_input_{filename_safe_dtype_map[input_dtype]}_kv_cache_{filename_safe_dtype_map[kv_cache_dtype]}_page_size_{page_size}_head_dim_{head_dim}_head_group_ratio_{head_group_ratio}_use_sliding_window_{use_sliding_window}",
+        f"flashinfer::xqa_input_{filename_safe_dtype_map[input_dtype]}_kv_cache_{filename_safe_dtype_map[kv_cache_dtype]}_output_{filename_safe_dtype_map[output_dtype]}_page_size_{page_size}_head_dim_{head_dim}_head_group_ratio_{head_group_ratio}_use_sliding_window_{use_sliding_window}",
         mutates_args=("output", "workspace_buffer"),
     )
     def xqa(
@@ -59,6 +61,7 @@ def xqa(
         sliding_win_size: int,
         q_scale: float,
         output: torch.Tensor,
+        rcp_out_scale: float,
         q: torch.Tensor,
         sinks: Optional[torch.Tensor],
         k_cache: torch.Tensor,
@@ -79,6 +82,7 @@ def xqa(
             sliding_win_size,
             q_scale,
             output,
+            rcp_out_scale,
             q,
             sinks,
             k_cache,
@@ -94,7 +98,7 @@ def xqa(
         )
 
     @register_fake_op(
-        f"flashinfer::xqa_input_{filename_safe_dtype_map[input_dtype]}_kv_cache_{filename_safe_dtype_map[kv_cache_dtype]}_page_size_{page_size}_head_dim_{head_dim}_head_group_ratio_{head_group_ratio}_use_sliding_window_{use_sliding_window}"
+        f"flashinfer::xqa_input_{filename_safe_dtype_map[input_dtype]}_kv_cache_{filename_safe_dtype_map[kv_cache_dtype]}_output_{filename_safe_dtype_map[output_dtype]}_page_size_{page_size}_head_dim_{head_dim}_head_group_ratio_{head_group_ratio}_use_sliding_window_{use_sliding_window}"
     )
     def _fake_xqa(
         run_sm90_fp8_mha: bool,
@@ -103,6 +107,7 @@ def _fake_xqa(
         sliding_win_size: int,
         q_scale: float,
         output: torch.Tensor,
+        rcp_out_scale: float,
         q: torch.Tensor,
         sinks: Optional[torch.Tensor],
         k_cache: torch.Tensor,
@@ -140,6 +145,7 @@ def xqa(
     kv_layout: str = "NHD",
     sm_count: Optional[int] = None,
     enable_pdl: Optional[bool] = None,
+    rcp_out_scale: float = 1.0,
 ) -> None:
     r"""Apply attention with paged KV cache using XQA kernel.
     Parameters
@@ -167,7 +173,7 @@ def xqa(
         Data type should be torch.uint32.
     output : torch.Tensor
         Output tensor with shape ``[batch_size, beam_width, num_q_heads, head_dim]``.
-        Data type should match query tensor. This tensor will be modified in-place.
+        Data type should match the query tensor, or be ``torch.float8_e4m3fn`` when the KV cache is also fp8. This tensor will be modified in-place.
     workspace_buffer : torch.Tensor
         Workspace buffer for temporary computations.
         Data type should be torch.uint8.
@@ -196,6 +202,8 @@ def xqa(
     enable_pdl : Optional[bool], default=None
         Whether to enable PDL (Programmatic Dependent Launch) optimization.
         If None, will be set to True if hardware supports it.
+    rcp_out_scale : float, default=1.0
+        Reciprocal of output scale factor.
 
     Note
     ----
@@ -231,6 +239,13 @@ def xqa(
 
     assert k_cache.dtype == v_cache.dtype, "K and V cache must have the same dtype"
 
+    if output.dtype == torch.float8_e4m3fn:
+        assert k_cache.dtype == torch.float8_e4m3fn, (
+            "KV cache must be fp8 when output is fp8"
+        )
+    else:
+        assert output.dtype == q.dtype, "Output and query must have the same dtype"
+
     # Convert HND layout to NHD if necessary (transpose only changes stride, not data)
     if kv_layout == "HND":
         # For HND: [..., H, N, D] -> NHD: [..., N, H, D]
@@ -255,6 +270,7 @@ def xqa(
         head_dim,
         head_group_ratio,
         use_sliding_window,
+        output.dtype,
     )
     xqa_module.xqa(
         run_sm90_fp8_mha,
@@ -263,6 +279,7 @@ def xqa(
         sliding_win_size if use_sliding_window else 0,
         q_scale,
         output,
+        rcp_out_scale,
         q,
         sinks,
         k_cache,
diff --git a/tests/attention/test_trtllm_gen_attention.py b/tests/attention/test_trtllm_gen_attention.py
diff --git a/tests/attention/test_xqa.py b/tests/attention/test_xqa.py
diff --git a/tests/attention/test_xqa_batch_decode.py b/tests/attention/test_xqa_batch_decode.py

Original file line number	Diff line number	Diff line change
`@@ -404,6 +404,7 @@ def gen_xqa(`
`404`	`404`	`head_dim=head_size,`
`405`	`405`	`head_group_ratio=head_grp_size,`
`406`	`406`	`use_sliding_window=use_sliding_window,`
	`407`	`+ output_dtype=input_type,`
`407`	`408`	`)`
`408`	`409`
`409`	`410`	`if has_sm120 or has_sm121:`