Wjx/opt a4w4 (#1564)

lalala-sh · web-flow · commit aae5b49cbb29 · 2025-12-05T08:46:42.000+08:00
* Optimize a4w4 MoE decode

* update
diff --git a/aiter/ops/triton/mha.py b/aiter/ops/triton/mha.py
@@ -784,10 +784,10 @@ def flash_attn_with_kvcache(
         window_size: (left, right) local attention window; (-1,-1) = full.
         softcap: (float) currently must be 0.0 (backend limitation).
         num_splits: 0 or 1 only (backend limitation >1).
-        rotary_cos/rotary_sin: Optional rotary embeddings (applied if provided) – interleaving flag unused here.
+        rotary_cos/rotary_sin: Optional rotary embeddings (applied if provided) - interleaving flag unused here.
         cache_batch_idx/cache_leftpad: Optional indexing / left padding metadata.
             block_table: Optional paging table mapping logical blocks for paged KV cache.
-        alibi_slopes: (nheads,) or (batch,nheads) bias slopes (currently ignored if provided – placeholder).
+        alibi_slopes: (nheads,) or (batch,nheads) bias slopes (currently ignored if provided - placeholder).
         rotary_interleaved: Flag kept for parity (currently forwarded as True constant to backend which ignores it).
             return_softmax_lse: If True returns (out, lse) else out.
 
diff --git a/aiter/ops/triton/pa_decode.py b/aiter/ops/triton/pa_decode.py
@@ -48,7 +48,7 @@ def paged_attention_decode(
 ) -> None:
     """
     Paged attention decode with automatic V1/V2 dispatch and quantization support.
-    V1 for short sequences (≤8192), V2 with sequence partitioning for longer sequences.
+    V1 for short sequences (<=8192), V2 with sequence partitioning for longer sequences.
 
     Args:
         output (torch.Tensor): Pre-allocated output with shape (num_seqs, num_q_heads, head_dim).
diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages.cu b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages.cu
@@ -65,7 +65,7 @@ void ck_moe_stage1(torch::Tensor &hidden_states,     // [m, k], input token
                 "Out dtype only support BFloat16/Float16!")
 
     int tokens = hidden_states.size(0);
-    int sorted_size = sorted_token_ids.size(0);
+    int sorted_size = std::min(int64_t(tokens * topk * block_m.value()), sorted_token_ids.size(0));
     int E = w1.size(0);
     int N = w1.size(1) / 2;
     int K = hidden_states.size(-1);
@@ -122,7 +122,7 @@ void ck_moe_stage2(torch::Tensor &inter_states,      // [m, k], input token
                 "Out dtype only support BFloat16/Float16!")
 
     int tokens = inter_states.size(0);
-    int sorted_size = sorted_token_ids.size(0);
+    int sorted_size = std::min(int64_t(tokens * topk * block_m.value()), sorted_token_ids.size(0));
     int E = w1.size(0);
     int N = w2.size(1);
     int K = inter_states.size(-1);
diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.py b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.py
@@ -194,6 +194,7 @@ def name(self) -> str:
      0: kernelInstanceGEMM1(       256,       32,          128,       128,     1,       4,        3,),
      1: kernelInstanceGEMM1(       256,       64,          128,       128,     1,       4,        3,),
      2: kernelInstanceGEMM1(       256,      128,          128,       128,     1,       4,        3,),
+     4: kernelInstanceGEMM1(        64,       32,           32,       128,     1,       1,        3,),
     #  3: kernelInstanceGEMM1(       256,      256,         128,       128,     2,       2,        3,),
 }
 
diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_mxfp4.cuh b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_mxfp4.cuh
@@ -77,7 +77,8 @@ void ck_moe_stage1_gemm(const hipStream_t& stream,
     // : 128;
     static constexpr ck::index_t CShuffleMXDLPerWave = MXDLPerWave;
     static constexpr ck::index_t CShuffleNXDLPerWave = NXDLPerWave;
-    static constexpr ck::index_t CShuffleNLane       = NPerBlock / 2 / NXDLPerWave; // 64
+    static constexpr ck::index_t CShuffleNLane =
+        BLOCKSIZE == 64 ? NPerBlock / NXDLPerWave : NPerBlock / 2 / NXDLPerWave; // 64
     static constexpr ck::index_t CShuffleMLane       = BLOCKSIZE / CShuffleNLane;
     static constexpr ck::index_t AK1                 = 16 / sizeof(A0DataType);
     static constexpr ck::index_t BK1                 = 16 / sizeof(B0DataType);
@@ -97,17 +98,17 @@ void ck_moe_stage1_gemm(const hipStream_t& stream,
 ///######|         |         |         |        |           |           |           |          |            |                 |   Operation|   Operation|    Operation|               |      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 ///######|         |         |         |        |           |           |           |          |            |                 |            |            |             |               |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |    S<C, D0, D1>|
 ///###### RCR
-          <     Row,  Col,  DsLayout, ELayout, 
+          <     Row,  Col,  DsLayout, ELayout,
                 A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
-                AElementOp,  BElementOp, CDEElementOp,       GemmSpec, 
-                32,      BLOCKSIZE,   
+                AElementOp,  BElementOp, CDEElementOp,       GemmSpec,
+                32,      BLOCKSIZE,
                 MPerBlock,      NPerBlock,    128,
                 AK1,   BK1,
                 MNPerXDL,   MNPerXDL,
                 MXDLPerWave,     NXDLPerWave,
                 S<K0_A, K0_M_A, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 1,
                 S<K0_B, K0_N_B, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, BK1, BK1, 1,
-                2,    CShuffleNXDLPerWave,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec>,
+                2,    CShuffleNXDLPerWave,   S<1, CShuffleNLane, 1, CShuffleMLane>, S<EVec, D0Vec, D1Vec>,
                 ck::BlockGemmPipelineScheduler::Intrawave, PipelineVer, ActOP, Nswizzle, true, MulRoutedWeight, ck::index_t, A0DataType>; // clang-format on
     // clang-format on
 
@@ -286,10 +287,10 @@ void ck_moe_stage2_gemm(const hipStream_t& stream,
 ///#####|         |         |         |        |           |           |           |          |            |                 |   Operation|   Operation|    Operation|               |      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 ///#####|         |         |         |        |           |           |           |          |            |                 |            |            |             |               |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |    S<C, D0, D1>|
 ///##### RCR
-          <     Row,  Col,  DsLayout, ELayout, 
+          <     Row,  Col,  DsLayout, ELayout,
                 A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
                 AElementOp,  BElementOp, CDEElementOp,       GemmSpec,
-                32,      BLOCKSIZE,   
+                32,      BLOCKSIZE,
                 MPerBlock,      NPerBlock,    128,
                 AK1,   BK1,
                 MNPerXDL,   MNPerXDL,
@@ -365,4 +366,4 @@ void ck_moe_stage2_gemm(const hipStream_t& stream,
         void *&num_valid_ids,                                                                                                                                        \
         void *&out,                                                                                                                                                  \
         std::optional<void *> w2_scale,                                                                                                                              \
-        std::optional<void *> a2_scale);
+        std::optional<void *> a2_scale);
diff --git a/csrc/ck_gemm_moe_2stages_codegen/gen_instances.py b/csrc/ck_gemm_moe_2stages_codegen/gen_instances.py
@@ -181,7 +181,7 @@
     {{
         if (block_m == 32)
         {{
-            return ck_moe_stage1_gemm<{A0DataType}, {B0DataType}, {AccDataType}, {EDataType}, {CDEElementOp}, V3, 256, 32, 128, 128/sizeof({A0DataType}), 1, 4, {Nswizzle}, {Quant} == static_cast<int>(QuantType::per_Tensor), {MulRoutedWeight}, {ActOP}>;
+            return ck_moe_stage1_gemm<{A0DataType}, {B0DataType}, {AccDataType}, {EDataType}, {CDEElementOp}, V3, 64, 32, 32, 128/sizeof({A0DataType}), 1, 1, {Nswizzle}, {Quant} == static_cast<int>(QuantType::per_Tensor), {MulRoutedWeight}, {ActOP}>;
         }}
         else if (block_m == 64)
         {{
diff --git a/csrc/cpp_itfs/pa/pa_ragged.cuh b/csrc/cpp_itfs/pa/pa_ragged.cuh
@@ -90,7 +90,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_
     }
     const int64_t query_loc = static_cast<int64_t>(seq_idx * MTP);
     const int* block_table_seq = kv_page_indices + kv_indptr[seq_idx];
-    
+
     if constexpr (VERSION_ID == 0) // 0: GOLDEN VERSION
     {
         _paged_attention_kernel<scalar_t, cache_t, KV_DTYPE, BLOCK_SIZE, HEAD_SIZE, NUM_THREADS, ALIBI_ENABLED, GQA_RATIO, MTP, AttentionVariant, false>
diff --git a/csrc/pybind/fused_mrope_rms_pybind.cu b/csrc/pybind/fused_mrope_rms_pybind.cu
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 #include "rocm_ops.hpp"
 #include "fused_mrope_rms.h"
 

Original file line number	Diff line number	Diff line change
`@@ -194,6 +194,7 @@ def name(self) -> str:`
`194`	`194`	`0: kernelInstanceGEMM1( 256, 32, 128, 128, 1, 4, 3,),`
`195`	`195`	`1: kernelInstanceGEMM1( 256, 64, 128, 128, 1, 4, 3,),`
`196`	`196`	`2: kernelInstanceGEMM1( 256, 128, 128, 128, 1, 4, 3,),`
	`197`	`+ 4: kernelInstanceGEMM1( 64, 32, 32, 128, 1, 1, 3,),`
`197`	`198`	`# 3: kernelInstanceGEMM1( 256, 256, 128, 128, 2, 2, 3,),`
`198`	`199`	`}`
`199`	`200`
Original file line number	Diff line number	Diff line change
`@@ -90,7 +90,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_`
`90`	`90`	`}`
`91`	`91`	`const int64_t query_loc = static_cast<int64_t>(seq_idx * MTP);`
`92`	`92`	`const int* block_table_seq = kv_page_indices + kv_indptr[seq_idx];`
`93`		`-`
	`93`	`+`
`94`	`94`	`if constexpr (VERSION_ID == 0) // 0: GOLDEN VERSION`
`95`	`95`	`{`
`96`	`96`	`_paged_attention_kernel<scalar_t, cache_t, KV_DTYPE, BLOCK_SIZE, HEAD_SIZE, NUM_THREADS, ALIBI_ENABLED, GQA_RATIO, MTP, AttentionVariant, false>`