@@ -294,6 +294,7 @@ def forward(
         out_dtype: Optional[torch.dtype] = torch.bfloat16,
         emulated: bool = False,
         use_triton_for_dim0_cast: bool = False,
+        wgrad_with_hp: bool = False,
         scale_calculation_mode: ScaleCalculationMode = ScaleCalculationMode.RCEIL,
     ) -> torch.Tensor:
         # torchao _quantize_then_scaled_grouped_mm only supports A=2D and B=3D.
@@ -352,6 +353,7 @@ def forward(
         ctx.out_dtype = out_dtype
         ctx.emulated = emulated
         ctx.use_triton_for_dim0_cast = use_triton_for_dim0_cast
+        ctx.wgrad_with_hp = wgrad_with_hp
         ctx.scale_calculation_mode = scale_calculation_mode
         return out
 
@@ -361,6 +363,7 @@ def backward(ctx, grad_out: torch.Tensor):
         block_size = ctx.block_size
         out_dtype = ctx.out_dtype
         use_triton_for_dim0_cast = ctx.use_triton_for_dim0_cast
+        wgrad_with_hp = ctx.wgrad_with_hp
         scale_calculation_mode = ctx.scale_calculation_mode
 
         # grad_out_data shape: (M, N)
@@ -405,59 +408,68 @@ def backward(ctx, grad_out: torch.Tensor):
             out_dtype=out_dtype,
         )
 
-        # grad_out_t_data shape: (M, N)
-        # grad_out_t_scales shape: (N, M//block_size)
-        grad_out_t_mx = _to_mxfp8_dim1_kernel_wrapper(
-            grad_out,
-            block_size,
-            elem_dtype=torch.float8_e4m3fn,
-            hp_dtype=grad_out.dtype,
-            kernel_preference=KernelPreference.AUTO,  # Not used
-            cast_kernel_choice=MXFP8Dim1CastKernelChoice.CUDA,
-            scale_calculation_mode=scale_calculation_mode,
-        )
-        grad_out_t_data = grad_out_t_mx.qdata
-        grad_out_t_scales = grad_out_t_mx.scale
-
-        # Transpose A so we can scale along the M dimension, then un-transpose.
-        # A shape: (M, K)
-        # A_t_data shape: (K, M)
-        # A_t_scales shape: (K, M//block_size)
-        A_t_mx = _to_mxfp8_dim1_kernel_wrapper(
-            A,
-            block_size,
-            elem_dtype=torch.float8_e4m3fn,
-            hp_dtype=A.dtype,
-            kernel_preference=KernelPreference.AUTO,  # Not used
-            cast_kernel_choice=MXFP8Dim1CastKernelChoice.CUDA,
-            scale_calculation_mode=scale_calculation_mode,
-        )
-        A_t_data = A_t_mx.qdata
-        A_t_scales = A_t_mx.scale
-
-        # Convert scales to blocked format for 2d-2d grouped mm
-        scale_group_offsets = offs // block_size
-        grad_out_t_scales_blocked = triton_mx_block_rearrange_2d_K_groups(
-            grad_out_t_scales,
-            scale_group_offsets,
-        )
-        A_t_scales_blocked = triton_mx_block_rearrange_2d_K_groups(
-            A_t_scales,
-            scale_group_offsets,
-        )
+        # Optionally compute wgrad in high precision, if specified.
+        if wgrad_with_hp:
+            # TODO: migrate all grouped gemms in this file to the new torch.nn.functional API
+            # grad_B = grouped mm of (N, total_M) @ (total_M, K) = (E, N, K)
+            grad_B = torch._grouped_mm(
+                grad_out.transpose(-2, -1), A, offs=offs, out_dtype=out_dtype
+            )
+            grad_B_t = grad_B.transpose(-2, -1)
+        else:
+            # grad_out_t_data shape: (M, N)
+            # grad_out_t_scales shape: (N, M//block_size)
+            grad_out_t_mx = _to_mxfp8_dim1_kernel_wrapper(
+                grad_out,
+                block_size,
+                elem_dtype=torch.float8_e4m3fn,
+                hp_dtype=grad_out.dtype,
+                kernel_preference=KernelPreference.AUTO,  # Not used
+                cast_kernel_choice=MXFP8Dim1CastKernelChoice.CUDA,
+                scale_calculation_mode=scale_calculation_mode,
+            )
+            grad_out_t_data = grad_out_t_mx.qdata
+            grad_out_t_scales = grad_out_t_mx.scale
+
+            # Transpose A so we can scale along the M dimension, then un-transpose.
+            # A shape: (M, K)
+            # A_t_data shape: (K, M)
+            # A_t_scales shape: (K, M//block_size)
+            A_t_mx = _to_mxfp8_dim1_kernel_wrapper(
+                A,
+                block_size,
+                elem_dtype=torch.float8_e4m3fn,
+                hp_dtype=A.dtype,
+                kernel_preference=KernelPreference.AUTO,  # Not used
+                cast_kernel_choice=MXFP8Dim1CastKernelChoice.CUDA,
+                scale_calculation_mode=scale_calculation_mode,
+            )
+            A_t_data = A_t_mx.qdata
+            A_t_scales = A_t_mx.scale
+
+            # Convert scales to blocked format for 2d-2d grouped mm
+            scale_group_offsets = offs // block_size
+            grad_out_t_scales_blocked = triton_mx_block_rearrange_2d_K_groups(
+                grad_out_t_scales,
+                scale_group_offsets,
+            )
+            A_t_scales_blocked = triton_mx_block_rearrange_2d_K_groups(
+                A_t_scales,
+                scale_group_offsets,
+            )
 
-        # grad_B_t = scaled grouped mm of (N,total_M) @ (total_M,K) = (E,N,K)
-        grad_B = torch._scaled_grouped_mm(
-            grad_out_t_data,
-            A_t_data.transpose(-2, -1),
-            grad_out_t_scales_blocked,
-            A_t_scales_blocked,
-            offs=offs,
-            out_dtype=out_dtype,
-        )
-        # grad_B_t shape = (E,K,N)
-        grad_B_t = grad_B.transpose(-2, -1)
-        return grad_A, grad_B_t, None, None, None, None
+            # grad_B_t = scaled grouped mm of (N,total_M) @ (total_M,K) = (E,N,K)
+            grad_B = torch._scaled_grouped_mm(
+                grad_out_t_data,
+                A_t_data.transpose(-2, -1),
+                grad_out_t_scales_blocked,
+                A_t_scales_blocked,
+                offs=offs,
+                out_dtype=out_dtype,
+            )
+            # grad_B_t shape = (E,K,N)
+            grad_B_t = grad_B.transpose(-2, -1)
+        return grad_A, grad_B_t, None, None, None, None, None, None, None
 
 
 def _to_mxfp8_dim1_3d(
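
For readers tracing the new wgrad_with_hp path, here is a minimal reference sketch (not part of the diff; the helper name reference_wgrad and the convention that offs holds cumulative end-row offsets per expert group are assumptions) of the per-group math that torch._grouped_mm(grad_out.transpose(-2, -1), A, offs=offs) performs, followed by the transpose to (E, K, N):

import torch

def reference_wgrad(grad_out: torch.Tensor, A: torch.Tensor, offs: torch.Tensor) -> torch.Tensor:
    # grad_out: (total_M, N), A: (total_M, K), offs: (E,) cumulative end rows per group.
    # Returns grad_B_t with shape (E, K, N), matching grad_B.transpose(-2, -1) above.
    grads = []
    start = 0
    for end in offs.tolist():
        g = grad_out[start:end]                # (m_e, N) rows routed to this expert
        a = A[start:end]                       # (m_e, K)
        grads.append(g.transpose(-2, -1) @ a)  # (N, K) weight grad for this expert
        start = end
    grad_B = torch.stack(grads)                # (E, N, K)
    return grad_B.transpose(-2, -1)            # (E, K, N)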
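
And a small worked example (values are hypothetical) of the offs // block_size conversion used before the blocked-scale rearrangement in the mxfp8 branch: each scale row along the reduction dimension covers block_size rows of the original tensor, so the group boundaries shrink by the same factor.

import torch

# Hypothetical offsets; the mxfp8 path expects them to be multiples of block_size.
offs = torch.tensor([128, 288, 480])       # cumulative rows after experts 0, 1, 2
block_size = 32
scale_group_offsets = offs // block_size   # tensor([ 4,  9, 15])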