Skip to content

Commit a5025a2

Browse files
authored
Fix BMM style MoE export in fp8_pc_pt recipe (#515)
## What does this PR do? **Type of change:** Bug fix **Overview:** The Llama-4-Scout-17B-16E-Instruct model uses Llama4TextExperts, which stores expert weights in a BMM (batch matrix multiply) layout: (num_experts, input_dim, output_dim). This differs from the layout used by standard MoE models. The FP8_PC_PT (FP8 per-channel per-token) quantization code did not handle this layout, causing shape mismatches during export. ## Usage <!-- You can potentially add a usage example below. --> ```python python3 hf_ptq.py --pyt_ckpt_path /home/scratch.omniml_data_2/models/Llama-4-Scout-17B-16E-Instruct --qformat fp8_pc_pt --export_path /home/scratch.omniml_data_2/zhiyuc/checkpoints/llama4-scout-fp8_pc_pt --trust_remote_code ``` ## Testing <!-- Mention how you have tested your change if applicable. --> ## Before your PR is "*Ready for review*" <!-- If you haven't finished some of the above items you can still open `Draft` PR. --> - **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed. - **Is this change backward compatible?**: Yes <!--- If No, explain why. --> - **Did you write any new necessary tests?**: No - **Did you add or update any necessary documentation?**: No - **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: No <!--- Only for new features, API changes, critical bug fixes or bw breaking changes. --> ## Additional Information <!-- E.g. related issue. --> --------- Signed-off-by: Zhiyu Cheng <[email protected]>
1 parent 93f5bbf commit a5025a2

File tree

2 files changed

+55
-5
lines changed

2 files changed

+55
-5
lines changed

modelopt/torch/export/quant_utils.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -779,7 +779,36 @@ def to_quantized_weight(
779779
)[0]._quantized_data
780780

781781
if quantization == QUANTIZATION_FP8_PC_PT:
782-
return (weight / weights_scaling_factor.unsqueeze(-1)).to(torch.float8_e4m3fn)
782+
if weight.dim() == 3:
783+
# Handle different scale tensor shapes
784+
if weights_scaling_factor.dim() == 1:
785+
# Per-expert scaling only: (num_experts,) -> (num_experts, 1, 1)
786+
return (weight / weights_scaling_factor[:, None, None]).to(torch.float8_e4m3fn)
787+
elif weights_scaling_factor.dim() == 2:
788+
# Per-channel scaling: check which dimension matches
789+
if weights_scaling_factor.shape[0] != weight.shape[0]:
790+
raise ValueError(
791+
f"First dimension (num_experts) mismatch for FP8_PC_PT quantization. "
792+
f"weight shape: {weight.shape}, scale shape: {weights_scaling_factor.shape}"
793+
)
794+
if weight.shape[-1] == weight.shape[-2]:
795+
raise ValueError(
796+
f"Ambiguous scaling dimension for FP8_PC_PT quantization with square weight matrix. "
797+
f"weight shape: {weight.shape}, scale shape: {weights_scaling_factor.shape}. "
798+
f"Cannot determine if scaling should be applied to input_dim or output_dim."
799+
)
800+
if weights_scaling_factor.shape[-1] == weight.shape[-1]:
801+
# (num_experts, input_dim) -> (num_experts, 1, input_dim), BMM-style
802+
return (weight / weights_scaling_factor.unsqueeze(-2)).to(torch.float8_e4m3fn)
803+
elif weights_scaling_factor.shape[-1] == weight.shape[-2]:
804+
# (num_experts, output_dim) -> (num_experts, output_dim, 1), Standard MoE case
805+
return (weight / weights_scaling_factor.unsqueeze(-1)).to(torch.float8_e4m3fn)
806+
else:
807+
raise ValueError(
808+
f"Cannot determine correct unsqueeze dimension for FP8_PC_PT quantization. "
809+
f"weight shape: {weight.shape}, scale shape: {weights_scaling_factor.shape}"
810+
)
811+
return (weight / weights_scaling_factor[:, None]).to(torch.float8_e4m3fn)
783812

784813
if quantization in [QUANTIZATION_INT4_AWQ, QUANTIZATION_W4A8_AWQ]:
785814
return pack_int4_in_uint8(weight, weights_scaling_factor)

modelopt/torch/export/unified_export_hf.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
KV_CACHE_NVFP4_AFFINE,
5151
QUANTIZATION_FP8,
5252
QUANTIZATION_FP8_PB_REAL,
53+
QUANTIZATION_FP8_PC_PT,
5354
QUANTIZATION_NONE,
5455
QUANTIZATION_NVFP4,
5556
QUANTIZATION_NVFP4_AWQ,
@@ -327,13 +328,15 @@ def _export_quantized_weight(
327328
weight_scale_2: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale_2, None)
328329

329330
# Transpose weight for bmm-style expert quantization (llama4, gpt-oss)
331+
# Check if this is a BMM-style expert weight that needs transposition
332+
is_bmm_expert_weight = weight.dim() == 3 and any(
333+
expert_type in type(sub_module).__name__
334+
for expert_type in ["Llama4TextExperts", "GptOssExperts"]
335+
)
336+
330337
if quantization_format in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]:
331338
# Transpose weight from (num_experts, input_dim, output_dim) to (num_experts, output_dim, input_dim)
332339
# for NVFP4 quantization functions that expect input_dim as the last dimension for block quantization
333-
is_bmm_expert_weight = weight.dim() == 3 and any(
334-
expert_type in type(sub_module).__name__
335-
for expert_type in ["Llama4TextExperts", "GptOssExperts"]
336-
)
337340
weight, _ = maybe_transpose_expert_weight_dimensions(
338341
weight, is_bmm_expert_weight=is_bmm_expert_weight
339342
)
@@ -354,6 +357,24 @@ def _export_quantized_weight(
354357
quantized_weight, weight_scale = maybe_transpose_expert_weight_dimensions(
355358
quantized_weight, weight_scale, is_bmm_expert_weight=is_bmm_expert_weight
356359
)
360+
elif quantization_format == QUANTIZATION_FP8_PC_PT and is_bmm_expert_weight:
361+
# For FP8_PC_PT with BMM-style experts, transpose only the weight (not weight_scale)
362+
weight, _ = maybe_transpose_expert_weight_dimensions(
363+
weight, is_bmm_expert_weight=is_bmm_expert_weight
364+
)
365+
366+
quantized_weight = to_quantized_weight(
367+
weight.to(dtype),
368+
weight_scale,
369+
quantization_format,
370+
weight_scale_2,
371+
block_size,
372+
)
373+
374+
# Transpose back to original BMM format
375+
quantized_weight, _ = maybe_transpose_expert_weight_dimensions(
376+
quantized_weight, is_bmm_expert_weight=is_bmm_expert_weight
377+
)
357378
else:
358379
quantized_weight = to_quantized_weight(
359380
weight.to(dtype),

0 commit comments

Comments (0)