Commit 4f5bc7a

Align memory_format for conv2d/3d in Float8Tensor with hp Tensor (#3352)
Align memory_format for conv2d and conv3d in Float8Tensor with high precision Tensors

Summary: as titled, we want to make sure the output of `F.conv3d(input, weight, ...)` and `F.conv3d(input, fp8_weight, ...)` have the same memory_format.

Test Plan: python test/quantization/quantize_/workflows/float8/test_float8_tensor.py -k test_fp8_conv_variants
1 parent 7d5e2f6 commit 4f5bc7a
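
The docstrings added in float8_tensor.py below spell out the target semantics: if either the input or the weight is in a channels_last(_3d) memory_format, the conv output should be channels_last(_3d) too, otherwise it should be contiguous. As a reference, here is a minimal sketch (not part of the diff; shapes and dtype are chosen arbitrarily to mirror the bf16 baseline) that probes the high precision behavior the fp8 path is being aligned with:

import torch
import torch.nn.functional as F

if torch.cuda.is_available():
    inp = torch.randn(1, 16, 8, 32, 32, dtype=torch.bfloat16, device="cuda")
    weight = torch.randn(32, 16, 3, 3, 3, dtype=torch.bfloat16, device="cuda")
    for fmt_in, fmt_w in [
        (torch.contiguous_format, torch.contiguous_format),
        (torch.channels_last_3d, torch.contiguous_format),
        (torch.channels_last_3d, torch.channels_last_3d),
    ]:
        out = F.conv3d(inp.to(memory_format=fmt_in), weight.to(memory_format=fmt_w))
        # report whether the high precision output comes back in channels_last_3d layout
        print(fmt_in, fmt_w, out.is_contiguous(memory_format=torch.channels_last_3d))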

File tree

2 files changed: +96 -60 lines changed

test/quantization/quantize_/workflows/float8/test_float8_tensor.py

Lines changed: 50 additions & 23 deletions
@@ -85,10 +85,6 @@ def __init__(
            dtype=dtype,
            device=device,
        )
-        if dim == 3:
-            self.conv = self.conv.to(memory_format=torch.channels_last_3d)
-        elif dim == 2:
-            self.conv = self.conv.to(memory_format=torch.channels_last)

    def forward(self, x):
        return self.conv(x)
@@ -340,41 +336,47 @@ def _test_fp8_matmul_model(
    @common_utils.parametrize("compile", [True, False])
    @common_utils.parametrize("inference_mode", [True, False])
    # test for 2D/3D conv
-    # Inputs are (N, C_in, C_out, (D, H, W) or
-    # (N, C_in, C_out, (H, W)
+    # Inputs are (N, C_in, C_out, (D, H, W), kernel_size or
+    # (N, C_in, C_out, (H, W), kernel_size
    @common_utils.parametrize(
        "sizes",
        [
-            (4, 16, 64, (32, 32, 32)),
-            (4, 16, 64, (32, 32)),
+            (1, 160, 320, (3, 194, 130), 3),
+            # Note: kernel_size can't be 1, otherwise
+            # the weight will be channels_last even though
+            # it's contiguous because of the value of
+            # stride
+            (1, 320, 640, (96, 64), 3),
        ],
    )
+    @common_utils.parametrize(
+        "is_input_channels_last",
+        [True, False],
+    )
+    @common_utils.parametrize(
+        "is_weight_channels_last",
+        [True, False],
+    )
    def test_fp8_conv_variants(
        self,
        dtype: torch.dtype,
        compile: bool,
        inference_mode: bool,
        sizes: Tuple,
+        is_input_channels_last: bool,
+        is_weight_channels_last: bool,
    ):
        torch.compiler.reset()
        granularity = PerTensor()
        kernel_preference = KernelPreference.AUTO

-        N, C_in, C_out, spatial_dims = sizes
+        N, C_in, C_out, spatial_dims, kernel_size = sizes
        dim = len(spatial_dims)
        convs = {1: torch.nn.Conv1d, 2: torch.nn.Conv2d, 3: torch.nn.Conv3d}
        assert dim in convs, f"Unsupported dim: {dim}"
        conv_class = convs[dim]

-        kernel_size = 3
-
-        # Note: this is channel last memory format
        input_tensor = torch.randn(N, C_in, *spatial_dims, dtype=dtype, device="cuda")
-        if dim == 3:
-            input_tensor = input_tensor.to(memory_format=torch.channels_last_3d)
-        else:
-            assert dim == 2
-            input_tensor = input_tensor.to(memory_format=torch.channels_last)

        model = ToyConvModel(
            dim,
@@ -387,6 +389,14 @@ def test_fp8_conv_variants(
            device="cuda",
        ).eval()

+        channels_last_memory_format = (
+            torch.channels_last_3d if dim == 3 else torch.channels_last
+        )
+        if is_input_channels_last:
+            input_tensor = input_tensor.to(memory_format=channels_last_memory_format)
+        if is_weight_channels_last:
+            model = model.to(memory_format=channels_last_memory_format)
+
        quantized_model = copy.deepcopy(model)

        config = Float8DynamicActivationFloat8WeightConfig(
@@ -406,6 +416,20 @@ def test_fp8_conv_variants(
        output_original = model(input_tensor)
        output_quantized = quantized_model(input_tensor)

+        # making sure quantized kernel produces tensor with memory_format
+        # that's aligned with bf16 kernel
+        is_bf16_output_channels_last = output_original.is_contiguous(
+            memory_format=channels_last_memory_format
+        )
+        is_quantized_output_channels_last = output_quantized.is_contiguous(
+            memory_format=channels_last_memory_format
+        )
+
+        assert is_bf16_output_channels_last == is_quantized_output_channels_last, (
+            "unexpected output strides for quantized model: "
+            f"{output_original.stride()} {output_quantized.stride()}"
+        )
+
        error = compute_error(output_original, output_quantized)
        assert compute_error(output_original, output_quantized) > 20, (
            f"Quantization error is too high got a SQNR of {error}"
@@ -452,13 +476,7 @@ def test_fp8_conv_skip_quant(

        kernel_size = 3

-        # Note: this is channel last memory format
        input_tensor = torch.randn(N, C_in, *spatial_dims, dtype=dtype, device="cuda")
-        if dim == 3:
-            input_tensor = input_tensor.to(memory_format=torch.channels_last_3d)
-        else:
-            input_tensor = input_tensor.to(memory_format=torch.channels_last)
-
        model = ToyConvModel(
            dim,
            C_in,
@@ -470,6 +488,13 @@ def test_fp8_conv_skip_quant(
            device="cuda",
        ).eval()

+        if dim == 3:
+            input_tensor = input_tensor.to(memory_format=torch.channels_last_3d)
+            model = model.to(memory_format=torch.channels_last_3d)
+        else:
+            input_tensor = input_tensor.to(memory_format=torch.channels_last)
+            model = model.to(memory_format=torch.channels_last)
+
        quantized_model = copy.deepcopy(model)

        config = Float8DynamicActivationFloat8WeightConfig(
@@ -932,6 +957,8 @@ def test_unsqueeze_conv2d_weight(self):
            device=device,
        ).eval()

+        model = model.to(memory_format=torch.channels_last)
+
        quantized_model = copy.deepcopy(model)

        config = Float8DynamicActivationFloat8WeightConfig(
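
One of the new test comments above notes that kernel_size can't be 1 in the 2D case. For context, this is the ambiguity it refers to (a sketch, not part of the diff): a contiguous 1x1 conv2d weight also satisfies the channels_last stride pattern, so the test could not tell the two memory_formats apart for such a weight.

import torch

# hypothetical 1x1 conv2d weight with the C_out/C_in sizes used in the test above
w = torch.randn(640, 320, 1, 1)
print(w.is_contiguous())                                   # plain contiguous layout
print(w.is_contiguous(memory_format=torch.channels_last))  # reported as channels_last too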

torchao/quantization/quantize_/workflows/float8/float8_tensor.py

Lines changed: 46 additions & 37 deletions
@@ -533,36 +533,60 @@ def _quantize_and_scaled_conv3d(
    )

    assert kernel_choice == "fbgemm", "Only fbgemm kernel choice is supported currently"
+    input_qdata = input_tensor.qdata
+    weight_qdata = weight_tensor.qdata
+
+    is_input_channels_last = input_qdata.is_contiguous(
+        memory_format=torch.channels_last_3d
+    )
+    is_weight_channels_last = weight_qdata.is_contiguous(
+        memory_format=torch.channels_last_3d
+    )
+
+    # convert the input/weight to channels_last_3d memory_format here
+    # to make sure we can call the fbgemm conv
+    # kernel, it should be a no-op if both activation and weight are in
+    # channels_last_3d memory_format
+    input_qdata = input_qdata.contiguous(memory_format=torch.channels_last_3d)
+    weight_qdata = weight_qdata.contiguous(memory_format=torch.channels_last_3d)
+
    # move C_in to last dim
    # after permute: (N, D, H, W, C_in)
-    act_qdata = input_tensor.qdata.permute([0, 2, 3, 4, 1])
+    input_qdata = input_qdata.permute([0, 2, 3, 4, 1])

    # move C_in to last dim
    # after permute: (C_out, K1, K2, K3, C_in)
+    weight_qdata = weight_qdata.permute([0, 2, 3, 4, 1])

-    weight_qdata = weight_tensor.qdata.permute([0, 2, 3, 4, 1])
-
-    assert act_qdata.is_contiguous() and weight_qdata.is_contiguous(), (
-        "Please make sure both activation and weights are in the `channels_last_3d` memory_format"
-    )
-
-    act_scale = input_tensor.scale
+    input_scale = input_tensor.scale
    weight_scale = weight_tensor.scale
    output = torch.ops.fbgemm.f8f8bf16_conv(
-        act_qdata,
+        input_qdata,
        weight_qdata,
-        act_scale * weight_scale,
+        input_scale * weight_scale,
        padding,
        stride,
        dilation,
    )
    # output shape after permute: N, C_out, D_out, H_out, W_out
    output = output.permute([0, 4, 1, 2, 3])
+
+    # aligning the semantics with bfloat16 conv ops, the
+    # output should use contiguous_format if none of the input/weight
+    # are in channels_last format, otherwise, the output is already
+    # in channels_last format (from fbgemm kernel)
+    if not (is_input_channels_last or is_weight_channels_last):
+        output = output.contiguous()
    return output


@implements(aten.convolution.default)
def _(func, types, args, kwargs):
+    """The semantics of memory_format will match high precision counterparts
+    i.e. if any of input or weight are in channels_last_3d format
+    the output will be in channels_last_3d format, otherwise the output
+    will be contiguous
+    """
    (
        input_tensor,
        weight_tensor,
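
A note on the permutes in the hunk above (a sketch, not part of the diff): based on the comments and permutes in this change, the fbgemm kernel appears to consume activations and weights in an (N, D, H, W, C) layout, and once a tensor is in channels_last_3d memory_format, permuting (N, C, D, H, W) to (N, D, H, W, C) is a stride-only view that is already dense, so no extra copy is needed before the kernel call.

import torch

x = torch.randint(0, 255, (2, 16, 4, 8, 8), dtype=torch.uint8)  # stand-in for fp8 qdata
x = x.contiguous(memory_format=torch.channels_last_3d)          # (N, C, D, H, W), C has stride 1
ndhwc = x.permute([0, 2, 3, 4, 1])                               # logical (N, D, H, W, C), no copy
print(ndhwc.is_contiguous())                                     # dense row-major view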
@@ -580,11 +604,6 @@ def _(func, types, args, kwargs):
    assert groups == 1, f"Only 1 is supported for `groups`, got: {groups}"

    if dim == 2:
-        assert input_tensor.is_contiguous(
-            memory_format=torch.channels_last
-        ) and weight_tensor.qdata.is_contiguous(memory_format=torch.channels_last), (
-            "Please make sure both activation and weights are in the `channels_last` memory_format"
-        )
        # (N, C, H, W) --> (N, C, 1, H, W)
        input_tensor = input_tensor.unsqueeze(2)
        weight_tensor = weight_tensor.unsqueeze(2)
@@ -606,11 +625,6 @@ def _(func, types, args, kwargs):
        res = res.squeeze(2)
        return res
    else:
-        assert input_tensor.is_contiguous(
-            memory_format=torch.channels_last_3d
-        ) and weight_tensor.qdata.is_contiguous(memory_format=torch.channels_last_3d), (
-            "Please make sure both activation and weights are in the `channels_last_3d` memory_format"
-        )
        assert tuple(output_padding) == (0, 0, 0), (
            f"Only (0, 0, 0) is supported for `output_padding`, got: f{output_padding}"
        )
@@ -626,6 +640,11 @@ def _(func, types, args, kwargs):

@implements(aten.conv3d.default)
def _(func, types, args, kwargs):
+    """The semantics of memory_format will match high precision counterparts
+    i.e. if any of input or weight are in channels_last_3d format
+    the output will be in channels_last_3d format, otherwise the output
+    will be contiguous
+    """
    (
        input_tensor,
        weight_tensor,
@@ -635,23 +654,24 @@ def _(func, types, args, kwargs):
        dilation,
        groups,
    ) = fill_defaults(args, 7, [None, [1, 1, 1], [0, 0, 0], [1, 1, 1], 1])
-    assert input_tensor.is_contiguous(
-        memory_format=torch.channels_last_3d
-    ) and weight_tensor.qdata.is_contiguous(memory_format=torch.channels_last_3d), (
-        "Please make sure both activation and weights are in the `channels_last_3d` memory_format"
-    )
-    return _quantize_and_scaled_conv3d(
+    conv3d_output = _quantize_and_scaled_conv3d(
        input_tensor,
        weight_tensor,
        bias,
        stride,
        padding,
        dilation,
    )
+    return conv3d_output


@implements(aten.conv2d.default)
def _(func, types, args, kwargs):
+    """The semantics of memory_format will match high precision counterparts
+    i.e. if any of input or weight are in channels_last_3d format
+    the output will be in channels_last_3d format, otherwise the output
+    will be contiguous
+    """
    (
        input_tensor,
        weight_tensor,
@@ -662,20 +682,9 @@ def _(func, types, args, kwargs):
        groups,
    ) = fill_defaults(args, 7, [None, [1, 1], [0, 0], [1, 1], 1])
    # (N, C, H, W) --> (N, C, 1, H, W)
-    # memory_format of both tensors should be torch.channels_last
-    # and it should be preserved with unsqueeze(2) (becoming torch.channels_last_3d)
-    assert input_tensor.is_contiguous(
-        memory_format=torch.channels_last
-    ) and weight_tensor.qdata.is_contiguous(memory_format=torch.channels_last), (
-        "Please make sure both activation and weights are in the `channels_last` memory_format"
-    )
    input_tensor = input_tensor.unsqueeze(2)
    weight_tensor = weight_tensor.unsqueeze(2)

-    assert input_tensor.is_contiguous(
-        memory_format=torch.channels_last_3d
-    ) and weight_tensor.qdata.is_contiguous(memory_format=torch.channels_last_3d)
-
    padding = [0, *padding]
    stride = [1, *stride]
    dilation = [1, *dilation]
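
The conv2d handler above reuses the conv3d path by inserting a dummy depth dimension. The asserts it removes were checking a property that holds anyway, as the removed comment itself noted (a sketch, not part of the diff): a channels_last 4D tensor stays channels_last_3d-contiguous after unsqueeze(2), so the 2D case inherits the same memory_format handling as the 3D case.

import torch

x = torch.randn(2, 16, 32, 32).to(memory_format=torch.channels_last)  # (N, C, H, W)
x5d = x.unsqueeze(2)                                                   # (N, C, 1, H, W)
print(x5d.is_contiguous(memory_format=torch.channels_last_3d))        # stays channels last in 3d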
