Cortex_m backend: Add permute op (pytorch#15848)

AdrianLundell · web-flow · commit 2f9e57474af6 · 2025-11-18T16:29:54.000+01:00
Since the transpose op doesn't require qparams but still expects input
to be int8, the check in quantized_op_fusion_pass is moved from the
call_operator level to the _get_replacement level. This way different
ops can have different checks.

---------

Signed-off-by: Adrian Lundell <adrian.lundell@arm.com>
diff --git a/backends/cortex_m/CMakeLists.txt b/backends/cortex_m/CMakeLists.txt
@@ -58,6 +58,7 @@ set(_cortex_m_kernels__srcs
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_linear.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_mul.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_transpose.cpp
 )
 
 # Generate C++ bindings to register kernels into Executorch
diff --git a/backends/cortex_m/ops/op_transpose.cpp b/backends/cortex_m/ops/op_transpose.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2025 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "cortex_m_ops_common.h"
+
+#include <array>
+#include <limits>
+#include <vector>
+
+// Include CMSIS-NN headers with C linkage
+extern "C" {
+#include "arm_nnfunctions.h"
+}
+
+namespace cortex_m {
+namespace native {
+
+using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
+
+namespace {
+
+constexpr size_t kMaxSupportedDims = 4;
+
+} // namespace
+
+/**
+ * Permutes the dimensions of an int8 tensor via CMSIS-NN's arm_transpose_s8.
+ *
+ * All arguments are validated before the CMSIS-NN call. On any validation or
+ * kernel error the failure is reported through context.fail() and `out` is
+ * returned.
+ *
+ * @param context Kernel runtime context used to report failures.
+ * @param input Source tensor; must be int8 (ScalarType::Char) with rank in
+ *     [1, kMaxSupportedDims].
+ * @param perm Dimension order; must contain exactly one entry per input
+ *     dimension and be a true permutation of 0..rank-1 (negative entries are
+ *     rejected here; callers are expected to normalize them beforehand).
+ * @param out Destination tensor; must be int8, have the same rank as `input`
+ *     and satisfy out.size(i) == input.size(perm[i]).
+ * @return Reference to `out`.
+ */
+Tensor& transpose_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    const IntArrayRef perm,
+    Tensor& out) {
+  if (input.scalar_type() != ScalarType::Char ||
+      out.scalar_type() != ScalarType::Char) {
+    ET_LOG(
+        Error,
+        "transpose_out: only int8 tensors are supported (input=%d, out=%d)",
+        static_cast<int>(input.scalar_type()),
+        static_cast<int>(out.scalar_type()));
+    context.fail(Error::InvalidArgument);
+    return out;
+  }
+
+  const size_t rank = input.dim();
+  if (rank == 0 || rank > kMaxSupportedDims) {
+    ET_LOG(
+        Error,
+        "transpose_out: expected tensor rank in [1, %zu], got %zu",
+        kMaxSupportedDims,
+        rank);
+    context.fail(Error::InvalidArgument);
+    return out;
+  }
+
+  // out.size(i) is read for every i < rank below, so the ranks must agree.
+  const size_t out_rank = out.dim();
+  if (out_rank != rank) {
+    ET_LOG(
+        Error,
+        "transpose_out: output rank %zu does not match input rank %zu",
+        out_rank,
+        rank);
+    context.fail(Error::InvalidArgument);
+    return out;
+  }
+
+  const size_t perm_len = static_cast<size_t>(perm.size());
+  if (perm_len != rank) {
+    ET_LOG(
+        Error,
+        "transpose_out: permutation length %zu does not match tensor rank %zu",
+        perm_len,
+        rank);
+    context.fail(Error::InvalidArgument);
+    return out;
+  }
+
+  // Validate every permutation entry before narrowing it to uint32_t: a
+  // negative or too-large value would otherwise wrap to a huge index.
+  // Entries beyond `rank` keep their identity defaults.
+  std::array<uint32_t, kMaxSupportedDims> perm_buffer{0, 1, 2, 3};
+  uint32_t seen_axes = 0; // bit k is set once axis k has been used
+  for (size_t i = 0; i < rank; ++i) {
+    const int64_t axis = perm[i];
+    if (axis < 0 || axis >= static_cast<int64_t>(rank)) {
+      ET_LOG(
+          Error,
+          "transpose_out: permutation entry %lld at index %zu is outside [0, %zu)",
+          static_cast<long long>(axis),
+          i,
+          rank);
+      context.fail(Error::InvalidArgument);
+      return out;
+    }
+    const uint32_t axis_bit = 1u << static_cast<uint32_t>(axis);
+    if ((seen_axes & axis_bit) != 0) {
+      ET_LOG(
+          Error,
+          "transpose_out: permutation entry %lld is repeated",
+          static_cast<long long>(axis));
+      context.fail(Error::InvalidArgument);
+      return out;
+    }
+    seen_axes |= axis_bit;
+    perm_buffer[i] = static_cast<uint32_t>(axis);
+  }
+
+  // Unused trailing dimensions stay at 1.
+  std::array<int32_t, kMaxSupportedDims> input_dims_arr{1, 1, 1, 1};
+  std::array<int32_t, kMaxSupportedDims> output_dims_arr{1, 1, 1, 1};
+  for (size_t i = 0; i < rank; ++i) {
+    const auto in_size = input.size(i);
+    const auto out_size = out.size(i);
+    if (in_size > std::numeric_limits<int32_t>::max() ||
+        out_size > std::numeric_limits<int32_t>::max()) {
+      ET_LOG(
+          Error,
+          "transpose_out: dimension size exceeds int32_t range (input=%lld, output=%lld)",
+          static_cast<long long>(in_size),
+          static_cast<long long>(out_size));
+      context.fail(Error::InvalidArgument);
+      return out;
+    }
+
+    // The output must already have the permuted shape; a mismatched buffer
+    // would be handed to the kernel with inconsistent dimensions.
+    const auto expected_out_size = input.size(perm_buffer[i]);
+    if (out_size != expected_out_size) {
+      ET_LOG(
+          Error,
+          "transpose_out: output dim %zu has size %lld, expected %lld",
+          i,
+          static_cast<long long>(out_size),
+          static_cast<long long>(expected_out_size));
+      context.fail(Error::InvalidArgument);
+      return out;
+    }
+
+    input_dims_arr[i] = static_cast<int32_t>(in_size);
+    output_dims_arr[i] = static_cast<int32_t>(out_size);
+  }
+
+  // cmsis_nn_dims is filled positionally from the padded arrays.
+  cmsis_nn_dims input_dims = {
+      input_dims_arr[0],
+      input_dims_arr[1],
+      input_dims_arr[2],
+      input_dims_arr[3]};
+  cmsis_nn_dims output_dims = {
+      output_dims_arr[0],
+      output_dims_arr[1],
+      output_dims_arr[2],
+      output_dims_arr[3]};
+
+  const cmsis_nn_transpose_params transpose_params{
+      static_cast<int32_t>(rank), perm_buffer.data()};
+
+  const int8_t* input_data = input.const_data_ptr<int8_t>();
+  int8_t* output_data = out.mutable_data_ptr<int8_t>();
+
+  const arm_cmsis_nn_status status = arm_transpose_s8(
+      input_data, output_data, &input_dims, &output_dims, &transpose_params);
+
+  if (status != ARM_CMSIS_NN_SUCCESS) {
+    ET_LOG(
+        Error,
+        "transpose_out: arm_transpose_s8 failed with status [%d]",
+        static_cast<int>(status));
+    context.fail(Error::Internal);
+    return out;
+  }
+
+  return out;
+}
+
+} // namespace native
+} // namespace cortex_m
diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py
@@ -349,3 +349,21 @@ def quantized_linear_impl(
     output += output_offset
     output = torch.clamp(output, activation_min, activation_max).to(torch.int8)
     return output
+
+
+# ===================================================================
+# TRANSPOSE OPERATION DEFINITION
+# ===================================================================
+lib.define("transpose(Tensor input, int[] perm) -> Tensor")
+lib.define("transpose.out(Tensor input, int[] perm, *, Tensor(a!) out) -> Tensor(a!)")
+
+
+@register_fake("cortex_m::transpose")
+def transpose_meta(input: torch.Tensor, perm) -> torch.Tensor:
+    output_shape = [input.shape[idx] for idx in perm]
+    return torch.empty(output_shape, dtype=input.dtype, device=input.device)
+
+
+@impl(lib, "transpose", "CompositeExplicitAutograd")
+def transpose_impl(input: torch.Tensor, perm) -> torch.Tensor:
+    return input.permute(tuple(perm)).contiguous()
diff --git a/backends/cortex_m/ops/operators.yaml b/backends/cortex_m/ops/operators.yaml
@@ -34,3 +34,9 @@
   kernels:
     - arg_meta: null
       kernel_name: cortex_m::quantized_linear_out
+
+# Binds the transpose out-variant schema to the kernel named
+# cortex_m::transpose_out.
+- func: cortex_m::transpose.out(Tensor input, int[] perm, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  kernels:
+    - arg_meta: null
+      kernel_name: cortex_m::transpose_out
diff --git a/backends/cortex_m/passes/quantized_op_fusion_pass.py b/backends/cortex_m/passes/quantized_op_fusion_pass.py
@@ -7,6 +7,8 @@
 
 from typing import Dict
 
+import torch
+
 from executorch.backends.cortex_m.passes.passes_utils import (
     quantize_multiplier_aot,
     SHIFT_INT8,
@@ -30,6 +32,11 @@ class QuantizedOpFusionPass(ExportPass):
     """
 
     def _get_add_replacement(self, args, meta):
+        if (
+            meta.data.get("input_qparams", {}) == {}
+            or meta.data.get("output_qparams", {}) == {}
+        ):
+            return exir_ops.edge.aten.add.Tensor, args
 
         # Extract values
         scale1 = meta["input_qparams"][0].scale
@@ -64,7 +71,12 @@ def _get_add_replacement(self, args, meta):
 
         return exir_ops.edge.cortex_m.quantized_add.default, args
 
-    def _get_mul_replacement(self, args, meta) -> int:
+    def _get_mul_replacement(self, args, meta):
+        if (
+            meta.data.get("input_qparams", {}) == {}
+            or meta.data.get("output_qparams", {}) == {}
+        ):
+            return exir_ops.edge.aten.mul.Tensor, args
 
         # Extract values
         scale1 = meta["input_qparams"][0].scale
@@ -89,24 +101,30 @@ def _get_mul_replacement(self, args, meta) -> int:
 
         return exir_ops.edge.cortex_m.quantized_mul.default, args
 
+    def _get_permute_replacement(self, args, meta):
+        if args[0].data.dtype != torch.int8:
+            return exir_ops.edge.aten.permute_copy.default, args
+
+        rank = len(args[0].data.shape)
+        perms = [p % rank for p in args[1]]
+        args = (args[0], perms)
+        return exir_ops.edge.cortex_m.transpose.default, args
+
     def call_operator(
         self,
         op: EdgeOpOverload,
         args: tuple[Argument, ...],
         kwargs: Dict[str, Argument],
         meta: NodeMetadata,
     ) -> ProxyValue:
-        if (
-            meta.data.get("input_qparams", {}) == {}
-            or meta.data.get("output_qparams", {}) == {}
-        ):
-            return super().call_operator(op, args, {}, meta)
 
         match op:
             case exir_ops.edge.aten.add.Tensor:
                 op, args = self._get_add_replacement(args, meta)
             case exir_ops.edge.aten.mul.Tensor:
                 op, args = self._get_mul_replacement(args, meta)
+            case exir_ops.edge.aten.permute_copy.default:
+                op, args = self._get_permute_replacement(args, meta)
             case _:
                 pass
 
diff --git a/backends/cortex_m/test/ops/test_transpose.py b/backends/cortex_m/test/ops/test_transpose.py
@@ -0,0 +1,102 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import torch
+from executorch.backends.arm.test.common import parametrize
+from executorch.backends.cortex_m.test.tester import (
+    CortexMTester,
+    McuTestCase,
+    ramp_tensor,
+)
+
+OPS_BEFORE_PASSES = {
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2,
+    "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1,
+}
+
+OPS_AFTER_PASSES = {
+    "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+    "executorch_exir_dialects_edge__ops_cortex_m_transpose_default": 1,
+    "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+}
+
+
+class CortexMPermute(torch.nn.Module):
+    ops_before_transforms = OPS_BEFORE_PASSES
+    ops_after_transforms = OPS_AFTER_PASSES
+
+    def __init__(self, perms):
+        super().__init__()
+        self.perms = perms
+
+    def forward(self, x):
+        return x.permute(self.perms)
+
+
+class CortexMTranspose(torch.nn.Module):
+    ops_before_transforms = OPS_BEFORE_PASSES
+    ops_after_transforms = OPS_AFTER_PASSES
+
+    def __init__(self, dim0, dim1):
+        super().__init__()
+        self.dim0 = dim0
+        self.dim1 = dim1
+
+    def forward(self, x):
+        return x.transpose(self.dim0, self.dim1)
+
+
+class CortexMT(torch.nn.Module):
+    ops_before_transforms = OPS_BEFORE_PASSES
+    ops_after_transforms = OPS_AFTER_PASSES
+
+    def forward(self, x):
+        return x.t()
+
+
+test_cases = {
+    "permute_nhwc_to_nchw": McuTestCase(
+        CortexMPermute((0, 3, 1, 2)),
+        (ramp_tensor(-0.5, 0.5, (2, 3, 4, 2)),),
+    ),
+    "permute_nchw_to_nhwc_neg_index": McuTestCase(
+        CortexMPermute((0, -2, -1, -3)),
+        (ramp_tensor(10, 100, (2, 3, 4, 2)),),
+    ),
+    "permute_rank_1": McuTestCase(
+        CortexMPermute((0,)),
+        (ramp_tensor(10, 100, (3)),),
+    ),
+    "transpose_1_2": McuTestCase(
+        CortexMTranspose(1, 2),
+        (ramp_tensor(-1.0, 1.0, (1, 3, 4)),),
+    ),
+    "transpose_0_1": McuTestCase(
+        CortexMTranspose(0, 1),
+        (ramp_tensor(-2.0, 2.0, (2, 3, 4, 3)),),
+    ),
+    "t_operator": McuTestCase(
+        CortexMT(),
+        (ramp_tensor(-0.5, 0.5, (4, 2)),),
+    ),
+}
+
+
+@parametrize("test_case", test_cases)
+def test_dialect_transpose(test_case):
+    tester = CortexMTester(test_case.model, test_case.example_inputs)
+    tester.test_dialect(
+        test_case.model.ops_before_transforms,
+        test_case.model.ops_after_transforms,
+        qtol=1,
+    )
+
+
+@parametrize("test_case", test_cases)
+def test_implementation_transpose(test_case):
+    tester = CortexMTester(test_case.model, test_case.example_inputs)
+    tester.test_implementation(qtol=1)

Original file line number	Diff line number	Diff line change
`@@ -58,6 +58,7 @@ set(_cortex_m_kernels__srcs`
`58`	`58`	`${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp`
`59`	`59`	`${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_linear.cpp`
`60`	`60`	`${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_mul.cpp`
	`61`	`+ ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_transpose.cpp`
`61`	`62`	`)`
`62`	`63`
`63`	`64`	`# Generate C++ bindings to register kernels into Executorch`