Skip to content

Commit becb760

Browse files
committed
webgpu: add FormatTransform kernel and tests
- add the WebGPU FormatTransform kernel, headers, and WGSL template, supporting Plain <-> nChw4c/ABcd16a4b conversions
- register the FormatTransform schema in the internal NHWC domain with padding-aware shape inference
- hook the kernel into the WebGPU execution provider and add WebGPU tests covering both blocked formats, padding cases, and round trips
1 parent 977efe4 commit becb760

File tree

6 files changed

+729
-0
lines changed

6 files changed

+729
-0
lines changed

onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,84 @@ void OpSet_Internal_NHWC_ONNX::ForEachSchema(const std::function<void(ONNX_NAMES
163163
REGISTER_NHWC_SCHEMA_FROM_MSDOMAIN(fn, QLinearAveragePool, 1);
164164
REGISTER_NHWC_SCHEMA_FROM_MSDOMAIN(fn, QLinearConvTranspose, 1);
165165

166+
// FormatTransform operator for OneDNN blocked format support
167+
fn(std::move(::ONNX_NAMESPACE::OpSchema()
168+
.SetName("FormatTransform")
169+
.SetDomain(onnxruntime::kMSInternalNHWCDomain)
170+
.SinceVersion(1)
171+
.SetDoc("Transform tensor between plain (NCHW) and OneDNN blocked formats (nChw4c, ABcd16a4b).")
172+
.Attr("src_format", "Source format: Plain, nChw4c, or ABcd16a4b",
173+
ONNX_NAMESPACE::AttributeProto::STRING)
174+
.Attr("dst_format", "Destination format: Plain, nChw4c, or ABcd16a4b",
175+
ONNX_NAMESPACE::AttributeProto::STRING)
176+
.Input(0, "X", "Input tensor", "T")
177+
.Output(0, "Y", "Output tensor with transformed layout", "T")
178+
.TypeConstraint("T", {"tensor(float)", "tensor(float16)", "tensor(bfloat16)"},
179+
"Constrain input and output types to floating-point tensors.")
180+
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
181+
ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
182+
if (!ONNX_NAMESPACE::hasInputShape(ctx, 0)) {
183+
return;
184+
}
185+
186+
const auto& input_shape = ctx.getInputType(0)->tensor_type().shape();
187+
if (input_shape.dim_size() != 4) {
188+
fail_shape_inference("FormatTransform requires 4D input tensor (NCHW)");
189+
}
190+
191+
auto* output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();
192+
output_shape->clear_dim(); // Clear any existing dimensions before writing
193+
194+
// Get destination format attribute
195+
std::string dst_format;
196+
if (ctx.getAttribute("dst_format") != nullptr) {
197+
dst_format = ctx.getAttribute("dst_format")->s();
198+
}
199+
200+
// Calculate output shape with padding if needed
201+
if (dst_format == "nChw4c") {
202+
// Pad channels (dimension 1) to multiple of 4
203+
*output_shape->add_dim() = input_shape.dim(0);
204+
205+
if (input_shape.dim(1).has_dim_value()) {
206+
int64_t C = input_shape.dim(1).dim_value();
207+
int64_t padded_C = ((C + 3) / 4) * 4;
208+
output_shape->add_dim()->set_dim_value(padded_C);
209+
} else {
210+
// Dynamic channel dimension - can't compute padding statically
211+
output_shape->add_dim()->set_dim_param(input_shape.dim(1).dim_param());
212+
}
213+
214+
*output_shape->add_dim() = input_shape.dim(2);
215+
*output_shape->add_dim() = input_shape.dim(3);
216+
} else if (dst_format == "ABcd16a4b") {
217+
// Pad N (dimension 0) to multiple of 16 and C (dimension 1) to multiple of 4
218+
if (input_shape.dim(0).has_dim_value()) {
219+
int64_t N = input_shape.dim(0).dim_value();
220+
int64_t padded_N = ((N + 15) / 16) * 16;
221+
output_shape->add_dim()->set_dim_value(padded_N);
222+
} else {
223+
output_shape->add_dim()->set_dim_param(input_shape.dim(0).dim_param());
224+
}
225+
226+
if (input_shape.dim(1).has_dim_value()) {
227+
int64_t C = input_shape.dim(1).dim_value();
228+
int64_t padded_C = ((C + 3) / 4) * 4;
229+
output_shape->add_dim()->set_dim_value(padded_C);
230+
} else {
231+
output_shape->add_dim()->set_dim_param(input_shape.dim(1).dim_param());
232+
}
233+
234+
*output_shape->add_dim() = input_shape.dim(2);
235+
*output_shape->add_dim() = input_shape.dim(3);
236+
} else {
237+
// Plain or other formats: no padding needed
238+
for (int i = 0; i < input_shape.dim_size(); ++i) {
239+
*output_shape->add_dim() = input_shape.dim(i);
240+
}
241+
}
242+
})));
243+
166244
// not all schema are registered here. For part of layout insensitive ops
167245
// we will use onnx schema directly, for others, like fused-node/qdq-group
168246
// we may leverage internal schema or create on the fly.
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
#include "core/providers/webgpu/vendor/intel/contrib/format_transform.h"
5+
#include "core/providers/webgpu/shader_helper.h"
6+
#include "core/providers/webgpu/webgpu_supported_types.h"
7+
#include "core/providers/webgpu/string_macros.h"
8+
#include "core/common/narrow.h"
9+
10+
namespace onnxruntime {
11+
namespace webgpu {
12+
namespace intel {
13+
14+
namespace {
15+
std::string GetFormatName(BlockedFormat format) {
16+
switch (format) {
17+
case BlockedFormat::Plain:
18+
return "Plain";
19+
case BlockedFormat::nChw4c:
20+
return "nChw4c";
21+
case BlockedFormat::ABcd16a4b:
22+
return "ABcd16a4b";
23+
default:
24+
return "Unknown";
25+
}
26+
}
27+
28+
BlockedFormat ParseFormat(const std::string& format_str) {
29+
if (format_str == "Plain" || format_str == "NCHW") {
30+
return BlockedFormat::Plain;
31+
} else if (format_str == "nChw4c") {
32+
return BlockedFormat::nChw4c;
33+
} else if (format_str == "ABcd16a4b") {
34+
return BlockedFormat::ABcd16a4b;
35+
} else {
36+
ORT_THROW("Unsupported format: ", format_str);
37+
}
38+
}
39+
} // namespace
40+
41+
// Captures the source/destination formats and the (possibly padded) shapes.
// The shapes are consulted by GenerateShaderCode to validate the 4D rank.
FormatTransformProgram::FormatTransformProgram(BlockedFormat src_format, BlockedFormat dst_format,
                                               const TensorShape& input_shape, const TensorShape& output_shape)
    : Program{"FormatTransform"},
      src_format_(src_format),
      dst_format_(dst_format),
      input_shape_(input_shape),
      output_shape_(output_shape) {
}
49+
50+
// Emits the WGSL for this transform by instantiating the shared template.
// The src/dst formats are baked in as template parameters, so each
// (src, dst) pair produces a distinct shader (see CacheHint in the kernel).
Status FormatTransformProgram::GenerateShaderCode(ShaderHelper& sh) const {
  const auto& input = sh.AddInput("input", ShaderUsage::UseUniform | ShaderUsage::UseShapeAndStride);
  const auto& output = sh.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseShapeAndStride | ShaderUsage::UseValueTypeAlias);

  // Only 4D (NCHW) tensors are handled by the template.
  ORT_RETURN_IF_NOT(input_shape_.NumDimensions() == 4,
                    "FormatTransform currently only supports 4D tensors (NCHW)");

  // Numeric enum values must match the FORMAT_* macros in the template.
  const int src_val = static_cast<int>(src_format_);
  const int dst_val = static_cast<int>(dst_format_);

  return WGSL_TEMPLATE_APPLY(sh, "vendor/intel/contrib/format_transform.wgsl.template",
                             WGSL_TEMPLATE_PARAMETER(dst_format, dst_val),
                             WGSL_TEMPLATE_PARAMETER(src_format, src_val),
                             WGSL_TEMPLATE_VARIABLE(input, input),
                             WGSL_TEMPLATE_VARIABLE(output, output));
}
66+
67+
// Reads the "src_format"/"dst_format" string attributes (defaulting to
// "Plain") and resolves them to BlockedFormat values; ParseFormat throws on
// unrecognized names, failing kernel construction.
FormatTransform::FormatTransform(const OpKernelInfo& info)
    : WebGpuKernel(info) {
  src_format_ = ParseFormat(info.GetAttrOrDefault<std::string>("src_format", "Plain"));
  dst_format_ = ParseFormat(info.GetAttrOrDefault<std::string>("dst_format", "Plain"));
}
75+
76+
// Computes the padded output shape for the destination format, then runs the
// WGSL transform program with one invocation per output element.
// Padding rules: nChw4c pads C up to a multiple of 4; ABcd16a4b pads N up to
// 16 and C up to 4; a Plain destination keeps the input shape unchanged.
Status FormatTransform::ComputeInternal(ComputeContext& context) const {
  const auto* input = context.Input<Tensor>(0);
  const auto& input_shape = input->Shape();

  ORT_RETURN_IF_NOT(input_shape.NumDimensions() == 4, "FormatTransform only supports 4D tensors");

  // Round `v` up to the nearest multiple of `block`.
  auto round_up = [](int64_t v, int64_t block) { return ((v + block - 1) / block) * block; };

  TensorShape output_shape = input_shape;
  if (dst_format_ == BlockedFormat::nChw4c) {
    // nChw4c stores channels in blocks of 4, so C must be padded.
    output_shape = TensorShape({input_shape[0], round_up(input_shape[1], 4),
                                input_shape[2], input_shape[3]});
  } else if (dst_format_ == BlockedFormat::ABcd16a4b) {
    // ABcd16a4b blocks N by 16 and C by 4, so both must be padded.
    output_shape = TensorShape({round_up(input_shape[0], 16), round_up(input_shape[1], 4),
                                input_shape[2], input_shape[3]});
  }
  // Plain destination: no padding needed (output_shape remains input_shape).

  auto* output = context.Output(0, output_shape);

  // narrow<> (instead of static_cast) makes an overflowing element count a
  // loud failure rather than a silently truncated dispatch size; padded
  // positions beyond the input extent are zero-filled by the shader.
  const uint32_t output_size = onnxruntime::narrow<uint32_t>(output_shape.Size());

  FormatTransformProgram program{src_format_, dst_format_, input_shape, output_shape};
  program
      .AddInput({input, ProgramTensorMetadataDependency::TypeAndRank})
      .AddOutput({output, ProgramTensorMetadataDependency::None})
      .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
      .CacheHint(static_cast<int>(src_format_), static_cast<int>(dst_format_))
      .AddUniformVariables({{output_size}});

  return context.RunProgram(program);
}
112+
113+
} // namespace intel
114+
} // namespace webgpu
115+
} // namespace onnxruntime
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
#pragma once
5+
6+
#include "core/providers/webgpu/webgpu_kernel.h"
7+
#include "core/providers/webgpu/program.h"
8+
9+
namespace onnxruntime {
10+
namespace webgpu {
11+
namespace intel {
12+
13+
// OneDNN blocked format types.
// IMPORTANT: the numeric values are consumed by format_transform.wgsl.template
// (FORMAT_PLAIN / FORMAT_NCHW4C / FORMAT_ABCD16A4B) — keep them in sync.
// Values are spelled out explicitly so that an accidental reorder of the
// enumerators cannot silently change the contract with the shader.
enum class BlockedFormat {
  Plain = 0,      // Standard NCHW layout
  nChw4c = 1,     // Blocked with 4-channel blocks: [N, C/4, H, W, 4]
  ABcd16a4b = 2,  // 2D blocked format: blocks A (N) dim by 16, B (C) dim by 4
};
20+
21+
// Shader program that rewrites a 4D tensor between plain and blocked layouts.
// One invocation is dispatched per output element; padded positions in a
// blocked destination are zero-filled by the shader.
class FormatTransformProgram final : public Program<FormatTransformProgram> {
 public:
  // Shapes are as seen by the kernel: input_shape is the input tensor's shape,
  // output_shape is padded when the destination format is blocked.
  FormatTransformProgram(BlockedFormat src_format, BlockedFormat dst_format,
                         const TensorShape& input_shape, const TensorShape& output_shape);

  // Instantiates the WGSL template with the src/dst format values baked in.
  Status GenerateShaderCode(ShaderHelper& sh) const override;

  // output_size: total element count of the (padded) output tensor, used for
  // out-of-bounds guarding in the shader.
  WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32});

 private:
  BlockedFormat src_format_;
  BlockedFormat dst_format_;
  TensorShape input_shape_;   // Shape of the source tensor.
  TensorShape output_shape_;  // Shape of the destination tensor (may be padded).
};
36+
37+
// Internal operator for format transformation between plain and blocked
// formats. Reads the "src_format"/"dst_format" string attributes (defaulting
// to "Plain") and dispatches a FormatTransformProgram per 4D input tensor.
class FormatTransform final : public WebGpuKernel {
 public:
  FormatTransform(const OpKernelInfo& info);
  Status ComputeInternal(ComputeContext& context) const override;

 private:
  BlockedFormat src_format_;  // Layout of the incoming tensor.
  BlockedFormat dst_format_;  // Layout to produce.
};
47+
48+
} // namespace intel
49+
} // namespace webgpu
50+
} // namespace onnxruntime
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

// Format transformation shader for OneDNN blocked layouts.
// Supports Plain (NCHW), nChw4c, and ABcd16a4b formats.
//
// One invocation handles one OUTPUT element: the logical NCHW coordinate
// (n, c, h, w) is derived from global_idx against the (padded) output shape,
// mapped to a physical offset in the destination layout, and written either
// with zero (padding area) or with the corresponding input element.
//
// NOTE: the FORMAT_* values must stay in sync with the BlockedFormat enum in
// format_transform.h.
// NOTE(review): when converting blocked -> Plain, input_shape carries the
// blocked tensor's padded extents, so the plain output keeps the padding;
// callers are presumably expected to track the original logical dims —
// confirm against the round-trip tests.

#define FORMAT_PLAIN 0
#define FORMAT_NCHW4C 1
#define FORMAT_ABCD16A4B 2

#use getElementAt
#use .getByOffset .setByOffset
#use guardAgainstOutOfBoundsWorkgroupSizes

#param src_format
#param dst_format

$MAIN {
  guardAgainstOutOfBoundsWorkgroupSizes(uniforms.output_size);

  // Source tensor extents.
  let N = uniforms.input_shape[0];
  let C = uniforms.input_shape[1];
  let H = uniforms.input_shape[2];
  let W = uniforms.input_shape[3];

  // Destination extents (padded when the destination format is blocked).
  let out_N = uniforms.output_shape[0];
  let out_C = uniforms.output_shape[1];
  let out_H = uniforms.output_shape[2];
  let out_W = uniforms.output_shape[3];

  // Decompose the flat output index into logical NCHW coordinates.
  let n = global_idx / (out_C * out_H * out_W);
  let chw_idx = global_idx % (out_C * out_H * out_W);
  let c = chw_idx / (out_H * out_W);
  let hw_idx = chw_idx % (out_H * out_W);
  let h = hw_idx / out_W;
  let w = hw_idx % out_W;

  // Physical offset of (n, c, h, w) in the destination layout.
  var output_idx: u32 = 0u;
#if dst_format == FORMAT_PLAIN
  // Plain format: NCHW (out_* == input extents for a Plain destination).
  output_idx = n * C * H * W + c * H * W + h * W + w;
#elif dst_format == FORMAT_NCHW4C
  // nChw4c format: [N, C/4, H, W, 4]
  let block_size = 4u;
  let num_blocks = (out_C + block_size - 1u) / block_size;
  let block_idx = c / block_size;
  let c_in_block = c % block_size;
  output_idx = n * num_blocks * out_H * out_W * block_size +
               block_idx * out_H * out_W * block_size +
               h * out_W * block_size +
               w * block_size +
               c_in_block;
#elif dst_format == FORMAT_ABCD16A4B
  // ABcd16a4b format: [N/16, C/4, H, W, 16, 4]
  // (num_a_blocks is not needed: the leading A-block index has the largest
  // stride, so only the B-block count enters the offset computation.)
  let a_block = 16u;
  let b_block = 4u;
  let num_b_blocks = (out_C + b_block - 1u) / b_block;
  let a_block_idx = n / a_block;
  let n_in_block = n % a_block;
  let b_block_idx = c / b_block;
  let c_in_block = c % b_block;
  output_idx = a_block_idx * num_b_blocks * out_H * out_W * a_block * b_block +
               b_block_idx * out_H * out_W * a_block * b_block +
               h * out_W * a_block * b_block +
               w * a_block * b_block +
               n_in_block * b_block +
               c_in_block;
#endif

  // Positions outside the source extents are padding — zero-fill them.
  if (n >= N || c >= C || h >= H || w >= W) {
    output.setByOffset(output_idx, output_value_t(0));
  } else {
    // Physical offset of (n, c, h, w) in the source layout.
#if src_format == FORMAT_PLAIN
    // Plain format: NCHW
    let input_idx = n * C * H * W + c * H * W + h * W + w;
#elif src_format == FORMAT_NCHW4C
    // nChw4c format: [N, C/4, H, W, 4]
    let block_size = 4u;
    let num_blocks = (C + block_size - 1u) / block_size;
    let block_idx = c / block_size;
    let c_in_block = c % block_size;
    let input_idx = n * num_blocks * H * W * block_size +
                    block_idx * H * W * block_size +
                    h * W * block_size +
                    w * block_size +
                    c_in_block;
#elif src_format == FORMAT_ABCD16A4B
    // ABcd16a4b format: [N/16, C/4, H, W, 16, 4] (num_a_blocks again unneeded).
    let a_block = 16u;
    let b_block = 4u;
    let num_b_blocks = (C + b_block - 1u) / b_block;
    let a_block_idx = n / a_block;
    let n_in_block = n % a_block;
    let b_block_idx = c / b_block;
    let c_in_block = c % b_block;
    let input_idx = a_block_idx * num_b_blocks * H * W * a_block * b_block +
                    b_block_idx * H * W * a_block * b_block +
                    h * W * a_block * b_block +
                    w * a_block * b_block +
                    n_in_block * b_block +
                    c_in_block;
#endif

    output.setByOffset(output_idx, input.getByOffset(input_idx));
  }
}  // MAIN

0 commit comments

Comments
 (0)