From d6ff83dd767f86eb24974b1c517efc25c707c543 Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Wed, 12 Nov 2025 15:25:47 -0800
Subject: [PATCH] [webgpu] Add implementation of BiasGelu

Add a WebGPU kernel for the kMSDomain BiasGelu operator (opset 1),
registered for WebGpuSupportedFloatTypes().

Kernel (BiasGelu::ComputeInternal):
  * The output has the same shape as input 0. An empty output returns
    early, before any validation runs.
  * The input must have at least one dimension, the bias must be 1-D,
    and the bias length must equal the last input dimension; otherwise
    INVALID_ARGUMENT is returned.
  * Input and output are bound with 4 components per element, so the
    program runs over vec_size = ceil(data_size / 4) elements.
  * The bias is bound with 4 components when its length is a multiple
    of 4, and with 1 component otherwise.

Shader (BiasGeluProgram::GenerateShaderCode):
  * Each invocation loads one packed element of x into `a`.
  * With a 1-component bias it builds an x_value_t from the four bias
    scalars at (global_idx * 4 + i) % uniforms.bias_shape; with a
    4-component bias it adds bias[global_idx % uniforms.bias_shape].
  * It then stores GeluExpr, i.e.
    0.5 * a * (1.0 + erf(a * 0.7071067811865475)), using ErfImpl.

Also register the kernel in webgpu_contrib_kernels.cc and compile the
RunBiasGeluTestHalf test helper when USE_WEBGPU is defined.
---
 .../contrib_ops/webgpu/bert/bias_gelu.cc      | 95 +++++++++++++++++++
 .../contrib_ops/webgpu/bert/bias_gelu.h       | 38 ++++++++
 .../webgpu/webgpu_contrib_kernels.cc          |  2 +
 .../test/contrib_ops/element_wise_ops_test.cc |  2 +-
 4 files changed, 136 insertions(+), 1 deletion(-)
 create mode 100644 onnxruntime/contrib_ops/webgpu/bert/bias_gelu.cc
 create mode 100644 onnxruntime/contrib_ops/webgpu/bert/bias_gelu.h

diff --git a/onnxruntime/contrib_ops/webgpu/bert/bias_gelu.cc b/onnxruntime/contrib_ops/webgpu/bert/bias_gelu.cc
new file mode 100644
index 0000000000000..7cad1f4d06d0c
--- /dev/null
+++ b/onnxruntime/contrib_ops/webgpu/bert/bias_gelu.cc
@@ -0,0 +1,95 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/webgpu/shader_helper.h"
+#include "core/providers/webgpu/webgpu_supported_types.h"
+#include "core/providers/webgpu/math/unary_elementwise_ops.h"
+#include "contrib_ops/webgpu/bert/bias_gelu.h"
+#include "contrib_ops/webgpu/webgpu_contrib_kernels.h"
+
+namespace onnxruntime {
+namespace contrib {
+namespace webgpu {
+
+ONNX_OPERATOR_KERNEL_EX(
+    BiasGelu,
+    kMSDomain,
+    1,
+    kWebGpuExecutionProvider,
+    (*KernelDefBuilder::Create())
+        .TypeConstraint("T", WebGpuSupportedFloatTypes()),
+    BiasGelu);
+
+Status BiasGeluProgram::GenerateShaderCode(ShaderHelper& shader) const {
+  const auto& x = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
+  const auto& bias = shader.AddInput("bias", ShaderUsage::UseUniform | ShaderUsage::UseShapeAndStride);
+  const auto& y = shader.AddOutput("y", ShaderUsage::UseUniform);
+
+  shader.AdditionalImplementation() << onnxruntime::webgpu::ErfImpl;
+  shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size")
+                            << " var a = " << x.GetByOffset("global_idx") << ";\n";
+
+  // Add bias to input
+  if (bias_components_ == 1) {
+    shader.MainFunctionBody() << " let bias_offset = global_idx * 4;\n"
+                                 " a += x_value_t("
+                              << bias.GetByOffset("bias_offset % uniforms.bias_shape") << ", "
+                              << bias.GetByOffset("(bias_offset + 1) % uniforms.bias_shape") << ", "
+                              << bias.GetByOffset("(bias_offset + 2) % uniforms.bias_shape") << ", "
+                              << bias.GetByOffset("(bias_offset + 3) % uniforms.bias_shape") << ");\n";
+  } else {
+    shader.MainFunctionBody() << " a += " << bias.GetByOffset("global_idx % uniforms.bias_shape") + ";\n";
+  }
+
+  // Apply GELU activation: 0.5 * a * (1.0 + erf(a * 0.7071067811865475))
+  shader.MainFunctionBody() << y.SetByOffset("global_idx", onnxruntime::webgpu::GeluExpr);
+
+  return Status::OK();
+}
+
+Status BiasGelu::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
+  const auto* input = context.Input(0);
+  const auto* bias = context.Input(1);
+  auto* output = context.Output(0, input->Shape());
+
+  uint32_t data_size = onnxruntime::narrow<uint32_t>(output->Shape().Size());
+  if (data_size == 0) {
+    return Status::OK();
+  }
+
+  const auto& input_shape = input->Shape();
+  const auto& bias_shape = bias->Shape();
+
+  // Validate inputs
+  if (input_shape.NumDimensions() < 1 || bias_shape.NumDimensions() != 1) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "BiasGelu: input must have at least 1 dimension and bias must be 1-dimensional.");
+  }
+
+  if (input_shape.GetDims().back() != bias_shape.GetDims().back()) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "BiasGelu: bias must match the last dimension of input.");
+  }
+
+  const auto vec_size = (data_size + 3) / 4;
+  uint32_t bias_size = onnxruntime::narrow<uint32_t>(bias->Shape().Size());
+  int bias_components = 1;
+
+  if (bias_size % 4 == 0) {
+    bias_components = 4;
+    bias_size = bias_size / 4;
+  }
+
+  BiasGeluProgram program{bias_components};
+  program.AddInput({input, ProgramTensorMetadataDependency::Type, {vec_size}, 4})
+      .AddInput({bias, ProgramTensorMetadataDependency::TypeAndRank, {bias_size}, bias_components})
+      .AddOutput({output, ProgramTensorMetadataDependency::None, {vec_size}, 4})
+      .SetDispatchGroupSize((vec_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
+      .AddUniformVariable({vec_size});
+
+  return context.RunProgram(program);
+}
+
+}  // namespace webgpu
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/webgpu/bert/bias_gelu.h b/onnxruntime/contrib_ops/webgpu/bert/bias_gelu.h
new file mode 100644
index 0000000000000..ac7cba249ca61
--- /dev/null
+++ b/onnxruntime/contrib_ops/webgpu/bert/bias_gelu.h
@@ -0,0 +1,38 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/providers/webgpu/program.h"
+#include "core/providers/webgpu/webgpu_kernel.h"
+
+namespace onnxruntime {
+namespace contrib {
+namespace webgpu {
+
+using namespace onnxruntime::webgpu;
+using onnxruntime::webgpu::ComputeContext;
+
+class BiasGeluProgram final : public Program<BiasGeluProgram> {
+ public:
+  BiasGeluProgram(int bias_components) : Program{"BiasGelu"}, bias_components_{bias_components} {
+  }
+
+  Status GenerateShaderCode(ShaderHelper& sh) const override;
+
+  WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"vec_size", ProgramUniformVariableDataType::Uint32});
+
+ private:
+  int bias_components_;
+};
+
+class BiasGelu final : public WebGpuKernel {
+ public:
+  BiasGelu(const OpKernelInfo& info) : WebGpuKernel(info) {}
+
+  Status ComputeInternal(ComputeContext& context) const override;
+};
+
+}  // namespace webgpu
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc b/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc
index fe0bc5dee92ff..d632e81372a6d 100644
--- a/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc
@@ -12,6 +12,7 @@ namespace webgpu {
 
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, Attention);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, BiasAdd);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, BiasGelu);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, BiasSplitGelu);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, FastGelu);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, FusedConv);
@@ -40,6 +41,7 @@ Status RegisterWebGpuContribKernels(KernelRegistry& kernel_registry, bool enable
       BuildKernelCreateInfo<void>,  // default entry to avoid the list become empty after ops-reducing
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, Attention)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, BiasAdd)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, BiasGelu)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, BiasSplitGelu)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, FastGelu)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, FusedConv)>,
diff --git a/onnxruntime/test/contrib_ops/element_wise_ops_test.cc b/onnxruntime/test/contrib_ops/element_wise_ops_test.cc
index c641103a74465..38659fbd9f2b9 100644
--- a/onnxruntime/test/contrib_ops/element_wise_ops_test.cc
+++ b/onnxruntime/test/contrib_ops/element_wise_ops_test.cc
@@ -109,7 +109,7 @@ TEST(BiasGeluTest, Float) {
   RunBiasGeluTestFloat({2, 2333}, {2333});
 }
 
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) || defined(USE_WEBGPU)
 static void RunBiasGeluTestHalf(const std::vector<int64_t>& input_dims, const std::vector<int64_t>& bias_dims) {
   RandomValueGenerator random{2333};
   std::vector<float> input_data = random.Uniform(input_dims, -1.0f, 1.0f);