
Commit 334bc04

[webgpu] Add CeilDiv into webgpu utils (#26723)
### Description

The `CeilDiv` function, which is heavily used by operations to determine dispatch sizes, was duplicated in multiple files. This PR moves the implementation into the common WebGPU utils to ensure a single source of truth and reduce code duplication.

### Motivation and Context

See above.
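For reference, the shared helper is plain integer ceiling division. A minimal standalone sketch of the pattern (the template body mirrors the one added to webgpu_utils.h below; the `main()` driver is illustrative only):

```cpp
#include <cstdint>
#include <iostream>

// Integer ceiling division: rounds the quotient up instead of truncating.
// Mirrors the CeilDiv template added to webgpu_utils.h in this commit.
template <typename T>
inline T CeilDiv(T numerator, T denominator) {
  return (numerator + denominator - 1) / denominator;
}

int main() {
  // Typical dispatch-size use: covering 1000 work items with workgroups of 64
  // needs 16 workgroups; plain integer division would give 15 and drop 40 items.
  const uint32_t output_size = 1000;
  std::cout << CeilDiv(output_size, 64u) << "\n";  // prints 16
  return 0;
}
```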
1 parent b6d914b · commit 334bc04

5 files changed: +24 / -43 lines changed


onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc

Lines changed: 3 additions & 10 deletions
@@ -19,14 +19,7 @@ namespace contrib {
 namespace webgpu {
 
 namespace {
-
 constexpr unsigned int kMinMForTileOptimization = 4;
-
-template <typename T>
-inline T ceil_div(T numerator, T denominator) {
-  return (numerator + denominator - 1) / denominator;
-}
-
 }  // namespace
 
 ONNX_OPERATOR_KERNEL_EX(
@@ -246,8 +239,8 @@ Status ApplyMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales,
   constexpr uint32_t workgroup_size = 128;
   constexpr uint32_t tile_m = workgroup_size / 8;
   constexpr uint32_t tile_n = workgroup_size;
-  const uint32_t num_N_tile = ceil_div(N, tile_n);
-  const uint32_t num_M_tile = ceil_div(M, tile_m);
+  const uint32_t num_N_tile = CeilDiv(N, tile_n);
+  const uint32_t num_M_tile = CeilDiv(M, tile_m);
 
   MatMulNBitsWideTileProgram program{has_zero_points, has_bias, has_weight_idx, tile_m, tile_n, static_cast<uint32_t>(nbits)};
   program.SetWorkgroupSize(workgroup_size);
@@ -268,7 +261,7 @@ Status ApplyMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales,
   if (has_zero_points) {
     program.AddInput({zero_points,
                       ProgramTensorMetadataDependency::TypeAndRank,
-                      {ceil_div(zero_points->Shape().Size(), static_cast<int64_t>(4))},
+                      {CeilDiv(zero_points->Shape().Size(), static_cast<int64_t>(4))},
                       4});
   }
   if (has_bias) {

onnxruntime/core/providers/webgpu/math/matmul.cc

Lines changed: 2 additions & 1 deletion
@@ -8,6 +8,7 @@
 #include "core/providers/webgpu/webgpu_supported_types.h"
 #include "core/providers/webgpu/nn/fuse_utils.h"
 #include "core/providers/webgpu/data_transfer.h"
+#include "core/providers/webgpu/webgpu_utils.h"
 
 namespace onnxruntime {
 namespace webgpu {
@@ -147,7 +148,7 @@ Status MatMul::ComputeInternal(ComputeContext& context) const {
   }
   program
       .AddOutputs({{output_tensor, ProgramTensorMetadataDependency::None, output_shape_shader, components}})
-      .SetDispatchGroupSize((output_size + 63) / 64)  // Integer ceiling division
+      .SetDispatchGroupSize(CeilDiv(output_size, 64u))
      .AddIndices(outer_dims)
      .AddUniformVariables({{output_size}, {m}, {n}, {k}});

onnxruntime/core/providers/webgpu/nn/im2col_matmul.cc

Lines changed: 7 additions & 15 deletions
@@ -10,15 +10,7 @@
 
 namespace onnxruntime {
 namespace webgpu {
-
 namespace {
-
-// TODO: move to common header.
-template <typename T>
-inline T ceil_div(T numerator, T denominator) {
-  return (numerator + denominator - 1) / denominator;
-}
-
 // Chooses the optimal tile size (M, N) for the im2col operation.
 // This tile size is performance-tuned and varies depending on the target device.
 std::pair<uint32_t, uint32_t> ChooseTileSize(uint32_t im2col_m, uint32_t im2col_n) {
@@ -32,8 +24,8 @@ std::pair<uint32_t, uint32_t> ChooseTileSize(uint32_t im2col_m, uint32_t im2col_
   const uint32_t tile_m = tile_pair.first;
   const uint32_t tile_n = tile_pair.second;
 
-  const uint32_t dispatch_m = ceil_div(im2col_m, tile_m);
-  const uint32_t dispatch_n = ceil_div(im2col_n, tile_n);
+  const uint32_t dispatch_m = CeilDiv(im2col_m, tile_m);
+  const uint32_t dispatch_n = CeilDiv(im2col_n, tile_n);
   const uint32_t dispatch = dispatch_m * dispatch_n;
 
   if (dispatch >= 128) {
@@ -115,7 +107,7 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context,
   OIHW2OHWIProgram transpose_program{};
   transpose_program.SetWorkgroupSize(64);
 
-  const uint32_t Ci_tiles = ceil_div(channel_input, 64u);
+  const uint32_t Ci_tiles = CeilDiv(channel_input, 64u);
   transpose_program.SetDispatchGroupSize(channel_output, Ci_tiles);
 
   transpose_program.AddInput({weight,
@@ -127,7 +119,7 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context,
                               {kernel_height},
                               {kernel_width},
                               {Ci_tiles},
-                              {ceil_div(kernel_height * kernel_height, 4u)}});
+                              {CeilDiv(kernel_height * kernel_height, 4u)}});
   ORT_RETURN_IF_ERROR(context.RunProgram(transpose_program));
 
   // im2col-matmul
@@ -156,8 +148,8 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context,
   Im2ColMatMulProgram im2col_mm_program{has_bias, tile_m, tile_n, use_subgroup};
   im2col_mm_program.SetWorkgroupSize(workgroup_size);
 
-  const uint32_t M_tiles = ceil_div(im2col_m, tile_m);
-  const uint32_t N_tiles = ceil_div(im2col_n, tile_n);
+  const uint32_t M_tiles = CeilDiv(im2col_m, tile_m);
+  const uint32_t N_tiles = CeilDiv(im2col_n, tile_n);
   im2col_mm_program.SetDispatchGroupSize(M_tiles, N_tiles, batch);
 
   im2col_mm_program.AddInput({src,
@@ -185,7 +177,7 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context,
                               {im2col_n},
                               {M_tiles},
                               {N_tiles},
-                              {ceil_div(ceil_div(im2col_k, 4u), 4u)},
+                              {CeilDiv(CeilDiv(im2col_k, 4u), 4u)},
                               {dilations},
                               {pads},
                               {strides}});

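One spot above worth a second look is the nested call `CeilDiv(CeilDiv(im2col_k, 4u), 4u)`: it rounds `im2col_k` up to whole groups of 4, then rounds that count up to groups of 4 again. A quick worked example with an illustrative value only: CeilDiv(70, 4) = 18, and CeilDiv(18, 4) = 5.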
onnxruntime/core/providers/webgpu/tensor/transpose.cc

Lines changed: 7 additions & 17 deletions
@@ -2,23 +2,15 @@
 // Licensed under the MIT License.
 
 #include "core/common/inlined_containers.h"
-#include "core/providers/webgpu/tensor/transpose.h"
 #include "core/providers/cpu/tensor/utils.h"
+#include "core/providers/webgpu/tensor/transpose.h"
 #include "core/providers/webgpu/shader_variable.h"
 #include "core/providers/webgpu/shader_helper.h"
 #include "core/providers/webgpu/webgpu_supported_types.h"
-
-namespace {
-
-inline uint32_t ceil_div(int64_t numerator, int32_t denominator) {
-  return static_cast<uint32_t>((numerator + denominator - 1) / denominator);
-}
-
-}  // namespace
+#include "core/providers/webgpu/webgpu_utils.h"
 
 namespace onnxruntime {
 namespace webgpu {
-
 ONNX_OPERATOR_VERSIONED_KERNEL_EX(
     Transpose,
     kOnnxDomain,
@@ -139,25 +131,23 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContextBase& context,
     new_output_shape = TensorShape({new_input_shape[1], new_input_shape[0]});
   }
 
-  uint32_t output_size = onnxruntime::narrow<int32_t>(input_shape.Size());
+  uint32_t output_size = onnxruntime::narrow<uint32_t>(input_shape.Size());
   TransposeProgram program{permutations, use_shared};
 
   program
       .CacheHint(absl::StrJoin(permutations, "-"))
      .AddInputs({{&input, ProgramTensorMetadataDependency::TypeAndRank, new_input_shape, 1}})
      .AddOutputs({{&output, ProgramTensorMetadataDependency::None, new_output_shape, 1}})
-      .AddUniformVariables({
-          {static_cast<uint32_t>(output_size)},
-      });
+      .AddUniformVariables({{output_size}});
 
   if (use_shared) {
     program.SetWorkgroupSize(TILE_SIZE, TILE_SIZE, 1);
     program.SetDispatchGroupSize(static_cast<uint32_t>((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE),
                                  static_cast<uint32_t>(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE)));
   } else {
-    program.SetWorkgroupSize(WORKGROUP_SIZE);
+    program.SetWorkgroupSize(64u);
 
-    uint32_t dispatch_x = ceil_div(output_size, WORKGROUP_SIZE);
+    uint32_t dispatch_x = CeilDiv(output_size, 64u);
     uint32_t dispatch_y = 1;
     uint32_t dispatch_z = 1;
 
@@ -171,7 +161,7 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContextBase& context,
       uint32_t dispatch_size = dispatch_x;
      dispatch_x = 4;
      dispatch_y = 8;
-      dispatch_z = ceil_div(dispatch_size, dispatch_x * dispatch_y);
+      dispatch_z = CeilDiv(dispatch_size, dispatch_x * dispatch_y);
     }
     program.SetDispatchGroupSize(dispatch_x, dispatch_y, dispatch_z);
   }

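The transpose change keeps the existing fallback that re-spreads an oversized 1-D dispatch over a 4 x 8 x z grid, now expressed with CeilDiv. A standalone sketch of that arithmetic; the 65535 per-dimension limit and the 8M-element size are assumptions for illustration, not values taken from the diff:

```cpp
#include <cstdint>
#include <iostream>

template <typename T>
inline T CeilDiv(T numerator, T denominator) {
  return (numerator + denominator - 1) / denominator;
}

int main() {
  const uint32_t output_size = 8000000;        // assumed element count
  const uint32_t kAssumedMaxDispatch = 65535;  // assumed per-dimension limit

  uint32_t dispatch_x = CeilDiv(output_size, 64u);  // 125000 workgroups of 64 threads
  uint32_t dispatch_y = 1;
  uint32_t dispatch_z = 1;

  if (dispatch_x > kAssumedMaxDispatch) {
    // Same folding as the diff: fix x and y, push the remainder into z.
    const uint32_t dispatch_size = dispatch_x;
    dispatch_x = 4;
    dispatch_y = 8;
    dispatch_z = CeilDiv(dispatch_size, dispatch_x * dispatch_y);  // CeilDiv(125000, 32) == 3907
  }
  std::cout << dispatch_x << "x" << dispatch_y << "x" << dispatch_z << "\n";  // 4x8x3907
  return 0;
}
```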
onnxruntime/core/providers/webgpu/webgpu_utils.h

Lines changed: 5 additions & 0 deletions
@@ -15,6 +15,11 @@ namespace webgpu {
 
 class ShaderVariableHelper;
 
+template <typename T>
+inline T CeilDiv(T numerator, T denominator) {
+  return (numerator + denominator - 1) / denominator;
+}
+
 /**
  * Returns the maximum number of components `N` to be used as `vecN` for the given size.
  */
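Because `CeilDiv` deduces a single type `T` for both arguments, mixed-width call sites need an explicit cast, which is why the matmul_nbits.cc hunk passes `static_cast<int64_t>(4)` rather than `4u`. A small sketch of the constraint (the values are illustrative):

```cpp
#include <cstdint>

// Same single-type template as the one added to webgpu_utils.h.
template <typename T>
inline T CeilDiv(T numerator, T denominator) {
  return (numerator + denominator - 1) / denominator;
}

int main() {
  const int64_t size = 10;  // e.g. a tensor Shape().Size(), which is int64_t
  // CeilDiv(size, 4u) would not compile: T would deduce to both int64_t and unsigned int.
  const int64_t packed = CeilDiv(size, static_cast<int64_t>(4));  // == 3
  return packed == 3 ? 0 : 1;
}
```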
