From 5753bd1635a7e4ac7d1e52687ffa4008e9327f99 Mon Sep 17 00:00:00 2001 From: syedshazli Date: Thu, 18 Dec 2025 12:45:02 -0500 Subject: [PATCH] remove ::kernels --- .../benchmark_linear_8bit_act_xbit_weight.cpp | 2 +- .../embedding_xbit/op_embedding_xbit-impl.h | 6 +- .../kernel_selector.h | 2 +- .../kernel_selector.h | 8 +-- .../test_groupwise_lowbit_weight_lut.cpp | 2 +- .../test_linear_8bit_act_xbit_weight.cpp | 6 +- .../aarch64/benchmarks/benchmark_linear.cpp | 6 +- .../benchmarks/benchmark_quantization.cpp | 4 +- .../aarch64/embedding/embedding.h | 6 +- .../aarch64/embedding/embedding_lut.h | 4 +- .../kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h | 4 +- .../torch_free_kernels/aarch64/kleidi/pack.h | 4 +- ..._8bit_activation_groupwise_lowbit_weight.h | 6 +- .../kernel_1x1x32_f32_neondot-impl.h | 4 +- .../kernel_1x4x16_f32_neondot-impl.h | 4 +- .../kernel_1x8x16_f32_neondot-impl.h | 4 +- .../pack_activations.h | 12 ++-- .../pack_weights.h | 4 +- .../groupwise_lowbit_weight_lut.h | 4 +- .../groupwise_lowbit_weight/kernel_f32-impl.h | 6 +- .../pack_activations.h | 4 +- .../groupwise_lowbit_weight/pack_weights.h | 4 +- ...hannelwise_8bit_b_1x16x16_f32_smlal-impl.h | 6 +- ...annelwise_8bit_b_1x8x16_f32_neondot-impl.h | 6 +- ...hannelwise_8bit_b_4x8x8_f32_neondot-impl.h | 6 +- ...input_channelwise_8bit_b_1x16x4_f32_impl.h | 6 +- ...input_channelwise_8bit_b_4x16x4_f32_impl.h | 6 +- .../aarch64/matmul/matmul.h | 4 +- .../aarch64/matmul/matmul_utils.h | 4 +- .../aarch64/quantization/quantize.cpp | 2 +- .../aarch64/reduction/compute_sum.cpp | 2 +- .../aarch64/reduction/find_min_and_max.cpp | 2 +- .../test_bitpack_fallback_compatibility.cpp | 56 +++++++++---------- .../aarch64/tests/test_embedding.cpp | 10 ++-- .../aarch64/tests/test_embedding_lut.cpp | 6 +- .../aarch64/tests/test_linear.cpp | 8 +-- .../aarch64/tests/test_lut.cpp | 4 +- .../aarch64/tests/test_qmatmul.cpp | 16 +++--- .../aarch64/tests/test_quantization.cpp | 4 +- .../aarch64/tests/test_reduction.cpp | 10 ++-- .../aarch64/tests/test_utils.h | 8 +-- .../aarch64/tests/test_weight_packing.cpp | 8 +-- .../aarch64/valpacking/interleave.cpp | 2 +- .../fallback/bitpacking/bitpack.h | 4 +- .../fallback/bitpacking/uint1.h | 4 +- .../fallback/bitpacking/uint2.h | 4 +- .../fallback/bitpacking/uint3.h | 4 +- .../fallback/bitpacking/uint4.h | 4 +- .../fallback/bitpacking/uint5.h | 2 +- .../fallback/bitpacking/uint6.h | 4 +- .../fallback/bitpacking/uint7.h | 4 +- .../channelwise_8bit_a_channelwise_8bit_b.h | 8 +-- .../matmul/fp32_a_channelwise_8bit_b_fp32_c.h | 5 +- .../fallback/tests/test_bitpacking.cpp | 50 ++++++++--------- .../interface/quantized_matmul.h | 12 ++-- .../interface/test_qmatmul_interface.cpp | 4 +- .../experimental/kernels/mps/src/dispatch.h | 4 +- torchao/experimental/kernels/mps/src/lowbit.h | 4 +- .../experimental/kernels/mps/src/packing.h | 4 +- .../kernels/mps/test/test_lowbit.mm | 6 +- .../ops/mps/linear_fp_act_xbit_weight_aten.mm | 4 +- .../linear_fp_act_xbit_weight_executorch.mm | 2 +- 62 files changed, 207 insertions(+), 208 deletions(-) diff --git a/torchao/csrc/cpu/shared_kernels/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp b/torchao/csrc/cpu/shared_kernels/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp index caf03acf21..24418ec510 100644 --- a/torchao/csrc/cpu/shared_kernels/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp +++ b/torchao/csrc/cpu/shared_kernels/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp @@ -18,7 +18,7 @@ template UKernelConfig get_ukernel_config() { UKernelConfig config; - namespace ukernel = torchao::kernels::cpu::aarch64::linear:: + namespace ukernel = torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot; config.mr = 1; config.nr = 8; diff --git a/torchao/csrc/cpu/shared_kernels/embedding_xbit/op_embedding_xbit-impl.h b/torchao/csrc/cpu/shared_kernels/embedding_xbit/op_embedding_xbit-impl.h index 6c1181873b..e1f1a2a39a 100644 --- a/torchao/csrc/cpu/shared_kernels/embedding_xbit/op_embedding_xbit-impl.h +++ b/torchao/csrc/cpu/shared_kernels/embedding_xbit/op_embedding_xbit-impl.h @@ -133,7 +133,7 @@ Tensor embedding_out_cpu( } TORCHAO_CHECK(index >= 0 && index < num_embeddings, "index out of bounds"); #if defined(TORCHAO_BUILD_CPU_AARCH64) - torchao::kernels::cpu::aarch64::embedding::embedding( + torchao::cpu::aarch64::embedding::embedding( out.mutable_data_ptr() + idx * embedding_dim, embedding_dim, group_size, @@ -199,7 +199,7 @@ Tensor pack_embedding_cpu(const Tensor& weight_qvals) { torchao::parallel_1d(0, num_embeddings, [&](int64_t idx) { #if defined(TORCHAO_BUILD_CPU_AARCH64) - torchao::kernels::cpu::aarch64::embedding::pack_embedding_weight_qvals< + torchao::cpu::aarch64::embedding::pack_embedding_weight_qvals< weight_nbit>( out.mutable_data_ptr() + torchao::ops::PackedWeightsHeader::size(), @@ -289,7 +289,7 @@ Tensor shared_embedding_out_cpu( } TORCHAO_CHECK(index >= 0 && index < n, "index out of bounds"); #if defined(TORCHAO_BUILD_CPU_AARCH64) - torchao::kernels::cpu::aarch64::embedding:: + torchao::cpu::aarch64::embedding:: shared_embedding( out.mutable_data_ptr() + idx * k, packed_weights.const_data_ptr() + diff --git a/torchao/csrc/cpu/shared_kernels/groupwise_lowbit_weight_lut/kernel_selector.h b/torchao/csrc/cpu/shared_kernels/groupwise_lowbit_weight_lut/kernel_selector.h index f8bdc4cafb..f49ab2b146 100644 --- a/torchao/csrc/cpu/shared_kernels/groupwise_lowbit_weight_lut/kernel_selector.h +++ b/torchao/csrc/cpu/shared_kernels/groupwise_lowbit_weight_lut/kernel_selector.h @@ -117,7 +117,7 @@ void register_ukernel_config( int preferred_alignment = 16; namespace kernel_api = - torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut; + torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut; using kernel_fn_ptr_t = decltype(&kernel_api::groupwise_lowbit_weight_lut_kernel_1x4x32< diff --git a/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight/kernel_selector.h b/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight/kernel_selector.h index 88b27f4217..7998ded6b0 100644 --- a/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight/kernel_selector.h +++ b/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight/kernel_selector.h @@ -97,7 +97,7 @@ void register_ukernel_config_universal( torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_universal, weight_nbit); - namespace kernel = torchao::kernels::cpu::aarch64::linear:: + namespace kernel = torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight; constexpr bool has_lut = false; @@ -181,7 +181,7 @@ void register_ukernel_config_lut( int preferred_alignment = 16; #if defined(TORCHAO_ENABLE_ARM_NEON_DOT) - namespace kernel = torchao::kernels::cpu::aarch64::linear:: + namespace kernel = torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight; if (!cpuinfo_has_arm_neon_dot()) { @@ -232,7 +232,7 @@ void register_ukernel_config_lut( template UKernelConfig::linear_config_type get_linear_config_kleidi(int n_step, int nr, int kr, int sr) { - namespace op = torchao::kernels::cpu::aarch64::kleidi:: + namespace op = torchao::cpu::aarch64::kleidi:: kai_matmul_clamp_f32_qai8dxp_qsi4c32p; assert(n_step == kernel_struct::get_ukernel().get_n_step()); assert(nr == kernel_struct::get_ukernel().get_nr()); @@ -256,7 +256,7 @@ void register_ukernel_config_kleidi( throw std::runtime_error("Failed to initialize cpuinfo!"); } check_format(format, torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_kleidi_ai, weight_nbit); - namespace op = torchao::kernels::cpu::aarch64::kleidi:: + namespace op = torchao::cpu::aarch64::kleidi:: kai_matmul_clamp_f32_qai8dxp_qsi4c32p; auto uk = UKernelConfig::make( diff --git a/torchao/csrc/cpu/shared_kernels/tests/test_groupwise_lowbit_weight_lut.cpp b/torchao/csrc/cpu/shared_kernels/tests/test_groupwise_lowbit_weight_lut.cpp index 10bf9bcd3c..1a331afcfa 100644 --- a/torchao/csrc/cpu/shared_kernels/tests/test_groupwise_lowbit_weight_lut.cpp +++ b/torchao/csrc/cpu/shared_kernels/tests/test_groupwise_lowbit_weight_lut.cpp @@ -19,7 +19,7 @@ using namespace torchao::ops::groupwise_lowbit_weight_lut; template UKernelConfig get_ukernel_config(bool has_bias) { namespace kernel = - torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut; + torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut; int preferred_alignment = 16; int n_step = 8; diff --git a/torchao/csrc/cpu/shared_kernels/tests/test_linear_8bit_act_xbit_weight.cpp b/torchao/csrc/cpu/shared_kernels/tests/test_linear_8bit_act_xbit_weight.cpp index 7631d34a03..9394430b20 100644 --- a/torchao/csrc/cpu/shared_kernels/tests/test_linear_8bit_act_xbit_weight.cpp +++ b/torchao/csrc/cpu/shared_kernels/tests/test_linear_8bit_act_xbit_weight.cpp @@ -16,7 +16,7 @@ #if defined(TORCHAO_ENABLE_KLEIDI) #include -using namespace torchao::kernels::cpu::aarch64::kleidi:: +using namespace torchao::cpu::aarch64::kleidi:: kai_matmul_clamp_f32_qai8dxp_qsi4c32p; #endif // TORCHAO_ENABLE_KLEIDI @@ -27,7 +27,7 @@ using namespace torchao::ops::linear_8bit_act_xbit_weight; template UKernelConfig get_ukernel_config() { - namespace kernel = torchao::kernels::cpu::aarch64::linear:: + namespace kernel = torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight; int preferred_alignment = 16; @@ -213,7 +213,7 @@ enum kai_kernel_id { template UKernelConfig get_ukernel_config_kleidi_impl() { - namespace op = torchao::kernels::cpu::aarch64::kleidi:: + namespace op = torchao::cpu::aarch64::kleidi:: kai_matmul_clamp_f32_qai8dxp_qsi4c32p; auto uk = kernel_struct::get_ukernel(); diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_linear.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_linear.cpp index 26abe6918a..12cad8a0bd 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_linear.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_linear.cpp @@ -19,7 +19,7 @@ channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot( int k = state.range(2); int group_size = state.range(3); - using namespace torchao::kernels::cpu::aarch64::linear:: + using namespace torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot; auto test_case = torchao:: @@ -91,7 +91,7 @@ channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot( int k = state.range(2); int group_size = state.range(3); - using namespace torchao::kernels::cpu::aarch64::linear:: + using namespace torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight; auto test_case = torchao:: @@ -163,7 +163,7 @@ channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot( int k = state.range(2); int group_size = state.range(3); - using namespace torchao::kernels::cpu::aarch64::linear:: + using namespace torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight; auto test_case = torchao:: diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_quantization.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_quantization.cpp index d877b905d0..e5cfe647a5 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_quantization.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_quantization.cpp @@ -21,7 +21,7 @@ static void benchmark_quantize(benchmark::State& state) { float vmin, vmax, scale; for (auto _ : state) { - torchao::kernels::cpu::aarch64::reduction::find_min_and_max( + torchao::cpu::aarch64::reduction::find_min_and_max( vmin, vmax, vals.data(), vals.size()); torchao::quantization::get_qvals_range( @@ -30,7 +30,7 @@ static void benchmark_quantize(benchmark::State& state) { torchao::quantization::get_scale_and_zero( scale, zero, vmin, vmax, qmin, qmax); - torchao::kernels::cpu::aarch64::quantization::quantize( + torchao::cpu::aarch64::quantization::quantize( qvals.data(), vals.data(), vals.size(), scale, zero, qmin, qmax); } } diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding.h index 0f6d8a2339..d2f2a82b98 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding.h @@ -15,7 +15,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::embedding { +namespace torchao::cpu::aarch64::embedding { namespace internal { @@ -353,7 +353,7 @@ inline void shared_embedding( n_idx = n_idx * nr; int j = index - n_idx; - torchao::kernels::cpu::aarch64::linear:: + torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing:: unpack_weights_at_n_idx( weight_qvals.data(), @@ -381,6 +381,6 @@ inline void shared_embedding( } } -} // namespace torchao::kernels::cpu::aarch64::embedding +} // namespace torchao::cpu::aarch64::embedding #endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding_lut.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding_lut.h index 573fc8020d..cf860a8e01 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding_lut.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding_lut.h @@ -14,7 +14,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::embedding { +namespace torchao::cpu::aarch64::embedding { /** * @brief Calculates the size in bytes for a single row of packed embeddings. @@ -377,6 +377,6 @@ inline void dequantize_embedding_row_at_idx_lut( vst1q_f32(out + j + 12, out3); } } -} // namespace torchao::kernels::cpu::aarch64::embedding +} // namespace torchao::cpu::aarch64::embedding #endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h index 777d73cebc..008c3e4f53 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h @@ -30,7 +30,7 @@ #include -namespace torchao::kernels::cpu::aarch64::kleidi { +namespace torchao::cpu::aarch64::kleidi { // Helper functions // TODO: find a better place for these? @@ -319,4 +319,4 @@ DEFINE_KERNEL_STRUCT(matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm); #undef DEFINE_KERNEL_STRUCT } // namespace kai_matmul_clamp_f32_qai8dxp_qsi4c32p -} // namespace torchao::kernels::cpu::aarch64::kleidi +} // namespace torchao::cpu::aarch64::kleidi diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/pack.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/pack.h index 692df73d55..0d5d9bfd44 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/pack.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/pack.h @@ -12,7 +12,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::kleidi { +namespace torchao::cpu::aarch64::kleidi { namespace kai_matmul_clamp_f32_qai8dxp_qsi4c32p { // All the kernels in this namespace use following packing interface/routines. // TODO: move these to Kleidi as interfaces? @@ -115,4 +115,4 @@ lhs_packing get_lhs_packing() { } } // namespace kai_matmul_clamp_f32_qai8dxp_qsi4c32p -} // namespace torchao::kernels::cpu::aarch64::kleidi +} // namespace torchao::cpu::aarch64::kleidi diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h index 849d99cb8a..f7e1c41e19 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h @@ -17,7 +17,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::linear:: +namespace torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight { inline size_t packed_activations_size( @@ -148,7 +148,7 @@ void pack_weights_with_lut( (void)nr; // unused (void)kr; // unused (void)sr; // unused - torchao::kernels::cpu::aarch64::linear:: + torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing:: pack_weights_with_lut( packed_weights, @@ -298,6 +298,6 @@ void kernel_1x8x16_f32_neondot( } } // namespace - // torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight + // torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight #endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x1x32_f32_neondot-impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x1x32_f32_neondot-impl.h index 535bf7a084..dbb653ca5d 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x1x32_f32_neondot-impl.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x1x32_f32_neondot-impl.h @@ -11,7 +11,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::linear:: +namespace torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight::kernel { namespace internal { @@ -174,6 +174,6 @@ void kernel_1x1x32_f32_neondot( } } // namespace - // torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel + // torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel #endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h index 40be2c5231..0dab5dd88a 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h @@ -12,7 +12,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::linear:: +namespace torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight::kernel { namespace internal { inline float32x4_t clamp(float32x4_t x, float min, float max) { @@ -245,6 +245,6 @@ void kernel_1x4x16_f32_neondot( } } // namespace - // torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel + // torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel #endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h index 78246e211d..4ceaeeb893 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h @@ -12,7 +12,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::linear:: +namespace torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight::kernel { namespace internal { @@ -361,6 +361,6 @@ void kernel_1x8x16_f32_neondot( } } // namespace - // torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel + // torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel #endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_activations.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_activations.h index d7558dd4ce..a27615f2c3 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_activations.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_activations.h @@ -12,7 +12,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing { +namespace torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing { // Prepares activation data for kernel_impl. // Per m_idx (row), activations are stored as follows: @@ -72,7 +72,7 @@ void inline pack_activations( qmin, qmax, /*nbit=*/8, /*is_symmetric=*/false); for (int m_idx = 0; m_idx < m; m_idx++) { - torchao::kernels::cpu::aarch64::reduction::find_min_and_max( + torchao::cpu::aarch64::reduction::find_min_and_max( vmin, vmax, activations, k); torchao::quantization::get_scale_and_zero( scale, zero, vmin, vmax, qmin, qmax); @@ -86,7 +86,7 @@ void inline pack_activations( if (has_weight_zeros) { for (int k_idx = 0; k_idx < k; k_idx += group_size) { - torchao::kernels::cpu::aarch64::quantization::quantize( + torchao::cpu::aarch64::quantization::quantize( /*qvals=*/(int8_t*)activation_data_byte_ptr, /*vals=*/activations, /*size=*/group_size, @@ -95,7 +95,7 @@ void inline pack_activations( /*qmin=*/qmin, /*qmax=*/qmax); - qvals_sum = torchao::kernels::cpu::aarch64::reduction::compute_sum( + qvals_sum = torchao::cpu::aarch64::reduction::compute_sum( /*vals=*/(int8_t*)activation_data_byte_ptr, /*size=*/group_size); @@ -107,7 +107,7 @@ void inline pack_activations( activations += group_size; } } else { - torchao::kernels::cpu::aarch64::quantization::quantize( + torchao::cpu::aarch64::quantization::quantize( /*qvals=*/(int8_t*)activation_data_byte_ptr, /*vals=*/activations, /*size=*/k, @@ -121,6 +121,6 @@ void inline pack_activations( } } -} // namespace torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing +} // namespace torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing #endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_weights.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_weights.h index 133c4a7f25..66ad7cb479 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_weights.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_weights.h @@ -9,7 +9,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::linear:: +namespace torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing { namespace internal { @@ -583,6 +583,6 @@ size_t inline packed_weights_with_lut_size( } } // namespace - // torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing + // torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing #endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/groupwise_lowbit_weight_lut.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/groupwise_lowbit_weight_lut.h index b0fea65afb..3ab0ba45ac 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/groupwise_lowbit_weight_lut.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/groupwise_lowbit_weight_lut.h @@ -11,7 +11,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut { +namespace torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut { /** * @brief Calculates the total size in bytes required for the packed weight. @@ -258,6 +258,6 @@ inline size_t packed_weights_offset( return (n_idx / nr) * packed_tile_size_for_nr_cols; } } // namespace - // torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut + // torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut #endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/kernel_f32-impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/kernel_f32-impl.h index b50c886d11..ff14da3382 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/kernel_f32-impl.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/kernel_f32-impl.h @@ -13,11 +13,11 @@ #include #include -namespace torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut:: +namespace torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut:: kernel { namespace lut_utils = torchao::lut; -namespace weight_packing = torchao::kernels::cpu::aarch64::linear:: +namespace weight_packing = torchao::cpu::aarch64::linear:: groupwise_lowbit_weight_lut::weight_packing; namespace internal { @@ -235,5 +235,5 @@ void groupwise_lowbit_weight_lut_kernel_1x4x32( } } } // namespace - // torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut::kernel + // torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut::kernel #endif // defined(aarch64) || defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_activations.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_activations.h index bf16e04bda..50a6fb87fb 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_activations.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_activations.h @@ -7,7 +7,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut:: +namespace torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut:: activation_packing { inline size_t packed_activations_size(int m, int k) { @@ -26,6 +26,6 @@ void pack_activations( std::memcpy(packed_activations, activations, sizeof(float) * m * k); } } // namespace - // torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut::activation_packing + // torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut::activation_packing #endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_weights.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_weights.h index 021693caec..c7e722dd9e 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_weights.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_weights.h @@ -11,7 +11,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut:: +namespace torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut:: weight_packing { namespace lut_utils = torchao::lut; namespace packing_utils = torchao::packing; @@ -224,5 +224,5 @@ TORCHAO_ALWAYS_INLINE inline void pack_weights( } } } // namespace - // torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut::weight_packing + // torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut::weight_packing #endif // defined(aarch64) || defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal-impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal-impl.h index 925bbbb4bd..c2492eaf13 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal-impl.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal-impl.h @@ -16,7 +16,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::quantized_matmul { +namespace torchao::cpu::aarch64::quantized_matmul { namespace channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal::internal { namespace { @@ -359,7 +359,7 @@ void kernel( const float* rhs_scales, const int lhs_qparams_stride, const int rhs_qparams_stride) { - torchao::kernels::cpu::aarch64::quantized_matmul:: + torchao::cpu::aarch64::quantized_matmul:: channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal::internal:: KernelImpl::run( m, @@ -379,6 +379,6 @@ void kernel( rhs_qparams_stride); } } // namespace channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal -} // namespace torchao::kernels::cpu::aarch64::quantized_matmul +} // namespace torchao::cpu::aarch64::quantized_matmul #endif // defined(__aarch64__) && defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot-impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot-impl.h index 2c34cebc3c..835d47476e 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot-impl.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot-impl.h @@ -16,7 +16,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::quantized_matmul { +namespace torchao::cpu::aarch64::quantized_matmul { namespace channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot::internal { /* @@ -315,7 +315,7 @@ void kernel( const float* rhs_scales, const int lhs_qparams_stride, const int rhs_qparams_stride) { - torchao::kernels::cpu::aarch64::quantized_matmul:: + torchao::cpu::aarch64::quantized_matmul:: channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot::internal:: KernelImpl::run( m, @@ -335,6 +335,6 @@ void kernel( rhs_qparams_stride); } } // namespace channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot -} // namespace torchao::kernels::cpu::aarch64::quantized_matmul +} // namespace torchao::cpu::aarch64::quantized_matmul #endif // defined(__aarch64__) && defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot-impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot-impl.h index 80417f37e4..938cd8d06e 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot-impl.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot-impl.h @@ -16,7 +16,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::quantized_matmul { +namespace torchao::cpu::aarch64::quantized_matmul { namespace channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot::internal { TORCHAO_ALWAYS_INLINE static void block_mul_4x8x8( @@ -386,7 +386,7 @@ void kernel( const float* rhs_scales, const int lhs_qparams_stride, const int rhs_qparams_stride) { - torchao::kernels::cpu::aarch64::quantized_matmul:: + torchao::cpu::aarch64::quantized_matmul:: channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot::internal:: KernelImpl::run( m, @@ -406,6 +406,6 @@ void kernel( rhs_qparams_stride); } } // namespace channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot -} // namespace torchao::kernels::cpu::aarch64::quantized_matmul +} // namespace torchao::cpu::aarch64::quantized_matmul #endif // defined(__aarch64__) && defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_1x16x4_f32_impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_1x16x4_f32_impl.h index 28f173e9bc..454b8f3ebc 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_1x16x4_f32_impl.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_1x16x4_f32_impl.h @@ -16,7 +16,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::quantized_matmul { +namespace torchao::cpu::aarch64::quantized_matmul { namespace fp32_a_input_channelwise_8bit_b_1x16x4_f32::internal { namespace { @@ -258,7 +258,7 @@ void kernel( const float* rhs_scales, const float beta, const int rhs_qparams_stride) { - torchao::kernels::cpu::aarch64::quantized_matmul:: + torchao::cpu::aarch64::quantized_matmul:: fp32_a_input_channelwise_8bit_b_1x16x4_f32::internal:: KernelImpl::run( m, @@ -276,6 +276,6 @@ void kernel( rhs_qparams_stride); } } // namespace fp32_a_input_channelwise_8bit_b_1x16x4_f32 -} // namespace torchao::kernels::cpu::aarch64::quantized_matmul +} // namespace torchao::cpu::aarch64::quantized_matmul #endif // defined(__aarch64__) && defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_4x16x4_f32_impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_4x16x4_f32_impl.h index ffcd0a1f1d..d9cce306fe 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_4x16x4_f32_impl.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_4x16x4_f32_impl.h @@ -16,7 +16,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::quantized_matmul { +namespace torchao::cpu::aarch64::quantized_matmul { namespace fp32_a_input_channelwise_8bit_b_4x16x4_f32::internal { namespace { @@ -305,7 +305,7 @@ void kernel( const float* rhs_scales, const float beta, const int rhs_qparams_stride) { - torchao::kernels::cpu::aarch64::quantized_matmul:: + torchao::cpu::aarch64::quantized_matmul:: fp32_a_input_channelwise_8bit_b_4x16x4_f32::internal:: KernelImpl::run( m, @@ -323,6 +323,6 @@ void kernel( rhs_qparams_stride); } } // namespace fp32_a_input_channelwise_8bit_b_4x16x4_f32 -} // namespace torchao::kernels::cpu::aarch64::quantized_matmul +} // namespace torchao::cpu::aarch64::quantized_matmul #endif // defined(__aarch64__) && defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul.h index 371dc55666..c0a8f1b914 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul.h @@ -15,7 +15,7 @@ #include -namespace torchao::kernels::cpu::aarch64::quantized_matmul { +namespace torchao::cpu::aarch64::quantized_matmul { namespace channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot { template < @@ -307,7 +307,7 @@ void kernel( } } // namespace fp32_a_input_channelwise_8bit_b_f32 -} // namespace torchao::kernels::cpu::aarch64::quantized_matmul +} // namespace torchao::cpu::aarch64::quantized_matmul #include #include diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul_utils.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul_utils.h index db577c39a8..65069bd4ca 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul_utils.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul_utils.h @@ -13,7 +13,7 @@ #include #include -namespace torchao::kernels::cpu::aarch64::quantized_matmul { +namespace torchao::cpu::aarch64::quantized_matmul { namespace utils { TORCHAO_ALWAYS_INLINE static void transpose_scales_and_zero_points( @@ -148,6 +148,6 @@ void pack_kxn_b_matrix_for_mx8_dotprod_ukernel( } } } // namespace utils -} // namespace torchao::kernels::cpu::aarch64::quantized_matmul +} // namespace torchao::cpu::aarch64::quantized_matmul #endif // defined(__aarch64__) || defined(__ARM_NEON) diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/quantization/quantize.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/quantization/quantize.cpp index 42301dc2fa..bcc1dec571 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/quantization/quantize.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/quantization/quantize.cpp @@ -55,7 +55,7 @@ _vec_clip_inplace(int32x4_t& vec, int32x4_t vec_min, int32x4_t vec_max) { } } // namespace -void torchao::kernels::cpu::aarch64::quantization::quantize( +void torchao::cpu::aarch64::quantization::quantize( // Output int8_t* qvals, // Inputs diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/compute_sum.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/compute_sum.cpp index 1b9d2aa97b..451b69c87e 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/compute_sum.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/compute_sum.cpp @@ -9,7 +9,7 @@ #include #include -int32_t torchao::kernels::cpu::aarch64::reduction::compute_sum( +int32_t torchao::cpu::aarch64::reduction::compute_sum( const int8_t* vals, int size) { assert(size >= 1); diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/find_min_and_max.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/find_min_and_max.cpp index ea4efcf1cc..521a778efd 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/find_min_and_max.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/find_min_and_max.cpp @@ -9,7 +9,7 @@ #include #include -void torchao::kernels::cpu::aarch64::reduction::find_min_and_max( +void torchao::cpu::aarch64::reduction::find_min_and_max( float32_t& min, float32_t& max, const float32_t* vals, diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_bitpack_fallback_compatibility.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_bitpack_fallback_compatibility.cpp index ccae74cbcd..ddbed9223b 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_bitpack_fallback_compatibility.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_bitpack_fallback_compatibility.cpp @@ -22,7 +22,7 @@ TEST(test_bitpacking_64_uint1_values, CppToNeon) { std::vector packed(packed_bytes, 0); std::vector unpacked(unpacked_bytes, 0); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint1_values( + torchao::cpu::fallback::bitpacking::internal::pack_64_uint1_values( packed.data(), input.data()); uint8x16_t u0, u1, u2, u3; @@ -52,7 +52,7 @@ TEST(test_bitpacking_64_uint1_values, NeonToCpp) { torchao::bitpacking::internal::vec_pack_64_uint1_values( packed.data(), i0, i1, i2, i3); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint1_values( + torchao::cpu::fallback::bitpacking::internal::unpack_64_uint1_values( unpacked.data(), packed.data()); for (int i = 0; i < unpacked_bytes; ++i) { @@ -68,7 +68,7 @@ TEST(test_bitpacking_128_uint1_values, CppToNeon) { std::vector packed(packed_bytes, 0); std::vector unpacked(unpacked_bytes, 0); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint1_values( + torchao::cpu::fallback::bitpacking::internal::pack_128_uint1_values( packed.data(), input.data()); uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7; @@ -104,7 +104,7 @@ TEST(test_bitpacking_128_uint1_values, NeonToCpp) { torchao::bitpacking::internal::vec_pack_128_uint1_values( packed.data(), i0, i1, i2, i3, i4, i5, i6, i7); - torchao::kernels::cpu::fallback::bitpacking::internal:: + torchao::cpu::fallback::bitpacking::internal:: unpack_128_uint1_values(unpacked.data(), packed.data()); for (int i = 0; i < unpacked_bytes; ++i) { @@ -122,7 +122,7 @@ TEST(test_bitpacking_32_uint2_values, CppToNeon) { std::vector packed(packed_bytes, 0); std::vector unpacked(unpacked_bytes, 0); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint2_values( + torchao::cpu::fallback::bitpacking::internal::pack_32_uint2_values( packed.data(), input.data()); uint8x8_t u0, u1, u2, u3; @@ -152,7 +152,7 @@ TEST(test_bitpacking_32_uint2_values, NeonToCpp) { torchao::bitpacking::internal::vec_pack_32_uint2_values( packed.data(), i0, i1, i2, i3); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint2_values( + torchao::cpu::fallback::bitpacking::internal::unpack_32_uint2_values( unpacked.data(), packed.data()); for (int i = 0; i < unpacked_bytes; ++i) { @@ -168,7 +168,7 @@ TEST(test_bitpacking_64_uint2_values, CppToNeon) { std::vector packed(packed_bytes, 0); std::vector unpacked(unpacked_bytes, 0); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint2_values( + torchao::cpu::fallback::bitpacking::internal::pack_64_uint2_values( packed.data(), input.data()); uint8x16_t u0, u1, u2, u3; @@ -198,7 +198,7 @@ TEST(test_bitpacking_64_uint2_values, NeonToCpp) { torchao::bitpacking::internal::vec_pack_64_uint2_values( packed.data(), i0, i1, i2, i3); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint2_values( + torchao::cpu::fallback::bitpacking::internal::unpack_64_uint2_values( unpacked.data(), packed.data()); for (int i = 0; i < unpacked_bytes; ++i) { @@ -216,7 +216,7 @@ TEST(test_bitpacking_64_uint3_values, CppToNeon) { std::vector packed(packed_bytes, 0); std::vector unpacked(unpacked_bytes, 0); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint3_values( + torchao::cpu::fallback::bitpacking::internal::pack_64_uint3_values( packed.data(), input.data()); uint8x16_t u0, u1, u2, u3; @@ -246,7 +246,7 @@ TEST(test_bitpacking_64_uint3_values, NeonToCpp) { torchao::bitpacking::internal::vec_pack_64_uint3_values( packed.data(), i0, i1, i2, i3); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint3_values( + torchao::cpu::fallback::bitpacking::internal::unpack_64_uint3_values( unpacked.data(), packed.data()); for (int i = 0; i < unpacked_bytes; ++i) { @@ -262,7 +262,7 @@ TEST(test_bitpacking_128_uint3_values, CppToNeon) { std::vector packed(packed_bytes, 0); std::vector unpacked(unpacked_bytes, 0); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint3_values( + torchao::cpu::fallback::bitpacking::internal::pack_128_uint3_values( packed.data(), input.data()); uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7; @@ -298,7 +298,7 @@ TEST(test_bitpacking_128_uint3_values, NeonToCpp) { torchao::bitpacking::internal::vec_pack_128_uint3_values( packed.data(), i0, i1, i2, i3, i4, i5, i6, i7); - torchao::kernels::cpu::fallback::bitpacking::internal:: + torchao::cpu::fallback::bitpacking::internal:: unpack_128_uint3_values(unpacked.data(), packed.data()); for (int i = 0; i < unpacked_bytes; ++i) { @@ -316,7 +316,7 @@ TEST(test_bitpacking_16_uint4_values, CppToNeon) { std::vector packed(packed_bytes, 0); std::vector unpacked(unpacked_bytes, 0); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_16_uint4_values( + torchao::cpu::fallback::bitpacking::internal::pack_16_uint4_values( packed.data(), input.data()); uint8x16_t unpacked0; @@ -341,7 +341,7 @@ TEST(test_bitpacking_16_uint4_values, NeonToCpp) { torchao::bitpacking::internal::vec_pack_16_uint4_values( packed.data(), input0); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_16_uint4_values( + torchao::cpu::fallback::bitpacking::internal::unpack_16_uint4_values( unpacked.data(), packed.data()); for (int i = 0; i < unpacked_bytes; ++i) { @@ -357,7 +357,7 @@ TEST(test_bitpacking_32_uint4_values, CppToNeon) { std::vector packed(packed_bytes, 0); std::vector unpacked(unpacked_bytes, 0); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values( + torchao::cpu::fallback::bitpacking::internal::pack_32_uint4_values( packed.data(), input.data()); uint8x16_t unpacked0, unpacked1; @@ -384,7 +384,7 @@ TEST(test_bitpacking_32_uint4_values, NeonToCpp) { torchao::bitpacking::internal::vec_pack_32_uint4_values( packed.data(), input0, input1); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint4_values( + torchao::cpu::fallback::bitpacking::internal::unpack_32_uint4_values( unpacked.data(), packed.data()); for (int i = 0; i < unpacked_bytes; ++i) { @@ -402,7 +402,7 @@ TEST(test_bitpacking_64_uint5_values, CppToNeon) { std::vector packed(packed_bytes, 0); std::vector unpacked(unpacked_bytes, 0); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint5_values( + torchao::cpu::fallback::bitpacking::internal::pack_64_uint5_values( packed.data(), input.data()); uint8x16_t unpacked0, unpacked1, unpacked2, unpacked3; @@ -432,7 +432,7 @@ TEST(test_bitpacking_64_uint5_values, NeonToCpp) { torchao::bitpacking::internal::vec_pack_64_uint5_values( packed.data(), input0, input1, input2, input3); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint5_values( + torchao::cpu::fallback::bitpacking::internal::unpack_64_uint5_values( unpacked.data(), packed.data()); for (int i = 0; i < unpacked_bytes; ++i) { @@ -448,7 +448,7 @@ TEST(test_bitpacking_128_uint5_values, CppToNeon) { std::vector packed(packed_bytes, 0); std::vector unpacked(unpacked_bytes, 0); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint5_values( + torchao::cpu::fallback::bitpacking::internal::pack_128_uint5_values( packed.data(), input.data()); uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7; @@ -484,7 +484,7 @@ TEST(test_bitpacking_128_uint5_values, NeonToCpp) { torchao::bitpacking::internal::vec_pack_128_uint5_values( packed.data(), i0, i1, i2, i3, i4, i5, i6, i7); - torchao::kernels::cpu::fallback::bitpacking::internal:: + torchao::cpu::fallback::bitpacking::internal:: unpack_128_uint5_values(unpacked.data(), packed.data()); for (int i = 0; i < unpacked_bytes; ++i) { @@ -502,7 +502,7 @@ TEST(test_bitpacking_32_uint6_values, CppToNeon) { std::vector packed(packed_bytes, 0); std::vector unpacked(unpacked_bytes, 0); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint6_values( + torchao::cpu::fallback::bitpacking::internal::pack_32_uint6_values( packed.data(), input.data()); uint8x16_t u0, u1; @@ -529,7 +529,7 @@ TEST(test_bitpacking_32_uint6_values, NeonToCpp) { torchao::bitpacking::internal::vec_pack_32_uint6_values( packed.data(), i0, i1); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint6_values( + torchao::cpu::fallback::bitpacking::internal::unpack_32_uint6_values( unpacked.data(), packed.data()); for (int i = 0; i < unpacked_bytes; ++i) { @@ -545,7 +545,7 @@ TEST(test_bitpacking_64_uint6_values, CppToNeon) { std::vector packed(packed_bytes, 0); std::vector unpacked(unpacked_bytes, 0); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint6_values( + torchao::cpu::fallback::bitpacking::internal::pack_64_uint6_values( packed.data(), input.data()); uint8x16_t u0, u1, u2, u3; @@ -575,7 +575,7 @@ TEST(test_bitpacking_64_uint6_values, NeonToCpp) { torchao::bitpacking::internal::vec_pack_64_uint6_values( packed.data(), i0, i1, i2, i3); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint6_values( + torchao::cpu::fallback::bitpacking::internal::unpack_64_uint6_values( unpacked.data(), packed.data()); for (int i = 0; i < unpacked_bytes; ++i) { @@ -593,7 +593,7 @@ TEST(test_bitpacking_64_uint7_values, CppToNeon) { std::vector packed(packed_bytes, 0); std::vector unpacked(unpacked_bytes, 0); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint7_values( + torchao::cpu::fallback::bitpacking::internal::pack_64_uint7_values( packed.data(), input.data()); uint8x16_t unpacked0, unpacked1, unpacked2, unpacked3; @@ -623,7 +623,7 @@ TEST(test_bitpacking_64_uint7_values, NeonToCpp) { torchao::bitpacking::internal::vec_pack_64_uint7_values( packed.data(), input0, input1, input2, input3); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint7_values( + torchao::cpu::fallback::bitpacking::internal::unpack_64_uint7_values( unpacked.data(), packed.data()); for (int i = 0; i < unpacked_bytes; ++i) { @@ -639,7 +639,7 @@ TEST(test_bitpacking_128_uint7_values, CppToNeon) { std::vector packed(packed_bytes, 0); std::vector unpacked(unpacked_bytes, 0); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint7_values( + torchao::cpu::fallback::bitpacking::internal::pack_128_uint7_values( packed.data(), input.data()); uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7; @@ -675,7 +675,7 @@ TEST(test_bitpacking_128_uint7_values, NeonToCpp) { torchao::bitpacking::internal::vec_pack_128_uint7_values( packed.data(), i0, i1, i2, i3, i4, i5, i6, i7); - torchao::kernels::cpu::fallback::bitpacking::internal:: + torchao::cpu::fallback::bitpacking::internal:: unpack_128_uint7_values(unpacked.data(), packed.data()); for (int i = 0; i < unpacked_bytes; ++i) { diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding.cpp index e5cdfb0a1b..efd6c78ebf 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding.cpp @@ -28,7 +28,7 @@ void test_embedding( auto output = std::vector(num_embeddings * embedding_dim, 0.0); for (int i = 0; i < num_embeddings; i++) { - torchao::kernels::cpu::aarch64::embedding::pack_embedding_weight_qvals< + torchao::cpu::aarch64::embedding::pack_embedding_weight_qvals< weight_nbit>( packed.data(), embedding_dim, test_case.weight_qvals.data(), i); } @@ -39,7 +39,7 @@ void test_embedding( } for (int i = 0; i < num_embeddings; i++) { - torchao::kernels::cpu::aarch64::embedding::embedding( + torchao::cpu::aarch64::embedding::embedding( output.data() + i * embedding_dim, embedding_dim, group_size, @@ -69,7 +69,7 @@ void test_shared_embedding( bool has_bias = false; float* bias = nullptr; std::vector packed_weights( - torchao::kernels::cpu::aarch64::linear:: + torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing:: packed_weights_size( n, @@ -79,7 +79,7 @@ void test_shared_embedding( has_weight_zeros, has_bias, nr)); - torchao::kernels::cpu::aarch64::linear:: + torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing:: pack_weights( packed_weights.data(), @@ -94,7 +94,7 @@ void test_shared_embedding( // Call shared_embedding auto output = std::vector(num_embeddings * embedding_dim, 0.0); for (int i = 0; i < num_embeddings; i++) { - torchao::kernels::cpu::aarch64::embedding:: + torchao::cpu::aarch64::embedding:: shared_embedding( output.data() + i * embedding_dim, packed_weights.data(), diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding_lut.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding_lut.cpp index 5802a179d0..33e1f3478c 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding_lut.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding_lut.cpp @@ -28,7 +28,7 @@ void test_embedding( has_scales); const size_t packed_embedding_size = - torchao::kernels::cpu::aarch64::embedding::packed_embedding_size( + torchao::cpu::aarch64::embedding::packed_embedding_size( weight_nbit, num_embeddings, embedding_dim, @@ -40,7 +40,7 @@ void test_embedding( auto output = std::vector(num_embeddings * embedding_dim, 0.0); for (int i = 0; i < num_embeddings; i++) { - torchao::kernels::cpu::aarch64::embedding::pack_embedding_row_at_index_lut< + torchao::cpu::aarch64::embedding::pack_embedding_row_at_index_lut< weight_nbit>( packed.data(), i, @@ -55,7 +55,7 @@ void test_embedding( } for (int i = 0; i < num_embeddings; i++) { - torchao::kernels::cpu::aarch64::embedding:: + torchao::cpu::aarch64::embedding:: dequantize_embedding_row_at_idx_lut( output.data() + i * embedding_dim, packed.data(), diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_linear.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_linear.cpp index bf99823052..69deeac517 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_linear.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_linear.cpp @@ -40,7 +40,7 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32( has_bias, has_clamp); - using namespace torchao::kernels::cpu::aarch64::linear:: + using namespace torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight; std::vector packed_activations( @@ -116,7 +116,7 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16( has_bias, has_clamp); - using namespace torchao::kernels::cpu::aarch64::linear:: + using namespace torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight; std::vector packed_activations( @@ -192,7 +192,7 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16( has_bias, has_clamp); - using namespace torchao::kernels::cpu::aarch64::linear:: + using namespace torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight; std::vector packed_activations( @@ -438,7 +438,7 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_lut( has_bias, has_clamp); - using namespace torchao::kernels::cpu::aarch64::linear:: + using namespace torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight; std::vector packed_activations( diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_lut.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_lut.cpp index 6d9214eeba..cde73ba6ee 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_lut.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_lut.cpp @@ -16,7 +16,7 @@ namespace lut_utils = torchao::lut; namespace kernel_api = - torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut; + torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut; TEST(test_fp32_lut, LutLookup) { auto lut = torchao::get_random_vector(16, -1.0, 1.0); @@ -53,7 +53,7 @@ void test_groupwise_lowbit_lut_kernel( bool has_bias, bool has_clamp) { namespace kernel_api = - torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut; + torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut; // 1. Generate test case auto test_case = torchao::groupwise_lowbit_weight_lut_test_case:: generate_with_decoupled_grouping( diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_qmatmul.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_qmatmul.cpp index 5d46937ccf..46b7a024f4 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_qmatmul.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_qmatmul.cpp @@ -54,11 +54,11 @@ struct test_channelwise_8bit_channelwise_8bit_b< const int); kernel_fn_type kernel_fn = nullptr; if (use_gemm && (m % 4 == 0) && (n % 8 == 0) && (k % 16 == 0)) { - using namespace torchao::kernels::cpu::aarch64::quantized_matmul:: + using namespace torchao::cpu::aarch64::quantized_matmul:: channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot; kernel_fn = kernel; } else { - using namespace torchao::kernels::cpu::aarch64::quantized_matmul:: + using namespace torchao::cpu::aarch64::quantized_matmul:: channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot; kernel_fn = kernel; } @@ -99,7 +99,7 @@ struct test_channelwise_8bit_channelwise_8bit_b< torchao::channelwise_8bit_a_channelwise_8bit_b_qmatmul_test_case:: generate(m, k, n, a_has_zeros, a_has_zeros, false, false); - using namespace torchao::kernels::cpu::aarch64::quantized_matmul:: + using namespace torchao::cpu::aarch64::quantized_matmul:: channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal; std::vector output(m * n); @@ -416,11 +416,11 @@ static void test_fp32_a_input_channelwise_8bit_b( kernel_fn_type kernel_fn = nullptr; if (test_case.use_gemm_kernel() && (m % 4 == 0)) { - using namespace torchao::kernels::cpu::aarch64::quantized_matmul:: + using namespace torchao::cpu::aarch64::quantized_matmul:: fp32_a_input_channelwise_8bit_b_4x16x4_f32; kernel_fn = kernel; } else { - using namespace torchao::kernels::cpu::aarch64::quantized_matmul:: + using namespace torchao::cpu::aarch64::quantized_matmul:: fp32_a_input_channelwise_8bit_b_1x16x4_f32; kernel_fn = kernel; } @@ -568,11 +568,11 @@ static void test_8bit_per_token_q_at_k_matmul_attention( const int); kernel_fn_type kernel_fn = nullptr; if ((s_q % 4 == 0) && (s_k % 8 == 0) && (d % 16 == 0)) { - using namespace torchao::kernels::cpu::aarch64::quantized_matmul:: + using namespace torchao::cpu::aarch64::quantized_matmul:: channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot; kernel_fn = kernel; } else { - using namespace torchao::kernels::cpu::aarch64::quantized_matmul:: + using namespace torchao::cpu::aarch64::quantized_matmul:: channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot; kernel_fn = kernel; } @@ -660,7 +660,7 @@ static void test_fp32_attn_scores_at_v_matmul_attention( torchao::fp32_a_channelwise_8bit_b_attn_scores_at_v_test_case::generate( b, s_attn, s_v, h, d, transpose_v); - using namespace torchao::kernels::cpu::aarch64::quantized_matmul:: + using namespace torchao::cpu::aarch64::quantized_matmul:: fp32_a_input_channelwise_8bit_b_f32; size_t attn_b_stride = test_case.b_attn_stride; diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_quantization.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_quantization.cpp index ebe3fbdfa8..402046445f 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_quantization.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_quantization.cpp @@ -49,7 +49,7 @@ TEST(test_quantize, ExpectedOutput) { int qmin, qmax, zero; float vmin, vmax, scale; - torchao::kernels::cpu::aarch64::reduction::find_min_and_max( + torchao::cpu::aarch64::reduction::find_min_and_max( vmin, vmax, vals.data(), vals.size()); std::vector qvals(vals.size()); @@ -61,7 +61,7 @@ TEST(test_quantize, ExpectedOutput) { torchao::quantization::get_scale_and_zero( scale, zero, vmin, vmax, qmin, qmax); - torchao::kernels::cpu::aarch64::quantization::quantize( + torchao::cpu::aarch64::quantization::quantize( qvals.data(), vals.data(), vals.size(), scale, zero, qmin, qmax); for (int i = 0; i < vals.size(); ++i) { diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_reduction.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_reduction.cpp index 44dbafafa5..dc2e106d66 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_reduction.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_reduction.cpp @@ -16,7 +16,7 @@ TEST(test_find_min_and_sum, SizeHasRemainderAfterDivideBy4) { auto vals = torchao::get_random_vector(19, -1.0, 1.0); float vmin, vmax; - torchao::kernels::cpu::aarch64::reduction::find_min_and_max( + torchao::cpu::aarch64::reduction::find_min_and_max( vmin, vmax, vals.data(), vals.size()); auto expected_vmin = *std::min_element(vals.begin(), vals.end()); @@ -28,7 +28,7 @@ TEST(test_find_min_and_sum, SizeHasRemainderAfterDivideBy4) { TEST(test_find_min_and_sum, SizeSmallerThan4) { auto vals = torchao::get_random_vector(3, -1.0, 1.0); float vmin, vmax; - torchao::kernels::cpu::aarch64::reduction::find_min_and_max( + torchao::cpu::aarch64::reduction::find_min_and_max( vmin, vmax, vals.data(), vals.size()); auto expected_vmin = *std::min_element(vals.begin(), vals.end()); @@ -39,7 +39,7 @@ TEST(test_find_min_and_sum, SizeSmallerThan4) { TEST(test_compute_sum, ExpectedOutput) { auto vals = torchao::get_random_lowbit_vector(/*size=*/19, /*int8*/ 3); - int sum = torchao::kernels::cpu::aarch64::reduction::compute_sum( + int sum = torchao::cpu::aarch64::reduction::compute_sum( (int8_t*)vals.data(), vals.size()); int expected_sum = std::accumulate(vals.begin(), vals.end(), 0); EXPECT_EQ(sum, expected_sum); @@ -47,7 +47,7 @@ TEST(test_compute_sum, ExpectedOutput) { TEST(test_compute_sum, SizeHasRemainderAfterDivideBy16) { auto vals = torchao::get_random_lowbit_vector(/*size=*/17, /*int8*/ 3); - int sum = torchao::kernels::cpu::aarch64::reduction::compute_sum( + int sum = torchao::cpu::aarch64::reduction::compute_sum( (int8_t*)vals.data(), vals.size()); int expected_sum = std::accumulate(vals.begin(), vals.end(), 0); EXPECT_EQ(sum, expected_sum); @@ -55,7 +55,7 @@ TEST(test_compute_sum, SizeHasRemainderAfterDivideBy16) { TEST(test_compute_sum, SizeSmallerThan16) { auto vals = torchao::get_random_lowbit_vector(/*size=*/3, /*int8*/ 3); - int sum = torchao::kernels::cpu::aarch64::reduction::compute_sum( + int sum = torchao::cpu::aarch64::reduction::compute_sum( (int8_t*)vals.data(), vals.size()); int expected_sum = std::accumulate(vals.begin(), vals.end(), 0); EXPECT_EQ(sum, expected_sum); diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_utils.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_utils.h index e5742d3f56..623c5215f5 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_utils.h +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_utils.h @@ -55,13 +55,13 @@ auto generate_per_token_quantized_tensor(int m, int n, bool transposed) { torchao::quantization::get_qvals_range( qmin, qmax, /*nbit=*/8, /*is_symmetric=*/false); for (int m_idx = 0; m_idx < m; m_idx++) { - torchao::kernels::cpu::aarch64::reduction::find_min_and_max( + torchao::cpu::aarch64::reduction::find_min_and_max( vmin, vmax, /*vals=*/activations.data() + m_idx * n, /*size=*/n); torchao::quantization::get_scale_and_zero( scale, zero, vmin, vmax, qmin, qmax); activation_scales[m_idx] = scale; activation_zeros[m_idx] = zero; - torchao::kernels::cpu::aarch64::quantization::quantize( + torchao::cpu::aarch64::quantization::quantize( /*qvals=*/activation_qvals.data() + m_idx * n, /*vals=*/activations.data() + m_idx * n, /*size=*/n, @@ -209,7 +209,7 @@ struct channelwise_8bit_activation_groupwise_lowbit_weight_test_case { int n_groups = (n * k) / weight_group_size; for (int group_idx = 0; group_idx < n_groups; group_idx += 1) { - torchao::kernels::cpu::aarch64::reduction::find_min_and_max( + torchao::cpu::aarch64::reduction::find_min_and_max( vmin, vmax, /*vals=*/weights.data() + group_idx * weight_group_size, @@ -230,7 +230,7 @@ struct channelwise_8bit_activation_groupwise_lowbit_weight_test_case { weight_scales[group_idx] = scale; weight_zeros[group_idx] = zero; - torchao::kernels::cpu::aarch64::quantization::quantize( + torchao::cpu::aarch64::quantization::quantize( /*qvals=*/weight_qvals.data() + group_idx * weight_group_size, /*vals=*/weights.data() + group_idx * weight_group_size, /*size=*/weight_group_size, diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_weight_packing.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_weight_packing.cpp index b64d4b2754..ed6e394899 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_weight_packing.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_weight_packing.cpp @@ -26,10 +26,10 @@ void test_weight_packing( has_bias, /*has_clamp*/ false); - // using namespace torchao::kernels::cpu::aarch64::linear::packing; + // using namespace torchao::cpu::aarch64::linear::packing; std::vector packed_weights( - torchao::kernels::cpu::aarch64::linear:: + torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing:: packed_weights_size( n, @@ -56,7 +56,7 @@ void test_weight_packing( std::vector weight_zeros_out(test_case.weight_zeros.size()); std::vector bias_out(test_case.bias.size()); - torchao::kernels::cpu::aarch64::linear:: + torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing:: pack_weights( packed_weights.data(), @@ -67,7 +67,7 @@ void test_weight_packing( weight_scales_in, weight_zeros_in, bias_in); - torchao::kernels::cpu::aarch64::linear:: + torchao::cpu::aarch64::linear:: channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing:: unpack_weights( weight_qvals_out.data(), diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/valpacking/interleave.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/valpacking/interleave.cpp index 3818fac2d0..05d347f1e2 100644 --- a/torchao/csrc/cpu/torch_free_kernels/aarch64/valpacking/interleave.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/valpacking/interleave.cpp @@ -42,7 +42,7 @@ // but it can be something else if we are applying this method // to a matrix tile. -void torchao::kernels::cpu::valpacking::interleave_data( +void torchao::cpu::valpacking::interleave_data( void* data_interleaved, const void* data, int bytes_per_val, diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/bitpack.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/bitpack.h index c28c6ec90d..ba981c6f8a 100644 --- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/bitpack.h +++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/bitpack.h @@ -16,7 +16,7 @@ #include #include -namespace torchao::kernels::cpu::fallback::bitpacking { +namespace torchao::cpu::fallback::bitpacking { namespace internal { /** * @brief Packs 128 unsigned 8-bit integers into a packed format of 'nbit' bits. @@ -176,4 +176,4 @@ inline void unpack_128_lowbit_values_with_lut( } } } // namespace internal -} // namespace torchao::kernels::cpu::fallback::bitpacking +} // namespace torchao::cpu::fallback::bitpacking diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint1.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint1.h index 08e231716b..da0e17c8fc 100644 --- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint1.h +++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint1.h @@ -9,7 +9,7 @@ #include #include -namespace torchao::kernels::cpu::fallback::bitpacking { +namespace torchao::cpu::fallback::bitpacking { namespace internal { /** @@ -151,4 +151,4 @@ TORCHAO_ALWAYS_INLINE inline void unpack_128_uint1_values( } } } // namespace internal -} // namespace torchao::kernels::cpu::fallback::bitpacking +} // namespace torchao::cpu::fallback::bitpacking diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint2.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint2.h index 9dc1cce463..c054c7fc9d 100644 --- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint2.h +++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint2.h @@ -8,7 +8,7 @@ #include #include -namespace torchao::kernels::cpu::fallback::bitpacking { +namespace torchao::cpu::fallback::bitpacking { namespace internal { /** @@ -116,4 +116,4 @@ TORCHAO_ALWAYS_INLINE inline void unpack_64_uint2_values( } } // namespace internal -} // namespace torchao::kernels::cpu::fallback::bitpacking +} // namespace torchao::cpu::fallback::bitpacking diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint3.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint3.h index 277317d5a2..314caee0a7 100644 --- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint3.h +++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint3.h @@ -9,7 +9,7 @@ #include #include -namespace torchao::kernels::cpu::fallback::bitpacking { +namespace torchao::cpu::fallback::bitpacking { namespace internal { /** @@ -192,4 +192,4 @@ TORCHAO_ALWAYS_INLINE inline void unpack_128_uint3_values( } } // namespace internal -} // namespace torchao::kernels::cpu::fallback::bitpacking +} // namespace torchao::cpu::fallback::bitpacking diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint4.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint4.h index 4b98a47143..528507a651 100644 --- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint4.h +++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint4.h @@ -9,7 +9,7 @@ #include #include -namespace torchao::kernels::cpu::fallback::bitpacking { +namespace torchao::cpu::fallback::bitpacking { namespace internal { /** * @brief Packs 2 bytes, each holding a 4-bit value (0-15), into a single @@ -106,4 +106,4 @@ TORCHAO_ALWAYS_INLINE inline void unpack_32_uint4_values( } } } // namespace internal -} // namespace torchao::kernels::cpu::fallback::bitpacking +} // namespace torchao::cpu::fallback::bitpacking diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint5.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint5.h index 3de577e05f..86c3c8d1f2 100644 --- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint5.h +++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint5.h @@ -9,7 +9,7 @@ #include #include -namespace torchao::kernels::cpu::fallback::bitpacking { +namespace torchao::cpu::fallback::bitpacking { namespace internal { /** diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint6.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint6.h index 2fcd9334ec..347f772951 100644 --- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint6.h +++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint6.h @@ -9,7 +9,7 @@ #include #include -namespace torchao::kernels::cpu::fallback::bitpacking { +namespace torchao::cpu::fallback::bitpacking { namespace internal { /** @@ -139,4 +139,4 @@ TORCHAO_ALWAYS_INLINE inline void unpack_64_uint6_values( } } // namespace internal -} // namespace torchao::kernels::cpu::fallback::bitpacking +} // namespace torchao::cpu::fallback::bitpacking diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint7.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint7.h index 60493a20b2..6d045132f9 100644 --- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint7.h +++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint7.h @@ -9,7 +9,7 @@ #include #include -namespace torchao::kernels::cpu::fallback::bitpacking { +namespace torchao::cpu::fallback::bitpacking { namespace internal { /** * @brief Packs 8 bytes, each holding a 7-bit value (0-127), into 7 bytes. @@ -137,4 +137,4 @@ TORCHAO_ALWAYS_INLINE inline void unpack_128_uint7_values( } } // namespace internal -} // namespace torchao::kernels::cpu::fallback::bitpacking +} // namespace torchao::cpu::fallback::bitpacking diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/channelwise_8bit_a_channelwise_8bit_b.h b/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/channelwise_8bit_a_channelwise_8bit_b.h index 3b070eb2b3..5e7b7eb9f7 100644 --- a/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/channelwise_8bit_a_channelwise_8bit_b.h +++ b/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/channelwise_8bit_a_channelwise_8bit_b.h @@ -8,7 +8,7 @@ #include -namespace torchao::kernels::cpu::fallback::quantized_matmul { +namespace torchao::cpu::fallback::quantized_matmul { namespace channelwise_8bit_a_channelwise_8bit_b::internal { template < @@ -85,10 +85,10 @@ struct KernelImpl { } // namespace // channelwise_8bit_a_channelwise_8bit_b::internal -} // namespace torchao::kernels::cpu::fallback::quantized_matmul +} // namespace torchao::cpu::fallback::quantized_matmul // TODO: Remove all ::kernels. No need for extra namespace. -namespace torchao::kernels::cpu::fallback::quantized_matmul { +namespace torchao::cpu::fallback::quantized_matmul { namespace channelwise_8bit_a_channelwise_8bit_b { template < bool a_has_zeros, @@ -130,4 +130,4 @@ void kernel( rhs_qparams_stride); } } // namespace channelwise_8bit_a_channelwise_8bit_b -} // namespace torchao::kernels::cpu::fallback::quantized_matmul +} // namespace torchao::cpu::fallback::quantized_matmul diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/fp32_a_channelwise_8bit_b_fp32_c.h b/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/fp32_a_channelwise_8bit_b_fp32_c.h index 58e2853617..f89b281891 100644 --- a/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/fp32_a_channelwise_8bit_b_fp32_c.h +++ b/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/fp32_a_channelwise_8bit_b_fp32_c.h @@ -8,8 +8,7 @@ #include -// TODO: Remove all ::kernels. No need for extra namespace. -namespace torchao::kernels::cpu::fallback::quantized_matmul { +namespace torchao::cpu::fallback::quantized_matmul { namespace fp32_a_input_channelwise_8bit_b_fp32 { template void kernel( @@ -47,4 +46,4 @@ void kernel( } } } // namespace fp32_a_input_channelwise_8bit_b_fp32 -} // namespace torchao::kernels::cpu::fallback::quantized_matmul +} // namespace torchao::cpu::fallback::quantized_matmul diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/tests/test_bitpacking.cpp b/torchao/csrc/cpu/torch_free_kernels/fallback/tests/test_bitpacking.cpp index 32177e63da..f672bc5d17 100644 --- a/torchao/csrc/cpu/torch_free_kernels/fallback/tests/test_bitpacking.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/fallback/tests/test_bitpacking.cpp @@ -23,9 +23,9 @@ TEST(FallbackBitpackingTest, PackUnpack8_uint1) { std::vector packed(packed_bytes); std::vector unpacked(unpacked_bytes); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_8_uint1_values( + torchao::cpu::fallback::bitpacking::internal::pack_8_uint1_values( packed.data(), input.data()); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_8_uint1_values( + torchao::cpu::fallback::bitpacking::internal::unpack_8_uint1_values( unpacked.data(), packed.data()); ASSERT_EQ(input, unpacked); @@ -38,9 +38,9 @@ TEST(FallbackBitpackingTest, PackUnpack4_uint2) { std::vector packed(packed_bytes); std::vector unpacked(unpacked_bytes); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_4_uint2_values( + torchao::cpu::fallback::bitpacking::internal::pack_4_uint2_values( packed.data(), input.data()); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_4_uint2_values( + torchao::cpu::fallback::bitpacking::internal::unpack_4_uint2_values( unpacked.data(), packed.data()); ASSERT_EQ(input, unpacked); @@ -53,9 +53,9 @@ TEST(FallbackBitpackingTest, PackUnpack8_uint3) { std::vector packed(packed_bytes); std::vector unpacked(unpacked_bytes); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_8_uint3_values( + torchao::cpu::fallback::bitpacking::internal::pack_8_uint3_values( packed.data(), input.data()); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_8_uint3_values( + torchao::cpu::fallback::bitpacking::internal::unpack_8_uint3_values( unpacked.data(), packed.data()); ASSERT_EQ(input, unpacked); @@ -68,9 +68,9 @@ TEST(FallbackBitpackingTest, PackUnpack32_uint4) { std::vector packed(packed_bytes); std::vector unpacked(unpacked_bytes); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values( + torchao::cpu::fallback::bitpacking::internal::pack_32_uint4_values( packed.data(), input.data()); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint4_values( + torchao::cpu::fallback::bitpacking::internal::unpack_32_uint4_values( unpacked.data(), packed.data()); ASSERT_EQ(input, unpacked); @@ -83,9 +83,9 @@ TEST(FallbackBitpackingTest, PackUnpack8_uint5) { std::vector packed(packed_bytes); std::vector unpacked(unpacked_bytes); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_8_uint5_values( + torchao::cpu::fallback::bitpacking::internal::pack_8_uint5_values( packed.data(), input.data()); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_8_uint5_values( + torchao::cpu::fallback::bitpacking::internal::unpack_8_uint5_values( unpacked.data(), packed.data()); ASSERT_EQ(input, unpacked); @@ -98,9 +98,9 @@ TEST(FallbackBitpackingTest, PackUnpack4_uint6) { std::vector packed(packed_bytes); std::vector unpacked(unpacked_bytes); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_4_uint6_values( + torchao::cpu::fallback::bitpacking::internal::pack_4_uint6_values( packed.data(), input.data()); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_4_uint6_values( + torchao::cpu::fallback::bitpacking::internal::unpack_4_uint6_values( unpacked.data(), packed.data()); ASSERT_EQ(input, unpacked); @@ -113,9 +113,9 @@ TEST(FallbackBitpackingTest, PackUnpack8_uint7) { std::vector packed(packed_bytes); std::vector unpacked(unpacked_bytes); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_8_uint7_values( + torchao::cpu::fallback::bitpacking::internal::pack_8_uint7_values( packed.data(), input.data()); - torchao::kernels::cpu::fallback::bitpacking::internal::unpack_8_uint7_values( + torchao::cpu::fallback::bitpacking::internal::unpack_8_uint7_values( unpacked.data(), packed.data()); ASSERT_EQ(input, unpacked); @@ -131,9 +131,9 @@ void test_bitpacking_128_lowbit_values() { std::vector packed(packed_bytes); std::vector unpacked(unpacked_bytes); - torchao::kernels::cpu::fallback::bitpacking::internal:: + torchao::cpu::fallback::bitpacking::internal:: pack_128_lowbit_int_values(packed.data(), input.data()); - torchao::kernels::cpu::fallback::bitpacking::internal:: + torchao::cpu::fallback::bitpacking::internal:: unpack_128_lowbit_int_values(unpacked.data(), packed.data()); ASSERT_EQ(input, unpacked); @@ -159,31 +159,31 @@ void test_bitpacking_128_lowbit_values_with_lut() { // 3. Pack the indices std::vector packed(packed_bytes); if constexpr (nbit == 1) - torchao::kernels::cpu::fallback::bitpacking::internal:: + torchao::cpu::fallback::bitpacking::internal:: pack_128_uint1_values(packed.data(), indices.data()); if constexpr (nbit == 2) { - torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint2_values( + torchao::cpu::fallback::bitpacking::internal::pack_64_uint2_values( packed.data(), indices.data()); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint2_values( + torchao::cpu::fallback::bitpacking::internal::pack_64_uint2_values( packed.data() + 16, indices.data() + 64); } if constexpr (nbit == 3) - torchao::kernels::cpu::fallback::bitpacking::internal:: + torchao::cpu::fallback::bitpacking::internal:: pack_128_uint3_values(packed.data(), indices.data()); if constexpr (nbit == 4) { - torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values( + torchao::cpu::fallback::bitpacking::internal::pack_32_uint4_values( packed.data(), indices.data()); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values( + torchao::cpu::fallback::bitpacking::internal::pack_32_uint4_values( packed.data() + 16, indices.data() + 32); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values( + torchao::cpu::fallback::bitpacking::internal::pack_32_uint4_values( packed.data() + 32, indices.data() + 64); - torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values( + torchao::cpu::fallback::bitpacking::internal::pack_32_uint4_values( packed.data() + 48, indices.data() + 96); } // 4. Unpack using the LUT function std::vector unpacked(unpacked_bytes); - torchao::kernels::cpu::fallback::bitpacking::internal:: + torchao::cpu::fallback::bitpacking::internal:: unpack_128_lowbit_values_with_lut( unpacked.data(), packed.data(), lut.data()); diff --git a/torchao/csrc/cpu/torch_free_kernels/interface/quantized_matmul.h b/torchao/csrc/cpu/torch_free_kernels/interface/quantized_matmul.h index da3fd32747..b8d99dc563 100644 --- a/torchao/csrc/cpu/torch_free_kernels/interface/quantized_matmul.h +++ b/torchao/csrc/cpu/torch_free_kernels/interface/quantized_matmul.h @@ -15,7 +15,7 @@ #include #endif // defined(__aarch64__) && defined(__ARM_NEON) -namespace torchao::kernels::cpu::quantized_matmul { +namespace torchao::cpu::quantized_matmul { /* a_stride_m: stride of a in memory to indiciate how far apart each row is. @@ -78,10 +78,10 @@ get_int8_a_int8_b_channelwise_qmatmul( if (b_transposed) { a_stride_m = k; b_stride_n = k; - return torchao::kernels::cpu::fallback::quantized_matmul:: + return torchao::cpu::fallback::quantized_matmul:: channelwise_8bit_a_channelwise_8bit_b::kernel; } else { - return torchao::kernels::cpu::fallback::quantized_matmul:: + return torchao::cpu::fallback::quantized_matmul:: channelwise_8bit_a_channelwise_8bit_b::kernel; } } @@ -144,13 +144,13 @@ get_fp32_a_input_channelwise_8bit_b_f32_c_matmul( if (b_transposed) { a_stride_m = k; b_stride_n = k; - return torchao::kernels::cpu::fallback::quantized_matmul:: + return torchao::cpu::fallback::quantized_matmul:: fp32_a_input_channelwise_8bit_b_fp32::kernel; } else { a_stride_m = k; b_stride_n = n; - return torchao::kernels::cpu::fallback::quantized_matmul:: + return torchao::cpu::fallback::quantized_matmul:: fp32_a_input_channelwise_8bit_b_fp32::kernel; } } -} // namespace torchao::kernels::cpu::quantized_matmul +} // namespace torchao::cpu::quantized_matmul diff --git a/torchao/csrc/cpu/torch_free_kernels/interface/test_qmatmul_interface.cpp b/torchao/csrc/cpu/torch_free_kernels/interface/test_qmatmul_interface.cpp index 5ce1593732..8f091a9138 100644 --- a/torchao/csrc/cpu/torch_free_kernels/interface/test_qmatmul_interface.cpp +++ b/torchao/csrc/cpu/torch_free_kernels/interface/test_qmatmul_interface.cpp @@ -297,7 +297,7 @@ struct test_channelwise_8bit_channelwise_8bit_b< m, k, n, a_has_zeros, a_has_zeros, false, true, stride); int a_stride_m, b_stride_n; - auto kernel = torchao::kernels::cpu::quantized_matmul:: + auto kernel = torchao::cpu::quantized_matmul:: get_int8_a_int8_b_channelwise_qmatmul( m, n, k, false, true, a_stride_m, b_stride_n); a_stride_m = a_stride_m * stride; @@ -563,7 +563,7 @@ static void test_fp32_a_input_channelwise_8bit_b( test_case.execute(beta); int a_stride_m, b_stride_n; - auto kernel = torchao::kernels::cpu::quantized_matmul:: + auto kernel = torchao::cpu::quantized_matmul:: get_fp32_a_input_channelwise_8bit_b_f32_c_matmul( m, n, k, false, false, a_stride_m, b_stride_n); b_stride_n = b_stride_n * stride; diff --git a/torchao/experimental/kernels/mps/src/dispatch.h b/torchao/experimental/kernels/mps/src/dispatch.h index a04452cece..92b1f619ef 100644 --- a/torchao/experimental/kernels/mps/src/dispatch.h +++ b/torchao/experimental/kernels/mps/src/dispatch.h @@ -8,7 +8,7 @@ #include -namespace torchao::kernels::mps::lowbit::dispatch { +namespace torchao::mps::lowbit::dispatch { inline void dispatch_mm( id encoder, @@ -48,4 +48,4 @@ inline void dispatch_qmv_fast( threadsPerThreadgroup:MTLSizeMake(32, 2, 1)]; } -} // namespace torchao::kernels::mps::lowbit::dispatch +} // namespace torchao::mps::lowbit::dispatch diff --git a/torchao/experimental/kernels/mps/src/lowbit.h b/torchao/experimental/kernels/mps/src/lowbit.h index 8071398eba..c7008034c0 100644 --- a/torchao/experimental/kernels/mps/src/lowbit.h +++ b/torchao/experimental/kernels/mps/src/lowbit.h @@ -14,7 +14,7 @@ #include // metal_lowbit_quantized_lib #include -namespace torchao::kernels::mps::lowbit { +namespace torchao::mps::lowbit { namespace { template @@ -175,4 +175,4 @@ struct LowBitQuantWeights { static constexpr auto pack = LowBitConfig::packing_fn; }; -} // namespace torchao::kernels::mps::lowbit +} // namespace torchao::mps::lowbit diff --git a/torchao/experimental/kernels/mps/src/packing.h b/torchao/experimental/kernels/mps/src/packing.h index 5412c04a12..18fa19ca1b 100644 --- a/torchao/experimental/kernels/mps/src/packing.h +++ b/torchao/experimental/kernels/mps/src/packing.h @@ -4,7 +4,7 @@ // This source code is licensed under the license found in the // LICENSE file in the root directory of this source tree. -namespace torchao::kernels::mps::lowbit::packing { +namespace torchao::mps::lowbit::packing { /** * Pack weights into a smaller number of bits. @@ -187,4 +187,4 @@ pack<7>(const uint8_t* w_ptr, uint8_t* b_ptr, int32_t N, int32_t K) { } } -} // namespace torchao::kernels::mps::lowbit::packing +} // namespace torchao::mps::lowbit::packing diff --git a/torchao/experimental/kernels/mps/test/test_lowbit.mm b/torchao/experimental/kernels/mps/test/test_lowbit.mm index 8481e5cef6..e00570247a 100644 --- a/torchao/experimental/kernels/mps/test/test_lowbit.mm +++ b/torchao/experimental/kernels/mps/test/test_lowbit.mm @@ -37,7 +37,7 @@ return rc; } -namespace torchao::kernels::mps::lowbit { +namespace torchao::mps::lowbit { // Reference CPU implementation of lowbit quantized linear template @@ -184,11 +184,11 @@ void allocBuffers(id device) { id buf_Z; // (K/group_size)xN elements }; -} // namespace torchao::kernels::mps::lowbit +} // namespace torchao::mps::lowbit template void run_test(int32_t m, int32_t k, int32_t n, int32_t group_size) { - torchao::kernels::mps::lowbit::LowBitTester tester( + torchao::mps::lowbit::LowBitTester tester( m, k, n, group_size); tester.init(); tester.pack(); diff --git a/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_aten.mm b/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_aten.mm index b8ecb8c7aa..d0acd7d695 100644 --- a/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_aten.mm +++ b/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_aten.mm @@ -12,7 +12,7 @@ #include // clang-format on -namespace torchao::kernels::mps::lowbit::aten { +namespace torchao::mps::lowbit::aten { using Tensor = at::Tensor; using namespace at::native::mps; @@ -240,7 +240,7 @@ Tensor pack_weights_cpu_kernel(const Tensor& W) { m.impl("_linear_fp_act_7bit_weight", &linear_mps_kernel_meta<7>); } -} // namespace torchao::kernels::mps::lowbit::aten +} // namespace torchao::mps::lowbit::aten // c-shim wrappers for AOTInductor diff --git a/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_executorch.mm b/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_executorch.mm index 22693b417e..ca0c2e5a0a 100644 --- a/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_executorch.mm +++ b/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_executorch.mm @@ -94,7 +94,7 @@ bool check_linear_mps_args( auto N = B.size(0); auto K = A.size(1); - torchao::kernels::mps::lowbit::LowBitQuantWeights::linear( + torchao::mps::lowbit::LowBitQuantWeights::linear( {getMTLBufferStorage(A), A.storage_offset() * A.element_size()}, {getMTLBufferStorage(B), B.storage_offset() * B.element_size()}, group_size,