pytorch · syedshazli · Dec 18, 2025
diff --git a/torchao/csrc/cpu/shared_kernels/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp b/torchao/csrc/cpu/shared_kernels/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp
@@ -18,7 +18,7 @@ template <int weight_nbit, bool has_weight_zeros, bool has_bias, bool has_clamp>
 UKernelConfig get_ukernel_config() {
   UKernelConfig config;
 
-  namespace ukernel = torchao::kernels::cpu::aarch64::linear::
+  namespace ukernel = torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot;
   config.mr = 1;
   config.nr = 8;

diff --git a/torchao/csrc/cpu/shared_kernels/embedding_xbit/op_embedding_xbit-impl.h b/torchao/csrc/cpu/shared_kernels/embedding_xbit/op_embedding_xbit-impl.h
@@ -133,7 +133,7 @@ Tensor embedding_out_cpu(
     }
     TORCHAO_CHECK(index >= 0 && index < num_embeddings, "index out of bounds");
 #if defined(TORCHAO_BUILD_CPU_AARCH64)
-    torchao::kernels::cpu::aarch64::embedding::embedding<weight_nbit>(
+    torchao::cpu::aarch64::embedding::embedding<weight_nbit>(
         out.mutable_data_ptr<float>() + idx * embedding_dim,
         embedding_dim,
         group_size,
@@ -199,7 +199,7 @@ Tensor pack_embedding_cpu(const Tensor& weight_qvals) {
 
   torchao::parallel_1d(0, num_embeddings, [&](int64_t idx) {
 #if defined(TORCHAO_BUILD_CPU_AARCH64)
-    torchao::kernels::cpu::aarch64::embedding::pack_embedding_weight_qvals<
+    torchao::cpu::aarch64::embedding::pack_embedding_weight_qvals<
         weight_nbit>(
         out.mutable_data_ptr<int8_t>() +
             torchao::ops::PackedWeightsHeader::size(),
@@ -289,7 +289,7 @@ Tensor shared_embedding_out_cpu(
     }
     TORCHAO_CHECK(index >= 0 && index < n, "index out of bounds");
 #if defined(TORCHAO_BUILD_CPU_AARCH64)
-    torchao::kernels::cpu::aarch64::embedding::
+    torchao::cpu::aarch64::embedding::
         shared_embedding<weight_nbit, nr, kr, sr>(
             out.mutable_data_ptr<float>() + idx * k,
             packed_weights.const_data_ptr<int8_t>() +

diff --git a/torchao/csrc/cpu/shared_kernels/groupwise_lowbit_weight_lut/kernel_selector.h b/torchao/csrc/cpu/shared_kernels/groupwise_lowbit_weight_lut/kernel_selector.h
@@ -117,7 +117,7 @@ void register_ukernel_config(
   int preferred_alignment = 16;
 
   namespace kernel_api =
-      torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
+      torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
 
   using kernel_fn_ptr_t =
       decltype(&kernel_api::groupwise_lowbit_weight_lut_kernel_1x4x32<

diff --git a/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight/kernel_selector.h b/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight/kernel_selector.h
@@ -97,7 +97,7 @@ void register_ukernel_config_universal(
       torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_universal,
       weight_nbit);
 
-  namespace kernel = torchao::kernels::cpu::aarch64::linear::
+  namespace kernel = torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   constexpr bool has_lut = false;
@@ -181,7 +181,7 @@ void register_ukernel_config_lut(
     int preferred_alignment = 16;
 
     #if defined(TORCHAO_ENABLE_ARM_NEON_DOT)
-    namespace kernel = torchao::kernels::cpu::aarch64::linear::
+    namespace kernel = torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
     if (!cpuinfo_has_arm_neon_dot()) {
@@ -232,7 +232,7 @@ void register_ukernel_config_lut(
 template <typename kernel_struct>
 UKernelConfig::linear_config_type
 get_linear_config_kleidi(int n_step, int nr, int kr, int sr) {
-  namespace op = torchao::kernels::cpu::aarch64::kleidi::
+  namespace op = torchao::cpu::aarch64::kleidi::
       kai_matmul_clamp_f32_qai8dxp_qsi4c32p;
   assert(n_step == kernel_struct::get_ukernel().get_n_step());
   assert(nr == kernel_struct::get_ukernel().get_nr());
@@ -256,7 +256,7 @@ void register_ukernel_config_kleidi(
     throw std::runtime_error("Failed to initialize cpuinfo!");
   }
   check_format(format, torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_kleidi_ai, weight_nbit);
-  namespace op = torchao::kernels::cpu::aarch64::kleidi::
+  namespace op = torchao::cpu::aarch64::kleidi::
       kai_matmul_clamp_f32_qai8dxp_qsi4c32p;
 
   auto uk = UKernelConfig::make(

diff --git a/torchao/csrc/cpu/shared_kernels/tests/test_groupwise_lowbit_weight_lut.cpp b/torchao/csrc/cpu/shared_kernels/tests/test_groupwise_lowbit_weight_lut.cpp
@@ -19,7 +19,7 @@ using namespace torchao::ops::groupwise_lowbit_weight_lut;
 template <int weight_nbit, bool has_scales>
 UKernelConfig get_ukernel_config(bool has_bias) {
   namespace kernel =
-      torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
+      torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
 
   int preferred_alignment = 16;
   int n_step = 8;

diff --git a/torchao/csrc/cpu/shared_kernels/tests/test_linear_8bit_act_xbit_weight.cpp b/torchao/csrc/cpu/shared_kernels/tests/test_linear_8bit_act_xbit_weight.cpp
@@ -16,7 +16,7 @@
 
 #if defined(TORCHAO_ENABLE_KLEIDI)
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h>
-using namespace torchao::kernels::cpu::aarch64::kleidi::
+using namespace torchao::cpu::aarch64::kleidi::
     kai_matmul_clamp_f32_qai8dxp_qsi4c32p;
 #endif // TORCHAO_ENABLE_KLEIDI
 
@@ -27,7 +27,7 @@ using namespace torchao::ops::linear_8bit_act_xbit_weight;
 
 template <int weight_nbit, bool has_weight_zeros, bool has_bias, bool has_clamp, bool has_lut = false>
 UKernelConfig get_ukernel_config() {
-  namespace kernel = torchao::kernels::cpu::aarch64::linear::
+  namespace kernel = torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   int preferred_alignment = 16;
@@ -213,7 +213,7 @@ enum kai_kernel_id {
 
 template <typename kernel_struct>
 UKernelConfig get_ukernel_config_kleidi_impl() {
-  namespace op = torchao::kernels::cpu::aarch64::kleidi::
+  namespace op = torchao::cpu::aarch64::kleidi::
       kai_matmul_clamp_f32_qai8dxp_qsi4c32p;
 
   auto uk = kernel_struct::get_ukernel();

diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_linear.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_linear.cpp
@@ -19,7 +19,7 @@ channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot(
   int k = state.range(2);
   int group_size = state.range(3);
 
-  using namespace torchao::kernels::cpu::aarch64::linear::
+  using namespace torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot;
 
   auto test_case = torchao::
@@ -91,7 +91,7 @@ channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot(
   int k = state.range(2);
   int group_size = state.range(3);
 
-  using namespace torchao::kernels::cpu::aarch64::linear::
+  using namespace torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   auto test_case = torchao::
@@ -163,7 +163,7 @@ channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot(
   int k = state.range(2);
   int group_size = state.range(3);
 
-  using namespace torchao::kernels::cpu::aarch64::linear::
+  using namespace torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   auto test_case = torchao::

diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_quantization.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_quantization.cpp
@@ -21,7 +21,7 @@ static void benchmark_quantize(benchmark::State& state) {
   float vmin, vmax, scale;
 
   for (auto _ : state) {
-    torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
+    torchao::cpu::aarch64::reduction::find_min_and_max(
         vmin, vmax, vals.data(), vals.size());
 
     torchao::quantization::get_qvals_range(
@@ -30,7 +30,7 @@ static void benchmark_quantize(benchmark::State& state) {
     torchao::quantization::get_scale_and_zero(
         scale, zero, vmin, vmax, qmin, qmax);
 
-    torchao::kernels::cpu::aarch64::quantization::quantize(
+    torchao::cpu::aarch64::quantization::quantize(
         qvals.data(), vals.data(), vals.size(), scale, zero, qmin, qmax);
   }
 }

diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding.h
@@ -15,7 +15,7 @@
 #include <cassert>
 #include <vector>
 
-namespace torchao::kernels::cpu::aarch64::embedding {
+namespace torchao::cpu::aarch64::embedding {
 
 namespace internal {
 
@@ -353,7 +353,7 @@ inline void shared_embedding(
   n_idx = n_idx * nr;
   int j = index - n_idx;
 
-  torchao::kernels::cpu::aarch64::linear::
+  torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing::
           unpack_weights_at_n_idx<weight_nbit, nr, kr, sr>(
               weight_qvals.data(),
@@ -381,6 +381,6 @@ inline void shared_embedding(
   }
 }
 
-} // namespace torchao::kernels::cpu::aarch64::embedding
+} // namespace torchao::cpu::aarch64::embedding
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding_lut.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding_lut.h
@@ -14,7 +14,7 @@
 #include <cassert>
 #include <vector>
 
-namespace torchao::kernels::cpu::aarch64::embedding {
+namespace torchao::cpu::aarch64::embedding {
 
 /**
  * @brief Calculates the size in bytes for a single row of packed embeddings.
@@ -377,6 +377,6 @@ inline void dequantize_embedding_row_at_idx_lut(
     vst1q_f32(out + j + 12, out3);
   }
 }
-} // namespace torchao::kernels::cpu::aarch64::embedding
+} // namespace torchao::cpu::aarch64::embedding
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h
@@ -30,7 +30,7 @@
 
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/pack.h>
 
-namespace torchao::kernels::cpu::aarch64::kleidi {
+namespace torchao::cpu::aarch64::kleidi {
 
 // Helper functions
 // TODO: find a better place for these?
@@ -319,4 +319,4 @@ DEFINE_KERNEL_STRUCT(matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm);
 #undef DEFINE_KERNEL_STRUCT
 
 } // namespace kai_matmul_clamp_f32_qai8dxp_qsi4c32p
-} // namespace torchao::kernels::cpu::aarch64::kleidi
+} // namespace torchao::cpu::aarch64::kleidi
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/pack.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/pack.h
@@ -12,7 +12,7 @@
 #include <kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h>
 #include <kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.h>
 
-namespace torchao::kernels::cpu::aarch64::kleidi {
+namespace torchao::cpu::aarch64::kleidi {
 namespace kai_matmul_clamp_f32_qai8dxp_qsi4c32p {
 // All the kernels in this namespace use following packing interface/routines.
 // TODO: move these to Kleidi as interfaces?
@@ -115,4 +115,4 @@ lhs_packing get_lhs_packing() {
 }
 
 } // namespace kai_matmul_clamp_f32_qai8dxp_qsi4c32p
-} // namespace torchao::kernels::cpu::aarch64::kleidi
+} // namespace torchao::cpu::aarch64::kleidi
diff --git a/..._activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h b/..._activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h
@@ -17,7 +17,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h>
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h>
 
-namespace torchao::kernels::cpu::aarch64::linear::
+namespace torchao::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight {
 
 inline size_t packed_activations_size(
@@ -148,7 +148,7 @@ void pack_weights_with_lut(
   (void)nr; // unused
   (void)kr; // unused
   (void)sr; // unused
-  torchao::kernels::cpu::aarch64::linear::
+  torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing::
           pack_weights_with_lut<weight_nbit, nr_, kr_, sr_>(
               packed_weights,
@@ -298,6 +298,6 @@ void kernel_1x8x16_f32_neondot(
 }
 
 } // namespace
-  // torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight
+  // torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/...near/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x1x32_f32_neondot-impl.h b/...near/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x1x32_f32_neondot-impl.h
@@ -11,7 +11,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/bitpacking/bitpack.h>
 #include <cassert>
 
-namespace torchao::kernels::cpu::aarch64::linear::
+namespace torchao::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight::kernel {
 
 namespace internal {
@@ -174,6 +174,6 @@ void kernel_1x1x32_f32_neondot(
 }
 
 } // namespace
-  // torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel
+  // torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/...near/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h b/...near/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h
@@ -12,7 +12,7 @@
 #include <cassert>
 #include <cstring>
 
-namespace torchao::kernels::cpu::aarch64::linear::
+namespace torchao::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight::kernel {
 namespace internal {
 inline float32x4_t clamp(float32x4_t x, float min, float max) {
@@ -245,6 +245,6 @@ void kernel_1x4x16_f32_neondot(
 }
 
 } // namespace
-  // torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel
+  // torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/...near/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h b/...near/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h
@@ -12,7 +12,7 @@
 #include <cassert>
 #include <cstring>
 
-namespace torchao::kernels::cpu::aarch64::linear::
+namespace torchao::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight::kernel {
 namespace internal {
 
@@ -361,6 +361,6 @@ void kernel_1x8x16_f32_neondot(
 }
 
 } // namespace
-  // torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel
+  // torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/...els/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_activations.h b/...els/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_activations.h
@@ -12,7 +12,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/reduction.h>
 #include <cassert>
 
-namespace torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing {
+namespace torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing {
 
 // Prepares activation data for kernel_impl.
 //   Per m_idx (row), activations are stored as follows:
@@ -72,7 +72,7 @@ void inline pack_activations(
       qmin, qmax, /*nbit=*/8, /*is_symmetric=*/false);
 
   for (int m_idx = 0; m_idx < m; m_idx++) {
-    torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
+    torchao::cpu::aarch64::reduction::find_min_and_max(
         vmin, vmax, activations, k);
     torchao::quantization::get_scale_and_zero(
         scale, zero, vmin, vmax, qmin, qmax);
@@ -86,7 +86,7 @@ void inline pack_activations(
 
     if (has_weight_zeros) {
       for (int k_idx = 0; k_idx < k; k_idx += group_size) {
-        torchao::kernels::cpu::aarch64::quantization::quantize(
+        torchao::cpu::aarch64::quantization::quantize(
             /*qvals=*/(int8_t*)activation_data_byte_ptr,
             /*vals=*/activations,
             /*size=*/group_size,
@@ -95,7 +95,7 @@ void inline pack_activations(
             /*qmin=*/qmin,
             /*qmax=*/qmax);
 
-        qvals_sum = torchao::kernels::cpu::aarch64::reduction::compute_sum(
+        qvals_sum = torchao::cpu::aarch64::reduction::compute_sum(
             /*vals=*/(int8_t*)activation_data_byte_ptr,
             /*size=*/group_size);
 
@@ -107,7 +107,7 @@ void inline pack_activations(
         activations += group_size;
       }
     } else {
-      torchao::kernels::cpu::aarch64::quantization::quantize(
+      torchao::cpu::aarch64::quantization::quantize(
           /*qvals=*/(int8_t*)activation_data_byte_ptr,
           /*vals=*/activations,
           /*size=*/k,
@@ -121,6 +121,6 @@ void inline pack_activations(
   }
 }
 
-} // namespace torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing
+} // namespace torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/...kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_weights.h b/...kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_weights.h
@@ -9,7 +9,7 @@
 #include <array>
 #include <cstring>
 
-namespace torchao::kernels::cpu::aarch64::linear::
+namespace torchao::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing {
 
 namespace internal {
@@ -583,6 +583,6 @@ size_t inline packed_weights_with_lut_size(
 }
 
 } // namespace
-  // torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing
+  // torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/...u/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/groupwise_lowbit_weight_lut.h b/...u/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/groupwise_lowbit_weight_lut.h
@@ -11,7 +11,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_activations.h>
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_weights.h>
 
-namespace torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut {
+namespace torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut {
 
 /**
  * @brief Calculates the total size in bytes required for the packed weight.
@@ -258,6 +258,6 @@ inline size_t packed_weights_offset(
   return (n_idx / nr) * packed_tile_size_for_nr_cols;
 }
 } // namespace
-  // torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut
+  // torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)