From 5753bd1635a7e4ac7d1e52687ffa4008e9327f99 Mon Sep 17 00:00:00 2001
From: syedshazli <syed.abdu1818@gmail.com>
Date: Thu, 18 Dec 2025 12:45:02 -0500
Subject: [PATCH] Remove the ::kernels component from torchao:: namespaces

---
 .../benchmark_linear_8bit_act_xbit_weight.cpp |  2 +-
 .../embedding_xbit/op_embedding_xbit-impl.h   |  6 +-
 .../kernel_selector.h                         |  2 +-
 .../kernel_selector.h                         |  8 +--
 .../test_groupwise_lowbit_weight_lut.cpp      |  2 +-
 .../test_linear_8bit_act_xbit_weight.cpp      |  6 +-
 .../aarch64/benchmarks/benchmark_linear.cpp   |  6 +-
 .../benchmarks/benchmark_quantization.cpp     |  4 +-
 .../aarch64/embedding/embedding.h             |  6 +-
 .../aarch64/embedding/embedding_lut.h         |  4 +-
 .../kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h   |  4 +-
 .../torch_free_kernels/aarch64/kleidi/pack.h  |  4 +-
 ..._8bit_activation_groupwise_lowbit_weight.h |  6 +-
 .../kernel_1x1x32_f32_neondot-impl.h          |  4 +-
 .../kernel_1x4x16_f32_neondot-impl.h          |  4 +-
 .../kernel_1x8x16_f32_neondot-impl.h          |  4 +-
 .../pack_activations.h                        | 12 ++--
 .../pack_weights.h                            |  4 +-
 .../groupwise_lowbit_weight_lut.h             |  4 +-
 .../groupwise_lowbit_weight/kernel_f32-impl.h |  6 +-
 .../pack_activations.h                        |  4 +-
 .../groupwise_lowbit_weight/pack_weights.h    |  4 +-
 ...hannelwise_8bit_b_1x16x16_f32_smlal-impl.h |  6 +-
 ...annelwise_8bit_b_1x8x16_f32_neondot-impl.h |  6 +-
 ...hannelwise_8bit_b_4x8x8_f32_neondot-impl.h |  6 +-
 ...input_channelwise_8bit_b_1x16x4_f32_impl.h |  6 +-
 ...input_channelwise_8bit_b_4x16x4_f32_impl.h |  6 +-
 .../aarch64/matmul/matmul.h                   |  4 +-
 .../aarch64/matmul/matmul_utils.h             |  4 +-
 .../aarch64/quantization/quantize.cpp         |  2 +-
 .../aarch64/reduction/compute_sum.cpp         |  2 +-
 .../aarch64/reduction/find_min_and_max.cpp    |  2 +-
 .../test_bitpack_fallback_compatibility.cpp   | 56 +++++++++----------
 .../aarch64/tests/test_embedding.cpp          | 10 ++--
 .../aarch64/tests/test_embedding_lut.cpp      |  6 +-
 .../aarch64/tests/test_linear.cpp             |  8 +--
 .../aarch64/tests/test_lut.cpp                |  4 +-
 .../aarch64/tests/test_qmatmul.cpp            | 16 +++---
 .../aarch64/tests/test_quantization.cpp       |  4 +-
 .../aarch64/tests/test_reduction.cpp          | 10 ++--
 .../aarch64/tests/test_utils.h                |  8 +--
 .../aarch64/tests/test_weight_packing.cpp     |  8 +--
 .../aarch64/valpacking/interleave.cpp         |  2 +-
 .../fallback/bitpacking/bitpack.h             |  4 +-
 .../fallback/bitpacking/uint1.h               |  4 +-
 .../fallback/bitpacking/uint2.h               |  4 +-
 .../fallback/bitpacking/uint3.h               |  4 +-
 .../fallback/bitpacking/uint4.h               |  4 +-
 .../fallback/bitpacking/uint5.h               |  2 +-
 .../fallback/bitpacking/uint6.h               |  4 +-
 .../fallback/bitpacking/uint7.h               |  4 +-
 .../channelwise_8bit_a_channelwise_8bit_b.h   |  8 +--
 .../matmul/fp32_a_channelwise_8bit_b_fp32_c.h |  5 +-
 .../fallback/tests/test_bitpacking.cpp        | 50 ++++++++---------
 .../interface/quantized_matmul.h              | 12 ++--
 .../interface/test_qmatmul_interface.cpp      |  4 +-
 .../experimental/kernels/mps/src/dispatch.h   |  4 +-
 torchao/experimental/kernels/mps/src/lowbit.h |  4 +-
 .../experimental/kernels/mps/src/packing.h    |  4 +-
 .../kernels/mps/test/test_lowbit.mm           |  6 +-
 .../ops/mps/linear_fp_act_xbit_weight_aten.mm |  4 +-
 .../linear_fp_act_xbit_weight_executorch.mm   |  2 +-
 62 files changed, 207 insertions(+), 208 deletions(-)

diff --git a/torchao/csrc/cpu/shared_kernels/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp b/torchao/csrc/cpu/shared_kernels/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp
index caf03acf21..24418ec510 100644
--- a/torchao/csrc/cpu/shared_kernels/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp
+++ b/torchao/csrc/cpu/shared_kernels/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp
@@ -18,7 +18,7 @@ template <int weight_nbit, bool has_weight_zeros, bool has_bias, bool has_clamp>
 UKernelConfig get_ukernel_config() {
   UKernelConfig config;
 
-  namespace ukernel = torchao::kernels::cpu::aarch64::linear::
+  namespace ukernel = torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot;
   config.mr = 1;
   config.nr = 8;
diff --git a/torchao/csrc/cpu/shared_kernels/embedding_xbit/op_embedding_xbit-impl.h b/torchao/csrc/cpu/shared_kernels/embedding_xbit/op_embedding_xbit-impl.h
index 6c1181873b..e1f1a2a39a 100644
--- a/torchao/csrc/cpu/shared_kernels/embedding_xbit/op_embedding_xbit-impl.h
+++ b/torchao/csrc/cpu/shared_kernels/embedding_xbit/op_embedding_xbit-impl.h
@@ -133,7 +133,7 @@ Tensor embedding_out_cpu(
     }
     TORCHAO_CHECK(index >= 0 && index < num_embeddings, "index out of bounds");
 #if defined(TORCHAO_BUILD_CPU_AARCH64)
-    torchao::kernels::cpu::aarch64::embedding::embedding<weight_nbit>(
+    torchao::cpu::aarch64::embedding::embedding<weight_nbit>(
         out.mutable_data_ptr<float>() + idx * embedding_dim,
         embedding_dim,
         group_size,
@@ -199,7 +199,7 @@ Tensor pack_embedding_cpu(const Tensor& weight_qvals) {
 
   torchao::parallel_1d(0, num_embeddings, [&](int64_t idx) {
 #if defined(TORCHAO_BUILD_CPU_AARCH64)
-    torchao::kernels::cpu::aarch64::embedding::pack_embedding_weight_qvals<
+    torchao::cpu::aarch64::embedding::pack_embedding_weight_qvals<
         weight_nbit>(
         out.mutable_data_ptr<int8_t>() +
             torchao::ops::PackedWeightsHeader::size(),
@@ -289,7 +289,7 @@ Tensor shared_embedding_out_cpu(
     }
     TORCHAO_CHECK(index >= 0 && index < n, "index out of bounds");
 #if defined(TORCHAO_BUILD_CPU_AARCH64)
-    torchao::kernels::cpu::aarch64::embedding::
+    torchao::cpu::aarch64::embedding::
         shared_embedding<weight_nbit, nr, kr, sr>(
             out.mutable_data_ptr<float>() + idx * k,
             packed_weights.const_data_ptr<int8_t>() +
diff --git a/torchao/csrc/cpu/shared_kernels/groupwise_lowbit_weight_lut/kernel_selector.h b/torchao/csrc/cpu/shared_kernels/groupwise_lowbit_weight_lut/kernel_selector.h
index f8bdc4cafb..f49ab2b146 100644
--- a/torchao/csrc/cpu/shared_kernels/groupwise_lowbit_weight_lut/kernel_selector.h
+++ b/torchao/csrc/cpu/shared_kernels/groupwise_lowbit_weight_lut/kernel_selector.h
@@ -117,7 +117,7 @@ void register_ukernel_config(
   int preferred_alignment = 16;
 
   namespace kernel_api =
-      torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
+      torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
 
   using kernel_fn_ptr_t =
       decltype(&kernel_api::groupwise_lowbit_weight_lut_kernel_1x4x32<
diff --git a/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight/kernel_selector.h b/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight/kernel_selector.h
index 88b27f4217..7998ded6b0 100644
--- a/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight/kernel_selector.h
+++ b/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight/kernel_selector.h
@@ -97,7 +97,7 @@ void register_ukernel_config_universal(
       torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_universal,
       weight_nbit);
 
-  namespace kernel = torchao::kernels::cpu::aarch64::linear::
+  namespace kernel = torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   constexpr bool has_lut = false;
@@ -181,7 +181,7 @@ void register_ukernel_config_lut(
     int preferred_alignment = 16;
 
     #if defined(TORCHAO_ENABLE_ARM_NEON_DOT)
-    namespace kernel = torchao::kernels::cpu::aarch64::linear::
+    namespace kernel = torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
     if (!cpuinfo_has_arm_neon_dot()) {
@@ -232,7 +232,7 @@ void register_ukernel_config_lut(
 template <typename kernel_struct>
 UKernelConfig::linear_config_type
 get_linear_config_kleidi(int n_step, int nr, int kr, int sr) {
-  namespace op = torchao::kernels::cpu::aarch64::kleidi::
+  namespace op = torchao::cpu::aarch64::kleidi::
       kai_matmul_clamp_f32_qai8dxp_qsi4c32p;
   assert(n_step == kernel_struct::get_ukernel().get_n_step());
   assert(nr == kernel_struct::get_ukernel().get_nr());
@@ -256,7 +256,7 @@ void register_ukernel_config_kleidi(
     throw std::runtime_error("Failed to initialize cpuinfo!");
   }
   check_format(format, torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_kleidi_ai, weight_nbit);
-  namespace op = torchao::kernels::cpu::aarch64::kleidi::
+  namespace op = torchao::cpu::aarch64::kleidi::
       kai_matmul_clamp_f32_qai8dxp_qsi4c32p;
 
   auto uk = UKernelConfig::make(
diff --git a/torchao/csrc/cpu/shared_kernels/tests/test_groupwise_lowbit_weight_lut.cpp b/torchao/csrc/cpu/shared_kernels/tests/test_groupwise_lowbit_weight_lut.cpp
index 10bf9bcd3c..1a331afcfa 100644
--- a/torchao/csrc/cpu/shared_kernels/tests/test_groupwise_lowbit_weight_lut.cpp
+++ b/torchao/csrc/cpu/shared_kernels/tests/test_groupwise_lowbit_weight_lut.cpp
@@ -19,7 +19,7 @@ using namespace torchao::ops::groupwise_lowbit_weight_lut;
 template <int weight_nbit, bool has_scales>
 UKernelConfig get_ukernel_config(bool has_bias) {
   namespace kernel =
-      torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
+      torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
 
   int preferred_alignment = 16;
   int n_step = 8;
diff --git a/torchao/csrc/cpu/shared_kernels/tests/test_linear_8bit_act_xbit_weight.cpp b/torchao/csrc/cpu/shared_kernels/tests/test_linear_8bit_act_xbit_weight.cpp
index 7631d34a03..9394430b20 100644
--- a/torchao/csrc/cpu/shared_kernels/tests/test_linear_8bit_act_xbit_weight.cpp
+++ b/torchao/csrc/cpu/shared_kernels/tests/test_linear_8bit_act_xbit_weight.cpp
@@ -16,7 +16,7 @@
 
 #if defined(TORCHAO_ENABLE_KLEIDI)
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h>
-using namespace torchao::kernels::cpu::aarch64::kleidi::
+using namespace torchao::cpu::aarch64::kleidi::
     kai_matmul_clamp_f32_qai8dxp_qsi4c32p;
 #endif // TORCHAO_ENABLE_KLEIDI
 
@@ -27,7 +27,7 @@ using namespace torchao::ops::linear_8bit_act_xbit_weight;
 
 template <int weight_nbit, bool has_weight_zeros, bool has_bias, bool has_clamp, bool has_lut = false>
 UKernelConfig get_ukernel_config() {
-  namespace kernel = torchao::kernels::cpu::aarch64::linear::
+  namespace kernel = torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   int preferred_alignment = 16;
@@ -213,7 +213,7 @@ enum kai_kernel_id {
 
 template <typename kernel_struct>
 UKernelConfig get_ukernel_config_kleidi_impl() {
-  namespace op = torchao::kernels::cpu::aarch64::kleidi::
+  namespace op = torchao::cpu::aarch64::kleidi::
       kai_matmul_clamp_f32_qai8dxp_qsi4c32p;
 
   auto uk = kernel_struct::get_ukernel();
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_linear.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_linear.cpp
index 26abe6918a..12cad8a0bd 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_linear.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_linear.cpp
@@ -19,7 +19,7 @@ channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot(
   int k = state.range(2);
   int group_size = state.range(3);
 
-  using namespace torchao::kernels::cpu::aarch64::linear::
+  using namespace torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot;
 
   auto test_case = torchao::
@@ -91,7 +91,7 @@ channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot(
   int k = state.range(2);
   int group_size = state.range(3);
 
-  using namespace torchao::kernels::cpu::aarch64::linear::
+  using namespace torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   auto test_case = torchao::
@@ -163,7 +163,7 @@ channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot(
   int k = state.range(2);
   int group_size = state.range(3);
 
-  using namespace torchao::kernels::cpu::aarch64::linear::
+  using namespace torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   auto test_case = torchao::
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_quantization.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_quantization.cpp
index d877b905d0..e5cfe647a5 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_quantization.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_quantization.cpp
@@ -21,7 +21,7 @@ static void benchmark_quantize(benchmark::State& state) {
   float vmin, vmax, scale;
 
   for (auto _ : state) {
-    torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
+    torchao::cpu::aarch64::reduction::find_min_and_max(
         vmin, vmax, vals.data(), vals.size());
 
     torchao::quantization::get_qvals_range(
@@ -30,7 +30,7 @@ static void benchmark_quantize(benchmark::State& state) {
     torchao::quantization::get_scale_and_zero(
         scale, zero, vmin, vmax, qmin, qmax);
 
-    torchao::kernels::cpu::aarch64::quantization::quantize(
+    torchao::cpu::aarch64::quantization::quantize(
         qvals.data(), vals.data(), vals.size(), scale, zero, qmin, qmax);
   }
 }
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding.h
index 0f6d8a2339..d2f2a82b98 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding.h
@@ -15,7 +15,7 @@
 #include <cassert>
 #include <vector>
 
-namespace torchao::kernels::cpu::aarch64::embedding {
+namespace torchao::cpu::aarch64::embedding {
 
 namespace internal {
 
@@ -353,7 +353,7 @@ inline void shared_embedding(
   n_idx = n_idx * nr;
   int j = index - n_idx;
 
-  torchao::kernels::cpu::aarch64::linear::
+  torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing::
           unpack_weights_at_n_idx<weight_nbit, nr, kr, sr>(
               weight_qvals.data(),
@@ -381,6 +381,6 @@ inline void shared_embedding(
   }
 }
 
-} // namespace torchao::kernels::cpu::aarch64::embedding
+} // namespace torchao::cpu::aarch64::embedding
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding_lut.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding_lut.h
index 573fc8020d..cf860a8e01 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding_lut.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding_lut.h
@@ -14,7 +14,7 @@
 #include <cassert>
 #include <vector>
 
-namespace torchao::kernels::cpu::aarch64::embedding {
+namespace torchao::cpu::aarch64::embedding {
 
 /**
  * @brief Calculates the size in bytes for a single row of packed embeddings.
@@ -377,6 +377,6 @@ inline void dequantize_embedding_row_at_idx_lut(
     vst1q_f32(out + j + 12, out3);
   }
 }
-} // namespace torchao::kernels::cpu::aarch64::embedding
+} // namespace torchao::cpu::aarch64::embedding
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h
index 777d73cebc..008c3e4f53 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h
@@ -30,7 +30,7 @@
 
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/pack.h>
 
-namespace torchao::kernels::cpu::aarch64::kleidi {
+namespace torchao::cpu::aarch64::kleidi {
 
 // Helper functions
 // TODO: find a better place for these?
@@ -319,4 +319,4 @@ DEFINE_KERNEL_STRUCT(matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm);
 #undef DEFINE_KERNEL_STRUCT
 
 } // namespace kai_matmul_clamp_f32_qai8dxp_qsi4c32p
-} // namespace torchao::kernels::cpu::aarch64::kleidi
+} // namespace torchao::cpu::aarch64::kleidi
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/pack.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/pack.h
index 692df73d55..0d5d9bfd44 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/pack.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/pack.h
@@ -12,7 +12,7 @@
 #include <kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h>
 #include <kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.h>
 
-namespace torchao::kernels::cpu::aarch64::kleidi {
+namespace torchao::cpu::aarch64::kleidi {
 namespace kai_matmul_clamp_f32_qai8dxp_qsi4c32p {
 // All the kernels in this namespace use following packing interface/routines.
 // TODO: move these to Kleidi as interfaces?
@@ -115,4 +115,4 @@ lhs_packing get_lhs_packing() {
 }
 
 } // namespace kai_matmul_clamp_f32_qai8dxp_qsi4c32p
-} // namespace torchao::kernels::cpu::aarch64::kleidi
+} // namespace torchao::cpu::aarch64::kleidi
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h
index 849d99cb8a..f7e1c41e19 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h
@@ -17,7 +17,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h>
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h>
 
-namespace torchao::kernels::cpu::aarch64::linear::
+namespace torchao::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight {
 
 inline size_t packed_activations_size(
@@ -148,7 +148,7 @@ void pack_weights_with_lut(
   (void)nr; // unused
   (void)kr; // unused
   (void)sr; // unused
-  torchao::kernels::cpu::aarch64::linear::
+  torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing::
           pack_weights_with_lut<weight_nbit, nr_, kr_, sr_>(
               packed_weights,
@@ -298,6 +298,6 @@ void kernel_1x8x16_f32_neondot(
 }
 
 } // namespace
-  // torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight
+  // torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x1x32_f32_neondot-impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x1x32_f32_neondot-impl.h
index 535bf7a084..dbb653ca5d 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x1x32_f32_neondot-impl.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x1x32_f32_neondot-impl.h
@@ -11,7 +11,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/bitpacking/bitpack.h>
 #include <cassert>
 
-namespace torchao::kernels::cpu::aarch64::linear::
+namespace torchao::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight::kernel {
 
 namespace internal {
@@ -174,6 +174,6 @@ void kernel_1x1x32_f32_neondot(
 }
 
 } // namespace
-  // torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel
+  // torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h
index 40be2c5231..0dab5dd88a 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h
@@ -12,7 +12,7 @@
 #include <cassert>
 #include <cstring>
 
-namespace torchao::kernels::cpu::aarch64::linear::
+namespace torchao::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight::kernel {
 namespace internal {
 inline float32x4_t clamp(float32x4_t x, float min, float max) {
@@ -245,6 +245,6 @@ void kernel_1x4x16_f32_neondot(
 }
 
 } // namespace
-  // torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel
+  // torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h
index 78246e211d..4ceaeeb893 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h
@@ -12,7 +12,7 @@
 #include <cassert>
 #include <cstring>
 
-namespace torchao::kernels::cpu::aarch64::linear::
+namespace torchao::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight::kernel {
 namespace internal {
 
@@ -361,6 +361,6 @@ void kernel_1x8x16_f32_neondot(
 }
 
 } // namespace
-  // torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel
+  // torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_activations.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_activations.h
index d7558dd4ce..a27615f2c3 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_activations.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_activations.h
@@ -12,7 +12,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/reduction.h>
 #include <cassert>
 
-namespace torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing {
+namespace torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing {
 
 // Prepares activation data for kernel_impl.
 //   Per m_idx (row), activations are stored as follows:
@@ -72,7 +72,7 @@ void inline pack_activations(
       qmin, qmax, /*nbit=*/8, /*is_symmetric=*/false);
 
   for (int m_idx = 0; m_idx < m; m_idx++) {
-    torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
+    torchao::cpu::aarch64::reduction::find_min_and_max(
         vmin, vmax, activations, k);
     torchao::quantization::get_scale_and_zero(
         scale, zero, vmin, vmax, qmin, qmax);
@@ -86,7 +86,7 @@ void inline pack_activations(
 
     if (has_weight_zeros) {
       for (int k_idx = 0; k_idx < k; k_idx += group_size) {
-        torchao::kernels::cpu::aarch64::quantization::quantize(
+        torchao::cpu::aarch64::quantization::quantize(
             /*qvals=*/(int8_t*)activation_data_byte_ptr,
             /*vals=*/activations,
             /*size=*/group_size,
@@ -95,7 +95,7 @@ void inline pack_activations(
             /*qmin=*/qmin,
             /*qmax=*/qmax);
 
-        qvals_sum = torchao::kernels::cpu::aarch64::reduction::compute_sum(
+        qvals_sum = torchao::cpu::aarch64::reduction::compute_sum(
             /*vals=*/(int8_t*)activation_data_byte_ptr,
             /*size=*/group_size);
 
@@ -107,7 +107,7 @@ void inline pack_activations(
         activations += group_size;
       }
     } else {
-      torchao::kernels::cpu::aarch64::quantization::quantize(
+      torchao::cpu::aarch64::quantization::quantize(
           /*qvals=*/(int8_t*)activation_data_byte_ptr,
           /*vals=*/activations,
           /*size=*/k,
@@ -121,6 +121,6 @@ void inline pack_activations(
   }
 }
 
-} // namespace torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing
+} // namespace torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_weights.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_weights.h
index 133c4a7f25..66ad7cb479 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_weights.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_weights.h
@@ -9,7 +9,7 @@
 #include <array>
 #include <cstring>
 
-namespace torchao::kernels::cpu::aarch64::linear::
+namespace torchao::cpu::aarch64::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing {
 
 namespace internal {
@@ -583,6 +583,6 @@ size_t inline packed_weights_with_lut_size(
 }
 
 } // namespace
-  // torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing
+  // torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/groupwise_lowbit_weight_lut.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/groupwise_lowbit_weight_lut.h
index b0fea65afb..3ab0ba45ac 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/groupwise_lowbit_weight_lut.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/groupwise_lowbit_weight_lut.h
@@ -11,7 +11,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_activations.h>
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_weights.h>
 
-namespace torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut {
+namespace torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut {
 
 /**
  * @brief Calculates the total size in bytes required for the packed weight.
@@ -258,6 +258,6 @@ inline size_t packed_weights_offset(
   return (n_idx / nr) * packed_tile_size_for_nr_cols;
 }
 } // namespace
-  // torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut
+  // torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/kernel_f32-impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/kernel_f32-impl.h
index b50c886d11..ff14da3382 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/kernel_f32-impl.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/kernel_f32-impl.h
@@ -13,11 +13,11 @@
 #include <cassert>
 #include <cstring>
 
-namespace torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut::
+namespace torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut::
     kernel {
 
 namespace lut_utils = torchao::lut;
-namespace weight_packing = torchao::kernels::cpu::aarch64::linear::
+namespace weight_packing = torchao::cpu::aarch64::linear::
     groupwise_lowbit_weight_lut::weight_packing;
 
 namespace internal {
@@ -235,5 +235,5 @@ void groupwise_lowbit_weight_lut_kernel_1x4x32(
   }
 }
 } // namespace
-  // torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut::kernel
+  // torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut::kernel
 #endif // defined(aarch64) || defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_activations.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_activations.h
index bf16e04bda..50a6fb87fb 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_activations.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_activations.h
@@ -7,7 +7,7 @@
 #include <cstring>
 #include <vector>
 
-namespace torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut::
+namespace torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut::
     activation_packing {
 
 inline size_t packed_activations_size(int m, int k) {
@@ -26,6 +26,6 @@ void pack_activations(
   std::memcpy(packed_activations, activations, sizeof(float) * m * k);
 }
 } // namespace
-  // torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut::activation_packing
+  // torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut::activation_packing
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_weights.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_weights.h
index 021693caec..c7e722dd9e 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_weights.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_weights.h
@@ -11,7 +11,7 @@
 #include <numeric>
 #include <vector>
 
-namespace torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut::
+namespace torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut::
     weight_packing {
 namespace lut_utils = torchao::lut;
 namespace packing_utils = torchao::packing;
@@ -224,5 +224,5 @@ TORCHAO_ALWAYS_INLINE inline void pack_weights(
   }
 }
 } // namespace
-  // torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut::weight_packing
+  // torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut::weight_packing
 #endif // defined(aarch64) || defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal-impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal-impl.h
index 925bbbb4bd..c2492eaf13 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal-impl.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal-impl.h
@@ -16,7 +16,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/macro.h>
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul_utils.h>
 
-namespace torchao::kernels::cpu::aarch64::quantized_matmul {
+namespace torchao::cpu::aarch64::quantized_matmul {
 namespace channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal::internal {
 
 namespace {
@@ -359,7 +359,7 @@ void kernel(
     const float* rhs_scales,
     const int lhs_qparams_stride,
     const int rhs_qparams_stride) {
-  torchao::kernels::cpu::aarch64::quantized_matmul::
+  torchao::cpu::aarch64::quantized_matmul::
       channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal::internal::
           KernelImpl<a_has_zeros, b_has_zeros, a_transposed, b_transposed>::run(
               m,
@@ -379,6 +379,6 @@ void kernel(
               rhs_qparams_stride);
 }
 } // namespace channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal
-} // namespace torchao::kernels::cpu::aarch64::quantized_matmul
+} // namespace torchao::cpu::aarch64::quantized_matmul
 
 #endif // defined(__aarch64__) && defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot-impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot-impl.h
index 2c34cebc3c..835d47476e 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot-impl.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot-impl.h
@@ -16,7 +16,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/macro.h>
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul_utils.h>
 
-namespace torchao::kernels::cpu::aarch64::quantized_matmul {
+namespace torchao::cpu::aarch64::quantized_matmul {
 namespace channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot::internal {
 
 /*
@@ -315,7 +315,7 @@ void kernel(
     const float* rhs_scales,
     const int lhs_qparams_stride,
     const int rhs_qparams_stride) {
-  torchao::kernels::cpu::aarch64::quantized_matmul::
+  torchao::cpu::aarch64::quantized_matmul::
       channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot::internal::
           KernelImpl<a_has_zeros, b_has_zeros, a_transposed, b_transposed>::run(
               m,
@@ -335,6 +335,6 @@ void kernel(
               rhs_qparams_stride);
 }
 } // namespace channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot
-} // namespace torchao::kernels::cpu::aarch64::quantized_matmul
+} // namespace torchao::cpu::aarch64::quantized_matmul
 
 #endif // defined(__aarch64__) && defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot-impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot-impl.h
index 80417f37e4..938cd8d06e 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot-impl.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot-impl.h
@@ -16,7 +16,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/macro.h>
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul_utils.h>
 
-namespace torchao::kernels::cpu::aarch64::quantized_matmul {
+namespace torchao::cpu::aarch64::quantized_matmul {
 namespace channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot::internal {
 
 TORCHAO_ALWAYS_INLINE static void block_mul_4x8x8(
@@ -386,7 +386,7 @@ void kernel(
     const float* rhs_scales,
     const int lhs_qparams_stride,
     const int rhs_qparams_stride) {
-  torchao::kernels::cpu::aarch64::quantized_matmul::
+  torchao::cpu::aarch64::quantized_matmul::
       channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot::internal::
           KernelImpl<a_has_zeros, b_has_zeros, a_transposed, b_transposed>::run(
               m,
@@ -406,6 +406,6 @@ void kernel(
               rhs_qparams_stride);
 }
 } // namespace channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot
-} // namespace torchao::kernels::cpu::aarch64::quantized_matmul
+} // namespace torchao::cpu::aarch64::quantized_matmul
 
 #endif // defined(__aarch64__) && defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_1x16x4_f32_impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_1x16x4_f32_impl.h
index 28f173e9bc..454b8f3ebc 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_1x16x4_f32_impl.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_1x16x4_f32_impl.h
@@ -16,7 +16,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/macro.h>
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul_utils.h>
 
-namespace torchao::kernels::cpu::aarch64::quantized_matmul {
+namespace torchao::cpu::aarch64::quantized_matmul {
 namespace fp32_a_input_channelwise_8bit_b_1x16x4_f32::internal {
 
 namespace {
@@ -258,7 +258,7 @@ void kernel(
     const float* rhs_scales,
     const float beta,
     const int rhs_qparams_stride) {
-  torchao::kernels::cpu::aarch64::quantized_matmul::
+  torchao::cpu::aarch64::quantized_matmul::
       fp32_a_input_channelwise_8bit_b_1x16x4_f32::internal::
           KernelImpl<b_has_zeros, a_transposed, b_transposed>::run(
               m,
@@ -276,6 +276,6 @@ void kernel(
               rhs_qparams_stride);
 }
 } // namespace fp32_a_input_channelwise_8bit_b_1x16x4_f32
-} // namespace torchao::kernels::cpu::aarch64::quantized_matmul
+} // namespace torchao::cpu::aarch64::quantized_matmul
 
 #endif // defined(__aarch64__) && defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_4x16x4_f32_impl.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_4x16x4_f32_impl.h
index ffcd0a1f1d..d9cce306fe 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_4x16x4_f32_impl.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/fp32_a_input_channelwise_8bit_b_4x16x4_f32_impl.h
@@ -16,7 +16,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/macro.h>
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul_utils.h>
 
-namespace torchao::kernels::cpu::aarch64::quantized_matmul {
+namespace torchao::cpu::aarch64::quantized_matmul {
 namespace fp32_a_input_channelwise_8bit_b_4x16x4_f32::internal {
 
 namespace {
@@ -305,7 +305,7 @@ void kernel(
     const float* rhs_scales,
     const float beta,
     const int rhs_qparams_stride) {
-  torchao::kernels::cpu::aarch64::quantized_matmul::
+  torchao::cpu::aarch64::quantized_matmul::
       fp32_a_input_channelwise_8bit_b_4x16x4_f32::internal::
           KernelImpl<b_has_zeros, a_transposed, b_transposed>::run(
               m,
@@ -323,6 +323,6 @@ void kernel(
               rhs_qparams_stride);
 }
 } // namespace fp32_a_input_channelwise_8bit_b_4x16x4_f32
-} // namespace torchao::kernels::cpu::aarch64::quantized_matmul
+} // namespace torchao::cpu::aarch64::quantized_matmul
 
 #endif // defined(__aarch64__) && defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul.h
index 371dc55666..c0a8f1b914 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul.h
@@ -15,7 +15,7 @@
 
 #include <arm_neon.h>
 
-namespace torchao::kernels::cpu::aarch64::quantized_matmul {
+namespace torchao::cpu::aarch64::quantized_matmul {
 namespace channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot {
 
 template <
@@ -307,7 +307,7 @@ void kernel(
 }
 
 } // namespace fp32_a_input_channelwise_8bit_b_f32
-} // namespace torchao::kernels::cpu::aarch64::quantized_matmul
+} // namespace torchao::cpu::aarch64::quantized_matmul
 
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal-impl.h>
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot-impl.h>
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul_utils.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul_utils.h
index db577c39a8..65069bd4ca 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul_utils.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul_utils.h
@@ -13,7 +13,7 @@
 #include <cassert>
 #include <cstddef>
 
-namespace torchao::kernels::cpu::aarch64::quantized_matmul {
+namespace torchao::cpu::aarch64::quantized_matmul {
 namespace utils {
 
 TORCHAO_ALWAYS_INLINE static void transpose_scales_and_zero_points(
@@ -148,6 +148,6 @@ void pack_kxn_b_matrix_for_mx8_dotprod_ukernel(
   }
 }
 } // namespace utils
-} // namespace torchao::kernels::cpu::aarch64::quantized_matmul
+} // namespace torchao::cpu::aarch64::quantized_matmul
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/quantization/quantize.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/quantization/quantize.cpp
index 42301dc2fa..bcc1dec571 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/quantization/quantize.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/quantization/quantize.cpp
@@ -55,7 +55,7 @@ _vec_clip_inplace(int32x4_t& vec, int32x4_t vec_min, int32x4_t vec_max) {
 }
 } // namespace
 
-void torchao::kernels::cpu::aarch64::quantization::quantize(
+void torchao::cpu::aarch64::quantization::quantize(
     // Output
     int8_t* qvals,
     // Inputs
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/compute_sum.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/compute_sum.cpp
index 1b9d2aa97b..451b69c87e 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/compute_sum.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/compute_sum.cpp
@@ -9,7 +9,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/reduction.h>
 #include <cassert>
 
-int32_t torchao::kernels::cpu::aarch64::reduction::compute_sum(
+int32_t torchao::cpu::aarch64::reduction::compute_sum(
     const int8_t* vals,
     int size) {
   assert(size >= 1);
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/find_min_and_max.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/find_min_and_max.cpp
index ea4efcf1cc..521a778efd 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/find_min_and_max.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/find_min_and_max.cpp
@@ -9,7 +9,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/reduction.h>
 #include <cassert>
 
-void torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
+void torchao::cpu::aarch64::reduction::find_min_and_max(
     float32_t& min,
     float32_t& max,
     const float32_t* vals,
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_bitpack_fallback_compatibility.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_bitpack_fallback_compatibility.cpp
index ccae74cbcd..ddbed9223b 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_bitpack_fallback_compatibility.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_bitpack_fallback_compatibility.cpp
@@ -22,7 +22,7 @@ TEST(test_bitpacking_64_uint1_values, CppToNeon) {
   std::vector<uint8_t> packed(packed_bytes, 0);
   std::vector<uint8_t> unpacked(unpacked_bytes, 0);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint1_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_64_uint1_values(
       packed.data(), input.data());
 
   uint8x16_t u0, u1, u2, u3;
@@ -52,7 +52,7 @@ TEST(test_bitpacking_64_uint1_values, NeonToCpp) {
   torchao::bitpacking::internal::vec_pack_64_uint1_values(
       packed.data(), i0, i1, i2, i3);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint1_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_64_uint1_values(
       unpacked.data(), packed.data());
 
   for (int i = 0; i < unpacked_bytes; ++i) {
@@ -68,7 +68,7 @@ TEST(test_bitpacking_128_uint1_values, CppToNeon) {
   std::vector<uint8_t> packed(packed_bytes, 0);
   std::vector<uint8_t> unpacked(unpacked_bytes, 0);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint1_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_128_uint1_values(
       packed.data(), input.data());
 
   uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7;
@@ -104,7 +104,7 @@ TEST(test_bitpacking_128_uint1_values, NeonToCpp) {
   torchao::bitpacking::internal::vec_pack_128_uint1_values(
       packed.data(), i0, i1, i2, i3, i4, i5, i6, i7);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::
+  torchao::cpu::fallback::bitpacking::internal::
       unpack_128_uint1_values(unpacked.data(), packed.data());
 
   for (int i = 0; i < unpacked_bytes; ++i) {
@@ -122,7 +122,7 @@ TEST(test_bitpacking_32_uint2_values, CppToNeon) {
   std::vector<uint8_t> packed(packed_bytes, 0);
   std::vector<uint8_t> unpacked(unpacked_bytes, 0);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint2_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_32_uint2_values(
       packed.data(), input.data());
 
   uint8x8_t u0, u1, u2, u3;
@@ -152,7 +152,7 @@ TEST(test_bitpacking_32_uint2_values, NeonToCpp) {
   torchao::bitpacking::internal::vec_pack_32_uint2_values(
       packed.data(), i0, i1, i2, i3);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint2_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_32_uint2_values(
       unpacked.data(), packed.data());
 
   for (int i = 0; i < unpacked_bytes; ++i) {
@@ -168,7 +168,7 @@ TEST(test_bitpacking_64_uint2_values, CppToNeon) {
   std::vector<uint8_t> packed(packed_bytes, 0);
   std::vector<uint8_t> unpacked(unpacked_bytes, 0);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint2_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_64_uint2_values(
       packed.data(), input.data());
 
   uint8x16_t u0, u1, u2, u3;
@@ -198,7 +198,7 @@ TEST(test_bitpacking_64_uint2_values, NeonToCpp) {
   torchao::bitpacking::internal::vec_pack_64_uint2_values(
       packed.data(), i0, i1, i2, i3);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint2_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_64_uint2_values(
       unpacked.data(), packed.data());
 
   for (int i = 0; i < unpacked_bytes; ++i) {
@@ -216,7 +216,7 @@ TEST(test_bitpacking_64_uint3_values, CppToNeon) {
   std::vector<uint8_t> packed(packed_bytes, 0);
   std::vector<uint8_t> unpacked(unpacked_bytes, 0);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint3_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_64_uint3_values(
       packed.data(), input.data());
 
   uint8x16_t u0, u1, u2, u3;
@@ -246,7 +246,7 @@ TEST(test_bitpacking_64_uint3_values, NeonToCpp) {
   torchao::bitpacking::internal::vec_pack_64_uint3_values(
       packed.data(), i0, i1, i2, i3);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint3_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_64_uint3_values(
       unpacked.data(), packed.data());
 
   for (int i = 0; i < unpacked_bytes; ++i) {
@@ -262,7 +262,7 @@ TEST(test_bitpacking_128_uint3_values, CppToNeon) {
   std::vector<uint8_t> packed(packed_bytes, 0);
   std::vector<uint8_t> unpacked(unpacked_bytes, 0);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint3_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_128_uint3_values(
       packed.data(), input.data());
 
   uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7;
@@ -298,7 +298,7 @@ TEST(test_bitpacking_128_uint3_values, NeonToCpp) {
   torchao::bitpacking::internal::vec_pack_128_uint3_values(
       packed.data(), i0, i1, i2, i3, i4, i5, i6, i7);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::
+  torchao::cpu::fallback::bitpacking::internal::
       unpack_128_uint3_values(unpacked.data(), packed.data());
 
   for (int i = 0; i < unpacked_bytes; ++i) {
@@ -316,7 +316,7 @@ TEST(test_bitpacking_16_uint4_values, CppToNeon) {
   std::vector<uint8_t> packed(packed_bytes, 0);
   std::vector<uint8_t> unpacked(unpacked_bytes, 0);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_16_uint4_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_16_uint4_values(
       packed.data(), input.data());
 
   uint8x16_t unpacked0;
@@ -341,7 +341,7 @@ TEST(test_bitpacking_16_uint4_values, NeonToCpp) {
   torchao::bitpacking::internal::vec_pack_16_uint4_values(
       packed.data(), input0);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_16_uint4_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_16_uint4_values(
       unpacked.data(), packed.data());
 
   for (int i = 0; i < unpacked_bytes; ++i) {
@@ -357,7 +357,7 @@ TEST(test_bitpacking_32_uint4_values, CppToNeon) {
   std::vector<uint8_t> packed(packed_bytes, 0);
   std::vector<uint8_t> unpacked(unpacked_bytes, 0);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
       packed.data(), input.data());
 
   uint8x16_t unpacked0, unpacked1;
@@ -384,7 +384,7 @@ TEST(test_bitpacking_32_uint4_values, NeonToCpp) {
   torchao::bitpacking::internal::vec_pack_32_uint4_values(
       packed.data(), input0, input1);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint4_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_32_uint4_values(
       unpacked.data(), packed.data());
 
   for (int i = 0; i < unpacked_bytes; ++i) {
@@ -402,7 +402,7 @@ TEST(test_bitpacking_64_uint5_values, CppToNeon) {
   std::vector<uint8_t> packed(packed_bytes, 0);
   std::vector<uint8_t> unpacked(unpacked_bytes, 0);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint5_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_64_uint5_values(
       packed.data(), input.data());
 
   uint8x16_t unpacked0, unpacked1, unpacked2, unpacked3;
@@ -432,7 +432,7 @@ TEST(test_bitpacking_64_uint5_values, NeonToCpp) {
   torchao::bitpacking::internal::vec_pack_64_uint5_values(
       packed.data(), input0, input1, input2, input3);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint5_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_64_uint5_values(
       unpacked.data(), packed.data());
 
   for (int i = 0; i < unpacked_bytes; ++i) {
@@ -448,7 +448,7 @@ TEST(test_bitpacking_128_uint5_values, CppToNeon) {
   std::vector<uint8_t> packed(packed_bytes, 0);
   std::vector<uint8_t> unpacked(unpacked_bytes, 0);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint5_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_128_uint5_values(
       packed.data(), input.data());
 
   uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7;
@@ -484,7 +484,7 @@ TEST(test_bitpacking_128_uint5_values, NeonToCpp) {
   torchao::bitpacking::internal::vec_pack_128_uint5_values(
       packed.data(), i0, i1, i2, i3, i4, i5, i6, i7);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::
+  torchao::cpu::fallback::bitpacking::internal::
       unpack_128_uint5_values(unpacked.data(), packed.data());
 
   for (int i = 0; i < unpacked_bytes; ++i) {
@@ -502,7 +502,7 @@ TEST(test_bitpacking_32_uint6_values, CppToNeon) {
   std::vector<uint8_t> packed(packed_bytes, 0);
   std::vector<uint8_t> unpacked(unpacked_bytes, 0);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint6_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_32_uint6_values(
       packed.data(), input.data());
 
   uint8x16_t u0, u1;
@@ -529,7 +529,7 @@ TEST(test_bitpacking_32_uint6_values, NeonToCpp) {
   torchao::bitpacking::internal::vec_pack_32_uint6_values(
       packed.data(), i0, i1);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint6_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_32_uint6_values(
       unpacked.data(), packed.data());
 
   for (int i = 0; i < unpacked_bytes; ++i) {
@@ -545,7 +545,7 @@ TEST(test_bitpacking_64_uint6_values, CppToNeon) {
   std::vector<uint8_t> packed(packed_bytes, 0);
   std::vector<uint8_t> unpacked(unpacked_bytes, 0);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint6_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_64_uint6_values(
       packed.data(), input.data());
 
   uint8x16_t u0, u1, u2, u3;
@@ -575,7 +575,7 @@ TEST(test_bitpacking_64_uint6_values, NeonToCpp) {
   torchao::bitpacking::internal::vec_pack_64_uint6_values(
       packed.data(), i0, i1, i2, i3);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint6_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_64_uint6_values(
       unpacked.data(), packed.data());
 
   for (int i = 0; i < unpacked_bytes; ++i) {
@@ -593,7 +593,7 @@ TEST(test_bitpacking_64_uint7_values, CppToNeon) {
   std::vector<uint8_t> packed(packed_bytes, 0);
   std::vector<uint8_t> unpacked(unpacked_bytes, 0);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint7_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_64_uint7_values(
       packed.data(), input.data());
 
   uint8x16_t unpacked0, unpacked1, unpacked2, unpacked3;
@@ -623,7 +623,7 @@ TEST(test_bitpacking_64_uint7_values, NeonToCpp) {
   torchao::bitpacking::internal::vec_pack_64_uint7_values(
       packed.data(), input0, input1, input2, input3);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_64_uint7_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_64_uint7_values(
       unpacked.data(), packed.data());
 
   for (int i = 0; i < unpacked_bytes; ++i) {
@@ -639,7 +639,7 @@ TEST(test_bitpacking_128_uint7_values, CppToNeon) {
   std::vector<uint8_t> packed(packed_bytes, 0);
   std::vector<uint8_t> unpacked(unpacked_bytes, 0);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_128_uint7_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_128_uint7_values(
       packed.data(), input.data());
 
   uint8x16_t u0, u1, u2, u3, u4, u5, u6, u7;
@@ -675,7 +675,7 @@ TEST(test_bitpacking_128_uint7_values, NeonToCpp) {
   torchao::bitpacking::internal::vec_pack_128_uint7_values(
       packed.data(), i0, i1, i2, i3, i4, i5, i6, i7);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::
+  torchao::cpu::fallback::bitpacking::internal::
       unpack_128_uint7_values(unpacked.data(), packed.data());
 
   for (int i = 0; i < unpacked_bytes; ++i) {
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding.cpp
index e5cdfb0a1b..efd6c78ebf 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding.cpp
@@ -28,7 +28,7 @@ void test_embedding(
   auto output = std::vector<float>(num_embeddings * embedding_dim, 0.0);
 
   for (int i = 0; i < num_embeddings; i++) {
-    torchao::kernels::cpu::aarch64::embedding::pack_embedding_weight_qvals<
+    torchao::cpu::aarch64::embedding::pack_embedding_weight_qvals<
         weight_nbit>(
         packed.data(), embedding_dim, test_case.weight_qvals.data(), i);
   }
@@ -39,7 +39,7 @@ void test_embedding(
   }
 
   for (int i = 0; i < num_embeddings; i++) {
-    torchao::kernels::cpu::aarch64::embedding::embedding<weight_nbit>(
+    torchao::cpu::aarch64::embedding::embedding<weight_nbit>(
         output.data() + i * embedding_dim,
         embedding_dim,
         group_size,
@@ -69,7 +69,7 @@ void test_shared_embedding(
   bool has_bias = false;
   float* bias = nullptr;
   std::vector<char> packed_weights(
-      torchao::kernels::cpu::aarch64::linear::
+      torchao::cpu::aarch64::linear::
           channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing::
               packed_weights_size(
                   n,
@@ -79,7 +79,7 @@ void test_shared_embedding(
                   has_weight_zeros,
                   has_bias,
                   nr));
-  torchao::kernels::cpu::aarch64::linear::
+  torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing::
           pack_weights<weight_nbit, nr, kr, sr>(
               packed_weights.data(),
@@ -94,7 +94,7 @@ void test_shared_embedding(
   // Call shared_embedding
   auto output = std::vector<float>(num_embeddings * embedding_dim, 0.0);
   for (int i = 0; i < num_embeddings; i++) {
-    torchao::kernels::cpu::aarch64::embedding::
+    torchao::cpu::aarch64::embedding::
         shared_embedding<weight_nbit, nr, kr, sr>(
             output.data() + i * embedding_dim,
             packed_weights.data(),
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding_lut.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding_lut.cpp
index 5802a179d0..33e1f3478c 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding_lut.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_embedding_lut.cpp
@@ -28,7 +28,7 @@ void test_embedding(
       has_scales);
 
   const size_t packed_embedding_size =
-      torchao::kernels::cpu::aarch64::embedding::packed_embedding_size(
+      torchao::cpu::aarch64::embedding::packed_embedding_size(
           weight_nbit,
           num_embeddings,
           embedding_dim,
@@ -40,7 +40,7 @@ void test_embedding(
   auto output = std::vector<float>(num_embeddings * embedding_dim, 0.0);
 
   for (int i = 0; i < num_embeddings; i++) {
-    torchao::kernels::cpu::aarch64::embedding::pack_embedding_row_at_index_lut<
+    torchao::cpu::aarch64::embedding::pack_embedding_row_at_index_lut<
         weight_nbit>(
         packed.data(),
         i,
@@ -55,7 +55,7 @@ void test_embedding(
   }
 
   for (int i = 0; i < num_embeddings; i++) {
-    torchao::kernels::cpu::aarch64::embedding::
+    torchao::cpu::aarch64::embedding::
         dequantize_embedding_row_at_idx_lut<weight_nbit>(
             output.data() + i * embedding_dim,
             packed.data(),
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_linear.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_linear.cpp
index bf99823052..69deeac517 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_linear.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_linear.cpp
@@ -40,7 +40,7 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32(
           has_bias,
           has_clamp);
 
-  using namespace torchao::kernels::cpu::aarch64::linear::
+  using namespace torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   std::vector<char> packed_activations(
@@ -116,7 +116,7 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16(
           has_bias,
           has_clamp);
 
-  using namespace torchao::kernels::cpu::aarch64::linear::
+  using namespace torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   std::vector<char> packed_activations(
@@ -192,7 +192,7 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16(
           has_bias,
           has_clamp);
 
-  using namespace torchao::kernels::cpu::aarch64::linear::
+  using namespace torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   std::vector<char> packed_activations(
@@ -438,7 +438,7 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_lut(
           has_bias,
           has_clamp);
 
-  using namespace torchao::kernels::cpu::aarch64::linear::
+  using namespace torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   std::vector<char> packed_activations(
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_lut.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_lut.cpp
index 6d9214eeba..cde73ba6ee 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_lut.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_lut.cpp
@@ -16,7 +16,7 @@
 
 namespace lut_utils = torchao::lut;
 namespace kernel_api =
-    torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
+    torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
 
 TEST(test_fp32_lut, LutLookup) {
   auto lut = torchao::get_random_vector(16, -1.0, 1.0);
@@ -53,7 +53,7 @@ void test_groupwise_lowbit_lut_kernel(
     bool has_bias,
     bool has_clamp) {
   namespace kernel_api =
-      torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
+      torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
   // 1. Generate test case
   auto test_case = torchao::groupwise_lowbit_weight_lut_test_case::
       generate_with_decoupled_grouping(
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_qmatmul.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_qmatmul.cpp
index 5d46937ccf..46b7a024f4 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_qmatmul.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_qmatmul.cpp
@@ -54,11 +54,11 @@ struct test_channelwise_8bit_channelwise_8bit_b<
         const int);
     kernel_fn_type kernel_fn = nullptr;
     if (use_gemm && (m % 4 == 0) && (n % 8 == 0) && (k % 16 == 0)) {
-      using namespace torchao::kernels::cpu::aarch64::quantized_matmul::
+      using namespace torchao::cpu::aarch64::quantized_matmul::
           channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot;
       kernel_fn = kernel<a_has_zeros, b_has_zeros, false, true>;
     } else {
-      using namespace torchao::kernels::cpu::aarch64::quantized_matmul::
+      using namespace torchao::cpu::aarch64::quantized_matmul::
           channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot;
       kernel_fn = kernel<a_has_zeros, b_has_zeros, false, true>;
     }
@@ -99,7 +99,7 @@ struct test_channelwise_8bit_channelwise_8bit_b<
         torchao::channelwise_8bit_a_channelwise_8bit_b_qmatmul_test_case::
             generate(m, k, n, a_has_zeros, a_has_zeros, false, false);
 
-    using namespace torchao::kernels::cpu::aarch64::quantized_matmul::
+    using namespace torchao::cpu::aarch64::quantized_matmul::
         channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal;
 
     std::vector<float> output(m * n);
@@ -416,11 +416,11 @@ static void test_fp32_a_input_channelwise_8bit_b(
 
   kernel_fn_type kernel_fn = nullptr;
   if (test_case.use_gemm_kernel() && (m % 4 == 0)) {
-    using namespace torchao::kernels::cpu::aarch64::quantized_matmul::
+    using namespace torchao::cpu::aarch64::quantized_matmul::
         fp32_a_input_channelwise_8bit_b_4x16x4_f32;
     kernel_fn = kernel<true, false, false>;
   } else {
-    using namespace torchao::kernels::cpu::aarch64::quantized_matmul::
+    using namespace torchao::cpu::aarch64::quantized_matmul::
         fp32_a_input_channelwise_8bit_b_1x16x4_f32;
     kernel_fn = kernel<true, false, false>;
   }
@@ -568,11 +568,11 @@ static void test_8bit_per_token_q_at_k_matmul_attention(
       const int);
   kernel_fn_type kernel_fn = nullptr;
   if ((s_q % 4 == 0) && (s_k % 8 == 0) && (d % 16 == 0)) {
-    using namespace torchao::kernels::cpu::aarch64::quantized_matmul::
+    using namespace torchao::cpu::aarch64::quantized_matmul::
         channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot;
     kernel_fn = kernel<true, true, false, true>;
   } else {
-    using namespace torchao::kernels::cpu::aarch64::quantized_matmul::
+    using namespace torchao::cpu::aarch64::quantized_matmul::
         channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot;
     kernel_fn = kernel<true, true, false, true>;
   }
@@ -660,7 +660,7 @@ static void test_fp32_attn_scores_at_v_matmul_attention(
       torchao::fp32_a_channelwise_8bit_b_attn_scores_at_v_test_case::generate(
           b, s_attn, s_v, h, d, transpose_v);
 
-  using namespace torchao::kernels::cpu::aarch64::quantized_matmul::
+  using namespace torchao::cpu::aarch64::quantized_matmul::
       fp32_a_input_channelwise_8bit_b_f32;
 
   size_t attn_b_stride = test_case.b_attn_stride;
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_quantization.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_quantization.cpp
index ebe3fbdfa8..402046445f 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_quantization.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_quantization.cpp
@@ -49,7 +49,7 @@ TEST(test_quantize, ExpectedOutput) {
   int qmin, qmax, zero;
   float vmin, vmax, scale;
 
-  torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
+  torchao::cpu::aarch64::reduction::find_min_and_max(
       vmin, vmax, vals.data(), vals.size());
 
   std::vector<int8_t> qvals(vals.size());
@@ -61,7 +61,7 @@ TEST(test_quantize, ExpectedOutput) {
     torchao::quantization::get_scale_and_zero(
         scale, zero, vmin, vmax, qmin, qmax);
 
-    torchao::kernels::cpu::aarch64::quantization::quantize(
+    torchao::cpu::aarch64::quantization::quantize(
         qvals.data(), vals.data(), vals.size(), scale, zero, qmin, qmax);
 
     for (int i = 0; i < vals.size(); ++i) {
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_reduction.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_reduction.cpp
index 44dbafafa5..dc2e106d66 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_reduction.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_reduction.cpp
@@ -16,7 +16,7 @@
 TEST(test_find_min_and_sum, SizeHasRemainderAfterDivideBy4) {
   auto vals = torchao::get_random_vector(19, -1.0, 1.0);
   float vmin, vmax;
-  torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
+  torchao::cpu::aarch64::reduction::find_min_and_max(
       vmin, vmax, vals.data(), vals.size());
 
   auto expected_vmin = *std::min_element(vals.begin(), vals.end());
@@ -28,7 +28,7 @@ TEST(test_find_min_and_sum, SizeHasRemainderAfterDivideBy4) {
 TEST(test_find_min_and_sum, SizeSmallerThan4) {
   auto vals = torchao::get_random_vector(3, -1.0, 1.0);
   float vmin, vmax;
-  torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
+  torchao::cpu::aarch64::reduction::find_min_and_max(
       vmin, vmax, vals.data(), vals.size());
 
   auto expected_vmin = *std::min_element(vals.begin(), vals.end());
@@ -39,7 +39,7 @@ TEST(test_find_min_and_sum, SizeSmallerThan4) {
 
 TEST(test_compute_sum, ExpectedOutput) {
   auto vals = torchao::get_random_lowbit_vector(/*size=*/19, /*int8*/ 3);
-  int sum = torchao::kernels::cpu::aarch64::reduction::compute_sum(
+  int sum = torchao::cpu::aarch64::reduction::compute_sum(
       (int8_t*)vals.data(), vals.size());
   int expected_sum = std::accumulate(vals.begin(), vals.end(), 0);
   EXPECT_EQ(sum, expected_sum);
@@ -47,7 +47,7 @@ TEST(test_compute_sum, ExpectedOutput) {
 
 TEST(test_compute_sum, SizeHasRemainderAfterDivideBy16) {
   auto vals = torchao::get_random_lowbit_vector(/*size=*/17, /*int8*/ 3);
-  int sum = torchao::kernels::cpu::aarch64::reduction::compute_sum(
+  int sum = torchao::cpu::aarch64::reduction::compute_sum(
       (int8_t*)vals.data(), vals.size());
   int expected_sum = std::accumulate(vals.begin(), vals.end(), 0);
   EXPECT_EQ(sum, expected_sum);
@@ -55,7 +55,7 @@ TEST(test_compute_sum, SizeHasRemainderAfterDivideBy16) {
 
 TEST(test_compute_sum, SizeSmallerThan16) {
   auto vals = torchao::get_random_lowbit_vector(/*size=*/3, /*int8*/ 3);
-  int sum = torchao::kernels::cpu::aarch64::reduction::compute_sum(
+  int sum = torchao::cpu::aarch64::reduction::compute_sum(
       (int8_t*)vals.data(), vals.size());
   int expected_sum = std::accumulate(vals.begin(), vals.end(), 0);
   EXPECT_EQ(sum, expected_sum);
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_utils.h b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_utils.h
index e5742d3f56..623c5215f5 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_utils.h
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_utils.h
@@ -55,13 +55,13 @@ auto generate_per_token_quantized_tensor(int m, int n, bool transposed) {
   torchao::quantization::get_qvals_range(
       qmin, qmax, /*nbit=*/8, /*is_symmetric=*/false);
   for (int m_idx = 0; m_idx < m; m_idx++) {
-    torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
+    torchao::cpu::aarch64::reduction::find_min_and_max(
         vmin, vmax, /*vals=*/activations.data() + m_idx * n, /*size=*/n);
     torchao::quantization::get_scale_and_zero(
         scale, zero, vmin, vmax, qmin, qmax);
     activation_scales[m_idx] = scale;
     activation_zeros[m_idx] = zero;
-    torchao::kernels::cpu::aarch64::quantization::quantize(
+    torchao::cpu::aarch64::quantization::quantize(
         /*qvals=*/activation_qvals.data() + m_idx * n,
         /*vals=*/activations.data() + m_idx * n,
         /*size=*/n,
@@ -209,7 +209,7 @@ struct channelwise_8bit_activation_groupwise_lowbit_weight_test_case {
 
     int n_groups = (n * k) / weight_group_size;
     for (int group_idx = 0; group_idx < n_groups; group_idx += 1) {
-      torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
+      torchao::cpu::aarch64::reduction::find_min_and_max(
           vmin,
           vmax,
           /*vals=*/weights.data() + group_idx * weight_group_size,
@@ -230,7 +230,7 @@ struct channelwise_8bit_activation_groupwise_lowbit_weight_test_case {
       weight_scales[group_idx] = scale;
       weight_zeros[group_idx] = zero;
 
-      torchao::kernels::cpu::aarch64::quantization::quantize(
+      torchao::cpu::aarch64::quantization::quantize(
           /*qvals=*/weight_qvals.data() + group_idx * weight_group_size,
           /*vals=*/weights.data() + group_idx * weight_group_size,
           /*size=*/weight_group_size,
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_weight_packing.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_weight_packing.cpp
index b64d4b2754..ed6e394899 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_weight_packing.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/tests/test_weight_packing.cpp
@@ -26,10 +26,10 @@ void test_weight_packing(
           has_bias,
           /*has_clamp*/ false);
 
-  //   using namespace torchao::kernels::cpu::aarch64::linear::packing;
+  //   using namespace torchao::cpu::aarch64::linear::packing;
 
   std::vector<char> packed_weights(
-      torchao::kernels::cpu::aarch64::linear::
+      torchao::cpu::aarch64::linear::
           channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing::
               packed_weights_size(
                   n,
@@ -56,7 +56,7 @@ void test_weight_packing(
   std::vector<int8_t> weight_zeros_out(test_case.weight_zeros.size());
   std::vector<float> bias_out(test_case.bias.size());
 
-  torchao::kernels::cpu::aarch64::linear::
+  torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing::
           pack_weights<weight_nbit, nr, kr, sr>(
               packed_weights.data(),
@@ -67,7 +67,7 @@ void test_weight_packing(
               weight_scales_in,
               weight_zeros_in,
               bias_in);
-  torchao::kernels::cpu::aarch64::linear::
+  torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing::
           unpack_weights<weight_nbit, nr, kr, sr>(
               weight_qvals_out.data(),
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/valpacking/interleave.cpp b/torchao/csrc/cpu/torch_free_kernels/aarch64/valpacking/interleave.cpp
index 3818fac2d0..05d347f1e2 100644
--- a/torchao/csrc/cpu/torch_free_kernels/aarch64/valpacking/interleave.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/aarch64/valpacking/interleave.cpp
@@ -42,7 +42,7 @@
 // but it can be something else if we are applying this method
 // to a matrix tile.
 
-void torchao::kernels::cpu::valpacking::interleave_data(
+void torchao::cpu::valpacking::interleave_data(
     void* data_interleaved,
     const void* data,
     int bytes_per_val,
diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/bitpack.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/bitpack.h
index c28c6ec90d..ba981c6f8a 100644
--- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/bitpack.h
+++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/bitpack.h
@@ -16,7 +16,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint7.h>
 #include <cassert>
 
-namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace torchao::cpu::fallback::bitpacking {
 namespace internal {
 /**
  * @brief Packs 128 unsigned 8-bit integers into a packed format of 'nbit' bits.
@@ -176,4 +176,4 @@ inline void unpack_128_lowbit_values_with_lut(
   }
 }
 } // namespace internal
-} // namespace torchao::kernels::cpu::fallback::bitpacking
+} // namespace torchao::cpu::fallback::bitpacking
diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint1.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint1.h
index 08e231716b..da0e17c8fc 100644
--- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint1.h
+++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint1.h
@@ -9,7 +9,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/macro.h>
 #include <cstdint>
 
-namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace torchao::cpu::fallback::bitpacking {
 namespace internal {
 
 /**
@@ -151,4 +151,4 @@ TORCHAO_ALWAYS_INLINE inline void unpack_128_uint1_values(
   }
 }
 } // namespace internal
-} // namespace torchao::kernels::cpu::fallback::bitpacking
+} // namespace torchao::cpu::fallback::bitpacking
diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint2.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint2.h
index 9dc1cce463..c054c7fc9d 100644
--- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint2.h
+++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint2.h
@@ -8,7 +8,7 @@
 
 #include <torchao/csrc/cpu/torch_free_kernels/macro.h>
 #include <cstdint>
-namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace torchao::cpu::fallback::bitpacking {
 namespace internal {
 
 /**
@@ -116,4 +116,4 @@ TORCHAO_ALWAYS_INLINE inline void unpack_64_uint2_values(
 }
 
 } // namespace internal
-} // namespace torchao::kernels::cpu::fallback::bitpacking
+} // namespace torchao::cpu::fallback::bitpacking
diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint3.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint3.h
index 277317d5a2..314caee0a7 100644
--- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint3.h
+++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint3.h
@@ -9,7 +9,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/macro.h>
 #include <cstdint>
 
-namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace torchao::cpu::fallback::bitpacking {
 namespace internal {
 
 /**
@@ -192,4 +192,4 @@ TORCHAO_ALWAYS_INLINE inline void unpack_128_uint3_values(
 }
 
 } // namespace internal
-} // namespace torchao::kernels::cpu::fallback::bitpacking
+} // namespace torchao::cpu::fallback::bitpacking
diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint4.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint4.h
index 4b98a47143..528507a651 100644
--- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint4.h
+++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint4.h
@@ -9,7 +9,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/macro.h>
 #include <cstdint>
 
-namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace torchao::cpu::fallback::bitpacking {
 namespace internal {
 /**
  * @brief Packs 2 bytes, each holding a 4-bit value (0-15), into a single
@@ -106,4 +106,4 @@ TORCHAO_ALWAYS_INLINE inline void unpack_32_uint4_values(
   }
 }
 } // namespace internal
-} // namespace torchao::kernels::cpu::fallback::bitpacking
+} // namespace torchao::cpu::fallback::bitpacking
diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint5.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint5.h
index 3de577e05f..86c3c8d1f2 100644
--- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint5.h
+++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint5.h
@@ -9,7 +9,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/macro.h>
 #include <cstdint>
 
-namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace torchao::cpu::fallback::bitpacking {
 namespace internal {
 
 /**
diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint6.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint6.h
index 2fcd9334ec..347f772951 100644
--- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint6.h
+++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint6.h
@@ -9,7 +9,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/macro.h>
 #include <cstdint>
 
-namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace torchao::cpu::fallback::bitpacking {
 namespace internal {
 
 /**
@@ -139,4 +139,4 @@ TORCHAO_ALWAYS_INLINE inline void unpack_64_uint6_values(
 }
 
 } // namespace internal
-} // namespace torchao::kernels::cpu::fallback::bitpacking
+} // namespace torchao::cpu::fallback::bitpacking
diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint7.h b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint7.h
index 60493a20b2..6d045132f9 100644
--- a/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint7.h
+++ b/torchao/csrc/cpu/torch_free_kernels/fallback/bitpacking/uint7.h
@@ -9,7 +9,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/macro.h>
 #include <cstdint>
 
-namespace torchao::kernels::cpu::fallback::bitpacking {
+namespace torchao::cpu::fallback::bitpacking {
 namespace internal {
 /**
  * @brief Packs 8 bytes, each holding a 7-bit value (0-127), into 7 bytes.
@@ -137,4 +137,4 @@ TORCHAO_ALWAYS_INLINE inline void unpack_128_uint7_values(
 }
 
 } // namespace internal
-} // namespace torchao::kernels::cpu::fallback::bitpacking
+} // namespace torchao::cpu::fallback::bitpacking
diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/channelwise_8bit_a_channelwise_8bit_b.h b/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/channelwise_8bit_a_channelwise_8bit_b.h
index 3b070eb2b3..5e7b7eb9f7 100644
--- a/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/channelwise_8bit_a_channelwise_8bit_b.h
+++ b/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/channelwise_8bit_a_channelwise_8bit_b.h
@@ -8,7 +8,7 @@
 
 #include <cstdint>
 
-namespace torchao::kernels::cpu::fallback::quantized_matmul {
+namespace torchao::cpu::fallback::quantized_matmul {
 namespace channelwise_8bit_a_channelwise_8bit_b::internal {
 
 template <
@@ -85,10 +85,9 @@ struct KernelImpl<true, true, false, b_transposed> {
 
 } // namespace
   // channelwise_8bit_a_channelwise_8bit_b::internal
-} // namespace torchao::kernels::cpu::fallback::quantized_matmul
+} // namespace torchao::cpu::fallback::quantized_matmul
 
-// TODO: Remove all ::kernels. No need for extra namespace.
-namespace torchao::kernels::cpu::fallback::quantized_matmul {
+namespace torchao::cpu::fallback::quantized_matmul {
 namespace channelwise_8bit_a_channelwise_8bit_b {
 template <
     bool a_has_zeros,
@@ -130,4 +129,4 @@ void kernel(
           rhs_qparams_stride);
 }
 } // namespace channelwise_8bit_a_channelwise_8bit_b
-} // namespace torchao::kernels::cpu::fallback::quantized_matmul
+} // namespace torchao::cpu::fallback::quantized_matmul
diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/fp32_a_channelwise_8bit_b_fp32_c.h b/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/fp32_a_channelwise_8bit_b_fp32_c.h
index 58e2853617..f89b281891 100644
--- a/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/fp32_a_channelwise_8bit_b_fp32_c.h
+++ b/torchao/csrc/cpu/torch_free_kernels/fallback/matmul/fp32_a_channelwise_8bit_b_fp32_c.h
@@ -8,8 +8,7 @@
 
 #include <cassert>
 
-// TODO: Remove all ::kernels. No need for extra namespace.
-namespace torchao::kernels::cpu::fallback::quantized_matmul {
+namespace torchao::cpu::fallback::quantized_matmul {
 namespace fp32_a_input_channelwise_8bit_b_fp32 {
 template <bool b_has_zeros, bool a_transposed, bool b_transposed>
 void kernel(
@@ -47,4 +46,4 @@ void kernel(
   }
 }
 } // namespace fp32_a_input_channelwise_8bit_b_fp32
-} // namespace torchao::kernels::cpu::fallback::quantized_matmul
+} // namespace torchao::cpu::fallback::quantized_matmul
diff --git a/torchao/csrc/cpu/torch_free_kernels/fallback/tests/test_bitpacking.cpp b/torchao/csrc/cpu/torch_free_kernels/fallback/tests/test_bitpacking.cpp
index 32177e63da..f672bc5d17 100644
--- a/torchao/csrc/cpu/torch_free_kernels/fallback/tests/test_bitpacking.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/fallback/tests/test_bitpacking.cpp
@@ -23,9 +23,9 @@ TEST(FallbackBitpackingTest, PackUnpack8_uint1) {
   std::vector<uint8_t> packed(packed_bytes);
   std::vector<uint8_t> unpacked(unpacked_bytes);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_8_uint1_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_8_uint1_values(
       packed.data(), input.data());
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_8_uint1_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_8_uint1_values(
       unpacked.data(), packed.data());
 
   ASSERT_EQ(input, unpacked);
@@ -38,9 +38,9 @@ TEST(FallbackBitpackingTest, PackUnpack4_uint2) {
   std::vector<uint8_t> packed(packed_bytes);
   std::vector<uint8_t> unpacked(unpacked_bytes);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_4_uint2_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_4_uint2_values(
       packed.data(), input.data());
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_4_uint2_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_4_uint2_values(
       unpacked.data(), packed.data());
 
   ASSERT_EQ(input, unpacked);
@@ -53,9 +53,9 @@ TEST(FallbackBitpackingTest, PackUnpack8_uint3) {
   std::vector<uint8_t> packed(packed_bytes);
   std::vector<uint8_t> unpacked(unpacked_bytes);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_8_uint3_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_8_uint3_values(
       packed.data(), input.data());
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_8_uint3_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_8_uint3_values(
       unpacked.data(), packed.data());
 
   ASSERT_EQ(input, unpacked);
@@ -68,9 +68,9 @@ TEST(FallbackBitpackingTest, PackUnpack32_uint4) {
   std::vector<uint8_t> packed(packed_bytes);
   std::vector<uint8_t> unpacked(unpacked_bytes);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
       packed.data(), input.data());
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_32_uint4_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_32_uint4_values(
       unpacked.data(), packed.data());
 
   ASSERT_EQ(input, unpacked);
@@ -83,9 +83,9 @@ TEST(FallbackBitpackingTest, PackUnpack8_uint5) {
   std::vector<uint8_t> packed(packed_bytes);
   std::vector<uint8_t> unpacked(unpacked_bytes);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_8_uint5_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_8_uint5_values(
       packed.data(), input.data());
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_8_uint5_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_8_uint5_values(
       unpacked.data(), packed.data());
 
   ASSERT_EQ(input, unpacked);
@@ -98,9 +98,9 @@ TEST(FallbackBitpackingTest, PackUnpack4_uint6) {
   std::vector<uint8_t> packed(packed_bytes);
   std::vector<uint8_t> unpacked(unpacked_bytes);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_4_uint6_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_4_uint6_values(
       packed.data(), input.data());
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_4_uint6_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_4_uint6_values(
       unpacked.data(), packed.data());
 
   ASSERT_EQ(input, unpacked);
@@ -113,9 +113,9 @@ TEST(FallbackBitpackingTest, PackUnpack8_uint7) {
   std::vector<uint8_t> packed(packed_bytes);
   std::vector<uint8_t> unpacked(unpacked_bytes);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::pack_8_uint7_values(
+  torchao::cpu::fallback::bitpacking::internal::pack_8_uint7_values(
       packed.data(), input.data());
-  torchao::kernels::cpu::fallback::bitpacking::internal::unpack_8_uint7_values(
+  torchao::cpu::fallback::bitpacking::internal::unpack_8_uint7_values(
       unpacked.data(), packed.data());
 
   ASSERT_EQ(input, unpacked);
@@ -131,9 +131,9 @@ void test_bitpacking_128_lowbit_values() {
   std::vector<uint8_t> packed(packed_bytes);
   std::vector<int8_t> unpacked(unpacked_bytes);
 
-  torchao::kernels::cpu::fallback::bitpacking::internal::
+  torchao::cpu::fallback::bitpacking::internal::
       pack_128_lowbit_int_values<nbit>(packed.data(), input.data());
-  torchao::kernels::cpu::fallback::bitpacking::internal::
+  torchao::cpu::fallback::bitpacking::internal::
       unpack_128_lowbit_int_values<nbit>(unpacked.data(), packed.data());
 
   ASSERT_EQ(input, unpacked);
@@ -159,31 +159,31 @@ void test_bitpacking_128_lowbit_values_with_lut() {
   // 3. Pack the indices
   std::vector<uint8_t> packed(packed_bytes);
   if constexpr (nbit == 1)
-    torchao::kernels::cpu::fallback::bitpacking::internal::
+    torchao::cpu::fallback::bitpacking::internal::
         pack_128_uint1_values(packed.data(), indices.data());
   if constexpr (nbit == 2) {
-    torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint2_values(
+    torchao::cpu::fallback::bitpacking::internal::pack_64_uint2_values(
         packed.data(), indices.data());
-    torchao::kernels::cpu::fallback::bitpacking::internal::pack_64_uint2_values(
+    torchao::cpu::fallback::bitpacking::internal::pack_64_uint2_values(
         packed.data() + 16, indices.data() + 64);
   }
   if constexpr (nbit == 3)
-    torchao::kernels::cpu::fallback::bitpacking::internal::
+    torchao::cpu::fallback::bitpacking::internal::
         pack_128_uint3_values(packed.data(), indices.data());
   if constexpr (nbit == 4) {
-    torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
+    torchao::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
         packed.data(), indices.data());
-    torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
+    torchao::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
         packed.data() + 16, indices.data() + 32);
-    torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
+    torchao::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
         packed.data() + 32, indices.data() + 64);
-    torchao::kernels::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
+    torchao::cpu::fallback::bitpacking::internal::pack_32_uint4_values(
         packed.data() + 48, indices.data() + 96);
   }
 
   // 4. Unpack using the LUT function
   std::vector<int8_t> unpacked(unpacked_bytes);
-  torchao::kernels::cpu::fallback::bitpacking::internal::
+  torchao::cpu::fallback::bitpacking::internal::
       unpack_128_lowbit_values_with_lut<nbit>(
           unpacked.data(), packed.data(), lut.data());
 
diff --git a/torchao/csrc/cpu/torch_free_kernels/interface/quantized_matmul.h b/torchao/csrc/cpu/torch_free_kernels/interface/quantized_matmul.h
index da3fd32747..b8d99dc563 100644
--- a/torchao/csrc/cpu/torch_free_kernels/interface/quantized_matmul.h
+++ b/torchao/csrc/cpu/torch_free_kernels/interface/quantized_matmul.h
@@ -15,7 +15,7 @@
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/matmul/matmul.h>
 #endif // defined(__aarch64__) && defined(__ARM_NEON)
 
-namespace torchao::kernels::cpu::quantized_matmul {
+namespace torchao::cpu::quantized_matmul {
 
 /*
 a_stride_m: stride of a in memory to indiciate how far apart each row is.
@@ -78,10 +78,10 @@ get_int8_a_int8_b_channelwise_qmatmul(
   if (b_transposed) {
     a_stride_m = k;
     b_stride_n = k;
-    return torchao::kernels::cpu::fallback::quantized_matmul::
+    return torchao::cpu::fallback::quantized_matmul::
         channelwise_8bit_a_channelwise_8bit_b::kernel<true, true, false, true>;
   } else {
-    return torchao::kernels::cpu::fallback::quantized_matmul::
+    return torchao::cpu::fallback::quantized_matmul::
         channelwise_8bit_a_channelwise_8bit_b::kernel<true, true, false, false>;
   }
 }
@@ -144,13 +144,13 @@ get_fp32_a_input_channelwise_8bit_b_f32_c_matmul(
   if (b_transposed) {
     a_stride_m = k;
     b_stride_n = k;
-    return torchao::kernels::cpu::fallback::quantized_matmul::
+    return torchao::cpu::fallback::quantized_matmul::
         fp32_a_input_channelwise_8bit_b_fp32::kernel<true, false, true>;
   } else {
     a_stride_m = k;
     b_stride_n = n;
-    return torchao::kernels::cpu::fallback::quantized_matmul::
+    return torchao::cpu::fallback::quantized_matmul::
         fp32_a_input_channelwise_8bit_b_fp32::kernel<true, false, false>;
   }
 }
-} // namespace torchao::kernels::cpu::quantized_matmul
+} // namespace torchao::cpu::quantized_matmul
diff --git a/torchao/csrc/cpu/torch_free_kernels/interface/test_qmatmul_interface.cpp b/torchao/csrc/cpu/torch_free_kernels/interface/test_qmatmul_interface.cpp
index 5ce1593732..8f091a9138 100644
--- a/torchao/csrc/cpu/torch_free_kernels/interface/test_qmatmul_interface.cpp
+++ b/torchao/csrc/cpu/torch_free_kernels/interface/test_qmatmul_interface.cpp
@@ -297,7 +297,7 @@ struct test_channelwise_8bit_channelwise_8bit_b<
             m, k, n, a_has_zeros, a_has_zeros, false, true, stride);
 
     int a_stride_m, b_stride_n;
-    auto kernel = torchao::kernels::cpu::quantized_matmul::
+    auto kernel = torchao::cpu::quantized_matmul::
         get_int8_a_int8_b_channelwise_qmatmul(
             m, n, k, false, true, a_stride_m, b_stride_n);
     a_stride_m = a_stride_m * stride;
@@ -563,7 +563,7 @@ static void test_fp32_a_input_channelwise_8bit_b(
   test_case.execute(beta);
 
   int a_stride_m, b_stride_n;
-  auto kernel = torchao::kernels::cpu::quantized_matmul::
+  auto kernel = torchao::cpu::quantized_matmul::
       get_fp32_a_input_channelwise_8bit_b_f32_c_matmul(
           m, n, k, false, false, a_stride_m, b_stride_n);
   b_stride_n = b_stride_n * stride;
diff --git a/torchao/experimental/kernels/mps/src/dispatch.h b/torchao/experimental/kernels/mps/src/dispatch.h
index a04452cece..92b1f619ef 100644
--- a/torchao/experimental/kernels/mps/src/dispatch.h
+++ b/torchao/experimental/kernels/mps/src/dispatch.h
@@ -8,7 +8,7 @@
 
 #include <functional>
 
-namespace torchao::kernels::mps::lowbit::dispatch {
+namespace torchao::mps::lowbit::dispatch {
 
 inline void dispatch_mm(
     id<MTLComputeCommandEncoder> encoder,
@@ -48,4 +48,4 @@ inline void dispatch_qmv_fast(
       threadsPerThreadgroup:MTLSizeMake(32, 2, 1)];
 }
 
-} // namespace torchao::kernels::mps::lowbit::dispatch
+} // namespace torchao::mps::lowbit::dispatch
diff --git a/torchao/experimental/kernels/mps/src/lowbit.h b/torchao/experimental/kernels/mps/src/lowbit.h
index 8071398eba..c7008034c0 100644
--- a/torchao/experimental/kernels/mps/src/lowbit.h
+++ b/torchao/experimental/kernels/mps/src/lowbit.h
@@ -14,7 +14,7 @@
 #include <torchao/experimental/kernels/mps/src/metal_shader_lib.h> // metal_lowbit_quantized_lib
 #include <torchao/experimental/kernels/mps/src/packing.h>
 
-namespace torchao::kernels::mps::lowbit {
+namespace torchao::mps::lowbit {
 namespace {
 
 template <int nbit>
@@ -175,4 +175,4 @@ struct LowBitQuantWeights {
   static constexpr auto pack = LowBitConfig<nbit>::packing_fn;
 };
 
-} // namespace torchao::kernels::mps::lowbit
+} // namespace torchao::mps::lowbit
diff --git a/torchao/experimental/kernels/mps/src/packing.h b/torchao/experimental/kernels/mps/src/packing.h
index 5412c04a12..18fa19ca1b 100644
--- a/torchao/experimental/kernels/mps/src/packing.h
+++ b/torchao/experimental/kernels/mps/src/packing.h
@@ -4,7 +4,7 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
-namespace torchao::kernels::mps::lowbit::packing {
+namespace torchao::mps::lowbit::packing {
 
 /**
  * Pack weights into a smaller number of bits.
@@ -187,4 +187,4 @@ pack<7>(const uint8_t* w_ptr, uint8_t* b_ptr, int32_t N, int32_t K) {
   }
 }
 
-} // namespace torchao::kernels::mps::lowbit::packing
+} // namespace torchao::mps::lowbit::packing
diff --git a/torchao/experimental/kernels/mps/test/test_lowbit.mm b/torchao/experimental/kernels/mps/test/test_lowbit.mm
index 8481e5cef6..e00570247a 100644
--- a/torchao/experimental/kernels/mps/test/test_lowbit.mm
+++ b/torchao/experimental/kernels/mps/test/test_lowbit.mm
@@ -37,7 +37,7 @@
   return rc;
 }
 
-namespace torchao::kernels::mps::lowbit {
+namespace torchao::mps::lowbit {
 
 // Reference CPU implementation of lowbit quantized linear
 template <typename T>
@@ -184,11 +184,11 @@ void allocBuffers(id<MTLDevice> device) {
   id<MTLBuffer> buf_Z; // (K/group_size)xN elements
 };
 
-} // namespace torchao::kernels::mps::lowbit
+} // namespace torchao::mps::lowbit
 
 template <typename T, int nbit>
 void run_test(int32_t m, int32_t k, int32_t n, int32_t group_size) {
-  torchao::kernels::mps::lowbit::LowBitTester<T, nbit> tester(
+  torchao::mps::lowbit::LowBitTester<T, nbit> tester(
       m, k, n, group_size);
   tester.init();
   tester.pack();
diff --git a/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_aten.mm b/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_aten.mm
index b8ecb8c7aa..d0acd7d695 100644
--- a/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_aten.mm
+++ b/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_aten.mm
@@ -12,7 +12,7 @@
 #include <torchao/experimental/kernels/mps/src/lowbit.h>
 // clang-format on
 
-namespace torchao::kernels::mps::lowbit::aten {
+namespace torchao::mps::lowbit::aten {
 
 using Tensor = at::Tensor;
 using namespace at::native::mps;
@@ -240,7 +240,7 @@ Tensor pack_weights_cpu_kernel(const Tensor& W) {
   m.impl("_linear_fp_act_7bit_weight", &linear_mps_kernel_meta<7>);
 }
 
-} // namespace torchao::kernels::mps::lowbit::aten
+} // namespace torchao::mps::lowbit::aten
 
 
 // c-shim wrappers for AOTInductor
diff --git a/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_executorch.mm b/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_executorch.mm
index 22693b417e..ca0c2e5a0a 100644
--- a/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_executorch.mm
+++ b/torchao/experimental/ops/mps/linear_fp_act_xbit_weight_executorch.mm
@@ -94,7 +94,7 @@ bool check_linear_mps_args(
   auto N = B.size(0);
   auto K = A.size(1);
 
-  torchao::kernels::mps::lowbit::LowBitQuantWeights<nbit>::linear(
+  torchao::mps::lowbit::LowBitQuantWeights<nbit>::linear(
       {getMTLBufferStorage(A), A.storage_offset() * A.element_size()},
       {getMTLBufferStorage(B), B.storage_offset() * B.element_size()},
       group_size,