Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ template <int weight_nbit, bool has_weight_zeros, bool has_bias, bool has_clamp>
UKernelConfig get_ukernel_config() {
UKernelConfig config;

namespace ukernel = torchao::kernels::cpu::aarch64::linear::
namespace ukernel = torchao::cpu::aarch64::linear::
channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot;
config.mr = 1;
config.nr = 8;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ Tensor embedding_out_cpu(
}
TORCHAO_CHECK(index >= 0 && index < num_embeddings, "index out of bounds");
#if defined(TORCHAO_BUILD_CPU_AARCH64)
torchao::kernels::cpu::aarch64::embedding::embedding<weight_nbit>(
torchao::cpu::aarch64::embedding::embedding<weight_nbit>(
out.mutable_data_ptr<float>() + idx * embedding_dim,
embedding_dim,
group_size,
Expand Down Expand Up @@ -199,7 +199,7 @@ Tensor pack_embedding_cpu(const Tensor& weight_qvals) {

torchao::parallel_1d(0, num_embeddings, [&](int64_t idx) {
#if defined(TORCHAO_BUILD_CPU_AARCH64)
torchao::kernels::cpu::aarch64::embedding::pack_embedding_weight_qvals<
torchao::cpu::aarch64::embedding::pack_embedding_weight_qvals<
weight_nbit>(
out.mutable_data_ptr<int8_t>() +
torchao::ops::PackedWeightsHeader::size(),
Expand Down Expand Up @@ -289,7 +289,7 @@ Tensor shared_embedding_out_cpu(
}
TORCHAO_CHECK(index >= 0 && index < n, "index out of bounds");
#if defined(TORCHAO_BUILD_CPU_AARCH64)
torchao::kernels::cpu::aarch64::embedding::
torchao::cpu::aarch64::embedding::
shared_embedding<weight_nbit, nr, kr, sr>(
out.mutable_data_ptr<float>() + idx * k,
packed_weights.const_data_ptr<int8_t>() +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ void register_ukernel_config(
int preferred_alignment = 16;

namespace kernel_api =
torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut;

using kernel_fn_ptr_t =
decltype(&kernel_api::groupwise_lowbit_weight_lut_kernel_1x4x32<
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ void register_ukernel_config_universal(
torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_universal,
weight_nbit);

namespace kernel = torchao::kernels::cpu::aarch64::linear::
namespace kernel = torchao::cpu::aarch64::linear::
channelwise_8bit_activation_groupwise_lowbit_weight;

constexpr bool has_lut = false;
Expand Down Expand Up @@ -181,7 +181,7 @@ void register_ukernel_config_lut(
int preferred_alignment = 16;

#if defined(TORCHAO_ENABLE_ARM_NEON_DOT)
namespace kernel = torchao::kernels::cpu::aarch64::linear::
namespace kernel = torchao::cpu::aarch64::linear::
channelwise_8bit_activation_groupwise_lowbit_weight;

if (!cpuinfo_has_arm_neon_dot()) {
Expand Down Expand Up @@ -232,7 +232,7 @@ void register_ukernel_config_lut(
template <typename kernel_struct>
UKernelConfig::linear_config_type
get_linear_config_kleidi(int n_step, int nr, int kr, int sr) {
namespace op = torchao::kernels::cpu::aarch64::kleidi::
namespace op = torchao::cpu::aarch64::kleidi::
kai_matmul_clamp_f32_qai8dxp_qsi4c32p;
assert(n_step == kernel_struct::get_ukernel().get_n_step());
assert(nr == kernel_struct::get_ukernel().get_nr());
Expand All @@ -256,7 +256,7 @@ void register_ukernel_config_kleidi(
throw std::runtime_error("Failed to initialize cpuinfo!");
}
check_format(format, torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_kleidi_ai, weight_nbit);
namespace op = torchao::kernels::cpu::aarch64::kleidi::
namespace op = torchao::cpu::aarch64::kleidi::
kai_matmul_clamp_f32_qai8dxp_qsi4c32p;

auto uk = UKernelConfig::make(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ using namespace torchao::ops::groupwise_lowbit_weight_lut;
template <int weight_nbit, bool has_scales>
UKernelConfig get_ukernel_config(bool has_bias) {
namespace kernel =
torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut;

int preferred_alignment = 16;
int n_step = 8;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#if defined(TORCHAO_ENABLE_KLEIDI)
#include <torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h>
using namespace torchao::kernels::cpu::aarch64::kleidi::
using namespace torchao::cpu::aarch64::kleidi::
kai_matmul_clamp_f32_qai8dxp_qsi4c32p;
#endif // TORCHAO_ENABLE_KLEIDI

Expand All @@ -27,7 +27,7 @@ using namespace torchao::ops::linear_8bit_act_xbit_weight;

template <int weight_nbit, bool has_weight_zeros, bool has_bias, bool has_clamp, bool has_lut = false>
UKernelConfig get_ukernel_config() {
namespace kernel = torchao::kernels::cpu::aarch64::linear::
namespace kernel = torchao::cpu::aarch64::linear::
channelwise_8bit_activation_groupwise_lowbit_weight;

int preferred_alignment = 16;
Expand Down Expand Up @@ -213,7 +213,7 @@ enum kai_kernel_id {

template <typename kernel_struct>
UKernelConfig get_ukernel_config_kleidi_impl() {
namespace op = torchao::kernels::cpu::aarch64::kleidi::
namespace op = torchao::cpu::aarch64::kleidi::
kai_matmul_clamp_f32_qai8dxp_qsi4c32p;

auto uk = kernel_struct::get_ukernel();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot(
int k = state.range(2);
int group_size = state.range(3);

using namespace torchao::kernels::cpu::aarch64::linear::
using namespace torchao::cpu::aarch64::linear::
channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot;

auto test_case = torchao::
Expand Down Expand Up @@ -91,7 +91,7 @@ channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot(
int k = state.range(2);
int group_size = state.range(3);

using namespace torchao::kernels::cpu::aarch64::linear::
using namespace torchao::cpu::aarch64::linear::
channelwise_8bit_activation_groupwise_lowbit_weight;

auto test_case = torchao::
Expand Down Expand Up @@ -163,7 +163,7 @@ channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot(
int k = state.range(2);
int group_size = state.range(3);

using namespace torchao::kernels::cpu::aarch64::linear::
using namespace torchao::cpu::aarch64::linear::
channelwise_8bit_activation_groupwise_lowbit_weight;

auto test_case = torchao::
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ static void benchmark_quantize(benchmark::State& state) {
float vmin, vmax, scale;

for (auto _ : state) {
torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
torchao::cpu::aarch64::reduction::find_min_and_max(
vmin, vmax, vals.data(), vals.size());

torchao::quantization::get_qvals_range(
Expand All @@ -30,7 +30,7 @@ static void benchmark_quantize(benchmark::State& state) {
torchao::quantization::get_scale_and_zero(
scale, zero, vmin, vmax, qmin, qmax);

torchao::kernels::cpu::aarch64::quantization::quantize(
torchao::cpu::aarch64::quantization::quantize(
qvals.data(), vals.data(), vals.size(), scale, zero, qmin, qmax);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include <cassert>
#include <vector>

namespace torchao::kernels::cpu::aarch64::embedding {
namespace torchao::cpu::aarch64::embedding {

namespace internal {

Expand Down Expand Up @@ -353,7 +353,7 @@ inline void shared_embedding(
n_idx = n_idx * nr;
int j = index - n_idx;

torchao::kernels::cpu::aarch64::linear::
torchao::cpu::aarch64::linear::
channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing::
unpack_weights_at_n_idx<weight_nbit, nr, kr, sr>(
weight_qvals.data(),
Expand Down Expand Up @@ -381,6 +381,6 @@ inline void shared_embedding(
}
}

} // namespace torchao::kernels::cpu::aarch64::embedding
} // namespace torchao::cpu::aarch64::embedding

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#include <cassert>
#include <vector>

namespace torchao::kernels::cpu::aarch64::embedding {
namespace torchao::cpu::aarch64::embedding {

/**
* @brief Calculates the size in bytes for a single row of packed embeddings.
Expand Down Expand Up @@ -377,6 +377,6 @@ inline void dequantize_embedding_row_at_idx_lut(
vst1q_f32(out + j + 12, out3);
}
}
} // namespace torchao::kernels::cpu::aarch64::embedding
} // namespace torchao::cpu::aarch64::embedding

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

#include <torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/pack.h>

namespace torchao::kernels::cpu::aarch64::kleidi {
namespace torchao::cpu::aarch64::kleidi {

// Helper functions
// TODO: find a better place for these?
Expand Down Expand Up @@ -319,4 +319,4 @@ DEFINE_KERNEL_STRUCT(matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm);
#undef DEFINE_KERNEL_STRUCT

} // namespace kai_matmul_clamp_f32_qai8dxp_qsi4c32p
} // namespace torchao::kernels::cpu::aarch64::kleidi
} // namespace torchao::cpu::aarch64::kleidi
4 changes: 2 additions & 2 deletions torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/pack.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h>
#include <kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.h>

namespace torchao::kernels::cpu::aarch64::kleidi {
namespace torchao::cpu::aarch64::kleidi {
namespace kai_matmul_clamp_f32_qai8dxp_qsi4c32p {
// All the kernels in this namespace use following packing interface/routines.
// TODO: move these to Kleidi as interfaces?
Expand Down Expand Up @@ -115,4 +115,4 @@ lhs_packing get_lhs_packing() {
}

} // namespace kai_matmul_clamp_f32_qai8dxp_qsi4c32p
} // namespace torchao::kernels::cpu::aarch64::kleidi
} // namespace torchao::cpu::aarch64::kleidi
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#include <torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h>
#include <torchao/csrc/cpu/torch_free_kernels/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h>

namespace torchao::kernels::cpu::aarch64::linear::
namespace torchao::cpu::aarch64::linear::
channelwise_8bit_activation_groupwise_lowbit_weight {

inline size_t packed_activations_size(
Expand Down Expand Up @@ -148,7 +148,7 @@ void pack_weights_with_lut(
(void)nr; // unused
(void)kr; // unused
(void)sr; // unused
torchao::kernels::cpu::aarch64::linear::
torchao::cpu::aarch64::linear::
channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing::
pack_weights_with_lut<weight_nbit, nr_, kr_, sr_>(
packed_weights,
Expand Down Expand Up @@ -298,6 +298,6 @@ void kernel_1x8x16_f32_neondot(
}

} // namespace
// torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight
// torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <torchao/csrc/cpu/torch_free_kernels/aarch64/bitpacking/bitpack.h>
#include <cassert>

namespace torchao::kernels::cpu::aarch64::linear::
namespace torchao::cpu::aarch64::linear::
channelwise_8bit_activation_groupwise_lowbit_weight::kernel {

namespace internal {
Expand Down Expand Up @@ -174,6 +174,6 @@ void kernel_1x1x32_f32_neondot(
}

} // namespace
// torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel
// torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <cassert>
#include <cstring>

namespace torchao::kernels::cpu::aarch64::linear::
namespace torchao::cpu::aarch64::linear::
channelwise_8bit_activation_groupwise_lowbit_weight::kernel {
namespace internal {
inline float32x4_t clamp(float32x4_t x, float min, float max) {
Expand Down Expand Up @@ -245,6 +245,6 @@ void kernel_1x4x16_f32_neondot(
}

} // namespace
// torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel
// torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <cassert>
#include <cstring>

namespace torchao::kernels::cpu::aarch64::linear::
namespace torchao::cpu::aarch64::linear::
channelwise_8bit_activation_groupwise_lowbit_weight::kernel {
namespace internal {

Expand Down Expand Up @@ -361,6 +361,6 @@ void kernel_1x8x16_f32_neondot(
}

} // namespace
// torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel
// torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::kernel

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <torchao/csrc/cpu/torch_free_kernels/aarch64/reduction/reduction.h>
#include <cassert>

namespace torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing {
namespace torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing {

// Prepares activation data for kernel_impl.
// Per m_idx (row), activations are stored as follows:
Expand Down Expand Up @@ -72,7 +72,7 @@ void inline pack_activations(
qmin, qmax, /*nbit=*/8, /*is_symmetric=*/false);

for (int m_idx = 0; m_idx < m; m_idx++) {
torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
torchao::cpu::aarch64::reduction::find_min_and_max(
vmin, vmax, activations, k);
torchao::quantization::get_scale_and_zero(
scale, zero, vmin, vmax, qmin, qmax);
Expand All @@ -86,7 +86,7 @@ void inline pack_activations(

if (has_weight_zeros) {
for (int k_idx = 0; k_idx < k; k_idx += group_size) {
torchao::kernels::cpu::aarch64::quantization::quantize(
torchao::cpu::aarch64::quantization::quantize(
/*qvals=*/(int8_t*)activation_data_byte_ptr,
/*vals=*/activations,
/*size=*/group_size,
Expand All @@ -95,7 +95,7 @@ void inline pack_activations(
/*qmin=*/qmin,
/*qmax=*/qmax);

qvals_sum = torchao::kernels::cpu::aarch64::reduction::compute_sum(
qvals_sum = torchao::cpu::aarch64::reduction::compute_sum(
/*vals=*/(int8_t*)activation_data_byte_ptr,
/*size=*/group_size);

Expand All @@ -107,7 +107,7 @@ void inline pack_activations(
activations += group_size;
}
} else {
torchao::kernels::cpu::aarch64::quantization::quantize(
torchao::cpu::aarch64::quantization::quantize(
/*qvals=*/(int8_t*)activation_data_byte_ptr,
/*vals=*/activations,
/*size=*/k,
Expand All @@ -121,6 +121,6 @@ void inline pack_activations(
}
}

} // namespace torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing
} // namespace torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::activation_packing

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include <array>
#include <cstring>

namespace torchao::kernels::cpu::aarch64::linear::
namespace torchao::cpu::aarch64::linear::
channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing {

namespace internal {
Expand Down Expand Up @@ -583,6 +583,6 @@ size_t inline packed_weights_with_lut_size(
}

} // namespace
// torchao::kernels::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing
// torchao::cpu::aarch64::linear::channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_activations.h>
#include <torchao/csrc/cpu/torch_free_kernels/aarch64/linear/groupwise_lowbit_weight/pack_weights.h>

namespace torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut {
namespace torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut {

/**
* @brief Calculates the total size in bytes required for the packed weight.
Expand Down Expand Up @@ -258,6 +258,6 @@ inline size_t packed_weights_offset(
return (n_idx / nr) * packed_tile_size_for_nr_cols;
}
} // namespace
// torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut
// torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut

#endif // defined(__aarch64__) || defined(__ARM_NEON)
Loading