pytorch
diff --git a/torchao/csrc/cpu/shared_kernels/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp
Lines changed: 1 addition & 1 deletion b/torchao/csrc/cpu/shared_kernels/benchmarks/benchmark_linear_8bit_act_xbit_weight.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/torchao/csrc/cpu/shared_kernels/embedding_xbit/op_embedding_xbit-impl.h
Lines changed: 3 additions & 3 deletions b/torchao/csrc/cpu/shared_kernels/embedding_xbit/op_embedding_xbit-impl.h
Lines changed: 3 additions & 3 deletions
diff --git a/torchao/csrc/cpu/shared_kernels/groupwise_lowbit_weight_lut/kernel_selector.h
Lines changed: 1 addition & 1 deletion b/torchao/csrc/cpu/shared_kernels/groupwise_lowbit_weight_lut/kernel_selector.h
Lines changed: 1 addition & 1 deletion
diff --git a/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight/kernel_selector.h
Lines changed: 4 additions & 4 deletions b/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight/kernel_selector.h
Lines changed: 4 additions & 4 deletions
diff --git a/torchao/csrc/cpu/shared_kernels/tests/test_groupwise_lowbit_weight_lut.cpp
Lines changed: 1 addition & 1 deletion b/torchao/csrc/cpu/shared_kernels/tests/test_groupwise_lowbit_weight_lut.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/torchao/csrc/cpu/shared_kernels/tests/test_linear_8bit_act_xbit_weight.cpp
Lines changed: 3 additions & 3 deletions b/torchao/csrc/cpu/shared_kernels/tests/test_linear_8bit_act_xbit_weight.cpp
Lines changed: 3 additions & 3 deletions
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_linear.cpp
Lines changed: 3 additions & 3 deletions b/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_linear.cpp
Lines changed: 3 additions & 3 deletions
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_quantization.cpp
Lines changed: 2 additions & 2 deletions b/torchao/csrc/cpu/torch_free_kernels/aarch64/benchmarks/benchmark_quantization.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding.h
Lines changed: 3 additions & 3 deletions b/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding.h
Lines changed: 3 additions & 3 deletions
diff --git a/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding_lut.h
Lines changed: 2 additions & 2 deletions b/torchao/csrc/cpu/torch_free_kernels/aarch64/embedding/embedding_lut.h
Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ template <int weight_nbit, bool has_weight_zeros, bool has_bias, bool has_clamp>
 UKernelConfig get_ukernel_config() {
   UKernelConfig config;
 
-  namespace ukernel = torchao::kernels::cpu::aarch64::linear::
+  namespace ukernel = torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot;
   config.mr = 1;
   config.nr = 8;
 
@@ -133,7 +133,7 @@ Tensor embedding_out_cpu(
     }
     TORCHAO_CHECK(index >= 0 && index < num_embeddings, "index out of bounds");
 #if defined(TORCHAO_BUILD_CPU_AARCH64)
-    torchao::kernels::cpu::aarch64::embedding::embedding<weight_nbit>(
+    torchao::cpu::aarch64::embedding::embedding<weight_nbit>(
         out.mutable_data_ptr<float>() + idx * embedding_dim,
         embedding_dim,
         group_size,
@@ -199,7 +199,7 @@ Tensor pack_embedding_cpu(const Tensor& weight_qvals) {
 
   torchao::parallel_1d(0, num_embeddings, [&](int64_t idx) {
 #if defined(TORCHAO_BUILD_CPU_AARCH64)
-    torchao::kernels::cpu::aarch64::embedding::pack_embedding_weight_qvals<
+    torchao::cpu::aarch64::embedding::pack_embedding_weight_qvals<
         weight_nbit>(
         out.mutable_data_ptr<int8_t>() +
             torchao::ops::PackedWeightsHeader::size(),
@@ -289,7 +289,7 @@ Tensor shared_embedding_out_cpu(
     }
     TORCHAO_CHECK(index >= 0 && index < n, "index out of bounds");
 #if defined(TORCHAO_BUILD_CPU_AARCH64)
-    torchao::kernels::cpu::aarch64::embedding::
+    torchao::cpu::aarch64::embedding::
         shared_embedding<weight_nbit, nr, kr, sr>(
             out.mutable_data_ptr<float>() + idx * k,
             packed_weights.const_data_ptr<int8_t>() +
 
@@ -117,7 +117,7 @@ void register_ukernel_config(
   int preferred_alignment = 16;
 
   namespace kernel_api =
-      torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
+      torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
 
   using kernel_fn_ptr_t =
       decltype(&kernel_api::groupwise_lowbit_weight_lut_kernel_1x4x32<
 
@@ -97,7 +97,7 @@ void register_ukernel_config_universal(
       torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_universal,
       weight_nbit);
 
-  namespace kernel = torchao::kernels::cpu::aarch64::linear::
+  namespace kernel = torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   constexpr bool has_lut = false;
@@ -181,7 +181,7 @@ void register_ukernel_config_lut(
     int preferred_alignment = 16;
 
     #if defined(TORCHAO_ENABLE_ARM_NEON_DOT)
-    namespace kernel = torchao::kernels::cpu::aarch64::linear::
+    namespace kernel = torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
     if (!cpuinfo_has_arm_neon_dot()) {
@@ -232,7 +232,7 @@ void register_ukernel_config_lut(
 template <typename kernel_struct>
 UKernelConfig::linear_config_type
 get_linear_config_kleidi(int n_step, int nr, int kr, int sr) {
-  namespace op = torchao::kernels::cpu::aarch64::kleidi::
+  namespace op = torchao::cpu::aarch64::kleidi::
       kai_matmul_clamp_f32_qai8dxp_qsi4c32p;
   assert(n_step == kernel_struct::get_ukernel().get_n_step());
   assert(nr == kernel_struct::get_ukernel().get_nr());
@@ -256,7 +256,7 @@ void register_ukernel_config_kleidi(
     throw std::runtime_error("Failed to initialize cpuinfo!");
   }
   check_format(format, torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_kleidi_ai, weight_nbit);
-  namespace op = torchao::kernels::cpu::aarch64::kleidi::
+  namespace op = torchao::cpu::aarch64::kleidi::
       kai_matmul_clamp_f32_qai8dxp_qsi4c32p;
 
   auto uk = UKernelConfig::make(
 
@@ -19,7 +19,7 @@ using namespace torchao::ops::groupwise_lowbit_weight_lut;
 template <int weight_nbit, bool has_scales>
 UKernelConfig get_ukernel_config(bool has_bias) {
   namespace kernel =
-      torchao::kernels::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
+      torchao::cpu::aarch64::linear::groupwise_lowbit_weight_lut;
 
   int preferred_alignment = 16;
   int n_step = 8;
 
@@ -16,7 +16,7 @@
 
 #if defined(TORCHAO_ENABLE_KLEIDI)
 #include <torchao/csrc/cpu/torch_free_kernels/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h>
-using namespace torchao::kernels::cpu::aarch64::kleidi::
+using namespace torchao::cpu::aarch64::kleidi::
     kai_matmul_clamp_f32_qai8dxp_qsi4c32p;
 #endif // TORCHAO_ENABLE_KLEIDI
 
@@ -27,7 +27,7 @@ using namespace torchao::ops::linear_8bit_act_xbit_weight;
 
 template <int weight_nbit, bool has_weight_zeros, bool has_bias, bool has_clamp, bool has_lut = false>
 UKernelConfig get_ukernel_config() {
-  namespace kernel = torchao::kernels::cpu::aarch64::linear::
+  namespace kernel = torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   int preferred_alignment = 16;
@@ -213,7 +213,7 @@ enum kai_kernel_id {
 
 template <typename kernel_struct>
 UKernelConfig get_ukernel_config_kleidi_impl() {
-  namespace op = torchao::kernels::cpu::aarch64::kleidi::
+  namespace op = torchao::cpu::aarch64::kleidi::
       kai_matmul_clamp_f32_qai8dxp_qsi4c32p;
 
   auto uk = kernel_struct::get_ukernel();
 
@@ -19,7 +19,7 @@ channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot(
   int k = state.range(2);
   int group_size = state.range(3);
 
-  using namespace torchao::kernels::cpu::aarch64::linear::
+  using namespace torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot;
 
   auto test_case = torchao::
@@ -91,7 +91,7 @@ channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot(
   int k = state.range(2);
   int group_size = state.range(3);
 
-  using namespace torchao::kernels::cpu::aarch64::linear::
+  using namespace torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   auto test_case = torchao::
@@ -163,7 +163,7 @@ channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot(
   int k = state.range(2);
   int group_size = state.range(3);
 
-  using namespace torchao::kernels::cpu::aarch64::linear::
+  using namespace torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;
 
   auto test_case = torchao::
 
@@ -21,7 +21,7 @@ static void benchmark_quantize(benchmark::State& state) {
   float vmin, vmax, scale;
 
   for (auto _ : state) {
-    torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
+    torchao::cpu::aarch64::reduction::find_min_and_max(
         vmin, vmax, vals.data(), vals.size());
 
     torchao::quantization::get_qvals_range(
@@ -30,7 +30,7 @@ static void benchmark_quantize(benchmark::State& state) {
     torchao::quantization::get_scale_and_zero(
         scale, zero, vmin, vmax, qmin, qmax);
 
-    torchao::kernels::cpu::aarch64::quantization::quantize(
+    torchao::cpu::aarch64::quantization::quantize(
         qvals.data(), vals.data(), vals.size(), scale, zero, qmin, qmax);
   }
 }
 
@@ -15,7 +15,7 @@
 #include <cassert>
 #include <vector>
 
-namespace torchao::kernels::cpu::aarch64::embedding {
+namespace torchao::cpu::aarch64::embedding {
 
 namespace internal {
 
@@ -353,7 +353,7 @@ inline void shared_embedding(
   n_idx = n_idx * nr;
   int j = index - n_idx;
 
-  torchao::kernels::cpu::aarch64::linear::
+  torchao::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing::
           unpack_weights_at_n_idx<weight_nbit, nr, kr, sr>(
               weight_qvals.data(),
@@ -381,6 +381,6 @@ inline void shared_embedding(
   }
 }
 
-} // namespace torchao::kernels::cpu::aarch64::embedding
+} // namespace torchao::cpu::aarch64::embedding
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
@@ -14,7 +14,7 @@
 #include <cassert>
 #include <vector>
 
-namespace torchao::kernels::cpu::aarch64::embedding {
+namespace torchao::cpu::aarch64::embedding {
 
 /**
  * @brief Calculates the size in bytes for a single row of packed embeddings.
@@ -377,6 +377,6 @@ inline void dequantize_embedding_row_at_idx_lut(
     vst1q_f32(out + j + 12, out3);
   }
 }
-} // namespace torchao::kernels::cpu::aarch64::embedding
+} // namespace torchao::cpu::aarch64::embedding
 
 #endif // defined(__aarch64__) || defined(__ARM_NEON)
Original file line number	Diff line number	Diff line change
`@@ -21,7 +21,7 @@ static void benchmark_quantize(benchmark::State& state) {`
`21`	`21`	`float vmin, vmax, scale;`
`22`	`22`
`23`	`23`	`for (auto _ : state) {`
`24`		`- torchao::kernels::cpu::aarch64::reduction::find_min_and_max(`
	`24`	`+ torchao::cpu::aarch64::reduction::find_min_and_max(`
`25`	`25`	`vmin, vmax, vals.data(), vals.size());`
`26`	`26`
`27`	`27`	`torchao::quantization::get_qvals_range(`
`@@ -30,7 +30,7 @@ static void benchmark_quantize(benchmark::State& state) {`
`30`	`30`	`torchao::quantization::get_scale_and_zero(`
`31`	`31`	`scale, zero, vmin, vmax, qmin, qmax);`
`32`	`32`
`33`		`- torchao::kernels::cpu::aarch64::quantization::quantize(`
	`33`	`+ torchao::cpu::aarch64::quantization::quantize(`
`34`	`34`	`qvals.data(), vals.data(), vals.size(), scale, zero, qmin, qmax);`
`35`	`35`	`}`
`36`	`36`	`}`
Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@`
`14`	`14`	`#include <cassert>`
`15`	`15`	`#include <vector>`
`16`	`16`
`17`		`-namespace torchao::kernels::cpu::aarch64::embedding {`
	`17`	`+namespace torchao::cpu::aarch64::embedding {`
`18`	`18`
`19`	`19`	`/**`
`20`	`20`	`* @brief Calculates the size in bytes for a single row of packed embeddings.`
`@@ -377,6 +377,6 @@ inline void dequantize_embedding_row_at_idx_lut(`
`377`	`377`	`vst1q_f32(out + j + 12, out3);`
`378`	`378`	`}`
`379`	`379`	`}`
`380`		`-} // namespace torchao::kernels::cpu::aarch64::embedding`
	`380`	`+} // namespace torchao::cpu::aarch64::embedding`
`381`	`381`
`382`	`382`	`#endif // defined(__aarch64__) || defined(__ARM_NEON)`