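Update function params and corresponding usages: add runtime tiling parameters (mr/kr/sr for pack_activations and packed_activations_offset, nr/kr/sr for weight packing) to the groupwise low-bit LUT kernel API, rename pack_weights_for_groupwise_lut_kernel to pack_weights, update the test call sites, and in the test-case generator drop the lut_group_size divisibility assert (num_luts now rounds up) and the generator info logging.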

szyszyzys · web-flow · commit 9da7ad5f4419 · 2025-07-11T09:54:31.000-07:00
Differential Revision: D78056221 Pull Request resolved: #2524
diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/groupwise_lowbit_weight/groupwise_lowbit_weight_lut.h b/torchao/experimental/kernels/cpu/aarch64/linear/groupwise_lowbit_weight/groupwise_lowbit_weight_lut.h
@@ -44,7 +44,17 @@ chunked and interleaved during the packing process.
  * @param input Pointer to the source activation matrix (float32, row-major).
  */
 template <int mr_, int kr_, int sr_>
-inline void pack_activations(float* output, int m, int k, const float* input) {
+inline void pack_activations(
+    float* output,
+    int m,
+    int k,
+    const float* input,
+    int mr,
+    int kr,
+    int sr) {
+  (void)mr; // unused
+  (void)kr; // unused
+  (void)sr; // unused
   activation_packing::pack_activations<mr_, kr_, sr_>(output, m, k, input);
 }
 
@@ -100,7 +110,7 @@ row-major).
  * @param bias Pointer to the bias vector (float32, row-major).
  */
 template <int weight_nbit_, int nr_, int kr_, int sr_>
-void pack_weights_for_groupwise_lut_kernel(
+void pack_weights(
     /*output*/
     void* packed_weights_ptr,
     /*inputs*/
@@ -113,7 +123,14 @@ void pack_weights_for_groupwise_lut_kernel(
     int lut_group_size,
     bool has_scales,
     bool has_bias,
-    const float* bias) {
+    const float* bias,
+    int nr,
+    int kr,
+    int sr) {
+  (void)nr; // unused
+  (void)kr; // unused
+  (void)sr; // unused
+
   weight_packing::pack_weights<weight_nbit_, nr_, kr_, sr_>(
       packed_weights_ptr,
       weight_qvals_indices,
@@ -190,7 +207,12 @@ inline void groupwise_lowbit_weight_lut_kernel_1x4x32(
  * @param k The K dimension (width) of the activation matrix.
  * @return The byte offset from the start of the buffer.
  */
-inline size_t packed_activations_offset(int m_idx, int k) {
+inline size_t
+packed_activations_offset(int m_idx, int k, int mr, int kr, int sr) {
+  (void)mr; // unused
+  (void)kr; // unused
+  (void)sr; // unused
+
   // For a simple padded row-major format, the offset is just m_idx * k.
   return sizeof(float) * m_idx * k;
 }
diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_lut.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_lut.cpp
@@ -71,7 +71,7 @@ void test_groupwise_lowbit_lut_kernel(
   std::vector<float> packed_activations_buffer(
       kernel_api::packed_activations_size(m, k, mr_, kr_, sr_));
   kernel_api::pack_activations<mr_, kr_, sr_>(
-      packed_activations_buffer.data(), m, k, source_activations.data());
+      packed_activations_buffer.data(), m, k, source_activations.data(), mr_, kr_, sr_);
   // 3. Pack Weights
   std::vector<char> packed_weights(kernel_api::packed_weights_size(
       n,
@@ -84,7 +84,7 @@ void test_groupwise_lowbit_lut_kernel(
       kr_,
       sr_));
   kernel_api::
-      pack_weights_for_groupwise_lut_kernel<weight_nbit_, nr_, kr_, sr_>(
+      pack_weights<weight_nbit_, nr_, kr_, sr_>(
           packed_weights.data(),
           test_case.weight_qval_indices.data(),
           test_case.weight_scales.data(),
@@ -95,7 +95,7 @@ void test_groupwise_lowbit_lut_kernel(
           flat_lut_group_size,
           has_scales_,
           has_bias,
-          test_case.bias.data());
+          test_case.bias.data(), nr_, kr_, sr_);
 
   // 4. Run the kernel
   std::vector<float> output(m * n);
diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h b/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h
@@ -640,11 +640,10 @@ struct groupwise_lowbit_weight_lut_test_case {
     const int total_weights = n * k;
     // Frequencies are controlled by their group sizes.
     assert(total_weights % scale_group_size == 0);
-    assert(total_weights % lut_group_size == 0);
 
     // The number of unique scales/LUTs is derived directly from their group size.
     const int num_scales = total_weights / scale_group_size;
-    const int num_luts = total_weights / lut_group_size;
+    const int num_luts = (total_weights + lut_group_size - 1) / lut_group_size;
     const int lut_size = 1 << weight_nbit;
     std::mt19937 gen(std::random_device{}());
 
@@ -726,9 +725,6 @@ struct groupwise_lowbit_weight_lut_test_case {
     int weight_nbit, bool has_scales,
     bool has_bias, bool has_clamp) {
 
-    std::cout << "[Generator Info] Using 'Per-Group' model.\n"
-              << "  - Both scales and LUTs will switch every " << group_size << " weights." << std::endl;
-
     // Just call the decoupled generator with the same group size for both.
     return _generate_master(
       m, k, n,
@@ -748,10 +744,6 @@ struct groupwise_lowbit_weight_lut_test_case {
     int scale_group_size, int lut_group_size, int weight_nbit, bool has_scales,
     bool has_bias, bool has_clamp) {
 
-    std::cout << "[Generator Info] Using 'Decoupled Grouping' model.\n"
-              << "  - Scales will switch every " << scale_group_size << " weights.\n"
-              << "  - LUTs will switch every " << lut_group_size << " weights." << std::endl;
-
     return _generate_master(
         m, k, n,
         scale_group_size, lut_group_size,