
Commit 663a95d

Add gemm int8 a x int8 b to interface
Differential Revision: D71936844
Pull Request resolved: #2055
1 parent: 5a31ec8

File tree

4 files changed: +137 −13 lines


torchao/experimental/kernels/cpu/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot-impl.h

Lines changed: 1 addition & 12 deletions
```diff
@@ -289,7 +289,7 @@ struct KernelImpl<true, true, false, true> {
     constexpr int kr = 8;
     assert(m % mr == 0);
     assert(k % 16 == 0);
-    assert(n >= nr);
+    assert(n % nr == 0);
     std::vector<int8_t> rhs_packed(n * k);
     // Since we are casting int8_t to float32_t in order to transpose matrix in a
     // way to keep 4 of the k values together, we must adjust stride as well as
@@ -307,17 +307,6 @@ struct KernelImpl<true, true, false, true> {
 
     for (int m_idx = 0; m_idx < m; m_idx += mr) {
       for (int n_idx = 0; n_idx < n; n_idx += nr) {
-        // If remaining is < nr, that must mean that (nr - remaining) items
-        // don't need to be computed.
-        // In order to avoid out-of-bounds access, we need to rewind n_idx a
-        // bit.
-        // |-------------------|-------------------|
-        // 0-------------------8-------------------16
-        // 0-------------------8-----10
-        // If n = 10 and nr = 8 then at n_idx = 8, we need to rewind n_idx to
-        // 8 - (8 - 2) = 2.
-        int remaining = std::min(n - n_idx, nr);
-        n_idx = n_idx - (nr - remaining);
         // Set activation_ptr to start of activation qvals for row m_idx
         const int8_t* lhs_ptr = (const int8_t*)lhs + m_idx * lhs_stride_m;
         const int8_t* rhs_ptr = (const int8_t*)rhs_packed.data() +
```
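
The deleted rewind existed because the old assert only required n >= nr: when n was not a multiple of nr, the last tile was shifted left so it stayed in bounds, at the cost of recomputing a few columns. The dispatcher added in matmul.h (next file) only routes shapes with n % nr == 0 to this kernel, so the assert is tightened instead. A standalone sketch (not part of the PR) of the old rewind arithmetic:

```cpp
// Illustration only: the tile-rewind logic this commit removes.
// With n = 10 and nr = 8, the second tile would start at n_idx = 8 and cover
// columns 8..15, past n = 10, so the old kernel rewound it to n_idx = 2 and
// recomputed columns 2..7. Output: tile starts at 0, then at 2.
#include <algorithm>
#include <cstdio>

int main() {
  const int n = 10, nr = 8;
  for (int n_idx = 0; n_idx < n; n_idx += nr) {
    int remaining = std::min(n - n_idx, nr);
    n_idx = n_idx - (nr - remaining);  // no-op while a full tile remains
    printf("tile starts at n_idx = %d\n", n_idx);
  }
  return 0;
}
```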

torchao/experimental/kernels/cpu/aarch64/matmul/matmul.h

Lines changed: 123 additions & 0 deletions
```diff
@@ -42,6 +42,129 @@ void kernel(
 
 } // namespace channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot
 
+namespace channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot {
+
+template <
+    bool a_has_zeros,
+    bool b_has_zeros,
+    bool a_transposed,
+    bool b_tranposed>
+void kernel(
+    int m,
+    int n,
+    int k,
+    const void* lhs,
+    int lhs_stride_m,
+    const void* rhs,
+    int rhs_stride_n,
+    float32_t* output,
+    int out_stride_m,
+    const int8_t* lhs_zero_points,
+    const int8_t* rhs_zero_points,
+    const float* lhs_scales,
+    const float* rhs_scales,
+    const int lhs_qparams_stride,
+    const int rhs_qparams_stride);
+
+} // namespace channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot
+
+namespace channelwise_8bit_a_channelwise_8bit_b_f32 {
+
+template <
+    bool a_has_zeros,
+    bool b_has_zeros,
+    bool a_transposed,
+    bool b_tranposed>
+void kernel(
+    int m,
+    int n,
+    int k,
+    const void* lhs,
+    int lhs_stride_m,
+    const void* rhs,
+    int rhs_stride_n,
+    float32_t* output,
+    int out_stride_m,
+    const int8_t* lhs_zero_points,
+    const int8_t* rhs_zero_points,
+    const float* lhs_scales,
+    const float* rhs_scales,
+    const int lhs_qparams_stride,
+    const int rhs_qparams_stride);
+
+template <
+    bool a_has_zeros,
+    bool b_has_zeros,
+    bool a_transposed,
+    bool b_tranposed>
+void kernel(
+    int m,
+    int n,
+    int k,
+    const void* lhs,
+    int lhs_stride_m,
+    const void* rhs,
+    int rhs_stride_n,
+    float32_t* output,
+    int out_stride_m,
+    const int8_t* lhs_zero_points,
+    const int8_t* rhs_zero_points,
+    const float* lhs_scales,
+    const float* rhs_scales,
+    const int lhs_qparams_stride,
+    const int rhs_qparams_stride) {
+  // TODO: Replace this with KernelConfig based dispatch
+  constexpr size_t gemm_nr = 8;
+  constexpr size_t gemm_kr = 16;
+  if ((n % gemm_nr == 0) && (k % gemm_kr == 0) && m > 4) {
+    auto remaining_m = m % 4;
+    auto m_for_gemm_kernel = m - remaining_m;
+    channelwise_8bit_a_channelwise_8bit_b_4x8x8_f32_neondot::
+        kernel<a_has_zeros, b_has_zeros, a_transposed, b_tranposed>(
+            m_for_gemm_kernel,
+            n,
+            k,
+            lhs,
+            lhs_stride_m,
+            rhs,
+            rhs_stride_n,
+            output,
+            out_stride_m,
+            lhs_zero_points,
+            rhs_zero_points,
+            lhs_scales,
+            rhs_scales,
+            lhs_qparams_stride,
+            rhs_qparams_stride);
+    output += m_for_gemm_kernel * out_stride_m;
+    lhs = (static_cast<const int8_t*>(lhs) + m_for_gemm_kernel * lhs_stride_m);
+    lhs_zero_points = lhs_zero_points + m_for_gemm_kernel * lhs_qparams_stride;
+    lhs_scales = lhs_scales + m_for_gemm_kernel * lhs_qparams_stride;
+    m = remaining_m;
+  }
+  if (m > 0) {
+    channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot::
+        kernel<a_has_zeros, b_has_zeros, a_transposed, b_tranposed>(
+            m,
+            n,
+            k,
+            lhs,
+            lhs_stride_m,
+            rhs,
+            rhs_stride_n,
+            output,
+            out_stride_m,
+            lhs_zero_points,
+            rhs_zero_points,
+            lhs_scales,
+            rhs_scales,
+            lhs_qparams_stride,
+            rhs_qparams_stride);
+  }
+}
+
+} // namespace channelwise_8bit_a_channelwise_8bit_b_f32
+
 namespace channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal {
 
 template <
```
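
The definition above splits m between the two kernels: full groups of 4 rows go to the new 4x8x8 GEMM kernel, and any leftover rows fall through to the existing 1x8x16 GEMV kernel. A standalone sketch (not part of the PR) of that split arithmetic, using the shape the new test exercises (m = 11, k = 128, n = 16); the mr = 4 constant is an assumption read off the kernel's 4x8x8 name:

```cpp
// Illustration only: how the dispatcher divides rows between GEMM and GEMV.
#include <cstdio>

int main() {
  const int m = 11, n = 16, k = 128;  // shape from the new GemmGemvMix test
  constexpr int gemm_mr = 4, gemm_nr = 8, gemm_kr = 16;
  if ((n % gemm_nr == 0) && (k % gemm_kr == 0) && m > gemm_mr) {
    int remaining_m = m % gemm_mr;            // 11 % 4 = 3 rows left over
    int m_for_gemm_kernel = m - remaining_m;  // 8 rows for the GEMM kernel
    printf("GEMM kernel: rows [0, %d); GEMV kernel: rows [%d, %d)\n",
           m_for_gemm_kernel, m_for_gemm_kernel, m);
  }
  return 0;
}
```

After the GEMM call, the dispatcher advances output, lhs, and the lhs quantization parameters by m_for_gemm_kernel rows so the GEMV call sees only the remaining rows; the rhs arguments are untouched because the split is along m only.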

torchao/experimental/kernels/cpu/interface/quantized_matmul.h

Lines changed: 1 addition & 1 deletion
```diff
@@ -70,7 +70,7 @@ get_int8_a_int8_b_channelwise_qmatmul(
   a_stride_m = k;
   b_stride_n = k;
   return aarch64::quantized_matmul::
-      channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot::
+      channelwise_8bit_a_channelwise_8bit_b_f32::
          kernel<true, true, false, true>;
 }
 #endif // defined(__aarch64__) && defined(__ARM_NEON)
```
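
With this one-line change the interface hands out the dispatching kernel instead of the GEMV-only kernel. A hypothetical caller (not from the PR) is sketched below: the include path and the `aarch64::quantized_matmul` namespace qualification are assumptions inferred from the file layout and the qualifier above, the kernel signature is the one declared in matmul.h, and the buffers hold placeholder values. This compiles only on aarch64 with NEON, where float32_t is defined.

```cpp
// Hypothetical usage sketch; include path and namespace are assumptions.
#include <cstdint>
#include <vector>

#include <torchao/experimental/kernels/cpu/aarch64/matmul/matmul.h>  // assumed path

void run_example() {
  const int m = 11, k = 128, n = 16;  // shape from the new GemmGemvMix test
  std::vector<int8_t> a(m * k), b(n * k);            // quantized values
  std::vector<int8_t> a_zeros(m, 0), b_zeros(n, 0);  // per-channel zero points
  std::vector<float> a_scales(m, 1.0f), b_scales(n, 1.0f);
  std::vector<float> out(m * n);
  // b is transposed, so both strides are k, matching the interface's
  // a_stride_m = b_stride_n = k above.
  aarch64::quantized_matmul::channelwise_8bit_a_channelwise_8bit_b_f32::
      kernel</*a_has_zeros=*/true, /*b_has_zeros=*/true,
             /*a_transposed=*/false, /*b_transposed=*/true>(
          m, n, k,
          a.data(), /*lhs_stride_m=*/k,
          b.data(), /*rhs_stride_n=*/k,
          out.data(), /*out_stride_m=*/n,
          a_zeros.data(), b_zeros.data(),
          a_scales.data(), b_scales.data(),
          /*lhs_qparams_stride=*/1, /*rhs_qparams_stride=*/1);
}
```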

torchao/experimental/kernels/cpu/interface/test_qmatmul_interface.cpp

Lines changed: 12 additions & 0 deletions
```diff
@@ -347,6 +347,18 @@ TEST(test_channelwise_8bit_channelwise_8bit_b, TranposeBWithZeroPointsLargeM) {
       /*m=*/4, /*k=*/128, /*n=*/16);
 }
 
+TEST(
+    test_channelwise_8bit_channelwise_8bit_b,
+    TranposeBWithZeroPointsLargeMWithGemmGemvMix) {
+  test_channelwise_8bit_channelwise_8bit_b<
+      true /*a_has_zeros*/,
+      true /*b_has_zeros*/,
+      false /*a_transposed*/,
+      true /*b_transposed*/>::
+      Run(
+          /*m=*/11, /*k=*/128, /*n=*/16);
+}
+
 TEST(
     test_channelwise_8bit_channelwise_8bit_b,
     TranposedBWithZeroPointsOddSizes) {
```
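
The m = 11 shape is what makes this test a GEMM/GEMV mix: n % 8 == 0 and k % 16 == 0 hold and m > 4, so eight rows run through the GEMM kernel and the remaining three through the GEMV kernel, covering the mixed path the commit title describes.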
