pytorch · kimishpatel · Aug 21, 2024
diff --git a/...ch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h b/...ch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h
@@ -218,21 +218,7 @@ void kernel_impl(
       if constexpr (has_clamp) {
         res = clamp(res, clamp_min, clamp_max);
       }
-
-      // Store result
-      int remaining = n - n_idx;
-      float* store_loc = output + m_idx * output_m_stride + n_idx;
-      if (remaining >= 4) {
-        vst1q_f32(store_loc, res);
-      } else if (remaining >= 3) {
-        vst1_f32(store_loc, vget_low_f32(res));
-        *(store_loc + 2) = res[2];
-      } else if (remaining >= 2) {
-        vst1_f32(store_loc, vget_low_f32(res));
-      } else {
-        *(store_loc) = res[0];
-      }
-
+      vst1q_f32(output + m_idx * output_m_stride + n_idx, res);
     } // n_idx
     activation_data_byte_ptr += (activation_ptr - activation_data_byte_ptr);
   } // m_idx

diff --git a/...ch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h b/...ch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h
@@ -290,34 +290,8 @@ void kernel_impl(
         res_0123 = vec_clamp(res_0123, vec_min, vec_max);
         res_4567 = vec_clamp(res_4567, vec_min, vec_max);
       }
-
-      // Store result
-      int remaining = n - n_idx;
-      float* store_loc = output + m_idx * output_m_stride + n_idx;
-      if (remaining >= 8) {
-        vst1q_f32(store_loc, res_0123);
-        vst1q_f32(store_loc + 4, res_4567);
-      } else if (remaining >= 7) {
-        vst1q_f32(store_loc, res_0123);
-        vst1_f32(store_loc + 4, vget_low_f32(res_4567));
-        *(store_loc + 6) = res_4567[2];
-      } else if (remaining >= 6) {
-        vst1q_f32(store_loc, res_0123);
-        vst1_f32(store_loc + 4, vget_low_f32(res_4567));
-      } else if (remaining >= 5) {
-        vst1q_f32(store_loc, res_0123);
-        *(store_loc + 4) = res_4567[0];
-      } else if (remaining >= 4) {
-        vst1q_f32(store_loc, res_0123);
-      } else if (remaining >= 3) {
-        vst1_f32(store_loc, vget_low_f32(res_0123));
-        *(store_loc + 2) = res_0123[2];
-      } else if (remaining >= 2) {
-        vst1_f32(store_loc, vget_low_f32(res_0123));
-      } else {
-        *store_loc = res_0123[0];
-      }
-
+      vst1q_f32(output + m_idx * output_m_stride + n_idx, res_0123);
+      vst1q_f32(output + m_idx * output_m_stride + n_idx + 4, res_4567);
     } // n_idx
     activation_data_byte_ptr += (activation_ptr - activation_data_byte_ptr);
   } // m_idx

diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp b/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp
@@ -1,18 +1,15 @@
 // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
-#include <cassert>
 
 int32_t torchao::kernels::cpu::aarch64::reduction::compute_sum(
     const int8_t* vals,
     int size) {
-  assert(size >= 1);
-
   int32_t res = 0;
   int i = 0;
 
 #pragma unroll(4)
-  for (; i + 15 < size; i += 16) {
+  for (; i < size; i += 16) {
     int8x16_t vec_vals = vld1q_s8(vals + i);
     res += (int)(vaddlvq_s8(vec_vals));
   }

diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp b/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp
@@ -1,33 +1,23 @@
 // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
-#include <cassert>
 
 void torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
     float32_t& min,
     float32_t& max,
     const float32_t* vals,
     int size) {
-  assert(size > 0);
-
-  // Needed in case size < 4 so we don't compare to
-  // uninitialized min/max values
-  min = vals[0];
-  max = min;
-
+  float32x4_t mins = vdupq_n_f32(0.0);
+  float32x4_t maxes = vdupq_n_f32(0.0);
   int i = 0;
-  if (i + 3 < size) {
-    float32x4_t mins = vld1q_f32(vals + i);
-    float32x4_t maxes = mins;
-    i += 4;
-    for (; i + 3 < size; i += 4) {
-      float32x4_t v = vld1q_f32(vals + i);
-      mins = vminq_f32(mins, v);
-      maxes = vmaxq_f32(maxes, v);
-    }
-    min = vminvq_f32(mins);
-    max = vmaxvq_f32(maxes);
+  for (; i < size; i += 8) {
+    float32x4_t v1 = vld1q_f32(vals + i);
+    float32x4_t v2 = vld1q_f32(vals + i + 4);
+    mins = vminq_f32(v1, v2);
+    maxes = vmaxq_f32(v1, v2);
   }
+  min = vminvq_f32(mins);
+  max = vmaxvq_f32(maxes);
 
   // Remainder
   while (i < size) {

diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt b/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt
@@ -35,14 +35,6 @@ target_link_libraries(
     dep
 )
 
-add_executable(test_reduction test_reduction.cpp)
-target_link_libraries(
-  test_reduction
-    PRIVATE
-    GTest::gtest_main
-    dep
-)
-
 add_executable(test_bitpacking test_bitpacking.cpp)
 target_link_libraries(
   test_bitpacking
@@ -69,7 +61,6 @@ target_link_libraries(
 
 include(GoogleTest)
 gtest_discover_tests(test_quantization)
-gtest_discover_tests(test_reduction)
 gtest_discover_tests(test_bitpacking)
 gtest_discover_tests(test_linear)
 gtest_discover_tests(test_valpacking)
diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh b/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh
@@ -7,8 +7,7 @@ cmake -DTORCHAO_LIBRARIES=${TORCHAO_LIBRARIES} -S ${TORCHAO_LIBRARIES}/torchao/e
 cmake --build  ${CMAKE_OUT}
 
 # Run
-${CMAKE_OUT}/test_quantization
-${CMAKE_OUT}/test_reduction
-${CMAKE_OUT}/test_bitpacking
-${CMAKE_OUT}/test_linear
-${CMAKE_OUT}/test_valpacking
+ ${CMAKE_OUT}/test_quantization
+ ${CMAKE_OUT}/test_bitpacking
+ ${CMAKE_OUT}/test_linear
+ ${CMAKE_OUT}/test_valpacking