pytorch
diff --git a/‎torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h
Lines changed: 15 additions & 1 deletion b/‎torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h
Lines changed: 15 additions & 1 deletion
diff --git a/‎torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h
Lines changed: 28 additions & 2 deletions b/‎torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h
Lines changed: 28 additions & 2 deletions
diff --git a/‎torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp
Lines changed: 4 additions & 1 deletion b/‎torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp
Lines changed: 4 additions & 1 deletion
diff --git a/‎torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp
Lines changed: 19 additions & 9 deletions b/‎torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp
Lines changed: 19 additions & 9 deletions
diff --git a/‎torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt
Lines changed: 9 additions & 0 deletions b/‎torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt
Lines changed: 9 additions & 0 deletions
diff --git a/‎torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh
Lines changed: 5 additions & 4 deletions b/‎torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh
Lines changed: 5 additions & 4 deletions
@@ -218,7 +218,21 @@ void kernel_impl(
       if constexpr (has_clamp) {
         res = clamp(res, clamp_min, clamp_max);
       }
-      vst1q_f32(output + m_idx * output_m_stride + n_idx, res);
+
+      // Store result
+      int remaining = n - n_idx;
+      float* store_loc = output + m_idx * output_m_stride + n_idx;
+      if (remaining >= 4) {
+        vst1q_f32(store_loc, res);
+      } else if (remaining >= 3) {
+        vst1_f32(store_loc, vget_low_f32(res));
+        *(store_loc + 2) = res[2];
+      } else if (remaining >= 2) {
+        vst1_f32(store_loc, vget_low_f32(res));
+      } else {
+        *(store_loc) = res[0];
+      }
+
     } // n_idx
     activation_data_byte_ptr += (activation_ptr - activation_data_byte_ptr);
   } // m_idx
 
@@ -290,8 +290,34 @@ void kernel_impl(
         res_0123 = vec_clamp(res_0123, vec_min, vec_max);
         res_4567 = vec_clamp(res_4567, vec_min, vec_max);
       }
-      vst1q_f32(output + m_idx * output_m_stride + n_idx, res_0123);
-      vst1q_f32(output + m_idx * output_m_stride + n_idx + 4, res_4567);
+
+      // Store result
+      int remaining = n - n_idx;
+      float* store_loc = output + m_idx * output_m_stride + n_idx;
+      if (remaining >= 8) {
+        vst1q_f32(store_loc, res_0123);
+        vst1q_f32(store_loc + 4, res_4567);
+      } else if (remaining >= 7) {
+        vst1q_f32(store_loc, res_0123);
+        vst1_f32(store_loc + 4, vget_low_f32(res_4567));
+        *(store_loc + 6) = res_4567[2];
+      } else if (remaining >= 6) {
+        vst1q_f32(store_loc, res_0123);
+        vst1_f32(store_loc + 4, vget_low_f32(res_4567));
+      } else if (remaining >= 5) {
+        vst1q_f32(store_loc, res_0123);
+        *(store_loc + 4) = res_4567[0];
+      } else if (remaining >= 4) {
+        vst1q_f32(store_loc, res_0123);
+      } else if (remaining >= 3) {
+        vst1_f32(store_loc, vget_low_f32(res_0123));
+        *(store_loc + 2) = res_0123[2];
+      } else if (remaining >= 2) {
+        vst1_f32(store_loc, vget_low_f32(res_0123));
+      } else {
+        *store_loc = res_0123[0];
+      }
+
     } // n_idx
     activation_data_byte_ptr += (activation_ptr - activation_data_byte_ptr);
   } // m_idx
 
@@ -1,15 +1,18 @@
 // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
+#include <cassert>
 
 int32_t torchao::kernels::cpu::aarch64::reduction::compute_sum(
     const int8_t* vals,
     int size) {
+  assert(size >= 1);
+
   int32_t res = 0;
   int i = 0;
 
 #pragma unroll(4)
-  for (; i < size; i += 16) {
+  for (; i + 15 < size; i += 16) {
     int8x16_t vec_vals = vld1q_s8(vals + i);
     res += (int)(vaddlvq_s8(vec_vals));
   }
 
@@ -1,23 +1,33 @@
 // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 
 #include <torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h>
+#include <cassert>
 
 void torchao::kernels::cpu::aarch64::reduction::find_min_and_max(
     float32_t& min,
     float32_t& max,
     const float32_t* vals,
     int size) {
-  float32x4_t mins = vdupq_n_f32(0.0);
-  float32x4_t maxes = vdupq_n_f32(0.0);
+  assert(size > 0);
+
+  // Needed in case size < 4 so we don't compare to
+  // uninitialized min/max values
+  min = vals[0];
+  max = min;
+
   int i = 0;
-  for (; i < size; i += 8) {
-    float32x4_t v1 = vld1q_f32(vals + i);
-    float32x4_t v2 = vld1q_f32(vals + i + 4);
-    mins = vminq_f32(v1, v2);
-    maxes = vmaxq_f32(v1, v2);
+  if (i + 3 < size) {
+    float32x4_t mins = vld1q_f32(vals + i);
+    float32x4_t maxes = mins;
+    i += 4;
+    for (; i + 3 < size; i += 4) {
+      float32x4_t v = vld1q_f32(vals + i);
+      mins = vminq_f32(mins, v);
+      maxes = vmaxq_f32(maxes, v);
+    }
+    min = vminvq_f32(mins);
+    max = vmaxvq_f32(maxes);
   }
-  min = vminvq_f32(mins);
-  max = vmaxvq_f32(maxes);
 
   // Remainder
   while (i < size) {
 
@@ -35,6 +35,14 @@ target_link_libraries(
     dep
 )
 
+add_executable(test_reduction test_reduction.cpp)
+target_link_libraries(
+  test_reduction
+    PRIVATE
+    GTest::gtest_main
+    dep
+)
+
 add_executable(test_bitpacking test_bitpacking.cpp)
 target_link_libraries(
   test_bitpacking
@@ -61,6 +69,7 @@ target_link_libraries(
 
 include(GoogleTest)
 gtest_discover_tests(test_quantization)
+gtest_discover_tests(test_reduction)
 gtest_discover_tests(test_bitpacking)
 gtest_discover_tests(test_linear)
 gtest_discover_tests(test_valpacking)
@@ -7,7 +7,8 @@ cmake -DTORCHAO_LIBRARIES=${TORCHAO_LIBRARIES} -S ${TORCHAO_LIBRARIES}/torchao/e
 cmake --build  ${CMAKE_OUT}
 
 # Run
- ${CMAKE_OUT}/test_quantization
- ${CMAKE_OUT}/test_bitpacking
- ${CMAKE_OUT}/test_linear
- ${CMAKE_OUT}/test_valpacking
+${CMAKE_OUT}/test_quantization
+${CMAKE_OUT}/test_reduction
+${CMAKE_OUT}/test_bitpacking
+${CMAKE_OUT}/test_linear
+${CMAKE_OUT}/test_valpacking