Optimize BITCOUNT using ARM NEON SIMD (#1867)

xbasel · zuiderkwast · web-flow · commit af25a5247ad0 · 2025-05-11T12:12:30.000+02:00
Replace scalar loop with ARM NEON intrinsics for vectorized processing on ARM.

Results:

**Throughput**

| Payload Size | Scalar Throughput (k req/s) | SIMD Throughput (k req/s) | Improvement (%) |
|--------------|-----------------------------|----------------------------|-----------------|
| 16B | 249.69 | 249.69 | 0.00% |
| 256B | 249.63 | 249.69 | +0.02% |
| 4KB | 199.72 | 249.63 | +25.00% |
| 64KB | 44.33 | 166.42 | +275.43% |
| 1MB | 3.30 | 26.59 | +705.74% |
| 10MB | 0.33 | 3.32 | +900.04% |

**Average Latency**

| Payload Size | Scalar Avg Latency (ms) | SIMD Avg Latency (ms) |
|--------------|--------------------------|-------------------------|
| 16B | 0.374 | 0.375 |
| 256B | 0.381 | 0.376 |
| 4KB | 0.489 | 0.389 |
| 64KB | 2.241 | 0.575 |
| 1MB | 30.169 | 3.649 |
| 10MB | 287.228 | 29.220 |

**P99 Latency**

| Payload Size | Scalar p99 Latency (ms) | SIMD p99 Latency (ms) |
|--------------|--------------------------|-------------------------|
| 16B | 0.511 | 0.511 |
| 256B | 0.519 | 0.511 |
| 4KB | 0.639 | 0.535 |
| 64KB | 2.439 | 0.727 |
| 1MB | 32.303 | 3.959 |
| 10MB | 314.623 | 31.615 |

Tested on AWS Graviton2. To isolate CPU-bound improvements, the same key was used, reducing the likelihood of memory stalls for small payloads.

Fixes: #1864

---------

Signed-off-by: xbasel <103044017+xbasel@users.noreply.github.com>
Signed-off-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
diff --git a/src/bitops.c b/src/bitops.c
@@ -35,6 +35,9 @@
 #define __MM_MALLOC_H
 #include <immintrin.h>
 #endif
+#if defined(__aarch64__)
+#include <arm_neon.h>
+#endif
 /* -----------------------------------------------------------------------------
  * Helpers and low level bit functions.
  * -------------------------------------------------------------------------- */
@@ -187,6 +190,51 @@ long long popcountScalar(void *s, long count) {
     return bits;
 }
 
+#if defined(__aarch64__)
+#include <arm_neon.h>
+
+/* SIMD version of popcount for ARM NEON (AArch64).
+ * Counts the set bits in the 'n' bytes at 's' (0 when 'n' <= 0): 64-byte
+ * unrolled batches, then 16-byte vectors, then popcountScalar() for the tail. */
+long long popcountNEON(void *s, long n) {
+    long long t = 0;
+    const uint8_t *p = (const uint8_t *)s;
+    /* Track the bytes left instead of comparing against computed pointers:
+     * 'end - 64' or 'p + 16' can point outside the buffer, which is undefined. */
+    long left = n;
+
+    /* Process 64-byte blocks using unrolled loop (4 x 16-byte vectors) */
+    for (; left >= 64; p += 64, left -= 64) {
+        /* Load 4 vector registers (16 bytes each) */
+        uint8x16_t v0 = vld1q_u8(p);
+        uint8x16_t v1 = vld1q_u8(p + 16);
+        uint8x16_t v2 = vld1q_u8(p + 32);
+        uint8x16_t v3 = vld1q_u8(p + 48);
+
+        /* Per-byte counts are <= 8, so four of them summed (<= 32) fit in u8 */
+        uint8x16_t s1 = vaddq_u8(vcntq_u8(v0), vcntq_u8(v1));
+        uint8x16_t s2 = vaddq_u8(vcntq_u8(v2), vcntq_u8(v3));
+        uint8x16_t s0 = vaddq_u8(s1, s2);
+
+        /* Widen pairwise to u16 lanes, then add across the vector (<= 512) */
+        uint16x8_t sc = vpaddlq_u8(s0);
+        t += vaddvq_u16(sc);
+    }
+
+    /* Process remaining 16-byte chunks; the 16 per-byte counts sum to <= 128 */
+    for (; left >= 16; p += 16, left -= 16) {
+        t += vaddvq_u8(vcntq_u8(vld1q_u8(p)));
+    }
+
+    /* Handle remaining bytes with scalar fallback */
+    if (left > 0) {
+        t += popcountScalar((void *)p, left);
+    }
+
+    return t;
+}
+#endif
+
 /* Count number of bits set in the binary array pointed by 's' and long
  * 'count' bytes. The implementation of this function is required to
  * work with an input string length up to 512 MB or more (server.proto_max_bulk_len) */
@@ -198,6 +246,12 @@ long long serverPopcount(void *s, long count) {
         return popcountAVX2(s, count);
     }
 #endif
+#ifdef __aarch64__
+    if (count >= 16) {
+        return popcountNEON(s, count);
+    }
+#endif
+
     return popcountScalar(s, count);
 }
 
diff --git a/src/unit/test_bitops.c b/src/unit/test_bitops.c
@@ -10,6 +10,9 @@ extern long long popcountScalar(void *s, long count);
 #if HAVE_X86_SIMD
 extern long long popcountAVX2(void *s, long count);
 #endif
+#if defined(__aarch64__)
+extern long long popcountNEON(void *s, long count);
+#endif
 
 static long long bitcount(void *s, long count) {
     long long bits = 0;
@@ -39,6 +42,10 @@ static int test_case(const char *msg, int size) {
 #if HAVE_X86_SIMD
         long long ret_avx2 = popcountAVX2(buf, size);
         TEST_ASSERT_MESSAGE(msg, expect == ret_avx2);
+#endif
+#if defined(__aarch64__)
+        long long ret_neon = popcountNEON(buf, size);
+        TEST_ASSERT_MESSAGE(msg, expect == ret_neon);
 #endif
     }