Skip to content

Commit af25a52

Browse files
xbaselzuiderkwast
andauthored
Optimize BITCOUNT using ARM NEON SIMD (#1867)
Replace scalar loop with ARM NEON intrinsics for vectorized processing on ARM. Results: **Throughout** | Payload Size | Scalar Throughput (k req/s) | SIMD Throughput (k req/s) | Improvement (%) | |--------------|-----------------------------|----------------------------|-----------------| | 16B | 249.69 | 249.69 | 0.00% | | 256B | 249.63 | 249.69 | +0.02% | | 4KB | 199.72 | 249.63 | +25.00% | | 64KB | 44.33 | 166.42 | +275.43% | | 1MB | 3.30 | 26.59 | +705.74% | | 10MB | 0.33 | 3.32 | +900.04% | **Average Latency** | Payload Size | Scalar Avg Latency (ms) | SIMD Avg Latency (ms) | |--------------|--------------------------|-------------------------| | 16B | 0.374 | 0.375 | | 256B | 0.381 | 0.376 | | 4KB | 0.489 | 0.389 | | 64KB | 2.241 | 0.575 | | 1MB | 30.169 | 3.649 | | 10MB | 287.228 | 29.220 | **P99 Latency** | Payload Size | Scalar p99 Latency (ms) | SIMD p99 Latency (ms) | |--------------|--------------------------|-------------------------| | 16B | 0.511 | 0.511 | | 256B | 0.519 | 0.511 | | 4KB | 0.639 | 0.535 | | 64KB | 2.439 | 0.727 | | 1MB | 32.303 | 3.959 | | 10MB | 314.623 | 31.615 | Tested on AWS Graviton2. To isolate CPU-bound improvements, the same key was used, reducing the likelihood of memory stalls for small payloads. Fixes: #1864 --------- Signed-off-by: xbasel <103044017+xbasel@users.noreply.github.com> Signed-off-by: Viktor Söderqvist <viktor.soderqvist@est.tech> Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
1 parent 6cf69a4 commit af25a52

File tree

2 files changed

+61
-0
lines changed

2 files changed

+61
-0
lines changed

src/bitops.c

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@
3535
#define __MM_MALLOC_H
3636
#include <immintrin.h>
3737
#endif
38+
#if defined(__aarch64__)
39+
#include <arm_neon.h>
40+
#endif
3841
/* -----------------------------------------------------------------------------
3942
* Helpers and low level bit functions.
4043
* -------------------------------------------------------------------------- */
@@ -187,6 +190,51 @@ long long popcountScalar(void *s, long count) {
187190
return bits;
188191
}
189192

193+
#if defined(__aarch64__)
194+
#include <arm_neon.h>
195+
196+
/* SIMD version of popcount for ARM NEON.
197+
* Processes data in 64-byte NEON batches, falls back to scalar for tail. */
198+
long long popcountNEON(void *s, long n) {
199+
long long t = 0;
200+
uint8_t *p = (uint8_t *)s;
201+
;
202+
const uint8_t *e = p + n;
203+
204+
/* Process 64-byte blocks using unrolled loop (4 x 16-byte vectors) */
205+
for (; p <= e - 64; p += 64) {
206+
/* Load 4 vector registers (16 bytes each) */
207+
uint8x16_t v0 = vld1q_u8(p);
208+
uint8x16_t v1 = vld1q_u8(p + 16);
209+
uint8x16_t v2 = vld1q_u8(p + 32);
210+
uint8x16_t v3 = vld1q_u8(p + 48);
211+
212+
/* Count bits in each byte and sum vectors */
213+
uint8x16_t s1 = vaddq_u8(vcntq_u8(v0), vcntq_u8(v1));
214+
uint8x16_t s2 = vaddq_u8(vcntq_u8(v2), vcntq_u8(v3));
215+
uint8x16_t s0 = vaddq_u8(s1, s2);
216+
217+
/* Sum all bytes in the final vector */
218+
uint16x8_t sc = vpaddlq_u8(s0); // 16x u8 -> 8x u16 (pairwise add)
219+
uint32_t t1 = vaddvq_u16(sc);
220+
t += t1;
221+
}
222+
223+
/* Process remaining 16-byte chunks */
224+
for (; p + 16 <= e; p += 16) {
225+
t += vaddvq_u8(vcntq_u8(vld1q_u8(p)));
226+
}
227+
228+
/* Handle remaining bytes with scalar fallback */
229+
if (p < e) {
230+
size_t r = e - p;
231+
t += popcountScalar((void *)p, r);
232+
}
233+
234+
return t;
235+
}
236+
#endif
237+
190238
/* Count number of bits set in the binary array pointed by 's' and long
191239
* 'count' bytes. The implementation of this function is required to
192240
* work with an input string length up to 512 MB or more (server.proto_max_bulk_len) */
@@ -198,6 +246,12 @@ long long serverPopcount(void *s, long count) {
198246
return popcountAVX2(s, count);
199247
}
200248
#endif
249+
#ifdef __aarch64__
250+
if (count >= 16) {
251+
return popcountNEON(s, count);
252+
}
253+
#endif
254+
201255
return popcountScalar(s, count);
202256
}
203257

src/unit/test_bitops.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ extern long long popcountScalar(void *s, long count);
1010
#if HAVE_X86_SIMD
1111
extern long long popcountAVX2(void *s, long count);
1212
#endif
13+
#if defined(__aarch64__)
14+
extern long long popcountNEON(void *s, long count);
15+
#endif
1316

1417
static long long bitcount(void *s, long count) {
1518
long long bits = 0;
@@ -39,6 +42,10 @@ static int test_case(const char *msg, int size) {
3942
#if HAVE_X86_SIMD
4043
long long ret_avx2 = popcountAVX2(buf, size);
4144
TEST_ASSERT_MESSAGE(msg, expect == ret_avx2);
45+
#endif
46+
#if defined(__aarch64__)
47+
long long ret_neon = popcountNEON(buf, size);
48+
TEST_ASSERT_MESSAGE(msg, expect == ret_neon);
4249
#endif
4350
}
4451

0 commit comments

Comments
 (0)