35
35
#define __MM_MALLOC_H
36
36
#include <immintrin.h>
37
37
#endif
38
+ #if defined(__aarch64__ )
39
+ #include <arm_neon.h>
40
+ #endif
38
41
/* -----------------------------------------------------------------------------
39
42
* Helpers and low level bit functions.
40
43
* -------------------------------------------------------------------------- */
@@ -187,6 +190,51 @@ long long popcountScalar(void *s, long count) {
187
190
return bits ;
188
191
}
189
192
193
+ #if defined(__aarch64__ )
194
+ #include <arm_neon.h>
195
+
196
+ /* SIMD version of popcount for ARM NEON.
197
+ * Processes data in 64-byte NEON batches, falls back to scalar for tail. */
198
+ long long popcountNEON (void * s , long n ) {
199
+ long long t = 0 ;
200
+ uint8_t * p = (uint8_t * )s ;
201
+ ;
202
+ const uint8_t * e = p + n ;
203
+
204
+ /* Process 64-byte blocks using unrolled loop (4 x 16-byte vectors) */
205
+ for (; p <= e - 64 ; p += 64 ) {
206
+ /* Load 4 vector registers (16 bytes each) */
207
+ uint8x16_t v0 = vld1q_u8 (p );
208
+ uint8x16_t v1 = vld1q_u8 (p + 16 );
209
+ uint8x16_t v2 = vld1q_u8 (p + 32 );
210
+ uint8x16_t v3 = vld1q_u8 (p + 48 );
211
+
212
+ /* Count bits in each byte and sum vectors */
213
+ uint8x16_t s1 = vaddq_u8 (vcntq_u8 (v0 ), vcntq_u8 (v1 ));
214
+ uint8x16_t s2 = vaddq_u8 (vcntq_u8 (v2 ), vcntq_u8 (v3 ));
215
+ uint8x16_t s0 = vaddq_u8 (s1 , s2 );
216
+
217
+ /* Sum all bytes in the final vector */
218
+ uint16x8_t sc = vpaddlq_u8 (s0 ); // 16x u8 -> 8x u16 (pairwise add)
219
+ uint32_t t1 = vaddvq_u16 (sc );
220
+ t += t1 ;
221
+ }
222
+
223
+ /* Process remaining 16-byte chunks */
224
+ for (; p + 16 <= e ; p += 16 ) {
225
+ t += vaddvq_u8 (vcntq_u8 (vld1q_u8 (p )));
226
+ }
227
+
228
+ /* Handle remaining bytes with scalar fallback */
229
+ if (p < e ) {
230
+ size_t r = e - p ;
231
+ t += popcountScalar ((void * )p , r );
232
+ }
233
+
234
+ return t ;
235
+ }
236
+ #endif
237
+
190
238
/* Count number of bits set in the binary array pointed by 's' and long
191
239
* 'count' bytes. The implementation of this function is required to
192
240
* work with an input string length up to 512 MB or more (server.proto_max_bulk_len) */
@@ -198,6 +246,12 @@ long long serverPopcount(void *s, long count) {
198
246
return popcountAVX2 (s , count );
199
247
}
200
248
#endif
249
+ #ifdef __aarch64__
250
+ if (count >= 16 ) {
251
+ return popcountNEON (s , count );
252
+ }
253
+ #endif
254
+
201
255
return popcountScalar (s , count );
202
256
}
203
257
0 commit comments