47
47
#include <immintrin.h>
48
48
#endif
49
49
50
+ #ifdef __aarch64__
51
+ #include <arm_neon.h>
52
+ #endif
53
+
50
54
/* The HyperLogLog implementation is based on the following ideas:
51
55
*
52
56
* * The use of a 64 bit hash function as proposed in [1], in order to estimate
@@ -220,13 +224,26 @@ struct hllhdr {
220
224
221
225
static char * invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected" ;
222
226
223
- #if HAVE_X86_SIMD
227
+ #if HAVE_X86_SIMD || defined(__aarch64__ )
228
+ #define SIMD_SUPPORTED 1
224
229
static int simd_enabled = 1 ;
230
+ #else
231
+ #define SIMD_SUPPORTED 0
232
+ #endif
233
+
234
+ #if HAVE_X86_SIMD
225
235
#define HLL_USE_AVX2 (simd_enabled && __builtin_cpu_supports("avx2"))
226
236
#else
227
237
#define HLL_USE_AVX2 0
228
238
#endif
229
239
240
+ #ifdef __aarch64__
241
+ #define HLL_USE_NEON (simd_enabled)
242
+ #else
243
+ #define HLL_USE_NEON 0
244
+ #endif
245
+
246
+
230
247
/* =========================== Low level bit macros ========================= */
231
248
232
249
/* Macros to access the dense representation.
@@ -1193,6 +1210,95 @@ void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) {
1193
1210
}
1194
1211
#endif
1195
1212
1213
+ #if defined(__aarch64__ )
1214
+ /*
1215
+ * hllMergeDenseNEON is an ARM optimized version of hllMergeDense using NEON
1216
+ *
1217
+ * This function merges HyperLogLog (HLL) dense registers using ARM NEON SIMD instructions.
1218
+ * It extracts 6 bits registers from a dense format, and stores them in raw format
1219
+ *
1220
+ * Parameters:
1221
+ * - reg_raw: Pointer to the raw register array
1222
+ * - reg_dense: Pointer to the dense register array
1223
+ */
1224
+ void hllMergeDenseNEON (uint8_t * reg_raw , const uint8_t * reg_dense ) {
1225
+ uint8_t * dense_ptr = (uint8_t * )reg_dense ;
1226
+ uint8_t * raw_ptr = (uint8_t * )reg_raw ;
1227
+
1228
+ uint8x16_t idx = {0 , 1 , 2 , 0xFF ,
1229
+ 3 , 4 , 5 , 0xFF ,
1230
+ 6 , 7 , 8 , 0xFF ,
1231
+ 9 , 10 , 11 , 0xFF };
1232
+
1233
+ // Bit masks for extracting specific bit ranges
1234
+ uint8x16_t mask1 = vreinterpretq_u8_u32 (vdupq_n_u32 (0x0000003f )); // Bits 0-5
1235
+ uint8x16_t mask2 = vreinterpretq_u8_u32 (vdupq_n_u32 (0x00000fc0 )); // Bits 6-11
1236
+ uint8x16_t mask3 = vreinterpretq_u8_u32 (vdupq_n_u32 (0x0003f000 )); // Bits 12-17
1237
+ uint8x16_t mask4 = vreinterpretq_u8_u32 (vdupq_n_u32 (0x00fc0000 )); // Bits 18-23
1238
+
1239
+ for (int i = 0 ; i < HLL_REGISTERS / 16 - 1 ; ++ i ) {
1240
+ /* Load 16 bytes from dense registers but only the first 12 bytes are processed because they contain
1241
+ * 16 registers, which is copied into 16 bytes raw registers.
1242
+ * The last 4 bytes are ignored because (1) they do not form a complete number of registers, and do not fit
1243
+ * in the 16 bytes. The unprocessed 4 bytes are processed in the next iteration.
1244
+ */
1245
+ uint8x16_t r = vld1q_u8 (dense_ptr );
1246
+
1247
+ /* Reorder bytes based on index mapping
1248
+ * Lookup indices
1249
+ *From: {AAAB|BBCC|CDDD}
1250
+ *To: {AAA0|BBB0|CCC0|DDD0}
1251
+ */
1252
+ uint8x16_t x = vqtbl1q_u8 (r , idx );
1253
+
1254
+ // Extract and isolate registers
1255
+ uint8x16_t a1 = vandq_u8 (x , mask1 );
1256
+ uint8x16_t a2 = vandq_u8 (x , mask2 );
1257
+ uint8x16_t a3 = vandq_u8 (x , mask3 );
1258
+ uint8x16_t a4 = vandq_u8 (x , mask4 );
1259
+
1260
+ // Align extracted values by shifting left
1261
+ uint32x4_t a2_32 = vreinterpretq_u32_u8 (a2 );
1262
+ a2_32 = vshlq_n_u32 (a2_32 , 2 );
1263
+ a2 = vreinterpretq_u8_u32 (a2_32 );
1264
+
1265
+ uint32x4_t a3_32 = vreinterpretq_u32_u8 (a3 );
1266
+ a3_32 = vshlq_n_u32 (a3_32 , 4 );
1267
+ a3 = vreinterpretq_u8_u32 (a3_32 );
1268
+
1269
+ uint32x4_t a4_32 = vreinterpretq_u32_u8 (a4 );
1270
+ a4_32 = vshlq_n_u32 (a4_32 , 6 );
1271
+ a4 = vreinterpretq_u8_u32 (a4_32 );
1272
+
1273
+ // Combine extracted values
1274
+ uint8x16_t y1 = vorrq_u8 (a1 , a2 );
1275
+ uint8x16_t y2 = vorrq_u8 (a3 , a4 );
1276
+ uint8x16_t y = vorrq_u8 (y1 , y2 );
1277
+
1278
+ // Load current raw register values
1279
+ uint8x16_t z = vld1q_u8 (raw_ptr );
1280
+
1281
+ // Update raw registers with max values
1282
+ z = vmaxq_u8 (z , y );
1283
+
1284
+ // Store updated values
1285
+ vst1q_u8 (raw_ptr , z );
1286
+
1287
+ raw_ptr += 16 ;
1288
+ dense_ptr += 12 ;
1289
+ }
1290
+
1291
+ /* Process remaining registers, we do this manually because we don't want to over-read 4 bytes */
1292
+ uint8_t val ;
1293
+ for (int i = HLL_REGISTERS - 16 ; i < HLL_REGISTERS ; i ++ ) {
1294
+ HLL_DENSE_GET_REGISTER (val , reg_dense , i );
1295
+ if (val > reg_raw [i ]) {
1296
+ reg_raw [i ] = val ; // Update raw register if new value is greater
1297
+ }
1298
+ }
1299
+ }
1300
+ #endif // __aarch64__
1301
+
1196
1302
/* Merge dense-encoded registers to raw registers array. */
1197
1303
void hllMergeDense (uint8_t * reg_raw , const uint8_t * reg_dense ) {
1198
1304
#if HAVE_X86_SIMD
@@ -1203,6 +1309,14 @@ void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) {
1203
1309
}
1204
1310
}
1205
1311
#endif
1312
+ #ifdef __aarch64__
1313
+ if (HLL_REGISTERS == 16384 && HLL_BITS == 6 ) {
1314
+ if (HLL_USE_NEON ) {
1315
+ hllMergeDenseNEON (reg_raw , reg_dense );
1316
+ return ;
1317
+ }
1318
+ }
1319
+ #endif
1206
1320
1207
1321
uint8_t val ;
1208
1322
for (int i = 0 ; i < HLL_REGISTERS ; i ++ ) {
@@ -1357,6 +1471,74 @@ void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) {
1357
1471
}
1358
1472
#endif
1359
1473
1474
+ #if defined(__aarch64__ )
1475
+ /*
1476
+ * hllDenseCompressNEON is ARM optimized version of hllDenseCompress using NEON.
1477
+ *
1478
+ * This function takes a raw register (`reg_raw`) and compresses it into a dense representation (`reg_dense`).
1479
+ * It uses NEON SIMD instructions to process multiple values at once.
1480
+ *
1481
+ * - The first loop processes most of the registers in 16-element blocks using NEON instructions.
1482
+ * - The second loop handles the remaining registers using a direct assignment macro.
1483
+ *
1484
+ */
1485
+ void hllDenseCompressNEON (uint8_t * reg_dense , const uint8_t * reg_raw ) {
1486
+ /* Shuffle indices for packing bytes of dense registers
1487
+ * From: {AAA0|BBB0|CCC0|DDD0}
1488
+ * To: {AAAB|BBCC|CDDD|0000}
1489
+ */
1490
+ uint8x16_t idx = {
1491
+ 0 , 1 , 2 , // Extract bytes from lane 0
1492
+ 4 , 5 , 6 , // Extract bytes from lane 1
1493
+ 8 , 9 , 10 , // Extract bytes from lane 2
1494
+ 12 , 13 , 14 , // Extract bytes from lane 3
1495
+ 0xFF , 0xFF , 0xFF , 0xFF // Zero out last 4 elements (padding)
1496
+ };
1497
+
1498
+ // Bit masks for extracting first 6 bits from every byte within 32-bit lanes
1499
+ uint32x4_t mask1 = vdupq_n_u32 (0x0000003F ); // Extract bits 0-5
1500
+ uint32x4_t mask2 = vdupq_n_u32 (0x00003F00 ); // Extract bits 8-13
1501
+ uint32x4_t mask3 = vdupq_n_u32 (0x003F0000 ); // Extract bits 16-21
1502
+ uint32x4_t mask4 = vdupq_n_u32 (0x3F000000 ); // Extract bits 24-29
1503
+
1504
+ uint8_t * r = (uint8_t * )reg_raw ; // Input pointer
1505
+ uint8_t * t = (uint8_t * )reg_dense ; // Output pointer
1506
+
1507
+ // Process registers in blocks of 16 using NEON instructions
1508
+ // The last 16 registers are processed separately to avoid overwriting, as the final write is 12 bytes.
1509
+ for (int i = 0 ; i < HLL_REGISTERS / 16 - 1 ; i ++ ) {
1510
+ // Load 16 bytes as 4x 32-bit values
1511
+ uint32x4_t x = vld1q_u32 ((uint32_t * )r );
1512
+
1513
+ // Apply masks to extract a single register from every 4 registers, for every lane
1514
+ uint32x4_t a1 = vandq_u32 (x , mask1 );
1515
+ uint32x4_t a2 = vandq_u32 (x , mask2 );
1516
+ uint32x4_t a3 = vandq_u32 (x , mask3 );
1517
+ uint32x4_t a4 = vandq_u32 (x , mask4 );
1518
+
1519
+ // Shift extracted bits to align them properly
1520
+ a2 = vshrq_n_u32 (a2 , 2 );
1521
+ a3 = vshrq_n_u32 (a3 , 4 );
1522
+ a4 = vshrq_n_u32 (a4 , 6 );
1523
+
1524
+ uint32x4_t y1 = vorrq_u32 (a1 , a2 );
1525
+ uint32x4_t y2 = vorrq_u32 (a3 , a4 );
1526
+ uint32x4_t y = vorrq_u32 (y1 , y2 );
1527
+
1528
+ // Perform a table lookup to shuffle extracted values and align them in 12 bytes
1529
+ vst1q_u8 (t , vqtbl1q_u8 (vreinterpretq_u8_u32 (y ), idx ));
1530
+
1531
+ t += 12 ;
1532
+ r += 16 ;
1533
+ }
1534
+
1535
+ // Handle the remaining registers individually (12 bytes)
1536
+ for (int i = HLL_REGISTERS - 16 ; i < HLL_REGISTERS ; i ++ ) {
1537
+ HLL_DENSE_SET_REGISTER (reg_dense , i , reg_raw [i ]);
1538
+ }
1539
+ }
1540
+ #endif // __aarch64__
1541
+
1360
1542
/* Compress raw registers to dense representation. */
1361
1543
void hllDenseCompress (uint8_t * reg_dense , const uint8_t * reg_raw ) {
1362
1544
#if HAVE_X86_SIMD
@@ -1366,6 +1548,16 @@ void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) {
1366
1548
return ;
1367
1549
}
1368
1550
}
1551
+
1552
+ #endif
1553
+
1554
+ #ifdef __ARM_NEON
1555
+ if (HLL_REGISTERS == 16384 && HLL_BITS == 6 ) {
1556
+ if (HLL_USE_NEON ) {
1557
+ hllDenseCompressNEON (reg_dense , reg_raw );
1558
+ return ;
1559
+ }
1560
+ }
1369
1561
#endif
1370
1562
1371
1563
for (int i = 0 ; i < HLL_REGISTERS ; i ++ ) {
@@ -1770,22 +1962,18 @@ void pfdebugCommand(client *c) {
1770
1962
if (c -> argc != 3 ) goto arityerr ;
1771
1963
1772
1964
if (!strcasecmp (c -> argv [2 ]-> ptr , "on" )) {
1773
- #if HAVE_X86_SIMD
1965
+ #if SIMD_SUPPORTED
1774
1966
simd_enabled = 1 ;
1775
1967
#endif
1776
1968
} else if (!strcasecmp (c -> argv [2 ]-> ptr , "off" )) {
1777
- #if HAVE_X86_SIMD
1969
+ #if SIMD_SUPPORTED
1778
1970
simd_enabled = 0 ;
1779
1971
#endif
1780
1972
} else {
1781
1973
addReplyError (c , "Argument must be ON or OFF" );
1782
1974
}
1783
1975
1784
- if (HLL_USE_AVX2 ) {
1785
- addReplyStatus (c , "enabled" );
1786
- } else {
1787
- addReplyStatus (c , "disabled" );
1788
- }
1976
+ addReplyStatus (c , (HLL_USE_AVX2 || HLL_USE_NEON ) ? "enabled" : "disabled" );
1789
1977
1790
1978
return ;
1791
1979
}
0 commit comments