Hyperloglog ARM NEON SIMD optimization (#1859)

xbasel · zuiderkwast · web-flow · commit dd772c490ed7 · 2025-05-02T00:23:58.000+02:00
Add ARM NEON optimization for HyperLogLog

- Implement two NEON optmized functions for converting between raw and
  dense representations in HyperLogLog:
  1. hllMergeDenseNEON
  2. hllDenseCompressNEON
  These functions process 16 registers in each iteration.

- Utilize existing SIMD test in hyperloglog.tcl (previously added for
  AVX2 optimization) to validate NEON implementation

Test:
``` valkey-benchmark -n 1000000 --dbnum  9  -p 21111 PFMERGE z hll1{t} hll2{t}```

```
+-------------------+-----------+-----------+---------------+
|      Metric       |  Before   |   After   | Improvement % |
+-------------------+-----------+-----------+---------------+
| Throughput (k rps)|    7.42   |   76.98   |    937.47%    |
+-------------------+-----------+-----------+---------------+
| Latency (msec)    |           |           |               |
|   avg             |   6.686   |   0.595   |     91.10%    |
|   min             |   0.520   |   0.152   |     70.77%    |
|   p50             |   7.799   |   0.599   |     92.32%    |
|   p95             |   8.039   |   0.767   |     90.46%    |
|   p99             |   8.111   |   0.807   |     90.05%    |
|   max             |   9.263   |   1.463   |     84.21%    |
+-------------------+-----------+-----------+---------------+
```

Hardware:
```
CPU: Graviton 3
Architecture:           aarch64
  CPU op-mode(s):       32-bit, 64-bit
  Byte Order:           Little Endian
CPU(s):                 64
  On-line CPU(s) list:  0-63
NUMA:
  NUMA node(s):         1
  NUMA node0 CPU(s):    0-63
Memory: 256 GB
```

Command stats:
Before:
```

cmdstat_pfmerge:calls=1000002,usec=126327984,**usec_per_call=126.33**,rejected_calls=0,failed_calls=0

```
After:
```

cmdstat_pfmerge:calls=1000002,usec=8588205,**usec_per_call=8.59**,rejected_calls=0,failed_calls=0
```
Improved by **~14.7x.**




Functional testing command:
```
./runtest --single unit/hyperloglog --only "PFMERGE results with simd"
--loops 10000 --fastfail
```
The SIMD test randomizes input and comapres scalar vs simd results.

---------

Signed-off-by: xbasel &lt;103044017+xbasel@users.noreply.github.com&gt;
Co-authored-by: Viktor Söderqvist &lt;viktor.soderqvist@est.tech&gt;
diff --git a/src/hyperloglog.c b/src/hyperloglog.c
@@ -47,6 +47,10 @@
 #include <immintrin.h>
 #endif
 
+#ifdef __aarch64__
+#include <arm_neon.h>
+#endif
+
 /* The HyperLogLog implementation is based on the following ideas:
  *
  * * The use of a 64 bit hash function as proposed in [1], in order to estimate
@@ -220,13 +224,26 @@ struct hllhdr {
 
 static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected";
 
-#if HAVE_X86_SIMD
+#if HAVE_X86_SIMD || defined(__aarch64__)
+#define SIMD_SUPPORTED 1
 static int simd_enabled = 1;
+#else
+#define SIMD_SUPPORTED 0
+#endif
+
+#if HAVE_X86_SIMD
 #define HLL_USE_AVX2 (simd_enabled && __builtin_cpu_supports("avx2"))
 #else
 #define HLL_USE_AVX2 0
 #endif
 
+#ifdef __aarch64__
+#define HLL_USE_NEON (simd_enabled)
+#else
+#define HLL_USE_NEON 0
+#endif
+
+
 /* =========================== Low level bit macros ========================= */
 
 /* Macros to access the dense representation.
@@ -1193,6 +1210,95 @@ void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) {
 }
 #endif
 
+#if defined(__aarch64__)
+/*
+ * hllMergeDenseNEON is an ARM optimized version of hllMergeDense using NEON
+ *
+ * This function merges HyperLogLog (HLL) dense registers using ARM NEON SIMD instructions.
+ * It extracts 6 bits registers from a dense format, and stores them in raw format
+ *
+ * Parameters:
+ * - reg_raw: Pointer to the raw register array
+ * - reg_dense: Pointer to the dense register array
+ */
+void hllMergeDenseNEON(uint8_t *reg_raw, const uint8_t *reg_dense) {
+    uint8_t *dense_ptr = (uint8_t *)reg_dense;
+    uint8_t *raw_ptr = (uint8_t *)reg_raw;
+
+    uint8x16_t idx = {0, 1, 2, 0xFF,
+                      3, 4, 5, 0xFF,
+                      6, 7, 8, 0xFF,
+                      9, 10, 11, 0xFF};
+
+    // Bit masks for extracting specific bit ranges
+    uint8x16_t mask1 = vreinterpretq_u8_u32(vdupq_n_u32(0x0000003f)); // Bits 0-5
+    uint8x16_t mask2 = vreinterpretq_u8_u32(vdupq_n_u32(0x00000fc0)); // Bits 6-11
+    uint8x16_t mask3 = vreinterpretq_u8_u32(vdupq_n_u32(0x0003f000)); // Bits 12-17
+    uint8x16_t mask4 = vreinterpretq_u8_u32(vdupq_n_u32(0x00fc0000)); // Bits 18-23
+
+    for (int i = 0; i < HLL_REGISTERS / 16 - 1; ++i) {
+        /* Load 16 bytes from dense registers but only the first 12 bytes are processed because they contain
+         * 16 registers, which is copied into 16 bytes raw registers.
+         * The last 4 bytes are ignored because (1) they do not form a complete number of registers, and do not fit
+         * in the 16 bytes. The unprocessed 4 bytes are processed in the next iteration.
+         */
+        uint8x16_t r = vld1q_u8(dense_ptr);
+
+        /* Reorder bytes based on index mapping
+         * Lookup indices
+         *From: {AAAB|BBCC|CDDD}
+         *To:   {AAA0|BBB0|CCC0|DDD0}
+         */
+        uint8x16_t x = vqtbl1q_u8(r, idx);
+
+        // Extract and isolate registers
+        uint8x16_t a1 = vandq_u8(x, mask1);
+        uint8x16_t a2 = vandq_u8(x, mask2);
+        uint8x16_t a3 = vandq_u8(x, mask3);
+        uint8x16_t a4 = vandq_u8(x, mask4);
+
+        // Align extracted values by shifting left
+        uint32x4_t a2_32 = vreinterpretq_u32_u8(a2);
+        a2_32 = vshlq_n_u32(a2_32, 2);
+        a2 = vreinterpretq_u8_u32(a2_32);
+
+        uint32x4_t a3_32 = vreinterpretq_u32_u8(a3);
+        a3_32 = vshlq_n_u32(a3_32, 4);
+        a3 = vreinterpretq_u8_u32(a3_32);
+
+        uint32x4_t a4_32 = vreinterpretq_u32_u8(a4);
+        a4_32 = vshlq_n_u32(a4_32, 6);
+        a4 = vreinterpretq_u8_u32(a4_32);
+
+        // Combine extracted values
+        uint8x16_t y1 = vorrq_u8(a1, a2);
+        uint8x16_t y2 = vorrq_u8(a3, a4);
+        uint8x16_t y = vorrq_u8(y1, y2);
+
+        // Load current raw register values
+        uint8x16_t z = vld1q_u8(raw_ptr);
+
+        // Update raw registers with max values
+        z = vmaxq_u8(z, y);
+
+        // Store updated values
+        vst1q_u8(raw_ptr, z);
+
+        raw_ptr += 16;
+        dense_ptr += 12;
+    }
+
+    /* Process remaining registers, we do this manually because we don't want to over-read 4 bytes */
+    uint8_t val;
+    for (int i = HLL_REGISTERS - 16; i < HLL_REGISTERS; i++) {
+        HLL_DENSE_GET_REGISTER(val, reg_dense, i);
+        if (val > reg_raw[i]) {
+            reg_raw[i] = val; // Update raw register if new value is greater
+        }
+    }
+}
+#endif // __aarch64__
+
 /* Merge dense-encoded registers to raw registers array. */
 void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) {
 #if HAVE_X86_SIMD
@@ -1203,6 +1309,14 @@ void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) {
         }
     }
 #endif
+#ifdef __aarch64__
+    if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
+        if (HLL_USE_NEON) {
+            hllMergeDenseNEON(reg_raw, reg_dense);
+            return;
+        }
+    }
+#endif
 
     uint8_t val;
     for (int i = 0; i < HLL_REGISTERS; i++) {
@@ -1357,6 +1471,74 @@ void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) {
 }
 #endif
 
+#if defined(__aarch64__)
+/*
+ * hllDenseCompressNEON is ARM optimized version of hllDenseCompress using NEON.
+ *
+ * This function takes a raw register (`reg_raw`) and compresses it into a dense representation (`reg_dense`).
+ * It uses NEON SIMD instructions to process multiple values at once.
+ *
+ * - The first loop processes most of the registers in 16-element blocks using NEON instructions.
+ * - The second loop handles the remaining registers using a direct assignment macro.
+ *
+ */
+void hllDenseCompressNEON(uint8_t *reg_dense, const uint8_t *reg_raw) {
+    /* Shuffle indices for packing bytes of dense registers
+     * From: {AAA0|BBB0|CCC0|DDD0}
+     * To:   {AAAB|BBCC|CDDD|0000}
+     */
+    uint8x16_t idx = {
+        0, 1, 2,               // Extract bytes from lane 0
+        4, 5, 6,               // Extract bytes from lane 1
+        8, 9, 10,              // Extract bytes from lane 2
+        12, 13, 14,            // Extract bytes from lane 3
+        0xFF, 0xFF, 0xFF, 0xFF // Zero out last 4 elements (padding)
+    };
+
+    // Bit masks for extracting first 6 bits from every byte within 32-bit lanes
+    uint32x4_t mask1 = vdupq_n_u32(0x0000003F); // Extract bits 0-5
+    uint32x4_t mask2 = vdupq_n_u32(0x00003F00); // Extract bits 8-13
+    uint32x4_t mask3 = vdupq_n_u32(0x003F0000); // Extract bits 16-21
+    uint32x4_t mask4 = vdupq_n_u32(0x3F000000); // Extract bits 24-29
+
+    uint8_t *r = (uint8_t *)reg_raw;   // Input pointer
+    uint8_t *t = (uint8_t *)reg_dense; // Output pointer
+
+    // Process registers in blocks of 16 using NEON instructions
+    // The last 16 registers are processed separately to avoid overwriting, as the final write is 12 bytes.
+    for (int i = 0; i < HLL_REGISTERS / 16 - 1; i++) {
+        // Load 16 bytes as 4x 32-bit values
+        uint32x4_t x = vld1q_u32((uint32_t *)r);
+
+        // Apply masks to extract a single register from every 4 registers, for every lane
+        uint32x4_t a1 = vandq_u32(x, mask1);
+        uint32x4_t a2 = vandq_u32(x, mask2);
+        uint32x4_t a3 = vandq_u32(x, mask3);
+        uint32x4_t a4 = vandq_u32(x, mask4);
+
+        // Shift extracted bits to align them properly
+        a2 = vshrq_n_u32(a2, 2);
+        a3 = vshrq_n_u32(a3, 4);
+        a4 = vshrq_n_u32(a4, 6);
+
+        uint32x4_t y1 = vorrq_u32(a1, a2);
+        uint32x4_t y2 = vorrq_u32(a3, a4);
+        uint32x4_t y = vorrq_u32(y1, y2);
+
+        // Perform a table lookup to shuffle extracted values and align them in 12 bytes
+        vst1q_u8(t, vqtbl1q_u8(vreinterpretq_u8_u32(y), idx));
+
+        t += 12;
+        r += 16;
+    }
+
+    // Handle the remaining registers individually (12 bytes)
+    for (int i = HLL_REGISTERS - 16; i < HLL_REGISTERS; i++) {
+        HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]);
+    }
+}
+#endif // __aarch64__
+
 /* Compress raw registers to dense representation. */
 void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) {
 #if HAVE_X86_SIMD
@@ -1366,6 +1548,16 @@ void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) {
             return;
         }
     }
+
+#endif
+
+#ifdef __ARM_NEON
+    if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
+        if (HLL_USE_NEON) {
+            hllDenseCompressNEON(reg_dense, reg_raw);
+            return;
+        }
+    }
 #endif
 
     for (int i = 0; i < HLL_REGISTERS; i++) {
@@ -1770,22 +1962,18 @@ void pfdebugCommand(client *c) {
         if (c->argc != 3) goto arityerr;
 
         if (!strcasecmp(c->argv[2]->ptr, "on")) {
-#if HAVE_X86_SIMD
+#if SIMD_SUPPORTED
             simd_enabled = 1;
 #endif
         } else if (!strcasecmp(c->argv[2]->ptr, "off")) {
-#if HAVE_X86_SIMD
+#if SIMD_SUPPORTED
             simd_enabled = 0;
 #endif
         } else {
             addReplyError(c, "Argument must be ON or OFF");
         }
 
-        if (HLL_USE_AVX2) {
-            addReplyStatus(c, "enabled");
-        } else {
-            addReplyStatus(c, "disabled");
-        }
+        addReplyStatus(c, (HLL_USE_AVX2 || HLL_USE_NEON) ? "enabled" : "disabled");
 
         return;
     }