Skip to content

Commit dd772c4

Browse files
xbaselzuiderkwast
andauthored
Hyperloglog ARM NEON SIMD optimization (#1859)
Add ARM NEON optimization for HyperLogLog - Implement two NEON optmized functions for converting between raw and dense representations in HyperLogLog: 1. hllMergeDenseNEON 2. hllDenseCompressNEON These functions process 16 registers in each iteration. - Utilize existing SIMD test in hyperloglog.tcl (previously added for AVX2 optimization) to validate NEON implementation Test: ``` valkey-benchmark -n 1000000 --dbnum 9 -p 21111 PFMERGE z hll1{t} hll2{t}``` ``` +-------------------+-----------+-----------+---------------+ | Metric | Before | After | Improvement % | +-------------------+-----------+-----------+---------------+ | Throughput (k rps)| 7.42 | 76.98 | 937.47% | +-------------------+-----------+-----------+---------------+ | Latency (msec) | | | | | avg | 6.686 | 0.595 | 91.10% | | min | 0.520 | 0.152 | 70.77% | | p50 | 7.799 | 0.599 | 92.32% | | p95 | 8.039 | 0.767 | 90.46% | | p99 | 8.111 | 0.807 | 90.05% | | max | 9.263 | 1.463 | 84.21% | +-------------------+-----------+-----------+---------------+ ``` Hardware: ``` CPU: Graviton 3 Architecture: aarch64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 64 On-line CPU(s) list: 0-63 NUMA: NUMA node(s): 1 NUMA node0 CPU(s): 0-63 Memory: 256 GB ``` Command stats: Before: ``` cmdstat_pfmerge:calls=1000002,usec=126327984,**usec_per_call=126.33**,rejected_calls=0,failed_calls=0 ``` After: ``` cmdstat_pfmerge:calls=1000002,usec=8588205,**usec_per_call=8.59**,rejected_calls=0,failed_calls=0 ``` Improved by **~14.7x.** Functional testing command: ``` ./runtest --single unit/hyperloglog --only "PFMERGE results with simd" --loops 10000 --fastfail ``` The SIMD test randomizes input and comapres scalar vs simd results. --------- Signed-off-by: xbasel <103044017+xbasel@users.noreply.github.com> Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
1 parent 8df0a6b commit dd772c4

File tree

1 file changed

+196
-8
lines changed

1 file changed

+196
-8
lines changed

src/hyperloglog.c

Lines changed: 196 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@
4747
#include <immintrin.h>
4848
#endif
4949

50+
#ifdef __aarch64__
51+
#include <arm_neon.h>
52+
#endif
53+
5054
/* The HyperLogLog implementation is based on the following ideas:
5155
*
5256
* * The use of a 64 bit hash function as proposed in [1], in order to estimate
@@ -220,13 +224,26 @@ struct hllhdr {
220224

221225
static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected";
222226

223-
#if HAVE_X86_SIMD
227+
#if HAVE_X86_SIMD || defined(__aarch64__)
228+
#define SIMD_SUPPORTED 1
224229
static int simd_enabled = 1;
230+
#else
231+
#define SIMD_SUPPORTED 0
232+
#endif
233+
234+
#if HAVE_X86_SIMD
225235
#define HLL_USE_AVX2 (simd_enabled && __builtin_cpu_supports("avx2"))
226236
#else
227237
#define HLL_USE_AVX2 0
228238
#endif
229239

240+
#ifdef __aarch64__
241+
#define HLL_USE_NEON (simd_enabled)
242+
#else
243+
#define HLL_USE_NEON 0
244+
#endif
245+
246+
230247
/* =========================== Low level bit macros ========================= */
231248

232249
/* Macros to access the dense representation.
@@ -1193,6 +1210,95 @@ void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) {
11931210
}
11941211
#endif
11951212

1213+
#if defined(__aarch64__)
1214+
/*
1215+
* hllMergeDenseNEON is an ARM optimized version of hllMergeDense using NEON
1216+
*
1217+
* This function merges HyperLogLog (HLL) dense registers using ARM NEON SIMD instructions.
1218+
* It extracts 6 bits registers from a dense format, and stores them in raw format
1219+
*
1220+
* Parameters:
1221+
* - reg_raw: Pointer to the raw register array
1222+
* - reg_dense: Pointer to the dense register array
1223+
*/
1224+
void hllMergeDenseNEON(uint8_t *reg_raw, const uint8_t *reg_dense) {
1225+
uint8_t *dense_ptr = (uint8_t *)reg_dense;
1226+
uint8_t *raw_ptr = (uint8_t *)reg_raw;
1227+
1228+
uint8x16_t idx = {0, 1, 2, 0xFF,
1229+
3, 4, 5, 0xFF,
1230+
6, 7, 8, 0xFF,
1231+
9, 10, 11, 0xFF};
1232+
1233+
// Bit masks for extracting specific bit ranges
1234+
uint8x16_t mask1 = vreinterpretq_u8_u32(vdupq_n_u32(0x0000003f)); // Bits 0-5
1235+
uint8x16_t mask2 = vreinterpretq_u8_u32(vdupq_n_u32(0x00000fc0)); // Bits 6-11
1236+
uint8x16_t mask3 = vreinterpretq_u8_u32(vdupq_n_u32(0x0003f000)); // Bits 12-17
1237+
uint8x16_t mask4 = vreinterpretq_u8_u32(vdupq_n_u32(0x00fc0000)); // Bits 18-23
1238+
1239+
for (int i = 0; i < HLL_REGISTERS / 16 - 1; ++i) {
1240+
/* Load 16 bytes from dense registers but only the first 12 bytes are processed because they contain
1241+
* 16 registers, which is copied into 16 bytes raw registers.
1242+
* The last 4 bytes are ignored because (1) they do not form a complete number of registers, and do not fit
1243+
* in the 16 bytes. The unprocessed 4 bytes are processed in the next iteration.
1244+
*/
1245+
uint8x16_t r = vld1q_u8(dense_ptr);
1246+
1247+
/* Reorder bytes based on index mapping
1248+
* Lookup indices
1249+
*From: {AAAB|BBCC|CDDD}
1250+
*To: {AAA0|BBB0|CCC0|DDD0}
1251+
*/
1252+
uint8x16_t x = vqtbl1q_u8(r, idx);
1253+
1254+
// Extract and isolate registers
1255+
uint8x16_t a1 = vandq_u8(x, mask1);
1256+
uint8x16_t a2 = vandq_u8(x, mask2);
1257+
uint8x16_t a3 = vandq_u8(x, mask3);
1258+
uint8x16_t a4 = vandq_u8(x, mask4);
1259+
1260+
// Align extracted values by shifting left
1261+
uint32x4_t a2_32 = vreinterpretq_u32_u8(a2);
1262+
a2_32 = vshlq_n_u32(a2_32, 2);
1263+
a2 = vreinterpretq_u8_u32(a2_32);
1264+
1265+
uint32x4_t a3_32 = vreinterpretq_u32_u8(a3);
1266+
a3_32 = vshlq_n_u32(a3_32, 4);
1267+
a3 = vreinterpretq_u8_u32(a3_32);
1268+
1269+
uint32x4_t a4_32 = vreinterpretq_u32_u8(a4);
1270+
a4_32 = vshlq_n_u32(a4_32, 6);
1271+
a4 = vreinterpretq_u8_u32(a4_32);
1272+
1273+
// Combine extracted values
1274+
uint8x16_t y1 = vorrq_u8(a1, a2);
1275+
uint8x16_t y2 = vorrq_u8(a3, a4);
1276+
uint8x16_t y = vorrq_u8(y1, y2);
1277+
1278+
// Load current raw register values
1279+
uint8x16_t z = vld1q_u8(raw_ptr);
1280+
1281+
// Update raw registers with max values
1282+
z = vmaxq_u8(z, y);
1283+
1284+
// Store updated values
1285+
vst1q_u8(raw_ptr, z);
1286+
1287+
raw_ptr += 16;
1288+
dense_ptr += 12;
1289+
}
1290+
1291+
/* Process remaining registers, we do this manually because we don't want to over-read 4 bytes */
1292+
uint8_t val;
1293+
for (int i = HLL_REGISTERS - 16; i < HLL_REGISTERS; i++) {
1294+
HLL_DENSE_GET_REGISTER(val, reg_dense, i);
1295+
if (val > reg_raw[i]) {
1296+
reg_raw[i] = val; // Update raw register if new value is greater
1297+
}
1298+
}
1299+
}
1300+
#endif // __aarch64__
1301+
11961302
/* Merge dense-encoded registers to raw registers array. */
11971303
void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) {
11981304
#if HAVE_X86_SIMD
@@ -1203,6 +1309,14 @@ void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) {
12031309
}
12041310
}
12051311
#endif
1312+
#ifdef __aarch64__
1313+
if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
1314+
if (HLL_USE_NEON) {
1315+
hllMergeDenseNEON(reg_raw, reg_dense);
1316+
return;
1317+
}
1318+
}
1319+
#endif
12061320

12071321
uint8_t val;
12081322
for (int i = 0; i < HLL_REGISTERS; i++) {
@@ -1357,6 +1471,74 @@ void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) {
13571471
}
13581472
#endif
13591473

1474+
#if defined(__aarch64__)
1475+
/*
1476+
* hllDenseCompressNEON is ARM optimized version of hllDenseCompress using NEON.
1477+
*
1478+
* This function takes a raw register (`reg_raw`) and compresses it into a dense representation (`reg_dense`).
1479+
* It uses NEON SIMD instructions to process multiple values at once.
1480+
*
1481+
* - The first loop processes most of the registers in 16-element blocks using NEON instructions.
1482+
* - The second loop handles the remaining registers using a direct assignment macro.
1483+
*
1484+
*/
1485+
void hllDenseCompressNEON(uint8_t *reg_dense, const uint8_t *reg_raw) {
1486+
/* Shuffle indices for packing bytes of dense registers
1487+
* From: {AAA0|BBB0|CCC0|DDD0}
1488+
* To: {AAAB|BBCC|CDDD|0000}
1489+
*/
1490+
uint8x16_t idx = {
1491+
0, 1, 2, // Extract bytes from lane 0
1492+
4, 5, 6, // Extract bytes from lane 1
1493+
8, 9, 10, // Extract bytes from lane 2
1494+
12, 13, 14, // Extract bytes from lane 3
1495+
0xFF, 0xFF, 0xFF, 0xFF // Zero out last 4 elements (padding)
1496+
};
1497+
1498+
// Bit masks for extracting first 6 bits from every byte within 32-bit lanes
1499+
uint32x4_t mask1 = vdupq_n_u32(0x0000003F); // Extract bits 0-5
1500+
uint32x4_t mask2 = vdupq_n_u32(0x00003F00); // Extract bits 8-13
1501+
uint32x4_t mask3 = vdupq_n_u32(0x003F0000); // Extract bits 16-21
1502+
uint32x4_t mask4 = vdupq_n_u32(0x3F000000); // Extract bits 24-29
1503+
1504+
uint8_t *r = (uint8_t *)reg_raw; // Input pointer
1505+
uint8_t *t = (uint8_t *)reg_dense; // Output pointer
1506+
1507+
// Process registers in blocks of 16 using NEON instructions
1508+
// The last 16 registers are processed separately to avoid overwriting, as the final write is 12 bytes.
1509+
for (int i = 0; i < HLL_REGISTERS / 16 - 1; i++) {
1510+
// Load 16 bytes as 4x 32-bit values
1511+
uint32x4_t x = vld1q_u32((uint32_t *)r);
1512+
1513+
// Apply masks to extract a single register from every 4 registers, for every lane
1514+
uint32x4_t a1 = vandq_u32(x, mask1);
1515+
uint32x4_t a2 = vandq_u32(x, mask2);
1516+
uint32x4_t a3 = vandq_u32(x, mask3);
1517+
uint32x4_t a4 = vandq_u32(x, mask4);
1518+
1519+
// Shift extracted bits to align them properly
1520+
a2 = vshrq_n_u32(a2, 2);
1521+
a3 = vshrq_n_u32(a3, 4);
1522+
a4 = vshrq_n_u32(a4, 6);
1523+
1524+
uint32x4_t y1 = vorrq_u32(a1, a2);
1525+
uint32x4_t y2 = vorrq_u32(a3, a4);
1526+
uint32x4_t y = vorrq_u32(y1, y2);
1527+
1528+
// Perform a table lookup to shuffle extracted values and align them in 12 bytes
1529+
vst1q_u8(t, vqtbl1q_u8(vreinterpretq_u8_u32(y), idx));
1530+
1531+
t += 12;
1532+
r += 16;
1533+
}
1534+
1535+
// Handle the remaining registers individually (12 bytes)
1536+
for (int i = HLL_REGISTERS - 16; i < HLL_REGISTERS; i++) {
1537+
HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]);
1538+
}
1539+
}
1540+
#endif // __aarch64__
1541+
13601542
/* Compress raw registers to dense representation. */
13611543
void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) {
13621544
#if HAVE_X86_SIMD
@@ -1366,6 +1548,16 @@ void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) {
13661548
return;
13671549
}
13681550
}
1551+
1552+
#endif
1553+
1554+
#ifdef __ARM_NEON
1555+
if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
1556+
if (HLL_USE_NEON) {
1557+
hllDenseCompressNEON(reg_dense, reg_raw);
1558+
return;
1559+
}
1560+
}
13691561
#endif
13701562

13711563
for (int i = 0; i < HLL_REGISTERS; i++) {
@@ -1770,22 +1962,18 @@ void pfdebugCommand(client *c) {
17701962
if (c->argc != 3) goto arityerr;
17711963

17721964
if (!strcasecmp(c->argv[2]->ptr, "on")) {
1773-
#if HAVE_X86_SIMD
1965+
#if SIMD_SUPPORTED
17741966
simd_enabled = 1;
17751967
#endif
17761968
} else if (!strcasecmp(c->argv[2]->ptr, "off")) {
1777-
#if HAVE_X86_SIMD
1969+
#if SIMD_SUPPORTED
17781970
simd_enabled = 0;
17791971
#endif
17801972
} else {
17811973
addReplyError(c, "Argument must be ON or OFF");
17821974
}
17831975

1784-
if (HLL_USE_AVX2) {
1785-
addReplyStatus(c, "enabled");
1786-
} else {
1787-
addReplyStatus(c, "disabled");
1788-
}
1976+
addReplyStatus(c, (HLL_USE_AVX2 || HLL_USE_NEON) ? "enabled" : "disabled");
17891977

17901978
return;
17911979
}

0 commit comments

Comments
 (0)