Skip to content

Commit 70f2057

Browse files
lipzhu, guowangy, zuiderkwast
authored
Optimize string2ll performance using AVX512 (#1944)
### Description This pull request aims to optimize the `string2ll` function using AVX512 instructions to enhance performance for string-to-integer conversions. The existing scalar version of the function was identified as a bottleneck in certain scenarios, and this optimization seeks to address that. ### Changes - Introduced the `string2llAVX512` function in `util.c` to utilize AVX512 SIMD instructions. - Updated `config.h` to define `ATTRIBUTE_TARGET_AVX512` for AVX512 support. - Replaced calls to `lpStringToInt64` with `string2ll` for consistency and performance improvement. - Removed redundant code in `listpack.c` and optimized integer encoding logic. ### Performance Boost The corresponding list commands can gain up to **~19%** performance improvement when the string length is 4 bytes. The longer the string, the greater the performance boost. ### Ref 1. http://0x80.pl/notesen/2018-04-19-simd-parsing-int-sequences.html#toc-entry-1 2. https://lemire.me/blog/2023/09/22/parsing-integers-quickly-with-avx-512/ --------- Signed-off-by: Lipeng Zhu <lipeng.zhu@intel.com> Signed-off-by: Lipeng Zhu <zhu.lipeng@outlook.com> Co-authored-by: Wangyang Guo <wangyang.guo@intel.com> Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
1 parent 25ef0b3 commit 70f2057

File tree

7 files changed

+170
-112
lines changed

7 files changed

+170
-112
lines changed

src/bitops.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
*/
3030

3131
#include "server.h"
32-
#ifdef HAVE_AVX2
32+
#if HAVE_X86_SIMD
3333
/* Define __MM_MALLOC_H to prevent importing the memory aligned
3434
* allocation functions, which we don't use. */
3535
#define __MM_MALLOC_H
@@ -48,7 +48,7 @@ static const unsigned char bitsinbyte[256] = {
4848
5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6,
4949
6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
5050

51-
#ifdef HAVE_AVX2
51+
#if HAVE_X86_SIMD
5252
/* The SIMD version of popcount enhances performance through parallel lookup tables which is based on the following article:
5353
* https://arxiv.org/pdf/1611.07612 */
5454
ATTRIBUTE_TARGET_AVX2
@@ -191,7 +191,7 @@ long long popcountScalar(void *s, long count) {
191191
* 'count' bytes. The implementation of this function is required to
192192
* work with an input string length up to 512 MB or more (server.proto_max_bulk_len) */
193193
long long serverPopcount(void *s, long count) {
194-
#ifdef HAVE_AVX2
194+
#if HAVE_X86_SIMD
195195
/* If length of s >= 256 bits and the CPU supports AVX2,
196196
* we prefer to use the SIMD version */
197197
if (count >= 32) {

src/config.h

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -374,17 +374,19 @@ void setcpuaffinity(const char *cpulist);
374374
#define valkey_prefetch(addr) ((void)(addr))
375375
#endif
376376

377-
/* Check if we can compile AVX2 code */
378-
#if defined(__x86_64__) && ((defined(__GNUC__) && __GNUC__ >= 5) || (defined(__clang__) && __clang_major__ >= 4))
379-
#if defined(__has_attribute) && __has_attribute(target)
380-
#define HAVE_AVX2
381-
#endif
377+
/* Check if we can compile SIMD code */
378+
#if defined(__x86_64__) && ((defined(__GNUC__) && __GNUC__ >= 5) || (defined(__clang__) && __clang_major__ >= 4)) && defined(__has_attribute) && __has_attribute(target)
379+
#define HAVE_X86_SIMD 1
380+
#else
381+
#define HAVE_X86_SIMD 0
382382
#endif
383383

384-
#if defined(HAVE_AVX2)
384+
#if HAVE_X86_SIMD
385385
#define ATTRIBUTE_TARGET_AVX2 __attribute__((target("avx2")))
386+
#define ATTRIBUTE_TARGET_AVX512 __attribute__((target("avx512f,avx512bw,avx512vl")))
386387
#else
387388
#define ATTRIBUTE_TARGET_AVX2
389+
#define ATTRIBUTE_TARGET_AVX512
388390
#endif
389391

390392
#endif

src/hyperloglog.c

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
#include <stdint.h>
4141
#include <math.h>
4242

43-
#ifdef HAVE_AVX2
43+
#if HAVE_X86_SIMD
4444
/* Define __MM_MALLOC_H to prevent importing the memory aligned
4545
* allocation functions, which we don't use. */
4646
#define __MM_MALLOC_H
@@ -220,7 +220,7 @@ struct hllhdr {
220220

221221
static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected";
222222

223-
#ifdef HAVE_AVX2
223+
#if HAVE_X86_SIMD
224224
static int simd_enabled = 1;
225225
#define HLL_USE_AVX2 (simd_enabled && __builtin_cpu_supports("avx2"))
226226
#else
@@ -1083,7 +1083,7 @@ int hllAdd(robj *o, unsigned char *ele, size_t elesize) {
10831083
}
10841084
}
10851085

1086-
#ifdef HAVE_AVX2
1086+
#if HAVE_X86_SIMD
10871087
/* A specialized version of hllMergeDense, optimized for default configurations.
10881088
*
10891089
* Requirements:
@@ -1195,7 +1195,7 @@ void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) {
11951195

11961196
/* Merge dense-encoded registers to raw registers array. */
11971197
void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) {
1198-
#ifdef HAVE_AVX2
1198+
#if HAVE_X86_SIMD
11991199
if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
12001200
if (HLL_USE_AVX2) {
12011201
hllMergeDenseAVX2(reg_raw, reg_dense);
@@ -1258,7 +1258,7 @@ int hllMerge(uint8_t *max, robj *hll) {
12581258
return C_OK;
12591259
}
12601260

1261-
#ifdef HAVE_AVX2
1261+
#if HAVE_X86_SIMD
12621262
/* A specialized version of hllDenseCompress, optimized for default configurations.
12631263
*
12641264
* Requirements:
@@ -1359,7 +1359,7 @@ void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) {
13591359

13601360
/* Compress raw registers to dense representation. */
13611361
void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) {
1362-
#ifdef HAVE_AVX2
1362+
#if HAVE_X86_SIMD
13631363
if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
13641364
if (HLL_USE_AVX2) {
13651365
hllDenseCompressAVX2(reg_dense, reg_raw);
@@ -1770,11 +1770,11 @@ void pfdebugCommand(client *c) {
17701770
if (c->argc != 3) goto arityerr;
17711771

17721772
if (!strcasecmp(c->argv[2]->ptr, "on")) {
1773-
#ifdef HAVE_AVX2
1773+
#if HAVE_X86_SIMD
17741774
simd_enabled = 1;
17751775
#endif
17761776
} else if (!strcasecmp(c->argv[2]->ptr, "off")) {
1777-
#ifdef HAVE_AVX2
1777+
#if HAVE_X86_SIMD
17781778
simd_enabled = 0;
17791779
#endif
17801780
} else {

src/listpack.c

Lines changed: 4 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -146,91 +146,6 @@ int lpSafeToAdd(unsigned char *lp, size_t add) {
146146
return 1;
147147
}
148148

149-
/* Convert a string into a signed 64 bit integer.
150-
* The function returns 1 if the string could be parsed into a (non-overflowing)
151-
* signed 64 bit int, 0 otherwise. The 'value' will be set to the parsed value
152-
* when the function returns success.
153-
*
154-
* Note that this function demands that the string strictly represents
155-
* a int64 value: no spaces or other characters before or after the string
156-
* representing the number are accepted, nor zeroes at the start if not
157-
* for the string "0" representing the zero number.
158-
*
159-
* Because of its strictness, it is safe to use this function to check if
160-
* you can convert a string into a long long, and obtain back the string
161-
* from the number without any loss in the string representation. *
162-
*
163-
* -----------------------------------------------------------------------------
164-
*
165-
* Credits: this function was adapted from the Redis OSS source code, file
166-
* "utils.c", function string2ll(), and is copyright:
167-
*
168-
* Copyright(C) 2011, Pieter Noordhuis
169-
* Copyright(C) 2011, Redis Ltd.
170-
*
171-
* The function is released under the BSD 3-clause license.
172-
*/
173-
int lpStringToInt64(const char *s, unsigned long slen, int64_t *value) {
174-
const char *p = s;
175-
unsigned long plen = 0;
176-
int negative = 0;
177-
uint64_t v;
178-
179-
/* Abort if length indicates this cannot possibly be an int */
180-
if (slen == 0 || slen >= LONG_STR_SIZE) return 0;
181-
182-
/* Special case: first and only digit is 0. */
183-
if (slen == 1 && p[0] == '0') {
184-
if (value != NULL) *value = 0;
185-
return 1;
186-
}
187-
188-
if (p[0] == '-') {
189-
negative = 1;
190-
p++;
191-
plen++;
192-
193-
/* Abort on only a negative sign. */
194-
if (plen == slen) return 0;
195-
}
196-
197-
/* First digit should be 1-9, otherwise the string should just be 0. */
198-
if (p[0] >= '1' && p[0] <= '9') {
199-
v = p[0] - '0';
200-
p++;
201-
plen++;
202-
} else {
203-
return 0;
204-
}
205-
206-
while (plen < slen && p[0] >= '0' && p[0] <= '9') {
207-
if (v > (UINT64_MAX / 10)) /* Overflow. */
208-
return 0;
209-
v *= 10;
210-
211-
if (v > (UINT64_MAX - (p[0] - '0'))) /* Overflow. */
212-
return 0;
213-
v += p[0] - '0';
214-
215-
p++;
216-
plen++;
217-
}
218-
219-
/* Return if not all bytes were used. */
220-
if (plen < slen) return 0;
221-
222-
if (negative) {
223-
if (v > ((uint64_t)(-(INT64_MIN + 1)) + 1)) /* Overflow. */
224-
return 0;
225-
if (value != NULL) *value = -v;
226-
} else {
227-
if (v > INT64_MAX) /* Overflow. */
228-
return 0;
229-
if (value != NULL) *value = v;
230-
}
231-
return 1;
232-
}
233-
234149
/* Create a new, empty listpack.
235150
* On success the new listpack is returned, otherwise an error is returned.
236151
* Pre-allocate at least `capacity` bytes of memory,
@@ -331,7 +246,7 @@ static inline void lpEncodeIntegerGetType(int64_t v, unsigned char *intenc, uint
331246
* in order to be represented. */
332247
static inline int lpEncodeGetType(unsigned char *ele, uint32_t size, unsigned char *intenc, uint64_t *enclen) {
333248
int64_t v;
334-
if (lpStringToInt64((const char *)ele, size, &v)) {
249+
if (string2ll((const char *)ele, size, (long long *)&v)) {
335250
lpEncodeIntegerGetType(v, intenc, enclen);
336251
return LP_ENCODING_INT;
337252
} else {
@@ -704,7 +619,7 @@ unsigned char *lpFind(unsigned char *lp, unsigned char *p, unsigned char *s, uin
704619
/* If the entry can be encoded as integer we set it to
705620
* 1, else set it to UCHAR_MAX, so that we don't retry
706621
* again the next time. */
707-
if (slen >= 32 || slen == 0 || !lpStringToInt64((const char *)s, slen, &vll)) {
622+
if (slen >= 32 || slen == 0 || !string2ll((const char *)s, slen, (long long *)&vll)) {
708623
vencoding = UCHAR_MAX;
709624
} else {
710625
vencoding = 1;
@@ -1373,11 +1288,11 @@ unsigned int lpCompare(unsigned char *p, unsigned char *s, uint32_t slen) {
13731288
if (value) {
13741289
return (slen == sz) && memcmp(value, s, slen) == 0;
13751290
} else {
1376-
/* We use lpStringToInt64() to get an integer representation of the
1291+
/* We use string2ll() to get an integer representation of the
13771292
* string 's' and compare it to 'sval', it's much faster than convert
13781293
* integer to string and comparing. */
13791294
int64_t sval;
1380-
if (lpStringToInt64((const char *)s, slen, &sval)) return sz == sval;
1295+
if (string2ll((const char *)s, slen, (long long *)&sval)) return sz == sval;
13811296
}
13821297

13831298
return 0;

src/unit/test_bitops.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#include "../zmalloc.h"
88

99
extern long long popcountScalar(void *s, long count);
10-
#ifdef HAVE_AVX2
10+
#if HAVE_X86_SIMD
1111
extern long long popcountAVX2(void *s, long count);
1212
#endif
1313

@@ -36,7 +36,7 @@ static int test_case(const char *msg, int size) {
3636
long long expect = bitcount(buf, size);
3737
long long ret_scalar = popcountScalar(buf, size);
3838
TEST_ASSERT_MESSAGE(msg, expect == ret_scalar);
39-
#ifdef HAVE_AVX2
39+
#if HAVE_X86_SIMD
4040
long long ret_avx2 = popcountAVX2(buf, size);
4141
TEST_ASSERT_MESSAGE(msg, expect == ret_avx2);
4242
#endif

src/unit/test_util.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ int test_string2ll(int argc, char **argv, int flags) {
6969
valkey_strlcpy(buf, "9223372036854775808", sizeof(buf)); /* overflow */
7070
TEST_ASSERT(string2ll(buf, strlen(buf), &v) == 0);
7171

72+
valkey_strlcpy(buf, "18446744073709551615", sizeof(buf)); /* overflow */
73+
TEST_ASSERT(string2ll(buf, strlen(buf), &v) == 0);
74+
7275
return 0;
7376
}
7477

0 commit comments

Comments
 (0)