Skip to content

Commit c159bbd

Browse files
committed
Docs, tweaks.
1 parent c90f820 commit c159bbd

File tree

2 files changed

+59
-17
lines changed

2 files changed

+59
-17
lines changed

sqlite3/libc/README.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Using SIMD for libc
2+
3+
I found that implementing some libc functions with Wasm SIMD128 can make them significantly faster.
4+
5+
Rough numbers for [wazero](https://wazero.io/):
6+
7+
function | speedup
8+
------------ | -----
9+
`strlen` | 4.1×
10+
`memchr` | 4.1×
11+
`strchr` | 4.0×
12+
`strrchr` | 9.1×
13+
`memcmp` | 13.0×
14+
`strcmp` | 10.4×
15+
`strncmp` | 15.7×
16+
`strcasecmp` | 8.8×
17+
`strncasecmp`| 8.6×
18+
`strspn` | 9.9×
19+
`strcspn` | 9.0×
20+
`memmem` | 2.2×
21+
`strstr` | 5.5×
22+
`strcasestr` | 25.2×
23+
24+
For functions where musl uses SWAR on a 4-byte `size_t`,
25+
the improvement is around 4×.
26+
This is very close to the expected theoretical improvement,
27+
as we're processing 4× the bytes per cycle (16 _vs._ 4).
28+
29+
For other functions where there's no algorithmic change,
30+
the improvement is around 8×.
31+
These functions are harder to optimize
32+
(which is why musl doesn't bother with SWAR),
33+
so getting an 8× improvement from processing 16× the bytes seems decent.
34+
35+
String search is harder to compare, since there are algorithmic changes,
36+
and different needles produce very different numbers.
37+
We use [Quick Search](https://igm.univ-mlv.fr/~lecroq/string/node19.html) for `memmem`,
38+
and [Rabin–Karp](https://igm.univ-mlv.fr/~lecroq/string/node5.html) for `strstr` and `strcasestr`;
39+
musl uses [Two-Way](https://igm.univ-mlv.fr/~lecroq/string/node26.html) for `memmem` and `strstr`,
40+
and [brute force](https://igm.univ-mlv.fr/~lecroq/string/node3.html) for `strcasestr`.
41+
Unlike Two-Way, both replacements can go quadratic for long, periodic needles.

sqlite3/libc/string.h

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -402,8 +402,8 @@ size_t strspn(const char *s, const char *c) {
402402
__wasm_v128_bitmap256_t bitmap = {};
403403

404404
for (; *c; c++) {
405-
__wasm_v128_setbit(&bitmap, *c);
406405
// Terminator IS NOT on the bitmap.
406+
__wasm_v128_setbit(&bitmap, *c);
407407
}
408408

409409
for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
@@ -433,11 +433,10 @@ size_t strcspn(const char *s, const char *c) {
433433

434434
__wasm_v128_bitmap256_t bitmap = {};
435435

436-
for (;;) {
437-
__wasm_v128_setbit(&bitmap, *c);
436+
do {
438437
// Terminator IS on the bitmap.
439-
if (!*c++) break;
440-
}
438+
__wasm_v128_setbit(&bitmap, *c);
439+
} while (*c++);
441440

442441
for (; N >= sizeof(v128_t); N -= sizeof(v128_t)) {
443442
const v128_t cmp = __wasm_v128_chkbits(bitmap, wasm_v128_load(w));
@@ -465,13 +464,13 @@ size_t strcspn(const char *s, const char *c) {
465464
// We augment the SIMD algorithm with Quick Search's
466465
// bad-character shift.
467466
//
468-
// https://www-igm.univ-mlv.fr/~lecroq/string/node14.html
469-
// https://www-igm.univ-mlv.fr/~lecroq/string/node18.html
470-
// https://www-igm.univ-mlv.fr/~lecroq/string/node19.html
471-
// https://www-igm.univ-mlv.fr/~lecroq/string/node22.html
467+
// https://igm.univ-mlv.fr/~lecroq/string/node14.html
468+
// https://igm.univ-mlv.fr/~lecroq/string/node18.html
469+
// https://igm.univ-mlv.fr/~lecroq/string/node19.html
470+
// https://igm.univ-mlv.fr/~lecroq/string/node22.html
472471

473-
static const char *__memmem(const char *haystk, size_t sh,
474-
const char *needle, size_t sn,
472+
static const char *__memmem(const char *haystk, size_t sh, //
473+
const char *needle, size_t sn, //
475474
uint8_t bmbc[256]) {
476475
// We've handled empty and single character needles.
477476
// The needle is not longer than the haystack.
@@ -490,8 +489,8 @@ static const char *__memmem(const char *haystk, size_t sh,
490489
const v128_t lst = wasm_i8x16_splat(needle[i]);
491490

492491
// The last haystack offset for which loading blk_lst is safe.
493-
const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - i -
494-
sizeof(v128_t));
492+
const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - //
493+
(sizeof(v128_t) + i));
495494

496495
while (haystk <= H) {
497496
const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk));
@@ -622,8 +621,8 @@ char *strcasestr(const char *haystk, const char *needle) {
622621
const v128_t lstu = wasm_i8x16_splat(toupper(needle[i]));
623622

624623
// The last haystk offset for which loading blk_lst is safe.
625-
const char *H =
626-
(char *)(__builtin_wasm_memory_size(0) * PAGESIZE - i - sizeof(v128_t));
624+
const char *H = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE - //
625+
(sizeof(v128_t) + i));
627626

628627
while (haystk <= H) {
629628
const v128_t blk_fst = wasm_v128_load((v128_t *)(haystk));
@@ -680,7 +679,8 @@ char *strcasestr(const char *haystk, const char *needle) {
680679
// - strtok
681680

682681
__attribute__((weak))
683-
void *memccpy(void *__restrict dest, const void *__restrict src, int c, size_t n) {
682+
void *memccpy(void *__restrict dest, const void *__restrict src, int c,
683+
size_t n) {
684684
const void *m = memchr(src, c, n);
685685
if (m != NULL) {
686686
n = (char *)m - (char *)src + 1;
@@ -717,7 +717,8 @@ static char *__stpcpy(char *__restrict dest, const char *__restrict src) {
717717
return dest + slen;
718718
}
719719

720-
static char *__stpncpy(char *__restrict dest, const char *__restrict src, size_t n) {
720+
static char *__stpncpy(char *__restrict dest, const char *__restrict src,
721+
size_t n) {
721722
size_t strnlen(const char *s, size_t n);
722723
size_t slen = strnlen(src, n);
723724
memcpy(dest, src, slen);

0 commit comments

Comments
 (0)