Skip to content

Commit 86b95aa

Browse files
committed
Merge branch 'zig' into main-dev
2 parents ac41200 + ca3d42b commit 86b95aa

File tree

10 files changed

+1664
-5
lines changed

10 files changed

+1664
-5
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,5 @@
3838
# Rust build artifacts
3939
Cargo.lock
4040
target/
41+
.zig-cache/
42+
zig-out/

.vscode/settings.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,16 @@
2222
"prettier.tabWidth": 2
2323
},
2424
"cSpell.words": [
25+
"anyopaque",
26+
"anytype",
2527
"ashvardanian",
28+
"callconv",
2629
"cntfrq",
2730
"cntvct",
2831
"codegen",
2932
"colocations",
3033
"combinators",
34+
"comptime",
3135
"Condvar",
3236
"constexpr",
3337
"coprime",
@@ -44,6 +48,7 @@
4448
"noexcept",
4549
"NUMA",
4650
"OpenMP",
51+
"orelse",
4752
"prefetcher",
4853
"println",
4954
"pthreads",

README.md

Lines changed: 213 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Fork Union 🍴
22

3-
Fork Union is arguably the lowest-latency OpenMP-style NUMA-aware minimalistic scoped thread-pool designed for 'Fork-Join' parallelism in C++, C, and Rust, avoiding × [mutexes & system calls](#locks-and-mutexes), × [dynamic memory allocations](#memory-allocations), × [CAS-primitives](#atomics-and-cas), and × [false-sharing](#alignment--false-sharing) of CPU cache-lines on the hot path 🍴
3+
Fork Union is arguably the lowest-latency OpenMP-style NUMA-aware minimalistic scoped thread-pool designed for 'Fork-Join' parallelism in C++, C, Rust, and Zig, avoiding × [mutexes & system calls](#locks-and-mutexes), × [dynamic memory allocations](#memory-allocations), × [CAS-primitives](#atomics-and-cas), and × [false-sharing](#alignment--false-sharing) of CPU cache-lines on the hot path 🍴
44

55
## Motivation
66

@@ -13,7 +13,7 @@ OpenMP, however, is not ideal for fine-grained parallelism and is less portable
1313
[![`fork_union` banner](https://github.com/ashvardanian/ashvardanian/blob/master/repositories/fork_union.jpg?raw=true)](https://github.com/ashvardanian/fork_union)
1414

1515
This is where __`fork_union`__ comes in.
16-
It's a C++ 17 library with C 99 and Rust bindings ([previously Rust implementation was standalone in v1](#why-not-reimplement-it-in-rust)).
16+
It's a C++ 17 library with C 99, Rust, and Zig bindings ([previously Rust implementation was standalone in v1](#why-not-reimplement-it-in-rust)).
1717
It supports pinning threads to specific [NUMA](https://en.wikipedia.org/wiki/Non-uniform_memory_access) nodes or individual CPU cores, making it much easier to ensure data locality and halving the latency of individual loads in Big Data applications.
1818

1919
## Basic Usage
@@ -179,13 +179,199 @@ int main() {
179179
For advanced usage, refer to the [NUMA section below](#non-uniform-memory-access-numa).
180180
NUMA detection on Linux defaults to AUTO. Override with `-D FORK_UNION_ENABLE_NUMA=ON` or `OFF`.
181181

182+
### Intro in Zig
183+
184+
To integrate into your Zig project, add Fork Union to your `build.zig.zon`:
185+
186+
```zig
187+
.dependencies = .{
188+
.fork_union = .{
189+
.url = "https://github.com/ashvardanian/fork_union/archive/refs/tags/v2.3.0.tar.gz",
190+
.hash = "12200000000000000000000000000000000000000000000000000000000000000000",
191+
},
192+
},
193+
```
194+
195+
Then import and use in your code:
196+
197+
```zig
198+
const std = @import("std");
199+
const fu = @import("fork_union");
200+
201+
pub fn main() !void {
202+
var pool = try fu.Pool.init(allocator, 4, .inclusive);
203+
defer pool.deinit();
204+
205+
// Execute work on each thread (OpenMP-style parallel)
206+
pool.forThreads(struct {
207+
fn work(thread_idx: usize, colocation_idx: usize) void {
208+
std.debug.print("Thread {}\n", .{thread_idx});
209+
}
210+
}.work, {});
211+
212+
// Distribute 1000 tasks across threads (OpenMP-style parallel for)
213+
var results = [_]i32{0} ** 1000;
214+
pool.forN(1000, struct {
215+
fn process(prong: fu.Prong, ctx: Context) void {
216+
ctx.results[prong.task_index] = @intCast(prong.task_index * 2);
217+
}
218+
}.process, .{ .results = &results });
219+
}
220+
```
221+
222+
Unlike the `std.Thread.Pool` task queue, which targets asynchronous work, Fork Union is designed for __data parallelism__
223+
and __tight parallel loops__ — think OpenMP's `#pragma omp parallel for` with zero allocations on the hot path.
224+
225+
### Intro in C
226+
227+
Fork Union provides a pure C99 API via `fork_union.h`, wrapping the C++ implementation in pre-compiled libraries: `fork_union_static.a` or `fork_union_dynamic.so`.
228+
The C API uses opaque `fu_pool_t` handles and function pointers for callbacks, making it compatible with any C99+ compiler.
229+
230+
To integrate using CMake:
231+
232+
```cmake
233+
FetchContent_Declare(
234+
fork_union
235+
GIT_REPOSITORY https://github.com/ashvardanian/fork_union
236+
GIT_TAG v2.3.0
237+
)
238+
FetchContent_MakeAvailable(fork_union)
239+
target_link_libraries(your_target PRIVATE fork_union::fork_union_static)
240+
```
241+
242+
A minimal C example:
243+
244+
```c
245+
#include <stdio.h> // printf
246+
#include <fork_union.h> // fu_pool_t, fu_pool_new, fu_pool_spawn
247+
248+
void hello_callback(void *context, size_t thread, size_t colocation) {
249+
(void)context;
250+
printf("Hello from thread %zu (colocation %zu)\n", thread, colocation);
251+
}
252+
253+
int main(void) {
254+
fu_pool_t *pool = fu_pool_new("my_pool");
255+
if (!pool || !fu_pool_spawn(pool, fu_count_logical_cores(), fu_caller_inclusive_k))
256+
return 1;
257+
258+
fu_pool_for_threads(pool, hello_callback, NULL);
259+
fu_pool_delete(pool);
260+
return 0;
261+
}
262+
```
263+
264+
For parallel tasks with context:
265+
266+
```c
267+
struct task_context {
268+
int *data;
269+
size_t size;
270+
};
271+
272+
void process_task(void *ctx, size_t task, size_t thread, size_t colocation) {
273+
(void)thread; (void)colocation;
274+
struct task_context *context = (struct task_context *)ctx;
275+
context->data[task] = task * 2;
276+
}
277+
278+
int main(void) {
279+
fu_pool_t *pool = fu_pool_new("tasks");
280+
fu_pool_spawn(pool, 4, fu_caller_inclusive_k);
281+
282+
int data[100] = {0};
283+
struct task_context ctx = { .data = data, .size = 100 };
284+
fu_pool_for_n(pool, 100, process_task, &ctx); // static scheduling
285+
fu_pool_for_n_dynamic(pool, 100, process_task, &ctx); // dynamic scheduling
286+
287+
fu_pool_delete(pool);
288+
return 0;
289+
}
290+
```
291+
292+
#### GCC Nested Functions Extension
293+
294+
GCC supports [nested functions](https://gcc.gnu.org/onlinedocs/gcc/Nested-Functions.html) that can capture variables from the enclosing scope:
295+
296+
```c
297+
#include <stdio.h>
298+
#include <stdatomic.h>
299+
#include <fork_union.h>
300+
301+
int main(void) {
302+
fu_pool_t *pool = fu_pool_new("gcc_nested");
303+
fu_pool_spawn(pool, 4, fu_caller_inclusive_k);
304+
305+
atomic_size_t counter = 0;
306+
307+
// GCC nested function - captures 'counter' from enclosing scope
308+
void nested_callback(void *ctx, size_t task, size_t thread, size_t colocation) {
309+
(void)ctx; (void)thread; (void)colocation;
310+
atomic_fetch_add(&counter, 1);
311+
}
312+
313+
fu_pool_for_n(pool, 100, nested_callback, NULL);
314+
printf("Completed %zu tasks\n", (size_t)atomic_load(&counter));
315+
316+
fu_pool_delete(pool);
317+
return 0;
318+
}
319+
```
320+
321+
Compile: `gcc -std=c11 test.c -lfork_union_static -lpthread -lnuma`
322+
323+
#### Clang Blocks Extension
324+
325+
Clang provides [blocks](https://clang.llvm.org/docs/BlockLanguageSpec.html) with `^{}` syntax:
326+
327+
```c
328+
#include <stdio.h>
329+
#include <stdatomic.h>
330+
#include <Block.h>
331+
#include <fork_union.h>
332+
333+
typedef void (^task_block_t)(void *, size_t, size_t, size_t);
334+
335+
struct block_wrapper { task_block_t block; };
336+
337+
void block_wrapper_fn(void *ctx, size_t task, size_t thread, size_t colocation) {
338+
((struct block_wrapper *)ctx)->block(NULL, task, thread, colocation);
339+
}
340+
341+
int main(void) {
342+
fu_pool_t *pool = fu_pool_new("clang_blocks");
343+
fu_pool_spawn(pool, 4, fu_caller_inclusive_k);
344+
345+
__block atomic_size_t counter = 0;
346+
347+
task_block_t my_block = ^(void *c, size_t task, size_t t, size_t col) {
348+
(void)c; (void)t; (void)col;
349+
atomic_fetch_add(&counter, 1);
350+
};
351+
352+
task_block_t heap_block = Block_copy(my_block);
353+
struct block_wrapper wrapper = { .block = heap_block };
354+
355+
fu_pool_for_n(pool, 100, block_wrapper_fn, &wrapper);
356+
357+
Block_release(heap_block);
358+
printf("Completed %zu tasks\n", (size_t)atomic_load(&counter));
359+
360+
fu_pool_delete(pool);
361+
return 0;
362+
}
363+
```
364+
365+
Compile: `clang -std=c11 -fblocks test.c -lfork_union_static -lpthread -lnuma -lBlocksRuntime`
366+
182367
## Alternatives & Differences
183368

184369
Many other thread-pool implementations are more feature-rich but have different limitations and design goals.
185370

186371
- Modern C++: [`taskflow/taskflow`](https://github.com/taskflow/taskflow), [`progschj/ThreadPool`](https://github.com/progschj/ThreadPool), [`bshoshany/thread-pool`](https://github.com/bshoshany/thread-pool)
187372
- Traditional C++: [`vit-vit/CTPL`](https://github.com/vit-vit/CTPL), [`mtrebi/thread-pool`](https://github.com/mtrebi/thread-pool)
188373
- Rust: [`tokio-rs/tokio`](https://github.com/tokio-rs/tokio), [`rayon-rs/rayon`](https://github.com/rayon-rs/rayon), [`smol-rs/smol`](https://github.com/smol-rs/smol)
374+
- Zig: [`std.Thread.Pool`](https://ziglang.org/documentation/master/std/#std.Thread.Pool)
189375

190376
Those are not designed for the same OpenMP-like use cases as __`fork_union`__.
191377
Instead, they primarily focus on task queuing, which requires significantly more work.
@@ -461,6 +647,17 @@ Rust benchmarking results for $N=128$ bodies and $I=1e6$ iterations:
461647
> ² When a combination of performance and efficiency cores is used, dynamic stealing may be more efficient than static slicing. It's also fair to say that OpenMP is not optimized for AppleClang.
462648
> 🔄 Rotation emoji stands for iterators, the default way to use Rayon and the opt-in slower, but more convenient variant for Fork Union.
463649
650+
Zig benchmarking results for $N=128$ bodies and $I=1e6$ iterations:
651+
652+
| Machine | Standard (S) | Fork Union (D) | Fork Union (S) |
653+
| :------------- | -----------: | -------------: | -------------: |
654+
| 16x Intel SPR | 2m52.0s | 18.2s | 12.8s |
655+
| 12x Apple M2 | - | - | - |
656+
| 96x Graviton 4 | - | - | - |
657+
658+
> Benchmarking suite also includes [Spice](https://github.com/judofyr/spice) and [libXEV](https://github.com/mitchellh/libxev), two popular Zig libraries for async processing, but those don't provide comparable bulk-synchronous APIs.
659+
> As a result, all of their submitted tasks typically execute on a single thread, so the results are not directly comparable.
660+
464661
You can rerun those benchmarks with the following commands:
465662

466663
```bash
@@ -580,6 +777,20 @@ cargo +stable install cargo-msrv
580777
cargo msrv find --ignore-lockfile
581778
```
582779

780+
---
781+
782+
For Zig, use the following commands:
783+
784+
```bash
785+
zig build test --summary all # run tests
786+
zig build nbody -Doptimize=ReleaseFast # build benchmark
787+
zig build -Dnuma=true # enable NUMA support (Linux)
788+
789+
# Run benchmark
790+
time NBODY_COUNT=128 NBODY_ITERATIONS=1000000 NBODY_BACKEND=fork_union_static \
791+
./zig-out/bin/nbody_zig
792+
```
793+
583794
## License
584795

585796
Licensed under the Apache License, Version 2.0. See `LICENSE` for details.

build.zig

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
const std = @import("std");
2+
const builtin = @import("builtin");
3+
4+
pub fn build(b: *std.Build) void {
5+
// Reject toolchains older than 0.16.0: the check trips when the compiler reports major version 0 and a minor version below 16
6+
if (builtin.zig_version.major == 0 and builtin.zig_version.minor < 16) {
7+
@panic("Fork Union requires Zig 0.16.0 or later. Please upgrade your Zig toolchain.");
8+
}
9+
10+
const target = b.standardTargetOptions(.{});
11+
const optimize = b.standardOptimizeOption(.{});
12+
13+
// Determine NUMA support: an explicit `-Dnuma=<bool>` option wins; when it is omitted, default to enabled only for Linux targets
14+
const enable_numa = b.option(bool, "numa", "Enable NUMA support (Linux only)") orelse
15+
(target.result.os.tag == .linux);
16+
17+
// Compile the C++ library from c/lib.cpp (like Rust's build.rs does) into a static library named `fork_union`
18+
const lib = b.addLibrary(.{
19+
.name = "fork_union",
20+
.linkage = .static,
21+
.root_module = b.createModule(.{
22+
.target = target,
23+
.optimize = optimize,
24+
}),
25+
});
26+
27+
// Build C++ flags: both sets are identical except for `FU_ENABLE_NUMA`, which is 1 only when NUMA is requested and the target is Linux
28+
const cpp_flags = if (enable_numa and target.result.os.tag == .linux)
29+
&[_][]const u8{
30+
"-std=c++20",
31+
"-fno-exceptions",
32+
"-fno-rtti",
33+
"-DFU_ENABLE_NUMA=1",
34+
}
35+
else
36+
&[_][]const u8{
37+
"-std=c++20",
38+
"-fno-exceptions",
39+
"-fno-rtti",
40+
"-DFU_ENABLE_NUMA=0",
41+
};
42+
43+
lib.addCSourceFile(.{
44+
.file = b.path("c/lib.cpp"),
45+
.flags = cpp_flags,
46+
});
47+
48+
lib.addIncludePath(b.path("include"));
49+
lib.linkLibCpp(); // Use Zig's bundled `libc++` instead of system `libstdc++`
50+
51+
b.installArtifact(lib);
52+
53+
// Create fork_union module for use as a dependency. NOTE(review): the module is not linked against `lib` or given the include path here; confirm that dependents are expected to link the installed artifact themselves
54+
_ = b.addModule("fork_union", .{
55+
.root_source_file = b.path("zig/fork_union.zig"),
56+
.target = target,
57+
});
58+
59+
// Unit tests: compile zig/fork_union.zig as a test binary, link it against the static library above, and run it from the `test` step
60+
const test_step = b.step("test", "Run library tests");
61+
const lib_tests = b.addTest(.{
62+
.root_module = b.createModule(.{
63+
.root_source_file = b.path("zig/fork_union.zig"),
64+
.target = target,
65+
.optimize = optimize,
66+
}),
67+
});
68+
69+
lib_tests.addIncludePath(b.path("include"));
70+
lib_tests.linkLibrary(lib);
71+
if (target.result.os.tag == .linux) {
72+
lib_tests.root_module.linkSystemLibrary("pthread", .{});
73+
if (enable_numa) {
74+
lib_tests.root_module.linkSystemLibrary("numa", .{});
75+
}
76+
}
77+
78+
const run_tests = b.addRunArtifact(lib_tests);
79+
test_step.dependOn(&run_tests.step);
80+
}

build.zig.zon

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
.{
2+
.name = .fork_union,
3+
.version = "2.3.0",
4+
.fingerprint = 0xc31742f3e89a27c7,
5+
.minimum_zig_version = "0.16.0",
6+
.paths = .{
7+
"build.zig",
8+
"build.zig.zon",
9+
"zig/",
10+
"include/",
11+
"c/",
12+
"scripts/",
13+
"README.md",
14+
"LICENSE",
15+
},
16+
}

0 commit comments

Comments
 (0)