Skip to content

Commit d29bd45

Browse files
committed
hash: implement fast crc32c
1 parent 0adcfd6 commit d29bd45

File tree

6 files changed

+304
-34
lines changed

6 files changed

+304
-34
lines changed

lib/std/hash/crc.zig

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
//! This file is auto-generated by tools/update_crc_catalog.zig.
22

3+
const builtin = @import("builtin");
34
const impl = @import("crc/impl.zig");
45

56
pub const Crc = impl.Crc;
@@ -13,6 +14,18 @@ test {
1314
_ = @import("crc/test.zig");
1415
}
1516

17+
const can_use_fast_crc32c = builtin.cpu.has(.x86, .crc32) and builtin.zig_backend == .stage2_llvm;
18+
pub const Crc32Iscsi = switch (can_use_fast_crc32c) {
19+
true => @import("crc/crc32c.zig").Wrapper,
20+
else => Crc(u32, .{
21+
.polynomial = 0x1edc6f41,
22+
.initial = 0xffffffff,
23+
.reflect_input = true,
24+
.reflect_output = true,
25+
.xor_output = 0xffffffff,
26+
}),
27+
};
28+
1629
pub const Crc3Gsm = Crc(u3, .{
1730
.polynomial = 0x3,
1831
.initial = 0x0,
@@ -797,14 +810,6 @@ pub const Crc32Cksum = Crc(u32, .{
797810
.xor_output = 0xffffffff,
798811
});
799812

800-
pub const Crc32Iscsi = Crc(u32, .{
801-
.polynomial = 0x1edc6f41,
802-
.initial = 0xffffffff,
803-
.reflect_input = true,
804-
.reflect_output = true,
805-
.xor_output = 0xffffffff,
806-
});
807-
808813
pub const Crc32IsoHdlc = Crc(u32, .{
809814
.polynomial = 0x04c11db7,
810815
.initial = 0xffffffff,

lib/std/hash/crc/crc32c.zig

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
//! Implements CRC-32C (Castagnoli) using the SSE4.2 Intel CRC32 instruction.
2+
//!
3+
//! A couple useful links for understanding the approach taken here:
4+
//! - https://github.com/madler/brotli/blob/1d428d3a9baade233ebc3ac108293256bcb813d1/crc32c.c
5+
//! - https://github.com/madler/zlib/blob/5a82f71ed1dfc0bec044d9702463dbdf84ea3b71/crc32.c
6+
//! - http://www.ross.net/crc/download/crc_v3.txt
7+
8+
// Reflected CRC-32C polynomial in binary form.
9+
const POLY = 0x82f63b78;
10+
11+
const LONG = 8192;
12+
const SHORT = 256;
13+
const long_lookup_table = genTable(LONG);
14+
const short_lookup_table = genTable(SHORT);
15+
16+
/// Generates the lookup table for efficiently combining CRCs over a block of a given length `length`.
17+
/// This works by building an operator that advances the CRC state as if `length` zero-bytes were appended.
18+
/// We pre-compute 4 tables of 256 entries each (one per byte offset).
19+
///
20+
///
21+
/// The idea behind this table is quite interesting. The CRC state is equivalent to the
22+
/// remainder of dividing the message polynomial (over GF(2)) by the CRC polynomial.
23+
///
24+
/// Advancing the CRC register by `k` zero bits is equivalent to multiplying the current
25+
/// CRC state by `x^k` modulo the CRC polynomial. This operation can be represented
26+
/// as a linear transformation in GF(2), i.e, a matrix.
27+
///
28+
/// We build up this matrix via repeated squaring:
29+
/// - odd represents the operator for 1 zero bit (i.e, multiplication by `x^1 mod POLY`)
30+
/// - even represents the operator for 2 zero bits (`x^2 mod POLY`)
31+
/// - squaring again gives `x^4 mod POLY`, and so on until we get to the right size.
32+
///
33+
/// By squaring the shifting `len`, we build the operator for `x^l mod POLY`.
34+
fn genTable(length: usize) [4][256]u32 {
35+
@setEvalBranchQuota(250000);
36+
37+
var even: [32]u32 = undefined;
38+
zeroes: {
39+
var odd: [32]u32 = undefined;
40+
41+
// Initialize our `odd` array with the operator for a single zero bit:
42+
// - odd[0] is the polynomial itself (acts on the MSB).
43+
// - odd[1..32] represent shifting a single bit through 31 positions.
44+
odd[0] = POLY;
45+
var row: u32 = 1;
46+
for (1..32) |n| {
47+
odd[n] = row;
48+
row <<= 1;
49+
}
50+
51+
// even = odd squared: even represents `x^2 mod POLY`.
52+
square(&even, &odd);
53+
// odd = even squared: odd now represents `x^4 mod POLY`.
54+
square(&odd, &even);
55+
56+
// Continue squaring to double the number of zeroes encoded each time:
57+
//
58+
// At each point in the process:
59+
// - square(even, odd): even gets the operator for twice the current length.
60+
// - square(odd, even): odd gets the operator for 4 times the original length.
61+
var len = length;
62+
while (true) {
63+
square(&even, &odd);
64+
len >>= 1;
65+
if (len == 0) break :zeroes;
66+
square(&odd, &even);
67+
len >>= 1;
68+
if (len == 0) break;
69+
}
70+
71+
@memcpy(&even, &odd);
72+
}
73+
74+
var zeroes: [4][256]u32 = undefined;
75+
for (0..256) |n| {
76+
zeroes[0][n] = times(&even, n);
77+
zeroes[1][n] = times(&even, n << 8);
78+
zeroes[2][n] = times(&even, n << 16);
79+
zeroes[3][n] = times(&even, n << 24);
80+
}
81+
return zeroes;
82+
}
83+
84+
/// Computes `mat * vec` over `GF(2)`, where `mat` is a 32x32 binary matrix and `vec`
85+
/// is a 32-bit vector. This somewhat "simulates" how bits propagate through the CRC register
86+
/// during shifting.
87+
///
88+
/// - In GF(2) (aka a field where the only values are 0 and 1, aka binary), multiplication is
89+
/// an `AND`, and addition is `XOR`.
90+
/// - This dot product determines how each bit in the input vector "contributes" to
91+
/// the final CRC state, by XORing (adding) rows of the matrix where `vec` has 1s.
92+
fn times(mat: *[32]u32, vec: u32) u32 {
93+
var sum: u32 = 0;
94+
var v = vec;
95+
var i: u32 = 0;
96+
while (v != 0) {
97+
if (v & 1 != 0) sum ^= mat[i];
98+
v >>= 1;
99+
i += 1;
100+
}
101+
return sum;
102+
}
103+
104+
/// Computes the square of a matrix in GF(2), i.e `dst = dst x src`.
105+
///
106+
/// This produces the operator for doubling the number of zeroes:
107+
/// if `src` represents advancing the CRC by `k` zeroes, then `dest` will
108+
/// represent advancing by 2k zeroes.
109+
///
110+
/// Since polynomial multiplication mod POLY is linear, `mat(mat(x)) = mat^2(x)`
111+
/// gives the effect of two sequential applications of the operator.
112+
fn square(dst: *[32]u32, src: *[32]u32) void {
113+
for (dst, src) |*s, m| {
114+
s.* = times(src, m);
115+
}
116+
}
117+
118+
fn shift(table: *const [4][256]u32, crc: u32) u32 {
119+
return table[0][crc & 0xFF] ^ table[1][(crc >> 8) & 0xFF] ^ table[2][(crc >> 16) & 0xFF] ^ table[3][crc >> 24];
120+
}
121+
122+
fn crc32(crc: u32, input: []const u8) u32 {
123+
var crc0: u64 = crc;
124+
125+
// Compute the CRC for up to seven leading bytes to bring the
126+
// `next` pointer to an eight-byte boundary.
127+
var next = input;
128+
while (next.len > 0 and @intFromPtr(next.ptr) & 7 != 0) {
129+
asm volatile ("crc32b %[out], %[in]"
130+
: [in] "+r" (crc0),
131+
: [out] "rm" (next[0]),
132+
);
133+
next = next[1..];
134+
}
135+
136+
// Compute the CRC on sets of LONG * 3 bytes, executing three independent
137+
// CRC instructions, each on LONG bytes. This is an optimization for
138+
// targets where the CRC instruction has a throughput of one CRC per
139+
// cycle, but a latency of three cycles.
140+
while (next.len >= LONG * 3) {
141+
var crc1: u64 = 0;
142+
var crc2: u64 = 0;
143+
144+
const start = next.len;
145+
while (true) {
146+
// Safe @alignCast(), since we've aligned the pointer to 8 bytes before this loop.
147+
const long: [*]const u64 = @alignCast(@ptrCast(next));
148+
asm volatile (
149+
\\crc32q %[out0], %[in0]
150+
\\crc32q %[out1], %[in1]
151+
\\crc32q %[out2], %[in2]
152+
: [in0] "+r" (crc0),
153+
[in1] "+r" (crc1),
154+
[in2] "+r" (crc2),
155+
: [out0] "rm" (long[0 * LONG / 8]),
156+
[out1] "rm" (long[1 * LONG / 8]),
157+
[out2] "rm" (long[2 * LONG / 8]),
158+
);
159+
next = next[8..];
160+
if (next.len <= start - LONG) break;
161+
}
162+
163+
crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc1;
164+
crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc2;
165+
next = next[LONG * 2 ..];
166+
}
167+
168+
// Same thing as above, but for smaller chunks of SHORT bytes.
169+
while (next.len >= SHORT * 3) {
170+
var crc1: u64 = 0;
171+
var crc2: u64 = 0;
172+
173+
const start = next.len;
174+
while (true) {
175+
const long: [*]const u64 = @alignCast(@ptrCast(next));
176+
asm volatile (
177+
\\crc32q %[out0], %[in0]
178+
\\crc32q %[out1], %[in1]
179+
\\crc32q %[out2], %[in2]
180+
: [in0] "+r" (crc0),
181+
[in1] "+r" (crc1),
182+
[in2] "+r" (crc2),
183+
: [out0] "rm" (long[0 * SHORT / 8]),
184+
[out1] "rm" (long[1 * SHORT / 8]),
185+
[out2] "rm" (long[2 * SHORT / 8]),
186+
);
187+
next = next[8..];
188+
if (next.len <= start - SHORT) break;
189+
}
190+
191+
crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc1;
192+
crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc2;
193+
next = next[SHORT * 2 ..];
194+
}
195+
196+
// Compute via 8-byte chunks, until we're left with less than 8 bytes.
197+
while (next.len >= 8) {
198+
const long: [*]const u64 = @alignCast(@ptrCast(next));
199+
asm volatile ("crc32q %[out], %[in]"
200+
: [in] "+r" (crc0),
201+
: [out] "rm" (long[0]),
202+
);
203+
next = next[8..];
204+
}
205+
206+
// // Finish the last bytes with just single instructions.
207+
while (next.len > 0) {
208+
asm volatile ("crc32b %[out], %[in]"
209+
: [in] "+r" (crc0),
210+
: [out] "rm" (next[0]),
211+
);
212+
next = next[1..];
213+
}
214+
215+
return @truncate(~crc0);
216+
}
217+
218+
// Wrapper around the accelerated implementation to match the one in impl.zig.
219+
pub const Wrapper = struct {
220+
crc: u32,
221+
222+
pub fn init() Wrapper {
223+
return .{ .crc = 0xffffffff };
224+
}
225+
226+
pub fn update(w: *Wrapper, bytes: []const u8) void {
227+
w.crc = crc32(w.crc, bytes);
228+
}
229+
230+
pub fn final(w: Wrapper) u32 {
231+
return w.crc;
232+
}
233+
234+
pub fn hash(bytes: []const u8) u32 {
235+
var c = init();
236+
c.update(bytes);
237+
return c.final();
238+
}
239+
};

lib/std/hash/crc/impl.zig

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,7 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type {
2323
const I = if (@bitSizeOf(W) < 8) u8 else W;
2424
const lookup_table = blk: {
2525
@setEvalBranchQuota(2500);
26-
27-
const poly = if (algorithm.reflect_input)
28-
@bitReverse(@as(I, algorithm.polynomial)) >> (@bitSizeOf(I) - @bitSizeOf(W))
29-
else
30-
@as(I, algorithm.polynomial) << (@bitSizeOf(I) - @bitSizeOf(W));
31-
26+
const poly = reflect(algorithm.polynomial);
3227
var table: [256]I = undefined;
3328
for (&table, 0..) |*e, i| {
3429
var crc: I = i;
@@ -52,15 +47,13 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type {
5247
crc: I,
5348

5449
pub fn init() Self {
55-
const initial = if (algorithm.reflect_input)
56-
@bitReverse(@as(I, algorithm.initial)) >> (@bitSizeOf(I) - @bitSizeOf(W))
57-
else
58-
@as(I, algorithm.initial) << (@bitSizeOf(I) - @bitSizeOf(W));
59-
return Self{ .crc = initial };
50+
const initial = reflect(algorithm.initial);
51+
return .{ .crc = initial };
6052
}
6153

6254
inline fn tableEntry(index: I) I {
63-
return lookup_table[@as(u8, @intCast(index & 0xFF))];
55+
const short: u8 = @truncate(index);
56+
return lookup_table[short];
6457
}
6558

6659
pub fn update(self: *Self, bytes: []const u8) void {
@@ -90,14 +83,22 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type {
9083
if (!algorithm.reflect_output) {
9184
c >>= @bitSizeOf(I) - @bitSizeOf(W);
9285
}
93-
return @as(W, @intCast(c ^ algorithm.xor_output));
86+
return @intCast(c ^ algorithm.xor_output);
9487
}
9588

9689
pub fn hash(bytes: []const u8) W {
9790
var c = Self.init();
9891
c.update(bytes);
9992
return c.final();
10093
}
94+
95+
fn reflect(x: I) I {
96+
const offset = @bitSizeOf(I) - @bitSizeOf(W);
97+
if (algorithm.reflect_input)
98+
return @bitReverse(x) >> offset
99+
else
100+
return x << offset;
101+
}
101102
};
102103
}
103104

lib/std/hash/crc/test.zig

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,17 @@ test "crc32 koopman regression" {
2626
try testing.expectEqual(crc32.hash("abc"), 0xba2322ac);
2727
}
2828

29+
test "CRC-32/ISCSI" {
30+
const Crc32Iscsi = crc.Crc32Iscsi;
31+
32+
try testing.expectEqual(@as(u32, 0xe3069283), Crc32Iscsi.hash("123456789"));
33+
34+
var c = Crc32Iscsi.init();
35+
c.update("1234");
36+
c.update("56789");
37+
try testing.expectEqual(@as(u32, 0xe3069283), c.final());
38+
}
39+
2940
test "CRC-3/GSM" {
3041
const Crc3Gsm = crc.Crc3Gsm;
3142

@@ -1104,17 +1115,6 @@ test "CRC-32/CKSUM" {
11041115
try testing.expectEqual(@as(u32, 0x765e7680), c.final());
11051116
}
11061117

1107-
test "CRC-32/ISCSI" {
1108-
const Crc32Iscsi = crc.Crc32Iscsi;
1109-
1110-
try testing.expectEqual(@as(u32, 0xe3069283), Crc32Iscsi.hash("123456789"));
1111-
1112-
var c = Crc32Iscsi.init();
1113-
c.update("1234");
1114-
c.update("56789");
1115-
try testing.expectEqual(@as(u32, 0xe3069283), c.final());
1116-
}
1117-
11181118
test "CRC-32/ISO-HDLC" {
11191119
const Crc32IsoHdlc = crc.Crc32IsoHdlc;
11201120

tools/crc/catalog.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,8 @@ width=32 poly=0xa833982b init=0xffffffff refin=true refout=true xorout=0xff
9797
width=32 poly=0x04c11db7 init=0xffffffff refin=false refout=false xorout=0xffffffff check=0xfc891918 residue=0xc704dd7b name="CRC-32/BZIP2"
9898
width=32 poly=0x8001801b init=0x00000000 refin=true refout=true xorout=0x00000000 check=0x6ec2edc4 residue=0x00000000 name="CRC-32/CD-ROM-EDC"
9999
width=32 poly=0x04c11db7 init=0x00000000 refin=false refout=false xorout=0xffffffff check=0x765e7680 residue=0xc704dd7b name="CRC-32/CKSUM"
100-
width=32 poly=0x1edc6f41 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xe3069283 residue=0xb798b438 name="CRC-32/ISCSI"
100+
# CRC-32C implementation is defined manually, since it has an accelerated variant.
101+
# width=32 poly=0x1edc6f41 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xe3069283 residue=0xb798b438 name="CRC-32/ISCSI"
101102
width=32 poly=0x04c11db7 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xcbf43926 residue=0xdebb20e3 name="CRC-32/ISO-HDLC"
102103
width=32 poly=0x04c11db7 init=0xffffffff refin=true refout=true xorout=0x00000000 check=0x340bc6d9 residue=0x00000000 name="CRC-32/JAMCRC"
103104
width=32 poly=0x741b8cd7 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0x2d3dd0ae residue=0x00000000 name="CRC-32/KOOPMAN"

0 commit comments

Comments
 (0)