//! Implements CRC-32C (Castagnoli) using the SSE4.2 Intel CRC32 instruction.
//!
//! A couple of useful links for understanding the approach taken here:
//! - https://github.com/madler/brotli/blob/1d428d3a9baade233ebc3ac108293256bcb813d1/crc32c.c
//! - https://github.com/madler/zlib/blob/5a82f71ed1dfc0bec044d9702463dbdf84ea3b71/crc32.c
//! - http://www.ross.net/crc/download/crc_v3.txt

// The CRC-32C polynomial in reflected (bit-reversed) form.
const POLY: u32 = 0x82f63b78;

const LONG = 8192;
const SHORT = 256;
const long_lookup_table = genTable(LONG);
const short_lookup_table = genTable(SHORT);

/// Generates the lookup table for efficiently advancing a CRC over a block of `length` bytes.
/// This works by building an operator that advances the CRC state as if `length` zero bytes
/// had been appended, then tabulating it. We pre-compute 4 tables of 256 entries each, one
/// per byte of the 32-bit CRC state. Note that the construction below assumes `length` is a
/// power of two (both LONG and SHORT are).
///
/// The idea behind this table is quite interesting. The CRC state is equivalent to the
/// remainder of dividing the message polynomial (over GF(2)) by the CRC polynomial.
///
/// Advancing the CRC register by `k` zero bits is equivalent to multiplying the current
/// CRC state by `x^k` modulo the CRC polynomial. This operation can be represented
/// as a linear transformation in GF(2), i.e. a matrix.
///
/// We build up this matrix via repeated squaring:
/// - `odd` starts as the operator for 1 zero bit (i.e. multiplication by `x^1 mod POLY`),
/// - `even` = `odd` squared is the operator for 2 zero bits (`x^2 mod POLY`),
/// - squaring again gives `x^4 mod POLY`, and so on.
///
/// By repeatedly squaring while halving `length`, we end up with the operator for
/// `x^(8 * length) mod POLY`, i.e. for appending `length` zero bytes.
fn genTable(length: usize) [4][256]u32 {
    @setEvalBranchQuota(250000);

    var even: [32]u32 = undefined;
    zeroes: {
        var odd: [32]u32 = undefined;

        // Initialize `odd` with the operator for a single zero bit:
        // - odd[0] = POLY: when the low bit shifts out of the (reflected)
        //   register, the polynomial is fed back in.
        // - odd[n] = 1 << (n - 1) for n in 1..32: every other bit simply
        //   shifts down one position.
        odd[0] = POLY;
        var row: u32 = 1;
        for (1..32) |n| {
            odd[n] = row;
            row <<= 1;
        }

        // even = odd squared: even represents `x^2 mod POLY`.
        square(&even, &odd);
        // odd = even squared: odd now represents `x^4 mod POLY`.
        square(&odd, &even);

        // Keep squaring, doubling the number of zero bits covered each time.
        // The first squaring below yields `x^8 mod POLY`, i.e. one zero byte,
        // and `len` is halved once per squaring, so when it reaches zero the
        // last-written matrix advances the CRC by `length` zero bytes. The
        // result lands in `even` if we exit via `break :zeroes`, otherwise
        // in `odd`, which we then copy into `even`.
        var len = length;
        while (true) {
            square(&even, &odd);
            len >>= 1;
            if (len == 0) break :zeroes;
            square(&odd, &even);
            len >>= 1;
            if (len == 0) break;
        }

        @memcpy(&even, &odd);
    }

    // Tabulate the operator: entry `zeroes[k][n]` is the operator applied to
    // the 32-bit value `n << (8 * k)`, so the four tables together cover each
    // byte of the CRC state.
    var zeroes: [4][256]u32 = undefined;
    for (0..256) |n| {
        const v: u32 = @intCast(n);
        zeroes[0][n] = times(&even, v);
        zeroes[1][n] = times(&even, v << 8);
        zeroes[2][n] = times(&even, v << 16);
        zeroes[3][n] = times(&even, v << 24);
    }
    return zeroes;
}

/// Computes `mat * vec` over GF(2), where `mat` is a 32x32 binary matrix and `vec`
/// is a 32-bit vector. This simulates how bits propagate through the CRC register
/// during shifting.
///
/// - In GF(2) (the field whose only values are 0 and 1), multiplication is
///   `AND` and addition is `XOR`.
/// - The dot product determines how each bit of the input vector contributes to
///   the final CRC state, by XORing together the rows of the matrix where `vec` has 1s.
fn times(mat: *const [32]u32, vec: u32) u32 {
    var sum: u32 = 0;
    var v = vec;
    var i: u32 = 0;
    while (v != 0) {
        if (v & 1 != 0) sum ^= mat[i];
        v >>= 1;
        i += 1;
    }
    return sum;
}
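
// A minimal sanity check of `times`: with the identity operator, where row `n`
// is just bit `n`, the product should return the input vector unchanged, since
// `times` XORs together exactly the rows selected by the set bits of `vec`.
test "times by the identity matrix is the identity" {
    const std = @import("std");
    var ident: [32]u32 = undefined;
    for (&ident, 0..) |*mat_row, n| {
        mat_row.* = @as(u32, 1) << @intCast(n);
    }
    try std.testing.expectEqual(@as(u32, 0xdeadbeef), times(&ident, 0xdeadbeef));
}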

/// Computes the square of a matrix over GF(2), i.e. `dst = src x src`.
///
/// This produces the operator for doubling the number of zeroes:
/// if `src` represents advancing the CRC by `k` zeroes, then `dst` will
/// represent advancing by `2k` zeroes.
///
/// Since polynomial multiplication mod POLY is linear, `mat(mat(x)) = mat^2(x)`
/// gives the effect of two sequential applications of the operator.
fn square(dst: *[32]u32, src: *const [32]u32) void {
    for (dst, src) |*d, m| {
        d.* = times(src, m);
    }
}
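
// A small property check for `square`: if `op` advances the CRC by one zero
// bit, then its square should behave exactly like applying `op` twice, i.e.
// `times(sq, v) == times(op, times(op, v))` for any vector `v`.
test "square composes an operator with itself" {
    const std = @import("std");
    var op: [32]u32 = undefined;
    op[0] = POLY;
    var row: u32 = 1;
    for (1..32) |n| {
        op[n] = row;
        row <<= 1;
    }
    var sq: [32]u32 = undefined;
    square(&sq, &op);
    const v: u32 = 0xcafef00d;
    try std.testing.expectEqual(times(&op, times(&op, v)), times(&sq, v));
}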

/// Applies the zero-block operator encoded in `table` to `crc`, looking up
/// each of the four bytes of the CRC in its own sub-table.
fn shift(table: *const [4][256]u32, crc: u32) u32 {
    return table[0][crc & 0xFF] ^
        table[1][(crc >> 8) & 0xFF] ^
        table[2][(crc >> 16) & 0xFF] ^
        table[3][crc >> 24];
}
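
// A sketch tying the tables back to the CRC definition: advancing a CRC state
// through SHORT zero bytes via `shift` should match feeding SHORT literal zero
// bytes through a plain bitwise reflected CRC update (`bitwise` below is a
// throwaway reference written just for this comparison).
test "zero-block operator matches a bitwise CRC over zero bytes" {
    const std = @import("std");
    const bitwise = struct {
        fn update(crc: u32, input: []const u8) u32 {
            var c = crc;
            for (input) |byte| {
                c ^= byte;
                for (0..8) |_| {
                    c = if (c & 1 != 0) (c >> 1) ^ POLY else c >> 1;
                }
            }
            return c;
        }
    }.update;
    const zeros = [_]u8{0} ** SHORT;
    const start: u32 = 0x12345678;
    try std.testing.expectEqual(bitwise(start, &zeros), shift(&short_lookup_table, start));
}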

fn crc32(crc: u32, input: []const u8) u32 {
    // Pre-invert, per the CRC-32C convention; the state is inverted again on
    // the way out, so a previously returned CRC can be fed straight back in.
    var crc0: u64 = crc ^ 0xffffffff;

    // Compute the CRC for up to seven leading bytes to bring the
    // `next` pointer to an eight-byte boundary.
    var next = input;
    while (next.len > 0 and @intFromPtr(next.ptr) & 7 != 0) {
        asm volatile ("crc32b %[byte], %[crc]"
            : [crc] "+r" (crc0),
            : [byte] "rm" (next[0]),
        );
        next = next[1..];
    }

    // Compute the CRC on sets of LONG * 3 bytes, executing three independent
    // CRC instructions, each on LONG bytes. This is an optimization for
    // targets where the CRC instruction has a throughput of one CRC per
    // cycle, but a latency of three cycles.
    while (next.len >= LONG * 3) {
        var crc1: u64 = 0;
        var crc2: u64 = 0;

        const start = next.len;
        while (true) {
            // Safe @alignCast(), since the pointer was aligned to 8 bytes before this loop.
            const words: [*]const u64 = @alignCast(@ptrCast(next.ptr));
            asm volatile (
                \\crc32q %[w0], %[c0]
                \\crc32q %[w1], %[c1]
                \\crc32q %[w2], %[c2]
                : [c0] "+r" (crc0),
                  [c1] "+r" (crc1),
                  [c2] "+r" (crc2),
                : [w0] "rm" (words[0 * LONG / 8]),
                  [w1] "rm" (words[1 * LONG / 8]),
                  [w2] "rm" (words[2 * LONG / 8]),
            );
            next = next[8..];
            if (next.len <= start - LONG) break;
        }

        // Fold the three partial CRCs together: advance crc0 past the LONG
        // bytes covered by each of the other two streams, then skip them.
        crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc1;
        crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc2;
        next = next[LONG * 2 ..];
    }

    // Same thing as above, but for smaller chunks of SHORT bytes.
    while (next.len >= SHORT * 3) {
        var crc1: u64 = 0;
        var crc2: u64 = 0;

        const start = next.len;
        while (true) {
            const words: [*]const u64 = @alignCast(@ptrCast(next.ptr));
            asm volatile (
                \\crc32q %[w0], %[c0]
                \\crc32q %[w1], %[c1]
                \\crc32q %[w2], %[c2]
                : [c0] "+r" (crc0),
                  [c1] "+r" (crc1),
                  [c2] "+r" (crc2),
                : [w0] "rm" (words[0 * SHORT / 8]),
                  [w1] "rm" (words[1 * SHORT / 8]),
                  [w2] "rm" (words[2 * SHORT / 8]),
            );
            next = next[8..];
            if (next.len <= start - SHORT) break;
        }

        crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc1;
        crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc2;
        next = next[SHORT * 2 ..];
    }

    // Compute via 8-byte chunks, until we're left with fewer than 8 bytes.
    while (next.len >= 8) {
        const words: [*]const u64 = @alignCast(@ptrCast(next.ptr));
        asm volatile ("crc32q %[word], %[crc]"
            : [crc] "+r" (crc0),
            : [word] "rm" (words[0]),
        );
        next = next[8..];
    }

    // Finish the last bytes with single-byte instructions.
    while (next.len > 0) {
        asm volatile ("crc32b %[byte], %[crc]"
            : [crc] "+r" (crc0),
            : [byte] "rm" (next[0]),
        );
        next = next[1..];
    }

    return @truncate(~crc0);
}
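
// A quick check of the inversion convention: with no input bytes the entry and
// exit XORs cancel, so `crc32` should return its argument unchanged. (Like the
// rest of this file, this assumes an x86-64 build target; the asm simply never
// executes here.)
test "crc32 of empty input is the identity" {
    const std = @import("std");
    try std.testing.expectEqual(@as(u32, 0x12345678), crc32(0x12345678, ""));
}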

// Wrapper around the accelerated implementation to match the one in impl.zig.
pub const Wrapper = struct {
    crc: u32,

    pub fn init() Wrapper {
        // `crc32` pre- and post-inverts internally, so the running value
        // starts at zero and streaming updates compose correctly.
        return .{ .crc = 0 };
    }

    pub fn update(w: *Wrapper, bytes: []const u8) void {
        w.crc = crc32(w.crc, bytes);
    }

    pub fn final(w: Wrapper) u32 {
        return w.crc;
    }

    pub fn hash(bytes: []const u8) u32 {
        var c = init();
        c.update(bytes);
        return c.final();
    }
};
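
// Usage sketch and known-answer test, assuming hardware with SSE4.2:
// "123456789" is the conventional CRC check input and 0xe3069283 is the
// published CRC-32C check value for it. Splitting the input across two
// `update` calls should give the same result as the one-shot `hash`.
test "CRC-32C check value and streaming updates" {
    const std = @import("std");
    try std.testing.expectEqual(@as(u32, 0xe3069283), Wrapper.hash("123456789"));

    var w = Wrapper.init();
    w.update("1234");
    w.update("56789");
    try std.testing.expectEqual(@as(u32, 0xe3069283), w.final());
}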