diff --git a/lib/std/hash/crc.zig b/lib/std/hash/crc.zig index c0a418a0c2cd..148919d820a2 100644 --- a/lib/std/hash/crc.zig +++ b/lib/std/hash/crc.zig @@ -1,5 +1,6 @@ //! This file is auto-generated by tools/update_crc_catalog.zig. +const builtin = @import("builtin"); const impl = @import("crc/impl.zig"); pub const Crc = impl.Crc; @@ -13,6 +14,18 @@ test { _ = @import("crc/test.zig"); } +const can_use_fast_crc32c = builtin.cpu.has(.x86, .crc32) and builtin.zig_backend == .stage2_llvm; +pub const Crc32Iscsi = switch (can_use_fast_crc32c) { + true => @import("crc/crc32c.zig").Wrapper, + else => Crc(u32, .{ + .polynomial = 0x1edc6f41, + .initial = 0xffffffff, + .reflect_input = true, + .reflect_output = true, + .xor_output = 0xffffffff, + }), +}; + pub const Crc3Gsm = Crc(u3, .{ .polynomial = 0x3, .initial = 0x0, @@ -797,14 +810,6 @@ pub const Crc32Cksum = Crc(u32, .{ .xor_output = 0xffffffff, }); -pub const Crc32Iscsi = Crc(u32, .{ - .polynomial = 0x1edc6f41, - .initial = 0xffffffff, - .reflect_input = true, - .reflect_output = true, - .xor_output = 0xffffffff, -}); - pub const Crc32IsoHdlc = Crc(u32, .{ .polynomial = 0x04c11db7, .initial = 0xffffffff, diff --git a/lib/std/hash/crc/crc32c.zig b/lib/std/hash/crc/crc32c.zig new file mode 100644 index 000000000000..87943d81848d --- /dev/null +++ b/lib/std/hash/crc/crc32c.zig @@ -0,0 +1,239 @@ +//! Implements CRC-32C (Castagnoli) using the SSE4.2 Intel CRC32 instruction. NOTE(review): the 64-bit `crc32q` form is used, so this presumably needs an x86_64 target; confirm the `.x86` family gate in crc.zig excludes 32-bit x86. +//! +//! A couple of useful links for understanding the approach taken here: +//! - https://github.com/madler/brotli/blob/1d428d3a9baade233ebc3ac108293256bcb813d1/crc32c.c +//! - https://github.com/madler/zlib/blob/5a82f71ed1dfc0bec044d9702463dbdf84ea3b71/crc32.c +//! - http://www.ross.net/crc/download/crc_v3.txt + +// Bit-reflected form of the CRC-32C polynomial 0x1edc6f41. 
+const POLY = 0x82f63b78; + +const LONG = 8192; +const SHORT = 256; +const long_lookup_table = genTable(LONG); +const short_lookup_table = genTable(SHORT); + +/// Generates the lookup tables for efficiently combining CRCs over a block of `length` bytes. +/// This works by building an operator that advances the CRC state as if `length` zero-bytes were appended. +/// We pre-compute 4 tables of 256 entries each (one per byte of the 32-bit CRC state). +/// +/// +/// The idea behind this table is quite interesting. The CRC state is equivalent to the +/// remainder of dividing the message polynomial (over GF(2)) by the CRC polynomial. +/// +/// Advancing the CRC register by `k` zero bits is equivalent to multiplying the current +/// CRC state by `x^k` modulo the CRC polynomial. This operation can be represented +/// as a linear transformation in GF(2), i.e., a matrix. +/// +/// We build up this matrix via repeated squaring: +/// - odd represents the operator for 1 zero bit (i.e., multiplication by `x^1 mod POLY`) +/// - even represents the operator for 2 zero bits (`x^2 mod POLY`) +/// - squaring again gives `x^4 mod POLY`, and so on until we get to the right size. +/// +/// We square once per bit of `len` while shifting it right, which yields `x^(8 * length) mod POLY` only when `length` is a power of two. +fn genTable(length: usize) [4][256]u32 { + @setEvalBranchQuota(250000); + + var even: [32]u32 = undefined; + zeroes: { + var odd: [32]u32 = undefined; + + // Initialize our `odd` array with the operator for a single zero bit: + // - odd[0] is the polynomial itself (applied when bit 0, the bit shifted out, is set). + // - odd[n] for n in 1..32 maps bit n to bit n - 1, i.e. a right shift by one. + odd[0] = POLY; + var row: u32 = 1; + for (1..32) |n| { + odd[n] = row; + row <<= 1; + } + + // even = odd squared: even represents `x^2 mod POLY`. + square(&even, &odd); + // odd = even squared: odd now represents `x^4 mod POLY`. 
+ square(&odd, &even); + + // Continue squaring to double the number of zeroes encoded each time: + // + // At each point in the process: + // - square(even, odd): even gets the operator for twice the current length. + // - square(odd, even): odd gets the operator for twice that again. + var len = length; + while (true) { + square(&even, &odd); + len >>= 1; + if (len == 0) break :zeroes; // result is already in `even`, so skip the copy below + square(&odd, &even); + len >>= 1; + if (len == 0) break; // result is in `odd` + } + + @memcpy(&even, &odd); // the answer ended up in `odd`; move it to `even` + } + + var zeroes: [4][256]u32 = undefined; + for (0..256) |n| { + zeroes[0][n] = times(&even, n); + zeroes[1][n] = times(&even, n << 8); + zeroes[2][n] = times(&even, n << 16); + zeroes[3][n] = times(&even, n << 24); + } + return zeroes; +} + +/// Computes `mat * vec` over `GF(2)`, where `mat` is a 32x32 binary matrix and `vec` +/// is a 32-bit vector. This somewhat "simulates" how bits propagate through the CRC register +/// during shifting. +/// +/// - In GF(2) (aka a field where the only values are 0 and 1, aka binary), multiplication is +/// an `AND`, and addition is `XOR`. +/// - This dot product determines how each bit in the input vector "contributes" to +/// the final CRC state, by XORing (adding) `mat[i]` for every bit `i` that is set in `vec`. +fn times(mat: *const [32]u32, vec: u32) u32 { + var sum: u32 = 0; + var v = vec; + var i: u32 = 0; + while (v != 0) { + if (v & 1 != 0) sum ^= mat[i]; + v >>= 1; + i += 1; + } + return sum; +} + +/// Computes the square of a matrix in GF(2), i.e. `dst = src x src`; the previous contents of `dst` are not read. +/// +/// This produces the operator for doubling the number of zeroes: +/// if `src` represents advancing the CRC by `k` zeroes, then `dst` will +/// represent advancing by 2k zeroes. +/// +/// Since polynomial multiplication mod POLY is linear, `mat(mat(x)) = mat^2(x)` +/// gives the effect of two sequential applications of the operator. 
+fn square(dst: *[32]u32, src: *const [32]u32) void { + for (dst, src) |*d, s| { + d.* = times(src, s); + } +} + +fn shift(table: *const [4][256]u32, crc: u32) u32 { + return table[0][crc & 0xFF] ^ table[1][(crc >> 8) & 0xFF] ^ table[2][(crc >> 16) & 0xFF] ^ table[3][crc >> 24]; +} + +fn crc32(crc: u32, input: []const u8) u32 { + var crc0: u64 = ~crc; + + // Compute the CRC for up to seven leading bytes to bring the + // `next` pointer to an eight-byte boundary. In the asm below, `in` names the read-write CRC accumulator and `out` names the data operand that is read. + var next = input; + while (next.len > 0 and @intFromPtr(next.ptr) & 7 != 0) { + asm volatile ("crc32b %[out], %[in]" + : [in] "+r" (crc0), + : [out] "rm" (next[0]), + ); + next = next[1..]; + } + + // Compute the CRC on sets of LONG * 3 bytes, executing three independent + // CRC instructions, each on LONG bytes. This is an optimization for + // targets where the CRC instruction has a throughput of one CRC per + // cycle, but a latency of three cycles. + while (next.len >= LONG * 3) { + var crc1: u64 = 0; + var crc2: u64 = 0; + + const start = next.len; + while (true) { + // Safe @alignCast(), since we've aligned the pointer to 8 bytes before this loop. + const long: [*]const u64 = @alignCast(@ptrCast(next)); + asm volatile ( + \\crc32q %[out0], %[in0] + \\crc32q %[out1], %[in1] + \\crc32q %[out2], %[in2] + : [in0] "+r" (crc0), + [in1] "+r" (crc1), + [in2] "+r" (crc2), + : [out0] "rm" (long[0 * LONG / 8]), + [out1] "rm" (long[1 * LONG / 8]), + [out2] "rm" (long[2 * LONG / 8]), + ); + next = next[8..]; + if (next.len <= start - LONG) break; + } + + crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc1; // advance crc0 over the LONG bytes hashed into crc1, then fold crc1 in + crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc2; // likewise for the LONG bytes hashed into crc2 + next = next[LONG * 2 ..]; // skip the two lanes that were read through the LONG / 8 and 2 * LONG / 8 offsets + } + + // Same thing as above, but for smaller chunks of SHORT bytes. 
+ while (next.len >= SHORT * 3) { + var crc1: u64 = 0; + var crc2: u64 = 0; + + const start = next.len; + while (true) { + const long: [*]const u64 = @alignCast(@ptrCast(next)); + asm volatile ( + \\crc32q %[out0], %[in0] + \\crc32q %[out1], %[in1] + \\crc32q %[out2], %[in2] + : [in0] "+r" (crc0), + [in1] "+r" (crc1), + [in2] "+r" (crc2), + : [out0] "rm" (long[0 * SHORT / 8]), + [out1] "rm" (long[1 * SHORT / 8]), + [out2] "rm" (long[2 * SHORT / 8]), + ); + next = next[8..]; + if (next.len <= start - SHORT) break; + } + + crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc1; + crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc2; + next = next[SHORT * 2 ..]; + } + + // Compute via 8-byte chunks, until we're left with less than 8 bytes. + while (next.len >= 8) { + const long: [*]const u64 = @alignCast(@ptrCast(next)); + asm volatile ("crc32q %[out], %[in]" + : [in] "+r" (crc0), + : [out] "rm" (long[0]), + ); + next = next[8..]; + } + + // Finish the last bytes with single-byte instructions. + while (next.len > 0) { + asm volatile ("crc32b %[out], %[in]" + : [in] "+r" (crc0), + : [out] "rm" (next[0]), + ); + next = next[1..]; + } + + return @truncate(~crc0); +} + +// Wrapper around the accelerated implementation, exposing the same `init`/`update`/`final`/`hash` methods as `Crc` in impl.zig. 
+pub const Wrapper = struct { + crc: u32, + + pub fn init() Wrapper { + return .{ .crc = 0 }; + } + + pub fn update(w: *Wrapper, bytes: []const u8) void { + w.crc = crc32(w.crc, bytes); + } + + pub fn final(w: Wrapper) u32 { + return w.crc; + } + + pub fn hash(bytes: []const u8) u32 { + var c = init(); + c.update(bytes); + return c.final(); + } +}; diff --git a/lib/std/hash/crc/impl.zig b/lib/std/hash/crc/impl.zig index 253a7b0a6287..04894702f793 100644 --- a/lib/std/hash/crc/impl.zig +++ b/lib/std/hash/crc/impl.zig @@ -23,12 +23,7 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type { const I = if (@bitSizeOf(W) < 8) u8 else W; const lookup_table = blk: { @setEvalBranchQuota(2500); - - const poly = if (algorithm.reflect_input) - @bitReverse(@as(I, algorithm.polynomial)) >> (@bitSizeOf(I) - @bitSizeOf(W)) - else - @as(I, algorithm.polynomial) << (@bitSizeOf(I) - @bitSizeOf(W)); - + const poly = reflect(algorithm.polynomial); var table: [256]I = undefined; for (&table, 0..) 
|*e, i| { var crc: I = i; @@ -52,15 +47,13 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type { crc: I, pub fn init() Self { - const initial = if (algorithm.reflect_input) - @bitReverse(@as(I, algorithm.initial)) >> (@bitSizeOf(I) - @bitSizeOf(W)) - else - @as(I, algorithm.initial) << (@bitSizeOf(I) - @bitSizeOf(W)); - return Self{ .crc = initial }; + const initial = reflect(algorithm.initial); + return .{ .crc = initial }; } inline fn tableEntry(index: I) I { - return lookup_table[@as(u8, @intCast(index & 0xFF))]; + const short: u8 = @truncate(index); + return lookup_table[short]; } pub fn update(self: *Self, bytes: []const u8) void { @@ -90,7 +83,7 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type { if (!algorithm.reflect_output) { c >>= @bitSizeOf(I) - @bitSizeOf(W); } - return @as(W, @intCast(c ^ algorithm.xor_output)); + return @intCast(c ^ algorithm.xor_output); } pub fn hash(bytes: []const u8) W { @@ -98,6 +91,14 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type { c.update(bytes); return c.final(); } + + fn reflect(x: I) I { + const offset = @bitSizeOf(I) - @bitSizeOf(W); + if (algorithm.reflect_input) + return @bitReverse(x) >> offset + else + return x << offset; + } }; } diff --git a/lib/std/hash/crc/test.zig b/lib/std/hash/crc/test.zig index a6c2641853e0..cc2b2191a767 100644 --- a/lib/std/hash/crc/test.zig +++ b/lib/std/hash/crc/test.zig @@ -26,6 +26,17 @@ test "crc32 koopman regression" { try testing.expectEqual(crc32.hash("abc"), 0xba2322ac); } +test "CRC-32/ISCSI" { + const Crc32Iscsi = crc.Crc32Iscsi; + + try testing.expectEqual(@as(u32, 0xe3069283), Crc32Iscsi.hash("123456789")); + + var c = Crc32Iscsi.init(); + c.update("1234"); + c.update("56789"); + try testing.expectEqual(@as(u32, 0xe3069283), c.final()); +} + test "CRC-3/GSM" { const Crc3Gsm = crc.Crc3Gsm; @@ -1104,17 +1115,6 @@ test "CRC-32/CKSUM" { try testing.expectEqual(@as(u32, 0x765e7680), c.final()); } -test "CRC-32/ISCSI" { - 
const Crc32Iscsi = crc.Crc32Iscsi; - - try testing.expectEqual(@as(u32, 0xe3069283), Crc32Iscsi.hash("123456789")); - - var c = Crc32Iscsi.init(); - c.update("1234"); - c.update("56789"); - try testing.expectEqual(@as(u32, 0xe3069283), c.final()); -} - test "CRC-32/ISO-HDLC" { const Crc32IsoHdlc = crc.Crc32IsoHdlc; diff --git a/tools/crc/catalog.txt b/tools/crc/catalog.txt index 4051f4b70b1a..d4244c6b7001 100644 --- a/tools/crc/catalog.txt +++ b/tools/crc/catalog.txt @@ -97,7 +97,8 @@ width=32 poly=0xa833982b init=0xffffffff refin=true refout=true xorout=0xff width=32 poly=0x04c11db7 init=0xffffffff refin=false refout=false xorout=0xffffffff check=0xfc891918 residue=0xc704dd7b name="CRC-32/BZIP2" width=32 poly=0x8001801b init=0x00000000 refin=true refout=true xorout=0x00000000 check=0x6ec2edc4 residue=0x00000000 name="CRC-32/CD-ROM-EDC" width=32 poly=0x04c11db7 init=0x00000000 refin=false refout=false xorout=0xffffffff check=0x765e7680 residue=0xc704dd7b name="CRC-32/CKSUM" -width=32 poly=0x1edc6f41 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xe3069283 residue=0xb798b438 name="CRC-32/ISCSI" +# CRC-32C implementation is defined manually, since it has an accelerated variant. 
+# width=32 poly=0x1edc6f41 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xe3069283 residue=0xb798b438 name="CRC-32/ISCSI" width=32 poly=0x04c11db7 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xcbf43926 residue=0xdebb20e3 name="CRC-32/ISO-HDLC" width=32 poly=0x04c11db7 init=0xffffffff refin=true refout=true xorout=0x00000000 check=0x340bc6d9 residue=0x00000000 name="CRC-32/JAMCRC" width=32 poly=0x741b8cd7 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0x2d3dd0ae residue=0x00000000 name="CRC-32/KOOPMAN" diff --git a/tools/update_crc_catalog.zig b/tools/update_crc_catalog.zig index 5ccac1511241..b76f1afedecf 100644 --- a/tools/update_crc_catalog.zig +++ b/tools/update_crc_catalog.zig @@ -41,6 +41,7 @@ pub fn main() anyerror!void { try code_writer.writeAll( \\//! This file is auto-generated by tools/update_crc_catalog.zig. \\ + \\const builtin = @import("builtin"); \\const impl = @import("crc/impl.zig"); \\ \\pub const Crc = impl.Crc; @@ -54,6 +55,18 @@ pub fn main() anyerror!void { \\ _ = @import("crc/test.zig"); \\} \\ + \\const can_use_fast_crc32c = builtin.cpu.has(.x86, .crc32) and builtin.zig_backend == .stage2_llvm; + \\pub const Crc32Iscsi = switch (can_use_fast_crc32c) { + \\ true => @import("crc/crc32c.zig").Wrapper, + \\ else => Crc(u32, .{ + \\ .polynomial = 0x1edc6f41, + \\ .initial = 0xffffffff, + \\ .reflect_input = true, + \\ .reflect_output = true, + \\ .xor_output = 0xffffffff, + \\ }), + \\}; + \\ ); var zig_test_file = try crc_target_dir.createFile("test.zig", .{}); @@ -86,12 +99,23 @@ pub fn main() anyerror!void { \\} \\ \\test "crc32 koopman regression" { - \\ const crc32 = crc.Koopman; + \\ const crc32 = crc.Crc32Koopman; \\ try testing.expectEqual(crc32.hash(""), 0x00000000); \\ try testing.expectEqual(crc32.hash("a"), 0x0da2aa8a); \\ try testing.expectEqual(crc32.hash("abc"), 0xba2322ac); \\} \\ + \\test "CRC-32/ISCSI" { + \\ const Crc32Iscsi = crc.Crc32Iscsi; + \\ + \\ try 
testing.expectEqual(@as(u32, 0xe3069283), Crc32Iscsi.hash("123456789")); + \\ + \\ var c = Crc32Iscsi.init(); + \\ c.update("1234"); + \\ c.update("56789"); + \\ try testing.expectEqual(@as(u32, 0xe3069283), c.final()); + \\} + \\ ); var stream = std.io.fixedBufferStream(catalog_txt);