Skip to content

hash: implement fast crc32c #24279

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 13 additions & 8 deletions lib/std/hash/crc.zig
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! This file is auto-generated by tools/update_crc_catalog.zig.

const builtin = @import("builtin");
const impl = @import("crc/impl.zig");

pub const Crc = impl.Crc;
Expand All @@ -13,6 +14,18 @@ test {
_ = @import("crc/test.zig");
}

/// Whether the hardware-accelerated CRC-32C implementation in `crc/crc32c.zig` can be used.
///
/// That implementation issues `crc32q` on 64-bit register operands, which only exist in
/// 64-bit mode. `builtin.cpu.has(.x86, ...)` matches the whole x86 family (32-bit included),
/// so the architecture is checked explicitly in addition to the `crc32` CPU feature.
/// The LLVM backend is required as well (presumably because the implementation relies on
/// inline assembly — confirm before relaxing this).
const can_use_fast_crc32c = builtin.cpu.arch == .x86_64 and
    builtin.cpu.has(.x86, .crc32) and
    builtin.zig_backend == .stage2_llvm;

/// CRC-32/ISCSI (CRC-32C, Castagnoli).
///
/// Resolves to the accelerated implementation when the target supports it and to the generic
/// table-driven `Crc` otherwise.
pub const Crc32Iscsi = if (can_use_fast_crc32c)
    @import("crc/crc32c.zig").Wrapper
else
    Crc(u32, .{
        .polynomial = 0x1edc6f41,
        .initial = 0xffffffff,
        .reflect_input = true,
        .reflect_output = true,
        .xor_output = 0xffffffff,
    });

pub const Crc3Gsm = Crc(u3, .{
.polynomial = 0x3,
.initial = 0x0,
Expand Down Expand Up @@ -797,14 +810,6 @@ pub const Crc32Cksum = Crc(u32, .{
.xor_output = 0xffffffff,
});

/// CRC-32/ISCSI, built on the generic table-driven `Crc` with polynomial 0x1edc6f41,
/// reflected input and output, and all-ones initial and final-XOR values.
pub const Crc32Iscsi = Crc(u32, .{
.polynomial = 0x1edc6f41,
.initial = 0xffffffff,
.reflect_input = true,
.reflect_output = true,
.xor_output = 0xffffffff,
});

pub const Crc32IsoHdlc = Crc(u32, .{
.polynomial = 0x04c11db7,
.initial = 0xffffffff,
Expand Down
239 changes: 239 additions & 0 deletions lib/std/hash/crc/crc32c.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
//! Implements CRC-32C (Castagnoli) using the SSE4.2 Intel CRC32 instruction.
//!
//! A couple of useful links for understanding the approach taken here:
//! - https://github.com/madler/brotli/blob/1d428d3a9baade233ebc3ac108293256bcb813d1/crc32c.c
//! - https://github.com/madler/zlib/blob/5a82f71ed1dfc0bec044d9702463dbdf84ea3b71/crc32.c
//! - http://www.ross.net/crc/download/crc_v3.txt

// Reflected (bit-reversed) CRC-32C polynomial; seeds the one-zero-bit operator in `genTable`.
const POLY = 0x82f63b78;

// Block sizes, in bytes, used by `crc32`: it processes `3 * LONG` (then `3 * SHORT`) bytes at a
// time as three independent blocks of this size.
const LONG = 8192;
const SHORT = 256;
// Tables that advance a CRC state past `LONG` / `SHORT` zero bytes, used to combine the three
// per-block CRCs. These are container-level constants, so `genTable` runs at compile time.
const long_lookup_table = genTable(LONG);
const short_lookup_table = genTable(SHORT);

/// Generates the lookup tables that `shift` uses to advance a CRC state as if `length` zero
/// bytes had been appended to the message.
///
/// Only the position of the highest set bit of `length` influences the result (the loop below
/// halves `length` until it reaches zero), so `length` should be a power of two.
///
/// Background: the CRC state is equivalent to the remainder of dividing the message polynomial
/// (over GF(2)) by the CRC polynomial. Advancing the CRC register by `k` zero bits is equivalent
/// to multiplying the current state by `x^k` modulo the CRC polynomial. That operation is linear
/// over GF(2), so it can be represented as a 32x32 bit matrix (one `u32` row per state bit,
/// applied with `times`).
///
/// The matrix is built via repeated squaring:
/// - `odd` starts as the operator for 1 zero bit (multiplication by `x^1 mod POLY`),
/// - `even = odd^2` is the operator for 2 zero bits (`x^2 mod POLY`),
/// - squaring again gives `x^4 mod POLY`, then `x^8 mod POLY` (one zero byte), and so on,
///   doubling the number of zeroes each time until `length` bytes are covered.
///
/// The returned value holds 4 tables of 256 entries each, one per byte of the 32-bit state:
/// entry `[k][b]` is the operator applied to `b << (8 * k)`. XORing the four lookups for the
/// four bytes of a state applies the operator to the whole state.
fn genTable(length: usize) [4][256]u32 {
// The matrix squarings and the 1024 table entries below need far more backward branches
// than the default quota allows when this is evaluated at compile time.
@setEvalBranchQuota(250000);

var even: [32]u32 = undefined;
zeroes: {
var odd: [32]u32 = undefined;

// Initialize `odd` with the operator for a single zero bit, i.e. one step of the
// right-shifting (reflected) CRC register:
// - odd[0] = POLY: state bit 0 is shifted out and XORs the polynomial in.
// - odd[n] = 1 << (n - 1) for n in 1..32: state bit n moves down to bit n - 1.
odd[0] = POLY;
var row: u32 = 1;
for (1..32) |n| {
odd[n] = row;
row <<= 1;
}

// even = odd squared: even represents `x^2 mod POLY`.
square(&even, &odd);
// odd = even squared: odd now represents `x^4 mod POLY`.
square(&odd, &even);

// Continue squaring to double the number of zeroes encoded each time, alternating the
// destination between `even` and `odd`. The first squaring below yields the operator for
// 8 zero bits (one zero byte) in `even`; each halving of `len` accounts for one doubling.
//
// - If `len` reaches zero right after writing `even`, the answer is already in `even`
//   and the labeled block is left directly.
// - If it reaches zero right after writing `odd`, the loop is left normally and the
//   answer is copied from `odd` into `even` below.
var len = length;
while (true) {
square(&even, &odd);
len >>= 1;
if (len == 0) break :zeroes;
square(&odd, &even);
len >>= 1;
if (len == 0) break;
}

@memcpy(&even, &odd);
}

// Expand the 32x32 operator into byte-indexed tables: one table per byte lane of the state.
var zeroes: [4][256]u32 = undefined;
for (0..256) |n| {
zeroes[0][n] = times(&even, n);
zeroes[1][n] = times(&even, n << 8);
zeroes[2][n] = times(&even, n << 16);
zeroes[3][n] = times(&even, n << 24);
}
return zeroes;
}

/// Multiplies the 32x32 GF(2) matrix `mat` by the 32-bit vector `vec`.
///
/// Row `i` of `mat` is the image of bit `i` of `vec`. In GF(2) multiplication is `AND` and
/// addition is `XOR`, so the product is simply the XOR of every row whose corresponding bit
/// in `vec` is set.
fn times(mat: *const [32]u32, vec: u32) u32 {
    var product: u32 = 0;
    var pending = vec;
    // Visit only the set bits, lowest first; `pending & (pending - 1)` clears the lowest one.
    while (pending != 0) : (pending &= pending - 1) {
        product ^= mat[@ctz(pending)];
    }
    return product;
}

/// Squares a 32x32 matrix over GF(2): `dst = src x src`.
///
/// If `src` is the operator that advances the CRC by `k` zeroes, `dst` becomes the operator
/// that advances it by `2k` zeroes, because applying a linear operator twice is the same as
/// applying its square once.
fn square(dst: *[32]u32, src: *const [32]u32) void {
    var row: usize = 0;
    while (row < dst.len) : (row += 1) {
        // Row `row` of the product is `src` applied to row `row` of `src`.
        dst[row] = times(src, src[row]);
    }
}

/// Applies the operator encoded in `table` to `crc`.
///
/// Each of the four bytes of `crc` indexes its own 256-entry table; the four results are XORed
/// together.
fn shift(table: *const [4][256]u32, crc: u32) u32 {
    const byte0: u8 = @truncate(crc);
    const byte1: u8 = @truncate(crc >> 8);
    const byte2: u8 = @truncate(crc >> 16);
    const byte3: u8 = @truncate(crc >> 24);
    return table[0][byte0] ^ table[1][byte1] ^ table[2][byte2] ^ table[3][byte3];
}

/// Updates the CRC-32C value `crc` with the bytes of `input` using the x86 `crc32b` / `crc32q`
/// instructions and returns the updated value.
///
/// `crc` is inverted on entry and the result is inverted again on exit, so callers pass and
/// receive the value in its final (post-XOR) form; `Wrapper` starts from 0.
///
/// Note on the inline assembly operand names: the operand called `in*` is the CRC accumulator
/// (read and written, `"+r"`), while `out*` is the data being consumed (`"rm"`).
fn crc32(crc: u32, input: []const u8) u32 {
var crc0: u64 = ~crc;

// Compute the CRC for up to seven leading bytes to bring the
// `next` pointer to an eight-byte boundary.
var next = input;
while (next.len > 0 and @intFromPtr(next.ptr) & 7 != 0) {
asm volatile ("crc32b %[out], %[in]"
: [in] "+r" (crc0),
: [out] "rm" (next[0]),
);
next = next[1..];
}

// Compute the CRC on sets of LONG * 3 bytes, executing three independent
// CRC instructions, each on LONG bytes. This is an optimization for
// targets where the CRC instruction has a throughput of one CRC per
// cycle, but a latency of three cycles.
while (next.len >= LONG * 3) {
// CRCs of the second and third block, each started from zero so that they can be
// XOR-combined with the shifted CRC of the preceding data below.
var crc1: u64 = 0;
var crc2: u64 = 0;

const start = next.len;
while (true) {
// Safe @alignCast(), since we've aligned the pointer to 8 bytes before this loop.
const long: [*]const u64 = @alignCast(@ptrCast(next));
asm volatile (
\\crc32q %[out0], %[in0]
\\crc32q %[out1], %[in1]
\\crc32q %[out2], %[in2]
: [in0] "+r" (crc0),
[in1] "+r" (crc1),
[in2] "+r" (crc2),
: [out0] "rm" (long[0 * LONG / 8]),
[out1] "rm" (long[1 * LONG / 8]),
[out2] "rm" (long[2 * LONG / 8]),
);
next = next[8..];
// Stop once the first block (LONG bytes) has been walked; the other two blocks were
// read through the offset loads above.
if (next.len <= start - LONG) break;
}

// Advance the running CRC past LONG zero bytes, then fold in the next block's CRC; twice.
crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc1;
crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc2;
// Skip the second and third blocks, which `next` has not walked over yet.
next = next[LONG * 2 ..];
}

// Same thing as above, but for smaller chunks of SHORT bytes.
while (next.len >= SHORT * 3) {
var crc1: u64 = 0;
var crc2: u64 = 0;

const start = next.len;
while (true) {
// Still 8-byte aligned: every step since the alignment loop consumed a multiple of 8 bytes.
const long: [*]const u64 = @alignCast(@ptrCast(next));
asm volatile (
\\crc32q %[out0], %[in0]
\\crc32q %[out1], %[in1]
\\crc32q %[out2], %[in2]
: [in0] "+r" (crc0),
[in1] "+r" (crc1),
[in2] "+r" (crc2),
: [out0] "rm" (long[0 * SHORT / 8]),
[out1] "rm" (long[1 * SHORT / 8]),
[out2] "rm" (long[2 * SHORT / 8]),
);
next = next[8..];
if (next.len <= start - SHORT) break;
}

crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc1;
crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc2;
next = next[SHORT * 2 ..];
}

// Compute via 8-byte chunks, until we're left with less than 8 bytes.
while (next.len >= 8) {
const long: [*]const u64 = @alignCast(@ptrCast(next));
asm volatile ("crc32q %[out], %[in]"
: [in] "+r" (crc0),
: [out] "rm" (long[0]),
);
next = next[8..];
}

// Finish the last bytes with just single instructions.
while (next.len > 0) {
asm volatile ("crc32b %[out], %[in]"
: [in] "+r" (crc0),
: [out] "rm" (next[0]),
);
next = next[1..];
}

// Only the low 32 bits of the accumulator carry the CRC.
return @truncate(~crc0);
}

/// Streaming front end for the accelerated `crc32` routine, exposing the same
/// `init` / `update` / `final` / `hash` surface as the generic type in impl.zig.
pub const Wrapper = struct {
    /// CRC of everything fed in so far, already in its final (post-XOR) form.
    crc: u32,

    const Self = @This();

    /// Returns a fresh state, i.e. the CRC of no data.
    pub fn init() Wrapper {
        return Self{ .crc = 0 };
    }

    /// Folds `bytes` into the running CRC.
    pub fn update(w: *Wrapper, bytes: []const u8) void {
        const updated = crc32(w.crc, bytes);
        w.crc = updated;
    }

    /// Returns the CRC of all data passed to `update` so far.
    pub fn final(w: Wrapper) u32 {
        return w.crc;
    }

    /// One-shot convenience: the CRC of `bytes`.
    pub fn hash(bytes: []const u8) u32 {
        var state = Self.init();
        state.update(bytes);
        return state.final();
    }
};
27 changes: 14 additions & 13 deletions lib/std/hash/crc/impl.zig
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,7 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type {
const I = if (@bitSizeOf(W) < 8) u8 else W;
const lookup_table = blk: {
@setEvalBranchQuota(2500);

const poly = if (algorithm.reflect_input)
@bitReverse(@as(I, algorithm.polynomial)) >> (@bitSizeOf(I) - @bitSizeOf(W))
else
@as(I, algorithm.polynomial) << (@bitSizeOf(I) - @bitSizeOf(W));

const poly = reflect(algorithm.polynomial);
var table: [256]I = undefined;
for (&table, 0..) |*e, i| {
var crc: I = i;
Expand All @@ -52,15 +47,13 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type {
crc: I,

pub fn init() Self {
const initial = if (algorithm.reflect_input)
@bitReverse(@as(I, algorithm.initial)) >> (@bitSizeOf(I) - @bitSizeOf(W))
else
@as(I, algorithm.initial) << (@bitSizeOf(I) - @bitSizeOf(W));
return Self{ .crc = initial };
const initial = reflect(algorithm.initial);
return .{ .crc = initial };
}

inline fn tableEntry(index: I) I {
return lookup_table[@as(u8, @intCast(index & 0xFF))];
const short: u8 = @truncate(index);
return lookup_table[short];
}

pub fn update(self: *Self, bytes: []const u8) void {
Expand Down Expand Up @@ -90,14 +83,22 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type {
if (!algorithm.reflect_output) {
c >>= @bitSizeOf(I) - @bitSizeOf(W);
}
return @as(W, @intCast(c ^ algorithm.xor_output));
return @intCast(c ^ algorithm.xor_output);
}

/// One-shot convenience: runs `init`, a single `update` over `bytes`, and `final`.
pub fn hash(bytes: []const u8) W {
    var state = Self.init();
    state.update(bytes);
    const digest = state.final();
    return digest;
}

/// Places a `W`-bit value into the `I`-bit working register.
///
/// With `reflect_input` the value is bit-reversed and moved down into the low `W` bits;
/// otherwise it is moved up into the high `W` bits.
fn reflect(x: I) I {
    const offset = @bitSizeOf(I) - @bitSizeOf(W);
    return if (algorithm.reflect_input)
        @bitReverse(x) >> offset
    else
        x << offset;
}
};
}

Expand Down
22 changes: 11 additions & 11 deletions lib/std/hash/crc/test.zig
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,17 @@ test "crc32 koopman regression" {
try testing.expectEqual(crc32.hash("abc"), 0xba2322ac);
}

// Checks CRC-32/ISCSI against the catalog check value and, because `crc.Crc32Iscsi` may resolve
// to a hardware-accelerated implementation, cross-checks it against the generic table-driven
// `crc.Crc` on inputs long enough to reach the multi-block code paths.
test "CRC-32/ISCSI" {
    const Crc32Iscsi = crc.Crc32Iscsi;

    try testing.expectEqual(@as(u32, 0xe3069283), Crc32Iscsi.hash("123456789"));

    var c = Crc32Iscsi.init();
    c.update("1234");
    c.update("56789");
    try testing.expectEqual(@as(u32, 0xe3069283), c.final());

    // Reference: the generic implementation with the CRC-32/ISCSI parameters.
    const Reference = crc.Crc(u32, .{
        .polynomial = 0x1edc6f41,
        .initial = 0xffffffff,
        .reflect_input = true,
        .reflect_output = true,
        .xor_output = 0xffffffff,
    });

    // Deterministic, non-trivial filler data.
    var buf: [3 * 8192 + 3 * 256 + 64]u8 = undefined;
    for (&buf, 0..) |*byte, i| byte.* = @truncate(i *% 31 +% 7);

    // Lengths around the 8-byte, 3 * 256-byte and 3 * 8192-byte thresholds.
    const lengths = [_]usize{
        0,            1,        7,            8,
        9,            3 * 256 - 1, 3 * 256,   3 * 256 + 1,
        3 * 8192 - 1, 3 * 8192, 3 * 8192 + 3 * 256 + 9,
    };

    // Every start offset within an 8-byte window, so unaligned prefixes are covered too.
    for (0..8) |offset| {
        for (lengths) |len| {
            const data = buf[offset..][0..len];
            const expected = Reference.hash(data);

            try testing.expectEqual(expected, Crc32Iscsi.hash(data));

            // Streaming in two pieces must give the same result as one shot.
            var streamed = Crc32Iscsi.init();
            streamed.update(data[0 .. len / 2]);
            streamed.update(data[len / 2 ..]);
            try testing.expectEqual(expected, streamed.final());
        }
    }
}

test "CRC-3/GSM" {
const Crc3Gsm = crc.Crc3Gsm;

Expand Down Expand Up @@ -1104,17 +1115,6 @@ test "CRC-32/CKSUM" {
try testing.expectEqual(@as(u32, 0x765e7680), c.final());
}

// Checks CRC-32/ISCSI against the check value 0xe3069283 for "123456789", both in one shot
// and when the input is streamed in two `update` calls.
test "CRC-32/ISCSI" {
const Crc32Iscsi = crc.Crc32Iscsi;

try testing.expectEqual(@as(u32, 0xe3069283), Crc32Iscsi.hash("123456789"));

var c = Crc32Iscsi.init();
c.update("1234");
c.update("56789");
try testing.expectEqual(@as(u32, 0xe3069283), c.final());
}

test "CRC-32/ISO-HDLC" {
const Crc32IsoHdlc = crc.Crc32IsoHdlc;

Expand Down
3 changes: 2 additions & 1 deletion tools/crc/catalog.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ width=32 poly=0xa833982b init=0xffffffff refin=true refout=true xorout=0xff
width=32 poly=0x04c11db7 init=0xffffffff refin=false refout=false xorout=0xffffffff check=0xfc891918 residue=0xc704dd7b name="CRC-32/BZIP2"
width=32 poly=0x8001801b init=0x00000000 refin=true refout=true xorout=0x00000000 check=0x6ec2edc4 residue=0x00000000 name="CRC-32/CD-ROM-EDC"
width=32 poly=0x04c11db7 init=0x00000000 refin=false refout=false xorout=0xffffffff check=0x765e7680 residue=0xc704dd7b name="CRC-32/CKSUM"
width=32 poly=0x1edc6f41 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xe3069283 residue=0xb798b438 name="CRC-32/ISCSI"
# CRC-32C implementation is defined manually, since it has an accelerated variant.
# width=32 poly=0x1edc6f41 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xe3069283 residue=0xb798b438 name="CRC-32/ISCSI"
width=32 poly=0x04c11db7 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xcbf43926 residue=0xdebb20e3 name="CRC-32/ISO-HDLC"
width=32 poly=0x04c11db7 init=0xffffffff refin=true refout=true xorout=0x00000000 check=0x340bc6d9 residue=0x00000000 name="CRC-32/JAMCRC"
width=32 poly=0x741b8cd7 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0x2d3dd0ae residue=0x00000000 name="CRC-32/KOOPMAN"
Expand Down
Loading