//! Implements CRC-32C (Castagnoli) using the SSE4.2 Intel CRC32 instruction.
//!
//! A couple of useful links for understanding the approach taken here:
//! - https://github.com/madler/brotli/blob/1d428d3a9baade233ebc3ac108293256bcb813d1/crc32c.c
//! - https://github.com/madler/zlib/blob/5a82f71ed1dfc0bec044d9702463dbdf84ea3b71/crc32.c
//! - http://www.ross.net/crc/download/crc_v3.txt

// The CRC-32C polynomial in reflected (bit-reversed) form.
const POLY: u32 = 0x82f63b78;

const LONG = 8192;
const SHORT = 256;
const long_lookup_table = genTable(LONG);
const short_lookup_table = genTable(SHORT);

/// Generates the lookup table for efficiently advancing a CRC over a block of `length` bytes.
/// This works by building an operator that advances the CRC state as if `length` zero bytes
/// had been appended, then tabulating it. We pre-compute 4 tables of 256 entries each, one
/// per byte of the 32-bit CRC state. Note that the construction below assumes `length` is a
/// power of two (both LONG and SHORT are).
///
/// The idea behind this table is quite interesting. The CRC state is equivalent to the
/// remainder of dividing the message polynomial (over GF(2)) by the CRC polynomial.
///
/// Advancing the CRC register by `k` zero bits is equivalent to multiplying the current
/// CRC state by `x^k` modulo the CRC polynomial. This operation can be represented
/// as a linear transformation in GF(2), i.e. a matrix.
///
/// We build up this matrix via repeated squaring:
/// - `odd` starts as the operator for 1 zero bit (i.e. multiplication by `x^1 mod POLY`),
/// - `even` = `odd` squared is the operator for 2 zero bits (`x^2 mod POLY`),
/// - squaring again gives `x^4 mod POLY`, and so on.
///
/// By repeatedly squaring while halving `length`, we end up with the operator for
/// `x^(8 * length) mod POLY`, i.e. for appending `length` zero bytes.
fn genTable(length: usize) [4][256]u32 {
    @setEvalBranchQuota(250000);

    var even: [32]u32 = undefined;
    zeroes: {
        var odd: [32]u32 = undefined;

        // Initialize `odd` with the operator for a single zero bit:
        // - odd[0] = POLY: when the low bit shifts out of the (reflected)
        //   register, the polynomial is fed back in.
        // - odd[n] = 1 << (n - 1) for n in 1..32: every other bit simply
        //   shifts down one position.
        odd[0] = POLY;
        var row: u32 = 1;
        for (1..32) |n| {
            odd[n] = row;
            row <<= 1;
        }

        // even = odd squared: even represents `x^2 mod POLY`.
        square(&even, &odd);
        // odd = even squared: odd now represents `x^4 mod POLY`.
        square(&odd, &even);

        // Keep squaring, doubling the number of zero bits covered each time.
        // The first squaring below yields `x^8 mod POLY`, i.e. one zero byte,
        // and `len` is halved once per squaring, so when it reaches zero the
        // last-written matrix advances the CRC by `length` zero bytes. The
        // result lands in `even` if we exit via `break :zeroes`, otherwise
        // in `odd`, which we then copy into `even`.
        var len = length;
        while (true) {
            square(&even, &odd);
            len >>= 1;
            if (len == 0) break :zeroes;
            square(&odd, &even);
            len >>= 1;
            if (len == 0) break;
        }

        @memcpy(&even, &odd);
    }

    // Tabulate the operator: entry `zeroes[k][n]` is the operator applied to
    // the 32-bit value `n << (8 * k)`, so the four tables together cover each
    // byte of the CRC state.
    var zeroes: [4][256]u32 = undefined;
    for (0..256) |n| {
        const v: u32 = @intCast(n);
        zeroes[0][n] = times(&even, v);
        zeroes[1][n] = times(&even, v << 8);
        zeroes[2][n] = times(&even, v << 16);
        zeroes[3][n] = times(&even, v << 24);
    }
    return zeroes;
}

/// Computes `mat * vec` over GF(2), where `mat` is a 32x32 binary matrix and `vec`
/// is a 32-bit vector. This simulates how bits propagate through the CRC register
/// during shifting.
///
/// - In GF(2) (the field whose only values are 0 and 1), multiplication is
///   `AND` and addition is `XOR`.
/// - The dot product determines how each bit of the input vector contributes to
///   the final CRC state, by XORing together the rows of the matrix where `vec` has 1s.
fn times(mat: *const [32]u32, vec: u32) u32 {
    var sum: u32 = 0;
    var v = vec;
    var i: u32 = 0;
    while (v != 0) {
        if (v & 1 != 0) sum ^= mat[i];
        v >>= 1;
        i += 1;
    }
    return sum;
}
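
// A minimal sanity check of `times`: with the identity operator, where row `n`
// is just bit `n`, the product should return the input vector unchanged, since
// `times` XORs together exactly the rows selected by the set bits of `vec`.
test "times by the identity matrix is the identity" {
    const std = @import("std");
    var ident: [32]u32 = undefined;
    for (&ident, 0..) |*mat_row, n| {
        mat_row.* = @as(u32, 1) << @intCast(n);
    }
    try std.testing.expectEqual(@as(u32, 0xdeadbeef), times(&ident, 0xdeadbeef));
}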

/// Computes the square of a matrix over GF(2), i.e. `dst = src x src`.
///
/// This produces the operator for doubling the number of zeroes:
/// if `src` represents advancing the CRC by `k` zeroes, then `dst` will
/// represent advancing by `2k` zeroes.
///
/// Since polynomial multiplication mod POLY is linear, `mat(mat(x)) = mat^2(x)`
/// gives the effect of two sequential applications of the operator.
fn square(dst: *[32]u32, src: *const [32]u32) void {
    for (dst, src) |*d, m| {
        d.* = times(src, m);
    }
}
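
// A small property check for `square`: if `op` advances the CRC by one zero
// bit, then its square should behave exactly like applying `op` twice, i.e.
// `times(sq, v) == times(op, times(op, v))` for any vector `v`.
test "square composes an operator with itself" {
    const std = @import("std");
    var op: [32]u32 = undefined;
    op[0] = POLY;
    var row: u32 = 1;
    for (1..32) |n| {
        op[n] = row;
        row <<= 1;
    }
    var sq: [32]u32 = undefined;
    square(&sq, &op);
    const v: u32 = 0xcafef00d;
    try std.testing.expectEqual(times(&op, times(&op, v)), times(&sq, v));
}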

/// Applies the zero-block operator encoded in `table` to `crc`, looking up
/// each of the four bytes of the CRC in its own sub-table.
fn shift(table: *const [4][256]u32, crc: u32) u32 {
    return table[0][crc & 0xFF] ^
        table[1][(crc >> 8) & 0xFF] ^
        table[2][(crc >> 16) & 0xFF] ^
        table[3][crc >> 24];
}
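
// A sketch tying the tables back to the CRC definition: advancing a CRC state
// through SHORT zero bytes via `shift` should match feeding SHORT literal zero
// bytes through a plain bitwise reflected CRC update (`bitwise` below is a
// throwaway reference written just for this comparison).
test "zero-block operator matches a bitwise CRC over zero bytes" {
    const std = @import("std");
    const bitwise = struct {
        fn update(crc: u32, input: []const u8) u32 {
            var c = crc;
            for (input) |byte| {
                c ^= byte;
                for (0..8) |_| {
                    c = if (c & 1 != 0) (c >> 1) ^ POLY else c >> 1;
                }
            }
            return c;
        }
    }.update;
    const zeros = [_]u8{0} ** SHORT;
    const start: u32 = 0x12345678;
    try std.testing.expectEqual(bitwise(start, &zeros), shift(&short_lookup_table, start));
}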

fn crc32(crc: u32, input: []const u8) u32 {
    // Pre-invert, per the CRC-32C convention; the state is inverted again on
    // the way out, so a previously returned CRC can be fed straight back in.
    var crc0: u64 = crc ^ 0xffffffff;

    // Compute the CRC for up to seven leading bytes to bring the
    // `next` pointer to an eight-byte boundary.
    var next = input;
    while (next.len > 0 and @intFromPtr(next.ptr) & 7 != 0) {
        asm volatile ("crc32b %[byte], %[crc]"
            : [crc] "+r" (crc0),
            : [byte] "rm" (next[0]),
        );
        next = next[1..];
    }

    // Compute the CRC on sets of LONG * 3 bytes, executing three independent
    // CRC instructions, each on LONG bytes. This is an optimization for
    // targets where the CRC instruction has a throughput of one CRC per
    // cycle, but a latency of three cycles.
    while (next.len >= LONG * 3) {
        var crc1: u64 = 0;
        var crc2: u64 = 0;

        const start = next.len;
        while (true) {
            // Safe @alignCast(), since the pointer was aligned to 8 bytes before this loop.
            const words: [*]const u64 = @alignCast(@ptrCast(next.ptr));
            asm volatile (
                \\crc32q %[w0], %[c0]
                \\crc32q %[w1], %[c1]
                \\crc32q %[w2], %[c2]
                : [c0] "+r" (crc0),
                  [c1] "+r" (crc1),
                  [c2] "+r" (crc2),
                : [w0] "rm" (words[0 * LONG / 8]),
                  [w1] "rm" (words[1 * LONG / 8]),
                  [w2] "rm" (words[2 * LONG / 8]),
            );
            next = next[8..];
            if (next.len <= start - LONG) break;
        }

        // Fold the three partial CRCs together: advance crc0 past the LONG
        // bytes covered by each of the other two streams, then skip them.
        crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc1;
        crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc2;
        next = next[LONG * 2 ..];
    }

    // Same thing as above, but for smaller chunks of SHORT bytes.
    while (next.len >= SHORT * 3) {
        var crc1: u64 = 0;
        var crc2: u64 = 0;

        const start = next.len;
        while (true) {
            const words: [*]const u64 = @alignCast(@ptrCast(next.ptr));
            asm volatile (
                \\crc32q %[w0], %[c0]
                \\crc32q %[w1], %[c1]
                \\crc32q %[w2], %[c2]
                : [c0] "+r" (crc0),
                  [c1] "+r" (crc1),
                  [c2] "+r" (crc2),
                : [w0] "rm" (words[0 * SHORT / 8]),
                  [w1] "rm" (words[1 * SHORT / 8]),
                  [w2] "rm" (words[2 * SHORT / 8]),
            );
            next = next[8..];
            if (next.len <= start - SHORT) break;
        }

        crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc1;
        crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc2;
        next = next[SHORT * 2 ..];
    }

    // Compute via 8-byte chunks, until we're left with fewer than 8 bytes.
    while (next.len >= 8) {
        const words: [*]const u64 = @alignCast(@ptrCast(next.ptr));
        asm volatile ("crc32q %[word], %[crc]"
            : [crc] "+r" (crc0),
            : [word] "rm" (words[0]),
        );
        next = next[8..];
    }

    // Finish the last bytes with single-byte instructions.
    while (next.len > 0) {
        asm volatile ("crc32b %[byte], %[crc]"
            : [crc] "+r" (crc0),
            : [byte] "rm" (next[0]),
        );
        next = next[1..];
    }

    return @truncate(~crc0);
}
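
// A quick check of the inversion convention: with no input bytes the entry and
// exit XORs cancel, so `crc32` should return its argument unchanged. (Like the
// rest of this file, this assumes an x86-64 build target; the asm simply never
// executes here.)
test "crc32 of empty input is the identity" {
    const std = @import("std");
    try std.testing.expectEqual(@as(u32, 0x12345678), crc32(0x12345678, ""));
}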

// Wrapper around the accelerated implementation to match the one in impl.zig.
pub const Wrapper = struct {
    crc: u32,

    pub fn init() Wrapper {
        // `crc32` pre- and post-inverts internally, so the running value
        // starts at zero and streaming updates compose correctly.
        return .{ .crc = 0 };
    }

    pub fn update(w: *Wrapper, bytes: []const u8) void {
        w.crc = crc32(w.crc, bytes);
    }

    pub fn final(w: Wrapper) u32 {
        return w.crc;
    }

    pub fn hash(bytes: []const u8) u32 {
        var c = init();
        c.update(bytes);
        return c.final();
    }
};
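
// Usage sketch and known-answer test, assuming hardware with SSE4.2:
// "123456789" is the conventional CRC check input and 0xe3069283 is the
// published CRC-32C check value for it. Splitting the input across two
// `update` calls should give the same result as the one-shot `hash`.
test "CRC-32C check value and streaming updates" {
    const std = @import("std");
    try std.testing.expectEqual(@as(u32, 0xe3069283), Wrapper.hash("123456789"));

    var w = Wrapper.init();
    w.update("1234");
    w.update("56789");
    try std.testing.expectEqual(@as(u32, 0xe3069283), w.final());
}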