Improve SIMD vectorization of the ChaCha block function on CPU

kernelmethod · kernelmethod · commit d28f74f060f2 · 2022-05-14T02:48:50.000-06:00
Using techniques borrowed from [1], I've improved the instruction-level parallelism of the ChaCha block function on CPU. The block function now computes as many ChaCha blocks as possible four at a time with 512-bit SIMD vectors (AVX-512 where the hardware supports it), and then computes the remaining blocks one at a time with 128-bit vectors. In my tests this gives a substantial speedup for CPU-based random number generation. [1] https://eprint.iacr.org/2013/759.pdf
diff --git a/Manifest.toml b/Manifest.toml
@@ -250,6 +250,11 @@ version = "1.3.0"
 [[deps.SHA]]
 uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 
+[[deps.SIMD]]
+git-tree-sha1 = "7dbc15af7ed5f751a82bf3ed37757adf76c32402"
+uuid = "fdea26ae-647d-5447-a871-4b548cad5224"
+version = "3.4.1"
+
 [[deps.Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 
diff --git a/Project.toml b/Project.toml
@@ -6,9 +6,11 @@ version = "0.1.0"
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 
 [compat]
 CUDA = "3.8"
+SIMD = "3.4"
 StaticArrays = "1.4"
 julia = "1"
diff --git a/src/ChaCha.jl b/src/ChaCha.jl
@@ -5,25 +5,24 @@ for CSPRNG.
 
 module ChaCha
 
-using Core.Intrinsics: llvmcall
 using CUDA
+using SIMD
 using StaticArrays
 
 # ChaCha block size is 32 * 16 bits = 64 bytes
 const CHACHA_BLOCK_SIZE_U32 = 16
 const CHACHA_BLOCK_SIZE = div(32 * 16, 8)
 
 @inline lrot32(x, n) = (x << n) | (x >> (32 - n))
-@inline lrot32(x::UInt32, n::UInt32) = llvmcall(
-    ("""
-     declare i32 @llvm.fshl.i32(i32, i32, i32)
-     define i32 @entry(i32, i32, i32) #0 {
-     3:
-        %res = call i32 @llvm.fshl.i32(i32 %0, i32 %0, i32 %1)
-        ret i32 %res
-     }
-     attributes #0 = { alwaysinline }
-     """, "entry"), UInt32, Tuple{UInt32, UInt32}, x, n)
+@inline lrot32(x::Union{Vec,UInt32}, n) = bitrotate(x, n)
+
+@inline @generated function rotatevector(x::Vec{N,T}, ::Val{M}) where {N,T,M}
+    # Shuffle mask (values start at 0): within every group of four lanes the
+    # entries follow circshift(0:3, M); `groupbase` keeps each group in place.
+    withingroup = repeat(circshift(0:3, M), N ÷ 4)
+    groupbase = 4 * ((0:N-1) .÷ 4)
+    :(shufflevector(x, $(Val(Tuple(withingroup + groupbase)))))
+end
 
 @inline function _QR!(x, a, b, c, d)
     @inbounds begin
@@ -34,14 +33,26 @@ const CHACHA_BLOCK_SIZE = div(32 * 16, 8)
     end
 end
 
+@inline function _QR!(a, b, c, d)  # quarter-round on plain values (scalars or SIMD vectors); returns the updated (a, b, c, d), mutates nothing
+    a += b; d = lrot32(d ⊻ a, UInt32(16))
+    c += d; b = lrot32(b ⊻ c, UInt32(12))
+    a += b; d = lrot32(d ⊻ a, UInt32(8))
+    c += d; b = lrot32(b ⊻ c, UInt32(7))
+    return a, b, c, d
+end
+
 @inline function store_u64!(x::AbstractVector{UInt32}, u::UInt64, idx)
-    x[idx] = UInt32(u & 0xffffffff)
-    x[idx+1] = UInt32((u >> 32) & 0xffffffff)
+    @inbounds begin
+        x[idx] = UInt32(u & 0xffffffff)
+        x[idx+1] = UInt32((u >> 32) & 0xffffffff)
+    end
 end
 
 @inline function add_u64!(x::AbstractVector{UInt32}, u::UInt64, idx)
-    x[idx] += UInt32(u & 0xffffffff)
-    x[idx+1] += UInt32((u >> 32) & 0xffffffff)
+    @inbounds begin
+        x[idx] += UInt32(u & 0xffffffff)
+        x[idx+1] += UInt32((u >> 32) & 0xffffffff)
+    end
 end
 
 #=
@@ -144,40 +155,89 @@ function chacha_blocks!(
     nblocks = 1;
     doublerounds = 10,
 )
-    for i ∈ 1:nblocks
-        block_start = CHACHA_BLOCK_SIZE_U32 * (i - 1) + 1
-        block_end = block_start + CHACHA_BLOCK_SIZE_U32 - 1
-        state = view(buffer, block_start:block_end)
-
-        _chacha_set_initial_state!(state, key, nonce, counter, 1)
-
-        # Perform alternating rounds of columnar
-        # quarter-rounds and diagonal quarter-rounds
-        for i = 1:doublerounds
-            # Columnar rounds
-            _QR!(state, 1, 5, 9, 13)
-            _QR!(state, 2, 6, 10, 14)
-            _QR!(state, 3, 7, 11, 15)
-            _QR!(state, 4, 8, 12, 16)
-
-            # Diagonal rounds
-            _QR!(state, 1, 6, 11, 16)
-            _QR!(state, 2, 7, 12, 13)
-            _QR!(state, 3, 8, 9, 14)
-            _QR!(state, 4, 5, 10, 15)
-        end
-
-        # Finish by adding the initial state back to
-        # the original state, so that the operations
-        # are no longer invertible
-        _chacha_add_initial_state!(state, key, nonce, counter, 1)
+    block_start = 1
+
+    # We compute as many blocks of output as possible with 512-bit
+    # SIMD vectorization
+    for i ∈ 1:4:nblocks-3
+        block_start, counter = _chacha_blocks!(
+            buffer, block_start, key, nonce, counter, doublerounds, Val(4)
+        )
+    end
 
-        counter += 1
+    # The remaining blocks are computed with 128-bit vectorization
+    for i ∈ 1:(nblocks % 4)
+        block_start, counter = _chacha_blocks!(
+            buffer, block_start, key, nonce, counter, doublerounds, Val(1)
+        )
     end
 
     counter
 end
 
+# Compute N consecutive ChaCha blocks into `buffer`, starting at index `block_start`,
+# with N * 128-bit SIMD vectorization. Returns the index just past the last word
+# written and the next counter. Reference: https://eprint.iacr.org/2013/759.pdf
+@inline function _chacha_blocks!(
+    buffer::AbstractVector{UInt32}, block_start, key, nonce, counter, doublerounds, ::Val{N}
+) where N
+    block_end = block_start + N * CHACHA_BLOCK_SIZE_U32 - 1
+    state = view(buffer, block_start:block_end)  # bounds-checked here: the rounds below index `state` unchecked
+
+    for i = 0:N-1  # block i is generated with counter + i
+        _chacha_set_initial_state!(state, key, nonce, counter + i, i * CHACHA_BLOCK_SIZE_U32 + 1)
+    end
+
+    _chacha_rounds!(state, doublerounds, Val(N))
+
+    for i = 0:N-1  # add each block's initial state back onto the round output
+        _chacha_add_initial_state!(state, key, nonce, counter + i, i * CHACHA_BLOCK_SIZE_U32 + 1)
+    end
+
+    return block_end + 1, counter + N
+end
+
+
+@inline @generated function _chacha_rounds!(state, doublerounds, ::Val{N}) where N
+    # Run `doublerounds` double rounds over the N blocks stored back to back in
+    # `state`, updating it in place. `lane` lists the indices of the first row
+    # (four words) of every block; consecutive blocks are 16 words apart.
+    lane = repeat(1:4, N)
+    lane += 16 * ((0:4*N-1) .÷ 4)
+    lane = Tuple(lane)
+
+    idx0 = Vec(lane)
+    idx1 = Vec(lane .+ 4)  # second row of each block
+    idx2 = Vec(lane .+ 8)  # third row
+    idx3 = Vec(lane .+ 12)  # fourth row
+
+    quote
+        @inbounds begin
+            v0 = vgather(state, $idx0)
+            v1 = vgather(state, $idx1)
+            v2 = vgather(state, $idx2)
+            v3 = vgather(state, $idx3)
+
+            for i = 1:doublerounds
+                v0, v1, v2, v3 = _QR!(v0, v1, v2, v3)  # columnar quarter-rounds
+                v1 = rotatevector(v1, Val(-1))  # line the diagonals up as columns
+                v2 = rotatevector(v2, Val(-2))
+                v3 = rotatevector(v3, Val(-3))
+
+                v0, v1, v2, v3 = _QR!(v0, v1, v2, v3)  # diagonal quarter-rounds
+                v1 = rotatevector(v1, Val(1))  # undo the rotation above
+                v2 = rotatevector(v2, Val(2))
+                v3 = rotatevector(v3, Val(3))
+            end
+
+            vscatter(v0, state, $idx0)
+            vscatter(v1, state, $idx1)
+            vscatter(v2, state, $idx2)
+            vscatter(v3, state, $idx3)
+        end
+    end
+end
+
 function chacha_blocks!(
     buffer::CuArray, key, nonce::UInt64, counter::UInt64, nblocks = 1; doublerounds = 10
 )
diff --git a/src/keystream.jl b/src/keystream.jl
@@ -122,17 +122,12 @@ end
 function _fill_blocks!(
     buffer::AbstractVector{T}, stream::ChaChaStream, nblocks::Int
 ) where {T <: BitInteger}
-    bufsize_u32 = div(length(buffer) * sizeof(T), sizeof(UInt32))
+    bufsize_u32 = sizeof(buffer) ÷ sizeof(UInt32)
 
     GC.@preserve buffer begin
-        # Create a pointer to the start of the block,
-        # and wrap it in an instance of UnsafeView.
-        #
-        # This provides a decent speedup over using
-        # reinterpret(UInt32, ...)
         bp = pointer(buffer)
         bp = Base.unsafe_convert(Ptr{UInt32}, bp)
-        bufview = UnsafeView(bp, bufsize_u32)
+        bufview = unsafe_wrap(Vector{UInt32}, bp, bufsize_u32)
 
         stream.counter = chacha_blocks!(
             bufview,
diff --git a/test/test_chacha.jl b/test/test_chacha.jl
@@ -13,7 +13,7 @@ using StaticArrays
 using Test
 
 function chacha_blocks_test_suite(T)
-    @testset "Test chacha_blocks!" begin
+    @testset "RFC 8439 ChaCha block function tests" begin
         # Ref: IETF RFC 8439, Sec. 2.3.2
         # https://datatracker.ietf.org/doc/html/rfc8439#section-2.3.2
         key = SVector{8,UInt32}([
@@ -114,6 +114,42 @@ function chacha_blocks_test_suite(T)
         @test state == test_vector
     end
 
+    @testset "Extended ChaCha block function tests" begin
+        # Generate four consecutive blocks from an all-zero key, nonce and
+        # counter and compare them against known output.
+        #
+        # Several blocks are computed in parallel on both CPU and GPU, so this
+        # guards against errors introduced by the parallel code paths.
+        nblocks = 4
+        key = zeros(SVector{8,UInt32}) |> T
+        nonce = zero(UInt64)
+        counter = zero(UInt64)
+        expected = SVector{64,UInt32}([
+            # Block 1
+            0xade0b876, 0x903df1a0, 0xe56a5d40, 0x28bd8653,
+            0xb819d2bd, 0x1aed8da0, 0xccef36a8, 0xc70d778b,
+            0x7c5941da, 0x8d485751, 0x3fe02477, 0x374ad8b8,
+            0xf4b8436a, 0x1ca11815, 0x69b687c3, 0x8665eeb2,
+            # Block 2
+            0xbee7079f, 0x7a385155, 0x7c97ba98, 0x0d082d73,
+            0xa0290fcb, 0x6965e348, 0x3e53c612, 0xed7aee32,
+            0x7621b729, 0x434ee69c, 0xb03371d5, 0xd539d874,
+            0x281fed31, 0x45fb0a51, 0x1f0ae1ac, 0x6f4d794b,
+            # Block 3
+            0xe6a0092d, 0xe16c2663, 0x08d17eae, 0x75a06819,
+            0x998e718e, 0xc662d37b, 0x3446c3b0, 0x5db3a0a9,
+            0x68372701, 0x0f5d7b1f, 0xfd3a1e28, 0x1ebc58e4,
+            0x13d3d273, 0xc094cfc9, 0x6271f35f, 0xf248a240,
+            # Block 4
+            0x58a02013, 0x6b56b3d7, 0xaada20d5, 0x0abfd23e,
+            0x20b1b8c5, 0x732785fb, 0x349763c3, 0xa4915cb4,
+            0x83cbd42d, 0x2e0d84f8, 0x1358b1ed, 0x3fac6210,
+            0xfff82c1f, 0x5618cd6d, 0x6c1e6ae8, 0x7e166731
+        ]) |> T
+        output = MVector{64,UInt32}(undef) |> T
+        @test ChaCha.chacha_blocks!(output, key, nonce, counter, nblocks) == counter + nblocks
+        @test output == expected
+    end
 end
 
 @testset "ChaCha tests" begin