Merge pull request #4 from kernelmethod/kernelmethod/simd

kernelmethod · web-flow · commit 2f2f79ebfbbd · 2022-05-14T05:15:35.000-04:00
Improve SIMD parallelization
diff --git a/Manifest.toml b/Manifest.toml
@@ -250,6 +250,11 @@ version = "1.3.0"
 [[deps.SHA]]
 uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 
+[[deps.SIMD]]
+git-tree-sha1 = "7dbc15af7ed5f751a82bf3ed37757adf76c32402"
+uuid = "fdea26ae-647d-5447-a871-4b548cad5224"
+version = "3.4.1"
+
 [[deps.Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 
diff --git a/Project.toml b/Project.toml
@@ -6,9 +6,11 @@ version = "0.1.0"
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 
 [compat]
 CUDA = "3.8"
+SIMD = "3.4"
 StaticArrays = "1.4"
 julia = "1"
diff --git a/src/ChaCha.jl b/src/ChaCha.jl
@@ -5,43 +5,48 @@ for CSPRNG.
 
 module ChaCha
 
-using Core.Intrinsics: llvmcall
 using CUDA
+using SIMD
 using StaticArrays
 
 # ChaCha block size is 32 * 16 bits = 64 bytes
 const CHACHA_BLOCK_SIZE_U32 = 16
 const CHACHA_BLOCK_SIZE = div(32 * 16, 8)
 
 @inline lrot32(x, n) = (x << n) | (x >> (32 - n))
-@inline lrot32(x::UInt32, n::UInt32) = llvmcall(
-    ("""
-     declare i32 @llvm.fshl.i32(i32, i32, i32)
-     define i32 @entry(i32, i32, i32) #0 {
-     3:
-        %res = call i32 @llvm.fshl.i32(i32 %0, i32 %0, i32 %1)
-        ret i32 %res
-     }
-     attributes #0 = { alwaysinline }
-     """, "entry"), UInt32, Tuple{UInt32, UInt32}, x, n)
-
-@inline function _QR!(x, a, b, c, d)
-    @inbounds begin
-        x[a] += x[b]; x[d] ⊻= x[a]; x[d] = lrot32(x[d], UInt32(16))
-        x[c] += x[d]; x[b] ⊻= x[c]; x[b] = lrot32(x[b], UInt32(12))
-        x[a] += x[b]; x[d] ⊻= x[a]; x[d] = lrot32(x[d],  UInt32(8))
-        x[c] += x[d]; x[b] ⊻= x[c]; x[b] = lrot32(x[b],  UInt32(7))
+# Specialization for scalar `UInt32` values and SIMD `Vec`s. Delegates to
+# `bitrotate`, which rotates the bits of `x` to the left by `n` positions
+# (for `Vec` arguments this presumably rotates every lane independently —
+# TODO confirm against the SIMD.jl method).
+@inline lrot32(x::Union{Vec,UInt32}, n) = bitrotate(x, n)
+
+# Rotate the lanes of `x` within each consecutive group of four.
+#
+# `x` is treated as `N ÷ 4` independent groups of four lanes. Within every
+# group the lanes are circularly shifted by `M` positions, following the
+# convention of `circshift` (a positive `M` moves lane `k` to lane `k + M`,
+# wrapping around inside the group). Lanes never move between groups.
+#
+# The shuffle mask is computed when the method is generated for a given
+# `(N, M)` pair, so the call itself is a single `shufflevector` with a
+# constant mask.
+#
+# Throws an `ArgumentError` when `N` is not a multiple of 4.
+@inline @generated function rotatevector(x::Vec{N,T}, ::Val{M}) where {N,T,M}
+    # Reject widths that cannot be split into whole groups of four; otherwise
+    # the mask construction would fail with an unrelated broadcasting error.
+    if N % 4 != 0
+        msg = "rotatevector requires a vector length that is a multiple of 4, got $N"
+        return :(throw(ArgumentError($msg)))
+    end
+
+    # Zero-based lane k reads from lane 4 * (k ÷ 4) + mod(k - M, 4): the first
+    # term selects the group, the second the rotated position inside it.
+    lanes = 0:N-1
+    mask = Val(Tuple(4 .* (lanes .÷ 4) .+ mod.(lanes .- M, 4)))
+    return :(shufflevector(x, $mask))
+end
+
+# ChaCha quarter-round, written as a macro so that it can be applied both to
+# plain variables (e.g. SIMD vectors) and to indexing expressions such as
+# `state[i]`.
+#
+# Every argument is escaped and is both read and assigned to in the expansion,
+# so each of `a`, `b`, `c`, `d` must be an expression that is valid on the
+# left-hand side of an assignment in the caller's scope. Each argument
+# expression is spliced in several times and is therefore re-evaluated on
+# every use.
+#
+# The expansion evaluates to the tuple of the updated `(a, b, c, d)`.
+macro _QR!(a, b, c, d)
+    quote
+        # Each line is one add / xor / rotate step; the rotation amounts are
+        # 16, 12, 8 and 7 bits, in that order.
+        $(esc(a)) += $(esc(b)); $(esc(d)) ⊻= $(esc(a)); $(esc(d)) = lrot32($(esc(d)), 16);
+        $(esc(c)) += $(esc(d)); $(esc(b)) ⊻= $(esc(c)); $(esc(b)) = lrot32($(esc(b)), 12);
+        $(esc(a)) += $(esc(b)); $(esc(d)) ⊻= $(esc(a)); $(esc(d)) = lrot32($(esc(d)), 8);
+        $(esc(c)) += $(esc(d)); $(esc(b)) ⊻= $(esc(c)); $(esc(b)) = lrot32($(esc(b)), 7);
+
+        # Value of the expansion: the four updated operands.
+        $(esc(a)), $(esc(b)), $(esc(c)), $(esc(d))
     end
 end
 
+# Write `u` into `x` as two 32-bit words: the low 32 bits are stored in
+# `x[idx]` and the high 32 bits in `x[idx+1]`.
+#
+# Bounds checks are disabled, so the caller must guarantee that `idx` and
+# `idx + 1` are valid indices of `x`.
 @inline function store_u64!(x::AbstractVector{UInt32}, u::UInt64, idx)
-    x[idx] = UInt32(u & 0xffffffff)
-    x[idx+1] = UInt32((u >> 32) & 0xffffffff)
+    @inbounds begin
+        x[idx] = UInt32(u & 0xffffffff)
+        x[idx+1] = UInt32((u >> 32) & 0xffffffff)
+    end
 end
 
+# Add `u` to `x` word by word: the low 32 bits of `u` are added to `x[idx]`
+# and the high 32 bits to `x[idx+1]`. The two additions are independent
+# `UInt32` additions, so each wraps modulo 2^32 and no carry is propagated
+# from the low word into the high word.
+#
+# Bounds checks are disabled, so the caller must guarantee that `idx` and
+# `idx + 1` are valid indices of `x`.
 @inline function add_u64!(x::AbstractVector{UInt32}, u::UInt64, idx)
-    x[idx] += UInt32(u & 0xffffffff)
-    x[idx+1] += UInt32((u >> 32) & 0xffffffff)
+    @inbounds begin
+        x[idx] += UInt32(u & 0xffffffff)
+        x[idx+1] += UInt32((u >> 32) & 0xffffffff)
+    end
 end
 
 #=
@@ -144,40 +149,89 @@ function chacha_blocks!(
     nblocks = 1;
     doublerounds = 10,
 )
-    for i ∈ 1:nblocks
-        block_start = CHACHA_BLOCK_SIZE_U32 * (i - 1) + 1
-        block_end = block_start + CHACHA_BLOCK_SIZE_U32 - 1
-        state = view(buffer, block_start:block_end)
-
-        _chacha_set_initial_state!(state, key, nonce, counter, 1)
-
-        # Perform alternating rounds of columnar
-        # quarter-rounds and diagonal quarter-rounds
-        for i = 1:doublerounds
-            # Columnar rounds
-            _QR!(state, 1, 5, 9, 13)
-            _QR!(state, 2, 6, 10, 14)
-            _QR!(state, 3, 7, 11, 15)
-            _QR!(state, 4, 8, 12, 16)
-
-            # Diagonal rounds
-            _QR!(state, 1, 6, 11, 16)
-            _QR!(state, 2, 7, 12, 13)
-            _QR!(state, 3, 8, 9, 14)
-            _QR!(state, 4, 5, 10, 15)
-        end
-
-        # Finish by adding the initial state back to
-        # the original state, so that the operations
-        # are no longer invertible
-        _chacha_add_initial_state!(state, key, nonce, counter, 1)
+    block_start = 1
+
+    # We compute as many blocks of output as possible with 512-bit
+    # SIMD vectorization
+    for i ∈ 1:4:nblocks-3
+        block_start, counter = _chacha_blocks!(
+            buffer, block_start, key, nonce, counter, doublerounds, Val(4)
+        )
+    end
 
-        counter += 1
+    # The remaining blocks are computed with 128-bit vectorization
+    for i ∈ 1:(nblocks % 4)
+        block_start, counter = _chacha_blocks!(
+            buffer, block_start, key, nonce, counter, doublerounds, Val(1)
+        )
     end
 
     counter
 end
 
+# Compute `N` consecutive ChaCha blocks with N * 128-bit SIMD vectorization,
+# writing them into `buffer` starting at index `block_start`.
+#
+# Reference: https://eprint.iacr.org/2013/759.pdf
+#
+# Arguments:
+# - `buffer`: output buffer of 32-bit words; it must have room for
+#   `N * CHACHA_BLOCK_SIZE_U32` words starting at `block_start`.
+# - `block_start`: index in `buffer` of the first word to write.
+# - `key`, `nonce`, `counter`: cipher inputs; block `i` (zero-based) is
+#   computed with block counter `counter + i`.
+# - `doublerounds`: number of double rounds applied to each block.
+# - `Val(N)`: number of blocks computed by this call.
+#
+# Returns `(next_block_start, next_counter)`: the index just past the last
+# word written, and `counter + N`.
+#
+# Throws a `BoundsError` if the requested blocks do not fit in `buffer`
+# (unless the caller has itself disabled bounds checking).
+@inline function _chacha_blocks!(
+    buffer::AbstractVector{UInt32}, block_start, key, nonce, counter, doublerounds, ::Val{N}
+) where N
+    block_end = block_start + N * CHACHA_BLOCK_SIZE_U32 - 1
+
+    # Keep the bounds check on the view: the round function accesses `state`
+    # inside `@inbounds`, so this is the guard against writing past the end
+    # of a buffer that is too small for the requested number of blocks.
+    state = view(buffer, block_start:block_end)
+
+    for i = 0:N-1
+        offset = i * CHACHA_BLOCK_SIZE_U32 + 1
+        _chacha_set_initial_state!(state, key, nonce, counter + i, offset)
+    end
+
+    _chacha_rounds!(state, doublerounds, Val(N))
+
+    # Add the initial state back in so that the rounds are not invertible.
+    for i = 0:N-1
+        offset = i * CHACHA_BLOCK_SIZE_U32 + 1
+        _chacha_add_initial_state!(state, key, nonce, counter + i, offset)
+    end
+
+    return block_end + 1, counter + N
+end
+
+
+# Apply `doublerounds` ChaCha double rounds in place to the `N` blocks stored
+# back to back in `state` (16 words per block).
+#
+# The four rows of every block's 4x4 state matrix are gathered into four SIMD
+# vectors of `4N` lanes each, so that one vectorized quarter-round processes
+# all four columns (or all four diagonals) of all `N` blocks at once.
+#
+# All accesses to `state` happen inside `@inbounds`; the caller must ensure
+# that `state` holds at least `16N` words.
+@inline @generated function _chacha_rounds!(state, doublerounds, ::Val{N}) where N
+    # One-based indices of the first row (words 1-4) of every block;
+    # consecutive blocks are 16 words apart.
+    lane = repeat(1:4, N)
+    lane += 16 * ((0:4*N-1) .÷ 4)
+    lane = Tuple(lane)
+
+    # Index vectors for rows 1 to 4 of the state matrices.
+    idx0 = Vec(lane)
+    idx1 = Vec(lane .+ 4)
+    idx2 = Vec(lane .+ 8)
+    idx3 = Vec(lane .+ 12)
+
+    return quote
+        @inbounds begin
+            v0 = vgather(state, $idx0)
+            v1 = vgather(state, $idx1)
+            v2 = vgather(state, $idx2)
+            v3 = vgather(state, $idx3)
+
+            # Perform alternating rounds of columnar quarter-rounds and
+            # diagonal quarter-rounds
+            for i = 1:doublerounds
+                # Columnar round: lane k of each row belongs to column k.
+                v0, v1, v2, v3 = @_QR!(v0, v1, v2, v3)
+
+                # Rotate rows 2-4 so that each diagonal lines up in one lane.
+                v1 = rotatevector(v1, Val(-1))
+                v2 = rotatevector(v2, Val(-2))
+                v3 = rotatevector(v3, Val(-3))
+
+                # Diagonal round, followed by undoing the row rotation.
+                v0, v1, v2, v3 = @_QR!(v0, v1, v2, v3)
+                v1 = rotatevector(v1, Val(1))
+                v2 = rotatevector(v2, Val(2))
+                v3 = rotatevector(v3, Val(3))
+            end
+
+            vscatter(v0, state, $idx0)
+            vscatter(v1, state, $idx1)
+            vscatter(v2, state, $idx2)
+            vscatter(v3, state, $idx3)
+        end
+    end
+end
+
 function chacha_blocks!(
     buffer::CuArray, key, nonce::UInt64, counter::UInt64, nblocks = 1; doublerounds = 10
 )
@@ -204,7 +258,7 @@ function _cuda_chacha_rounds!(state, doublerounds)
 
     # Only operate on a slice of the state corresponding to
     # the thread block
-    state_slice = view(state, block+1:block+16)
+    slice = view(state, block+1:block+16)
 
     # Pre-compute the indices that this thread will use to
     # perform its diagonal rounds
@@ -219,11 +273,11 @@ function _cuda_chacha_rounds!(state, doublerounds)
     # Each thread in the same block runs its rounds in parallel
     for _ = 1:doublerounds
         # Columnar rounds
-        _QR!(state_slice, i, i + 4, i + 8, i + 12)
+        @_QR!(slice[i], slice[i+4], slice[i+8], slice[i+12])
         CUDA.threadfence_block()
 
         # Diagonal rounds
-        _QR!(state_slice, dgc1, dgc2, dgc3, dgc4)
+        @_QR!(slice[dgc1], slice[dgc2], slice[dgc3], slice[dgc4])
         CUDA.threadfence_block()
     end
 
diff --git a/src/keystream.jl b/src/keystream.jl
@@ -122,17 +122,12 @@ end
 function _fill_blocks!(
     buffer::AbstractVector{T}, stream::ChaChaStream, nblocks::Int
 ) where {T <: BitInteger}
-    bufsize_u32 = div(length(buffer) * sizeof(T), sizeof(UInt32))
+    bufsize_u32 = sizeof(buffer) ÷ sizeof(UInt32)
 
     GC.@preserve buffer begin
-        # Create a pointer to the start of the block,
-        # and wrap it in an instance of UnsafeView.
-        #
-        # This provides a decent speedup over using
-        # reinterpret(UInt32, ...)
         bp = pointer(buffer)
         bp = Base.unsafe_convert(Ptr{UInt32}, bp)
-        bufview = UnsafeView(bp, bufsize_u32)
+        bufview = unsafe_wrap(Vector{UInt32}, bp, bufsize_u32)
 
         stream.counter = chacha_blocks!(
             bufview,
diff --git a/test/test_chacha.jl b/test/test_chacha.jl
@@ -13,7 +13,7 @@ using StaticArrays
 using Test
 
 function chacha_blocks_test_suite(T)
-    @testset "Test chacha_blocks!" begin
+    @testset "RFC 8439 ChaCha block function tests" begin
         # Ref: IETF RFC 8439, Sec. 2.3.2
         # https://datatracker.ietf.org/doc/html/rfc8439#section-2.3.2
         key = SVector{8,UInt32}([
@@ -114,14 +114,50 @@ function chacha_blocks_test_suite(T)
         @test state == test_vector
     end
 
+    @testset "Extended ChaCha block function tests" begin
+        # Run multiple blocks of ChaCha with key, counter, and nonce equal
+        # to zero
+        #
+        # It's more efficient to compute multiple blocks in parallel on both
+        # CPU and GPU, so this test ensures that parallelization doesn't
+        # introduce any new errors.
+        key = SVector{8,UInt32}(zeros(UInt32, 8)) |> T
+        nonce = UInt64(0)
+        counter = UInt64(0)
+        test_vector = SVector{64,UInt32}([
+            # Block 1
+            0xade0b876, 0x903df1a0, 0xe56a5d40, 0x28bd8653,
+            0xb819d2bd, 0x1aed8da0, 0xccef36a8, 0xc70d778b,
+            0x7c5941da, 0x8d485751, 0x3fe02477, 0x374ad8b8,
+            0xf4b8436a, 0x1ca11815, 0x69b687c3, 0x8665eeb2,
+            # Block 2
+            0xbee7079f, 0x7a385155, 0x7c97ba98, 0x0d082d73,
+            0xa0290fcb, 0x6965e348, 0x3e53c612, 0xed7aee32,
+            0x7621b729, 0x434ee69c, 0xb03371d5, 0xd539d874,
+            0x281fed31, 0x45fb0a51, 0x1f0ae1ac, 0x6f4d794b,
+            # Block 3
+            0xe6a0092d, 0xe16c2663, 0x08d17eae, 0x75a06819,
+            0x998e718e, 0xc662d37b, 0x3446c3b0, 0x5db3a0a9,
+            0x68372701, 0x0f5d7b1f, 0xfd3a1e28, 0x1ebc58e4,
+            0x13d3d273, 0xc094cfc9, 0x6271f35f, 0xf248a240,
+            # Block 4
+            0x58a02013, 0x6b56b3d7, 0xaada20d5, 0x0abfd23e,
+            0x20b1b8c5, 0x732785fb, 0x349763c3, 0xa4915cb4,
+            0x83cbd42d, 0x2e0d84f8, 0x1358b1ed, 0x3fac6210,
+            0xfff82c1f, 0x5618cd6d, 0x6c1e6ae8, 0x7e166731
+        ]) |> T
+        state = MVector{64,UInt32}(undef) |> T
+        @test ChaCha.chacha_blocks!(state, key, nonce, counter, 4) == counter + 4
+        @test state == test_vector
+    end
 end
 
 @testset "ChaCha tests" begin
     @testset "Quarter-round function tests" begin
         # Ref: IETF RFC 8439, Sec. 2.1.1
         # https://datatracker.ietf.org/doc/html/rfc8439#section-2.1.1
         state = MVector{4,UInt32}([0x11111111, 0x01020304, 0x9b8d6f43, 0x01234567])
-        ChaCha._QR!(state, 1, 2, 3, 4)
+        ChaCha.@_QR!(state[1], state[2], state[3], state[4])
 
         expected_state = SVector{4,UInt32}([0xea2a92f4, 0xcb1cf8ce, 0x4581472e, 0x5881c4bb])
 
@@ -138,7 +174,7 @@ end
         ])
         initial_state = deepcopy(state)
 
-        ChaCha._QR!(state, 3, 8, 9, 14)
+        ChaCha.@_QR!(state[3], state[8], state[9], state[14])
 
         mask = trues(length(state))
         mask[3] = mask[8] = mask[9] = mask[14] = false
@@ -188,11 +224,11 @@ end
 
             function kernel(state, a, b, c, d)
                 i = 4 * (threadIdx().x - 1)
-                ChaCha._QR!(state, i + a, i + b, i + c, i + d)
+                ChaCha.@_QR!(state[i+a], state[i+b], state[i+c], state[i+d])
                 nothing
             end
 
-            ChaCha._QR!(state, 1, 2, 3, 4)
+            ChaCha.@_QR!(state[1], state[2], state[3], state[4])
             CUDA.@sync @cuda threads=1024 kernel(state_gpu, 1, 2, 3, 4)
 
             @test state_gpu == CuArray(collect(repeat(state, 1024)))