Convert the quarter-round function into a macro

kernelmethod · kernelmethod · commit e1c250abf868 · 2022-05-14T03:07:00.000-06:00
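Convert _QR! from an inlined function into a macro, to re-unify some of
the CPU and GPU ChaCha code. The previous changes to explicitly
vectorize operations with SIMD required us to define a second method of
_QR! (one taking an array plus four indices, the other taking four
values) and dispatch between them. By converting _QR! into a macro, we
can use the same code to represent a quarter-round on either GPU arrays
or CPU ones: the arguments are spliced in as expressions, so callers
pass SIMD vectors (`@_QR!(v0, v1, v2, v3)`) or indexed array elements
(`@_QR!(slice[i], slice[i+4], slice[i+8], slice[i+12])`) alike.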
diff --git a/src/ChaCha.jl b/src/ChaCha.jl
@@ -24,21 +24,15 @@ const CHACHA_BLOCK_SIZE = div(32 * 16, 8)
     :(shufflevector(x, $rotation))
 end
 
-@inline function _QR!(x, a, b, c, d)
-    @inbounds begin
-        x[a] += x[b]; x[d] ⊻= x[a]; x[d] = lrot32(x[d], UInt32(16))
-        x[c] += x[d]; x[b] ⊻= x[c]; x[b] = lrot32(x[b], UInt32(12))
-        x[a] += x[b]; x[d] ⊻= x[a]; x[d] = lrot32(x[d],  UInt32(8))
-        x[c] += x[d]; x[b] ⊻= x[c]; x[b] = lrot32(x[b],  UInt32(7))
-    end
-end
+macro _QR!(a, b, c, d)
+    quote
+        $(esc(a)) += $(esc(b)); $(esc(d)) ⊻= $(esc(a)); $(esc(d)) = lrot32($(esc(d)), 16);
+        $(esc(c)) += $(esc(d)); $(esc(b)) ⊻= $(esc(c)); $(esc(b)) = lrot32($(esc(b)), 12);
+        $(esc(a)) += $(esc(b)); $(esc(d)) ⊻= $(esc(a)); $(esc(d)) = lrot32($(esc(d)), 8);
+        $(esc(c)) += $(esc(d)); $(esc(b)) ⊻= $(esc(c)); $(esc(b)) = lrot32($(esc(b)), 7);
 
-@inline function _QR!(a, b, c, d)
-    a += b; d ⊻= a; d = lrot32(d, UInt32(16));
-    c += d; b ⊻= c; b = lrot32(b, UInt32(12));
-    a += b; d ⊻= a; d = lrot32(d, UInt32(8));
-    c += d; b ⊻= c; b = lrot32(b, UInt32(7));
-    a, b, c, d
+        $(esc(a)), $(esc(b)), $(esc(c)), $(esc(d))
+    end
 end
 
 @inline function store_u64!(x::AbstractVector{UInt32}, u::UInt64, idx)
@@ -219,12 +213,12 @@ end
             v3 = vgather(state, $idx3)
 
             for i = 1:doublerounds
-                v0, v1, v2, v3 = _QR!(v0, v1, v2, v3)
+                v0, v1, v2, v3 = @_QR!(v0, v1, v2, v3)
                 v1 = rotatevector(v1, Val(-1))
                 v2 = rotatevector(v2, Val(-2))
                 v3 = rotatevector(v3, Val(-3))
 
-                v0, v1, v2, v3 = _QR!(v0, v1, v2, v3)
+                v0, v1, v2, v3 = @_QR!(v0, v1, v2, v3)
                 v1 = rotatevector(v1, Val(1))
                 v2 = rotatevector(v2, Val(2))
                 v3 = rotatevector(v3, Val(3))
@@ -264,7 +258,7 @@ function _cuda_chacha_rounds!(state, doublerounds)
 
     # Only operate on a slice of the state corresponding to
     # the thread block
-    state_slice = view(state, block+1:block+16)
+    slice = view(state, block+1:block+16)
 
     # Pre-compute the indices that this thread will use to
     # perform its diagonal rounds
@@ -279,11 +273,11 @@ function _cuda_chacha_rounds!(state, doublerounds)
     # Each thread in the same block runs its rounds in parallel
     for _ = 1:doublerounds
         # Columnar rounds
-        _QR!(state_slice, i, i + 4, i + 8, i + 12)
+        @_QR!(slice[i], slice[i+4], slice[i+8], slice[i+12])
         CUDA.threadfence_block()
 
         # Diagonal rounds
-        _QR!(state_slice, dgc1, dgc2, dgc3, dgc4)
+        @_QR!(slice[dgc1], slice[dgc2], slice[dgc3], slice[dgc4])
         CUDA.threadfence_block()
     end
 
diff --git a/test/test_chacha.jl b/test/test_chacha.jl
@@ -157,7 +157,7 @@ end
         # Ref: IETF RFC 8439, Sec. 2.1.1
         # https://datatracker.ietf.org/doc/html/rfc8439#section-2.1.1
         state = MVector{4,UInt32}([0x11111111, 0x01020304, 0x9b8d6f43, 0x01234567])
-        ChaCha._QR!(state, 1, 2, 3, 4)
+        ChaCha.@_QR!(state[1], state[2], state[3], state[4])
 
         expected_state = SVector{4,UInt32}([0xea2a92f4, 0xcb1cf8ce, 0x4581472e, 0x5881c4bb])
 
@@ -174,7 +174,7 @@ end
         ])
         initial_state = deepcopy(state)
 
-        ChaCha._QR!(state, 3, 8, 9, 14)
+        ChaCha.@_QR!(state[3], state[8], state[9], state[14])
 
         mask = trues(length(state))
         mask[3] = mask[8] = mask[9] = mask[14] = false
@@ -224,11 +224,11 @@ end
 
             function kernel(state, a, b, c, d)
                 i = 4 * (threadIdx().x - 1)
-                ChaCha._QR!(state, i + a, i + b, i + c, i + d)
+                ChaCha.@_QR!(state[i+a], state[i+b], state[i+c], state[i+d])
                 nothing
             end
 
-            ChaCha._QR!(state, 1, 2, 3, 4)
+            ChaCha.@_QR!(state[1], state[2], state[3], state[4])
             CUDA.@sync @cuda threads=1024 kernel(state_gpu, 1, 2, 3, 4)
 
             @test state_gpu == CuArray(collect(repeat(state, 1024)))