@@ -24,21 +24,15 @@ const CHACHA_BLOCK_SIZE = div(32 * 16, 8)
24
24
:(shufflevector (x, $ rotation))
25
25
end
26
26
27
- @inline function _QR! (x, a, b, c, d)
28
- @inbounds begin
29
- x[a] += x[b]; x[d] ⊻= x[a]; x[d] = lrot32 (x[d], UInt32 (16 ))
30
- x[c] += x[d]; x[b] ⊻= x[c]; x[b] = lrot32 (x[b], UInt32 (12 ))
31
- x[a] += x[b]; x[d] ⊻= x[a]; x[d] = lrot32 (x[d], UInt32 (8 ))
32
- x[c] += x[d]; x[b] ⊻= x[c]; x[b] = lrot32 (x[b], UInt32 (7 ))
33
- end
34
- end
27
"""
    @_QR!(a, b, c, d)

Expand in place to four add / xor / rotate steps over `a`, `b`, `c` and `d`,
using left-rotation amounts 16, 12, 8 and 7 (via `lrot32`), and evaluate to
the tuple `(a, b, c, d)` of the updated values.

Every argument is escaped, so it is evaluated and assigned in the caller's
scope. Each argument must therefore be a valid assignment target, such as a
local variable or an indexing expression like `v[i]`. Each argument expression
is spliced into the expansion several times, so it should be free of side
effects.
"""
macro _QR!(a, b, c, d)
    # Escape each operand once; the same escaped expression is reused both as
    # an assignment target and as a value in the steps below.
    ea, eb, ec, ed = esc(a), esc(b), esc(c), esc(d)

    return quote
        $ea += $eb
        $ed ⊻= $ea
        $ed = lrot32($ed, 16)

        $ec += $ed
        $eb ⊻= $ec
        $eb = lrot32($eb, 12)

        $ea += $eb
        $ed ⊻= $ea
        $ed = lrot32($ed, 8)

        $ec += $ed
        $eb ⊻= $ec
        $eb = lrot32($eb, 7)

        # Value of the whole expansion: the four updated operands.
        ($ea, $eb, $ec, $ed)
    end
end
43
37
44
38
@inline function store_u64! (x:: AbstractVector{UInt32} , u:: UInt64 , idx)
@@ -219,12 +213,12 @@ end
219
213
v3 = vgather (state, $ idx3)
220
214
221
215
for i = 1 : doublerounds
222
- v0, v1, v2, v3 = _QR! (v0, v1, v2, v3)
216
+ v0, v1, v2, v3 = @ _QR! (v0, v1, v2, v3)
223
217
v1 = rotatevector (v1, Val (- 1 ))
224
218
v2 = rotatevector (v2, Val (- 2 ))
225
219
v3 = rotatevector (v3, Val (- 3 ))
226
220
227
- v0, v1, v2, v3 = _QR! (v0, v1, v2, v3)
221
+ v0, v1, v2, v3 = @ _QR! (v0, v1, v2, v3)
228
222
v1 = rotatevector (v1, Val (1 ))
229
223
v2 = rotatevector (v2, Val (2 ))
230
224
v3 = rotatevector (v3, Val (3 ))
@@ -264,7 +258,7 @@ function _cuda_chacha_rounds!(state, doublerounds)
264
258
265
259
# Only operate on a slice of the state corresponding to
266
260
# the thread block
267
- state_slice = view (state, block+ 1 : block+ 16 )
261
+ slice = view (state, block+ 1 : block+ 16 )
268
262
269
263
# Pre-compute the indices that this thread will use to
270
264
# perform its diagonal rounds
@@ -279,11 +273,11 @@ function _cuda_chacha_rounds!(state, doublerounds)
279
273
# Each thread in the same block runs its rounds in parallel
280
274
for _ = 1 : doublerounds
281
275
# Columnar rounds
282
- _QR! (state_slice, i, i + 4 , i + 8 , i + 12 )
276
+ @ _QR! (slice[i], slice[i + 4 ], slice[i + 8 ], slice[i + 12 ] )
283
277
CUDA. threadfence_block ()
284
278
285
279
# Diagonal rounds
286
- _QR! (state_slice, dgc1, dgc2, dgc3, dgc4)
280
+ @ _QR! (slice[ dgc1], slice[ dgc2], slice[ dgc3], slice[ dgc4] )
287
281
CUDA. threadfence_block ()
288
282
end
289
283
0 commit comments