@@ -5,25 +5,24 @@ for CSPRNG.
5
5
6
6
module ChaCha
7
7
8
- using Core. Intrinsics: llvmcall
9
8
using CUDA
9
+ using SIMD
10
10
using StaticArrays
11
11
12
12
# A ChaCha block is made of 16 words of 32 bits each, i.e. 64 bytes.
# CHACHA_BLOCK_SIZE_U32 counts 32-bit words; CHACHA_BLOCK_SIZE counts bytes.
const CHACHA_BLOCK_SIZE_U32 = 16
const CHACHA_BLOCK_SIZE = CHACHA_BLOCK_SIZE_U32 * sizeof(UInt32)
15
15
16
16
# Generic fallback: rotate `x` left by `n` bit positions, treating `x` as a
# 32-bit word (the shift back uses a hard-coded width of 32).
@inline function lrot32(x, n)
    upper = x << n
    wrapped = x >> (32 - n)
    return upper | wrapped
end
17
- @inline lrot32 (x:: UInt32 , n:: UInt32 ) = llvmcall (
18
- ("""
19
- declare i32 @llvm.fshl.i32(i32, i32, i32)
20
- define i32 @entry(i32, i32, i32) #0 {
21
- 3:
22
- %res = call i32 @llvm.fshl.i32(i32 %0, i32 %0, i32 %1)
23
- ret i32 %res
24
- }
25
- attributes #0 = { alwaysinline }
26
- """ , " entry" ), UInt32, Tuple{UInt32, UInt32}, x, n)
17
# Specialised method for `UInt32` scalars and SIMD `Vec`s: forward to
# `bitrotate` instead of composing the rotation from two shifts.
@inline function lrot32(x::Union{Vec,UInt32}, n)
    return bitrotate(x, n)
end
18
+
19
# Permute the lanes of `x` in independent groups of four: inside every group
# the lanes are circularly shifted by `M` positions (same convention as
# `circshift`, so a negative `M` shifts towards lower lane numbers).
#
# The shuffle mask is built once per (N, M) combination when the method is
# generated and is passed to `shufflevector` as a `Val` of zero-based lane
# numbers.
@inline @generated function rotatevector(x::Vec{N,T}, ::Val{M}) where {N,T,M}
    # Zero-based permutation applied inside a single group of four lanes
    group = circshift(0:3, M)
    # Zero-based number of the first lane of the group each lane belongs to
    group_base = 4 * ((0:N-1) .÷ 4)
    lanes = repeat(group, N ÷ 4) + group_base
    mask = Val(Tuple(lanes))
    return :(shufflevector(x, $mask))
end
27
26
28
27
@inline function _QR! (x, a, b, c, d)
29
28
@inbounds begin
@@ -34,14 +33,26 @@ const CHACHA_BLOCK_SIZE = div(32 * 16, 8)
34
33
end
35
34
end
36
35
36
# Quarter-round on four values passed by value (scalars or SIMD vectors).
#
# Despite the trailing `!`, nothing is mutated: the updated `(a, b, c, d)`
# is returned as a tuple and the caller rebinds its own variables.
@inline function _QR!(a, b, c, d)
    a += b
    d = lrot32(d ⊻ a, UInt32(16))

    c += d
    b = lrot32(b ⊻ c, UInt32(12))

    a += b
    d = lrot32(d ⊻ a, UInt32(8))

    c += d
    b = lrot32(b ⊻ c, UInt32(7))

    return a, b, c, d
end
43
+
37
44
# Store the 64-bit value `u` into `x` as two 32-bit words: the low 32 bits
# go to `x[idx]` and the high 32 bits go to `x[idx + 1]`.
#
# Both positions are validated before anything is written, so an invalid
# `idx` raises a `BoundsError` and leaves `x` untouched. The check sits in a
# `@boundscheck` block, so a caller that has already validated its indices
# can remove it by calling this function inside `@inbounds`.
@inline function store_u64!(x::AbstractVector{UInt32}, u::UInt64, idx)
    @boundscheck checkbounds(x, idx:(idx + 1))
    @inbounds begin
        # `% UInt32` keeps the low 32 bits
        x[idx] = u % UInt32
        x[idx + 1] = (u >> 32) % UInt32
    end
end
41
50
42
51
# Add the 64-bit value `u` into `x` word by word: the low 32 bits are added
# to `x[idx]` and the high 32 bits to `x[idx + 1]`. Each addition wraps
# modulo 2^32 on its own; no carry is propagated from the low word into the
# high word.
#
# Both positions are validated before anything is modified, so an invalid
# `idx` raises a `BoundsError` and leaves `x` untouched. The check sits in a
# `@boundscheck` block, so a caller that has already validated its indices
# can remove it by calling this function inside `@inbounds`.
@inline function add_u64!(x::AbstractVector{UInt32}, u::UInt64, idx)
    @boundscheck checkbounds(x, idx:(idx + 1))
    @inbounds begin
        # `% UInt32` keeps the low 32 bits
        x[idx] += u % UInt32
        x[idx + 1] += (u >> 32) % UInt32
    end
end
46
57
47
58
#=
@@ -144,40 +155,89 @@ function chacha_blocks!(
144
155
nblocks = 1 ;
145
156
doublerounds = 10 ,
146
157
)
147
- for i ∈ 1 : nblocks
148
- block_start = CHACHA_BLOCK_SIZE_U32 * (i - 1 ) + 1
149
- block_end = block_start + CHACHA_BLOCK_SIZE_U32 - 1
150
- state = view (buffer, block_start: block_end)
151
-
152
- _chacha_set_initial_state! (state, key, nonce, counter, 1 )
153
-
154
- # Perform alternating rounds of columnar
155
- # quarter-rounds and diagonal quarter-rounds
156
- for i = 1 : doublerounds
157
- # Columnar rounds
158
- _QR! (state, 1 , 5 , 9 , 13 )
159
- _QR! (state, 2 , 6 , 10 , 14 )
160
- _QR! (state, 3 , 7 , 11 , 15 )
161
- _QR! (state, 4 , 8 , 12 , 16 )
162
-
163
- # Diagonal rounds
164
- _QR! (state, 1 , 6 , 11 , 16 )
165
- _QR! (state, 2 , 7 , 12 , 13 )
166
- _QR! (state, 3 , 8 , 9 , 14 )
167
- _QR! (state, 4 , 5 , 10 , 15 )
168
- end
169
-
170
- # Finish by adding the initial state back to
171
- # the original state, so that the operations
172
- # are no longer invertible
173
- _chacha_add_initial_state! (state, key, nonce, counter, 1 )
158
+ block_start = 1
159
+
160
+ # We compute as many blocks of output as possible with 512-bit
161
+ # SIMD vectorization
162
+ for i ∈ 1 : 4 : nblocks- 3
163
+ block_start, counter = _chacha_blocks! (
164
+ buffer, block_start, key, nonce, counter, doublerounds, Val (4 )
165
+ )
166
+ end
174
167
175
- counter += 1
168
+ # The remaining blocks are computed with 128-bit vectorization
169
+ for i ∈ 1 : (nblocks % 4 )
170
+ block_start, counter = _chacha_blocks! (
171
+ buffer, block_start, key, nonce, counter, doublerounds, Val (1 )
172
+ )
176
173
end
177
174
178
175
counter
179
176
end
180
177
178
# Compute the ChaCha block function with N * 128-bit SIMD vectorization
#
# Reference: https://eprint.iacr.org/2013/759.pdf
#
# Fills `N` consecutive blocks of `buffer`, starting at index `block_start`.
# Block `i` (zero-based) is produced with counter value `counter + i`.
#
# Returns a tuple `(next_block_start, next_counter)`: the index just past the
# last word written and the counter advanced by `N`.
#
# Throws a `BoundsError` when `buffer` cannot hold the `N` blocks. The range
# is checked once here, when the view is created.
@inline function _chacha_blocks!(
    buffer::AbstractVector{UInt32}, block_start, key, nonce, counter, doublerounds, ::Val{N}
) where N
    block_end = block_start + N * CHACHA_BLOCK_SIZE_U32 - 1

    # Deliberately NOT wrapped in `@inbounds`: this is the single place where
    # the requested output range is validated against the caller's buffer.
    state = view(buffer, block_start:block_end)

    # Lay out the initial state of every block back to back inside `state`
    for i in 0:(N - 1)
        _chacha_set_initial_state!(
            state, key, nonce, counter + i, i * CHACHA_BLOCK_SIZE_U32 + 1
        )
    end

    _chacha_rounds!(state, doublerounds, Val(N))

    # Add each block's initial state back onto the result of the rounds
    for i in 0:(N - 1)
        _chacha_add_initial_state!(
            state, key, nonce, counter + i, i * CHACHA_BLOCK_SIZE_U32 + 1
        )
    end

    return block_end + 1, counter + N
end
199
+
200
+
201
# Run `doublerounds` double rounds in place on `state`, which holds `N`
# blocks of 16 words stored back to back.
#
# The state is processed as four vectors of 4 * N lanes. For block `b`
# (zero-based), lanes 4b+1 .. 4b+4 of `v0` hold words 16b+1 .. 16b+4 of
# `state`; `v1`, `v2` and `v3` hold the words 4, 8 and 12 positions further
# on. The gather/scatter index vectors depend only on `N`, so they are
# computed once when the method is generated and spliced into the body as
# constants.
@inline @generated function _chacha_rounds!(state, doublerounds, ::Val{N}) where N
    # One-based indices of the first four words of every block
    lane = repeat(1:4, N)
    lane += 16 * ((0:4*N-1) .÷ 4)
    lane = Tuple(lane)

    idx0 = Vec(lane)
    idx1 = Vec(lane .+ 4)
    idx2 = Vec(lane .+ 8)
    idx3 = Vec(lane .+ 12)

    quote
        @inbounds begin
            v0 = vgather(state, $idx0)
            v1 = vgather(state, $idx1)
            v2 = vgather(state, $idx2)
            v3 = vgather(state, $idx3)

            # Perform alternating rounds of columnar
            # quarter-rounds and diagonal quarter-rounds
            for i = 1:doublerounds
                # Columnar round: lane j of (v0, v1, v2, v3) forms one column
                v0, v1, v2, v3 = _QR!(v0, v1, v2, v3)

                # Shift v1, v2, v3 within each block by 1, 2, 3 lanes so that
                # the diagonals line up as columns
                v1 = rotatevector(v1, Val(-1))
                v2 = rotatevector(v2, Val(-2))
                v3 = rotatevector(v3, Val(-3))

                # Diagonal round
                v0, v1, v2, v3 = _QR!(v0, v1, v2, v3)

                # Undo the lane shifts to restore the original layout
                v1 = rotatevector(v1, Val(1))
                v2 = rotatevector(v2, Val(2))
                v3 = rotatevector(v3, Val(3))
            end

            vscatter(v0, state, $idx0)
            vscatter(v1, state, $idx1)
            vscatter(v2, state, $idx2)
            vscatter(v3, state, $idx3)
        end
    end
end
240
+
181
241
function chacha_blocks! (
182
242
buffer:: CuArray , key, nonce:: UInt64 , counter:: UInt64 , nblocks = 1 ; doublerounds = 10
183
243
)
0 commit comments