@@ -73,12 +73,6 @@ impl ChaCha {
73
73
init_chacha ( key, nonce)
74
74
}
75
75
76
// Removed-side helper: unpacks `self.d` into a four-lane u32 vector and
// returns a u64 whose upper 32 bits come from lane 1 and whose lower 32 bits
// come from lane 0.
- #[ inline( always) ]
77
- fn pos64 < M : Machine > ( & self , m : M ) -> u64 {
78
- let d: M :: u32x4 = m. unpack ( self . d ) ;
79
- ( ( d. extract ( 1 ) as u64 ) << 32 ) | d. extract ( 0 ) as u64
80
- }
81
-
82
76
/// Produce 4 blocks of output, advancing the state
83
77
#[ inline( always) ]
84
78
pub fn refill4 ( & mut self , drounds : u32 , out : & mut [ u32 ; BUFSZ ] ) {
@@ -111,44 +105,63 @@ impl ChaCha {
111
105
}
112
106
}
113
107
114
- # [ allow ( clippy :: many_single_char_names ) ]
108
+ // This implementation is platform-independent.
115
109
#[ inline( always) ]
116
- fn refill_wide_impl < Mach : Machine > (
117
- m : Mach , state : & mut ChaCha , drounds : u32 , out : & mut [ u32 ; BUFSZ ] ,
118
- ) {
119
- let k = m. vec ( [ 0x6170_7865 , 0x3320_646e , 0x7962_2d32 , 0x6b20_6574 ] ) ;
120
- let mut pos = state. pos64 ( m) ;
121
- let d0: Mach :: u32x4 = m. unpack ( state. d ) ;
110
/// Returns `d0` with its 64-bit position advanced by `i` (wrapping on overflow).
///
/// The position is read from lane 1 (upper 32 bits) and lane 0 (lower 32 bits),
/// incremented as a scalar, and written back into the same two lanes. The
/// remaining lanes of `d0` are not touched. `_m` is unused in this variant.
#[cfg(target_endian = "big")]
fn add_pos<Mach: Machine>(_m: Mach, d0: Mach::u32x4, i: u64) -> Mach::u32x4 {
    let hi = d0.extract(1) as u64;
    let lo = d0.extract(0) as u64;
    let advanced = ((hi << 32) | lo).wrapping_add(i);
    // Split the sum back into its two 32-bit halves: high half first, then low.
    let with_hi = d0.insert((advanced >> 32) as u32, 1);
    with_hi.insert(advanced as u32, 0)
}
116
/// Builds the four `d` rows for a four-block refill from the stored row `d`.
///
/// Lane 0 of the result is `d` unpacked as-is; lanes 1, 2 and 3 are the same
/// row with the 64-bit position (lane 1 = upper half, lane 0 = lower half)
/// advanced by 1, 2 and 3 respectively, wrapping on overflow.
#[inline(always)]
#[cfg(target_endian = "big")]
fn d0123<Mach: Machine>(m: Mach, d: vec128_storage) -> Mach::u32x4x4 {
    let base: Mach::u32x4 = m.unpack(d);
    let start = ((base.extract(1) as u64) << 32) | base.extract(0) as u64;
    // Copy of `base` whose position lanes hold `start + step`.
    let bump = |step: u64| {
        let pos = start.wrapping_add(step);
        base.insert((pos >> 32) as u32, 1).insert(pos as u32, 0)
    };
    Mach::u32x4x4::from_lanes([base, bump(1), bump(2), bump(3)])
}
129
+
130
+ // Pos is packed into the state vectors as a little-endian u64,
131
+ // so on LE platforms we can use native vector ops to increment it.
132
+ #[ inline( always) ]
133
+ #[ cfg( target_endian = "little" ) ]
134
+ fn add_pos < Mach : Machine > ( m : Mach , d : Mach :: u32x4 , i : u64 ) -> Mach :: u32x4 {
135
+ let d0: Mach :: u64x2 = m. unpack ( d. into ( ) ) ;
136
+ let incr = m. vec ( [ i, 0 ] ) ;
137
+ m. unpack ( ( d0 + incr) . into ( ) )
138
+ }
139
+ #[ inline( always) ]
140
+ #[ cfg( target_endian = "little" ) ]
141
+ fn d0123 < Mach : Machine > ( m : Mach , d : vec128_storage ) -> Mach :: u32x4x4 {
142
+ let d0: Mach :: u64x2 = m. unpack ( d) ;
143
+ let incr = Mach :: u64x2x4:: from_lanes ( [ m. vec ( [ 0 , 0 ] ) , m. vec ( [ 1 , 0 ] ) , m. vec ( [ 2 , 0 ] ) , m. vec ( [ 3 , 0 ] ) ] ) ;
144
+ m. unpack ( ( Mach :: u64x2x4:: from_lanes ( [ d0, d0, d0, d0] ) + incr) . into ( ) )
145
+ }
128
146
147
+ #[ allow( clippy:: many_single_char_names) ]
148
+ #[ inline( always) ]
149
+ fn refill_wide_impl < Mach : Machine > (
150
+ m : Mach , state : & mut ChaCha , drounds : u32 , out : & mut [ u32 ; BUFSZ ] ,
151
+ ) {
152
+ let k = m. vec ( [ 0x6170_7865 , 0x3320_646e , 0x7962_2d32 , 0x6b20_6574 ] ) ;
129
153
let b = m. unpack ( state. b ) ;
130
154
let c = m. unpack ( state. c ) ;
131
155
let mut x = State {
132
156
a : Mach :: u32x4x4:: from_lanes ( [ k, k, k, k] ) ,
133
157
b : Mach :: u32x4x4:: from_lanes ( [ b, b, b, b] ) ,
134
158
c : Mach :: u32x4x4:: from_lanes ( [ c, c, c, c] ) ,
135
- d : m . unpack ( Mach :: u32x4x4 :: from_lanes ( [ d0 , d1 , d2 , d3 ] ) . into ( ) ) ,
159
+ d : d0123 ( m , state . d ) ,
136
160
} ;
137
161
for _ in 0 ..drounds {
138
162
x = round ( x) ;
139
163
x = undiagonalize ( round ( diagonalize ( x) ) ) ;
140
164
}
141
- let mut pos = state. pos64 ( m) ;
142
- let d0: Mach :: u32x4 = m. unpack ( state. d ) ;
143
- pos = pos. wrapping_add ( 1 ) ;
144
- let d1 = d0. insert ( ( pos >> 32 ) as u32 , 1 ) . insert ( pos as u32 , 0 ) ;
145
- pos = pos. wrapping_add ( 1 ) ;
146
- let d2 = d0. insert ( ( pos >> 32 ) as u32 , 1 ) . insert ( pos as u32 , 0 ) ;
147
- pos = pos. wrapping_add ( 1 ) ;
148
- let d3 = d0. insert ( ( pos >> 32 ) as u32 , 1 ) . insert ( pos as u32 , 0 ) ;
149
- pos = pos. wrapping_add ( 1 ) ;
150
- let d4 = d0. insert ( ( pos >> 32 ) as u32 , 1 ) . insert ( pos as u32 , 0 ) ;
151
-
152
165
let ( a, b, c, d) = (
153
166
x. a . to_lanes ( ) ,
154
167
x. b . to_lanes ( ) ,
@@ -157,8 +170,8 @@ fn refill_wide_impl<Mach: Machine>(
157
170
) ;
158
171
let sb = m. unpack ( state. b ) ;
159
172
let sc = m. unpack ( state. c ) ;
160
- let sd = [ m . unpack ( state. d ) , d1 , d2 , d3 ] ;
161
- state. d = d4 . into ( ) ;
173
+ let sd = d0123 ( m , state. d ) . to_lanes ( ) ;
174
+ state. d = add_pos ( m , sd [ 0 ] , 4 ) . into ( ) ;
162
175
out[ 0 ..4 ] . copy_from_slice ( & ( a[ 0 ] + k) . to_lanes ( ) ) ;
163
176
out[ 4 ..8 ] . copy_from_slice ( & ( b[ 0 ] + sb) . to_lanes ( ) ) ;
164
177
out[ 8 ..12 ] . copy_from_slice ( & ( c[ 0 ] + sc) . to_lanes ( ) ) ;
0 commit comments