Skip to content

Commit 6e6cc35

Browse files
committed
rand_chacha: improve performance of incrementing the pos counter on little-endian platforms
On little-endian platforms we can use native vector operations to increment the pos counter, because it is packed into the state vector as a little-endian u64.
1 parent 320acef commit 6e6cc35

File tree

2 files changed

+41
-28
lines changed

2 files changed

+41
-28
lines changed

rand_chacha/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ edition = "2018"
1616

1717
[dependencies]
1818
rand_core = { path = "../rand_core", version = "0.6.0" }
19-
ppv-lite86 = { version = "0.2.8", default-features = false, features = ["simd"] }
19+
ppv-lite86 = { version = "0.2.14", default-features = false, features = ["simd"] }
2020
serde = { version = "1.0", features = ["derive"], optional = true }
2121

2222
[dev-dependencies]

rand_chacha/src/guts.rs

Lines changed: 40 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,6 @@ impl ChaCha {
7373
init_chacha(key, nonce)
7474
}
7575

76-
#[inline(always)]
77-
fn pos64<M: Machine>(&self, m: M) -> u64 {
78-
let d: M::u32x4 = m.unpack(self.d);
79-
((d.extract(1) as u64) << 32) | d.extract(0) as u64
80-
}
81-
8276
/// Produce 4 blocks of output, advancing the state
8377
#[inline(always)]
8478
pub fn refill4(&mut self, drounds: u32, out: &mut [u32; BUFSZ]) {
@@ -111,44 +105,63 @@ impl ChaCha {
111105
}
112106
}
113107

114-
#[allow(clippy::many_single_char_names)]
108+
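// This implementation is platform-independent, but it is only compiled on big-endian targets; little-endian targets use the native vector-op version below.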
115109
#[inline(always)]
116-
fn refill_wide_impl<Mach: Machine>(
117-
m: Mach, state: &mut ChaCha, drounds: u32, out: &mut [u32; BUFSZ],
118-
) {
119-
let k = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]);
120-
let mut pos = state.pos64(m);
121-
let d0: Mach::u32x4 = m.unpack(state.d);
110+
#[cfg(target_endian = "big")]
111+
fn add_pos<Mach: Machine>(_m: Mach, d0: Mach::u32x4, i: u64) -> Mach::u32x4 {
112+
let pos0 = ((d0.extract(1) as u64) << 32) | d0.extract(0) as u64;
113+
let pos = pos0.wrapping_add(i);
114+
d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0)
115+
}
116+
#[inline(always)]
117+
#[cfg(target_endian = "big")]
118+
fn d0123<Mach: Machine>(m: Mach, d: vec128_storage) -> Mach::u32x4x4 {
119+
let d0: Mach::u32x4 = m.unpack(d);
120+
let mut pos = ((d0.extract(1) as u64) << 32) | d0.extract(0) as u64;
122121
pos = pos.wrapping_add(1);
123122
let d1 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
124123
pos = pos.wrapping_add(1);
125124
let d2 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
126125
pos = pos.wrapping_add(1);
127126
let d3 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
127+
Mach::u32x4x4::from_lanes([d0, d1, d2, d3])
128+
}
129+
130+
// Pos is packed into the state vectors as a little-endian u64,
131+
// so on LE platforms we can use native vector ops to increment it.
132+
#[inline(always)]
133+
#[cfg(target_endian = "little")]
134+
fn add_pos<Mach: Machine>(m: Mach, d: Mach::u32x4, i: u64) -> Mach::u32x4 {
135+
let d0: Mach::u64x2 = m.unpack(d.into());
136+
let incr = m.vec([i, 0]);
137+
m.unpack((d0 + incr).into())
138+
}
139+
#[inline(always)]
140+
#[cfg(target_endian = "little")]
141+
fn d0123<Mach: Machine>(m: Mach, d: vec128_storage) -> Mach::u32x4x4 {
142+
let d0: Mach::u64x2 = m.unpack(d);
143+
let incr = Mach::u64x2x4::from_lanes([m.vec([0, 0]), m.vec([1, 0]), m.vec([2, 0]), m.vec([3, 0])]);
144+
m.unpack((Mach::u64x2x4::from_lanes([d0, d0, d0, d0]) + incr).into())
145+
}
128146

147+
#[allow(clippy::many_single_char_names)]
148+
#[inline(always)]
149+
fn refill_wide_impl<Mach: Machine>(
150+
m: Mach, state: &mut ChaCha, drounds: u32, out: &mut [u32; BUFSZ],
151+
) {
152+
let k = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]);
129153
let b = m.unpack(state.b);
130154
let c = m.unpack(state.c);
131155
let mut x = State {
132156
a: Mach::u32x4x4::from_lanes([k, k, k, k]),
133157
b: Mach::u32x4x4::from_lanes([b, b, b, b]),
134158
c: Mach::u32x4x4::from_lanes([c, c, c, c]),
135-
d: m.unpack(Mach::u32x4x4::from_lanes([d0, d1, d2, d3]).into()),
159+
d: d0123(m, state.d),
136160
};
137161
for _ in 0..drounds {
138162
x = round(x);
139163
x = undiagonalize(round(diagonalize(x)));
140164
}
141-
let mut pos = state.pos64(m);
142-
let d0: Mach::u32x4 = m.unpack(state.d);
143-
pos = pos.wrapping_add(1);
144-
let d1 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
145-
pos = pos.wrapping_add(1);
146-
let d2 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
147-
pos = pos.wrapping_add(1);
148-
let d3 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
149-
pos = pos.wrapping_add(1);
150-
let d4 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
151-
152165
let (a, b, c, d) = (
153166
x.a.to_lanes(),
154167
x.b.to_lanes(),
@@ -157,8 +170,8 @@ fn refill_wide_impl<Mach: Machine>(
157170
);
158171
let sb = m.unpack(state.b);
159172
let sc = m.unpack(state.c);
160-
let sd = [m.unpack(state.d), d1, d2, d3];
161-
state.d = d4.into();
173+
let sd = d0123(m, state.d).to_lanes();
174+
state.d = add_pos(m, sd[0], 4).into();
162175
out[0..4].copy_from_slice(&(a[0] + k).to_lanes());
163176
out[4..8].copy_from_slice(&(b[0] + sb).to_lanes());
164177
out[8..12].copy_from_slice(&(c[0] + sc).to_lanes());

0 commit comments

Comments
 (0)