Skip to content

Commit d60ab38

Browse files
authored
optimize simd wmul
1 parent 2c16a92 commit d60ab38

File tree

1 file changed

+7
-8
lines changed

1 file changed

+7
-8
lines changed

src/distributions/utils.rs

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ macro_rules! wmul_impl {
3131
};
3232

3333
// simd bulk implementation
34-
($(($ty:ident, $wide:ident),)+, $shift:expr) => {
34+
($(($ty:ident, $wide:ty),)+, $shift:expr) => {
3535
$(
3636
impl WideningMultiply for $ty {
3737
type Output = ($ty, $ty);
@@ -152,7 +152,8 @@ mod simd_wmul {
152152
(u8x4, u16x4),
153153
(u8x8, u16x8),
154154
(u8x16, u16x16),
155-
(u8x32, u16x32),,
155+
(u8x32, u16x32),
156+
(u8x64, Simd<u16, 64>),,
156157
8
157158
}
158159

@@ -162,6 +163,8 @@ mod simd_wmul {
162163
wmul_impl! { (u16x8, u32x8),, 16 }
163164
#[cfg(not(target_feature = "avx2"))]
164165
wmul_impl! { (u16x16, u32x16),, 16 }
166+
#[cfg(not(target_feature = "avx512bw"))]
167+
wmul_impl! { (u16x32, Simd<u32, 32>),, 16 }
165168

166169
// 16-bit lane widths allow use of the x86 `mulhi` instructions, which
167170
// means `wmul` can be implemented with only two instructions.
@@ -191,15 +194,11 @@ mod simd_wmul {
191194
wmul_impl! {
192195
(u32x2, u64x2),
193196
(u32x4, u64x4),
194-
(u32x8, u64x8),,
197+
(u32x8, u64x8),
198+
(u32x16, Simd<u64, 16>),,
195199
32
196200
}
197201

198-
// TODO: optimize, this seems to seriously slow things down
199-
wmul_impl_large! { (u8x64,) u8, 4 }
200-
#[cfg(not(target_feature = "avx512bw"))]
201-
wmul_impl_large! { (u16x32,) u16, 8 }
202-
wmul_impl_large! { (u32x16,) u32, 16 }
203202
wmul_impl_large! { (u64x2, u64x4, u64x8,) u64, 32 }
204203
}
205204

0 commit comments

Comments
 (0)