@@ -31,7 +31,7 @@ macro_rules! wmul_impl {
31
31
} ;
32
32
33
33
// simd bulk implementation
34
- ( $( ( $ty: ident, $wide: ident ) , ) +, $shift: expr) => {
34
+ ( $( ( $ty: ident, $wide: ty ) , ) +, $shift: expr) => {
35
35
$(
36
36
impl WideningMultiply for $ty {
37
37
type Output = ( $ty, $ty) ;
@@ -152,7 +152,8 @@ mod simd_wmul {
152
152
( u8x4, u16x4) ,
153
153
( u8x8, u16x8) ,
154
154
( u8x16, u16x16) ,
155
- ( u8x32, u16x32) , ,
155
+ ( u8x32, u16x32) ,
156
+ ( u8x64, Simd <u16 , 64 >) , ,
156
157
8
157
158
}
158
159
@@ -162,6 +163,8 @@ mod simd_wmul {
162
163
wmul_impl ! { ( u16x8, u32x8) , , 16 }
163
164
#[ cfg( not( target_feature = "avx2" ) ) ]
164
165
wmul_impl ! { ( u16x16, u32x16) , , 16 }
166
+ #[ cfg( not( target_feature = "avx512bw" ) ) ]
167
+ wmul_impl ! { ( u16x32, Simd <u32 , 32 >) , , 16 }
165
168
166
169
// 16-bit lane widths allow use of the x86 `mulhi` instructions, which
167
170
// means `wmul` can be implemented with only two instructions.
@@ -191,15 +194,11 @@ mod simd_wmul {
191
194
wmul_impl ! {
192
195
( u32x2, u64x2) ,
193
196
( u32x4, u64x4) ,
194
- ( u32x8, u64x8) , ,
197
+ ( u32x8, u64x8) ,
198
+ ( u32x16, Simd <u64 , 16 >) , ,
195
199
32
196
200
}
197
201
198
- // TODO: optimize, this seems to seriously slow things down
199
- wmul_impl_large ! { ( u8x64, ) u8 , 4 }
200
- #[ cfg( not( target_feature = "avx512bw" ) ) ]
201
- wmul_impl_large ! { ( u16x32, ) u16 , 8 }
202
- wmul_impl_large ! { ( u32x16, ) u32 , 16 }
203
202
wmul_impl_large ! { ( u64x2, u64x4, u64x8, ) u64 , 32 }
204
203
}
205
204
0 commit comments