|
18 | 18 | //! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
|
19 | 19 | //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
|
20 | 20 |
|
21 |
| -use core::hint::unreachable_unchecked; |
| 21 | + |
22 | 22 |
|
23 | 23 | use crate::core_arch::{simd::*, x86::*};
|
24 | 24 | use crate::intrinsics::simd::*;
|
@@ -170,158 +170,74 @@ pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
|
170 | 170 | #[stable(feature = "simd_x86", since = "1.27.0")]
|
171 | 171 | pub fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
|
172 | 172 | static_assert_uimm_bits!(IMM8, 8);
|
| 173 | + |
| 174 | + |
| 175 | + // If palignr is shifting the pair of vectors more than the size of two |
| 176 | + // lanes, emit zero. |
| 177 | + if IMM8 >= 32 { |
| 178 | + return _mm256_setzero_si256(); |
| 179 | + } |
| 180 | + // If palignr is shifting the pair of input vectors more than one lane, |
| 181 | + // but less than two lanes, convert to shifting in zeroes. |
| 182 | + let (a, b) = if IMM8 > 16 { |
| 183 | + (_mm256_setzero_si256(), a) |
| 184 | + } else { |
| 185 | + (a, b) |
| 186 | + }; |
173 | 187 | unsafe {
|
174 |
| - // If palignr is shifting the pair of vectors more than the size of two |
175 |
| - // lanes, emit zero. |
176 |
| - if IMM8 >= 32 { |
177 |
| - return _mm256_setzero_si256(); |
178 |
| - } |
179 |
| - // If palignr is shifting the pair of input vectors more than one lane, |
180 |
| - // but less than two lanes, convert to shifting in zeroes. |
181 |
| - let (a, b) = if IMM8 > 16 { |
182 |
| - (_mm256_setzero_si256(), a) |
183 |
| - } else { |
184 |
| - (a, b) |
185 |
| - }; |
186 |
| - |
187 |
| - let a = a.as_i8x32(); |
188 |
| - let b = b.as_i8x32(); |
189 |
| - |
190 |
| - if IMM8 == 16 { |
191 |
| - return transmute(a); |
192 |
| - } |
193 |
| - |
194 |
| - let r: i8x32 = match IMM8 % 16 { |
195 |
| - 0 => simd_shuffle!( |
196 |
| - b, |
197 |
| - a, |
198 |
| - [ |
199 |
| - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, |
200 |
| - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, |
201 |
| - ], |
202 |
| - ), |
203 |
| - 1 => simd_shuffle!( |
204 |
| - b, |
205 |
| - a, |
206 |
| - [ |
207 |
| - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, |
208 |
| - 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, |
209 |
| - ], |
210 |
| - ), |
211 |
| - 2 => simd_shuffle!( |
212 |
| - b, |
213 |
| - a, |
214 |
| - [ |
215 |
| - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, |
216 |
| - 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, |
217 |
| - ], |
218 |
| - ), |
219 |
| - 3 => simd_shuffle!( |
220 |
| - b, |
221 |
| - a, |
222 |
| - [ |
223 |
| - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, |
224 |
| - 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, |
225 |
| - ], |
226 |
| - ), |
227 |
| - 4 => simd_shuffle!( |
228 |
| - b, |
229 |
| - a, |
230 |
| - [ |
231 |
| - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, |
232 |
| - 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, |
233 |
| - ], |
234 |
| - ), |
235 |
| - 5 => simd_shuffle!( |
236 |
| - b, |
237 |
| - a, |
238 |
| - [ |
239 |
| - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, |
240 |
| - 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, |
241 |
| - ], |
242 |
| - ), |
243 |
| - 6 => simd_shuffle!( |
244 |
| - b, |
245 |
| - a, |
246 |
| - [ |
247 |
| - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, |
248 |
| - 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, |
249 |
| - ], |
250 |
| - ), |
251 |
| - 7 => simd_shuffle!( |
252 |
| - b, |
253 |
| - a, |
254 |
| - [ |
255 |
| - 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, |
256 |
| - 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, |
257 |
| - ], |
258 |
| - ), |
259 |
| - 8 => simd_shuffle!( |
260 |
| - b, |
261 |
| - a, |
262 |
| - [ |
263 |
| - 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, |
264 |
| - 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, |
265 |
| - ], |
266 |
| - ), |
267 |
| - 9 => simd_shuffle!( |
268 |
| - b, |
269 |
| - a, |
270 |
| - [ |
271 |
| - 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, |
272 |
| - 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, |
273 |
| - ], |
274 |
| - ), |
275 |
| - 10 => simd_shuffle!( |
276 |
| - b, |
277 |
| - a, |
278 |
| - [ |
279 |
| - 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, |
280 |
| - 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, |
281 |
| - ], |
282 |
| - ), |
283 |
| - 11 => simd_shuffle!( |
284 |
| - b, |
285 |
| - a, |
286 |
| - [ |
287 |
| - 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, |
288 |
| - 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, |
289 |
| - ], |
290 |
| - ), |
291 |
| - 12 => simd_shuffle!( |
292 |
| - b, |
293 |
| - a, |
294 |
| - [ |
295 |
| - 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, |
296 |
| - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, |
297 |
| - ], |
298 |
| - ), |
299 |
| - 13 => simd_shuffle!( |
300 |
| - b, |
301 |
| - a, |
302 |
| - [ |
303 |
| - 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, |
304 |
| - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, |
305 |
| - ], |
306 |
| - ), |
307 |
| - 14 => simd_shuffle!( |
308 |
| - b, |
309 |
| - a, |
310 |
| - [ |
311 |
| - 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, |
312 |
| - 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, |
313 |
| - ], |
314 |
| - ), |
315 |
| - 15 => simd_shuffle!( |
316 |
| - b, |
317 |
| - a, |
318 |
| - [ |
319 |
| - 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, |
320 |
| - 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, |
321 |
| - ], |
322 |
| - ), |
323 |
| - _ => unreachable_unchecked(), |
324 |
| - }; |
| 188 | + if IMM8 == 16 { |
| 189 | + return transmute(a) |
| 190 | + } |
| 191 | + } |
| 192 | + const fn mask(shift: u32, i: u32) -> u32 { |
| 193 | + let shift = shift % 16; |
| 194 | + let mod_i = i%16; |
| 195 | + if mod_i < (16 - shift) { |
| 196 | + i + shift |
| 197 | + } else { |
| 198 | + i + 16 + shift |
| 199 | + } |
| 200 | + } |
| 201 | + |
| 202 | + unsafe { |
| 203 | + let r: i8x32 = simd_shuffle!( |
| 204 | + b.as_i8x32(), |
| 205 | + a.as_i8x32(), |
| 206 | + [ |
| 207 | + mask(IMM8 as u32, 0), |
| 208 | + mask(IMM8 as u32, 1), |
| 209 | + mask(IMM8 as u32, 2), |
| 210 | + mask(IMM8 as u32, 3), |
| 211 | + mask(IMM8 as u32, 4), |
| 212 | + mask(IMM8 as u32, 5), |
| 213 | + mask(IMM8 as u32, 6), |
| 214 | + mask(IMM8 as u32, 7), |
| 215 | + mask(IMM8 as u32, 8), |
| 216 | + mask(IMM8 as u32, 9), |
| 217 | + mask(IMM8 as u32, 10), |
| 218 | + mask(IMM8 as u32, 11), |
| 219 | + mask(IMM8 as u32, 12), |
| 220 | + mask(IMM8 as u32, 13), |
| 221 | + mask(IMM8 as u32, 14), |
| 222 | + mask(IMM8 as u32, 15), |
| 223 | + mask(IMM8 as u32, 16), |
| 224 | + mask(IMM8 as u32, 17), |
| 225 | + mask(IMM8 as u32, 18), |
| 226 | + mask(IMM8 as u32, 19), |
| 227 | + mask(IMM8 as u32, 20), |
| 228 | + mask(IMM8 as u32, 21), |
| 229 | + mask(IMM8 as u32, 22), |
| 230 | + mask(IMM8 as u32, 23), |
| 231 | + mask(IMM8 as u32, 24), |
| 232 | + mask(IMM8 as u32, 25), |
| 233 | + mask(IMM8 as u32, 26), |
| 234 | + mask(IMM8 as u32, 27), |
| 235 | + mask(IMM8 as u32, 28), |
| 236 | + mask(IMM8 as u32, 29), |
| 237 | + mask(IMM8 as u32, 30), |
| 238 | + mask(IMM8 as u32, 31), |
| 239 | + ], |
| 240 | + ); |
325 | 241 | transmute(r)
|
326 | 242 | }
|
327 | 243 | }
|
|
0 commit comments