1
+ use std:: iter:: FromIterator ;
2
+
1
3
use gccjit:: ToRValue ;
2
4
use gccjit:: { BinaryOp , RValue , Type } ;
3
5
#[ cfg( feature = "master" ) ]
@@ -21,6 +23,8 @@ use rustc_target::abi::Align;
21
23
use crate :: builder:: Builder ;
22
24
#[ cfg( feature = "master" ) ]
23
25
use crate :: context:: CodegenCx ;
26
+ #[ cfg( not( feature = "master" ) ) ]
27
+ use crate :: common:: SignType ;
24
28
25
29
pub fn generic_simd_intrinsic < ' a , ' gcc , ' tcx > (
26
30
bx : & mut Builder < ' a , ' gcc , ' tcx > ,
@@ -156,6 +160,195 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(
156
160
return Ok ( compare_simd_types ( bx, arg1, arg2, in_elem, llret_ty, cmp_op) ) ;
157
161
}
158
162
163
+ let simd_bswap = |bx : & mut Builder < ' a , ' gcc , ' tcx > , vector : RValue < ' gcc > | -> RValue < ' gcc > {
164
+ let v_type = vector. get_type ( ) ;
165
+ let vector_type = v_type. unqualified ( ) . dyncast_vector ( ) . expect ( "vector type" ) ;
166
+ let elem_type = vector_type. get_element_type ( ) ;
167
+ let elem_size_bytes = elem_type. get_size ( ) ;
168
+ if elem_size_bytes == 1 {
169
+ return vector;
170
+ }
171
+
172
+ let type_size_bytes = elem_size_bytes as u64 * in_len;
173
+ let shuffle_indices = Vec :: from_iter ( 0 ..type_size_bytes) ;
174
+ let byte_vector_type = bx. context . new_vector_type ( bx. type_u8 ( ) , type_size_bytes) ;
175
+ let byte_vector = bx. context . new_bitcast ( None , args[ 0 ] . immediate ( ) , byte_vector_type) ;
176
+
177
+ #[ cfg( not( feature = "master" ) ) ]
178
+ let shuffled = {
179
+ let new_elements: Vec < _ > = shuffle_indices. chunks_exact ( elem_size_bytes as _ )
180
+ . flat_map ( |x| x. iter ( ) . rev ( ) )
181
+ . map ( |& i| {
182
+ let index = bx. context . new_rvalue_from_long ( bx. u64_type , i as _ ) ;
183
+ bx. extract_element ( byte_vector, index)
184
+ } )
185
+ . collect ( ) ;
186
+
187
+ bx. context . new_rvalue_from_vector ( None , byte_vector_type, & new_elements)
188
+ } ;
189
+ #[ cfg( feature = "master" ) ]
190
+ let shuffled = {
191
+ let indices: Vec < _ > = shuffle_indices. chunks_exact ( elem_size_bytes as _ )
192
+ . flat_map ( |x| x. iter ( ) . rev ( ) )
193
+ . map ( |& i| bx. context . new_rvalue_from_int ( bx. u8_type , i as _ ) )
194
+ . collect ( ) ;
195
+
196
+ let mask = bx. context . new_rvalue_from_vector ( None , byte_vector_type, & indices) ;
197
+ bx. context . new_rvalue_vector_perm ( None , byte_vector, byte_vector, mask)
198
+ } ;
199
+ bx. context . new_bitcast ( None , shuffled, v_type)
200
+ } ;
201
+
202
+ if name == sym:: simd_bswap || name == sym:: simd_bitreverse {
203
+ require ! (
204
+ bx. type_kind( bx. element_type( llret_ty) ) == TypeKind :: Integer ,
205
+ InvalidMonomorphization :: UnsupportedOperation {
206
+ span,
207
+ name,
208
+ in_ty,
209
+ in_elem,
210
+ }
211
+ ) ;
212
+ }
213
+
214
+ if name == sym:: simd_bswap {
215
+ return Ok ( simd_bswap ( bx, args[ 0 ] . immediate ( ) ) ) ;
216
+ }
217
+
218
+ // We use a different algorithm from non-vector bitreverse to take advantage of most
219
+ // processors' vector shuffle units. It works like this:
220
+ // 1. Generate pre-reversed low and high nibbles as a vector.
221
+ // 2. Byte-swap the input.
222
+ // 3. Mask off the low and high nibbles of each byte in the byte-swapped input.
223
+ // 4. Shuffle the pre-reversed low and high-nibbles using the masked nibbles as a shuffle mask.
224
+ // 5. Combine the results of the shuffle back together and cast back to the original type.
225
+ #[ cfg( feature = "master" ) ]
226
+ if name == sym:: simd_bitreverse {
227
+ let vector = args[ 0 ] . immediate ( ) ;
228
+ let v_type = vector. get_type ( ) ;
229
+ let vector_type = v_type. unqualified ( ) . dyncast_vector ( ) . expect ( "vector type" ) ;
230
+ let elem_type = vector_type. get_element_type ( ) ;
231
+ let elem_size_bytes = elem_type. get_size ( ) ;
232
+
233
+ let type_size_bytes = elem_size_bytes as u64 * in_len;
234
+ // We need to ensure at least 16 entries in our vector type, since the pre-reversed vectors
235
+ // we generate below have 16 entries in them. `new_rvalue_vector_perm` requires the mask
236
+ // vector to be of the same length as the source vectors.
237
+ let byte_vector_type_size = type_size_bytes. max ( 16 ) ;
238
+
239
+ let byte_vector_type = bx. context . new_vector_type ( bx. u8_type , type_size_bytes) ;
240
+ let long_byte_vector_type = bx. context . new_vector_type ( bx. u8_type , byte_vector_type_size) ;
241
+
242
+ // Step 1: Generate pre-reversed low and high nibbles as a vector.
243
+ let zero_byte = bx. context . new_rvalue_zero ( bx. u8_type ) ;
244
+ let hi_nibble_elements: Vec < _ > = ( 0u8 ..16 )
245
+ . map ( |x| bx. context . new_rvalue_from_int ( bx. u8_type , x. reverse_bits ( ) as _ ) )
246
+ . chain ( ( 16 ..byte_vector_type_size) . map ( |_| zero_byte) )
247
+ . collect ( ) ;
248
+ let hi_nibble = bx. context . new_rvalue_from_vector ( None , long_byte_vector_type, & hi_nibble_elements) ;
249
+
250
+ let lo_nibble_elements: Vec < _ > = ( 0u8 ..16 )
251
+ . map ( |x| bx. context . new_rvalue_from_int ( bx. u8_type , ( x. reverse_bits ( ) >> 4 ) as _ ) )
252
+ . chain ( ( 16 ..byte_vector_type_size) . map ( |_| zero_byte) )
253
+ . collect ( ) ;
254
+ let lo_nibble = bx. context . new_rvalue_from_vector ( None , long_byte_vector_type, & lo_nibble_elements) ;
255
+
256
+ let mask = bx. context . new_rvalue_from_vector (
257
+ None ,
258
+ long_byte_vector_type,
259
+ & vec ! [ bx. context. new_rvalue_from_int( bx. u8_type, 0x0f ) ; byte_vector_type_size as _] ) ;
260
+
261
+ let four_vec = bx. context . new_rvalue_from_vector (
262
+ None ,
263
+ long_byte_vector_type,
264
+ & vec ! [ bx. context. new_rvalue_from_int( bx. u8_type, 4 ) ; byte_vector_type_size as _] ) ;
265
+
266
+ // Step 2: Byte-swap the input.
267
+ let swapped = simd_bswap ( bx, args[ 0 ] . immediate ( ) ) ;
268
+ let byte_vector = bx. context . new_bitcast ( None , swapped, byte_vector_type) ;
269
+
270
+ // We're going to need to extend the vector with zeros to make sure that the types are the
271
+ // same, since that's what new_rvalue_vector_perm expects.
272
+ let byte_vector = if byte_vector_type_size > type_size_bytes {
273
+ let mut byte_vector_elements = Vec :: with_capacity ( byte_vector_type_size as _ ) ;
274
+ for i in 0 ..type_size_bytes {
275
+ let idx = bx. context . new_rvalue_from_int ( bx. u32_type , i as _ ) ;
276
+ let val = bx. extract_element ( byte_vector, idx) ;
277
+ byte_vector_elements. push ( val) ;
278
+ }
279
+ for _ in type_size_bytes..byte_vector_type_size {
280
+ byte_vector_elements. push ( zero_byte) ;
281
+ }
282
+ bx. context . new_rvalue_from_vector ( None , long_byte_vector_type, & byte_vector_elements)
283
+ } else {
284
+ bx. context . new_bitcast ( None , byte_vector, long_byte_vector_type)
285
+ } ;
286
+
287
+ // Step 3: Mask off the low and high nibbles of each byte in the byte-swapped input.
288
+ let masked_hi = ( byte_vector >> four_vec) & mask;
289
+ let masked_lo = byte_vector & mask;
290
+
291
+ // Step 4: Shuffle the pre-reversed low and high-nibbles using the masked nibbles as a shuffle mask.
292
+ let hi = bx. context . new_rvalue_vector_perm ( None , hi_nibble, hi_nibble, masked_lo) ;
293
+ let lo = bx. context . new_rvalue_vector_perm ( None , lo_nibble, lo_nibble, masked_hi) ;
294
+
295
+ // Step 5: Combine the results of the shuffle back together and cast back to the original type.
296
+ let result = hi | lo;
297
+ let cast_ty = bx. context . new_vector_type ( elem_type, byte_vector_type_size / ( elem_size_bytes as u64 ) ) ;
298
+
299
+ // we might need to truncate if sizeof(v_type) < sizeof(cast_type)
300
+ if type_size_bytes < byte_vector_type_size {
301
+ let cast_result = bx. context . new_bitcast ( None , result, cast_ty) ;
302
+ let elems: Vec < _ > = ( 0 ..in_len)
303
+ . map ( |i| {
304
+ let idx = bx. context . new_rvalue_from_int ( bx. u32_type , i as _ ) ;
305
+ bx. extract_element ( cast_result, idx)
306
+ } )
307
+ . collect ( ) ;
308
+ return Ok ( bx. context . new_rvalue_from_vector ( None , v_type, & elems) )
309
+ } else {
310
+ // avoid the unnecessary truncation as an optimization.
311
+ return Ok ( bx. context . new_bitcast ( None , result, v_type) ) ;
312
+ }
313
+ }
314
+ // since gcc doesn't have vector shuffle methods available in non-patched builds, fallback to
315
+ // component-wise bitreverses if they're not available.
316
+ #[ cfg( not( feature = "master" ) ) ]
317
+ if name == sym:: simd_bitreverse {
318
+ let vector = args[ 0 ] . immediate ( ) ;
319
+ let vector_ty = vector. get_type ( ) ;
320
+ let vector_type = vector_ty. unqualified ( ) . dyncast_vector ( ) . expect ( "vector type" ) ;
321
+ let num_elements = vector_type. get_num_units ( ) ;
322
+
323
+ let elem_type = vector_type. get_element_type ( ) ;
324
+ let elem_size_bytes = elem_type. get_size ( ) ;
325
+ let num_type = elem_type. to_unsigned ( bx. cx ) ;
326
+ let new_elements: Vec < _ > = ( 0 ..num_elements)
327
+ . map ( |idx| {
328
+ let index = bx. context . new_rvalue_from_long ( num_type, idx as _ ) ;
329
+ let extracted_value = bx. extract_element ( vector, index) . to_rvalue ( ) ;
330
+ bx. bit_reverse ( elem_size_bytes as u64 * 8 , extracted_value)
331
+ } )
332
+ . collect ( ) ;
333
+ return Ok ( bx. context . new_rvalue_from_vector ( None , vector_ty, & new_elements) ) ;
334
+ }
335
+
336
+ if name == sym:: simd_ctlz || name == sym:: simd_cttz {
337
+ let vector = args[ 0 ] . immediate ( ) ;
338
+ let elements: Vec < _ > = ( 0 ..in_len)
339
+ . map ( |i| {
340
+ let index = bx. context . new_rvalue_from_long ( bx. i32_type , i as i64 ) ;
341
+ let value = bx. extract_element ( vector, index) . to_rvalue ( ) ;
342
+ if name == sym:: simd_ctlz {
343
+ bx. count_leading_zeroes ( value. get_type ( ) . get_size ( ) as u64 * 8 , value)
344
+ } else {
345
+ bx. count_trailing_zeroes ( value. get_type ( ) . get_size ( ) as u64 * 8 , value)
346
+ }
347
+ } )
348
+ . collect ( ) ;
349
+ return Ok ( bx. context . new_rvalue_from_vector ( None , vector. get_type ( ) , & elements) ) ;
350
+ }
351
+
159
352
if name == sym:: simd_shuffle {
160
353
// Make sure this is actually an array, since typeck only checks the length-suffixed
161
354
// version of this intrinsic.
0 commit comments