Skip to content

Commit fafef7d

Browse files
committed
Add x86_palignr instructions
This instruction is necessary for implementing `[s|u]widen_high`.
1 parent 0e5e8a6 commit fafef7d

File tree

5 files changed

+36
-4
lines changed

5 files changed

+36
-4
lines changed

cranelift/codegen/meta/src/isa/x86/encodings.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1697,6 +1697,7 @@ fn define_simd(
16971697
let x86_pminu = x86.by_name("x86_pminu");
16981698
let x86_pmullq = x86.by_name("x86_pmullq");
16991699
let x86_pmuludq = x86.by_name("x86_pmuludq");
1700+
let x86_palignr = x86.by_name("x86_palignr");
17001701
let x86_pshufb = x86.by_name("x86_pshufb");
17011702
let x86_pshufd = x86.by_name("x86_pshufd");
17021703
let x86_psll = x86.by_name("x86_psll");
@@ -1901,6 +1902,8 @@ fn define_simd(
19011902
rec_fa.opcodes(low),
19021903
);
19031904
}
1905+
1906+
// SIMD narrow/widen
19041907
for (ty, opcodes) in &[(I16, &PACKSSWB), (I32, &PACKSSDW)] {
19051908
let snarrow = snarrow.bind(vector(*ty, sse_vector_size));
19061909
e.enc_both_inferred(snarrow, rec_fa.opcodes(*opcodes));
@@ -1912,6 +1915,13 @@ fn define_simd(
19121915
let unarrow = unarrow.bind(vector(*ty, sse_vector_size));
19131916
e.enc_both_inferred_maybe_isap(unarrow, rec_fa.opcodes(*opcodes), *isap);
19141917
}
1918+
for ty in &[I8, I16, I32, I64] {
1919+
e.enc_both_inferred_maybe_isap(
1920+
x86_palignr.bind(vector(*ty, sse_vector_size)),
1921+
rec_fa_ib.opcodes(&PALIGNR[..]),
1922+
Some(use_ssse3_simd),
1923+
);
1924+
}
19151925

19161926
// SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8).
19171927
for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {

cranelift/codegen/meta/src/isa/x86/instructions.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,21 @@ pub(crate) fn define(
664664
.operands_out(vec![a]),
665665
);
666666

667+
let c = &Operand::new("c", uimm8)
668+
.with_doc("The number of bytes to shift right; see PALIGNR in Intel manual for details");
669+
ig.push(
670+
Inst::new(
671+
"x86_palignr",
672+
r#"
673+
Concatenate destination and source operands, extracting a byte-aligned result shifted to
674+
the right by `c`.
675+
"#,
676+
&formats.ternary_imm8,
677+
)
678+
.operands_in(vec![x, y, c])
679+
.operands_out(vec![a]),
680+
);
681+
667682
let i64_t = &TypeVar::new(
668683
"i64_t",
669684
"A scalar 64bit integer",

cranelift/codegen/meta/src/isa/x86/opcodes.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,10 @@ pub static PADDUSB: [u8; 3] = [0x66, 0x0f, 0xdc];
354354
/// Add packed unsigned word integers from xmm2/m128 and xmm1, and saturate the results (SSE).
355355
pub static PADDUSW: [u8; 3] = [0x66, 0x0f, 0xdd];
356356

357+
/// Concatenate destination and source operands, extract a byte-aligned result into xmm1 that is
358+
/// shifted to the right by the constant number of bytes in imm8 (SSSE3).
359+
pub static PALIGNR: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0f];
360+
357361
/// Bitwise AND of xmm2/m128 and xmm1 (SSE2).
358362
pub static PAND: [u8; 3] = [0x66, 0x0f, 0xdb];
359363

cranelift/codegen/src/isa/aarch64/lower_inst.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2133,6 +2133,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
21332133
| Opcode::X86Insertps
21342134
| Opcode::X86Movsd
21352135
| Opcode::X86Movlhps
2136+
| Opcode::X86Palignr
21362137
| Opcode::X86Psll
21372138
| Opcode::X86Psrl
21382139
| Opcode::X86Psra
Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
test binemit
2-
target x86_64
2+
set enable_simd
3+
target x86_64 has_ssse3=true
34

45
; Ensure raw_bitcast emits no instructions.
56
function %raw_bitcast_i16x8_to_b32x4() {
@@ -10,8 +11,9 @@ block0:
1011
return
1112
}
1213

13-
function %fcvt_32(i32x4) {
14-
block0(v0: i32x4 [%xmm6]):
15-
[-, %xmm2] v1 = fcvt_from_sint.f32x4 v0 ; bin: 40 0f 5b d6
14+
function %conversions_i32x4(i32x4, i32x4) {
15+
block0(v0: i32x4 [%xmm6], v1: i32x4 [%xmm4]):
16+
[-, %xmm2] v2 = fcvt_from_sint.f32x4 v0 ; bin: 40 0f 5b d6
17+
[-, %xmm6] v3 = x86_palignr v0, v1, 3 ; bin: 66 0f 3a 0f f4 03
1618
return
1719
}

0 commit comments

Comments
 (0)