Skip to content

Commit ba950f2

Browse files
authored
Winch: i8x16.shuffle for x64 with AVX (bytecodealliance#9959)
* i8x16_shuffle for x64 with AVX512
* Use AVX2 instructions instead
* Change panic to error
* Forgot to update callsite
* Ignore SIMD misc test on non-AVX hardware
* Add xmm_ prefix to function name
* Use early exit in shuffle
1 parent 86184ec commit ba950f2

File tree

9 files changed

+1411
-3
lines changed

9 files changed

+1411
-3
lines changed

crates/wast-util/src/lib.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,10 @@ impl WastTest {
500500
// SIMD on Winch requires AVX instructions.
501501
#[cfg(target_arch = "x86_64")]
502502
if !(std::is_x86_feature_detected!("avx") && std::is_x86_feature_detected!("avx2")) {
503-
let unsupported = ["spec_testsuite/simd_align.wast"];
503+
let unsupported = [
504+
"misc_testsuite/winch/_simd_lane.wast",
505+
"spec_testsuite/simd_align.wast",
506+
];
504507

505508
if unsupported.iter().any(|part| self.path.ends_with(part)) {
506509
return true;
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
;;! target = "x86_64"
2+
;;! test = "winch"
3+
;;! flags = [ "-Ccranelift-has-avx" ]
4+
5+
(module
6+
(func (result v128)
7+
v128.const i64x2 1 2
8+
v128.const i64x2 2 1
9+
i8x16.shuffle 0 1 2 3 20 21 22 23 4 5 6 7 24 25 26 27
10+
)
11+
)
12+
;; wasm[0]::function[0]:
13+
;; pushq %rbp
14+
;; movq %rsp, %rbp
15+
;; movq 8(%rdi), %r11
16+
;; movq 0x10(%r11), %r11
17+
;; addq $0x10, %r11
18+
;; cmpq %rsp, %r11
19+
;; ja 0x5c
20+
;; 1c: movq %rdi, %r14
21+
;; subq $0x10, %rsp
22+
;; movq %rdi, 8(%rsp)
23+
;; movq %rsi, (%rsp)
24+
;; movdqu 0x2c(%rip), %xmm0
25+
;; movdqu 0x34(%rip), %xmm1
26+
;; vpshufb 0x3b(%rip), %xmm1, %xmm1
27+
;; vpshufb 0x42(%rip), %xmm0, %xmm15
28+
;; vpor %xmm1, %xmm15, %xmm1
29+
;; movdqa %xmm1, %xmm0
30+
;; addq $0x10, %rsp
31+
;; popq %rbp
32+
;; retq
33+
;; 5c: ud2
34+
;; 5e: addb %al, (%rax)
35+
;; 60: addb (%rax), %al
36+
;; 62: addb %al, (%rax)
37+
;; 64: addb %al, (%rax)
38+
;; 66: addb %al, (%rax)
39+
;; 68: addl %eax, (%rax)
40+
;; 6a: addb %al, (%rax)
41+
;; 6c: addb %al, (%rax)
42+
;; 6e: addb %al, (%rax)
43+
;; 70: addl %eax, (%rax)
44+
;; 72: addb %al, (%rax)
45+
;; 74: addb %al, (%rax)
46+
;; 76: addb %al, (%rax)
47+
;; 78: addb (%rax), %al
48+
;; 7a: addb %al, (%rax)
49+
;; 7c: addb %al, (%rax)
50+
;; 7e: addb %al, (%rax)
51+
;; 80: addb %al, (%rcx)
52+
;; 82: addb (%rbx), %al
53+
;; 84: addb $6, 0x5048080(%rax)

tests/misc_testsuite/winch/_simd_lane.wast

Lines changed: 1267 additions & 0 deletions
Large diffs are not rendered by default.

winch/codegen/src/isa/aarch64/masm.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -894,6 +894,10 @@ impl Masm for MacroAssembler {
894894
Err(anyhow!(CodeGenError::unimplemented_masm_instruction()))
895895
}
896896

897+
fn shuffle(&mut self, _dst: WritableReg, _lhs: Reg, _rhs: Reg, _lanes: [u8; 16]) -> Result<()> {
898+
bail!(CodeGenError::unimplemented_masm_instruction())
899+
}
900+
897901
fn atomic_rmw(
898902
&mut self,
899903
_addr: Self::Address,

winch/codegen/src/isa/x64/asm.rs

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use cranelift_codegen::{
1919
self, AluRmiROpcode, Amode, AvxOpcode, CmpOpcode, DivSignedness, ExtMode,
2020
FenceKind, FromWritableReg, Gpr, GprMem, GprMemImm, Imm8Gpr, Imm8Reg, RegMem,
2121
RegMemImm, ShiftKind as CraneliftShiftKind, SseOpcode, SyntheticAmode, WritableGpr,
22-
WritableXmm, Xmm, XmmMem, XmmMemAligned, CC,
22+
WritableXmm, Xmm, XmmMem, XmmMemAligned, XmmMemImm, CC,
2323
},
2424
encoding::rex::{encode_modrm, RexFlags},
2525
settings as x64_settings, EmitInfo, EmitState, Inst,
@@ -1509,6 +1509,35 @@ impl Assembler {
15091509
});
15101510
}
15111511

1512+
/// Shuffles bytes in `src` according to contents of `mask` and puts
1513+
/// result in `dst`.
1514+
pub fn xmm_vpshufb_rrm(&mut self, dst: WritableReg, src: Reg, mask: &Address) {
1515+
let mask = Self::to_synthetic_amode(
1516+
mask,
1517+
&mut self.pool,
1518+
&mut self.constants,
1519+
&mut self.buffer,
1520+
MemFlags::trusted(),
1521+
);
1522+
1523+
self.emit(Inst::XmmRmiRVex {
1524+
op: args::AvxOpcode::Vpshufb,
1525+
src1: src.into(),
1526+
src2: XmmMemImm::unwrap_new(RegMemImm::Mem { addr: mask }),
1527+
dst: dst.to_reg().into(),
1528+
});
1529+
}
1530+
1531+
/// Bitwise OR of `src1` and `src2`.
1532+
pub fn vpor(&mut self, dst: WritableReg, src1: Reg, src2: Reg) {
1533+
self.emit(Inst::XmmRmiRVex {
1534+
op: args::AvxOpcode::Vpor,
1535+
src1: src1.into(),
1536+
src2: XmmMemImm::unwrap_new(src2.into()),
1537+
dst: dst.to_reg().into(),
1538+
})
1539+
}
1540+
15121541
pub fn fence(&mut self, kind: FenceKind) {
15131542
self.emit(Inst::Fence { kind });
15141543
}

winch/codegen/src/isa/x64/masm.rs

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1286,6 +1286,36 @@ impl Masm for MacroAssembler {
12861286
Ok(())
12871287
}
12881288

1289+
fn shuffle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, lanes: [u8; 16]) -> Result<()> {
1290+
if !self.flags.has_avx() {
1291+
bail!(CodeGenError::UnimplementedForNoAvx)
1292+
}
1293+
1294+
// Use `vpshufb` with `lanes` to set the lanes in `lhs` and `rhs`
1295+
// separately to either the byte at the selected index or 0.
1296+
// Then use `vpor` to combine `lhs` and `rhs` into `dst`.
1297+
// Setting the most significant bit in the mask's lane to 1 will
1298+
// result in the corresponding lane in the destination register being
1299+
// set to 0. 0x80 sets the most significant bit to 1.
1300+
let mut mask_lhs: [u8; 16] = [0x80; 16];
1301+
let mut mask_rhs: [u8; 16] = [0x80; 16];
1302+
for i in 0..lanes.len() {
1303+
if lanes[i] < 16 {
1304+
mask_lhs[i] = lanes[i];
1305+
} else {
1306+
mask_rhs[i] = lanes[i] - 16;
1307+
}
1308+
}
1309+
let mask_lhs = self.asm.add_constant(&mask_lhs);
1310+
let mask_rhs = self.asm.add_constant(&mask_rhs);
1311+
1312+
self.asm.xmm_vpshufb_rrm(dst, lhs, &mask_lhs);
1313+
let scratch = writable!(regs::scratch_xmm());
1314+
self.asm.xmm_vpshufb_rrm(scratch, rhs, &mask_rhs);
1315+
self.asm.vpor(dst, dst.to_reg(), scratch.to_reg());
1316+
Ok(())
1317+
}
1318+
12891319
fn atomic_rmw(
12901320
&mut self,
12911321
addr: Self::Address,
@@ -1309,7 +1339,6 @@ impl Masm for MacroAssembler {
13091339
}
13101340
}
13111341
}
1312-
13131342
Ok(())
13141343
}
13151344
}

winch/codegen/src/masm.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1257,6 +1257,10 @@ pub(crate) trait MacroAssembler {
12571257
fn mul_wide(&mut self, context: &mut CodeGenContext<Emission>, kind: MulWideKind)
12581258
-> Result<()>;
12591259

1260+
/// Performs a shuffle between two 128-bit vectors into a 128-bit result
1261+
/// using `lanes` to select the bytes to copy: indexes 0-15 pick from `lhs`, 16-31 from `rhs`.
1262+
fn shuffle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, lanes: [u8; 16]) -> Result<()>;
1263+
12601264
/// Performs the RMW `op` operation on the passed `addr`.
12611265
///
12621266
/// The value *before* the operation was performed is written back to the `operand` register.

winch/codegen/src/stack.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,14 @@ impl TypedReg {
5151
reg,
5252
}
5353
}
54+
55+
/// Create a v128 [`TypedReg`].
56+
pub fn v128(reg: Reg) -> Self {
57+
Self {
58+
ty: WasmValType::V128,
59+
reg,
60+
}
61+
}
5462
}
5563

5664
impl From<TypedReg> for Reg {

winch/codegen/src/visitor.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@ macro_rules! def_unsupported {
285285
(emit I64AtomicRmw16AddU $($rest:tt)*) => {};
286286
(emit I64AtomicRmw32AddU $($rest:tt)*) => {};
287287
(emit I64AtomicRmwAdd $($rest:tt)*) => {};
288+
(emit I8x16Shuffle $($rest:tt)*) => {};
288289

289290
(emit $unsupported:tt $($rest:tt)*) => {$($rest)*};
290291
}
@@ -2471,6 +2472,16 @@ where
24712472
)
24722473
}
24732474

2475+
fn visit_i8x16_shuffle(&mut self, lanes: [u8; 16]) -> Self::Output {
2476+
let rhs = self.context.pop_to_reg(self.masm, None)?;
2477+
let lhs = self.context.pop_to_reg(self.masm, None)?;
2478+
self.masm
2479+
.shuffle(writable!(lhs.into()), lhs.into(), rhs.into(), lanes)?;
2480+
self.context.stack.push(TypedReg::v128(lhs.into()).into());
2481+
self.context.free_reg(rhs);
2482+
Ok(())
2483+
}
2484+
24742485
wasmparser::for_each_visit_simd_operator!(def_unsupported);
24752486
}
24762487

0 commit comments

Comments
 (0)