arm64: Implement saturating SIMD arithmetic

jgouly · jgouly · commit aa84a4173c90 · 2020-07-14T18:19:11.000+01:00
diff --git a/build.rs b/build.rs
@@ -189,8 +189,10 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             ("simd", "simd_f64x2_cmp") => return false,
             ("simd", "simd_i8x16_arith") => return false,
             ("simd", "simd_i8x16_cmp") => return false,
+            ("simd", "simd_i8x16_sat_arith") => return false,
             ("simd", "simd_i16x8_arith") => return false,
             ("simd", "simd_i16x8_cmp") => return false,
+            ("simd", "simd_i16x8_sat_arith") => return false,
             ("simd", "simd_i32x4_arith") => return false,
             ("simd", "simd_i32x4_cmp") => return false,
             ("simd", "simd_load_extend") => return false,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1311,18 +1311,22 @@ impl MachInstEmit for Inst {
                         debug_assert_eq!(I64, ty);
                         (0b010_11110_11_1, 0b000011)
                     }
+                    VecALUOp::Sqadd => (0b010_01110_00_1 | enc_size << 1, 0b000011),
                     VecALUOp::SQSubScalar => {
                         debug_assert_eq!(I64, ty);
                         (0b010_11110_11_1, 0b001011)
                     }
+                    VecALUOp::Sqsub => (0b010_01110_00_1 | enc_size << 1, 0b001011),
                     VecALUOp::UQAddScalar => {
                         debug_assert_eq!(I64, ty);
                         (0b011_11110_11_1, 0b000011)
                     }
+                    VecALUOp::Uqadd => (0b011_01110_00_1 | enc_size << 1, 0b000011),
                     VecALUOp::UQSubScalar => {
                         debug_assert_eq!(I64, ty);
                         (0b011_11110_11_1, 0b001011)
                     }
+                    VecALUOp::Uqsub => (0b011_01110_00_1 | enc_size << 1, 0b001011),
                     VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
                     VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
                     VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2049,6 +2049,198 @@ fn test_aarch64_binemit() {
         "sqsub d21, d22, d23",
     ));
 
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sqadd,
+            rd: writable_vreg(1),
+            rn: vreg(2),
+            rm: vreg(8),
+            ty: I8X16,
+        },
+        "410C284E",
+        "sqadd v1.16b, v2.16b, v8.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sqadd,
+            rd: writable_vreg(1),
+            rn: vreg(12),
+            rm: vreg(28),
+            ty: I16X8,
+        },
+        "810D7C4E",
+        "sqadd v1.8h, v12.8h, v28.8h",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sqadd,
+            rd: writable_vreg(12),
+            rn: vreg(2),
+            rm: vreg(6),
+            ty: I32X4,
+        },
+        "4C0CA64E",
+        "sqadd v12.4s, v2.4s, v6.4s",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sqadd,
+            rd: writable_vreg(20),
+            rn: vreg(7),
+            rm: vreg(13),
+            ty: I64X2,
+        },
+        "F40CED4E",
+        "sqadd v20.2d, v7.2d, v13.2d",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sqsub,
+            rd: writable_vreg(1),
+            rn: vreg(2),
+            rm: vreg(8),
+            ty: I8X16,
+        },
+        "412C284E",
+        "sqsub v1.16b, v2.16b, v8.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sqsub,
+            rd: writable_vreg(1),
+            rn: vreg(12),
+            rm: vreg(28),
+            ty: I16X8,
+        },
+        "812D7C4E",
+        "sqsub v1.8h, v12.8h, v28.8h",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sqsub,
+            rd: writable_vreg(12),
+            rn: vreg(2),
+            rm: vreg(6),
+            ty: I32X4,
+        },
+        "4C2CA64E",
+        "sqsub v12.4s, v2.4s, v6.4s",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sqsub,
+            rd: writable_vreg(20),
+            rn: vreg(7),
+            rm: vreg(13),
+            ty: I64X2,
+        },
+        "F42CED4E",
+        "sqsub v20.2d, v7.2d, v13.2d",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Uqadd,
+            rd: writable_vreg(1),
+            rn: vreg(2),
+            rm: vreg(8),
+            ty: I8X16,
+        },
+        "410C286E",
+        "uqadd v1.16b, v2.16b, v8.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Uqadd,
+            rd: writable_vreg(1),
+            rn: vreg(12),
+            rm: vreg(28),
+            ty: I16X8,
+        },
+        "810D7C6E",
+        "uqadd v1.8h, v12.8h, v28.8h",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Uqadd,
+            rd: writable_vreg(12),
+            rn: vreg(2),
+            rm: vreg(6),
+            ty: I32X4,
+        },
+        "4C0CA66E",
+        "uqadd v12.4s, v2.4s, v6.4s",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Uqadd,
+            rd: writable_vreg(20),
+            rn: vreg(7),
+            rm: vreg(13),
+            ty: I64X2,
+        },
+        "F40CED6E",
+        "uqadd v20.2d, v7.2d, v13.2d",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Uqsub,
+            rd: writable_vreg(1),
+            rn: vreg(2),
+            rm: vreg(8),
+            ty: I8X16,
+        },
+        "412C286E",
+        "uqsub v1.16b, v2.16b, v8.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Uqsub,
+            rd: writable_vreg(1),
+            rn: vreg(12),
+            rm: vreg(28),
+            ty: I16X8,
+        },
+        "812D7C6E",
+        "uqsub v1.8h, v12.8h, v28.8h",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Uqsub,
+            rd: writable_vreg(12),
+            rn: vreg(2),
+            rm: vreg(6),
+            ty: I32X4,
+        },
+        "4C2CA66E",
+        "uqsub v12.4s, v2.4s, v6.4s",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Uqsub,
+            rd: writable_vreg(20),
+            rn: vreg(7),
+            rm: vreg(13),
+            ty: I64X2,
+        },
+        "F42CED6E",
+        "uqsub v20.2d, v7.2d, v13.2d",
+    ));
+
     insns.push((
         Inst::VecRRR {
             alu_op: VecALUOp::Cmeq,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -209,12 +209,16 @@ pub enum VecExtendOp {
 pub enum VecALUOp {
     /// Signed saturating add
     SQAddScalar,
+    Sqadd,
     /// Unsigned saturating add
     UQAddScalar,
+    Uqadd,
     /// Signed saturating subtract
     SQSubScalar,
+    Sqsub,
     /// Unsigned saturating subtract
     UQSubScalar,
+    Uqsub,
     /// Compare bitwise equal
     Cmeq,
     /// Compare signed greater than or equal
@@ -2734,9 +2738,13 @@ impl ShowWithRRU for Inst {
             } => {
                 let (op, vector, ty) = match alu_op {
                     VecALUOp::SQAddScalar => ("sqadd", false, ty),
+                    VecALUOp::Sqadd => ("sqadd", true, ty),
                     VecALUOp::UQAddScalar => ("uqadd", false, ty),
+                    VecALUOp::Uqadd => ("uqadd", true, ty),
                     VecALUOp::SQSubScalar => ("sqsub", false, ty),
+                    VecALUOp::Sqsub => ("sqsub", true, ty),
                     VecALUOp::UQSubScalar => ("uqsub", false, ty),
+                    VecALUOp::Uqsub => ("uqsub", true, ty),
                     VecALUOp::Cmeq => ("cmeq", true, ty),
                     VecALUOp::Cmge => ("cmge", true, ty),
                     VecALUOp::Cmgt => ("cmgt", true, ty),
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -93,74 +93,64 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 });
             }
         }
-        Opcode::UaddSat | Opcode::SaddSat => {
+        Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {
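-            // We use the vector instruction set's saturating adds (UQADD /
-            // SQADD), which require vector registers.
+            // We use the vector instruction set's saturating adds and subtracts
+            // (UQADD / SQADD / UQSUB / SQSUB), which require vector registers.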
-            let is_signed = op == Opcode::SaddSat;
-            let narrow_mode = if is_signed {
-                NarrowValueMode::SignExtend64
-            } else {
-                NarrowValueMode::ZeroExtend64
-            };
-            let alu_op = if is_signed {
-                VecALUOp::SQAddScalar
-            } else {
-                VecALUOp::UQAddScalar
-            };
-            let va = ctx.alloc_tmp(RegClass::V128, I128);
-            let vb = ctx.alloc_tmp(RegClass::V128, I128);
-            let ra = put_input_in_reg(ctx, inputs[0], narrow_mode);
-            let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
+            let is_signed = op == Opcode::SaddSat || op == Opcode::SsubSat;
+            let ty = ty.unwrap();
             let rd = get_output_reg(ctx, outputs[0]);
-            ctx.emit(Inst::MovToVec64 { rd: va, rn: ra });
-            ctx.emit(Inst::MovToVec64 { rd: vb, rn: rb });
-            ctx.emit(Inst::VecRRR {
-                rd: va,
-                rn: va.to_reg(),
-                rm: vb.to_reg(),
-                alu_op,
-                ty: I64,
-            });
-            ctx.emit(Inst::MovFromVec {
-                rd,
-                rn: va.to_reg(),
-                idx: 0,
-                ty: I64,
-            });
-        }
-
-        Opcode::UsubSat | Opcode::SsubSat => {
-            let is_signed = op == Opcode::SsubSat;
-            let narrow_mode = if is_signed {
-                NarrowValueMode::SignExtend64
-            } else {
-                NarrowValueMode::ZeroExtend64
-            };
-            let alu_op = if is_signed {
-                VecALUOp::SQSubScalar
+            if ty_bits(ty) < 128 {
+                let narrow_mode = if is_signed {
+                    NarrowValueMode::SignExtend64
+                } else {
+                    NarrowValueMode::ZeroExtend64
+                };
+                let alu_op = match op {
+                    Opcode::UaddSat => VecALUOp::UQAddScalar,
+                    Opcode::SaddSat => VecALUOp::SQAddScalar,
+                    Opcode::UsubSat => VecALUOp::UQSubScalar,
+                    Opcode::SsubSat => VecALUOp::SQSubScalar,
+                    _ => unreachable!(),
+                };
+                let va = ctx.alloc_tmp(RegClass::V128, I128);
+                let vb = ctx.alloc_tmp(RegClass::V128, I128);
+                let ra = put_input_in_reg(ctx, inputs[0], narrow_mode);
+                let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
+                ctx.emit(Inst::MovToVec64 { rd: va, rn: ra });
+                ctx.emit(Inst::MovToVec64 { rd: vb, rn: rb });
+                ctx.emit(Inst::VecRRR {
+                    rd: va,
+                    rn: va.to_reg(),
+                    rm: vb.to_reg(),
+                    alu_op,
+                    ty: I64,
+                });
+                ctx.emit(Inst::MovFromVec {
+                    rd,
+                    rn: va.to_reg(),
+                    idx: 0,
+                    ty: I64,
+                });
             } else {
-                VecALUOp::UQSubScalar
-            };
-            let va = ctx.alloc_tmp(RegClass::V128, I128);
-            let vb = ctx.alloc_tmp(RegClass::V128, I128);
-            let ra = put_input_in_reg(ctx, inputs[0], narrow_mode);
-            let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
-            let rd = get_output_reg(ctx, outputs[0]);
-            ctx.emit(Inst::MovToVec64 { rd: va, rn: ra });
-            ctx.emit(Inst::MovToVec64 { rd: vb, rn: rb });
-            ctx.emit(Inst::VecRRR {
-                rd: va,
-                rn: va.to_reg(),
-                rm: vb.to_reg(),
-                alu_op,
-                ty: I64,
-            });
-            ctx.emit(Inst::MovFromVec {
-                rd,
-                rn: va.to_reg(),
-                idx: 0,
-                ty: I64,
-            });
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+
+                let alu_op = match op {
+                    Opcode::UaddSat => VecALUOp::Uqadd,
+                    Opcode::SaddSat => VecALUOp::Sqadd,
+                    Opcode::UsubSat => VecALUOp::Uqsub,
+                    Opcode::SsubSat => VecALUOp::Sqsub,
+                    _ => unreachable!(),
+                };
+
+                ctx.emit(Inst::VecRRR {
+                    rd,
+                    rn,
+                    rm,
+                    alu_op,
+                    ty,
+                });
+            }
         }
 
         Opcode::Ineg => {

Original file line number	Diff line number	Diff line change
`@@ -1311,18 +1311,22 @@ impl MachInstEmit for Inst {`
`1311`	`1311`	`debug_assert_eq!(I64, ty);`
`1312`	`1312`	`(0b010_11110_11_1, 0b000011)`
`1313`	`1313`	`}`
	`1314`	`+ VecALUOp::Sqadd => (0b010_01110_00_1 \| enc_size << 1, 0b000011),`
`1314`	`1315`	`VecALUOp::SQSubScalar => {`
`1315`	`1316`	`debug_assert_eq!(I64, ty);`
`1316`	`1317`	`(0b010_11110_11_1, 0b001011)`
`1317`	`1318`	`}`
	`1319`	`+ VecALUOp::Sqsub => (0b010_01110_00_1 \| enc_size << 1, 0b001011),`
`1318`	`1320`	`VecALUOp::UQAddScalar => {`
`1319`	`1321`	`debug_assert_eq!(I64, ty);`
`1320`	`1322`	`(0b011_11110_11_1, 0b000011)`
`1321`	`1323`	`}`
	`1324`	`+ VecALUOp::Uqadd => (0b011_01110_00_1 \| enc_size << 1, 0b000011),`
`1322`	`1325`	`VecALUOp::UQSubScalar => {`
`1323`	`1326`	`debug_assert_eq!(I64, ty);`
`1324`	`1327`	`(0b011_11110_11_1, 0b001011)`
`1325`	`1328`	`}`
	`1329`	`+ VecALUOp::Uqsub => (0b011_01110_00_1 \| enc_size << 1, 0b001011),`
`1326`	`1330`	`VecALUOp::Cmeq => (0b011_01110_00_1 \| enc_size << 1, 0b100011),`
`1327`	`1331`	`VecALUOp::Cmge => (0b010_01110_00_1 \| enc_size << 1, 0b001111),`
`1328`	`1332`	`VecALUOp::Cmgt => (0b010_01110_00_1 \| enc_size << 1, 0b001101),`