Skip to content

Commit aa84a41

Browse files
committed
arm64: Implement saturating SIMD arithmetic
Copyright (c) 2020, Arm Limited.
1 parent 85ffc8f commit aa84a41

File tree

5 files changed

+260
-64
lines changed

5 files changed

+260
-64
lines changed

build.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,8 +189,10 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
189189
("simd", "simd_f64x2_cmp") => return false,
190190
("simd", "simd_i8x16_arith") => return false,
191191
("simd", "simd_i8x16_cmp") => return false,
192+
("simd", "simd_i8x16_sat_arith") => return false,
192193
("simd", "simd_i16x8_arith") => return false,
193194
("simd", "simd_i16x8_cmp") => return false,
195+
("simd", "simd_i16x8_sat_arith") => return false,
194196
("simd", "simd_i32x4_arith") => return false,
195197
("simd", "simd_i32x4_cmp") => return false,
196198
("simd", "simd_load_extend") => return false,

cranelift/codegen/src/isa/aarch64/inst/emit.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1311,18 +1311,22 @@ impl MachInstEmit for Inst {
13111311
debug_assert_eq!(I64, ty);
13121312
(0b010_11110_11_1, 0b000011)
13131313
}
1314+
VecALUOp::Sqadd => (0b010_01110_00_1 | enc_size << 1, 0b000011),
13141315
VecALUOp::SQSubScalar => {
13151316
debug_assert_eq!(I64, ty);
13161317
(0b010_11110_11_1, 0b001011)
13171318
}
1319+
VecALUOp::Sqsub => (0b010_01110_00_1 | enc_size << 1, 0b001011),
13181320
VecALUOp::UQAddScalar => {
13191321
debug_assert_eq!(I64, ty);
13201322
(0b011_11110_11_1, 0b000011)
13211323
}
1324+
VecALUOp::Uqadd => (0b011_01110_00_1 | enc_size << 1, 0b000011),
13221325
VecALUOp::UQSubScalar => {
13231326
debug_assert_eq!(I64, ty);
13241327
(0b011_11110_11_1, 0b001011)
13251328
}
1329+
VecALUOp::Uqsub => (0b011_01110_00_1 | enc_size << 1, 0b001011),
13261330
VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
13271331
VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
13281332
VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),

cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2049,6 +2049,198 @@ fn test_aarch64_binemit() {
20492049
"sqsub d21, d22, d23",
20502050
));
20512051

2052+
insns.push((
2053+
Inst::VecRRR {
2054+
alu_op: VecALUOp::Sqadd,
2055+
rd: writable_vreg(1),
2056+
rn: vreg(2),
2057+
rm: vreg(8),
2058+
ty: I8X16,
2059+
},
2060+
"410C284E",
2061+
"sqadd v1.16b, v2.16b, v8.16b",
2062+
));
2063+
2064+
insns.push((
2065+
Inst::VecRRR {
2066+
alu_op: VecALUOp::Sqadd,
2067+
rd: writable_vreg(1),
2068+
rn: vreg(12),
2069+
rm: vreg(28),
2070+
ty: I16X8,
2071+
},
2072+
"810D7C4E",
2073+
"sqadd v1.8h, v12.8h, v28.8h",
2074+
));
2075+
2076+
insns.push((
2077+
Inst::VecRRR {
2078+
alu_op: VecALUOp::Sqadd,
2079+
rd: writable_vreg(12),
2080+
rn: vreg(2),
2081+
rm: vreg(6),
2082+
ty: I32X4,
2083+
},
2084+
"4C0CA64E",
2085+
"sqadd v12.4s, v2.4s, v6.4s",
2086+
));
2087+
2088+
insns.push((
2089+
Inst::VecRRR {
2090+
alu_op: VecALUOp::Sqadd,
2091+
rd: writable_vreg(20),
2092+
rn: vreg(7),
2093+
rm: vreg(13),
2094+
ty: I64X2,
2095+
},
2096+
"F40CED4E",
2097+
"sqadd v20.2d, v7.2d, v13.2d",
2098+
));
2099+
2100+
insns.push((
2101+
Inst::VecRRR {
2102+
alu_op: VecALUOp::Sqsub,
2103+
rd: writable_vreg(1),
2104+
rn: vreg(2),
2105+
rm: vreg(8),
2106+
ty: I8X16,
2107+
},
2108+
"412C284E",
2109+
"sqsub v1.16b, v2.16b, v8.16b",
2110+
));
2111+
2112+
insns.push((
2113+
Inst::VecRRR {
2114+
alu_op: VecALUOp::Sqsub,
2115+
rd: writable_vreg(1),
2116+
rn: vreg(12),
2117+
rm: vreg(28),
2118+
ty: I16X8,
2119+
},
2120+
"812D7C4E",
2121+
"sqsub v1.8h, v12.8h, v28.8h",
2122+
));
2123+
2124+
insns.push((
2125+
Inst::VecRRR {
2126+
alu_op: VecALUOp::Sqsub,
2127+
rd: writable_vreg(12),
2128+
rn: vreg(2),
2129+
rm: vreg(6),
2130+
ty: I32X4,
2131+
},
2132+
"4C2CA64E",
2133+
"sqsub v12.4s, v2.4s, v6.4s",
2134+
));
2135+
2136+
insns.push((
2137+
Inst::VecRRR {
2138+
alu_op: VecALUOp::Sqsub,
2139+
rd: writable_vreg(20),
2140+
rn: vreg(7),
2141+
rm: vreg(13),
2142+
ty: I64X2,
2143+
},
2144+
"F42CED4E",
2145+
"sqsub v20.2d, v7.2d, v13.2d",
2146+
));
2147+
2148+
insns.push((
2149+
Inst::VecRRR {
2150+
alu_op: VecALUOp::Uqadd,
2151+
rd: writable_vreg(1),
2152+
rn: vreg(2),
2153+
rm: vreg(8),
2154+
ty: I8X16,
2155+
},
2156+
"410C286E",
2157+
"uqadd v1.16b, v2.16b, v8.16b",
2158+
));
2159+
2160+
insns.push((
2161+
Inst::VecRRR {
2162+
alu_op: VecALUOp::Uqadd,
2163+
rd: writable_vreg(1),
2164+
rn: vreg(12),
2165+
rm: vreg(28),
2166+
ty: I16X8,
2167+
},
2168+
"810D7C6E",
2169+
"uqadd v1.8h, v12.8h, v28.8h",
2170+
));
2171+
2172+
insns.push((
2173+
Inst::VecRRR {
2174+
alu_op: VecALUOp::Uqadd,
2175+
rd: writable_vreg(12),
2176+
rn: vreg(2),
2177+
rm: vreg(6),
2178+
ty: I32X4,
2179+
},
2180+
"4C0CA66E",
2181+
"uqadd v12.4s, v2.4s, v6.4s",
2182+
));
2183+
2184+
insns.push((
2185+
Inst::VecRRR {
2186+
alu_op: VecALUOp::Uqadd,
2187+
rd: writable_vreg(20),
2188+
rn: vreg(7),
2189+
rm: vreg(13),
2190+
ty: I64X2,
2191+
},
2192+
"F40CED6E",
2193+
"uqadd v20.2d, v7.2d, v13.2d",
2194+
));
2195+
2196+
insns.push((
2197+
Inst::VecRRR {
2198+
alu_op: VecALUOp::Uqsub,
2199+
rd: writable_vreg(1),
2200+
rn: vreg(2),
2201+
rm: vreg(8),
2202+
ty: I8X16,
2203+
},
2204+
"412C286E",
2205+
"uqsub v1.16b, v2.16b, v8.16b",
2206+
));
2207+
2208+
insns.push((
2209+
Inst::VecRRR {
2210+
alu_op: VecALUOp::Uqsub,
2211+
rd: writable_vreg(1),
2212+
rn: vreg(12),
2213+
rm: vreg(28),
2214+
ty: I16X8,
2215+
},
2216+
"812D7C6E",
2217+
"uqsub v1.8h, v12.8h, v28.8h",
2218+
));
2219+
2220+
insns.push((
2221+
Inst::VecRRR {
2222+
alu_op: VecALUOp::Uqsub,
2223+
rd: writable_vreg(12),
2224+
rn: vreg(2),
2225+
rm: vreg(6),
2226+
ty: I32X4,
2227+
},
2228+
"4C2CA66E",
2229+
"uqsub v12.4s, v2.4s, v6.4s",
2230+
));
2231+
2232+
insns.push((
2233+
Inst::VecRRR {
2234+
alu_op: VecALUOp::Uqsub,
2235+
rd: writable_vreg(20),
2236+
rn: vreg(7),
2237+
rm: vreg(13),
2238+
ty: I64X2,
2239+
},
2240+
"F42CED6E",
2241+
"uqsub v20.2d, v7.2d, v13.2d",
2242+
));
2243+
20522244
insns.push((
20532245
Inst::VecRRR {
20542246
alu_op: VecALUOp::Cmeq,

cranelift/codegen/src/isa/aarch64/inst/mod.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,12 +209,16 @@ pub enum VecExtendOp {
209209
pub enum VecALUOp {
210210
/// Signed saturating add
211211
SQAddScalar,
212+
Sqadd,
212213
/// Unsigned saturating add
213214
UQAddScalar,
215+
Uqadd,
214216
/// Signed saturating subtract
215217
SQSubScalar,
218+
Sqsub,
216219
/// Unsigned saturating subtract
217220
UQSubScalar,
221+
Uqsub,
218222
/// Compare bitwise equal
219223
Cmeq,
220224
/// Compare signed greater than or equal
@@ -2734,9 +2738,13 @@ impl ShowWithRRU for Inst {
27342738
} => {
27352739
let (op, vector, ty) = match alu_op {
27362740
VecALUOp::SQAddScalar => ("sqadd", false, ty),
2741+
VecALUOp::Sqadd => ("sqadd", true, ty),
27372742
VecALUOp::UQAddScalar => ("uqadd", false, ty),
2743+
VecALUOp::Uqadd => ("uqadd", true, ty),
27382744
VecALUOp::SQSubScalar => ("sqsub", false, ty),
2745+
VecALUOp::Sqsub => ("sqsub", true, ty),
27392746
VecALUOp::UQSubScalar => ("uqsub", false, ty),
2747+
VecALUOp::Uqsub => ("uqsub", true, ty),
27402748
VecALUOp::Cmeq => ("cmeq", true, ty),
27412749
VecALUOp::Cmge => ("cmge", true, ty),
27422750
VecALUOp::Cmgt => ("cmgt", true, ty),

cranelift/codegen/src/isa/aarch64/lower_inst.rs

Lines changed: 54 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -93,74 +93,64 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
9393
});
9494
}
9595
}
96-
Opcode::UaddSat | Opcode::SaddSat => {
96+
Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {
9797
// We use the vector instruction set's saturating adds and subtracts
9898
// (UQADD / SQADD / UQSUB / SQSUB), which require vector registers.
99-
let is_signed = op == Opcode::SaddSat;
100-
let narrow_mode = if is_signed {
101-
NarrowValueMode::SignExtend64
102-
} else {
103-
NarrowValueMode::ZeroExtend64
104-
};
105-
let alu_op = if is_signed {
106-
VecALUOp::SQAddScalar
107-
} else {
108-
VecALUOp::UQAddScalar
109-
};
110-
let va = ctx.alloc_tmp(RegClass::V128, I128);
111-
let vb = ctx.alloc_tmp(RegClass::V128, I128);
112-
let ra = put_input_in_reg(ctx, inputs[0], narrow_mode);
113-
let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
99+
let is_signed = op == Opcode::SaddSat || op == Opcode::SsubSat;
100+
let ty = ty.unwrap();
114101
let rd = get_output_reg(ctx, outputs[0]);
115-
ctx.emit(Inst::MovToVec64 { rd: va, rn: ra });
116-
ctx.emit(Inst::MovToVec64 { rd: vb, rn: rb });
117-
ctx.emit(Inst::VecRRR {
118-
rd: va,
119-
rn: va.to_reg(),
120-
rm: vb.to_reg(),
121-
alu_op,
122-
ty: I64,
123-
});
124-
ctx.emit(Inst::MovFromVec {
125-
rd,
126-
rn: va.to_reg(),
127-
idx: 0,
128-
ty: I64,
129-
});
130-
}
131-
132-
Opcode::UsubSat | Opcode::SsubSat => {
133-
let is_signed = op == Opcode::SsubSat;
134-
let narrow_mode = if is_signed {
135-
NarrowValueMode::SignExtend64
136-
} else {
137-
NarrowValueMode::ZeroExtend64
138-
};
139-
let alu_op = if is_signed {
140-
VecALUOp::SQSubScalar
102+
if ty_bits(ty) < 128 {
103+
let narrow_mode = if is_signed {
104+
NarrowValueMode::SignExtend64
105+
} else {
106+
NarrowValueMode::ZeroExtend64
107+
};
108+
let alu_op = match op {
109+
Opcode::UaddSat => VecALUOp::UQAddScalar,
110+
Opcode::SaddSat => VecALUOp::SQAddScalar,
111+
Opcode::UsubSat => VecALUOp::UQSubScalar,
112+
Opcode::SsubSat => VecALUOp::SQSubScalar,
113+
_ => unreachable!(),
114+
};
115+
let va = ctx.alloc_tmp(RegClass::V128, I128);
116+
let vb = ctx.alloc_tmp(RegClass::V128, I128);
117+
let ra = put_input_in_reg(ctx, inputs[0], narrow_mode);
118+
let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
119+
ctx.emit(Inst::MovToVec64 { rd: va, rn: ra });
120+
ctx.emit(Inst::MovToVec64 { rd: vb, rn: rb });
121+
ctx.emit(Inst::VecRRR {
122+
rd: va,
123+
rn: va.to_reg(),
124+
rm: vb.to_reg(),
125+
alu_op,
126+
ty: I64,
127+
});
128+
ctx.emit(Inst::MovFromVec {
129+
rd,
130+
rn: va.to_reg(),
131+
idx: 0,
132+
ty: I64,
133+
});
141134
} else {
142-
VecALUOp::UQSubScalar
143-
};
144-
let va = ctx.alloc_tmp(RegClass::V128, I128);
145-
let vb = ctx.alloc_tmp(RegClass::V128, I128);
146-
let ra = put_input_in_reg(ctx, inputs[0], narrow_mode);
147-
let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
148-
let rd = get_output_reg(ctx, outputs[0]);
149-
ctx.emit(Inst::MovToVec64 { rd: va, rn: ra });
150-
ctx.emit(Inst::MovToVec64 { rd: vb, rn: rb });
151-
ctx.emit(Inst::VecRRR {
152-
rd: va,
153-
rn: va.to_reg(),
154-
rm: vb.to_reg(),
155-
alu_op,
156-
ty: I64,
157-
});
158-
ctx.emit(Inst::MovFromVec {
159-
rd,
160-
rn: va.to_reg(),
161-
idx: 0,
162-
ty: I64,
163-
});
135+
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
136+
let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
137+
138+
let alu_op = match op {
139+
Opcode::UaddSat => VecALUOp::Uqadd,
140+
Opcode::SaddSat => VecALUOp::Sqadd,
141+
Opcode::UsubSat => VecALUOp::Uqsub,
142+
Opcode::SsubSat => VecALUOp::Sqsub,
143+
_ => unreachable!(),
144+
};
145+
146+
ctx.emit(Inst::VecRRR {
147+
rd,
148+
rn,
149+
rm,
150+
alu_op,
151+
ty,
152+
});
153+
}
164154
}
165155

166156
Opcode::Ineg => {

0 commit comments

Comments
 (0)