Skip to content

Commit 71dc3de

Browse files
calebzulawski authored and davemgreen committed
[ARM] Improve min/max vector reductions on Arm
This patch adds some more efficient lowering for vecreduce.min/max under NEON, using sequences of pairwise vpmin/vpmax to reduce to a single value. This nearly resolves issues such as llvm#50466, llvm#40981, llvm#38190. Differential Revision: https://reviews.llvm.org/D146404
1 parent ada0356 commit 71dc3de

File tree

2 files changed

+306
-0
lines changed

2 files changed

+306
-0
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1007,6 +1007,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
10071007
setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
10081008
}
10091009
}
1010+
1011+
for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1012+
MVT::v4i32}) {
1013+
setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1014+
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1015+
setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1016+
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1017+
}
10101018
}
10111019

10121020
if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
@@ -10271,6 +10279,80 @@ static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
1027110279
return LowerVecReduce(Op, DAG, ST);
1027210280
}
1027310281

10282+
/// Lower an integer VECREDUCE_{U,S}{MIN,MAX} node to a chain of NEON pairwise
/// min/max intrinsics (arm_neon_vpmin{u,s} / arm_neon_vpmax{u,s}).
///
/// Returns an empty SDValue when the subtarget has no NEON. Otherwise a
/// 128-bit input is first split in half and the two halves are combined with
/// one pairwise operation; the vector is then reduced against itself until a
/// single active lane remains, lane 0 is extracted, and the scalar is zero-
/// or sign-extended when the node's result type differs from the element type.
static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG,
10283+
const ARMSubtarget *ST) {
10284+
if (!ST->hasNEON())
10285+
return SDValue();
10286+
10287+
SDLoc dl(Op);
10288+
SDValue Op0 = Op->getOperand(0);
10289+
EVT VT = Op0.getValueType();
10290+
EVT EltVT = VT.getVectorElementType();
10291+
10292+
// Pick the pairwise intrinsic that matches the signedness and direction of
// the reduction being lowered.
unsigned PairwiseIntrinsic = 0;
10293+
switch (Op->getOpcode()) {
10294+
default:
10295+
llvm_unreachable("Expected VECREDUCE opcode");
10296+
case ISD::VECREDUCE_UMIN:
10297+
PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10298+
break;
10299+
case ISD::VECREDUCE_UMAX:
10300+
PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10301+
break;
10302+
case ISD::VECREDUCE_SMIN:
10303+
PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10304+
break;
10305+
case ISD::VECREDUCE_SMAX:
10306+
PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10307+
break;
10308+
}
10309+
// The intrinsic ID is passed as the first operand of every
// INTRINSIC_WO_CHAIN node built below.
SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10310+
10311+
unsigned NumElts = VT.getVectorNumElements();
10312+
// Number of lanes still to be combined; halved after each pairwise step.
unsigned NumActiveLanes = NumElts;
10313+
10314+
assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10315+
NumActiveLanes == 2) &&
10316+
"Only expected a power 2 vector size");
10317+
10318+
// Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10319+
if (VT.is128BitVector()) {
10320+
SDValue Lo, Hi;
10321+
std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10322+
// From here on every node is built with the half-width vector type.
VT = Lo.getValueType();
10323+
Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10324+
NumActiveLanes /= 2;
10325+
}
10326+
10327+
// Use pairwise reductions until one lane remains
10328+
while (NumActiveLanes > 1) {
10329+
// The current vector is passed as both operands of the pairwise operation.
Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10330+
NumActiveLanes /= 2;
10331+
}
10332+
10333+
// Read lane 0 of the final vector as the reduced scalar.
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10334+
DAG.getConstant(0, dl, MVT::i32));
10335+
10336+
// Result type may be wider than element type.
10337+
if (EltVT != Op.getValueType()) {
10338+
// Unsigned reductions are zero-extended, signed reductions sign-extended.
unsigned Extend = 0;
10339+
switch (Op->getOpcode()) {
10340+
default:
10341+
llvm_unreachable("Expected VECREDUCE opcode");
10342+
case ISD::VECREDUCE_UMIN:
10343+
case ISD::VECREDUCE_UMAX:
10344+
Extend = ISD::ZERO_EXTEND;
10345+
break;
10346+
case ISD::VECREDUCE_SMIN:
10347+
case ISD::VECREDUCE_SMAX:
10348+
Extend = ISD::SIGN_EXTEND;
10349+
break;
10350+
}
10351+
Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10352+
}
10353+
return Res;
10354+
}
10355+
1027410356
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
1027510357
if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
1027610358
// Acquire/Release load/store is not legal for targets without a dmb or
@@ -10502,6 +10584,11 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1050210584
case ISD::VECREDUCE_FMIN:
1050310585
case ISD::VECREDUCE_FMAX:
1050410586
return LowerVecReduceF(Op, DAG, Subtarget);
10587+
case ISD::VECREDUCE_UMIN:
10588+
case ISD::VECREDUCE_UMAX:
10589+
case ISD::VECREDUCE_SMIN:
10590+
case ISD::VECREDUCE_SMAX:
10591+
return LowerVecReduceMinMax(Op, DAG, Subtarget);
1050510592
case ISD::ATOMIC_LOAD:
1050610593
case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
1050710594
case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=armv7-none-eabi -float-abi=hard -mattr=+neon -verify-machineinstrs | FileCheck %s
3+
4+
; Unsigned-min reduction of a 64-bit <8 x i8> vector. The expected output is
; three pairwise vpmin.u8 steps followed by an unsigned move of lane 0 into r0.
define i8 @test_umin_v8i8(<8 x i8> %x) {
5+
; CHECK-LABEL: test_umin_v8i8:
6+
; CHECK: @ %bb.0: @ %entry
7+
; CHECK-NEXT: vpmin.u8 d16, d0, d0
8+
; CHECK-NEXT: vpmin.u8 d16, d16, d16
9+
; CHECK-NEXT: vpmin.u8 d16, d16, d16
10+
; CHECK-NEXT: vmov.u8 r0, d16[0]
11+
; CHECK-NEXT: bx lr
12+
entry:
13+
%z = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %x)
14+
ret i8 %z
15+
}
16+
17+
; Signed-min reduction of a 64-bit <8 x i8> vector. The expected output is
; three pairwise vpmin.s8 steps followed by a signed move of lane 0 into r0.
define i8 @test_smin_v8i8(<8 x i8> %x) {
18+
; CHECK-LABEL: test_smin_v8i8:
19+
; CHECK: @ %bb.0: @ %entry
20+
; CHECK-NEXT: vpmin.s8 d16, d0, d0
21+
; CHECK-NEXT: vpmin.s8 d16, d16, d16
22+
; CHECK-NEXT: vpmin.s8 d16, d16, d16
23+
; CHECK-NEXT: vmov.s8 r0, d16[0]
24+
; CHECK-NEXT: bx lr
25+
entry:
26+
%z = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %x)
27+
ret i8 %z
28+
}
29+
30+
; Unsigned-max reduction of a 64-bit <8 x i8> vector. The expected output is
; three pairwise vpmax.u8 steps followed by an unsigned move of lane 0 into r0.
define i8 @test_umax_v8i8(<8 x i8> %x) {
31+
; CHECK-LABEL: test_umax_v8i8:
32+
; CHECK: @ %bb.0: @ %entry
33+
; CHECK-NEXT: vpmax.u8 d16, d0, d0
34+
; CHECK-NEXT: vpmax.u8 d16, d16, d16
35+
; CHECK-NEXT: vpmax.u8 d16, d16, d16
36+
; CHECK-NEXT: vmov.u8 r0, d16[0]
37+
; CHECK-NEXT: bx lr
38+
entry:
39+
%z = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %x)
40+
ret i8 %z
41+
}
42+
43+
; Signed-max reduction of a 64-bit <8 x i8> vector. The expected output is
; three pairwise vpmax.s8 steps followed by a signed move of lane 0 into r0.
define i8 @test_smax_v8i8(<8 x i8> %x) {
44+
; CHECK-LABEL: test_smax_v8i8:
45+
; CHECK: @ %bb.0: @ %entry
46+
; CHECK-NEXT: vpmax.s8 d16, d0, d0
47+
; CHECK-NEXT: vpmax.s8 d16, d16, d16
48+
; CHECK-NEXT: vpmax.s8 d16, d16, d16
49+
; CHECK-NEXT: vmov.s8 r0, d16[0]
50+
; CHECK-NEXT: bx lr
51+
entry:
52+
%z = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %x)
53+
ret i8 %z
54+
}
55+
56+
; Unsigned-min reduction of a 64-bit <4 x i16> vector. The expected output is
; two pairwise vpmin.u16 steps followed by an unsigned move of lane 0 into r0.
define i16 @test_umin_v4i16(<4 x i16> %x) {
57+
; CHECK-LABEL: test_umin_v4i16:
58+
; CHECK: @ %bb.0: @ %entry
59+
; CHECK-NEXT: vpmin.u16 d16, d0, d0
60+
; CHECK-NEXT: vpmin.u16 d16, d16, d16
61+
; CHECK-NEXT: vmov.u16 r0, d16[0]
62+
; CHECK-NEXT: bx lr
63+
entry:
64+
%z = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %x)
65+
ret i16 %z
66+
}
67+
68+
; Signed-min reduction of a 64-bit <4 x i16> vector. The expected output is
; two pairwise vpmin.s16 steps followed by a signed move of lane 0 into r0.
define i16 @test_smin_v4i16(<4 x i16> %x) {
69+
; CHECK-LABEL: test_smin_v4i16:
70+
; CHECK: @ %bb.0: @ %entry
71+
; CHECK-NEXT: vpmin.s16 d16, d0, d0
72+
; CHECK-NEXT: vpmin.s16 d16, d16, d16
73+
; CHECK-NEXT: vmov.s16 r0, d16[0]
74+
; CHECK-NEXT: bx lr
75+
entry:
76+
%z = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %x)
77+
ret i16 %z
78+
}
79+
80+
; Unsigned-max reduction of a 64-bit <4 x i16> vector. The expected output is
; two pairwise vpmax.u16 steps followed by an unsigned move of lane 0 into r0.
define i16 @test_umax_v4i16(<4 x i16> %x) {
81+
; CHECK-LABEL: test_umax_v4i16:
82+
; CHECK: @ %bb.0: @ %entry
83+
; CHECK-NEXT: vpmax.u16 d16, d0, d0
84+
; CHECK-NEXT: vpmax.u16 d16, d16, d16
85+
; CHECK-NEXT: vmov.u16 r0, d16[0]
86+
; CHECK-NEXT: bx lr
87+
entry:
88+
%z = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %x)
89+
ret i16 %z
90+
}
91+
92+
; Signed-max reduction of a 64-bit <4 x i16> vector. The expected output is
; two pairwise vpmax.s16 steps followed by a signed move of lane 0 into r0.
define i16 @test_smax_v4i16(<4 x i16> %x) {
93+
; CHECK-LABEL: test_smax_v4i16:
94+
; CHECK: @ %bb.0: @ %entry
95+
; CHECK-NEXT: vpmax.s16 d16, d0, d0
96+
; CHECK-NEXT: vpmax.s16 d16, d16, d16
97+
; CHECK-NEXT: vmov.s16 r0, d16[0]
98+
; CHECK-NEXT: bx lr
99+
entry:
100+
%z = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %x)
101+
ret i16 %z
102+
}
103+
104+
; Unsigned-min reduction of a 64-bit <2 x i32> vector. The expected output is
; a single pairwise vpmin.u32 followed by a move of lane 0 into r0.
define i32 @test_umin_v2i32(<2 x i32> %x) {
105+
; CHECK-LABEL: test_umin_v2i32:
106+
; CHECK: @ %bb.0: @ %entry
107+
; CHECK-NEXT: vpmin.u32 d16, d0, d0
108+
; CHECK-NEXT: vmov.32 r0, d16[0]
109+
; CHECK-NEXT: bx lr
110+
entry:
111+
%z = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %x)
112+
ret i32 %z
113+
}
114+
115+
; Signed-min reduction of a 64-bit <2 x i32> vector. The expected output is
; a single pairwise vpmin.s32 followed by a move of lane 0 into r0.
define i32 @test_smin_v2i32(<2 x i32> %x) {
116+
; CHECK-LABEL: test_smin_v2i32:
117+
; CHECK: @ %bb.0: @ %entry
118+
; CHECK-NEXT: vpmin.s32 d16, d0, d0
119+
; CHECK-NEXT: vmov.32 r0, d16[0]
120+
; CHECK-NEXT: bx lr
121+
entry:
122+
%z = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %x)
123+
ret i32 %z
124+
}
125+
126+
; Unsigned-max reduction of a 64-bit <2 x i32> vector. The expected output is
; a single pairwise vpmax.u32 followed by a move of lane 0 into r0.
define i32 @test_umax_v2i32(<2 x i32> %x) {
127+
; CHECK-LABEL: test_umax_v2i32:
128+
; CHECK: @ %bb.0: @ %entry
129+
; CHECK-NEXT: vpmax.u32 d16, d0, d0
130+
; CHECK-NEXT: vmov.32 r0, d16[0]
131+
; CHECK-NEXT: bx lr
132+
entry:
133+
%z = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %x)
134+
ret i32 %z
135+
}
136+
137+
; Signed-max reduction of a 64-bit <2 x i32> vector. The expected output is
; a single pairwise vpmax.s32 followed by a move of lane 0 into r0.
define i32 @test_smax_v2i32(<2 x i32> %x) {
138+
; CHECK-LABEL: test_smax_v2i32:
139+
; CHECK: @ %bb.0: @ %entry
140+
; CHECK-NEXT: vpmax.s32 d16, d0, d0
141+
; CHECK-NEXT: vmov.32 r0, d16[0]
142+
; CHECK-NEXT: bx lr
143+
entry:
144+
%z = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %x)
145+
ret i32 %z
146+
}
147+
148+
; Unsigned-min reduction of a 128-bit <16 x i8> vector. The expected output
; first combines d0 and d1 with one pairwise vpmin.u8, then applies three more
; pairwise steps before an unsigned move of lane 0 into r0.
define i8 @test_umin_v16i8(<16 x i8> %x) {
149+
; CHECK-LABEL: test_umin_v16i8:
150+
; CHECK: @ %bb.0: @ %entry
151+
; CHECK-NEXT: vpmin.u8 d16, d0, d1
152+
; CHECK-NEXT: vpmin.u8 d16, d16, d16
153+
; CHECK-NEXT: vpmin.u8 d16, d16, d16
154+
; CHECK-NEXT: vpmin.u8 d16, d16, d16
155+
; CHECK-NEXT: vmov.u8 r0, d16[0]
156+
; CHECK-NEXT: bx lr
157+
entry:
158+
%z = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %x)
159+
ret i8 %z
160+
}
161+
162+
; Signed-min reduction of a 128-bit <8 x i16> vector. The expected output
; first combines d0 and d1 with one pairwise vpmin.s16, then applies two more
; pairwise steps before a signed move of lane 0 into r0.
define i16 @test_smin_v8i16(<8 x i16> %x) {
163+
; CHECK-LABEL: test_smin_v8i16:
164+
; CHECK: @ %bb.0: @ %entry
165+
; CHECK-NEXT: vpmin.s16 d16, d0, d1
166+
; CHECK-NEXT: vpmin.s16 d16, d16, d16
167+
; CHECK-NEXT: vpmin.s16 d16, d16, d16
168+
; CHECK-NEXT: vmov.s16 r0, d16[0]
169+
; CHECK-NEXT: bx lr
170+
entry:
171+
%z = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %x)
172+
ret i16 %z
173+
}
174+
175+
; Unsigned-max reduction of a 128-bit <4 x i32> vector. The expected output
; first combines d0 and d1 with one pairwise vpmax.u32, then applies one more
; pairwise step before a move of lane 0 into r0.
define i32 @test_umax_v4i32(<4 x i32> %x) {
176+
; CHECK-LABEL: test_umax_v4i32:
177+
; CHECK: @ %bb.0: @ %entry
178+
; CHECK-NEXT: vpmax.u32 d16, d0, d1
179+
; CHECK-NEXT: vpmax.u32 d16, d16, d16
180+
; CHECK-NEXT: vmov.32 r0, d16[0]
181+
; CHECK-NEXT: bx lr
182+
entry:
183+
%z = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %x)
184+
ret i32 %z
185+
}
186+
187+
; Unsigned-min reduction of a 256-bit <32 x i8> vector. The expected output
; starts with a full-width vmin.u8 of q0 and q1, then combines d16 and d17
; with a pairwise vpmin.u8, then applies three more pairwise steps before an
; unsigned move of lane 0 into r0.
define i8 @test_umin_v32i8(<32 x i8> %x) {
188+
; CHECK-LABEL: test_umin_v32i8:
189+
; CHECK: @ %bb.0: @ %entry
190+
; CHECK-NEXT: vmin.u8 q8, q0, q1
191+
; CHECK-NEXT: vpmin.u8 d16, d16, d17
192+
; CHECK-NEXT: vpmin.u8 d16, d16, d16
193+
; CHECK-NEXT: vpmin.u8 d16, d16, d16
194+
; CHECK-NEXT: vpmin.u8 d16, d16, d16
195+
; CHECK-NEXT: vmov.u8 r0, d16[0]
196+
; CHECK-NEXT: bx lr
197+
entry:
198+
%z = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %x)
199+
ret i8 %z
200+
}
201+
202+
declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
203+
declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
204+
declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
205+
declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
206+
declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
207+
declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
208+
declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
209+
declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
210+
declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>)
211+
declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>)
212+
declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
213+
declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
214+
215+
declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
216+
declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
217+
declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
218+
219+
declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)

0 commit comments

Comments
 (0)