Skip to content

Commit adaa409

Browse files
authored
[AArch64] Expand UADDLV patterns to handle two-step i8->i16->i32 extends (#146078)
Closes #142961
1 parent 9a8d45f commit adaa409

File tree

2 files changed

+72
-7
lines changed

2 files changed

+72
-7
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18052,6 +18052,15 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
1805218052
// v16i32 abs(
1805318053
// v16i32 sub(
1805418054
// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18055+
//
18056+
// or
18057+
//
18058+
// i32 vecreduce_add(
18059+
// v16i32 zext(
18060+
// v16i16 abs(
18061+
// v16i16 sub(
18062+
// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
18063+
//
1805518064
// =================>
1805618065
// i32 vecreduce_add(
1805718066
// v4i32 UADDLP(
@@ -18067,23 +18076,35 @@ static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
1806718076
return SDValue();
1806818077

1806918078
SDValue VecReduceOp0 = N->getOperand(0);
18079+
bool SawTrailingZext = false;
18080+
// Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
18081+
if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
18082+
VecReduceOp0->getValueType(0) == MVT::v16i32 &&
18083+
VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
18084+
VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
18085+
SawTrailingZext = true;
18086+
VecReduceOp0 = VecReduceOp0.getOperand(0);
18087+
}
18088+
18089+
// Type expected for the ABS, the SUB and the SUB operands: v16i16 when a
// trailing zext was looked through above, v16i32 otherwise.
18090+
MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
18091+
// Assumed v16i16 or v16i32 abs
1807018092
unsigned Opcode = VecReduceOp0.getOpcode();
18071-
// Assumed v16i32 abs
18072-
if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
18093+
if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
1807318094
return SDValue();
1807418095

1807518096
SDValue ABS = VecReduceOp0;
18076-
// Assumed v16i32 sub
18097+
// Assumed v16i16 or v16i32 sub
1807718098
if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
18078-
ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
18099+
ABS->getOperand(0)->getValueType(0) != AbsInputVT)
1807918100
return SDValue();
1808018101

1808118102
SDValue SUB = ABS->getOperand(0);
1808218103
unsigned Opcode0 = SUB->getOperand(0).getOpcode();
1808318104
unsigned Opcode1 = SUB->getOperand(1).getOpcode();
18084-
// Assumed v16i32 type
18085-
if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
18086-
SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
18105+
// Assumed v16i16 or v16i32 type
18106+
if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
18107+
SUB->getOperand(1)->getValueType(0) != AbsInputVT)
1808718108
return SDValue();
1808818109

1808918110
// Assumed zext or sext

llvm/test/CodeGen/AArch64/neon-sad.ll

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,47 @@ entry:
4545
%6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
4646
ret i32 %6
4747
}
48+
49+
define i32 @test_sad_v16i8_two_step_zext(ptr noundef readonly %a, ptr noundef readonly %b) {
50+
; CHECK-LABEL: test_sad_v16i8_two_step_zext:
51+
; CHECK: // %bb.0: // %entry
52+
; CHECK-NEXT: ldr q0, [x0]
53+
; CHECK-NEXT: ldr q1, [x1]
54+
; CHECK-NEXT: uabdl v2.8h, v1.8b, v0.8b
55+
; CHECK-NEXT: uabal2 v2.8h, v1.16b, v0.16b
56+
; CHECK-NEXT: uaddlv s0, v2.8h
57+
; CHECK-NEXT: fmov w0, s0
58+
; CHECK-NEXT: ret
59+
entry:
60+
%0 = load <16 x i8>, ptr %a
61+
%1 = zext <16 x i8> %0 to <16 x i16>
62+
%2 = load <16 x i8>, ptr %b
63+
%3 = zext <16 x i8> %2 to <16 x i16>
64+
%4 = sub nsw <16 x i16> %3, %1
65+
%5 = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %4, i1 false)
66+
%6 = zext <16 x i16> %5 to <16 x i32>
67+
%7 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
68+
ret i32 %7
69+
}
70+
71+
define i32 @test_sad_v16i8_two_step_sext(ptr noundef readonly %a, ptr noundef readonly %b) {
72+
; CHECK-LABEL: test_sad_v16i8_two_step_sext:
73+
; CHECK: // %bb.0: // %entry
74+
; CHECK-NEXT: ldr q0, [x0]
75+
; CHECK-NEXT: ldr q1, [x1]
76+
; CHECK-NEXT: sabdl v2.8h, v1.8b, v0.8b
77+
; CHECK-NEXT: sabal2 v2.8h, v1.16b, v0.16b
78+
; CHECK-NEXT: uaddlv s0, v2.8h
79+
; CHECK-NEXT: fmov w0, s0
80+
; CHECK-NEXT: ret
81+
entry:
82+
%0 = load <16 x i8>, ptr %a
83+
%1 = sext <16 x i8> %0 to <16 x i16>
84+
%2 = load <16 x i8>, ptr %b
85+
%3 = sext <16 x i8> %2 to <16 x i16>
86+
%4 = sub nsw <16 x i16> %3, %1
87+
%5 = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %4, i1 false)
88+
%6 = zext <16 x i16> %5 to <16 x i32>
89+
%7 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
90+
ret i32 %7
91+
}

0 commit comments

Comments
 (0)