Skip to content

Commit cee8b25

Browse files
committed
[AArch64][SVE] Remove Redundant aarch64.sve.convert.to.svbool
Generated code resulted in redundant aarch64.sve.convert.to.svbool calls for AArch64 binary operations. Narrow the more precise operands instead of widening the less precise operands. Differential Revision: https://reviews.llvm.org/D116730
1 parent 524150f commit cee8b25

File tree

2 files changed

+205
-0
lines changed

2 files changed

+205
-0
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,12 +416,76 @@ static Optional<Instruction *> processPhiNode(InstCombiner &IC,
416416
return IC.replaceInstUsesWith(II, NPN);
417417
}
418418

419+
// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
420+
// => (binop (pred) (from_svbool _) (from_svbool _))
421+
//
422+
// The above transformation eliminates a `to_svbool` in the predicate
423+
// operand of bitwise operation `binop` by narrowing the vector width of
424+
// the operation. For example, it would convert a `<vscale x 16 x i1>
425+
// and` into a `<vscale x 4 x i1> and`. This is profitable because
426+
// to_svbool must zero the new lanes during widening, whereas
427+
// from_svbool is free.
428+
static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC,
429+
IntrinsicInst &II) {
430+
auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
431+
if (!BinOp)
432+
return None;
433+
434+
auto IntrinsicID = BinOp->getIntrinsicID();
435+
switch (IntrinsicID) {
436+
case Intrinsic::aarch64_sve_and_z:
437+
case Intrinsic::aarch64_sve_bic_z:
438+
case Intrinsic::aarch64_sve_eor_z:
439+
case Intrinsic::aarch64_sve_nand_z:
440+
case Intrinsic::aarch64_sve_nor_z:
441+
case Intrinsic::aarch64_sve_orn_z:
442+
case Intrinsic::aarch64_sve_orr_z:
443+
break;
444+
default:
445+
return None;
446+
}
447+
448+
auto BinOpPred = BinOp->getOperand(0);
449+
auto BinOpOp1 = BinOp->getOperand(1);
450+
auto BinOpOp2 = BinOp->getOperand(2);
451+
452+
auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
453+
if (!PredIntr ||
454+
PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
455+
return None;
456+
457+
auto PredOp = PredIntr->getOperand(0);
458+
auto PredOpTy = cast<VectorType>(PredOp->getType());
459+
if (PredOpTy != II.getType())
460+
return None;
461+
462+
IRBuilder<> Builder(II.getContext());
463+
Builder.SetInsertPoint(&II);
464+
465+
SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
466+
auto NarrowBinOpOp1 = Builder.CreateIntrinsic(
467+
Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
468+
NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
469+
if (BinOpOp1 == BinOpOp2)
470+
NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
471+
else
472+
NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic(
473+
Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
474+
475+
auto NarrowedBinOp =
476+
Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
477+
return IC.replaceInstUsesWith(II, NarrowedBinOp);
478+
}
479+
419480
static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
420481
IntrinsicInst &II) {
421482
// If the reinterpret instruction operand is a PHI Node
422483
if (isa<PHINode>(II.getArgOperand(0)))
423484
return processPhiNode(IC, II);
424485

486+
if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
487+
return BinOpCombine;
488+
425489
SmallVector<Instruction *, 32> CandidatesForRemoval;
426490
Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
427491

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -S -instcombine < %s | FileCheck %s
3+
4+
target triple = "aarch64-unknown-linux-gnu"
5+
6+
; and.z with an nxv4i1 predicate widened via to_svbool and two distinct
; operands: expect the operands narrowed to nxv4i1 and the conversions on
; the predicate/result folded away.
define <vscale x 4 x i1> @try_combine_svbool_binop_and_0(<vscale x 4 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c) {
; CHECK-LABEL: @try_combine_svbool_binop_and_0(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[C:%.*]])
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.and.z.nxv4i1(<vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i1> [[TMP2]])
; CHECK-NEXT: ret <vscale x 4 x i1> [[TMP3]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.and.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c)
%t3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %t2)
ret <vscale x 4 x i1> %t3
}
18+
19+
; and.z at nxv8i1 with both binop operands equal (%b, %b): the combine
; should emit only one from_svbool and use it for both operands.
define <vscale x 8 x i1> @try_combine_svbool_binop_and_1(<vscale x 8 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_and_1(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.and.z.nxv8i1(<vscale x 8 x i1> [[A:%.*]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 8 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.and.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %t2)
ret <vscale x 8 x i1> %t3
}
30+
31+
; Same as and_1 but at nxv4i1: the binop is narrowed to the nxv4i1 width of
; the original predicate.
define <vscale x 4 x i1> @try_combine_svbool_binop_and_2(<vscale x 4 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_and_2(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.and.z.nxv4i1(<vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 4 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.and.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %t2)
ret <vscale x 4 x i1> %t3
}
42+
43+
; Same as and_1 but at nxv2i1, the narrowest predicate element width.
define <vscale x 2 x i1> @try_combine_svbool_binop_and_3(<vscale x 2 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_and_3(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.and.z.nxv2i1(<vscale x 2 x i1> [[A:%.*]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 2 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.and.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %t2)
ret <vscale x 2 x i1> %t3
}
54+
55+
; bic.z variant: the combine applies to each intrinsic in the supported
; predicate-binop set, not just and.z.
define <vscale x 8 x i1> @try_combine_svbool_binop_bic(<vscale x 8 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_bic(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.bic.z.nxv8i1(<vscale x 8 x i1> [[A:%.*]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 8 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.bic.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %t2)
ret <vscale x 8 x i1> %t3
}
66+
67+
; eor.z variant of the narrowing combine.
define <vscale x 8 x i1> @try_combine_svbool_binop_eor(<vscale x 8 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_eor(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.eor.z.nxv8i1(<vscale x 8 x i1> [[A:%.*]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 8 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.eor.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %t2)
ret <vscale x 8 x i1> %t3
}
78+
79+
; nand.z variant of the narrowing combine.
define <vscale x 8 x i1> @try_combine_svbool_binop_nand(<vscale x 8 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_nand(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.nand.z.nxv8i1(<vscale x 8 x i1> [[A:%.*]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 8 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.nand.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %t2)
ret <vscale x 8 x i1> %t3
}
90+
91+
; nor.z variant of the narrowing combine.
define <vscale x 8 x i1> @try_combine_svbool_binop_nor(<vscale x 8 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_nor(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.nor.z.nxv8i1(<vscale x 8 x i1> [[A:%.*]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 8 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.nor.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %t2)
ret <vscale x 8 x i1> %t3
}
102+
103+
; orn.z variant of the narrowing combine.
define <vscale x 8 x i1> @try_combine_svbool_binop_orn(<vscale x 8 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_orn(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.orn.z.nxv8i1(<vscale x 8 x i1> [[A:%.*]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 8 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.orn.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %t2)
ret <vscale x 8 x i1> %t3
}
114+
115+
; orr.z variant of the narrowing combine.
define <vscale x 8 x i1> @try_combine_svbool_binop_orr(<vscale x 8 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_orr(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.orr.z.nxv8i1(<vscale x 8 x i1> [[A:%.*]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 8 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.orr.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %t2)
ret <vscale x 8 x i1> %t3
}
126+
127+
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
128+
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
129+
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
130+
declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
131+
declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
132+
declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
133+
declare <vscale x 16 x i1> @llvm.aarch64.sve.and.z.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
134+
declare <vscale x 16 x i1> @llvm.aarch64.sve.bic.z.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
135+
declare <vscale x 16 x i1> @llvm.aarch64.sve.eor.z.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
136+
declare <vscale x 16 x i1> @llvm.aarch64.sve.nand.z.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
137+
declare <vscale x 16 x i1> @llvm.aarch64.sve.nor.z.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
138+
declare <vscale x 16 x i1> @llvm.aarch64.sve.orn.z.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
139+
declare <vscale x 16 x i1> @llvm.aarch64.sve.orr.z.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
140+
141+
attributes #0 = { "target-features"="+sve" }

0 commit comments

Comments
 (0)