Skip to content

Commit 8bc9551

Browse files
authored
[AArch64] Improve operand sinking for mul instructions (llvm#116604)
- Sink splat operands to mul instructions for types where we can use the lane-indexed variants.
- When sinking operands for [su]mull, also sink the ext instruction.
1 parent a202a35 commit 8bc9551

File tree

5 files changed

+374
-113
lines changed

5 files changed

+374
-113
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 41 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -5170,26 +5170,45 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
51705170
return false;
51715171
}
51725172
case Instruction::Mul: {
5173+
auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
5174+
auto *Ty = cast<VectorType>(V->getType());
5175+
// For SVE the lane-indexing is within 128-bits, so we can't fold splats.
5176+
if (Ty->isScalableTy())
5177+
return false;
5178+
5179+
// Indexed variants of Mul exist for i16 and i32 element types only.
5180+
return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
5181+
};
5182+
51735183
int NumZExts = 0, NumSExts = 0;
51745184
for (auto &Op : I->operands()) {
51755185
// Make sure we are not already sinking this operand
51765186
if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
51775187
continue;
51785188

5179-
if (match(&Op, m_SExt(m_Value()))) {
5180-
NumSExts++;
5181-
continue;
5182-
} else if (match(&Op, m_ZExt(m_Value()))) {
5183-
NumZExts++;
5189+
if (match(&Op, m_ZExtOrSExt(m_Value()))) {
5190+
auto *Ext = cast<Instruction>(Op);
5191+
auto *ExtOp = Ext->getOperand(0);
5192+
if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
5193+
Ops.push_back(&Ext->getOperandUse(0));
5194+
Ops.push_back(&Op);
5195+
5196+
if (isa<SExtInst>(Ext))
5197+
NumSExts++;
5198+
else
5199+
NumZExts++;
5200+
51845201
continue;
51855202
}
51865203

51875204
ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
5205+
if (!Shuffle)
5206+
continue;
51885207

51895208
// If the Shuffle is a splat and the operand is a zext/sext, sinking the
51905209
// operand and the s/zext can help create indexed s/umull. This is
51915210
// especially useful to prevent i64 mul being scalarized.
5192-
if (Shuffle && isSplatShuffle(Shuffle) &&
5211+
if (isSplatShuffle(Shuffle) &&
51935212
match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
51945213
Ops.push_back(&Shuffle->getOperandUse(0));
51955214
Ops.push_back(&Op);
@@ -5200,9 +5219,6 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
52005219
continue;
52015220
}
52025221

5203-
if (!Shuffle)
5204-
continue;
5205-
52065222
Value *ShuffleOperand = Shuffle->getOperand(0);
52075223
InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
52085224
if (!Insert)
@@ -5234,12 +5250,26 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
52345250
NumZExts++;
52355251
}
52365252

5253+
Ops.push_back(&Insert->getOperandUse(1));
52375254
Ops.push_back(&Shuffle->getOperandUse(0));
52385255
Ops.push_back(&Op);
52395256
}
52405257

5241-
// Is it profitable to sink if we found two of the same type of extends.
5242-
return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
5258+
// It is profitable to sink if we found two of the same type of extends.
5259+
if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
5260+
return true;
5261+
5262+
// Otherwise, see if we should sink splats for indexed variants.
5263+
if (!ShouldSinkSplatForIndexedVariant(I))
5264+
return false;
5265+
5266+
Ops.clear();
5267+
if (isSplatShuffle(I->getOperand(0)))
5268+
Ops.push_back(&I->getOperandUse(0));
5269+
if (isSplatShuffle(I->getOperand(1)))
5270+
Ops.push_back(&I->getOperandUse(1));
5271+
5272+
return !Ops.empty();
52435273
}
52445274
case Instruction::FMul: {
52455275
// For SVE the lane-indexing is within 128-bits, so we can't fold splats.

llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -10,14 +10,18 @@ target triple = "aarch64-unknown-linux-gnu"
1010
define dso_local i32 @dupext_crashtest(i32 %e) local_unnamed_addr {
1111
; CHECK-LABEL: dupext_crashtest:
1212
; CHECK: // %bb.0: // %for.body.lr.ph
13-
; CHECK-NEXT: mov w8, w0
14-
; CHECK-NEXT: dup v0.2s, w8
1513
; CHECK-NEXT: .LBB0_1: // %vector.body
1614
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
17-
; CHECK-NEXT: ldr d1, [x8]
18-
; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
19-
; CHECK-NEXT: xtn v1.2s, v1.2d
20-
; CHECK-NEXT: str d1, [x8]
15+
; CHECK-NEXT: ldr d0, [x8]
16+
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
17+
; CHECK-NEXT: fmov x9, d0
18+
; CHECK-NEXT: mov x8, v0.d[1]
19+
; CHECK-NEXT: mul w9, w0, w9
20+
; CHECK-NEXT: mul w8, w0, w8
21+
; CHECK-NEXT: fmov d0, x9
22+
; CHECK-NEXT: mov v0.d[1], x8
23+
; CHECK-NEXT: xtn v0.2s, v0.2d
24+
; CHECK-NEXT: str d0, [x8]
2125
; CHECK-NEXT: b .LBB0_1
2226
for.body.lr.ph:
2327
%conv314 = zext i32 %e to i64

0 commit comments

Comments
 (0)