Commit 150fe8c
[AMDGPU] Move S_BFE lowering into RegBankCombiner
1 parent a1992f6 · commit 150fe8c

3 files changed, 119 additions and 71 deletions

llvm/lib/Target/AMDGPU/AMDGPUCombine.td

Lines changed: 13 additions & 1 deletion
@@ -151,6 +151,17 @@ def zext_of_shift_amount_combines : GICombineGroup<[
   canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl
 ]>;
 
+// Early select of uniform BFX into S_BFE instructions.
+// These instructions encode the offset/width in a way that requires using
+// bitwise operations. Selecting these instructions early allows the combiner
+// to potentially fold these.
+class lower_uniform_bfx<Instruction bfx> : GICombineRule<
+  (defs root:$bfx),
+  (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); }])>;
+
+def lower_uniform_sbfx : lower_uniform_bfx<G_SBFX>;
+def lower_uniform_ubfx : lower_uniform_bfx<G_UBFX>;
+
 let Predicates = [Has16BitInsts, NotHasMed3_16] in {
 // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
 // saves one instruction compared to the promotion.
@@ -198,5 +209,6 @@ def AMDGPURegBankCombiner : GICombiner<
   zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
   identity_combines, redundant_and, constant_fold_cast_op,
-  cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> {
+  cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
+  lower_uniform_sbfx, lower_uniform_ubfx]> {
 }

llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp

Lines changed: 51 additions & 0 deletions
@@ -89,6 +89,8 @@ class AMDGPURegBankCombinerImpl : public Combiner {
 
   void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const;
 
+  bool lowerUniformBFX(MachineInstr &MI) const;
+
 private:
   SIModeRegisterDefaults getMode() const;
   bool getIEEE() const;
@@ -392,6 +394,55 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
   MI.eraseFromParent();
 }
 
+bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const {
+  assert(MI.getOpcode() == TargetOpcode::G_UBFX ||
+         MI.getOpcode() == TargetOpcode::G_SBFX);
+  const bool Signed = (MI.getOpcode() == TargetOpcode::G_SBFX);
+
+  Register DstReg = MI.getOperand(0).getReg();
+  const RegisterBank *RB = RBI.getRegBank(DstReg, MRI, TRI);
+  assert(RB && "No RB?");
+  if (RB->getID() != AMDGPU::SGPRRegBankID)
+    return false;
+
+  Register SrcReg = MI.getOperand(1).getReg();
+  Register OffsetReg = MI.getOperand(2).getReg();
+  Register WidthReg = MI.getOperand(3).getReg();
+
+  const LLT S32 = LLT::scalar(32);
+  LLT Ty = MRI.getType(DstReg);
+
+  const unsigned Opc = (Ty == S32)
+                           ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
+                           : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+
+  // Ensure the high bits are clear to insert the offset.
+  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
+  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
+
+  // Zeros out the low bits, so don't bother clamping the input value.
+  auto ShiftAmt = B.buildConstant(S32, 16);
+  auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt);
+
+  // Pack the offset and width of the BFE into the format expected by
+  // S_BFE_I32 / S_BFE_U32: in the second source, bits [5:0] contain the
+  // offset and bits [22:16] the width.
+  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
+
+  MRI.setRegBank(OffsetMask.getReg(0), *RB);
+  MRI.setRegBank(ClampOffset.getReg(0), *RB);
+  MRI.setRegBank(ShiftAmt.getReg(0), *RB);
+  MRI.setRegBank(ShiftWidth.getReg(0), *RB);
+  MRI.setRegBank(MergedInputs.getReg(0), *RB);
+
+  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
+  if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
+    llvm_unreachable("failed to constrain BFE");
+
+  MI.eraseFromParent();
+  return true;
+}
+
 SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
   return MF.getInfo<SIMachineFunctionInfo>()->getMode();
 }
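For intuition on the packing this lowering performs: the second source operand of S_BFE_U32 / S_BFE_I32 carries the offset in bits [5:0] and the width in bits [22:16], which is what the G_AND / G_SHL / G_OR sequence above builds. A minimal standalone C++ sketch of that encoding (packBFE is a name invented here for illustration; it is not part of the patch):

#include <cstdint>
#include <cstdio>

// Models the packed second source of S_BFE_{I,U}32: offset in bits [5:0],
// width in bits [22:16]. Hypothetical helper mirroring the mask/shift/or
// sequence emitted by lowerUniformBFX.
constexpr uint32_t packBFE(uint32_t Offset, uint32_t Width) {
  return (Offset & 0x3f) | (Width << 16);
}

int main() {
  // Extracting 16 bits starting at bit 8 packs to 0x00100008. With constant
  // operands the whole expression folds to a single immediate, which is the
  // kind of fold that running this lowering before the combiner exposes.
  static_assert(packBFE(8, 16) == 0x00100008, "folds at compile time");
  std::printf("0x%08x\n", (unsigned)packBFE(8, 16));
  return 0;
}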

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 55 additions & 70 deletions
@@ -1492,88 +1492,73 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
   Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
 
   const RegisterBank *DstBank =
-    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
-  if (DstBank == &AMDGPU::VGPRRegBank) {
-    if (Ty == S32)
-      return true;
-
-    // There is no 64-bit vgpr bitfield extract instructions so the operation
-    // is expanded to a sequence of instructions that implement the operation.
-    ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
+      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
 
-    const LLT S64 = LLT::scalar(64);
-    // Shift the source operand so that extracted bits start at bit 0.
-    auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
-                              : B.buildLShr(S64, SrcReg, OffsetReg);
-    auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
-
-    // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
-    // if the width is a constant.
-    if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
-      // Use the 32-bit bitfield extract instruction if the width is a constant.
-      // Depending on the width size, use either the low or high 32-bits.
-      auto Zero = B.buildConstant(S32, 0);
-      auto WidthImm = ConstWidth->Value.getZExtValue();
-      if (WidthImm <= 32) {
-        // Use bitfield extract on the lower 32-bit source, and then sign-extend
-        // or clear the upper 32-bits.
-        auto Extract =
-            Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
-                   : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
-        auto Extend =
-            Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
-        B.buildMergeLikeInstr(DstReg, {Extract, Extend});
-      } else {
-        // Use bitfield extract on upper 32-bit source, and combine with lower
-        // 32-bit source.
-        auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
-        auto Extract =
-            Signed
-                ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
-                : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
-        B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
-      }
-      MI.eraseFromParent();
+  if (DstBank != &AMDGPU::VGPRRegBank) {
+    // SGPR: Canonicalize to a G_S/UBFX
+    if (!isa<GIntrinsic>(MI))
       return true;
-    }
 
-    // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
-    // operations.
-    auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
-    auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
+    ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
     if (Signed)
-      B.buildAShr(S64, SignBit, ExtShift);
+      B.buildSbfx(DstReg, SrcReg, OffsetReg, WidthReg);
     else
-      B.buildLShr(S64, SignBit, ExtShift);
+      B.buildUbfx(DstReg, SrcReg, OffsetReg, WidthReg);
     MI.eraseFromParent();
     return true;
   }
 
-  // The scalar form packs the offset and width in a single operand.
-
-  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
-
-  // Ensure the high bits are clear to insert the offset.
-  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
-  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
-
-  // Zeros out the low bits, so don't bother clamping the input value.
-  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
-
-  // Transformation function, pack the offset and width of a BFE into
-  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
-  // source, bits [5:0] contain the offset and bits [22:16] the width.
-  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
+  // VGPR
+  if (Ty == S32)
+    return true;
 
-  // TODO: It might be worth using a pseudo here to avoid scc clobber and
-  // register class constraints.
-  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
-                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+  // There are no 64-bit vgpr bitfield extract instructions, so the operation
+  // is expanded to a sequence of instructions that implement the operation.
+  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
 
-  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
-  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
-    llvm_unreachable("failed to constrain BFE");
+  const LLT S64 = LLT::scalar(64);
+  // Shift the source operand so that extracted bits start at bit 0.
+  auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
+                            : B.buildLShr(S64, SrcReg, OffsetReg);
+  auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
+
+  // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
+  // if the width is a constant.
+  if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
+    // Use the 32-bit bitfield extract instruction if the width is a constant.
+    // Depending on the width size, use either the low or high 32-bits.
+    auto Zero = B.buildConstant(S32, 0);
+    auto WidthImm = ConstWidth->Value.getZExtValue();
+    if (WidthImm <= 32) {
+      // Use bitfield extract on the lower 32-bit source, and then sign-extend
+      // or clear the upper 32-bits.
+      auto Extract =
+          Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
+                 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
+      auto Extend =
+          Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
+      B.buildMergeLikeInstr(DstReg, {Extract, Extend});
+    } else {
+      // Use bitfield extract on upper 32-bit source, and combine with lower
+      // 32-bit source.
+      auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
+      auto Extract =
+          Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
+                 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
+      B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
+    }
+    MI.eraseFromParent();
+    return true;
+  }
 
+  // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
+  // operations.
+  auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
+  auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
+  if (Signed)
+    B.buildAShr(S64, SignBit, ExtShift);
+  else
+    B.buildLShr(S64, SignBit, ExtShift);
   MI.eraseFromParent();
   return true;
 }
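The variable-width VGPR path above implements the extract purely with shifts: Src >> Offset moves the field down to bit 0, and the left/right shift pair by 64 - Width clears (unsigned) or sign-extends (signed) the bits above the field. A small scalar model of that expansion, assuming 1 <= Width <= 64 and Offset + Width <= 64 (the helper names are invented for illustration):

#include <cstdint>
#include <cstdio>

// Unsigned model: Src >> Offset << (64 - Width) >> (64 - Width).
static uint64_t ubfx64(uint64_t Src, unsigned Offset, unsigned Width) {
  unsigned ExtShift = 64 - Width; // Width >= 1, so ExtShift < 64
  return ((Src >> Offset) << ExtShift) >> ExtShift;
}

// Signed model: same shape, but the final shift is arithmetic, so the
// field's top bit is replicated through the high bits.
static int64_t sbfx64(int64_t Src, unsigned Offset, unsigned Width) {
  unsigned ExtShift = 64 - Width;
  // Do the left shift in an unsigned type to avoid signed-overflow UB.
  return (int64_t)((uint64_t)(Src >> Offset) << ExtShift) >> ExtShift;
}

int main() {
  // The 8-bit field at offset 4 of 0xABCD is 0xBC; the signed extract
  // sign-extends it (bit 7 is set) to -68.
  std::printf("0x%llx\n", (unsigned long long)ubfx64(0xABCD, 4, 8)); // 0xbc
  std::printf("%lld\n", (long long)sbfx64(0xABCD, 4, 8));            // -68
  return 0;
}

When the width is instead a known constant, the patch avoids this sequence and splits the 64-bit extract into a single 32-bit S/UBFX on the low or high half, as the if (ConstWidth) block above shows.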
