Skip to content

Commit 0d2b47a

Browse files
authored
[AMDGPU][True16][CodeGen] stop emitting sgpr_lo16 from isel (#144819)
When true16 is enabled, isel starts to emit sgpr_lo16 registers when a trunc/sext i16/i32 is generated, or when a salu32 is used by a vgpr16 or vice versa. This causes a problem because sgpr_lo16 is not fully supported in the pipeline. True16 mode works fine in -O3 mode since the folding pass removes sgpr_lo16 from the pipeline. However, it hits a problem in -O0 mode because the folding pass is skipped. This patch: 1. stops emitting sgpr_lo16 from isel; 2. updates the codegen patterns to split uniform/divergent patterns for i16/i32 conversion; 3. updates the fix-sgpr-copy pass to address the legalization requirement in true16 mode, and updates the fix-sgpr-copies-f16-true16.mir test to include all possible combinations. This patch is tested with CTS and a downstream repo with -O0 testing.
1 parent 702784c commit 0d2b47a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+46201
-85038
lines changed

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1088,8 +1088,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10881088
assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
10891089
"We do not expect to see 16-bit copies from VGPR to SGPR unless "
10901090
"we have 16-bit VGPRs");
1091-
assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
1092-
MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
1091+
assert(MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
10931092
MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
10941093
// There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
10951094
MRI->setRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7261,7 +7261,8 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
72617261
MachineBasicBlock *MBB = MI.getParent();
72627262
// Legalize operands and check for size mismatch
72637263
if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7264-
OpIdx >= get(Opcode).getNumOperands())
7264+
OpIdx >= get(Opcode).getNumOperands() ||
7265+
get(Opcode).operands()[OpIdx].RegClass == -1)
72657266
return;
72667267

72677268
MachineOperand &Op = MI.getOperand(OpIdx);
@@ -7820,15 +7821,21 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
78207821
// that copies will end up as machine instructions and not be
78217822
// eliminated.
78227823
addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7823-
MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7824-
MRI.clearKillFlags(Inst.getOperand(1).getReg());
7824+
Register NewDstReg = Inst.getOperand(1).getReg();
7825+
MRI.replaceRegWith(DstReg, NewDstReg);
7826+
MRI.clearKillFlags(NewDstReg);
78257827
Inst.getOperand(0).setReg(DstReg);
78267828
// Make sure we don't leave around a dead VGPR->SGPR copy. Normally
78277829
// these are deleted later, but at -O0 it would leave a suspicious
78287830
// looking illegal copy of an undef register.
78297831
for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
78307832
Inst.removeOperand(I);
78317833
Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7834+
// Legalize t16 operand since replaceReg is called after addUsersToVALU
7835+
for (MachineOperand &MO :
7836+
make_early_inc_range(MRI.use_operands(NewDstReg))) {
7837+
legalizeOperandsVALUt16(*MO.getParent(), MRI);
7838+
}
78327839
return;
78337840
}
78347841

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3557,9 +3557,7 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
35573557

35583558
const TargetRegisterClass *
35593559
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
3560-
if (BitWidth == 16)
3561-
return &AMDGPU::SGPR_LO16RegClass;
3562-
if (BitWidth == 32)
3560+
if (BitWidth == 16 || BitWidth == 32)
35633561
return &AMDGPU::SReg_32RegClass;
35643562
if (BitWidth == 64)
35653563
return &AMDGPU::SReg_64RegClass;

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1552,17 +1552,34 @@ def : GCNPat<
15521552
>;
15531553

15541554
def : GCNPat<
1555-
(i64 (anyext i16:$src)),
1555+
(i64 (UniformUnaryFrag<anyext> i16:$src)),
1556+
(REG_SEQUENCE VReg_64,
1557+
(i32 (COPY $src)), sub0,
1558+
(V_MOV_B32_e32 (i32 0)), sub1)
1559+
>;
1560+
1561+
def : GCNPat<
1562+
(i64 (DivergentUnaryFrag<anyext> i16:$src)),
15561563
(REG_SEQUENCE VReg_64, $src, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
15571564
>;
15581565

15591566
def : GCNPat<
1560-
(i16 (trunc i32:$src)),
1567+
(i16 (UniformUnaryFrag<trunc> i32:$src)),
1568+
(COPY $src)
1569+
>;
1570+
1571+
def : GCNPat<
1572+
(i16 (DivergentUnaryFrag<trunc> i32:$src)),
15611573
(EXTRACT_SUBREG $src, lo16)
15621574
>;
15631575

15641576
def : GCNPat <
1565-
(i16 (trunc i64:$src)),
1577+
(i16 (UniformUnaryFrag<trunc> i64:$src)),
1578+
(EXTRACT_SUBREG $src, sub0)
1579+
>;
1580+
1581+
def : GCNPat <
1582+
(i16 (DivergentUnaryFrag<trunc> i64:$src)),
15661583
(EXTRACT_SUBREG $src, lo16)
15671584
>;
15681585

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll

Lines changed: 16460 additions & 28699 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)