
Commit fcd0dc7

AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize
1 parent a029d8b commit fcd0dc7

3 files changed: +139 −100 lines changed
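
The new combine removes G_AMDGPU_READANYLANE instructions whose only purpose is to move a value to an sgpr before it is copied straight back to a vgpr. As a rough MIR-style sketch (register names and types here are illustrative, not taken from the patch), the simplest shape it targets is:

    %ral:sgpr(s32) = G_AMDGPU_READANYLANE %val
    %dst:vgpr(s32) = COPY %ral
    ; -> after the combine, users of %dst use the vgpr %val directly, so the
    ;    READANYLANE and the COPY become dead and are erased

The unmerge/readanylane/merge and bitcast variants handled in the code below follow the same idea; see the sketch after the C++ diff.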

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp

Lines changed: 118 additions & 18 deletions
@@ -23,6 +23,7 @@
 #include "GCNSubtarget.h"
 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
 #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineUniformityAnalysis.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -137,7 +138,123 @@ class AMDGPURegBankLegalizeCombiner {
     return {MatchMI, MatchMI->getOperand(1).getReg()};
   }

+  std::tuple<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src) {
+    auto *ReadAnyLane = MRI.getVRegDef(Src);
+    if (ReadAnyLane->getOpcode() == AMDGPU::G_AMDGPU_READANYLANE) {
+      Register RALSrc = ReadAnyLane->getOperand(1).getReg();
+      auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI);
+      if (UnMerge)
+        return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)};
+    }
+    return {nullptr, -1};
+  }
+
+  Register getReadAnyLaneSrc(Register Src) {
+    // Src = G_AMDGPU_READANYLANE RALSrc
+    auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
+    if (RAL)
+      return RALSrc;
+
+    // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
+    // LoSgpr = G_AMDGPU_READANYLANE LoVgpr
+    // HiSgpr = G_AMDGPU_READANYLANE HiVgpr
+    // Src = G_MERGE_VALUES LoSgpr, HiSgpr
+    auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI);
+    if (Merge) {
+      unsigned NumElts = Merge->getNumSources();
+      auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0));
+      if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0)
+        return {};
+
+      // Check that all elements come from the same unmerge with no shuffling.
+      for (unsigned i = 1; i < NumElts; ++i) {
+        auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i));
+        if (UnmergeI != Unmerge || (unsigned)IdxI != i)
+          return {};
+      }
+      return Unmerge->getSourceReg();
+    }
+
+    // ..., VgprI, ... = G_UNMERGE_VALUES VgprLarge
+    // SgprI = G_AMDGPU_READANYLANE VgprI
+    // SgprLarge = G_MERGE_VALUES ..., SgprI, ...
+    // ..., Src, ... = G_UNMERGE_VALUES SgprLarge
+    auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI);
+    if (UnMerge) {
+      int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr);
+      auto *Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI);
+      if (Merge) {
+        auto [RAL, RALSrc] =
+            tryMatch(Merge->getSourceReg(Idx), AMDGPU::G_AMDGPU_READANYLANE);
+        if (RAL)
+          return RALSrc;
+      }
+    }
+
+    return {};
+  }
+
+  bool tryEliminateReadAnyLane(MachineInstr &Copy) {
+    Register Dst = Copy.getOperand(0).getReg();
+    Register Src = Copy.getOperand(1).getReg();
+    if (!Src.isVirtual())
+      return false;
+
+    Register RALDst = Src;
+    MachineInstr &SrcMI = *MRI.getVRegDef(Src);
+    if (SrcMI.getOpcode() == AMDGPU::G_BITCAST) {
+      RALDst = SrcMI.getOperand(1).getReg();
+    }
+
+    Register RALSrc = getReadAnyLaneSrc(RALDst);
+    if (!RALSrc)
+      return false;
+
+    if (Dst.isVirtual()) {
+      if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) {
+        // Src = READANYLANE RALSrc
+        // Dst = Copy Src
+        // ->
+        // Dst = RALSrc
+        MRI.replaceRegWith(Dst, RALSrc);
+      } else {
+        // RALDst = READANYLANE RALSrc
+        // Src = G_BITCAST RALDst
+        // Dst = Copy Src
+        // ->
+        // NewVgpr = G_BITCAST RALSrc
+        // Dst = NewVgpr
+        auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
+        MRI.replaceRegWith(Dst, Bitcast.getReg(0));
+      }
+    } else {
+      B.setInstr(Copy);
+      if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) {
+        // Src = READANYLANE RALSrc
+        // $Dst = Copy Src
+        // ->
+        // $Dst = Copy RALSrc
+        B.buildCopy(Dst, RALSrc);
+      } else {
+        // RALDst = READANYLANE RALSrc
+        // Src = G_BITCAST RALDst
+        // $Dst = Copy Src
+        // ->
+        // NewVgpr = G_BITCAST RALSrc
+        // $Dst = Copy NewVgpr
+        auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
+        B.buildCopy(Dst, Bitcast.getReg(0));
+      }
+    }
+
+    eraseInstr(Copy, MRI, nullptr);
+    return true;
+  }
+
   void tryCombineCopy(MachineInstr &MI) {
+    if (tryEliminateReadAnyLane(MI))
+      return;
+
     Register Dst = MI.getOperand(0).getReg();
     Register Src = MI.getOperand(1).getReg();
     // Skip copies of physical registers.
@@ -160,24 +277,7 @@ class AMDGPURegBankLegalizeCombiner {
       auto One = B.buildConstant({SgprRB, S32}, 1);
       auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
       B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
-      cleanUpAfterCombine(MI, Trunc);
-      return;
-    }
-
-    // Src = G_AMDGPU_READANYLANE RALSrc
-    // Dst = COPY Src
-    // ->
-    // Dst = RALSrc
-    if (MRI.getRegBankOrNull(Dst) == VgprRB &&
-        MRI.getRegBankOrNull(Src) == SgprRB) {
-      auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
-      if (!RAL)
-        return;
-
-      assert(MRI.getRegBank(RALSrc) == VgprRB);
-      MRI.replaceRegWith(Dst, RALSrc);
-      cleanUpAfterCombine(MI, RAL);
-      return;
+      eraseInstr(MI, MRI, nullptr);
     }
   }
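
For wider types, getReadAnyLaneSrc above looks through an unmerge/readanylane/merge chain. A hedged MIR-style sketch of the 64-bit case (register names are made up for illustration; the shape matches the unmerge_readanylane_merge_to_virtual_vgpr test below):

    %lo:vgpr(s32), %hi:vgpr(s32) = G_UNMERGE_VALUES %val64
    %slo:sgpr(s32) = G_AMDGPU_READANYLANE %lo
    %shi:sgpr(s32) = G_AMDGPU_READANYLANE %hi
    %s64:sgpr(s64) = G_MERGE_VALUES %slo, %shi
    %dst:vgpr(s64) = COPY %s64
    ; -> after the combine, users of %dst read the vgpr %val64 directly, which
    ;    is why the v_readfirstlane/v_mov pairs disappear from the checks below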

llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll

Lines changed: 2 additions & 23 deletions
@@ -20,8 +20,6 @@ define amdgpu_ps float @readanylane_to_physical_vgpr(ptr addrspace(1) inreg %ptr
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
 ; CHECK-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
 ; CHECK-NEXT: ; return to shader part epilog
   %load = load volatile float, ptr addrspace(1) %ptr
   ret float %load
@@ -33,8 +31,6 @@ define amdgpu_ps void @readanylane_to_bitcast_to_virtual_vgpr(ptr addrspace(1) i
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
 ; CHECK-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v1
-; CHECK-NEXT: v_mov_b32_e32 v1, s0
 ; CHECK-NEXT: global_store_dword v0, v1, s[2:3]
 ; CHECK-NEXT: s_endpgm
   %load = load volatile <2 x i16>, ptr addrspace(1) %ptr0
@@ -49,8 +45,6 @@ define amdgpu_ps float @readanylane_to_bitcast_to_physical_vgpr(ptr addrspace(1)
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
 ; CHECK-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
 ; CHECK-NEXT: ; return to shader part epilog
   %load = load volatile <2 x i16>, ptr addrspace(1) %ptr0
   %bitcast = bitcast <2 x i16> %load to float
@@ -63,10 +57,6 @@ define amdgpu_ps void @unmerge_readanylane_merge_to_virtual_vgpr(ptr addrspace(1
 ; CHECK-NEXT: v_mov_b32_e32 v2, 0
 ; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: v_mov_b32_e32 v1, s1
 ; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
 ; CHECK-NEXT: s_endpgm
   %load = load volatile i64, ptr addrspace(1) %ptr0
@@ -85,10 +75,6 @@ define amdgpu_ps void @unmerge_readanylane_merge_bitcast_to_virtual_vgpr(ptr add
 ; CHECK-NEXT: v_mov_b32_e32 v2, 0
 ; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: v_mov_b32_e32 v1, s1
 ; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
 ; CHECK-NEXT: s_endpgm
   %load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
@@ -109,9 +95,7 @@ define amdgpu_ps void @unmerge_readanylane_merge_extract_to_virtual_vgpr(ptr add
 ; CHECK-NEXT: v_mov_b32_e32 v2, 0
 ; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v1
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: global_store_dword v2, v0, s[2:3]
+; CHECK-NEXT: global_store_dword v2, v1, s[2:3]
 ; CHECK-NEXT: s_endpgm
   %load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
   %extracted = extractelement <2 x i32> %load, i32 1
@@ -125,8 +109,7 @@ define amdgpu_ps float @unmerge_readanylane_merge_extract_to_physical_vgpr(ptr a
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
 ; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v1
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v0, v1
 ; CHECK-NEXT: ; return to shader part epilog
   %load = load volatile <2 x float>, ptr addrspace(1) %ptr0
   %extracted = extractelement <2 x float> %load, i32 1
@@ -139,8 +122,6 @@ define amdgpu_ps void @unmerge_readanylane_merge_extract_bitcast_to_virtual_vgpr
 ; CHECK-NEXT: v_mov_b32_e32 v2, 0
 ; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
 ; CHECK-NEXT: global_store_dword v2, v0, s[2:3]
 ; CHECK-NEXT: s_endpgm
   %load = load volatile <4 x i16>, ptr addrspace(1) %ptr0
@@ -156,8 +137,6 @@ define amdgpu_ps float @unmerge_readanylane_merge_extract_bitcast_to_physical_vg
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
 ; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
 ; CHECK-NEXT: ; return to shader part epilog
   %load = load volatile <4 x i16>, ptr addrspace(1) %ptr0
   %extracted = shufflevector <4 x i16> %load, <4 x i16> %load, <2 x i32> <i32 0, i32 1>

0 commit comments
