Skip to content

Commit fada12c

Browse files
AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize
1 parent 50133d4 commit fada12c

File tree

3 files changed

+127
-100
lines changed

3 files changed

+127
-100
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp

Lines changed: 106 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "GCNSubtarget.h"
2424
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
2525
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
26+
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
2627
#include "llvm/CodeGen/MachineFunctionPass.h"
2728
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
2829
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -137,7 +138,111 @@ class AMDGPURegBankLegalizeCombiner {
137138
return {MatchMI, MatchMI->getOperand(1).getReg()};
138139
}
139140

141+
/// Match a G_AMDGPU_READANYLANE defining \p Src whose input is defined by a
/// G_UNMERGE_VALUES (as located by getOpcodeDef).
///
/// \returns the unmerge and the def-operand index reported by
/// findRegisterDefOperandIdx for the read-any-lane input, or {nullptr, -1}
/// when the pattern does not match.
std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src) {
  MachineInstr *Def = MRI.getVRegDef(Src);
  if (Def->getOpcode() == AMDGPU::G_AMDGPU_READANYLANE) {
    Register LaneInput = Def->getOperand(1).getReg();
    GUnmerge *Producer = getOpcodeDef<GUnmerge>(LaneInput, MRI);
    if (Producer)
      return {Producer,
              Producer->findRegisterDefOperandIdx(LaneInput, nullptr)};
  }

  return {nullptr, -1};
}
152+
153+
/// Find the register that \p Src was read from through G_AMDGPU_READANYLANE,
/// either directly or through one of the merge/unmerge patterns matched
/// below.
///
/// \returns the read-any-lane input register, or a default-constructed
/// (invalid) Register when no pattern matches.
Register getReadAnyLaneSrc(Register Src) {
  // Src = G_AMDGPU_READANYLANE RALSrc
  auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
  if (RAL)
    return RALSrc;

  // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
  // LoSgpr = G_AMDGPU_READANYLANE LoVgpr
  // HiSgpr = G_AMDGPU_READANYLANE HiVgpr
  // Src = G_MERGE_VALUES LoSgpr, HiSgpr
  auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI);
  if (Merge) {
    unsigned NumElts = Merge->getNumSources();
    auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0));
    if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0)
      return {};

    // Check that all elements come from the same unmerge and that there is
    // no shuffling. An index of -1 (no matching def operand) never equals i.
    for (unsigned i = 1; i < NumElts; ++i) {
      auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i));
      if (UnmergeI != Unmerge || (unsigned)IdxI != i)
        return {};
    }
    return Unmerge->getSourceReg();
  }

  // ..., VgprI, ... = G_UNMERGE_VALUES VgprLarge
  // SgprI = G_AMDGPU_READANYLANE VgprI
  // SgprLarge = G_MERGE_VALUES ..., SgprI, ...
  // ..., Src, ... = G_UNMERGE_VALUES SgprLarge
  auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI);
  if (!UnMerge)
    return {};

  // getOpcodeDef can reach the unmerge through copies, in which case Src is
  // not one of its defs and the index lookup yields -1.
  int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr);
  if (Idx < 0)
    return {};

  // The merge must have exactly one source per unmerge result; otherwise
  // Idx does not name the corresponding merge source (and may be out of
  // range).
  auto *SrcMerge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI);
  if (!SrcMerge || UnMerge->getNumDefs() != SrcMerge->getNumSources())
    return {};

  // Same element count does not imply same element type; only forward the
  // value when the types agree.
  Register MergeElt = SrcMerge->getSourceReg(Idx);
  if (MRI.getType(Src) != MRI.getType(MergeElt))
    return {};

  auto [EltRAL, EltRALSrc] = tryMatch(MergeElt, AMDGPU::G_AMDGPU_READANYLANE);
  if (EltRAL)
    return EltRALSrc;

  return {};
}
197+
198+
/// Make \p Dst carry the value of \p Src: a virtual \p Dst is replaced by
/// \p Src through MRI.replaceRegWith, any other \p Dst gets a COPY built at
/// the builder's current insertion point.
void replaceRegWithOrBuildCopy(Register Dst, Register Src) {
  if (!Dst.isVirtual()) {
    B.buildCopy(Dst, Src);
    return;
  }

  MRI.replaceRegWith(Dst, Src);
}
204+
205+
/// Try to remove a read-any-lane round trip feeding \p Copy.
///
/// When the copy source (optionally through a single G_BITCAST) is produced
/// by one of the patterns recognised by getReadAnyLaneSrc, the copy is
/// rewritten to use the read-any-lane input directly and \p Copy is erased.
///
/// \returns true if \p Copy was combined and erased, false if nothing
/// changed.
bool tryEliminateReadAnyLane(MachineInstr &Copy) {
  Register Dst = Copy.getOperand(0).getReg();
  Register Src = Copy.getOperand(1).getReg();

  // Only copies into VGPRs may be rewritten: the bitcast below is created on
  // VgprRB, and the read-any-lane input is presumably a VGPR as well.
  // Forwarding it into an SGPR destination would change the bank of the
  // copied value.
  if (Dst.isVirtual()) {
    if (MRI.getRegBankOrNull(Dst) != VgprRB)
      return false;
  } else {
    const auto *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (!TRI->isVGPR(MRI, Dst))
      return false;
  }

  if (!Src.isVirtual())
    return false;

  // Look through a single bitcast between the read-any-lane and the copy.
  Register RALDst = Src;
  MachineInstr &SrcMI = *MRI.getVRegDef(Src);
  if (SrcMI.getOpcode() == AMDGPU::G_BITCAST)
    RALDst = SrcMI.getOperand(1).getReg();

  Register RALSrc = getReadAnyLaneSrc(RALDst);
  if (!RALSrc)
    return false;

  B.setInstr(Copy);
  if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) {
    // Src = READANYLANE RALSrc     Src = READANYLANE RALSrc
    // Dst = Copy Src               $Dst = Copy Src
    // ->                           ->
    // Dst = RALSrc                 $Dst = Copy RALSrc
    replaceRegWithOrBuildCopy(Dst, RALSrc);
  } else {
    // RALDst = READANYLANE RALSrc  RALDst = READANYLANE RALSrc
    // Src = G_BITCAST RALDst       Src = G_BITCAST RALDst
    // Dst = Copy Src               $Dst = Copy Src
    // ->                           ->
    // NewVgpr = G_BITCAST RALSrc   NewVgpr = G_BITCAST RALSrc
    // Dst = NewVgpr                $Dst = Copy NewVgpr
    auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
    replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0));
  }

  eraseInstr(Copy, MRI, nullptr);
  return true;
}
241+
140242
void tryCombineCopy(MachineInstr &MI) {
243+
if (tryEliminateReadAnyLane(MI))
244+
return;
245+
141246
Register Dst = MI.getOperand(0).getReg();
142247
Register Src = MI.getOperand(1).getReg();
143248
// Skip copies of physical registers.
@@ -160,24 +265,7 @@ class AMDGPURegBankLegalizeCombiner {
160265
auto One = B.buildConstant({SgprRB, S32}, 1);
161266
auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
162267
B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
163-
cleanUpAfterCombine(MI, Trunc);
164-
return;
165-
}
166-
167-
// Src = G_AMDGPU_READANYLANE RALSrc
168-
// Dst = COPY Src
169-
// ->
170-
// Dst = RALSrc
171-
if (MRI.getRegBankOrNull(Dst) == VgprRB &&
172-
MRI.getRegBankOrNull(Src) == SgprRB) {
173-
auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
174-
if (!RAL)
175-
return;
176-
177-
assert(MRI.getRegBank(RALSrc) == VgprRB);
178-
MRI.replaceRegWith(Dst, RALSrc);
179-
cleanUpAfterCombine(MI, RAL);
180-
return;
268+
eraseInstr(MI, MRI, nullptr);
181269
}
182270
}
183271

llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll

Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@ define amdgpu_ps float @readanylane_to_physical_vgpr(ptr addrspace(1) inreg %ptr
2020
; CHECK-NEXT: v_mov_b32_e32 v0, 0
2121
; CHECK-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
2222
; CHECK-NEXT: s_waitcnt vmcnt(0)
23-
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
24-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
2523
; CHECK-NEXT: ; return to shader part epilog
2624
%load = load volatile float, ptr addrspace(1) %ptr
2725
ret float %load
@@ -33,8 +31,6 @@ define amdgpu_ps void @readanylane_to_bitcast_to_virtual_vgpr(ptr addrspace(1) i
3331
; CHECK-NEXT: v_mov_b32_e32 v0, 0
3432
; CHECK-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
3533
; CHECK-NEXT: s_waitcnt vmcnt(0)
36-
; CHECK-NEXT: v_readfirstlane_b32 s0, v1
37-
; CHECK-NEXT: v_mov_b32_e32 v1, s0
3834
; CHECK-NEXT: global_store_dword v0, v1, s[2:3]
3935
; CHECK-NEXT: s_endpgm
4036
%load = load volatile <2 x i16>, ptr addrspace(1) %ptr0
@@ -49,8 +45,6 @@ define amdgpu_ps float @readanylane_to_bitcast_to_physical_vgpr(ptr addrspace(1)
4945
; CHECK-NEXT: v_mov_b32_e32 v0, 0
5046
; CHECK-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
5147
; CHECK-NEXT: s_waitcnt vmcnt(0)
52-
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
53-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
5448
; CHECK-NEXT: ; return to shader part epilog
5549
%load = load volatile <2 x i16>, ptr addrspace(1) %ptr0
5650
%bitcast = bitcast <2 x i16> %load to float
@@ -63,10 +57,6 @@ define amdgpu_ps void @unmerge_readanylane_merge_to_virtual_vgpr(ptr addrspace(1
6357
; CHECK-NEXT: v_mov_b32_e32 v2, 0
6458
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
6559
; CHECK-NEXT: s_waitcnt vmcnt(0)
66-
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
67-
; CHECK-NEXT: v_readfirstlane_b32 s1, v1
68-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
69-
; CHECK-NEXT: v_mov_b32_e32 v1, s1
7060
; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
7161
; CHECK-NEXT: s_endpgm
7262
%load = load volatile i64, ptr addrspace(1) %ptr0
@@ -85,10 +75,6 @@ define amdgpu_ps void @unmerge_readanylane_merge_bitcast_to_virtual_vgpr(ptr add
8575
; CHECK-NEXT: v_mov_b32_e32 v2, 0
8676
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
8777
; CHECK-NEXT: s_waitcnt vmcnt(0)
88-
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
89-
; CHECK-NEXT: v_readfirstlane_b32 s1, v1
90-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
91-
; CHECK-NEXT: v_mov_b32_e32 v1, s1
9278
; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
9379
; CHECK-NEXT: s_endpgm
9480
%load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
@@ -109,9 +95,7 @@ define amdgpu_ps void @unmerge_readanylane_merge_extract_to_virtual_vgpr(ptr add
10995
; CHECK-NEXT: v_mov_b32_e32 v2, 0
11096
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
11197
; CHECK-NEXT: s_waitcnt vmcnt(0)
112-
; CHECK-NEXT: v_readfirstlane_b32 s0, v1
113-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
114-
; CHECK-NEXT: global_store_dword v2, v0, s[2:3]
98+
; CHECK-NEXT: global_store_dword v2, v1, s[2:3]
11599
; CHECK-NEXT: s_endpgm
116100
%load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
117101
%extracted = extractelement <2 x i32> %load, i32 1
@@ -125,8 +109,7 @@ define amdgpu_ps float @unmerge_readanylane_merge_extract_to_physical_vgpr(ptr a
125109
; CHECK-NEXT: v_mov_b32_e32 v0, 0
126110
; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
127111
; CHECK-NEXT: s_waitcnt vmcnt(0)
128-
; CHECK-NEXT: v_readfirstlane_b32 s0, v1
129-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
112+
; CHECK-NEXT: v_mov_b32_e32 v0, v1
130113
; CHECK-NEXT: ; return to shader part epilog
131114
%load = load volatile <2 x float>, ptr addrspace(1) %ptr0
132115
%extracted = extractelement <2 x float> %load, i32 1
@@ -139,8 +122,6 @@ define amdgpu_ps void @unmerge_readanylane_merge_extract_bitcast_to_virtual_vgpr
139122
; CHECK-NEXT: v_mov_b32_e32 v2, 0
140123
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
141124
; CHECK-NEXT: s_waitcnt vmcnt(0)
142-
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
143-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
144125
; CHECK-NEXT: global_store_dword v2, v0, s[2:3]
145126
; CHECK-NEXT: s_endpgm
146127
%load = load volatile <4 x i16>, ptr addrspace(1) %ptr0
@@ -156,8 +137,6 @@ define amdgpu_ps float @unmerge_readanylane_merge_extract_bitcast_to_physical_vg
156137
; CHECK-NEXT: v_mov_b32_e32 v0, 0
157138
; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
158139
; CHECK-NEXT: s_waitcnt vmcnt(0)
159-
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
160-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
161140
; CHECK-NEXT: ; return to shader part epilog
162141
%load = load volatile <4 x i16>, ptr addrspace(1) %ptr0
163142
%extracted = shufflevector <4 x i16> %load, <4 x i16> %load, <2 x i32> <i32 0, i32 1>

0 commit comments

Comments
 (0)