Skip to content

Commit 6ddf2a8

Browse files
committed
[AMDGPU] Adjust wave priority based on VMEM instructions to avoid duty-cycling.
As older waves execute long sequences of VALU instructions, they may prevent younger waves from performing their address calculations and then issuing their VMEM loads, which in turn leaves the VALU unit idle. This patch tries to prevent that by temporarily raising the wave's priority. Reviewed By: foad. Differential Revision: https://reviews.llvm.org/D124246
1 parent 6e8dda0 commit 6ddf2a8

File tree

5 files changed

+329
-0
lines changed

5 files changed

+329
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,9 @@ extern char &GCNNSAReassignID;
331331
void initializeGCNPreRAOptimizationsPass(PassRegistry &);
332332
extern char &GCNPreRAOptimizationsID;
333333

334+
FunctionPass *createAMDGPUSetWavePriorityPass();
335+
void initializeAMDGPUSetWavePriorityPass(PassRegistry &);
336+
334337
namespace AMDGPU {
335338
enum TargetIndex {
336339
TI_CONSTDATA_START,
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
//===- AMDGPUSetWavePriority.cpp - Set wave priority ----------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
/// \file
10+
/// Pass to temporarily raise the wave priority from the start of
11+
/// the shader function until its last VMEM instructions to allow younger
12+
/// waves to issue their VMEM instructions as well.
13+
//
14+
//===----------------------------------------------------------------------===//
15+
16+
#include "AMDGPU.h"
17+
#include "GCNSubtarget.h"
18+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19+
#include "SIInstrInfo.h"
20+
#include "llvm/ADT/PostOrderIterator.h"
21+
#include "llvm/CodeGen/MachineFunctionPass.h"
22+
#include "llvm/InitializePasses.h"
23+
#include "llvm/Support/Allocator.h"
24+
25+
using namespace llvm;
26+
27+
#define DEBUG_TYPE "amdgpu-set-wave-priority"
28+
29+
namespace {
30+
31+
/// Per-basic-block fact tracked by the wave priority pass.
struct MBBInfo {
  /// True once the block has been marked as one from which control may reach
  /// a VMEM load. Blocks start out unmarked.
  bool MayReachVMEMLoad = false;

  MBBInfo() = default;
};
35+
36+
using MBBInfoSet = DenseMap<const MachineBasicBlock *, MBBInfo>;
37+
38+
class AMDGPUSetWavePriority : public MachineFunctionPass {
39+
public:
40+
static char ID;
41+
42+
AMDGPUSetWavePriority() : MachineFunctionPass(ID) {}
43+
44+
StringRef getPassName() const override { return "Set wave priority"; }
45+
46+
bool runOnMachineFunction(MachineFunction &MF) override;
47+
48+
private:
49+
MachineInstr *BuildSetprioMI(MachineFunction &MF, unsigned priority) const;
50+
51+
const SIInstrInfo *TII;
52+
};
53+
54+
} // End anonymous namespace.
55+
56+
INITIALIZE_PASS(AMDGPUSetWavePriority, DEBUG_TYPE, "Set wave priority", false,
57+
false)
58+
59+
char AMDGPUSetWavePriority::ID = 0;
60+
61+
/// Factory for the wave priority pass. Returns a newly allocated pass
/// instance; the caller takes ownership of it.
FunctionPass *llvm::createAMDGPUSetWavePriorityPass() {
  FunctionPass *Pass = new AMDGPUSetWavePriority();
  return Pass;
}
64+
65+
/// Builds a detached S_SETPRIO instruction in \p MF whose only explicit
/// operand is the immediate \p priority. The instruction carries an empty
/// debug location and is not inserted into any basic block here.
MachineInstr *AMDGPUSetWavePriority::BuildSetprioMI(MachineFunction &MF,
                                                    unsigned priority) const {
  const MCInstrDesc &SetprioDesc = TII->get(AMDGPU::S_SETPRIO);
  MachineInstrBuilder Builder = BuildMI(MF, DebugLoc(), SetprioDesc);
  Builder.addImm(priority);
  return Builder;
}
69+
70+
// Checks that for every predecessor Pred that can reach a VMEM load,
71+
// none of Pred's successors can reach a VMEM load.
72+
static bool CanLowerPriorityDirectlyInPredecessors(const MachineBasicBlock &MBB,
73+
MBBInfoSet &MBBInfos) {
74+
for (const MachineBasicBlock *Pred : MBB.predecessors()) {
75+
if (!MBBInfos[Pred].MayReachVMEMLoad)
76+
continue;
77+
for (const MachineBasicBlock *Succ : Pred->successors()) {
78+
if (MBBInfos[Succ].MayReachVMEMLoad)
79+
return false;
80+
}
81+
}
82+
return true;
83+
}
84+
85+
static bool isVMEMLoad(const MachineInstr &MI) {
86+
return SIInstrInfo::isVMEM(MI) && MI.mayLoad();
87+
}
88+
89+
/// Inserts S_SETPRIO instructions into \p MF: one raising the priority in the
/// entry block and one lowering it in every block where control leaves the
/// region from which VMEM loads are still reachable.
///
/// Does nothing for skipped functions, for functions whose calling convention
/// is not an entry-function one, and for functions whose entry block cannot
/// reach a VMEM load.
///
/// \returns true if the function was modified.
bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) {
  const unsigned HighPriority = 3;
  const unsigned LowPriority = 0;

  Function &F = MF.getFunction();
  if (skipFunction(F))
    return false;
  if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    return false;

  TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();

  // Seed the worklist with every block that itself contains a VMEM load.
  MBBInfoSet MBBInfos;
  SmallVector<const MachineBasicBlock *, 16> Pending;
  for (MachineBasicBlock &Block : MF) {
    if (any_of(Block, isVMEMLoad))
      Pending.push_back(&Block);
  }

  // Propagate backwards through predecessors: any block with a path to a
  // VMEM load gets marked. Already-marked blocks are not revisited.
  while (!Pending.empty()) {
    const MachineBasicBlock *Block = Pending.pop_back_val();
    MBBInfo &BlockInfo = MBBInfos[Block];
    if (BlockInfo.MayReachVMEMLoad)
      continue;
    BlockInfo.MayReachVMEMLoad = true;
    Pending.append(Block->pred_begin(), Block->pred_end());
  }

  // Nothing to do if no VMEM load is reachable from the entry block.
  MachineBasicBlock &Entry = MF.front();
  if (!MBBInfos[&Entry].MayReachVMEMLoad)
    return false;

  // Raise the priority in the entry block, immediately before its first
  // VALU instruction or terminator (or at the end if it has neither).
  MachineBasicBlock::iterator RaisePos = Entry.begin();
  for (MachineBasicBlock::iterator EntryEnd = Entry.end(); RaisePos != EntryEnd;
       ++RaisePos) {
    if (SIInstrInfo::isVALU(*RaisePos) || RaisePos->isTerminator())
      break;
  }
  Entry.insert(RaisePos, BuildSetprioMI(MF, HighPriority));

  // Collect the blocks in which the priority has to be lowered, i.e. the
  // places where control leaves blocks from which VMEM loads are reachable.
  SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks;
  for (MachineBasicBlock &Block : MF) {
    if (MBBInfos[&Block].MayReachVMEMLoad) {
      // A marked block with no successors is where the function ends.
      if (Block.succ_empty())
        PriorityLoweringBlocks.insert(&Block);
      continue;
    }

    if (!CanLowerPriorityDirectlyInPredecessors(Block, MBBInfos)) {
      // Lowering in the predecessors is not possible here. Either this
      // block was never part of a loop, or loop simplification /
      // canonicalization should already have tried to split the edge and
      // insert a preheader; if that did not happen for whatever reason, the
      // only remaining option is to lower the priority in this block
      // itself, even if that places it inside a loop.
      PriorityLoweringBlocks.insert(&Block);
      continue;
    }

    // Lower the priority in each marked predecessor instead.
    for (MachineBasicBlock *Pred : Block.predecessors()) {
      if (MBBInfos[Pred].MayReachVMEMLoad)
        PriorityLoweringBlocks.insert(Pred);
    }
  }

  // In each selected block, place the lowering S_SETPRIO right after the
  // last VMEM load, or at the very beginning if the block has none.
  for (MachineBasicBlock *Block : PriorityLoweringBlocks) {
    const MachineBasicBlock::iterator BlockBegin = Block->begin();
    MachineBasicBlock::iterator LowerPos = Block->end();
    while (LowerPos != BlockBegin) {
      MachineBasicBlock::iterator Prev = LowerPos;
      --Prev;
      if (isVMEMLoad(*Prev))
        break;
      LowerPos = Prev;
    }
    Block->insert(LowerPos, BuildSetprioMI(MF, LowPriority));
  }

  return true;
}

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,10 @@ EnableDCEInRA("amdgpu-dce-in-ra",
277277
cl::init(true), cl::Hidden,
278278
cl::desc("Enable machine DCE inside regalloc"));
279279

280+
static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
281+
cl::desc("Adjust wave priority"),
282+
cl::init(false), cl::Hidden);
283+
280284
static cl::opt<bool> EnableScalarIRPasses(
281285
"amdgpu-scalar-ir-passes",
282286
cl::desc("Enable scalar IR passes"),
@@ -1360,6 +1364,8 @@ void GCNPassConfig::addPreEmitPass() {
13601364
addPass(&SIInsertHardClausesID);
13611365

13621366
addPass(&SILateBranchLoweringPassID);
1367+
if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
1368+
addPass(createAMDGPUSetWavePriorityPass());
13631369
if (getOptLevel() > CodeGenOpt::None)
13641370
addPass(&SIPreEmitPeepholeID);
13651371
// The hazard recognizer that runs as part of the post-ra scheduler does not

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ add_llvm_target(AMDGPUCodeGen
8989
AMDGPUReplaceLDSUseWithPointer.cpp
9090
AMDGPUResourceUsageAnalysis.cpp
9191
AMDGPURewriteOutArguments.cpp
92+
AMDGPUSetWavePriority.cpp
9293
AMDGPUSubtarget.cpp
9394
AMDGPUTargetMachine.cpp
9495
AMDGPUTargetObjectFile.cpp
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
; RUN: llc -mtriple=amdgcn -amdgpu-set-wave-priority=true -o - %s | \
2+
; RUN: FileCheck %s
3+
4+
; CHECK-LABEL: no_setprio:
5+
; CHECK-NOT: s_setprio
6+
; CHECK: ; return to shader part epilog
7+
define amdgpu_ps <2 x float> @no_setprio() {
8+
ret <2 x float> <float 0.0, float 0.0>
9+
}
10+
11+
; CHECK-LABEL: vmem_in_exit_block:
12+
; CHECK: s_setprio 3
13+
; CHECK: buffer_load_dwordx2
14+
; CHECK-NEXT: s_setprio 0
15+
; CHECK: ; return to shader part epilog
16+
define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p) {
17+
%v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
18+
ret <2 x float> %v
19+
}
20+
21+
; CHECK-LABEL: branch:
22+
; CHECK: s_setprio 3
23+
; CHECK: s_cbranch_scc0 [[A:.*]]
24+
; CHECK: {{.*}}: ; %b
25+
; CHECK: buffer_load_dwordx2
26+
; CHECK-NEXT: s_setprio 0
27+
; CHECK: s_branch [[EXIT:.*]]
28+
; CHECK: [[A]]: ; %a
29+
; CHECK-NEXT: s_setprio 0
30+
; CHECK: s_branch [[EXIT]]
31+
; CHECK-NEXT: [[EXIT]]:
32+
define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i) {
33+
%cond = icmp eq i32 %i, 0
34+
br i1 %cond, label %a, label %b
35+
36+
a:
37+
ret <2 x float> <float 0.0, float 0.0>
38+
39+
b:
40+
%v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
41+
ret <2 x float> %v
42+
}
43+
44+
; CHECK-LABEL: setprio_follows_setprio:
45+
; CHECK: s_setprio 3
46+
; CHECK: buffer_load_dwordx2
47+
; CHECK: s_cbranch_scc1 [[C:.*]]
48+
; CHECK: {{.*}}: ; %a
49+
; CHECK: buffer_load_dwordx2
50+
; CHECK-NEXT: s_setprio 0
51+
; CHECK: s_cbranch_scc1 [[C]]
52+
; CHECK: {{.*}}: ; %b
53+
; CHECK-NOT: s_setprio
54+
; CHECK: s_branch [[EXIT:.*]]
55+
; CHECK: [[C]]: ; %c
56+
; CHECK-NEXT: s_setprio 0
57+
; CHECK: s_branch [[EXIT]]
58+
; CHECK: [[EXIT]]:
59+
define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) {
60+
entry:
61+
%v1 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
62+
%cond1 = icmp ne i32 %i, 0
63+
br i1 %cond1, label %a, label %c
64+
65+
a:
66+
%v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
67+
%cond2 = icmp ne i32 %i, 1
68+
br i1 %cond2, label %b, label %c
69+
70+
b:
71+
ret <2 x float> %v2
72+
73+
c:
74+
%v3 = phi <2 x float> [%v1, %entry], [%v2, %a]
75+
%v4 = fadd <2 x float> %v1, %v3
76+
ret <2 x float> %v4
77+
}
78+
79+
; CHECK-LABEL: loop:
80+
; CHECK: {{.*}}: ; %entry
81+
; CHECK: s_setprio 3
82+
; CHECK-NOT: s_setprio
83+
; CHECK: [[LOOP:.*]]: ; %loop
84+
; CHECK-NOT: s_setprio
85+
; CHECK: buffer_load_dwordx2
86+
; CHECK-NOT: s_setprio
87+
; CHECK: s_cbranch_scc1 [[LOOP]]
88+
; CHECK-NEXT: {{.*}}: ; %exit
89+
; CHECK-NEXT: s_setprio 0
90+
define amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) {
91+
entry:
92+
br label %loop
93+
94+
loop:
95+
%i = phi i32 [0, %entry], [%i2, %loop]
96+
%sum = phi <2 x float> [<float 0.0, float 0.0>, %entry], [%sum2, %loop]
97+
98+
%i2 = add i32 %i, 1
99+
100+
%v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 %i, i32 0, i32 0, i32 0)
101+
%sum2 = fadd <2 x float> %sum, %v
102+
103+
%cond = icmp ult i32 %i2, 5
104+
br i1 %cond, label %loop, label %exit
105+
106+
exit:
107+
ret <2 x float> %sum2
108+
}
109+
110+
; CHECK-LABEL: edge_split:
111+
; CHECK: s_setprio 3
112+
; CHECK: buffer_load_dwordx2
113+
; CHECK-NOT: s_setprio
114+
; CHECK: s_cbranch_scc1 [[ANOTHER_LOAD:.*]]
115+
; CHECK: {{.*}}: ; %loop.preheader
116+
; CHECK-NEXT: s_setprio 0
117+
; CHECK: [[LOOP:.*]]: ; %loop
118+
; CHECK-NOT: s_setprio
119+
; CHECK: s_cbranch_scc1 [[LOOP]]
120+
; CHECK: {{.*}}: ; %exit
121+
; CHECK-NOT: s_setprio
122+
; CHECK: s_branch [[RET:.*]]
123+
; CHECK: [[ANOTHER_LOAD]]: ; %another_load
124+
; CHECK: buffer_load_dwordx2
125+
; CHECK-NEXT: s_setprio 0
126+
; CHECK: s_branch [[RET]]
127+
; CHECK: [[RET]]:
128+
define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) {
129+
entry:
130+
%v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
131+
%cond = icmp ne i32 %x, 0
132+
br i1 %cond, label %loop, label %another_load
133+
134+
loop:
135+
%i = phi i32 [0, %entry], [%i2, %loop]
136+
%mul = phi <2 x float> [%v, %entry], [%mul2, %loop]
137+
138+
%i2 = add i32 %i, 1
139+
%mul2 = fmul <2 x float> %mul, %v
140+
141+
%cond2 = icmp ult i32 %i2, 5
142+
br i1 %cond2, label %loop, label %exit
143+
144+
exit:
145+
ret <2 x float> %mul2
146+
147+
another_load:
148+
%v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
149+
%sum = fadd <2 x float> %v, %v2
150+
ret <2 x float> %sum
151+
}
152+
153+
declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) nounwind

0 commit comments

Comments
 (0)