Skip to content

Commit 9293b65

Browse files
authored
[AMDGPU] SelectionDAG divergence tracking should take into account Target divergency. (#147560)
This is the next attempt to upstream #144947. The last one caused build errors in AArch64; that issue has been resolved.
1 parent 4647398 commit 9293b65

File tree

9 files changed

+102
-27
lines changed

9 files changed

+102
-27
lines changed

llvm/include/llvm/CodeGen/SelectionDAG.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,8 @@ class SelectionDAG {
238238
LLVMContext *Context;
239239
CodeGenOptLevel OptLevel;
240240

241+
bool DivergentTarget = false;
242+
241243
UniformityInfo *UA = nullptr;
242244
FunctionLoweringInfo * FLI = nullptr;
243245

@@ -471,14 +473,16 @@ class SelectionDAG {
471473
Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
472474
UniformityInfo *UA, ProfileSummaryInfo *PSIin,
473475
BlockFrequencyInfo *BFIin, MachineModuleInfo &MMI,
474-
FunctionVarLocs const *FnVarLocs);
476+
FunctionVarLocs const *FnVarLocs, bool HasDivergency);
475477

476478
void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE,
477479
MachineFunctionAnalysisManager &AM,
478480
const TargetLibraryInfo *LibraryInfo, UniformityInfo *UA,
479481
ProfileSummaryInfo *PSIin, BlockFrequencyInfo *BFIin,
480-
MachineModuleInfo &MMI, FunctionVarLocs const *FnVarLocs) {
481-
init(NewMF, NewORE, nullptr, LibraryInfo, UA, PSIin, BFIin, MMI, FnVarLocs);
482+
MachineModuleInfo &MMI, FunctionVarLocs const *FnVarLocs,
483+
bool HasDivergency) {
484+
init(NewMF, NewORE, nullptr, LibraryInfo, UA, PSIin, BFIin, MMI, FnVarLocs,
485+
HasDivergency);
482486
MFAM = &AM;
483487
}
484488

llvm/include/llvm/CodeGen/SelectionDAGISel.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,7 @@ class SelectionDAGISel {
5757
AssumptionCache *AC = nullptr;
5858
GCFunctionInfo *GFI = nullptr;
5959
SSPLayoutInfo *SP = nullptr;
60-
#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
6160
TargetTransformInfo *TTI = nullptr;
62-
#endif
6361
CodeGenOptLevel OptLevel;
6462
const TargetInstrInfo *TII;
6563
const TargetLowering *TLI;

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1370,7 +1370,7 @@ void SelectionDAG::init(MachineFunction &NewMF,
13701370
const TargetLibraryInfo *LibraryInfo,
13711371
UniformityInfo *NewUA, ProfileSummaryInfo *PSIin,
13721372
BlockFrequencyInfo *BFIin, MachineModuleInfo &MMIin,
1373-
FunctionVarLocs const *VarLocs) {
1373+
FunctionVarLocs const *VarLocs, bool HasDivergency) {
13741374
MF = &NewMF;
13751375
SDAGISelPass = PassPtr;
13761376
ORE = &NewORE;
@@ -1383,6 +1383,7 @@ void SelectionDAG::init(MachineFunction &NewMF,
13831383
BFI = BFIin;
13841384
MMI = &MMIin;
13851385
FnVarLocs = VarLocs;
1386+
DivergentTarget = HasDivergency;
13861387
}
13871388

13881389
SelectionDAG::~SelectionDAG() {
@@ -2329,7 +2330,8 @@ SDValue SelectionDAG::getRegister(Register Reg, EVT VT) {
23292330
return SDValue(E, 0);
23302331

23312332
auto *N = newSDNode<RegisterSDNode>(Reg, VTs);
2332-
N->SDNodeBits.IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, UA);
2333+
N->SDNodeBits.IsDivergent =
2334+
DivergentTarget && TLI->isSDNodeSourceOfDivergence(N, FLI, UA);
23332335
CSEMap.InsertNode(N, IP);
23342336
InsertNode(N);
23352337
return SDValue(N, 0);
@@ -12142,6 +12144,8 @@ static bool gluePropagatesDivergence(const SDNode *Node) {
1214212144
}
1214312145

1214412146
bool SelectionDAG::calculateDivergence(SDNode *N) {
12147+
if (!DivergentTarget)
12148+
return false;
1214512149
if (TLI->isSDNodeAlwaysUniform(N)) {
1214612150
assert(!TLI->isSDNodeSourceOfDivergence(N, FLI, UA) &&
1214712151
"Conflicting divergence information!");
@@ -12161,6 +12165,8 @@ bool SelectionDAG::calculateDivergence(SDNode *N) {
1216112165
}
1216212166

1216312167
void SelectionDAG::updateDivergence(SDNode *N) {
12168+
if (!DivergentTarget)
12169+
return;
1216412170
SmallVector<SDNode *, 16> Worklist(1, N);
1216512171
do {
1216612172
N = Worklist.pop_back_val();
@@ -13720,16 +13726,20 @@ void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
1372013726
Ops[I].setInitial(Vals[I]);
1372113727
EVT VT = Ops[I].getValueType();
1372213728

13729+
// Take care of the Node's operands iff target has divergence
1372313730
// Skip Chain. It does not carry divergence.
13724-
if (VT != MVT::Other &&
13731+
if (DivergentTarget && VT != MVT::Other &&
1372513732
(VT != MVT::Glue || gluePropagatesDivergence(Ops[I].getNode())) &&
1372613733
Ops[I].getNode()->isDivergent()) {
13734+
// Node is going to be divergent if at least one of its operands is
13735+
// divergent, unless it belongs to the "AlwaysUniform" exemptions.
1372713736
IsDivergent = true;
1372813737
}
1372913738
}
1373013739
Node->NumOperands = Vals.size();
1373113740
Node->OperandList = Ops;
13732-
if (!TLI->isSDNodeAlwaysUniform(Node)) {
13741+
// Check the divergence of the Node itself.
13742+
if (DivergentTarget && !TLI->isSDNodeAlwaysUniform(Node)) {
1373313743
IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, UA);
1373413744
Node->SDNodeBits.IsDivergent = IsDivergent;
1373513745
}

llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -482,7 +482,10 @@ void SelectionDAGISel::initializeAnalysisResults(
482482
MachineModuleInfo &MMI =
483483
MAMP.getCachedResult<MachineModuleAnalysis>(*Fn.getParent())->getMMI();
484484

485-
CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, MMI, FnVarLocs);
485+
TTI = &FAM.getResult<TargetIRAnalysis>(Fn);
486+
487+
CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, MMI, FnVarLocs,
488+
TTI->hasBranchDivergence(&Fn));
486489

487490
// Now get the optional analyzes if we want to.
488491
// This is based on the possibly changed OptLevel (after optnone is taken
@@ -500,10 +503,6 @@ void SelectionDAGISel::initializeAnalysisResults(
500503
BatchAA = std::nullopt;
501504

502505
SP = &FAM.getResult<SSPLayoutAnalysis>(Fn);
503-
504-
#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
505-
TTI = &FAM.getResult<TargetIRAnalysis>(Fn);
506-
#endif
507506
}
508507

509508
void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
@@ -539,7 +538,10 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
539538
MachineModuleInfo &MMI =
540539
MFP.getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
541540

542-
CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs);
541+
TTI = &MFP.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn);
542+
543+
CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs,
544+
TTI->hasBranchDivergence(&Fn));
543545

544546
// Now get the optional analyzes if we want to.
545547
// This is based on the possibly changed OptLevel (after optnone is taken
@@ -558,10 +560,6 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
558560
BatchAA = std::nullopt;
559561

560562
SP = &MFP.getAnalysis<StackProtector>().getLayoutInfo();
561-
562-
#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
563-
TTI = &MFP.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn);
564-
#endif
565563
}
566564

567565
bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {

llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,10 @@ define amdgpu_cs void @test_uniform_load_b96(ptr addrspace(1) %ptr, i32 %arg) "a
1818
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
1919
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x8
2020
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
21-
; GFX11-NEXT: v_mov_b32_e32 v2, s3
22-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
23-
; GFX11-NEXT: v_or3_b32 v2, s2, v2, s0
21+
; GFX11-NEXT: s_or_b32 s1, s2, s3
22+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
23+
; GFX11-NEXT: s_or_b32 s0, s0, s1
24+
; GFX11-NEXT: v_mov_b32_e32 v2, s0
2425
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
2526
; GFX11-NEXT: s_endpgm
2627
;
@@ -33,12 +34,14 @@ define amdgpu_cs void @test_uniform_load_b96(ptr addrspace(1) %ptr, i32 %arg) "a
3334
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
3435
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
3536
; GFX12-NEXT: v_readfirstlane_b32 s0, v2
36-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
37+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
3738
; GFX12-NEXT: v_readfirstlane_b32 s1, v3
3839
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
3940
; GFX12-NEXT: s_wait_kmcnt 0x0
41+
; GFX12-NEXT: s_or_b32 s0, s0, s1
42+
; GFX12-NEXT: s_or_b32 s0, s2, s0
43+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4044
; GFX12-NEXT: v_mov_b32_e32 v2, s0
41-
; GFX12-NEXT: v_or3_b32 v2, v2, s1, s2
4245
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
4346
; GFX12-NEXT: s_endpgm
4447
bb:
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GCN %s
3+
4+
declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32)
5+
6+
7+
define amdgpu_kernel void @test_isel_single_lane(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
8+
; GCN-LABEL: test_isel_single_lane:
9+
; GCN: ; %bb.0:
10+
; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
11+
; GCN-NEXT: s_wait_kmcnt 0x0
12+
; GCN-NEXT: s_load_b32 s4, s[0:1], 0x58
13+
; GCN-NEXT: s_wait_kmcnt 0x0
14+
; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
15+
; GCN-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[0:1] offset:16 th:TH_ATOMIC_RETURN
16+
; GCN-NEXT: s_wait_loadcnt 0x0
17+
; GCN-NEXT: v_readfirstlane_b32 s0, v1
18+
; GCN-NEXT: s_addk_co_i32 s0, 0xf4
19+
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
20+
; GCN-NEXT: s_lshl_b32 s1, s0, 4
21+
; GCN-NEXT: s_mul_i32 s0, s0, s1
22+
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
23+
; GCN-NEXT: s_lshl_b32 s0, s0, 12
24+
; GCN-NEXT: s_sub_co_i32 s0, s1, s0
25+
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
26+
; GCN-NEXT: v_mov_b32_e32 v1, s0
27+
; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
28+
; GCN-NEXT: s_endpgm
29+
%gep0 = getelementptr i32, ptr addrspace(1) %in, i32 22
30+
%val0 = load i32, ptr addrspace(1) %gep0, align 4
31+
%gep1 = getelementptr i32, ptr addrspace(1) %in, i32 4
32+
%val1 = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr addrspace(1) %gep1, i32 %val0)
33+
%res0 = add i32 %val1, 244
34+
%res1 = shl i32 %res0, 4
35+
%res2 = mul i32 %res0, %res1
36+
%res3 = shl i32 %res2, 12
37+
%res4 = sub i32 %res1, %res3
38+
store i32 %res4, ptr addrspace(1) %out
39+
ret void
40+
}
41+
42+
43+
attributes #0 = {
44+
"amdgpu-flat-work-group-size"="1,1"
45+
"amdgpu-waves-per-eu"="1,1"
46+
"uniform-work-group-size"="true"
47+
}

llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
1010
#include "llvm/Analysis/MemoryLocation.h"
1111
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
12+
#include "llvm/Analysis/TargetTransformInfo.h"
1213
#include "llvm/AsmParser/Parser.h"
1314
#include "llvm/CodeGen/MachineModuleInfo.h"
1415
#include "llvm/CodeGen/SelectionDAG.h"
@@ -78,8 +79,12 @@ class SelectionDAGAddressAnalysisTest : public testing::Test {
7879
if (!DAG)
7980
report_fatal_error("DAG?");
8081
OptimizationRemarkEmitter ORE(F);
82+
FunctionAnalysisManager FAM;
83+
FAM.registerPass([&] { return TM->getTargetIRAnalysis(); });
84+
85+
TargetTransformInfo TTI = TM->getTargetIRAnalysis().run(*F, FAM);
8186
DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, MMI,
82-
nullptr);
87+
nullptr, TTI.hasBranchDivergence(F));
8388
}
8489

8590
TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) {

llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
10+
#include "llvm/Analysis/TargetTransformInfo.h"
1011
#include "llvm/AsmParser/Parser.h"
1112
#include "llvm/CodeGen/MachineModuleInfo.h"
1213
#include "llvm/CodeGen/SDPatternMatch.h"
@@ -76,8 +77,12 @@ class SelectionDAGPatternMatchTest : public testing::Test {
7677
if (!DAG)
7778
report_fatal_error("DAG?");
7879
OptimizationRemarkEmitter ORE(F);
80+
FunctionAnalysisManager FAM;
81+
FAM.registerPass([&] { return TM->getTargetIRAnalysis(); });
82+
83+
TargetTransformInfo TTI = TM->getTargetIRAnalysis().run(*F, FAM);
7984
DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, MMI,
80-
nullptr);
85+
nullptr, TTI.hasBranchDivergence(F));
8186
}
8287

8388
TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) {

llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "AArch64SelectionDAGInfo.h"
99
#include "llvm/Analysis/MemoryLocation.h"
1010
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
11+
#include "llvm/Analysis/TargetTransformInfo.h"
1112
#include "llvm/AsmParser/Parser.h"
1213
#include "llvm/CodeGen/MachineModuleInfo.h"
1314
#include "llvm/CodeGen/SelectionDAG.h"
@@ -62,8 +63,12 @@ class AArch64SelectionDAGTest : public testing::Test {
6263
if (!DAG)
6364
report_fatal_error("DAG?");
6465
OptimizationRemarkEmitter ORE(F);
66+
FunctionAnalysisManager FAM;
67+
FAM.registerPass([&] { return TM->getTargetIRAnalysis(); });
68+
69+
TargetTransformInfo TTI = TM->getTargetIRAnalysis().run(*F, FAM);
6570
DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, MMI,
66-
nullptr);
71+
nullptr, TTI.hasBranchDivergence(F));
6772
}
6873

6974
TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) {

0 commit comments

Comments
 (0)