Skip to content

Commit 129faec

Browse files
committed
[OpenMP] Identify non-aligned barriers executed in an aligned context
Even if a barrier does not enforce aligned execution, it effectively behaves like an aligned barrier if it is executed by all threads in an aligned way. We lack control flow divergence analysis here, so we can only do (basic block) local reasoning for now.
1 parent 0326ee7 commit 129faec

File tree

4 files changed

+163
-15
lines changed

4 files changed

+163
-15
lines changed

llvm/include/llvm/Transforms/IPO/Attributor.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3332,9 +3332,12 @@ struct AANoSync
33323332
/// Helper function specific for intrinsics which are potentially volatile.
33333333
static bool isNoSyncIntrinsic(const Instruction *I);
33343334

3335-
/// Helper function to determine if \p CB is an aligned (GPU) barrier.
3336-
/// Aligned barriers have to be executed by all threads.
3337-
static bool isAlignedBarrier(const CallBase &CB);
3335+
/// Helper function to determine if \p CB is an aligned (GPU) barrier. Aligned
3336+
/// barriers have to be executed by all threads. The flag \p ExecutedAligned
3337+
/// indicates if the call is executed by all threads in a (thread) block in an
3338+
/// aligned way. If that is the case, non-aligned barriers are effectively
3339+
/// aligned barriers.
3340+
static bool isAlignedBarrier(const CallBase &CB, bool ExecutedAligned);
33383341

33393342
/// Create an abstract attribute view for the position \p IRP.
33403343
static AANoSync &createForPosition(const IRPosition &IRP, Attributor &A);

llvm/lib/Transforms/IPO/AttributorAttributes.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2226,14 +2226,15 @@ struct AAReturnedValuesCallSite final : AAReturnedValuesImpl {
22262226

22272227
/// ------------------------ NoSync Function Attribute -------------------------
22282228

2229-
bool AANoSync::isAlignedBarrier(const CallBase &CB) {
2229+
bool AANoSync::isAlignedBarrier(const CallBase &CB, bool ExecutedAligned) {
22302230
switch (CB.getIntrinsicID()) {
22312231
case Intrinsic::nvvm_barrier0:
22322232
case Intrinsic::nvvm_barrier0_and:
22332233
case Intrinsic::nvvm_barrier0_or:
22342234
case Intrinsic::nvvm_barrier0_popc:
22352235
return true;
2236-
// TODO: Check for amdgcn_s_barrier executed in a uniform/aligned way.
2236+
case Intrinsic::amdgcn_s_barrier:
2237+
return ExecutedAligned;
22372238
default:
22382239
break;
22392240
}

llvm/lib/Transforms/IPO/OpenMPOpt.cpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2827,17 +2827,23 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
28272827

28282828
Function *F = getAnchorScope();
28292829
BasicBlock &EntryBB = F->getEntryBlock();
2830+
bool IsKernel = OMPInfoCache.Kernels.count(F);
28302831

28312832
SmallVector<Instruction *> SyncInstWorklist;
28322833
for (auto &RIt : *RPOT) {
28332834
BasicBlock &BB = *RIt;
28342835

2836+
bool IsEntryBB = &BB == &EntryBB;
2837+
// TODO: We use local reasoning since we don't have a divergence analysis
2838+
// running as well. We could basically allow uniform branches here.
2839+
bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
28352840
ExecutionDomainTy ED;
28362841
// Propagate "incoming edges" into information about this block.
2837-
if (&BB == &EntryBB) {
2842+
if (IsEntryBB) {
28382843
handleEntryBB(A, ED);
28392844
} else {
2840-
// For live non-entry blocks we only propagate information via live edges.
2845+
// For live non-entry blocks we only propagate
2846+
// information via live edges.
28412847
if (LivenessAA.isAssumedDead(&BB))
28422848
continue;
28432849

@@ -2874,14 +2880,18 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
28742880
auto *CB = dyn_cast<CallBase>(&I);
28752881
bool IsNoSync = AA::isNoSyncInst(A, I, *this);
28762882
bool IsAlignedBarrier =
2877-
!IsNoSync && CB && AANoSync::isAlignedBarrier(*CB);
2883+
!IsNoSync && CB &&
2884+
AANoSync::isAlignedBarrier(*CB, AlignedBarrierLastInBlock);
2885+
2886+
AlignedBarrierLastInBlock &= IsNoSync;
28782887

28792888
// Next we check for calls. Aligned barriers are handled
28802889
// explicitly, everything else is kept for the backward traversal and will
28812890
// also affect our state.
28822891
if (CB) {
28832892
if (IsAlignedBarrier) {
28842893
HandleAlignedBarrier(CB, ED);
2894+
AlignedBarrierLastInBlock = true;
28852895
continue;
28862896
}
28872897

@@ -2913,6 +2923,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
29132923
const auto &CalleeED = EDAA.getFunctionExecutionDomain();
29142924
ED.IsReachedFromAlignedBarrierOnly =
29152925
CalleeED.IsReachedFromAlignedBarrierOnly;
2926+
AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
29162927
if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
29172928
ED.EncounteredNonLocalSideEffect |=
29182929
CalleeED.EncounteredNonLocalSideEffect;
@@ -2928,6 +2939,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
29282939
}
29292940
ED.IsReachedFromAlignedBarrierOnly =
29302941
IsNoSync && ED.IsReachedFromAlignedBarrierOnly;
2942+
AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;
29312943
ED.EncounteredNonLocalSideEffect |= true;
29322944
if (!IsNoSync)
29332945
SyncInstWorklist.push_back(&I);
@@ -2971,7 +2983,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
29712983
auto &FnED = BEDMap[nullptr];
29722984
mergeInPredecessor(A, FnED, ED);
29732985

2974-
if (OMPInfoCache.Kernels.count(F))
2986+
if (IsKernel)
29752987
HandleAlignedBarrier(nullptr, ED);
29762988
}
29772989

llvm/test/Transforms/OpenMP/barrier_removal.ll

Lines changed: 138 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,131 @@ define void @pos_empty_6() {
7070
call i32 @llvm.nvvm.barrier0.popc(i32 0)
7171
ret void
7272
}
73-
define void @neg_empty_7() {
74-
; CHECK-LABEL: define {{[^@]+}}@neg_empty_7() {
73+
define void @pos_empty_7a() {
74+
; CHECK-LABEL: define {{[^@]+}}@pos_empty_7a() {
75+
; CHECK-NEXT: call void @unknown()
76+
; CHECK-NEXT: ret void
77+
;
78+
call void @llvm.amdgcn.s.barrier()
79+
call void @unknown()
80+
ret void
81+
}
82+
; FIXME: We should remove the barrier.
83+
define void @pos_empty_7b() {
84+
; CHECK-LABEL: define {{[^@]+}}@pos_empty_7b() {
85+
; CHECK-NEXT: call void @unknown() #[[ATTR4:[0-9]+]]
86+
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
87+
; CHECK-NEXT: call void @unknown()
88+
; CHECK-NEXT: ret void
89+
;
90+
call void @unknown() nosync readnone
91+
call void @llvm.amdgcn.s.barrier()
92+
call void @unknown()
93+
ret void
94+
}
95+
define void @neg_empty_8() {
96+
; CHECK-LABEL: define {{[^@]+}}@neg_empty_8() {
97+
; CHECK-NEXT: call void @unknown()
7598
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
7699
; CHECK-NEXT: ret void
77100
;
101+
call void @unknown()
102+
call void @llvm.amdgcn.s.barrier()
103+
ret void
104+
}
105+
define void @neg_empty_9(i1 %c) {
106+
; CHECK-LABEL: define {{[^@]+}}@neg_empty_9
107+
; CHECK-SAME: (i1 [[C:%.*]]) {
108+
; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
109+
; CHECK: t:
110+
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
111+
; CHECK-NEXT: br label [[M:%.*]]
112+
; CHECK: f:
113+
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
114+
; CHECK-NEXT: br label [[M]]
115+
; CHECK: m:
116+
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
117+
; CHECK-NEXT: ret void
118+
;
119+
br i1 %c, label %t, label %f
120+
t:
121+
call void @llvm.amdgcn.s.barrier()
122+
br label %m
123+
f:
124+
call void @llvm.amdgcn.s.barrier()
125+
br label %m
126+
m:
127+
call void @llvm.amdgcn.s.barrier()
128+
ret void
129+
}
130+
; FIXME: We should remove the barrier
131+
define void @pos_empty_10() {
132+
; CHECK-LABEL: define {{[^@]+}}@pos_empty_10() {
133+
; CHECK-NEXT: br label [[M:%.*]]
134+
; CHECK: m:
135+
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
136+
; CHECK-NEXT: ret void
137+
;
138+
br label %m
139+
m:
140+
call void @llvm.amdgcn.s.barrier()
141+
ret void
142+
}
143+
define void @pos_empty_11() {
144+
; CHECK-LABEL: define {{[^@]+}}@pos_empty_11() {
145+
; CHECK-NEXT: br label [[M:%.*]]
146+
; CHECK: m:
147+
; CHECK-NEXT: ret void
148+
;
149+
br label %m
150+
m:
151+
call void @aligned_barrier()
152+
call void @llvm.amdgcn.s.barrier()
153+
ret void
154+
}
155+
define void @empty() {
156+
; CHECK-LABEL: define {{[^@]+}}@empty() {
157+
; CHECK-NEXT: ret void
158+
;
159+
ret void
160+
}
161+
; FIXME: We should remove the barrier in the end but not the first one.
162+
define void @neg_empty_12(i1 %c) {
163+
; MODULE-LABEL: define {{[^@]+}}@neg_empty_12
164+
; MODULE-SAME: (i1 [[C:%.*]]) {
165+
; MODULE-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
166+
; MODULE: t:
167+
; MODULE-NEXT: call void @llvm.amdgcn.s.barrier()
168+
; MODULE-NEXT: br label [[M:%.*]]
169+
; MODULE: f:
170+
; MODULE-NEXT: br label [[M]]
171+
; MODULE: m:
172+
; MODULE-NEXT: call void @llvm.amdgcn.s.barrier()
173+
; MODULE-NEXT: ret void
174+
;
175+
; CGSCC-LABEL: define {{[^@]+}}@neg_empty_12
176+
; CGSCC-SAME: (i1 [[C:%.*]]) {
177+
; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
178+
; CGSCC: t:
179+
; CGSCC-NEXT: call void @empty()
180+
; CGSCC-NEXT: call void @llvm.amdgcn.s.barrier()
181+
; CGSCC-NEXT: br label [[M:%.*]]
182+
; CGSCC: f:
183+
; CGSCC-NEXT: call void @empty()
184+
; CGSCC-NEXT: br label [[M]]
185+
; CGSCC: m:
186+
; CGSCC-NEXT: call void @llvm.amdgcn.s.barrier()
187+
; CGSCC-NEXT: ret void
188+
;
189+
br i1 %c, label %t, label %f
190+
t:
191+
call void @empty()
192+
call void @llvm.amdgcn.s.barrier()
193+
br label %m
194+
f:
195+
call void @empty()
196+
br label %m
197+
m:
78198
call void @llvm.amdgcn.s.barrier()
79199
ret void
80200
}
@@ -214,7 +334,6 @@ define void @neg_mem() {
214334

215335
define void @pos_multiple() {
216336
; CHECK-LABEL: define {{[^@]+}}@pos_multiple() {
217-
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
218337
; CHECK-NEXT: ret void
219338
;
220339
call void @llvm.nvvm.barrier0()
@@ -846,15 +965,21 @@ m3:
846965
}
847966

848967
!llvm.module.flags = !{!16,!15}
849-
!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11,!12,!13,!14}
968+
!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11,!12,!13,!14,!17,!18,!19,!20,!21,!22}
850969

851970
!0 = !{void ()* @pos_empty_1, !"kernel", i32 1}
852971
!1 = !{void ()* @pos_empty_2, !"kernel", i32 1}
853972
!2 = !{void ()* @pos_empty_3, !"kernel", i32 1}
854973
!3 = !{void ()* @pos_empty_4, !"kernel", i32 1}
855974
!4 = !{void ()* @pos_empty_5, !"kernel", i32 1}
856975
!5 = !{void ()* @pos_empty_6, !"kernel", i32 1}
857-
!6 = !{void ()* @neg_empty_7, !"kernel", i32 1}
976+
!17 = !{void ()* @pos_empty_7a, !"kernel", i32 1}
977+
!18 = !{void ()* @pos_empty_7b, !"kernel", i32 1}
978+
!6 = !{void ()* @neg_empty_8, !"kernel", i32 1}
979+
!19 = !{void (i1)* @neg_empty_9, !"kernel", i32 1}
980+
!20 = !{void ()* @pos_empty_10, !"kernel", i32 1}
981+
!21 = !{void ()* @pos_empty_11, !"kernel", i32 1}
982+
!22 = !{void (i1)* @neg_empty_12, !"kernel", i32 1}
858983
!7 = !{void ()* @pos_constant_loads, !"kernel", i32 1}
859984
!8 = !{void ()* @neg_loads, !"kernel", i32 1}
860985
!9 = !{void ()* @pos_priv_mem, !"kernel", i32 1}
@@ -870,6 +995,7 @@ m3:
870995
; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nounwind }
871996
; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
872997
; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
998+
; CHECK: attributes #[[ATTR4]] = { nosync memory(none) }
873999
;.
8741000
; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
8751001
; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 50}
@@ -879,7 +1005,7 @@ m3:
8791005
; CHECK: [[META5:![0-9]+]] = !{ptr @pos_empty_4, !"kernel", i32 1}
8801006
; CHECK: [[META6:![0-9]+]] = !{ptr @pos_empty_5, !"kernel", i32 1}
8811007
; CHECK: [[META7:![0-9]+]] = !{ptr @pos_empty_6, !"kernel", i32 1}
882-
; CHECK: [[META8:![0-9]+]] = !{ptr @neg_empty_7, !"kernel", i32 1}
1008+
; CHECK: [[META8:![0-9]+]] = !{ptr @neg_empty_8, !"kernel", i32 1}
8831009
; CHECK: [[META9:![0-9]+]] = !{ptr @pos_constant_loads, !"kernel", i32 1}
8841010
; CHECK: [[META10:![0-9]+]] = !{ptr @neg_loads, !"kernel", i32 1}
8851011
; CHECK: [[META11:![0-9]+]] = !{ptr @pos_priv_mem, !"kernel", i32 1}
@@ -888,4 +1014,10 @@ m3:
8881014
; CHECK: [[META14:![0-9]+]] = !{ptr @multiple_blocks_kernel_1, !"kernel", i32 1}
8891015
; CHECK: [[META15:![0-9]+]] = !{ptr @multiple_blocks_kernel_2, !"kernel", i32 1}
8901016
; CHECK: [[META16:![0-9]+]] = !{ptr @multiple_blocks_functions_kernel_effects_0, !"kernel", i32 1}
1017+
; CHECK: [[META17:![0-9]+]] = !{ptr @pos_empty_7a, !"kernel", i32 1}
1018+
; CHECK: [[META18:![0-9]+]] = !{ptr @pos_empty_7b, !"kernel", i32 1}
1019+
; CHECK: [[META19:![0-9]+]] = !{ptr @neg_empty_9, !"kernel", i32 1}
1020+
; CHECK: [[META20:![0-9]+]] = !{ptr @pos_empty_10, !"kernel", i32 1}
1021+
; CHECK: [[META21:![0-9]+]] = !{ptr @pos_empty_11, !"kernel", i32 1}
1022+
; CHECK: [[META22:![0-9]+]] = !{ptr @neg_empty_12, !"kernel", i32 1}
8911023
;.

0 commit comments

Comments
 (0)