Skip to content

Commit 4a1864a

Browse files
committed
address comments
1 parent 0e92b67 commit 4a1864a

File tree

4 files changed

+57
-31
lines changed

4 files changed

+57
-31
lines changed

llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp

Lines changed: 43 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
//===----------------------------------------------------------------------===//
88
//
99
// A simple pass that looks at local memory arrays that are statically
10-
// sized and sets an appropriate alignment for them. This enables vectorization
10+
// sized and potentially increases their alignment. This enables vectorization
1111
// of loads/stores to these arrays if not explicitly specified by the client.
1212
//
1313
// TODO: Ideally we should do a bin-packing of local arrays to maximize
@@ -16,12 +16,15 @@
1616
//===----------------------------------------------------------------------===//
1717

1818
#include "NVPTX.h"
19+
#include "llvm/Analysis/TargetTransformInfo.h"
1920
#include "llvm/IR/DataLayout.h"
2021
#include "llvm/IR/Instructions.h"
2122
#include "llvm/IR/Module.h"
23+
#include "llvm/IR/PassManager.h"
2224
#include "llvm/Pass.h"
2325
#include "llvm/Support/CommandLine.h"
2426
#include "llvm/Support/MathExtras.h"
27+
#include "llvm/Support/NVPTXAddrSpace.h"
2528

2629
using namespace llvm;
2730

@@ -30,16 +33,35 @@ static cl::opt<bool>
3033
cl::init(false), cl::Hidden,
3134
cl::desc("Use maximum alignment for local memory"));
3235

33-
static constexpr Align MaxPTXArrayAlignment = Align::Constant<16>();
36+
static Align getMaxLocalArrayAlignment(const TargetTransformInfo &TTI) {
37+
const unsigned MaxBitWidth =
38+
TTI.getLoadStoreVecRegBitWidth(NVPTXAS::ADDRESS_SPACE_LOCAL);
39+
return Align(MaxBitWidth / 8);
40+
}
41+
42+
namespace {
43+
struct NVPTXIncreaseLocalAlignment {
44+
const Align MaxAlign;
45+
46+
NVPTXIncreaseLocalAlignment(const TargetTransformInfo &TTI)
47+
: MaxAlign(getMaxLocalArrayAlignment(TTI)) {}
48+
49+
bool run(Function &F);
50+
bool updateAllocaAlignment(AllocaInst *Alloca, const DataLayout &DL);
51+
Align getAggressiveArrayAlignment(unsigned ArraySize);
52+
Align getConservativeArrayAlignment(unsigned ArraySize);
53+
};
54+
} // namespace
3455

3556
/// Get the maximum useful alignment for an array. This is more likely to
3657
/// produce holes in the local memory.
3758
///
3859
/// Choose an alignment large enough that the entire array could be loaded with
3960
/// a single vector load (if possible). Cap the alignment at
4061
/// MaxAlign (derived from TTI.getLoadStoreVecRegBitWidth for local memory).
41-
static Align getAggressiveArrayAlignment(const unsigned ArraySize) {
42-
return std::min(MaxPTXArrayAlignment, Align(PowerOf2Ceil(ArraySize)));
62+
Align NVPTXIncreaseLocalAlignment::getAggressiveArrayAlignment(
63+
const unsigned ArraySize) {
64+
return std::min(MaxAlign, Align(PowerOf2Ceil(ArraySize)));
4365
}
4466

4567
/// Get the alignment of arrays that reduces the chances of leaving holes when
@@ -49,20 +71,18 @@ static Align getAggressiveArrayAlignment(const unsigned ArraySize) {
4971
/// Choose the largest alignment such that the array size is a multiple of the
5072
/// alignment. If all elements of the buffer are allocated in order of
5173
/// alignment (higher to lower) no holes will be left.
52-
static Align getConservativeArrayAlignment(const unsigned ArraySize) {
53-
return commonAlignment(MaxPTXArrayAlignment, ArraySize);
74+
Align NVPTXIncreaseLocalAlignment::getConservativeArrayAlignment(
75+
const unsigned ArraySize) {
76+
return commonAlignment(MaxAlign, ArraySize);
5477
}
5578

5679
/// Find a better alignment for statically sized local allocas
57-
static bool updateAllocaAlignment(const DataLayout &DL, AllocaInst *Alloca) {
80+
bool NVPTXIncreaseLocalAlignment::updateAllocaAlignment(AllocaInst *Alloca,
81+
const DataLayout &DL) {
5882
// Looking for statically sized local allocas
5983
if (!Alloca->isStaticAlloca())
6084
return false;
6185

62-
// For now, we only support array allocas
63-
if (!(Alloca->isArrayAllocation() || Alloca->getAllocatedType()->isArrayTy()))
64-
return false;
65-
6686
const auto ArraySize = Alloca->getAllocationSize(DL);
6787
if (!(ArraySize && ArraySize->isFixed()))
6888
return false;
@@ -80,14 +100,14 @@ static bool updateAllocaAlignment(const DataLayout &DL, AllocaInst *Alloca) {
80100
return false;
81101
}
82102

83-
static bool runSetLocalArrayAlignment(Function &F) {
103+
bool NVPTXIncreaseLocalAlignment::run(Function &F) {
84104
bool Changed = false;
85-
const DataLayout &DL = F.getParent()->getDataLayout();
105+
const auto &DL = F.getParent()->getDataLayout();
86106

87107
BasicBlock &EntryBB = F.getEntryBlock();
88108
for (Instruction &I : EntryBB)
89109
if (AllocaInst *Alloca = dyn_cast<AllocaInst>(&I))
90-
Changed |= updateAllocaAlignment(DL, Alloca);
110+
Changed |= updateAllocaAlignment(Alloca, DL);
91111

92112
return Changed;
93113
}
@@ -98,6 +118,9 @@ struct NVPTXIncreaseLocalAlignmentLegacyPass : public FunctionPass {
98118
NVPTXIncreaseLocalAlignmentLegacyPass() : FunctionPass(ID) {}
99119

100120
bool runOnFunction(Function &F) override;
121+
void getAnalysisUsage(AnalysisUsage &AU) const override {
122+
AU.addRequired<TargetTransformInfoWrapperPass>();
123+
}
101124
StringRef getPassName() const override {
102125
return "NVPTX Increase Local Alignment";
103126
}
@@ -115,12 +138,15 @@ FunctionPass *llvm::createNVPTXIncreaseLocalAlignmentPass() {
115138
}
116139

117140
bool NVPTXIncreaseLocalAlignmentLegacyPass::runOnFunction(Function &F) {
118-
return runSetLocalArrayAlignment(F);
141+
const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
142+
return NVPTXIncreaseLocalAlignment(TTI).run(F);
119143
}
120144

121145
PreservedAnalyses
122-
NVPTXIncreaseLocalAlignmentPass::run(Function &F, FunctionAnalysisManager &AM) {
123-
bool Changed = runSetLocalArrayAlignment(F);
146+
NVPTXIncreaseLocalAlignmentPass::run(Function &F,
147+
FunctionAnalysisManager &FAM) {
148+
const auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
149+
bool Changed = NVPTXIncreaseLocalAlignment(TTI).run(F);
124150

125151
if (!Changed)
126152
return PreservedAnalyses::all();

llvm/test/CodeGen/NVPTX/increase-local-align.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ define void @test6() {
6767

6868
define void @test7() {
6969
; COMMON-LABEL: define void @test7() {
70-
; COMMON-NEXT: [[A:%.*]] = alloca i32, align 2
70+
; COMMON-NEXT: [[A:%.*]] = alloca i32, align 4
7171
; COMMON-NEXT: ret void
7272
;
7373
%a = alloca i32, align 2

llvm/test/CodeGen/NVPTX/lower-byval-args.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out
135135
;
136136
; PTX-LABEL: escape_ptr(
137137
; PTX: {
138-
; PTX-NEXT: .local .align 4 .b8 __local_depot2[8];
138+
; PTX-NEXT: .local .align 8 .b8 __local_depot2[8];
139139
; PTX-NEXT: .reg .b64 %SP;
140140
; PTX-NEXT: .reg .b64 %SPL;
141141
; PTX-NEXT: .reg .b32 %r<3>;
@@ -179,7 +179,7 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone
179179
;
180180
; PTX-LABEL: escape_ptr_gep(
181181
; PTX: {
182-
; PTX-NEXT: .local .align 4 .b8 __local_depot3[8];
182+
; PTX-NEXT: .local .align 8 .b8 __local_depot3[8];
183183
; PTX-NEXT: .reg .b64 %SP;
184184
; PTX-NEXT: .reg .b64 %SPL;
185185
; PTX-NEXT: .reg .b32 %r<3>;
@@ -194,7 +194,7 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone
194194
; PTX-NEXT: st.local.b32 [%rd2+4], %r1;
195195
; PTX-NEXT: ld.param.b32 %r2, [escape_ptr_gep_param_1];
196196
; PTX-NEXT: st.local.b32 [%rd2], %r2;
197-
; PTX-NEXT: add.s64 %rd3, %rd1, 4;
197+
; PTX-NEXT: or.b64 %rd3, %rd1, 4;
198198
; PTX-NEXT: { // callseq 1, 0
199199
; PTX-NEXT: .param .b64 param0;
200200
; PTX-NEXT: st.param.b64 [param0], %rd3;
@@ -224,7 +224,7 @@ define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeon
224224
;
225225
; PTX-LABEL: escape_ptr_store(
226226
; PTX: {
227-
; PTX-NEXT: .local .align 4 .b8 __local_depot4[8];
227+
; PTX-NEXT: .local .align 8 .b8 __local_depot4[8];
228228
; PTX-NEXT: .reg .b64 %SP;
229229
; PTX-NEXT: .reg .b64 %SPL;
230230
; PTX-NEXT: .reg .b32 %r<3>;
@@ -262,7 +262,7 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri
262262
;
263263
; PTX-LABEL: escape_ptr_gep_store(
264264
; PTX: {
265-
; PTX-NEXT: .local .align 4 .b8 __local_depot5[8];
265+
; PTX-NEXT: .local .align 8 .b8 __local_depot5[8];
266266
; PTX-NEXT: .reg .b64 %SP;
267267
; PTX-NEXT: .reg .b64 %SPL;
268268
; PTX-NEXT: .reg .b32 %r<3>;
@@ -279,7 +279,7 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri
279279
; PTX-NEXT: st.local.b32 [%rd4+4], %r1;
280280
; PTX-NEXT: ld.param.b32 %r2, [escape_ptr_gep_store_param_1];
281281
; PTX-NEXT: st.local.b32 [%rd4], %r2;
282-
; PTX-NEXT: add.s64 %rd5, %rd3, 4;
282+
; PTX-NEXT: or.b64 %rd5, %rd3, 4;
283283
; PTX-NEXT: st.global.b64 [%rd2], %rd5;
284284
; PTX-NEXT: ret;
285285
entry:
@@ -302,7 +302,7 @@ define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonl
302302
;
303303
; PTX-LABEL: escape_ptrtoint(
304304
; PTX: {
305-
; PTX-NEXT: .local .align 4 .b8 __local_depot6[8];
305+
; PTX-NEXT: .local .align 8 .b8 __local_depot6[8];
306306
; PTX-NEXT: .reg .b64 %SP;
307307
; PTX-NEXT: .reg .b64 %SPL;
308308
; PTX-NEXT: .reg .b32 %r<3>;

llvm/test/CodeGen/NVPTX/variadics-backend.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias
208208
define dso_local i32 @bar() {
209209
; CHECK-PTX-LABEL: bar(
210210
; CHECK-PTX: {
211-
; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot3[24];
211+
; CHECK-PTX-NEXT: .local .align 16 .b8 __local_depot3[32];
212212
; CHECK-PTX-NEXT: .reg .b64 %SP;
213213
; CHECK-PTX-NEXT: .reg .b64 %SPL;
214214
; CHECK-PTX-NEXT: .reg .b16 %rs<5>;
@@ -226,12 +226,12 @@ define dso_local i32 @bar() {
226226
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5];
227227
; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3;
228228
; CHECK-PTX-NEXT: mov.b32 %r1, 1;
229-
; CHECK-PTX-NEXT: st.b32 [%SP+8], %r1;
229+
; CHECK-PTX-NEXT: st.b32 [%SP+16], %r1;
230230
; CHECK-PTX-NEXT: mov.b16 %rs4, 1;
231-
; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs4;
231+
; CHECK-PTX-NEXT: st.b8 [%SP+20], %rs4;
232232
; CHECK-PTX-NEXT: mov.b64 %rd3, 1;
233-
; CHECK-PTX-NEXT: st.b64 [%SP+16], %rd3;
234-
; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 8;
233+
; CHECK-PTX-NEXT: st.b64 [%SP+24], %rd3;
234+
; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 16;
235235
; CHECK-PTX-NEXT: { // callseq 1, 0
236236
; CHECK-PTX-NEXT: .param .b32 param0;
237237
; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
@@ -371,7 +371,7 @@ entry:
371371
define dso_local void @qux() {
372372
; CHECK-PTX-LABEL: qux(
373373
; CHECK-PTX: {
374-
; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot7[24];
374+
; CHECK-PTX-NEXT: .local .align 16 .b8 __local_depot7[32];
375375
; CHECK-PTX-NEXT: .reg .b64 %SP;
376376
; CHECK-PTX-NEXT: .reg .b64 %SPL;
377377
; CHECK-PTX-NEXT: .reg .b32 %r<3>;

0 commit comments

Comments
 (0)