Skip to content

Commit c428a3d

Browse files
[LoopCacheAnalysis] Enable delinearization of fixed sized arrays
Currently loop cache cost (LCC) cannot analyze fixed-size arrays since it cannot delinearize them. This patch adds the capability to delinearize fixed-size arrays to LCC. Most of the code is ported from DependenceAnalysis.cpp and some refactoring will be done in a follow-up patch. Reviewed By: #loopoptwg, Meinersbur Differential Revision: https://reviews.llvm.org/D122857
1 parent ac33c33 commit c428a3d

File tree

3 files changed

+226
-5
lines changed

3 files changed

+226
-5
lines changed

llvm/include/llvm/Analysis/LoopCacheAnalysis.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ class IndexedReference {
9898
/// Attempt to delinearize the indexed reference.
9999
bool delinearize(const LoopInfo &LI);
100100

101+
/// Attempt to delinearize the indexed reference as an access into a
/// fixed-size array. \p Src is the load/store instruction being analyzed and
/// \p SrcAccessFn its access function; the subscripts that are found are
/// returned through \p SrcSubscripts. Returns true on success.
bool tryDelinearizeFixedSize(ScalarEvolution *SE, Instruction *Src,
102+
const SCEV *SrcAccessFn,
103+
SmallVectorImpl<const SCEV *> &SrcSubscripts);
104+
101105
/// Return true if the index reference is invariant with respect to loop \p L.
102106
bool isLoopInvariant(const Loop &L) const;
103107

llvm/lib/Analysis/LoopCacheAnalysis.cpp

Lines changed: 62 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,51 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
319319
return CacheCost::InvalidCost;
320320
}
321321

322+
/// Try to delinearize the memory access made by \p Src by reading the
/// subscripts straight off its GEP, which works when the array dimensions are
/// fixed size.
///
/// \param SE            ScalarEvolution used to query the pointer base and to
///                      build the constant dimension sizes.
/// \param Src           The load/store instruction whose pointer operand is
///                      inspected.
/// \param SrcAccessFn   The access function of \p Src (before the base
///                      pointer is subtracted).
/// \param SrcSubscripts Out-parameter filled by getIndexExpressionsFromGEP.
///
/// \returns true on success, in which case the member Sizes has received one
/// constant per entry of the GEP-derived size list. Returns false otherwise;
/// any subscripts collected here are discarded before returning.
bool IndexedReference::tryDelinearizeFixedSize(
323+
ScalarEvolution *SE, Instruction *Src, const SCEV *SrcAccessFn,
324+
SmallVectorImpl<const SCEV *> &SrcSubscripts) {
325+
Value *SrcPtr = getLoadStorePointerOperand(Src);
326+
const SCEVUnknown *SrcBase =
327+
dyn_cast<SCEVUnknown>(SE->getPointerBase(SrcAccessFn));
328+
329+
// Check the simple case where the array dimensions are fixed size.
330+
auto *SrcGEP = dyn_cast<GetElementPtrInst>(SrcPtr);
331+
if (!SrcGEP)
332+
return false;
333+
334+
SmallVector<int, 4> SrcSizes;
335+
getIndexExpressionsFromGEP(*SE, SrcGEP, SrcSubscripts, SrcSizes);
336+
337+
// Bail out unless the GEP yielded at least one fixed dimension size and
338+
// more than one subscript.
339+
if (SrcSizes.empty() || SrcSubscripts.size() <= 1) {
340+
SrcSubscripts.clear();
341+
return false;
342+
}
343+
344+
Value *SrcBasePtr = SrcGEP->getOperand(0)->stripPointerCasts();
345+
346+
// Check that for identical base pointers we do not miss index offsets
347+
// that have been added before this GEP is applied. A null SrcBase (the
// pointer base is not a SCEVUnknown) is treated as a mismatch as well,
// rather than being dereferenced.
if (!SrcBase || SrcBasePtr != SrcBase->getValue()) {
349+
SrcSubscripts.clear();
350+
return false;
351+
}
352+
353+
assert(SrcSubscripts.size() == SrcSizes.size() + 1 &&
354+
"Expected equal number of entries in the list of size and "
355+
"subscript.");
356+
357+
// Subscript 0 has no matching dimension size, so start at 1. Read from the
// SrcSubscripts parameter (the list the assert above validated) instead of
// assuming the caller passed the Subscripts member.
for (auto Idx : seq<unsigned>(1, SrcSubscripts.size()))
358+
Sizes.push_back(SE->getConstant(SrcSubscripts[Idx]->getType(), SrcSizes[Idx - 1]));
359+
360+
LLVM_DEBUG({
361+
dbgs() << "Delinearized subscripts of fixed-size array\n"
362+
<< "SrcGEP:" << *SrcGEP << "\n";
363+
});
364+
return true;
365+
}
366+
322367
bool IndexedReference::delinearize(const LoopInfo &LI) {
323368
assert(Subscripts.empty() && "Subscripts should be empty");
324369
assert(Sizes.empty() && "Sizes should be empty");
@@ -340,13 +385,25 @@ bool IndexedReference::delinearize(const LoopInfo &LI) {
340385
return false;
341386
}
342387

343-
AccessFn = SE.getMinusSCEV(AccessFn, BasePointer);
388+
bool IsFixedSize = false;
389+
// Try to delinearize fixed-size arrays.
390+
if (tryDelinearizeFixedSize(&SE, &StoreOrLoadInst, AccessFn, Subscripts)) {
391+
IsFixedSize = true;
392+
/// The last element of \p Sizes is the element size.
393+
Sizes.push_back(ElemSize);
394+
LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName()
395+
<< "', AccessFn: " << *AccessFn << "\n");
396+
}
344397

345-
LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName()
346-
<< "', AccessFn: " << *AccessFn << "\n");
398+
AccessFn = SE.getMinusSCEV(AccessFn, BasePointer);
347399

348-
llvm::delinearize(SE, AccessFn, Subscripts, Sizes,
349-
SE.getElementSize(&StoreOrLoadInst));
400+
// Try to delinearize parametric-size arrays.
401+
if (!IsFixedSize) {
402+
LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName()
403+
<< "', AccessFn: " << *AccessFn << "\n");
404+
llvm::delinearize(SE, AccessFn, Subscripts, Sizes,
405+
SE.getElementSize(&StoreOrLoadInst));
406+
}
350407

351408
if (Subscripts.empty() || Sizes.empty() ||
352409
Subscripts.size() != Sizes.size()) {
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
; RUN: opt < %s -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck %s
2+
3+
target datalayout = "e-m:e-i64:64-n32:64"
4+
target triple = "powerpc64le-unknown-linux-gnu"
5+
6+
; Check delinearization in loop cache analysis can handle fixed-size arrays.
7+
; The IR is copied from llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheckFixedSize.ll
8+
9+
; CHECK: Loop 'for.body' has cost = 4186116
10+
; CHECK: Loop 'for.body4' has cost = 128898
11+
12+
;; #define N 1024
13+
;; #define M 2048
14+
;; void t1(int a[N][M]) {
15+
;; for (int i = 0; i < N-1; ++i)
16+
;; for (int j = 2; j < M; ++j)
17+
;; a[i][j] = a[i+1][j-2];
18+
;; }
19+
20+
; Two-deep loop nest over rows of [2048 x i32]: each inner iteration loads
; a[i+1][j-2] and stores it to a[i][j]. The outer induction variable starts
; at 0 and the loop exits when its increment reaches 1023; the inner one
; starts at 2 and exits when its increment reaches 2048.
define void @t1([2048 x i32]* %a) {
21+
entry:
22+
br label %for.body
23+
24+
for.body: ; preds = %entry, %for.inc11
25+
%indvars.iv4 = phi i64 [ 0, %entry ], [ %indvars.iv.next5, %for.inc11 ]
26+
br label %for.body4
27+
28+
for.body4: ; preds = %for.body, %for.body4
29+
%indvars.iv = phi i64 [ 2, %for.body ], [ %indvars.iv.next, %for.body4 ]
30+
%0 = add nuw nsw i64 %indvars.iv4, 1
31+
%1 = add nsw i64 %indvars.iv, -2
32+
%arrayidx6 = getelementptr inbounds [2048 x i32], [2048 x i32]* %a, i64 %0, i64 %1
33+
%2 = load i32, i32* %arrayidx6, align 4
34+
; The store address is built on %a_gep (a GEP of %a with index 0), not on %a
; directly. NOTE(review): presumably this exercises the base-pointer check in
; tryDelinearizeFixedSize - confirm.
%a_gep = getelementptr inbounds [2048 x i32], [2048 x i32]* %a, i64 0
35+
%arrayidx10 = getelementptr inbounds [2048 x i32], [2048 x i32]* %a_gep, i64 %indvars.iv4, i64 %indvars.iv
36+
store i32 %2, i32* %arrayidx10, align 4
37+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
38+
%exitcond = icmp ne i64 %indvars.iv.next, 2048
39+
br i1 %exitcond, label %for.body4, label %for.inc11
40+
41+
for.inc11: ; preds = %for.body4
42+
%indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
43+
%exitcond7 = icmp ne i64 %indvars.iv.next5, 1023
44+
br i1 %exitcond7, label %for.body, label %for.end13
45+
46+
for.end13: ; preds = %for.inc11
47+
ret void
48+
}
49+
50+
51+
; CHECK: Loop 'for.body' has cost = 4186116
52+
; CHECK: Loop 'for.body4' has cost = 128898
53+
54+
; Two-deep loop nest with the same bounds and load as @t1. The difference is
; the store address: it is built on %call, the value returned by
; @func_with_returned_arg, whose pointer argument carries the `returned`
; attribute.
define void @t2([2048 x i32]* %a) {
55+
entry:
56+
br label %for.body
57+
58+
for.body: ; preds = %entry, %for.inc11
59+
%indvars.iv4 = phi i64 [ 0, %entry ], [ %indvars.iv.next5, %for.inc11 ]
60+
br label %for.body4
61+
62+
for.body4: ; preds = %for.body, %for.body4
63+
%indvars.iv = phi i64 [ 2, %for.body ], [ %indvars.iv.next, %for.body4 ]
64+
%0 = add nuw nsw i64 %indvars.iv4, 1
65+
%1 = add nsw i64 %indvars.iv, -2
66+
%arrayidx6 = getelementptr inbounds [2048 x i32], [2048 x i32]* %a, i64 %0, i64 %1
67+
%2 = load i32, i32* %arrayidx6, align 4
68+
; The call is made inside the inner loop and its result is the base of the
; store's GEP. NOTE(review): presumably checks that a base reached through a
; `returned` argument is still handled - confirm.
%call = call [2048 x i32]* @func_with_returned_arg([2048 x i32]* returned %a)
69+
%arrayidx10 = getelementptr inbounds [2048 x i32], [2048 x i32]* %call, i64 %indvars.iv4, i64 %indvars.iv
70+
store i32 %2, i32* %arrayidx10, align 4
71+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
72+
%exitcond = icmp ne i64 %indvars.iv.next, 2048
73+
br i1 %exitcond, label %for.body4, label %for.inc11
74+
75+
for.inc11: ; preds = %for.body4
76+
%indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
77+
%exitcond7 = icmp ne i64 %indvars.iv.next5, 1023
78+
br i1 %exitcond7, label %for.body, label %for.end13
79+
80+
for.end13: ; preds = %for.inc11
81+
ret void
82+
}
83+
84+
declare [2048 x i32]* @func_with_returned_arg([2048 x i32]* returned %arg)
85+
86+
; CHECK: Loop 'for.body' has cost = 4472886244958208
87+
; CHECK: Loop 'for.body4' has cost = 4472886244958208
88+
; CHECK: Loop 'for.body8' has cost = 4472886244958208
89+
; CHECK: Loop 'for.body12' has cost = 4472886244958208
90+
; CHECK: Loop 'for.body16' has cost = 137728168833024
91+
92+
93+
;; #define N 1024
94+
;; #define M 2048
95+
;; void t3(int a[][N][N][N][M]) {
96+
;; for (int i1 = 0; i1 < N-1; ++i1)
97+
;; for (int i2 = 2; i2 < N; ++i2)
98+
;; for (int i3 = 0; i3 < N; ++i3)
99+
;; for (int i4 = 3; i4 < N; ++i4)
100+
;; for (int i5 = 0; i5 < M-2; ++i5)
101+
;; a[i1][i2][i3][i4][i5] = a[i1+1][i2-2][i3][i4-3][i5+2];
102+
;; }
103+
104+
; Five-deep loop nest over a [1024 x [1024 x [1024 x [2048 x i32]]]] element
; type. The innermost body loads a[i1+1][i2-2][i3][i4-3][i5+2] and stores it
; to a[i1][i2][i3][i4][i5]. Induction variables, outermost first:
;   %indvars.iv18 starts at 0, loop exits when the increment reaches 1023
;   %indvars.iv14 starts at 2, loop exits when the increment reaches 1024
;   %indvars.iv11 starts at 0, loop exits when the increment reaches 1024
;   %indvars.iv7  starts at 3, loop exits when the increment reaches 1024
;   %indvars.iv   starts at 0, loop exits when the increment reaches 2046
define void @t3([1024 x [1024 x [1024 x [2048 x i32]]]]* %a) {
105+
entry:
106+
br label %for.body
107+
108+
for.body: ; preds = %entry, %for.inc46
109+
%indvars.iv18 = phi i64 [ 0, %entry ], [ %indvars.iv.next19, %for.inc46 ]
110+
br label %for.body4
111+
112+
for.body4: ; preds = %for.body, %for.inc43
113+
%indvars.iv14 = phi i64 [ 2, %for.body ], [ %indvars.iv.next15, %for.inc43 ]
114+
br label %for.body8
115+
116+
for.body8: ; preds = %for.body4, %for.inc40
117+
%indvars.iv11 = phi i64 [ 0, %for.body4 ], [ %indvars.iv.next12, %for.inc40 ]
118+
br label %for.body12
119+
120+
for.body12: ; preds = %for.body8, %for.inc37
121+
%indvars.iv7 = phi i64 [ 3, %for.body8 ], [ %indvars.iv.next8, %for.inc37 ]
122+
br label %for.body16
123+
124+
for.body16: ; preds = %for.body12, %for.body16
125+
%indvars.iv = phi i64 [ 0, %for.body12 ], [ %indvars.iv.next, %for.body16 ]
126+
; Offsets applied to the load subscripts: i1+1, i2-2, i4-3 and i5+2 (i3 is
; used unchanged).
%0 = add nuw nsw i64 %indvars.iv18, 1
127+
%1 = add nsw i64 %indvars.iv14, -2
128+
%2 = add nsw i64 %indvars.iv7, -3
129+
%3 = add nuw nsw i64 %indvars.iv, 2
130+
%arrayidx26 = getelementptr inbounds [1024 x [1024 x [1024 x [2048 x i32]]]], [1024 x [1024 x [1024 x [2048 x i32]]]]* %a, i64 %0, i64 %1, i64 %indvars.iv11, i64 %2, i64 %3
131+
%4 = load i32, i32* %arrayidx26, align 4
132+
%arrayidx36 = getelementptr inbounds [1024 x [1024 x [1024 x [2048 x i32]]]], [1024 x [1024 x [1024 x [2048 x i32]]]]* %a, i64 %indvars.iv18, i64 %indvars.iv14, i64 %indvars.iv11, i64 %indvars.iv7, i64 %indvars.iv
133+
store i32 %4, i32* %arrayidx36, align 4
134+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
135+
%exitcond = icmp ne i64 %indvars.iv.next, 2046
136+
br i1 %exitcond, label %for.body16, label %for.inc37
137+
138+
for.inc37: ; preds = %for.body16
139+
%indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
140+
%exitcond10 = icmp ne i64 %indvars.iv.next8, 1024
141+
br i1 %exitcond10, label %for.body12, label %for.inc40
142+
143+
for.inc40: ; preds = %for.inc37
144+
%indvars.iv.next12 = add nuw nsw i64 %indvars.iv11, 1
145+
%exitcond13 = icmp ne i64 %indvars.iv.next12, 1024
146+
br i1 %exitcond13, label %for.body8, label %for.inc43
147+
148+
for.inc43: ; preds = %for.inc40
149+
%indvars.iv.next15 = add nuw nsw i64 %indvars.iv14, 1
150+
%exitcond17 = icmp ne i64 %indvars.iv.next15, 1024
151+
br i1 %exitcond17, label %for.body4, label %for.inc46
152+
153+
for.inc46: ; preds = %for.inc43
154+
%indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
155+
%exitcond21 = icmp ne i64 %indvars.iv.next19, 1023
156+
br i1 %exitcond21, label %for.body, label %for.end48
157+
158+
for.end48: ; preds = %for.inc46
159+
ret void
160+
}

0 commit comments

Comments
 (0)