[LoopCacheAnalysis] Enable delinearization of fixed sized arrays

CongzheUalberta · CongzheUalberta · commit c428a3d2a09e · 2022-04-29T16:01:27.000-04:00
Currently loop cache cost (LCC) cannot analyze fix-sized arrays since it cannot delinearize them. This patch adds the capability to delinearize fix-sized arrays to LCC. Most of the code is ported from DependenceAnalysis.cpp and some refactoring will be done in a next patch. Reviewed By: #loopoptwg, Meinersbur Differential Revision: https://reviews.llvm.org/D122857
diff --git a/llvm/include/llvm/Analysis/LoopCacheAnalysis.h b/llvm/include/llvm/Analysis/LoopCacheAnalysis.h
@@ -98,6 +98,10 @@ class IndexedReference {
   /// Attempt to delinearize the indexed reference.
   bool delinearize(const LoopInfo &LI);
 
+  bool tryDelinearizeFixedSize(ScalarEvolution *SE, Instruction *Src,
+                               const SCEV *SrcAccessFn,
+                               SmallVectorImpl<const SCEV *> &SrcSubscripts);
+
   /// Return true if the index reference is invariant with respect to loop \p L.
   bool isLoopInvariant(const Loop &L) const;
 
diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -319,6 +319,51 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
   return CacheCost::InvalidCost;
 }
 
+bool IndexedReference::tryDelinearizeFixedSize(
+    ScalarEvolution *SE, Instruction *Src, const SCEV *SrcAccessFn,
+    SmallVectorImpl<const SCEV *> &SrcSubscripts) {
+  Value *SrcPtr = getLoadStorePointerOperand(Src);
+  const SCEVUnknown *SrcBase =
+      dyn_cast<SCEVUnknown>(SE->getPointerBase(SrcAccessFn));
+
+  // Check the simple case where the array dimensions are fixed size.
+  auto *SrcGEP = dyn_cast<GetElementPtrInst>(SrcPtr);
+  if (!SrcGEP)
+    return false;
+
+  SmallVector<int, 4> SrcSizes;
+  getIndexExpressionsFromGEP(*SE, SrcGEP, SrcSubscripts, SrcSizes);
+
+  // Check that the two size arrays are non-empty and equal in length and
+  // value.
+  if (SrcSizes.empty() || SrcSubscripts.size() <= 1) {
+    SrcSubscripts.clear();
+    return false;
+  }
+
+  Value *SrcBasePtr = SrcGEP->getOperand(0)->stripPointerCasts();
+
+  // Check that for identical base pointers we do not miss index offsets
+  // that have been added before this GEP is applied.
+  if (SrcBasePtr != SrcBase->getValue()) {
+    SrcSubscripts.clear();
+    return false;
+  }
+
+  assert(SrcSubscripts.size() == SrcSizes.size() + 1 &&
+         "Expected equal number of entries in the list of size and "
+         "subscript.");
+
+  for (auto Idx : seq<unsigned>(1, Subscripts.size()))
+    Sizes.push_back(SE->getConstant(Subscripts[Idx]->getType(), SrcSizes[Idx - 1]));
+
+  LLVM_DEBUG({
+    dbgs() << "Delinearized subscripts of fixed-size array\n"
+           << "SrcGEP:" << *SrcGEP << "\n";
+  });
+  return true;
+}
+
 bool IndexedReference::delinearize(const LoopInfo &LI) {
   assert(Subscripts.empty() && "Subscripts should be empty");
   assert(Sizes.empty() && "Sizes should be empty");
@@ -340,13 +385,25 @@ bool IndexedReference::delinearize(const LoopInfo &LI) {
       return false;
     }
 
-    AccessFn = SE.getMinusSCEV(AccessFn, BasePointer);
+    bool IsFixedSize = false;
+    // Try to delinearize fixed-size arrays.
+    if (tryDelinearizeFixedSize(&SE, &StoreOrLoadInst, AccessFn, Subscripts)) {
+      IsFixedSize = true;
+      /// The last element of \p Sizes is the element size.
+      Sizes.push_back(ElemSize);
+      LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName()
+                                  << "', AccessFn: " << *AccessFn << "\n");
+    }
 
-    LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName()
-                                << "', AccessFn: " << *AccessFn << "\n");
+    AccessFn = SE.getMinusSCEV(AccessFn, BasePointer);
 
-    llvm::delinearize(SE, AccessFn, Subscripts, Sizes,
-                      SE.getElementSize(&StoreOrLoadInst));
+    // Try to delinearize parametric-size arrays.
+    if (!IsFixedSize) {
+      LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName()
+                                  << "', AccessFn: " << *AccessFn << "\n");
+      llvm::delinearize(SE, AccessFn, Subscripts, Sizes,
+                        SE.getElementSize(&StoreOrLoadInst));
+    }
 
     if (Subscripts.empty() || Sizes.empty() ||
         Subscripts.size() != Sizes.size()) {
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
@@ -0,0 +1,160 @@
+; RUN: opt < %s -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Check delinearization in loop cache analysis can handle fixed-size arrays.
+; The IR is copied from llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheckFixedSize.ll
+
+; CHECK: Loop 'for.body' has cost = 4186116
+; CHECK: Loop 'for.body4' has cost = 128898
+
+;; #define N 1024
+;; #define M 2048
+;; void t1(int a[N][M]) {
+;;   for (int i = 0; i < N-1; ++i)
+;;     for (int j = 2; j < M; ++j)
+;;       a[i][j] = a[i+1][j-2];
+;; }
+
+define void @t1([2048 x i32]* %a) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc11
+  %indvars.iv4 = phi i64 [ 0, %entry ], [ %indvars.iv.next5, %for.inc11 ]
+  br label %for.body4
+
+for.body4:                                        ; preds = %for.body, %for.body4
+  %indvars.iv = phi i64 [ 2, %for.body ], [ %indvars.iv.next, %for.body4 ]
+  %0 = add nuw nsw i64 %indvars.iv4, 1
+  %1 = add nsw i64 %indvars.iv, -2
+  %arrayidx6 = getelementptr inbounds [2048 x i32], [2048 x i32]* %a, i64 %0, i64 %1
+  %2 = load i32, i32* %arrayidx6, align 4
+  %a_gep = getelementptr inbounds [2048 x i32], [2048 x i32]* %a, i64 0
+  %arrayidx10 = getelementptr inbounds [2048 x i32], [2048 x i32]* %a_gep, i64 %indvars.iv4, i64 %indvars.iv
+  store i32 %2, i32* %arrayidx10, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 2048
+  br i1 %exitcond, label %for.body4, label %for.inc11
+
+for.inc11:                                        ; preds = %for.body4
+  %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
+  %exitcond7 = icmp ne i64 %indvars.iv.next5, 1023
+  br i1 %exitcond7, label %for.body, label %for.end13
+
+for.end13:                                        ; preds = %for.inc11
+  ret void
+}
+
+
+; CHECK: Loop 'for.body' has cost = 4186116
+; CHECK: Loop 'for.body4' has cost = 128898
+
+define void @t2([2048 x i32]* %a) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc11
+  %indvars.iv4 = phi i64 [ 0, %entry ], [ %indvars.iv.next5, %for.inc11 ]
+  br label %for.body4
+
+for.body4:                                        ; preds = %for.body, %for.body4
+  %indvars.iv = phi i64 [ 2, %for.body ], [ %indvars.iv.next, %for.body4 ]
+  %0 = add nuw nsw i64 %indvars.iv4, 1
+  %1 = add nsw i64 %indvars.iv, -2
+  %arrayidx6 = getelementptr inbounds [2048 x i32], [2048 x i32]* %a, i64 %0, i64 %1
+  %2 = load i32, i32* %arrayidx6, align 4
+  %call = call [2048 x i32]* @func_with_returned_arg([2048 x i32]* returned %a)
+  %arrayidx10 = getelementptr inbounds [2048 x i32], [2048 x i32]* %call, i64 %indvars.iv4, i64 %indvars.iv
+  store i32 %2, i32* %arrayidx10, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 2048
+  br i1 %exitcond, label %for.body4, label %for.inc11
+
+for.inc11:                                        ; preds = %for.body4
+  %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
+  %exitcond7 = icmp ne i64 %indvars.iv.next5, 1023
+  br i1 %exitcond7, label %for.body, label %for.end13
+
+for.end13:                                        ; preds = %for.inc11
+  ret void
+}
+
+declare [2048 x i32]* @func_with_returned_arg([2048 x i32]* returned %arg)
+
+; CHECK: Loop 'for.body' has cost = 4472886244958208
+; CHECK: Loop 'for.body4' has cost = 4472886244958208
+; CHECK: Loop 'for.body8' has cost = 4472886244958208
+; CHECK: Loop 'for.body12' has cost = 4472886244958208
+; CHECK: Loop 'for.body16' has cost = 137728168833024
+
+
+;; #define N 1024
+;; #define M 2048
+;; void t3(int a[][N][N][N][M]) {
+;;   for (int i1 = 0; i1 < N-1; ++i1)
+;;     for (int i2 = 2; i2 < N; ++i2)
+;;       for (int i3 = 0; i3 < N; ++i3)
+;;         for (int i4 = 3; i4 < N; ++i4)
+;;           for (int i5 = 0; i5 < M-2; ++i5)
+;;             a[i1][i2][i3][i4][i5] = a[i1+1][i2-2][i3][i4-3][i5+2];
+;; }
+
+define void @t3([1024 x [1024 x [1024 x [2048 x i32]]]]* %a) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc46
+  %indvars.iv18 = phi i64 [ 0, %entry ], [ %indvars.iv.next19, %for.inc46 ]
+  br label %for.body4
+
+for.body4:                                        ; preds = %for.body, %for.inc43
+  %indvars.iv14 = phi i64 [ 2, %for.body ], [ %indvars.iv.next15, %for.inc43 ]
+  br label %for.body8
+
+for.body8:                                        ; preds = %for.body4, %for.inc40
+  %indvars.iv11 = phi i64 [ 0, %for.body4 ], [ %indvars.iv.next12, %for.inc40 ]
+  br label %for.body12
+
+for.body12:                                       ; preds = %for.body8, %for.inc37
+  %indvars.iv7 = phi i64 [ 3, %for.body8 ], [ %indvars.iv.next8, %for.inc37 ]
+  br label %for.body16
+
+for.body16:                                       ; preds = %for.body12, %for.body16
+  %indvars.iv = phi i64 [ 0, %for.body12 ], [ %indvars.iv.next, %for.body16 ]
+  %0 = add nuw nsw i64 %indvars.iv18, 1
+  %1 = add nsw i64 %indvars.iv14, -2
+  %2 = add nsw i64 %indvars.iv7, -3
+  %3 = add nuw nsw i64 %indvars.iv, 2
+  %arrayidx26 = getelementptr inbounds [1024 x [1024 x [1024 x [2048 x i32]]]], [1024 x [1024 x [1024 x [2048 x i32]]]]* %a, i64 %0, i64 %1, i64 %indvars.iv11, i64 %2, i64 %3
+  %4 = load i32, i32* %arrayidx26, align 4
+  %arrayidx36 = getelementptr inbounds [1024 x [1024 x [1024 x [2048 x i32]]]], [1024 x [1024 x [1024 x [2048 x i32]]]]* %a, i64 %indvars.iv18, i64 %indvars.iv14, i64 %indvars.iv11, i64 %indvars.iv7, i64 %indvars.iv
+  store i32 %4, i32* %arrayidx36, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 2046
+  br i1 %exitcond, label %for.body16, label %for.inc37
+
+for.inc37:                                        ; preds = %for.body16
+  %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
+  %exitcond10 = icmp ne i64 %indvars.iv.next8, 1024
+  br i1 %exitcond10, label %for.body12, label %for.inc40
+
+for.inc40:                                        ; preds = %for.inc37
+  %indvars.iv.next12 = add nuw nsw i64 %indvars.iv11, 1
+  %exitcond13 = icmp ne i64 %indvars.iv.next12, 1024
+  br i1 %exitcond13, label %for.body8, label %for.inc43
+
+for.inc43:                                        ; preds = %for.inc40
+  %indvars.iv.next15 = add nuw nsw i64 %indvars.iv14, 1
+  %exitcond17 = icmp ne i64 %indvars.iv.next15, 1024
+  br i1 %exitcond17, label %for.body4, label %for.inc46
+
+for.inc46:                                        ; preds = %for.inc43
+  %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
+  %exitcond21 = icmp ne i64 %indvars.iv.next19, 1023
+  br i1 %exitcond21, label %for.body, label %for.end48
+
+for.end48:                                        ; preds = %for.inc46
+  ret void
+}