[mlir] properly support min/max in affine parallelization

ftynse · ftynse · commit 2fe30a3534da · 2020-12-08T10:43:35.000+01:00
The existing implementation of the affine parallelization silently copies over the lower and upper bound maps from affine.for to affine.parallel. However, the semantics of these maps differ between these two ops: in affine.for, a max(min) of results is taken for the lower(upper) bound; in affine.parallel, multiple induction variables can be defined an each result corresponds to one induction variable. Thus the existing implementation could generate invalid IR or IR that passes the verifier but has different semantics than the original code. Fix the parallelization utility to emit dedicated min/max operations before the affine.parallel in such cases. Disallow parallelization if min/max would have been in an operation without the AffineScope trait, e.g., in another loop, since the result of these operations is not considered a valid affine dimension identifier and may not be properly handled by the affine analyses. Reviewed By: wsmoses Differential Revision: https://reviews.llvm.org/D92763
diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
@@ -134,11 +134,43 @@ static AffineIfOp hoistAffineIfOp(AffineIfOp ifOp, Operation *hoistOverOp) {
 void mlir::affineParallelize(AffineForOp forOp) {
   Location loc = forOp.getLoc();
   OpBuilder outsideBuilder(forOp);
+
+  // If a loop has a 'max' in the lower bound, emit it outside the parallel loop
+  // as it does not have implicit 'max' behavior.
+  AffineMap lowerBoundMap = forOp.getLowerBoundMap();
+  ValueRange lowerBoundOperands = forOp.getLowerBoundOperands();
+  AffineMap upperBoundMap = forOp.getUpperBoundMap();
+  ValueRange upperBoundOperands = forOp.getUpperBoundOperands();
+
+  bool needsMax = lowerBoundMap.getNumResults() > 1;
+  bool needsMin = upperBoundMap.getNumResults() > 1;
+  AffineMap identityMap;
+  if (needsMax || needsMin) {
+    if (forOp->getParentOp() &&
+        !forOp->getParentOp()->hasTrait<OpTrait::AffineScope>())
+      return;
+
+    identityMap = AffineMap::getMultiDimIdentityMap(1, loc->getContext());
+  }
+  if (needsMax) {
+    auto maxOp = outsideBuilder.create<AffineMaxOp>(loc, lowerBoundMap,
+                                                    lowerBoundOperands);
+    lowerBoundMap = identityMap;
+    lowerBoundOperands = maxOp->getResults();
+  }
+
+  // Same for the upper bound.
+  if (needsMin) {
+    auto minOp = outsideBuilder.create<AffineMinOp>(loc, upperBoundMap,
+                                                    upperBoundOperands);
+    upperBoundMap = identityMap;
+    upperBoundOperands = minOp->getResults();
+  }
+
   // Creating empty 1-D affine.parallel op.
   AffineParallelOp newPloop = outsideBuilder.create<AffineParallelOp>(
-      loc, llvm::None, llvm::None, forOp.getLowerBoundMap(),
-      forOp.getLowerBoundOperands(), forOp.getUpperBoundMap(),
-      forOp.getUpperBoundOperands());
+      loc, llvm::None, llvm::None, lowerBoundMap, lowerBoundOperands,
+      upperBoundMap, upperBoundOperands);
   // Steal the body of the old affine for op and erase it.
   newPloop.region().takeBody(forOp.region());
   forOp.erase();
diff --git a/mlir/test/Dialect/Affine/parallelize.mlir b/mlir/test/Dialect/Affine/parallelize.mlir
@@ -114,3 +114,33 @@ func @non_affine_load() {
   }
   return
 }
+
+// CHECK-LABEL: for_with_minmax
+func @for_with_minmax(%m: memref<?xf32>, %lb0: index, %lb1: index,
+                      %ub0: index, %ub1: index) {
+  // CHECK: %[[lb:.*]] = affine.max
+  // CHECK: %[[ub:.*]] = affine.min
+  // CHECK: affine.parallel (%{{.*}}) = (%[[lb]]) to (%[[ub]])
+  affine.for %i = max affine_map<(d0, d1) -> (d0, d1)>(%lb0, %lb1)
+          to min affine_map<(d0, d1) -> (d0, d1)>(%ub0, %ub1) {
+    affine.load %m[%i] : memref<?xf32>
+  }
+  return
+}
+
+// CHECK-LABEL: nested_for_with_minmax
+func @nested_for_with_minmax(%m: memref<?xf32>, %lb0: index,
+                             %ub0: index, %ub1: index) {
+  // CHECK: affine.parallel
+  affine.for %j = 0 to 10 {
+    // Cannot parallelize the inner loop because we would need to compute
+    // affine.max for its lower bound inside the loop, and that is not (yet)
+    // considered as a valid affine dimension.
+    // CHECK: affine.for
+    affine.for %i = max affine_map<(d0, d1) -> (d0, d1)>(%lb0, %j)
+            to min affine_map<(d0, d1) -> (d0, d1)>(%ub0, %ub1) {
+      affine.load %m[%i] : memref<?xf32>
+    }
+  }
+  return
+}