[mlir] Add an option to control the number of loops in affine parallelizer

ftynse · ftynse · commit 80766ecc6509 · 2020-12-08T10:44:37.000+01:00
Add a pass option, `max-nested`, to control the maximum number of nested parallel loops produced by the affine parallelization pass. This is useful for building end-to-end pipelines targeting systems that don't need multiple parallel dimensions (e.g., CPUs typically need only one). Reviewed By: wsmoses, chelini. Differential Revision: https://reviews.llvm.org/D92765
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -118,6 +118,11 @@ def AffineVectorize : FunctionPass<"affine-super-vectorize"> {
 def AffineParallelize : FunctionPass<"affine-parallelize"> {
   let summary = "Convert affine.for ops into 1-D affine.parallel";
   let constructor = "mlir::createAffineParallelizePass()";
+  let options = [
+    Option<"maxNested", "max-nested", "unsigned", /*default=*/"-1u",
+           "Maximum number of nested parallel loops to produce. "
+           "Defaults to unlimited (UINT_MAX).">,
+  ];
 }
 
 def AffineLoopNormalize : FunctionPass<"affine-loop-normalize"> {
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
@@ -36,13 +36,28 @@ struct AffineParallelize : public AffineParallelizeBase<AffineParallelize> {
 
 void AffineParallelize::runOnFunction() {
   FuncOp f = getFunction();
-  SmallVector<AffineForOp, 8> parallelizableLoops;
+
+  // The walker proceeds in post-order, but we need to process outer loops first
+  // to control the number of outer parallel loops, so push candidate loops to
+  // the front of a deque.
+  std::deque<AffineForOp> parallelizableLoops;
   f.walk([&](AffineForOp loop) {
     if (isLoopParallel(loop))
-      parallelizableLoops.push_back(loop);
+      parallelizableLoops.push_front(loop);
   });
-  for (AffineForOp loop : parallelizableLoops)
-    affineParallelize(loop);
+
+  for (AffineForOp loop : parallelizableLoops) {
+    unsigned numParentParallelOps = 0;
+    for (Operation *op = loop->getParentOp();
+         op != nullptr && !op->hasTrait<OpTrait::AffineScope>();
+         op = op->getParentOp()) {
+      if (isa<AffineParallelOp>(op))
+        ++numParentParallelOps;
+    }
+
+    if (numParentParallelOps < maxNested)
+      affineParallelize(loop);
+  }
 }
 
 std::unique_ptr<OperationPass<FuncOp>> mlir::createAffineParallelizePass() {
diff --git a/mlir/test/Dialect/Affine/parallelize.mlir b/mlir/test/Dialect/Affine/parallelize.mlir
@@ -1,4 +1,5 @@
 // RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize| FileCheck %s
+// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize='max-nested=1' | FileCheck --check-prefix=MAX-NESTED %s
 
 // CHECK-LABEL:    func @reduce_window_max() {
 func @reduce_window_max() {
@@ -144,3 +145,18 @@ func @nested_for_with_minmax(%m: memref<?xf32>, %lb0: index,
   }
   return
 }
+
+// MAX-NESTED-LABEL: @max_nested
+func @max_nested(%m: memref<?x?xf32>, %lb0: index, %lb1: index,
+                 %ub0: index, %ub1: index) {
+  // MAX-NESTED: affine.parallel
+  affine.for %i = affine_map<(d0) -> (d0)>(%lb0) to affine_map<(d0) -> (d0)>(%ub0) {
+    // MAX-NESTED: affine.for
+    affine.for %j = affine_map<(d0) -> (d0)>(%lb1) to affine_map<(d0) -> (d0)>(%ub1) {
+      affine.load %m[%i, %j] : memref<?x?xf32>
+    }
+  }
+  return
+}
+
+