
Commit 2d138e2

[Flang][OpenMP] Add FissionWorkdistribute lowering.
Fission logic inspired by ivanradanov's implementation: c97eca4
1 parent 92c4480 commit 2d138e2

File tree

3 files changed: +243 -52 lines

flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp

Lines changed: 182 additions & 51 deletions

@@ -10,31 +10,26 @@
//
//===----------------------------------------------------------------------===//

-#include <flang/Optimizer/Builder/FIRBuilder.h>
-#include <flang/Optimizer/Dialect/FIROps.h>
-#include <flang/Optimizer/Dialect/FIRType.h>
-#include <flang/Optimizer/HLFIR/HLFIROps.h>
-#include <flang/Optimizer/OpenMP/Passes.h>
-#include <llvm/ADT/BreadthFirstIterator.h>
-#include <llvm/ADT/STLExtras.h>
-#include <llvm/ADT/SmallVectorExtras.h>
-#include <llvm/ADT/iterator_range.h>
-#include <llvm/Support/ErrorHandling.h>
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "flang/Optimizer/HLFIR/Passes.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include <mlir/Dialect/Arith/IR/Arith.h>
#include <mlir/Dialect/LLVMIR/LLVMTypes.h>
-#include <mlir/Dialect/OpenMP/OpenMPClauseOperands.h>
-#include <mlir/Dialect/OpenMP/OpenMPDialect.h>
-#include <mlir/Dialect/SCF/IR/SCF.h>
+#include <mlir/Dialect/Utils/IndexingUtils.h>
+#include <mlir/IR/BlockSupport.h>
#include <mlir/IR/BuiltinOps.h>
+#include <mlir/IR/Diagnostics.h>
#include <mlir/IR/IRMapping.h>
-#include <mlir/IR/OpDefinition.h>
#include <mlir/IR/PatternMatch.h>
-#include <mlir/IR/Value.h>
-#include <mlir/IR/Visitors.h>
#include <mlir/Interfaces/SideEffectInterfaces.h>
#include <mlir/Support/LLVM.h>
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
+#include <optional>
#include <variant>

namespace flangomp {
@@ -48,52 +43,188 @@ using namespace mlir;

namespace {

-struct WorkdistributeToSingle : public mlir::OpRewritePattern<mlir::omp::WorkdistributeOp> {
-  using OpRewritePattern::OpRewritePattern;
-  mlir::LogicalResult
-  matchAndRewrite(mlir::omp::WorkdistributeOp workdistribute,
-                  mlir::PatternRewriter &rewriter) const override {
-    auto loc = workdistribute->getLoc();
-    auto teams = llvm::dyn_cast<mlir::omp::TeamsOp>(workdistribute->getParentOp());
-    if (!teams) {
-      mlir::emitError(loc, "workdistribute not nested in teams\n");
-      return mlir::failure();
-    }
-    if (workdistribute.getRegion().getBlocks().size() != 1) {
-      mlir::emitError(loc, "workdistribute with multiple blocks\n");
-      return mlir::failure();
+template <typename T>
+static T getPerfectlyNested(Operation *op) {
+  if (op->getNumRegions() != 1)
+    return nullptr;
+  auto &region = op->getRegion(0);
+  if (region.getBlocks().size() != 1)
+    return nullptr;
+  auto *block = &region.front();
+  auto *firstOp = &block->front();
+  if (auto nested = dyn_cast<T>(firstOp))
+    if (firstOp->getNextNode() == block->getTerminator())
+      return nested;
+  return nullptr;
+}
+
+/// This is the single source of truth about whether we should parallelize an
+/// operation nested in an omp.workdistribute region.
+static bool shouldParallelize(Operation *op) {
+  // Currently we cannot parallelize operations with results that have uses.
+  if (llvm::any_of(op->getResults(),
+                   [](OpResult v) -> bool { return !v.use_empty(); }))
+    return false;
+  // We will parallelize unordered loops - these come from array syntax.
+  if (auto loop = dyn_cast<fir::DoLoopOp>(op)) {
+    auto unordered = loop.getUnordered();
+    if (!unordered)
+      return false;
+    return *unordered;
+  }
+  if (auto callOp = dyn_cast<fir::CallOp>(op)) {
+    auto callee = callOp.getCallee();
+    if (!callee)
+      return false;
+    auto *func = op->getParentOfType<ModuleOp>().lookupSymbol(*callee);
+    // TODO: need to insert a check here whether it is a call we can actually
+    // parallelize currently.
+    if (func->getAttr(fir::FIROpsDialect::getFirRuntimeAttrName()))
+      return true;
+    return false;
+  }
+  // We cannot parallelize anything else.
+  return false;
+}
+
+struct WorkdistributeToSingle : public OpRewritePattern<omp::TeamsOp> {
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult matchAndRewrite(omp::TeamsOp teamsOp,
+                                PatternRewriter &rewriter) const override {
+    auto workdistributeOp = getPerfectlyNested<omp::WorkdistributeOp>(teamsOp);
+    if (!workdistributeOp) {
+      LLVM_DEBUG(llvm::dbgs() << DEBUG_TYPE << " No workdistribute nested\n");
+      return failure();
    }
-    if (teams.getRegion().getBlocks().size() != 1) {
-      mlir::emitError(loc, "teams with multiple blocks\n");
-      return mlir::failure();
+
+    Block *workdistributeBlock = &workdistributeOp.getRegion().front();
+    rewriter.eraseOp(workdistributeBlock->getTerminator());
+    rewriter.inlineBlockBefore(workdistributeBlock, teamsOp);
+    rewriter.eraseOp(teamsOp);
+    workdistributeOp.emitWarning("unable to parallelize coexecute");
+    return success();
+  }
+};
+
+/// If B() and D() are parallelizable,
+///
+/// omp.teams {
+///   omp.workdistribute {
+///     A()
+///     B()
+///     C()
+///     D()
+///     E()
+///   }
+/// }
+///
+/// becomes
+///
+/// A()
+/// omp.teams {
+///   omp.workdistribute {
+///     B()
+///   }
+/// }
+/// C()
+/// omp.teams {
+///   omp.workdistribute {
+///     D()
+///   }
+/// }
+/// E()
+
+struct FissionWorkdistribute
+    : public OpRewritePattern<omp::WorkdistributeOp> {
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult
+  matchAndRewrite(omp::WorkdistributeOp workdistribute,
+                  PatternRewriter &rewriter) const override {
+    auto loc = workdistribute->getLoc();
+    auto teams = dyn_cast<omp::TeamsOp>(workdistribute->getParentOp());
+    if (!teams) {
+      emitError(loc, "workdistribute not nested in teams\n");
+      return failure();
+    }
+    if (workdistribute.getRegion().getBlocks().size() != 1) {
+      emitError(loc, "workdistribute with multiple blocks\n");
+      return failure();
+    }
+    if (teams.getRegion().getBlocks().size() != 1) {
+      emitError(loc, "teams with multiple blocks\n");
+      return failure();
+    }
+    if (teams.getRegion().getBlocks().front().getOperations().size() != 2) {
+      emitError(loc, "teams with multiple nested ops\n");
+      return failure();
+    }
+
+    auto *teamsBlock = &teams.getRegion().front();
+
+    // While we have unhandled operations in the original workdistribute.
+    auto *workdistributeBlock = &workdistribute.getRegion().front();
+    auto *terminator = workdistributeBlock->getTerminator();
+    bool changed = false;
+    while (&workdistributeBlock->front() != terminator) {
+      rewriter.setInsertionPoint(teams);
+      IRMapping mapping;
+      llvm::SmallVector<Operation *> hoisted;
+      Operation *parallelize = nullptr;
+      for (auto &op : workdistribute.getOps()) {
+        if (&op == terminator) {
+          break;
        }
-    if (teams.getRegion().getBlocks().front().getOperations().size() != 2) {
-      mlir::emitError(loc, "teams with multiple nested ops\n");
-      return mlir::failure();
+        if (shouldParallelize(&op)) {
+          parallelize = &op;
+          break;
+        } else {
+          rewriter.clone(op, mapping);
+          hoisted.push_back(&op);
+          changed = true;
        }
-    mlir::Block *workdistributeBlock = &workdistribute.getRegion().front();
-    rewriter.eraseOp(workdistributeBlock->getTerminator());
-    rewriter.inlineBlockBefore(workdistributeBlock, teams);
-    rewriter.eraseOp(teams);
-    return mlir::success();
+      }
+
+      for (auto *op : hoisted)
+        rewriter.replaceOp(op, mapping.lookup(op));
+
+      if (parallelize && hoisted.empty() &&
+          parallelize->getNextNode() == terminator)
+        break;
+      if (parallelize) {
+        auto newTeams = rewriter.cloneWithoutRegions(teams);
+        auto *newTeamsBlock = rewriter.createBlock(
+            &newTeams.getRegion(), newTeams.getRegion().begin(), {}, {});
+        for (auto arg : teamsBlock->getArguments())
+          newTeamsBlock->addArgument(arg.getType(), arg.getLoc());
+        auto newWorkdistribute = rewriter.create<omp::WorkdistributeOp>(loc);
+        rewriter.create<omp::TerminatorOp>(loc);
+        rewriter.createBlock(&newWorkdistribute.getRegion(),
+                             newWorkdistribute.getRegion().begin(), {}, {});
+        auto *cloned = rewriter.clone(*parallelize);
+        rewriter.replaceOp(parallelize, cloned);
+        rewriter.create<omp::TerminatorOp>(loc);
+        changed = true;
+      }
    }
+    return success(changed);
+  }
};

class LowerWorkdistributePass
    : public flangomp::impl::LowerWorkdistributeBase<LowerWorkdistributePass> {
public:
  void runOnOperation() override {
-    mlir::MLIRContext &context = getContext();
-    mlir::RewritePatternSet patterns(&context);
-    mlir::GreedyRewriteConfig config;
+    MLIRContext &context = getContext();
+    RewritePatternSet patterns(&context);
+    GreedyRewriteConfig config;
    // prevent the pattern driver from merging blocks
    config.setRegionSimplificationLevel(
-        mlir::GreedySimplifyRegionLevel::Disabled);
+        GreedySimplifyRegionLevel::Disabled);

-    patterns.insert<WorkdistributeToSingle>(&context);
-    mlir::Operation *op = getOperation();
-    if (mlir::failed(mlir::applyPatternsGreedily(op, std::move(patterns), config))) {
-      mlir::emitError(op->getLoc(), DEBUG_TYPE " pass failed\n");
+    patterns.insert<FissionWorkdistribute, WorkdistributeToSingle>(&context);
+    Operation *op = getOperation();
+    if (failed(applyPatternsGreedily(op, std::move(patterns), config))) {
+      emitError(op->getLoc(), DEBUG_TYPE " pass failed\n");
      signalPassFailure();
    }
  }
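For orientation, shouldParallelize above currently admits exactly two kinds of operations, and only when their results are unused: unordered fir.do_loop ops (generated from array syntax) and fir.call ops whose callee carries the fir.runtime attribute. A rough MLIR sketch of the distinction follows; the names %lb, %ub, %c1, %ref and @assumed_runtime_func are illustrative placeholders, not taken from this commit.

// Fissioned into its own omp.teams/omp.workdistribute nest: unordered loop.
fir.do_loop %i = %lb to %ub step %c1 unordered {
  // loop body
}
// Also fissioned: call to a function declared with the fir.runtime attribute.
fir.call @assumed_runtime_func(%ref) : (!fir.ref<f32>) -> ()
// Hoisted before the omp.teams instead: any other op, or an op whose results have uses.
%x = fir.load %ref : !fir.ref<f32>

The new test below exercises both parallelizable cases together with hoisted side-effecting ops.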
Lines changed: 60 additions & 0 deletions

@@ -0,0 +1,60 @@
// RUN: fir-opt --lower-workdistribute %s | FileCheck %s

// CHECK-LABEL: func.func @test_fission_workdistribute({{.*}}) {
// CHECK:   %[[VAL_0:.*]] = arith.constant 0 : index
// CHECK:   %[[VAL_1:.*]] = arith.constant 1 : index
// CHECK:   %[[VAL_2:.*]] = arith.constant 9 : index
// CHECK:   %[[VAL_3:.*]] = arith.constant 5.000000e+00 : f32
// CHECK:   fir.store %[[VAL_3]] to %[[ARG2:.*]] : !fir.ref<f32>
// CHECK:   fir.do_loop %[[VAL_4:.*]] = %[[VAL_0]] to %[[VAL_2]] step %[[VAL_1]] unordered {
// CHECK:     %[[VAL_5:.*]] = fir.coordinate_of %[[ARG0:.*]], %[[VAL_4]] : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
// CHECK:     %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<f32>
// CHECK:     %[[VAL_7:.*]] = fir.coordinate_of %[[ARG1:.*]], %[[VAL_4]] : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
// CHECK:     fir.store %[[VAL_6]] to %[[VAL_7]] : !fir.ref<f32>
// CHECK:   }
// CHECK:   fir.call @regular_side_effect_func(%[[ARG2:.*]]) : (!fir.ref<f32>) -> ()
// CHECK:   fir.call @my_fir_parallel_runtime_func(%[[ARG3:.*]]) : (!fir.ref<f32>) -> ()
// CHECK:   fir.do_loop %[[VAL_8:.*]] = %[[VAL_0]] to %[[VAL_2]] step %[[VAL_1]] {
// CHECK:     %[[VAL_9:.*]] = fir.coordinate_of %[[ARG0:.*]], %[[VAL_8]] : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
// CHECK:     fir.store %[[VAL_3]] to %[[VAL_9]] : !fir.ref<f32>
// CHECK:   }
// CHECK:   %[[VAL_10:.*]] = fir.load %[[ARG2:.*]] : !fir.ref<f32>
// CHECK:   fir.store %[[VAL_10]] to %[[ARG3:.*]] : !fir.ref<f32>
// CHECK:   return
// CHECK: }
module {
  func.func @regular_side_effect_func(%arg0: !fir.ref<f32>) {
    return
  }
  func.func @my_fir_parallel_runtime_func(%arg0: !fir.ref<f32>) attributes {fir.runtime} {
    return
  }
  func.func @test_fission_workdistribute(%arr1: !fir.ref<!fir.array<10xf32>>, %arr2: !fir.ref<!fir.array<10xf32>>, %scalar_ref1: !fir.ref<f32>, %scalar_ref2: !fir.ref<f32>) {
    %c0_idx = arith.constant 0 : index
    %c1_idx = arith.constant 1 : index
    %c9_idx = arith.constant 9 : index
    %float_val = arith.constant 5.0 : f32
    omp.teams {
      omp.workdistribute {
        fir.store %float_val to %scalar_ref1 : !fir.ref<f32>
        fir.do_loop %iv = %c0_idx to %c9_idx step %c1_idx unordered {
          %elem_ptr_arr1 = fir.coordinate_of %arr1, %iv : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
          %loaded_val_loop1 = fir.load %elem_ptr_arr1 : !fir.ref<f32>
          %elem_ptr_arr2 = fir.coordinate_of %arr2, %iv : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
          fir.store %loaded_val_loop1 to %elem_ptr_arr2 : !fir.ref<f32>
        }
        fir.call @regular_side_effect_func(%scalar_ref1) : (!fir.ref<f32>) -> ()
        fir.call @my_fir_parallel_runtime_func(%scalar_ref2) : (!fir.ref<f32>) -> ()
        fir.do_loop %jv = %c0_idx to %c9_idx step %c1_idx {
          %elem_ptr_ordered_loop = fir.coordinate_of %arr1, %jv : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
          fir.store %float_val to %elem_ptr_ordered_loop : !fir.ref<f32>
        }
        %loaded_for_hoist = fir.load %scalar_ref1 : !fir.ref<f32>
        fir.store %loaded_for_hoist to %scalar_ref2 : !fir.ref<f32>
        omp.terminator
      }
      omp.terminator
    }
    return
  }
}
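To see how the two patterns compose on this test: after FissionWorkdistribute alone, each parallelizable op (the unordered loop and the fir.runtime call) would sit in its own omp.teams/omp.workdistribute nest, with the remaining ops hoisted around them, roughly as in the sketch below (an illustrative intermediate state, not checked output; loop bodies elided). Since the pass currently registers only these two patterns, WorkdistributeToSingle then collapses each single-op nest, emitting the "unable to parallelize coexecute" warning, which is why the CHECK lines above contain no omp.teams at all.

fir.store %float_val to %scalar_ref1 : !fir.ref<f32>
omp.teams {
  omp.workdistribute {
    fir.do_loop %iv = %c0_idx to %c9_idx step %c1_idx unordered { ... }
    omp.terminator
  }
  omp.terminator
}
fir.call @regular_side_effect_func(%scalar_ref1) : (!fir.ref<f32>) -> ()
omp.teams {
  omp.workdistribute {
    fir.call @my_fir_parallel_runtime_func(%scalar_ref2) : (!fir.ref<f32>) -> ()
    omp.terminator
  }
  omp.terminator
}
fir.do_loop %jv = %c0_idx to %c9_idx step %c1_idx { ... }
%loaded_for_hoist = fir.load %scalar_ref1 : !fir.ref<f32>
fir.store %loaded_for_hoist to %scalar_ref2 : !fir.ref<f32>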

flang/test/Transforms/OpenMP/lower-workdistribute.mlir renamed to flang/test/Transforms/OpenMP/lower-workdistribute-to-single.mlir

Lines changed: 1 addition & 1 deletion

@@ -49,4 +49,4 @@ func.func @_QPtarget_simple() {
    omp.terminator
  }
  return
-}
+}
