Skip to content

Commit 27c02ed

Browse files
committed
[flang][OpenMP] Privatize locally destroyed values in do concurrent
Locally destroyed values are those values for which the Fortran runtime calls `@_FortranADestroy` inside the loop's body. If these values are allocated outside the loop, and the loop is mapped to OpenMP, then a runtime error would occur due to multiple teams trying to access the same allocation. In such cases, a local privatized value is created in the OpenMP region to prevent multiple teams of threads from accessing and destroying the same memory block, which causes runtime issues.
1 parent d5a3c4d commit 27c02ed

File tree

3 files changed

+167
-9
lines changed

3 files changed

+167
-9
lines changed

flang/docs/DoConcurrentConversionToOpenMP.md

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -234,9 +234,16 @@ see the "Data environment" section below.
234234
By default, variables that are used inside a `do concurrent` loop nest are
235235
either treated as `shared` in case of mapping to `host`, or mapped into the
236236
`target` region using a `map` clause in case of mapping to `device`. The only
237-
exception to this is the loop's iteration variable(s) (IV) of **perfect** loop
238-
nest. In that case, for each IV, we allocate a local copy as shown the by the
239-
mapping examples above.
237+
exceptions to this are:
238+
1. the loop's iteration variable(s) (IV) of **perfect** loop nest. In that
239+
case, for each IV, we allocate a local copy as shown by the mapping
240+
examples above.
241+
1. any values that are allocated outside the loop nest and destroyed inside
242+
of it. In such cases, a local privatized value is created in the OpenMP
243+
region to prevent multiple teams of threads from accessing and destroying
244+
the same memory block which causes runtime issues. For an example of such
245+
cases, see
246+
`flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90`.
240247

241248
#### Non-perfectly-nested loops' IVs
242249

flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp

Lines changed: 91 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@
1515
#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
1616
#include "flang/Optimizer/HLFIR/HLFIROps.h"
1717
#include "flang/Optimizer/Transforms/Passes.h"
18+
#include "mlir/Analysis/SliceAnalysis.h"
19+
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
1820
#include "mlir/Dialect/Func/IR/FuncOps.h"
21+
#include "mlir/Dialect/Math/IR/Math.h"
1922
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
2023
#include "mlir/IR/Diagnostics.h"
2124
#include "mlir/IR/IRMapping.h"
@@ -468,6 +471,77 @@ void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter,
468471
++idx;
469472
}
470473
}
474+
475+
/// Collects values are that are destroyed by the Fortran runtime within the
476+
/// loop's scope.
477+
///
478+
/// \param [in] doLoop - the loop within which the function searches for locally
479+
/// destroyed values.
480+
///
481+
/// \param [out] local - a map from locally destroyed values to the runtime
482+
/// destroy opertaions that destroy them.
483+
void collectLocallyDestroyedValuesInLoop(
484+
fir::DoLoopOp doLoop,
485+
llvm::DenseMap<mlir::Value, mlir::Operation *> &locals) {
486+
constexpr static auto destroy{"_FortranADestroy"};
487+
doLoop.getRegion().walk([&](fir::CallOp call) {
488+
auto callee = call.getCallee();
489+
490+
if (!callee.has_value())
491+
return;
492+
493+
if (callee.value().getLeafReference().str() != destroy)
494+
return;
495+
496+
assert(call.getNumOperands() == 1);
497+
498+
mlir::BackwardSliceOptions options;
499+
options.inclusive = true;
500+
llvm::SetVector<mlir::Operation *> opSlice;
501+
mlir::getBackwardSlice(call, &opSlice, options);
502+
503+
if (auto alloca = mlir::dyn_cast_if_present<fir::AllocaOp>(opSlice.front()))
504+
locals.try_emplace(alloca.getResult(), call);
505+
});
506+
}
507+
508+
/// For a locally destroyed value \p local within a loop's scope, localizes that
509+
/// value within the scope of the parallel region the loop maps to. Towards that
510+
/// end, this function allocates a private copy of \p local within \p
511+
/// allocRegion.
512+
///
513+
/// \param local - the locally destroyed value within a loop's scope (see
514+
/// collectLocallyDestroyedValuesInLoop).
515+
///
516+
/// \param localDestroyer - the Fortran runtime call operation that destroys \p
517+
/// local.
518+
///
519+
/// \param allocRegion - the parallel region where \p local's allocation will be
520+
/// cloned (i.e. privatized).
521+
///
522+
/// \param rewriter - builder used for updating \p allocRegion.
523+
///
524+
/// \param mapper - mapper to track updated references \p local within \p
525+
/// allocRegion.
526+
void localizeLocallyDestroyedValue(mlir::Value local,
527+
mlir::Operation *localDestroyer,
528+
mlir::Region &allocRegion,
529+
mlir::ConversionPatternRewriter &rewriter,
530+
mlir::IRMapping &mapper) {
531+
mlir::Region *loopRegion = localDestroyer->getParentRegion();
532+
assert(loopRegion != nullptr);
533+
534+
mlir::IRRewriter::InsertPoint ip = rewriter.saveInsertionPoint();
535+
rewriter.setInsertionPointToStart(&allocRegion.front());
536+
mlir::Operation *newLocalDef = rewriter.clone(*local.getDefiningOp(), mapper);
537+
rewriter.replaceUsesWithIf(
538+
local, newLocalDef->getResult(0), [&](mlir::OpOperand &operand) {
539+
return operand.getOwner()->getParentRegion() == loopRegion;
540+
});
541+
mapper.map(local, newLocalDef->getResult(0));
542+
543+
rewriter.restoreInsertionPoint(ip);
544+
}
471545
} // namespace looputils
472546

473547
class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
@@ -519,9 +593,14 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
519593
bool hasRemainingNestedLoops =
520594
failed(looputils::collectLoopNest(doLoop, loopNest));
521595

596+
mlir::IRMapping mapper;
597+
598+
llvm::DenseMap<mlir::Value, mlir::Operation *> locals;
599+
looputils::collectLocallyDestroyedValuesInLoop(loopNest.back().first,
600+
locals);
601+
522602
looputils::sinkLoopIVArgs(rewriter, loopNest);
523603

524-
mlir::IRMapping mapper;
525604
mlir::omp::TargetOp targetOp;
526605
mlir::omp::LoopNestClauseOps loopNestClauseOps;
527606

@@ -541,8 +620,13 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
541620
genDistributeOp(doLoop.getLoc(), rewriter);
542621
}
543622

544-
genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper,
545-
loopNestClauseOps);
623+
mlir::omp::ParallelOp parallelOp = genParallelOp(
624+
doLoop.getLoc(), rewriter, loopNest, mapper, loopNestClauseOps);
625+
626+
for (auto &[local, localDestroyer] : locals)
627+
looputils::localizeLocallyDestroyedValue(
628+
local, localDestroyer, parallelOp.getRegion(), rewriter, mapper);
629+
546630
mlir::omp::LoopNestOp ompLoopNest =
547631
genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps);
548632

@@ -919,9 +1003,10 @@ class DoConcurrentConversionPass
9191003
context, mapTo == fir::omp::DoConcurrentMappingKind::DCMK_Device,
9201004
concurrentLoopsToSkip);
9211005
mlir::ConversionTarget target(*context);
922-
target.addLegalDialect<fir::FIROpsDialect, hlfir::hlfirDialect,
923-
mlir::arith::ArithDialect, mlir::func::FuncDialect,
924-
mlir::omp::OpenMPDialect>();
1006+
target.addLegalDialect<
1007+
fir::FIROpsDialect, hlfir::hlfirDialect, mlir::arith::ArithDialect,
1008+
mlir::func::FuncDialect, mlir::omp::OpenMPDialect,
1009+
mlir::cf::ControlFlowDialect, mlir::math::MathDialect>();
9251010

9261011
target.addDynamicallyLegalOp<fir::DoLoopOp>([&](fir::DoLoopOp op) {
9271012
return !op.getUnordered() || concurrentLoopsToSkip.contains(op);
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
! Tests that locally destroyed values in a `do concurrent` loop are properly
2+
! handled. Locally destroyed values are those values for which the Fortran runtime
3+
! calls `@_FortranADestroy` inside the loops body. If these values are allocated
4+
! outside the loop, and the loop is mapped to OpenMP, then a runtime error would
5+
! occur due to multiple teams trying to access the same allocation.
6+
7+
! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=host %s -o - \
8+
! RUN: | FileCheck %s
9+
10+
module struct_mod
11+
type test_struct
12+
integer, allocatable :: x_
13+
end type
14+
15+
interface test_struct
16+
pure module function construct_from_components(x) result(struct)
17+
implicit none
18+
integer, intent(in) :: x
19+
type(test_struct) struct
20+
end function
21+
end interface
22+
end module
23+
24+
submodule(struct_mod) struct_sub
25+
implicit none
26+
27+
contains
28+
module procedure construct_from_components
29+
struct%x_ = x
30+
end procedure
31+
end submodule struct_sub
32+
33+
program main
34+
use struct_mod, only : test_struct
35+
36+
implicit none
37+
type(test_struct), dimension(10) :: a
38+
integer :: i
39+
integer :: total
40+
41+
do concurrent (i=1:10)
42+
a(i) = test_struct(i)
43+
end do
44+
45+
do i=1,10
46+
total = total + a(i)%x_
47+
end do
48+
49+
print *, "total =", total
50+
end program main
51+
52+
! CHECK: omp.parallel {
53+
! CHECK: %[[LOCAL_TEMP:.*]] = fir.alloca !fir.type<_QMstruct_modTtest_struct{x_:!fir.box<!fir.heap<i32>>}> {bindc_name = ".result"}
54+
! CHECK: omp.wsloop {
55+
! CHECK: omp.loop_nest {{.*}} {
56+
! CHECK: %[[TEMP_VAL:.*]] = fir.call @_QMstruct_modPconstruct_from_components
57+
! CHECK: fir.save_result %[[TEMP_VAL]] to %[[LOCAL_TEMP]]
58+
! CHECK: %[[EMBOXED_LOCAL:.*]] = fir.embox %[[LOCAL_TEMP]]
59+
! CHECK: %[[CONVERTED_LOCAL:.*]] = fir.convert %[[EMBOXED_LOCAL]]
60+
! CHECK: fir.call @_FortranADestroy(%[[CONVERTED_LOCAL]])
61+
! CHECK: omp.yield
62+
! CHECK: }
63+
! CHECK: omp.terminator
64+
! CHECK: }
65+
! CHECK: omp.terminator
66+
! CHECK: }

0 commit comments

Comments
 (0)