Skip to content

Commit f353b06

Browse files
committed
[MLIR][OpenMP] Host support for 'distribute'
This patch enables LLVM IR code generation for the standalone 'distribute' and the composite 'distribute parallel do' constructs by appropriately updating the generated runtime calls. For the 'distribute parallel do' case, the generated code intentionally does not match clang, which produces two nested loops (each one using `__kmpc_for_static_init` to set its loop bounds). Instead, `__kmpc_dist_for_static_init` runtime calls are produced. Matching clang would require lowering the same `omp.loop_nest` twice, passing updated values from the outer loop into the inner loop, and forcing the MLIR to LLVM IR translation process to not follow the nesting hierarchy of the "hoisted parallel" MLIR representation. With this approach, the existing support for each of the loop wrappers involved is reused, which greatly simplifies the process.
1 parent f2992aa commit f353b06

File tree

3 files changed

+50
-21
lines changed

3 files changed

+50
-21
lines changed

llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1029,12 +1029,12 @@ class OpenMPIRBuilder {
10291029
/// preheader of the loop.
10301030
/// \param NeedsBarrier Indicates whether a barrier must be inserted after
10311031
/// the loop.
1032+
/// \param LoopType Type of workshare loop.
10321033
///
10331034
/// \returns Point where to insert code after the workshare construct.
1034-
InsertPointOrErrorTy applyStaticWorkshareLoop(DebugLoc DL,
1035-
CanonicalLoopInfo *CLI,
1036-
InsertPointTy AllocaIP,
1037-
bool NeedsBarrier);
1035+
InsertPointOrErrorTy applyStaticWorkshareLoop(
1036+
DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
1037+
omp::WorksharingLoopType LoopType, bool NeedsBarrier);
10381038

10391039
/// Modifies the canonical loop to be a statically-scheduled workshare loop with a
10401040
/// user-specified chunk size.

llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2319,7 +2319,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
23192319
return LoopInfo.takeError();
23202320

23212321
InsertPointOrErrorTy WsloopIP =
2322-
applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP, !IsNowait);
2322+
applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2323+
WorksharingLoopType::ForStaticLoop, !IsNowait);
23232324
if (!WsloopIP)
23242325
return WsloopIP.takeError();
23252326
InsertPointTy AfterIP = *WsloopIP;
@@ -4224,6 +4225,23 @@ Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
42244225
return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
42254226
}
42264227

4228+
// Returns an LLVM function to call for initializing loop bounds using OpenMP
4229+
// static scheduling for composite `distribute parallel for` depending on
4230+
// `Ty`. Only i32 and i64 are supported by the runtime. Always interpret
4231+
// integers as unsigned similarly to CanonicalLoopInfo.
4232+
static FunctionCallee
4233+
getKmpcDistForStaticInitForType(Type *Ty, Module &M,
4234+
OpenMPIRBuilder &OMPBuilder) {
4235+
unsigned Bitwidth = Ty->getIntegerBitWidth();
4236+
if (Bitwidth == 32)
4237+
return OMPBuilder.getOrCreateRuntimeFunction(
4238+
M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
4239+
if (Bitwidth == 64)
4240+
return OMPBuilder.getOrCreateRuntimeFunction(
4241+
M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
4242+
llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4243+
}
4244+
42274245
// Returns an LLVM function to call for initializing loop bounds using OpenMP
42284246
// static scheduling depending on `Ty`. Only i32 and i64 are supported by the
42294247
// runtime. Always interpret integers as unsigned similarly to
@@ -4240,10 +4258,9 @@ static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
42404258
llvm_unreachable("unknown OpenMP loop iterator bitwidth");
42414259
}
42424260

4243-
OpenMPIRBuilder::InsertPointOrErrorTy
4244-
OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4245-
InsertPointTy AllocaIP,
4246-
bool NeedsBarrier) {
4261+
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
4262+
DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4263+
WorksharingLoopType LoopType, bool NeedsBarrier) {
42474264
assert(CLI->isValid() && "Requires a valid canonical loop");
42484265
assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
42494266
"Require dedicated allocate IP");
@@ -4259,7 +4276,10 @@ OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
42594276
// Declare useful OpenMP runtime functions.
42604277
Value *IV = CLI->getIndVar();
42614278
Type *IVTy = IV->getType();
4262-
FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
4279+
FunctionCallee StaticInit =
4280+
LoopType == WorksharingLoopType::DistributeForStaticLoop
4281+
? getKmpcDistForStaticInitForType(IVTy, M, *this)
4282+
: getKmpcForStaticInitForType(IVTy, M, *this);
42634283
FunctionCallee StaticFini =
42644284
getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
42654285

@@ -4286,14 +4306,24 @@ OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
42864306

42874307
Value *ThreadNum = getOrCreateThreadID(SrcLoc);
42884308

4289-
Constant *SchedulingType = ConstantInt::get(
4290-
I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
4309+
OMPScheduleType SchedType =
4310+
(LoopType == WorksharingLoopType::DistributeStaticLoop)
4311+
? OMPScheduleType::OrderedDistribute
4312+
: OMPScheduleType::UnorderedStatic;
4313+
Constant *SchedulingType =
4314+
ConstantInt::get(I32Type, static_cast<int>(SchedType));
42914315

42924316
// Call the "init" function and update the trip count of the loop with the
42934317
// value it produced.
4294-
Builder.CreateCall(StaticInit,
4295-
{SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
4296-
PUpperBound, PStride, One, Zero});
4318+
SmallVector<Value *, 10> Args(
4319+
{SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, PUpperBound});
4320+
if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4321+
Value *PDistUpperBound =
4322+
Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
4323+
Args.push_back(PDistUpperBound);
4324+
}
4325+
Args.append({PStride, One, Zero});
4326+
Builder.CreateCall(StaticInit, Args);
42974327
Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
42984328
Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
42994329
Value *TripCountMinusOne =
@@ -4755,7 +4785,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
47554785
return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
47564786
NeedsBarrier, ChunkSize);
47574787
// FIXME: Monotonicity ignored?
4758-
return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
4788+
return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier);
47594789

47604790
case OMPScheduleType::BaseStaticChunked:
47614791
if (IsOrdered)

mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4208,9 +4208,8 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
42084208

42094209
// Skip applying a workshare loop below when translating 'distribute
42104210
// parallel do' (it's been already handled by this point while translating
4211-
// the nested omp.wsloop) and when not targeting a GPU.
4212-
if (isa_and_present<omp::WsloopOp>(distributeOp.getNestedWrapper()) ||
4213-
!ompBuilder->Config.isGPU())
4211+
// the nested omp.wsloop).
4212+
if (isa_and_present<omp::WsloopOp>(distributeOp.getNestedWrapper()))
42144213
return llvm::Error::success();
42154214

42164215
// TODO: Add support for clauses which are valid for DISTRIBUTE construct
@@ -4221,13 +4220,13 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
42214220
bool isSimd = false;
42224221
llvm::omp::WorksharingLoopType workshareLoopType =
42234222
llvm::omp::WorksharingLoopType::DistributeStaticLoop;
4224-
bool loopNeedsBarier = true;
4223+
bool loopNeedsBarrier = false;
42254224
llvm::Value *chunk = nullptr;
42264225

42274226
llvm::CanonicalLoopInfo *loopInfo = findCurrentLoopInfo(moduleTranslation);
42284227
llvm::OpenMPIRBuilder::InsertPointOrErrorTy wsloopIP =
42294228
ompBuilder->applyWorkshareLoop(
4230-
ompLoc.DL, loopInfo, allocaIP, loopNeedsBarier,
4229+
ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier,
42314230
convertToScheduleKind(schedule), chunk, isSimd,
42324231
scheduleMod == omp::ScheduleModifier::monotonic,
42334232
scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,

0 commit comments

Comments
 (0)