Skip to content

Commit 46caad5

Browse files
authored
[flang][cuda] Do not produce data transfer in offloaded do concurrent (#147435)
If a `do concurrent` loop is offloaded, then there should be no CUDA data transfer in it. Update semantics and lowering to take that into account. `AssignmentChecker` has to be put into a separate pass because the checkers in `SemanticsVisitor` cannot have the same `Enter/Leave` functions. The `DoForallChecker` already has `Enter/Leave` functions for the `DoConstruct`.
1 parent 1e3f6a6 commit 46caad5

File tree

10 files changed

+109
-72
lines changed

10 files changed

+109
-72
lines changed

flang/include/flang/Optimizer/Builder/CUFCommon.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
2727
mlir::SymbolTable &symTab);
2828

2929
bool isCUDADeviceContext(mlir::Operation *op);
30-
bool isCUDADeviceContext(mlir::Region &);
30+
bool isCUDADeviceContext(mlir::Region &,
31+
bool isDoConcurrentOffloadEnabled = false);
3132
bool isRegisteredDeviceGlobal(fir::GlobalOp op);
3233
bool isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr);
3334

flang/include/flang/Support/Fortran-features.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines,
5555
SavedLocalInSpecExpr, PrintNamelist, AssumedRankPassedToNonAssumedRank,
5656
IgnoreIrrelevantAttributes, Unsigned, AmbiguousStructureConstructor,
5757
ContiguousOkForSeqAssociation, ForwardRefExplicitTypeDummy,
58-
InaccessibleDeferredOverride, CudaWarpMatchFunction)
58+
InaccessibleDeferredOverride, CudaWarpMatchFunction, DoConcurrentOffload)
5959

6060
// Portability and suspicious usage warnings
6161
ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,

flang/lib/Lower/Bridge.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4886,7 +4886,10 @@ class FirConverter : public Fortran::lower::AbstractConverter {
48864886
mlir::Location loc = getCurrentLocation();
48874887
fir::FirOpBuilder &builder = getFirOpBuilder();
48884888

4889-
bool isInDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
4889+
bool isInDeviceContext = cuf::isCUDADeviceContext(
4890+
builder.getRegion(),
4891+
getFoldingContext().languageFeatures().IsEnabled(
4892+
Fortran::common::LanguageFeature::DoConcurrentOffload));
48904893

48914894
bool isCUDATransfer =
48924895
IsCUDADataTransfer(assign.lhs, assign.rhs) && !isInDeviceContext;

flang/lib/Optimizer/Builder/CUFCommon.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ bool cuf::isCUDADeviceContext(mlir::Operation *op) {
4343
// for it.
4444
// If the insertion point is inside an OpenACC region op, it is considered
4545
// device context.
46-
bool cuf::isCUDADeviceContext(mlir::Region &region) {
46+
bool cuf::isCUDADeviceContext(mlir::Region &region,
47+
bool isDoConcurrentOffloadEnabled) {
4748
if (region.getParentOfType<cuf::KernelOp>())
4849
return true;
4950
if (region.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
@@ -56,6 +57,9 @@ bool cuf::isCUDADeviceContext(mlir::Region &region) {
5657
cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
5758
}
5859
}
60+
if (isDoConcurrentOffloadEnabled &&
61+
region.getParentOfType<fir::DoConcurrentLoopOp>())
62+
return true;
5963
return false;
6064
}
6165

flang/lib/Semantics/assignment.cpp

Lines changed: 0 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ class AssignmentContext {
4242
void Analyze(const parser::AssignmentStmt &);
4343
void Analyze(const parser::PointerAssignmentStmt &);
4444
void Analyze(const parser::ConcurrentControl &);
45-
int deviceConstructDepth_{0};
4645
SemanticsContext &context() { return context_; }
4746

4847
private:
@@ -97,21 +96,6 @@ void AssignmentContext::Analyze(const parser::AssignmentStmt &stmt) {
9796
if (whereDepth_ > 0) {
9897
CheckShape(lhsLoc, &lhs);
9998
}
100-
if (context_.foldingContext().languageFeatures().IsEnabled(
101-
common::LanguageFeature::CUDA)) {
102-
const auto &scope{context_.FindScope(lhsLoc)};
103-
const Scope &progUnit{GetProgramUnitContaining(scope)};
104-
if (!IsCUDADeviceContext(&progUnit) && deviceConstructDepth_ == 0) {
105-
if (Fortran::evaluate::HasCUDADeviceAttrs(lhs) &&
106-
Fortran::evaluate::HasCUDAImplicitTransfer(rhs)) {
107-
if (GetNbOfCUDAManagedOrUnifiedSymbols(lhs) == 1 &&
108-
GetNbOfCUDAManagedOrUnifiedSymbols(rhs) == 1 &&
109-
GetNbOfCUDADeviceSymbols(rhs) == 1)
110-
return; // This is a special case handled on the host.
111-
context_.Say(lhsLoc, "Unsupported CUDA data transfer"_err_en_US);
112-
}
113-
}
114-
}
11599
}
116100
}
117101

@@ -254,46 +238,6 @@ void AssignmentChecker::Enter(const parser::MaskedElsewhereStmt &x) {
254238
void AssignmentChecker::Leave(const parser::MaskedElsewhereStmt &) {
255239
context_.value().PopWhereContext();
256240
}
257-
void AssignmentChecker::Enter(const parser::CUFKernelDoConstruct &x) {
258-
++context_.value().deviceConstructDepth_;
259-
}
260-
void AssignmentChecker::Leave(const parser::CUFKernelDoConstruct &) {
261-
--context_.value().deviceConstructDepth_;
262-
}
263-
static bool IsOpenACCComputeConstruct(const parser::OpenACCBlockConstruct &x) {
264-
const auto &beginBlockDirective =
265-
std::get<Fortran::parser::AccBeginBlockDirective>(x.t);
266-
const auto &blockDirective =
267-
std::get<Fortran::parser::AccBlockDirective>(beginBlockDirective.t);
268-
if (blockDirective.v == llvm::acc::ACCD_parallel ||
269-
blockDirective.v == llvm::acc::ACCD_serial ||
270-
blockDirective.v == llvm::acc::ACCD_kernels) {
271-
return true;
272-
}
273-
return false;
274-
}
275-
void AssignmentChecker::Enter(const parser::OpenACCBlockConstruct &x) {
276-
if (IsOpenACCComputeConstruct(x)) {
277-
++context_.value().deviceConstructDepth_;
278-
}
279-
}
280-
void AssignmentChecker::Leave(const parser::OpenACCBlockConstruct &x) {
281-
if (IsOpenACCComputeConstruct(x)) {
282-
--context_.value().deviceConstructDepth_;
283-
}
284-
}
285-
void AssignmentChecker::Enter(const parser::OpenACCCombinedConstruct &) {
286-
++context_.value().deviceConstructDepth_;
287-
}
288-
void AssignmentChecker::Leave(const parser::OpenACCCombinedConstruct &) {
289-
--context_.value().deviceConstructDepth_;
290-
}
291-
void AssignmentChecker::Enter(const parser::OpenACCLoopConstruct &) {
292-
++context_.value().deviceConstructDepth_;
293-
}
294-
void AssignmentChecker::Leave(const parser::OpenACCLoopConstruct &) {
295-
--context_.value().deviceConstructDepth_;
296-
}
297241

298242
} // namespace Fortran::semantics
299243
template class Fortran::common::Indirection<

flang/lib/Semantics/assignment.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,6 @@ class AssignmentChecker : public virtual BaseChecker {
4646
void Leave(const parser::EndWhereStmt &);
4747
void Enter(const parser::MaskedElsewhereStmt &);
4848
void Leave(const parser::MaskedElsewhereStmt &);
49-
void Enter(const parser::CUFKernelDoConstruct &);
50-
void Leave(const parser::CUFKernelDoConstruct &);
51-
void Enter(const parser::OpenACCBlockConstruct &);
52-
void Leave(const parser::OpenACCBlockConstruct &);
53-
void Enter(const parser::OpenACCCombinedConstruct &);
54-
void Leave(const parser::OpenACCCombinedConstruct &);
55-
void Enter(const parser::OpenACCLoopConstruct &);
56-
void Leave(const parser::OpenACCLoopConstruct &);
5749

5850
SemanticsContext &context();
5951

flang/lib/Semantics/check-cuda.cpp

Lines changed: 62 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -685,18 +685,67 @@ void CUDAChecker::Enter(const parser::CUFKernelDoConstruct &x) {
685685
std::get<std::list<parser::CUFReduction>>(directive.t)) {
686686
CheckReduce(context_, reduce);
687687
}
688-
inCUFKernelDoConstruct_ = true;
688+
++deviceConstructDepth_;
689+
}
690+
691+
static bool IsOpenACCComputeConstruct(const parser::OpenACCBlockConstruct &x) {
692+
const auto &beginBlockDirective =
693+
std::get<Fortran::parser::AccBeginBlockDirective>(x.t);
694+
const auto &blockDirective =
695+
std::get<Fortran::parser::AccBlockDirective>(beginBlockDirective.t);
696+
if (blockDirective.v == llvm::acc::ACCD_parallel ||
697+
blockDirective.v == llvm::acc::ACCD_serial ||
698+
blockDirective.v == llvm::acc::ACCD_kernels) {
699+
return true;
700+
}
701+
return false;
689702
}
690703

691704
void CUDAChecker::Leave(const parser::CUFKernelDoConstruct &) {
692-
inCUFKernelDoConstruct_ = false;
705+
--deviceConstructDepth_;
706+
}
707+
void CUDAChecker::Enter(const parser::OpenACCBlockConstruct &x) {
708+
if (IsOpenACCComputeConstruct(x)) {
709+
++deviceConstructDepth_;
710+
}
711+
}
712+
void CUDAChecker::Leave(const parser::OpenACCBlockConstruct &x) {
713+
if (IsOpenACCComputeConstruct(x)) {
714+
--deviceConstructDepth_;
715+
}
716+
}
717+
void CUDAChecker::Enter(const parser::OpenACCCombinedConstruct &) {
718+
++deviceConstructDepth_;
719+
}
720+
void CUDAChecker::Leave(const parser::OpenACCCombinedConstruct &) {
721+
--deviceConstructDepth_;
722+
}
723+
void CUDAChecker::Enter(const parser::OpenACCLoopConstruct &) {
724+
++deviceConstructDepth_;
725+
}
726+
void CUDAChecker::Leave(const parser::OpenACCLoopConstruct &) {
727+
--deviceConstructDepth_;
728+
}
729+
void CUDAChecker::Enter(const parser::DoConstruct &x) {
730+
if (x.IsDoConcurrent() &&
731+
context_.foldingContext().languageFeatures().IsEnabled(
732+
common::LanguageFeature::DoConcurrentOffload)) {
733+
++deviceConstructDepth_;
734+
}
735+
}
736+
void CUDAChecker::Leave(const parser::DoConstruct &x) {
737+
if (x.IsDoConcurrent() &&
738+
context_.foldingContext().languageFeatures().IsEnabled(
739+
common::LanguageFeature::DoConcurrentOffload)) {
740+
--deviceConstructDepth_;
741+
}
693742
}
694743

695744
void CUDAChecker::Enter(const parser::AssignmentStmt &x) {
696745
auto lhsLoc{std::get<parser::Variable>(x.t).GetSource()};
697746
const auto &scope{context_.FindScope(lhsLoc)};
698747
const Scope &progUnit{GetProgramUnitContaining(scope)};
699-
if (IsCUDADeviceContext(&progUnit) || inCUFKernelDoConstruct_) {
748+
if (IsCUDADeviceContext(&progUnit) || deviceConstructDepth_ > 0) {
700749
return; // Data transfer with assignment is only perform on host.
701750
}
702751

@@ -714,6 +763,16 @@ void CUDAChecker::Enter(const parser::AssignmentStmt &x) {
714763
context_.Say(lhsLoc,
715764
"More than one reference to a CUDA object on the right hand side of the assigment"_err_en_US);
716765
}
766+
767+
if (Fortran::evaluate::HasCUDADeviceAttrs(assign->lhs) &&
768+
Fortran::evaluate::HasCUDAImplicitTransfer(assign->rhs)) {
769+
if (GetNbOfCUDAManagedOrUnifiedSymbols(assign->lhs) == 1 &&
770+
GetNbOfCUDAManagedOrUnifiedSymbols(assign->rhs) == 1 &&
771+
GetNbOfCUDADeviceSymbols(assign->rhs) == 1) {
772+
return; // This is a special case handled on the host.
773+
}
774+
context_.Say(lhsLoc, "Unsupported CUDA data transfer"_err_en_US);
775+
}
717776
}
718777

719778
} // namespace Fortran::semantics

flang/lib/Semantics/check-cuda.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,18 @@ class CUDAChecker : public virtual BaseChecker {
4141
void Enter(const parser::CUFKernelDoConstruct &);
4242
void Leave(const parser::CUFKernelDoConstruct &);
4343
void Enter(const parser::AssignmentStmt &);
44+
void Enter(const parser::OpenACCBlockConstruct &);
45+
void Leave(const parser::OpenACCBlockConstruct &);
46+
void Enter(const parser::OpenACCCombinedConstruct &);
47+
void Leave(const parser::OpenACCCombinedConstruct &);
48+
void Enter(const parser::OpenACCLoopConstruct &);
49+
void Leave(const parser::OpenACCLoopConstruct &);
50+
void Enter(const parser::DoConstruct &);
51+
void Leave(const parser::DoConstruct &);
4452

4553
private:
4654
SemanticsContext &context_;
47-
bool inCUFKernelDoConstruct_ = false;
55+
int deviceConstructDepth_{0};
4856
};
4957

5058
bool CanonicalizeCUDA(parser::Program &);

flang/test/Lower/CUDA/cuda-data-transfer.cuf

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,3 +403,19 @@ end subroutine
403403
! CHECK-LABEL: func.func @_QPsub20()
404404
! CHECK-NOT: cuf.data_transfer
405405
! CHECK: hlfir.assign
406+
407+
subroutine sub21()
408+
real, allocatable,device:: a(:,:), b(:,:)
409+
real:: s
410+
integer:: i,j,N=16
411+
allocate(a(N,N),b(N,N))
412+
do concurrent(i=1:N, j=1:N) reduce(+:s)
413+
b(i,j)=a(i,j)**2
414+
s=s+b(i,j)
415+
end do
416+
end subroutine
417+
418+
! CHECK-LABEL: func.func @_QPsub21()
419+
! CHECK: fir.do_concurrent.loop
420+
! CHECK-NOT: cuf.data_transfer
421+
! CHECK: hlfir.assign

flang/tools/bbc/bbc.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,11 @@ static llvm::cl::opt<bool> enableCUDA("fcuda",
223223
llvm::cl::desc("enable CUDA Fortran"),
224224
llvm::cl::init(false));
225225

226+
static llvm::cl::opt<bool>
227+
enableDoConcurrentOffload("fdoconcurrent-offload",
228+
llvm::cl::desc("enable do concurrent offload"),
229+
llvm::cl::init(false));
230+
226231
static llvm::cl::opt<bool>
227232
disableCUDAWarpFunction("fcuda-disable-warp-function",
228233
llvm::cl::desc("Disable CUDA Warp Function"),
@@ -608,6 +613,11 @@ int main(int argc, char **argv) {
608613
options.features.Enable(Fortran::common::LanguageFeature::CUDA);
609614
}
610615

616+
if (enableDoConcurrentOffload) {
617+
options.features.Enable(
618+
Fortran::common::LanguageFeature::DoConcurrentOffload);
619+
}
620+
611621
if (disableCUDAWarpFunction) {
612622
options.features.Enable(
613623
Fortran::common::LanguageFeature::CudaWarpMatchFunction, false);

0 commit comments

Comments
 (0)