[flang][cuda] Do not produce data transfer in offloaded do concurrent (#147435)

clementval · web-flow · commit 46caad52ac14 · 2025-07-08T10:52:15.000-07:00
If a `do concurrent` loop is offloaded then there should be no CUDA data
transfer in it. Update the semantics and lowering to take that into
account.

`AssignmentChecker` has to be put into a separate pass because the
checkers in `SemanticsVisitor` cannot have the same `Enter/Leave`
functions. The `DoForallChecker` already has `Enter/Leave` functions
for the `DoConstruct`.
diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h
@@ -27,7 +27,8 @@ mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
                                             mlir::SymbolTable &symTab);
 
 bool isCUDADeviceContext(mlir::Operation *op);
-bool isCUDADeviceContext(mlir::Region &);
+bool isCUDADeviceContext(mlir::Region &,
+                         bool isDoConcurrentOffloadEnabled = false);
 bool isRegisteredDeviceGlobal(fir::GlobalOp op);
 bool isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr);
 
diff --git a/flang/include/flang/Support/Fortran-features.h b/flang/include/flang/Support/Fortran-features.h
@@ -55,7 +55,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines,
     SavedLocalInSpecExpr, PrintNamelist, AssumedRankPassedToNonAssumedRank,
     IgnoreIrrelevantAttributes, Unsigned, AmbiguousStructureConstructor,
     ContiguousOkForSeqAssociation, ForwardRefExplicitTypeDummy,
-    InaccessibleDeferredOverride, CudaWarpMatchFunction)
+    InaccessibleDeferredOverride, CudaWarpMatchFunction, DoConcurrentOffload)
 
 // Portability and suspicious usage warnings
 ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
@@ -4886,7 +4886,10 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     mlir::Location loc = getCurrentLocation();
     fir::FirOpBuilder &builder = getFirOpBuilder();
 
-    bool isInDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
+    bool isInDeviceContext = cuf::isCUDADeviceContext(
+        builder.getRegion(),
+        getFoldingContext().languageFeatures().IsEnabled(
+            Fortran::common::LanguageFeature::DoConcurrentOffload));
 
     bool isCUDATransfer =
         IsCUDADataTransfer(assign.lhs, assign.rhs) && !isInDeviceContext;
diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp
@@ -43,7 +43,8 @@ bool cuf::isCUDADeviceContext(mlir::Operation *op) {
 // for it.
 // If the insertion point is inside an OpenACC region op, it is considered
 // device context.
-bool cuf::isCUDADeviceContext(mlir::Region &region) {
+bool cuf::isCUDADeviceContext(mlir::Region &region,
+                              bool isDoConcurrentOffloadEnabled) {
   if (region.getParentOfType<cuf::KernelOp>())
     return true;
   if (region.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
@@ -56,6 +57,9 @@ bool cuf::isCUDADeviceContext(mlir::Region &region) {
              cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
     }
   }
+  if (isDoConcurrentOffloadEnabled &&
+      region.getParentOfType<fir::DoConcurrentLoopOp>())
+    return true;
   return false;
 }
 
diff --git a/flang/lib/Semantics/assignment.cpp b/flang/lib/Semantics/assignment.cpp
@@ -42,7 +42,6 @@ class AssignmentContext {
   void Analyze(const parser::AssignmentStmt &);
   void Analyze(const parser::PointerAssignmentStmt &);
   void Analyze(const parser::ConcurrentControl &);
-  int deviceConstructDepth_{0};
   SemanticsContext &context() { return context_; }
 
 private:
@@ -97,21 +96,6 @@ void AssignmentContext::Analyze(const parser::AssignmentStmt &stmt) {
     if (whereDepth_ > 0) {
       CheckShape(lhsLoc, &lhs);
     }
-    if (context_.foldingContext().languageFeatures().IsEnabled(
-            common::LanguageFeature::CUDA)) {
-      const auto &scope{context_.FindScope(lhsLoc)};
-      const Scope &progUnit{GetProgramUnitContaining(scope)};
-      if (!IsCUDADeviceContext(&progUnit) && deviceConstructDepth_ == 0) {
-        if (Fortran::evaluate::HasCUDADeviceAttrs(lhs) &&
-            Fortran::evaluate::HasCUDAImplicitTransfer(rhs)) {
-          if (GetNbOfCUDAManagedOrUnifiedSymbols(lhs) == 1 &&
-              GetNbOfCUDAManagedOrUnifiedSymbols(rhs) == 1 &&
-              GetNbOfCUDADeviceSymbols(rhs) == 1)
-            return; // This is a special case handled on the host.
-          context_.Say(lhsLoc, "Unsupported CUDA data transfer"_err_en_US);
-        }
-      }
-    }
   }
 }
 
@@ -254,46 +238,6 @@ void AssignmentChecker::Enter(const parser::MaskedElsewhereStmt &x) {
 void AssignmentChecker::Leave(const parser::MaskedElsewhereStmt &) {
   context_.value().PopWhereContext();
 }
-void AssignmentChecker::Enter(const parser::CUFKernelDoConstruct &x) {
-  ++context_.value().deviceConstructDepth_;
-}
-void AssignmentChecker::Leave(const parser::CUFKernelDoConstruct &) {
-  --context_.value().deviceConstructDepth_;
-}
-static bool IsOpenACCComputeConstruct(const parser::OpenACCBlockConstruct &x) {
-  const auto &beginBlockDirective =
-      std::get<Fortran::parser::AccBeginBlockDirective>(x.t);
-  const auto &blockDirective =
-      std::get<Fortran::parser::AccBlockDirective>(beginBlockDirective.t);
-  if (blockDirective.v == llvm::acc::ACCD_parallel ||
-      blockDirective.v == llvm::acc::ACCD_serial ||
-      blockDirective.v == llvm::acc::ACCD_kernels) {
-    return true;
-  }
-  return false;
-}
-void AssignmentChecker::Enter(const parser::OpenACCBlockConstruct &x) {
-  if (IsOpenACCComputeConstruct(x)) {
-    ++context_.value().deviceConstructDepth_;
-  }
-}
-void AssignmentChecker::Leave(const parser::OpenACCBlockConstruct &x) {
-  if (IsOpenACCComputeConstruct(x)) {
-    --context_.value().deviceConstructDepth_;
-  }
-}
-void AssignmentChecker::Enter(const parser::OpenACCCombinedConstruct &) {
-  ++context_.value().deviceConstructDepth_;
-}
-void AssignmentChecker::Leave(const parser::OpenACCCombinedConstruct &) {
-  --context_.value().deviceConstructDepth_;
-}
-void AssignmentChecker::Enter(const parser::OpenACCLoopConstruct &) {
-  ++context_.value().deviceConstructDepth_;
-}
-void AssignmentChecker::Leave(const parser::OpenACCLoopConstruct &) {
-  --context_.value().deviceConstructDepth_;
-}
 
 } // namespace Fortran::semantics
 template class Fortran::common::Indirection<
diff --git a/flang/lib/Semantics/assignment.h b/flang/lib/Semantics/assignment.h
@@ -46,14 +46,6 @@ class AssignmentChecker : public virtual BaseChecker {
   void Leave(const parser::EndWhereStmt &);
   void Enter(const parser::MaskedElsewhereStmt &);
   void Leave(const parser::MaskedElsewhereStmt &);
-  void Enter(const parser::CUFKernelDoConstruct &);
-  void Leave(const parser::CUFKernelDoConstruct &);
-  void Enter(const parser::OpenACCBlockConstruct &);
-  void Leave(const parser::OpenACCBlockConstruct &);
-  void Enter(const parser::OpenACCCombinedConstruct &);
-  void Leave(const parser::OpenACCCombinedConstruct &);
-  void Enter(const parser::OpenACCLoopConstruct &);
-  void Leave(const parser::OpenACCLoopConstruct &);
 
   SemanticsContext &context();
 
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
@@ -685,18 +685,67 @@ void CUDAChecker::Enter(const parser::CUFKernelDoConstruct &x) {
       std::get<std::list<parser::CUFReduction>>(directive.t)) {
     CheckReduce(context_, reduce);
   }
-  inCUFKernelDoConstruct_ = true;
+  ++deviceConstructDepth_;
+}
+
+static bool IsOpenACCComputeConstruct(const parser::OpenACCBlockConstruct &x) {
+  const auto &beginBlockDirective =
+      std::get<Fortran::parser::AccBeginBlockDirective>(x.t);
+  const auto &blockDirective =
+      std::get<Fortran::parser::AccBlockDirective>(beginBlockDirective.t);
+  if (blockDirective.v == llvm::acc::ACCD_parallel ||
+      blockDirective.v == llvm::acc::ACCD_serial ||
+      blockDirective.v == llvm::acc::ACCD_kernels) {
+    return true;
+  }
+  return false;
 }
 
 void CUDAChecker::Leave(const parser::CUFKernelDoConstruct &) {
-  inCUFKernelDoConstruct_ = false;
+  --deviceConstructDepth_;
+}
+void CUDAChecker::Enter(const parser::OpenACCBlockConstruct &x) {
+  if (IsOpenACCComputeConstruct(x)) {
+    ++deviceConstructDepth_;
+  }
+}
+void CUDAChecker::Leave(const parser::OpenACCBlockConstruct &x) {
+  if (IsOpenACCComputeConstruct(x)) {
+    --deviceConstructDepth_;
+  }
+}
+void CUDAChecker::Enter(const parser::OpenACCCombinedConstruct &) {
+  ++deviceConstructDepth_;
+}
+void CUDAChecker::Leave(const parser::OpenACCCombinedConstruct &) {
+  --deviceConstructDepth_;
+}
+void CUDAChecker::Enter(const parser::OpenACCLoopConstruct &) {
+  ++deviceConstructDepth_;
+}
+void CUDAChecker::Leave(const parser::OpenACCLoopConstruct &) {
+  --deviceConstructDepth_;
+}
+void CUDAChecker::Enter(const parser::DoConstruct &x) {
+  if (x.IsDoConcurrent() &&
+      context_.foldingContext().languageFeatures().IsEnabled(
+          common::LanguageFeature::DoConcurrentOffload)) {
+    ++deviceConstructDepth_;
+  }
+}
+void CUDAChecker::Leave(const parser::DoConstruct &x) {
+  if (x.IsDoConcurrent() &&
+      context_.foldingContext().languageFeatures().IsEnabled(
+          common::LanguageFeature::DoConcurrentOffload)) {
+    --deviceConstructDepth_;
+  }
 }
 
 void CUDAChecker::Enter(const parser::AssignmentStmt &x) {
   auto lhsLoc{std::get<parser::Variable>(x.t).GetSource()};
   const auto &scope{context_.FindScope(lhsLoc)};
   const Scope &progUnit{GetProgramUnitContaining(scope)};
-  if (IsCUDADeviceContext(&progUnit) || inCUFKernelDoConstruct_) {
+  if (IsCUDADeviceContext(&progUnit) || deviceConstructDepth_ > 0) {
     return; // Data transfer with assignment is only perform on host.
   }
 
@@ -714,6 +763,16 @@ void CUDAChecker::Enter(const parser::AssignmentStmt &x) {
     context_.Say(lhsLoc,
         "More than one reference to a CUDA object on the right hand side of the assigment"_err_en_US);
   }
+
+  if (Fortran::evaluate::HasCUDADeviceAttrs(assign->lhs) &&
+      Fortran::evaluate::HasCUDAImplicitTransfer(assign->rhs)) {
+    if (GetNbOfCUDAManagedOrUnifiedSymbols(assign->lhs) == 1 &&
+        GetNbOfCUDAManagedOrUnifiedSymbols(assign->rhs) == 1 &&
+        GetNbOfCUDADeviceSymbols(assign->rhs) == 1) {
+      return; // This is a special case handled on the host.
+    }
+    context_.Say(lhsLoc, "Unsupported CUDA data transfer"_err_en_US);
+  }
 }
 
 } // namespace Fortran::semantics
diff --git a/flang/lib/Semantics/check-cuda.h b/flang/lib/Semantics/check-cuda.h
@@ -41,10 +41,18 @@ class CUDAChecker : public virtual BaseChecker {
   void Enter(const parser::CUFKernelDoConstruct &);
   void Leave(const parser::CUFKernelDoConstruct &);
   void Enter(const parser::AssignmentStmt &);
+  void Enter(const parser::OpenACCBlockConstruct &);
+  void Leave(const parser::OpenACCBlockConstruct &);
+  void Enter(const parser::OpenACCCombinedConstruct &);
+  void Leave(const parser::OpenACCCombinedConstruct &);
+  void Enter(const parser::OpenACCLoopConstruct &);
+  void Leave(const parser::OpenACCLoopConstruct &);
+  void Enter(const parser::DoConstruct &);
+  void Leave(const parser::DoConstruct &);
 
 private:
   SemanticsContext &context_;
-  bool inCUFKernelDoConstruct_ = false;
+  int deviceConstructDepth_{0};
 };
 
 bool CanonicalizeCUDA(parser::Program &);
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -403,3 +403,19 @@ end subroutine
 ! CHECK-LABEL: func.func @_QPsub20()
 ! CHECK-NOT: cuf.data_transfer
 ! CHECK: hlfir.assign
+
+subroutine sub21()
+  real, allocatable,device:: a(:,:), b(:,:)
+  real:: s
+  integer:: i,j,N=16
+  allocate(a(N,N),b(N,N))
+  do concurrent(i=1:N, j=1:N) reduce(+:s)
+    b(i,j)=a(i,j)**2
+    s=s+b(i,j)
+  end do
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub21()
+! CHECK: fir.do_concurrent.loop
+! CHECK-NOT: cuf.data_transfer
+! CHECK: hlfir.assign
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
@@ -223,6 +223,11 @@ static llvm::cl::opt<bool> enableCUDA("fcuda",
                                       llvm::cl::desc("enable CUDA Fortran"),
                                       llvm::cl::init(false));
 
+static llvm::cl::opt<bool>
+    enableDoConcurrentOffload("fdoconcurrent-offload",
+                              llvm::cl::desc("enable do concurrent offload"),
+                              llvm::cl::init(false));
+
 static llvm::cl::opt<bool>
     disableCUDAWarpFunction("fcuda-disable-warp-function",
                             llvm::cl::desc("Disable CUDA Warp Function"),
@@ -608,6 +613,11 @@ int main(int argc, char **argv) {
     options.features.Enable(Fortran::common::LanguageFeature::CUDA);
   }
 
+  if (enableDoConcurrentOffload) {
+    options.features.Enable(
+        Fortran::common::LanguageFeature::DoConcurrentOffload);
+  }
+
   if (disableCUDAWarpFunction) {
     options.features.Enable(
         Fortran::common::LanguageFeature::CudaWarpMatchFunction, false);

Original file line number	Diff line number	Diff line change
`@@ -43,7 +43,8 @@ bool cuf::isCUDADeviceContext(mlir::Operation *op) {`
`43`	`43`	`// for it.`
`44`	`44`	`// If the insertion point is inside an OpenACC region op, it is considered`
`45`	`45`	`// device context.`
`46`		`-bool cuf::isCUDADeviceContext(mlir::Region &region) {`
	`46`	`+bool cuf::isCUDADeviceContext(mlir::Region &region,`
	`47`	`+ bool isDoConcurrentOffloadEnabled) {`
`47`	`48`	`if (region.getParentOfType<cuf::KernelOp>())`
`48`	`49`	`return true;`
`49`	`50`	`if (region.getParentOfType<mlir::acc::ComputeRegionOpInterface>())`
`@@ -56,6 +57,9 @@ bool cuf::isCUDADeviceContext(mlir::Region &region) {`
`56`	`57`	`cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;`
`57`	`58`	`}`
`58`	`59`	`}`
	`60`	`+ if (isDoConcurrentOffloadEnabled &&`
	`61`	`+ region.getParentOfType<fir::DoConcurrentLoopOp>())`
	`62`	`+ return true;`
`59`	`63`	`return false;`
`60`	`64`	`}`
`61`	`65`