Commit 18ff8df

[flang][cuda] Register managed variables with double descriptor (#134444)
Allocatable or pointer module variables with the CUDA managed attribute are defined with a double descriptor: one on the host and one on the device. Only the data pointed to by the descriptor is allocated in managed memory. Allow any allocatable or pointer module variable to be registered, just like device or constant variables.
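
For context, the kind of declaration this change handles would look like the following CUDA Fortran sketch. This source is an illustration only, not part of the commit; the mangled name @_QMmEa00 in the test below is consistent with a module m containing a variable a00 of this shape.

    ! Hypothetical CUDA Fortran input: a managed allocatable module variable.
    ! Lowering produces a host-side descriptor global plus a device-side copy;
    ! only the array data itself ends up in managed memory.
    module m
      real(8), allocatable, managed :: a00(:,:,:,:,:)
    end module m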
1 parent 19e0233 commit 18ff8df

2 files changed: +24 −3 lines


flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp

Lines changed: 6 additions & 3 deletions
@@ -105,10 +105,15 @@ struct CUFAddConstructor
     if (!attr)
       continue;
 
+    if (attr.getValue() == cuf::DataAttribute::Managed &&
+        !mlir::isa<fir::BaseBoxType>(globalOp.getType()))
+      TODO(loc, "registration of non-allocatable managed variables");
+
     mlir::func::FuncOp func;
     switch (attr.getValue()) {
     case cuf::DataAttribute::Device:
-    case cuf::DataAttribute::Constant: {
+    case cuf::DataAttribute::Constant:
+    case cuf::DataAttribute::Managed: {
       func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
           loc, builder);
       auto fTy = func.getFunctionType();
@@ -141,8 +146,6 @@ struct CUFAddConstructor
           builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
       builder.create<fir::CallOp>(loc, func, args);
     } break;
-    case cuf::DataAttribute::Managed:
-      TODO(loc, "registration of managed variables");
     default:
       break;
     }
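
At the call site above, the registration call passes the registered module handle, the variable's address, its mangled global name, and a size value. As a reading aid only, the shape of that interface is roughly the sketch below; the argument names and exact types are assumptions inferred from the call site (registeredMod, addr, gblName, sizeVal), not flang's actual runtime declaration.

    #include <cstdint>

    // Hedged sketch inferred from the call site above; the real declaration
    // lives in flang's CUDA runtime headers and may differ.
    extern "C" void _FortranACUFRegisterVariable(
        void *registeredModule,    // handle from the earlier module registration
        void *hostAddress,         // host address of the global (here, the host descriptor)
        const char *name,          // mangled variable name, e.g. "_QMmEa00"
        std::int64_t sizeInBytes); // size of the registered entity in bytes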

flang/test/Fir/CUDA/cuda-constructor-2.f90

Lines changed: 18 additions & 0 deletions
@@ -60,3 +60,21 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i
 }
 }
 }
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git 3372303188df0f7f8ac26e7ab610cf8b0f716d42)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  fir.global @_QMmEa00 {data_attr = #cuf.cuda<managed>} : !fir.box<!fir.heap<!fir.array<?x?x?x?x?xf64>>> {
+    %c0 = arith.constant 0 : index
+    %0 = fir.zero_bits !fir.heap<!fir.array<?x?x?x?x?xf64>>
+    %1 = fir.shape %c0, %c0, %c0, %c0, %c0 : (index, index, index, index, index) -> !fir.shape<5>
+    %2 = fir.embox %0(%1) {allocator_idx = 3 : i32} : (!fir.heap<!fir.array<?x?x?x?x?xf64>>, !fir.shape<5>) -> !fir.box<!fir.heap<!fir.array<?x?x?x?x?xf64>>>
+    fir.has_value %2 : !fir.box<!fir.heap<!fir.array<?x?x?x?x?xf64>>>
+  }
+  gpu.module @cuda_device_mod {
+  }
+}
+
+// CHECK: llvm.func internal @__cudaFortranConstructor()
+// CHECK: fir.address_of(@_QMmEa00)
+// CHECK: fir.call @_FortranACUFRegisterVariable
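
The // ----- separator indicates the test file runs in split-input-file mode; a plausible RUN line for this kind of FIR test, with the pass name taken from CUFAddConstructor above, is sketched below. The exact pipeline is an assumption, not shown in the diff (the CHECK lines expect llvm.func output, so additional lowering flags may follow the constructor pass in the real test).

    // RUN: fir-opt --split-input-file --cuf-add-constructor %s | FileCheck %s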
