oneapi-src · zhiweij1 · May 19, 2025 · May 20, 2025 · May 20, 2025 · May 21, 2025
@@ -811,11 +811,12 @@ DPCT_ENUM_OPTION(
             "be accessed within a kernel using syntax similar to C++ global "
             "variables.\n",
             false),
-        DPCT_OPTION_ENUM_VALUE(
-            "virtual_mem", int(ExperimentalFeatures::Exp_VirtualMemory),
-            "Experimental extension that allows for mapping of an address range onto "
-            "multiple allocations of physical memory.",
-            false),
+        DPCT_OPTION_ENUM_VALUE("virtual_mem",
+                               int(ExperimentalFeatures::Exp_VirtualMemory),
+                               "Experimental extension that allows for mapping "
+                               "of an address range onto "
+                               "multiple allocations of physical memory.",
+                               false),
         DPCT_OPTION_ENUM_VALUE(
             "in_order_queue_events",
             int(ExperimentalFeatures::Exp_InOrderQueueEvents),
@@ -838,7 +839,13 @@ DPCT_ENUM_OPTION(
             "level_zero", int(ExperimentalFeatures::Exp_LevelZero),
             "Experimental migration feature that enables the use of Level Zero "
             "APIs to migrate target code, like CUDA Inter-Process "
-            "Communication (IPC) APIs.\n", false),
+            "Communication (IPC) APIs.\n",
+            false),
+        DPCT_OPTION_ENUM_VALUE("async_alloc",
+                               int(ExperimentalFeatures::Exp_AsyncAlloc),
+                               "Experimental extension that allows use of SYCL "
+                               "async allocation APIs.\n",
+                               false),
         DPCT_OPTION_ENUM_VALUE(
             "all", int(ExperimentalFeatures::Exp_All),
             "Enable all experimental extensions listed in this option.\n",

@@ -1352,6 +1352,9 @@ class DpctGlobalInfo {
   static bool useExtLevelZero() {
     return getUsingExperimental<ExperimentalFeatures::Exp_LevelZero>();
   }
+  static bool useExtAsyncAlloc() { // Query for Exp_AsyncAlloc.
+    return getUsingExperimental<ExperimentalFeatures::Exp_AsyncAlloc>();
+  }
   static bool useExtPrefetch() {
     return getUsingExperimental<ExperimentalFeatures::Exp_Prefetch>();
   }

@@ -103,6 +103,7 @@ enum class ExperimentalFeatures : unsigned int {
   Exp_NonStandardSYCLBuiltins,
   Exp_Prefetch,
   Exp_LevelZero,
+  Exp_AsyncAlloc,
   Exp_All
 };
 enum class HelperFuncPreference : unsigned int { NoQueueDevice = 0 };

@@ -1827,6 +1827,10 @@ inline auto UseExtLevelZero = [](const CallExpr *C) -> bool {
   return DpctGlobalInfo::useExtLevelZero();
 };
 
+inline auto UseExtAsyncAlloc = [](const CallExpr *C) -> bool {
+  return DpctGlobalInfo::useExtAsyncAlloc(); // C is not consulted.
+};
+
 inline auto UseExtGraph = [](const CallExpr *C) -> bool {
   return DpctGlobalInfo::useExtGraph();
 };

@@ -52,6 +52,8 @@ STD_HEADER(DL, "<dlfcn.h>")
 #endif
 STD_HEADER(SHMEM, "<ishmem.h>")
 STD_HEADER(SHMEMX, "<ishmemx.h>")
+STD_HEADER(AsyncAlloc,
+           "<sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>")
 
 ONEDPL_HEADER(Algorithm, "<oneapi/dpl/algorithm>")
 ONEDPL_HEADER(Execution, "<oneapi/dpl/execution>")

@@ -993,6 +993,47 @@ CONDITIONAL_FACTORY_ENTRY(
                                    "memcpy", false),
                                MEM_ARG(0), MEM_ARG(1), ARG(2), ARG(3))))))
 
+CONDITIONAL_FACTORY_ENTRY(
+    checkIsUSM(),
+    CONDITIONAL_FACTORY_ENTRY(
+        CheckArgCount(3),
+        CONDITIONAL_FACTORY_ENTRY(
+            UseExtAsyncAlloc,
+            ASSIGNABLE_FACTORY(HEADER_INSERT_FACTORY(
+                HeaderType::HT_AsyncAlloc,
+                ASSIGN_FACTORY_ENTRY(
+                    "cudaMallocAsync", DEREF(0),
+                    CALL(MapNames::getClNamespace() +
+                             "ext::oneapi::experimental::async_malloc",
+                         DEREF(2),
+                         ARG(MapNames::getClNamespace() + "usm::alloc::device"),
+                         ARG(1))))),
+            UNSUPPORT_FACTORY_ENTRY(
+                "cudaMallocAsync", Diagnostics::TRY_EXPERIMENTAL_FEATURE,
+                ARG("cudaMallocAsync"),
+                ARG("--use-experimental-features=async_alloc"))),
+        UNSUPPORT_FACTORY_ENTRY("cudaMallocAsync",
+                                Diagnostics::API_NOT_MIGRATED,
+                                ARG("cudaMallocAsync"))),
+    UNSUPPORT_FACTORY_ENTRY("cudaMallocAsync", Diagnostics::API_NOT_MIGRATED,
+                            ARG("cudaMallocAsync")))
+CONDITIONAL_FACTORY_ENTRY(
+    checkIsUSM(),
+    CONDITIONAL_FACTORY_ENTRY(
+        UseExtAsyncAlloc,
+        ASSIGNABLE_FACTORY(HEADER_INSERT_FACTORY(
+            HeaderType::HT_AsyncAlloc,
+            CALL_FACTORY_ENTRY("cudaFreeAsync",
+                               CALL(MapNames::getClNamespace() +
+                                        "ext::oneapi::experimental::async_free",
+                                    DEREF(1), ARG(0))))),
+        UNSUPPORT_FACTORY_ENTRY(
+            "cudaFreeAsync", Diagnostics::TRY_EXPERIMENTAL_FEATURE,
+            ARG("cudaFreeAsync"),
+            ARG("--use-experimental-features=async_alloc"))),
+    UNSUPPORT_FACTORY_ENTRY("cudaFreeAsync", Diagnostics::API_NOT_MIGRATED,
+                            ARG("cudaFreeAsync")))
+
 #define CUDA_FREE(NAME)                                                        \
   CONDITIONAL_FACTORY_ENTRY(                                                   \
       hasManagedAttr(0),                                                       \

@@ -5784,7 +5784,7 @@ void MemoryMigrationRule::mallocMigration(
   } else if (Name == "cudaHostAlloc" || Name == "cudaMallocHost" ||
              Name == "cuMemHostAlloc" || Name == "cuMemAllocHost_v2" ||
              Name == "cuMemAllocPitch_v2" || Name == "cudaMallocPitch" ||
-             Name == "cudaMallocMipmappedArray") {
+             Name == "cudaMallocMipmappedArray" || Name == "cudaMallocAsync") {
     ExprAnalysis EA(C);
     emplaceTransformation(EA.getReplacement());
     EA.applyAllSubExprRepl();
@@ -6815,7 +6815,7 @@ void MemoryMigrationRule::registerMatcher(MatchFinder &MF) {
         "cuMemsetD8_v2", "cuMemsetD8Async", "cudaMallocMipmappedArray",
         "cudaGetMipmappedArrayLevel", "cudaFreeMipmappedArray",
         "cudaMemcpyPeer", "cudaMemcpyPeerAsync", "cuMemcpyPeer",
-        "cuMemcpyPeerAsync");
+        "cuMemcpyPeerAsync", "cudaMallocAsync", "cudaFreeAsync");
   };
 
   MF.addMatcher(callExpr(allOf(callee(functionDecl(memoryAPI())), parentStmt()))
@@ -6905,7 +6905,8 @@ void MemoryMigrationRule::runRule(const MatchFinder::MatchResult &Result) {
         Name.compare("cudaMallocMipmappedArray") &&
         Name.compare("cudaGetMipmappedArrayLevel") &&
         Name.compare("cudaFreeMipmappedArray") && Name.compare("cudaMemcpy") &&
-        Name.compare("cudaFree") && Name.compare("cublasFree")) {
+        Name.compare("cudaFree") && Name.compare("cublasFree") &&
+        Name.compare("cudaMallocAsync") && Name.compare("cudaFreeAsync")) {
       requestFeature(HelperFeatureEnum::device_ext);
       insertAroundStmt(C, MapNames::getCheckErrorMacroName() + "(", ")");
     } else if (IsAssigned && !Name.compare("cudaMemAdvise") &&
@@ -6968,6 +6969,7 @@ MemoryMigrationRule::MemoryMigrationRule() {
                          const CallExpr *, const UnresolvedLookupExpr *, bool)>>
       Dispatcher{
           {"cudaMalloc", &MemoryMigrationRule::mallocMigration},
+          {"cudaMallocAsync", &MemoryMigrationRule::mallocMigration},
           {"cuMemAlloc_v2", &MemoryMigrationRule::mallocMigration},
           {"cudaHostAlloc", &MemoryMigrationRule::mallocMigration},
           {"cudaMallocHost", &MemoryMigrationRule::mallocMigration},
@@ -7030,6 +7032,7 @@ MemoryMigrationRule::MemoryMigrationRule() {
           {"cuMemcpyDtoA_v2", &MemoryMigrationRule::arrayMigration},
           {"cuMemcpyAtoA_v2", &MemoryMigrationRule::arrayMigration},
           {"cudaFree", &MemoryMigrationRule::freeMigration},
+          {"cudaFreeAsync", &MemoryMigrationRule::freeMigration},
           {"cuMemFree_v2", &MemoryMigrationRule::freeMigration},
           {"cudaFreeArray", &MemoryMigrationRule::freeMigration},
           {"cudaFreeMipmappedArray", &MemoryMigrationRule::freeMigration},

@@ -237,8 +237,8 @@ ENTRY(cudaMemcpyToArray, cudaMemcpyToArray, true, NO_FLAG, P0, "Successful")
 ENTRY(cudaMemcpyToArrayAsync, cudaMemcpyToArrayAsync, true, NO_FLAG, P0, "Successful")
 
 // stream ordered memory allocator functions of runtime API
-ENTRY(cudaFreeAsync, cudaFreeAsync, false, NO_FLAG, P7, "comment")
-ENTRY(cudaMallocAsync, cudaMallocAsync, false, NO_FLAG, P7, "comment")
+ENTRY(cudaFreeAsync, cudaFreeAsync, true, NO_FLAG, P7, "comment")
+ENTRY(cudaMallocAsync, cudaMallocAsync, true, NO_FLAG, P7, "comment")
 ENTRY(cudaMallocFromPoolAsync, cudaMallocFromPoolAsync, false, NO_FLAG, P7, "comment")
 ENTRY(cudaMemPoolCreate, cudaMemPoolCreate, false, NO_FLAG, P4, "comment")
 ENTRY(cudaMemPoolDestroy, cudaMemPoolDestroy, false, NO_FLAG, P7, "comment")

@@ -0,0 +1,23 @@
+// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2, cuda-11.0, cuda-11.1
+// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2, v11.0, v11.1
+// RUN: dpct --format-range=none --out-root %T/async_alloc %s --cuda-include-path="%cuda-path/include" --use-experimental-features=async_alloc
+// RUN: FileCheck --match-full-lines --input-file %T/async_alloc/async_alloc.dp.cpp %s
+// RUN: %if build_lit %{icpx -c -fsycl -DNO_BUILD_TEST %T/async_alloc/async_alloc.dp.cpp -o %T/async_alloc/async_alloc.dp.o %}
+
+// CHECK: #include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
+
+void foo_1(float *f, cudaStream_t hStream) {
+  // CHECK: cudaMemPool_t memPool;
+  // CHECK-NEXT: /*
+  // CHECK-NEXT: DPCT1007:{{[0-9]+}}: Migration of cudaMallocAsync is not supported.
+  // CHECK-NEXT: */
+  // CHECK-NEXT: cudaMallocAsync(&f, 1024, memPool, hStream);
+  // CHECK-NEXT: f = sycl::ext::oneapi::experimental::async_malloc(*hStream, sycl::usm::alloc::device, 1024);
+  // CHECK-NEXT: sycl::ext::oneapi::experimental::async_free(*hStream, f);
+#ifndef NO_BUILD_TEST
+  cudaMemPool_t memPool;
+  cudaMallocAsync(&f, 1024, memPool, hStream);
+#endif
+  cudaMallocAsync(&f, 1024, hStream);
+  cudaFreeAsync(f, hStream);
+}
@@ -172,6 +172,7 @@ All DPCT options
     =non-stdandard-sycl-builtins          -   Experimental extension that allows use of non standard SYCL builtin functions.
     =prefetch                             -   Experimental extension that allows use of SYCL prefetch APIs.
     =level_zero                           -   Experimental migration feature that enables the use of Level Zero APIs to migrate target code, like CUDA Inter-Process Communication (IPC) APIs.
+    =async_alloc                          -   Experimental extension that allows use of SYCL async allocation APIs.
     =all                                  -   Enable all experimental extensions listed in this option.
   --use-explicit-namespace=<value>        - Define the namespaces to use explicitly in generated code. The <value> is a comma
                                             separated list. Default: dpct/syclcompat, sycl.

@@ -171,6 +171,7 @@ All DPCT options
     =non-stdandard-sycl-builtins          -   Experimental extension that allows use of non standard SYCL builtin functions.
     =prefetch                             -   Experimental extension that allows use of SYCL prefetch APIs.
     =level_zero                           -   Experimental migration feature that enables the use of Level Zero APIs to migrate target code, like CUDA Inter-Process Communication (IPC) APIs.
+    =async_alloc                          -   Experimental extension that allows use of SYCL async allocation APIs.
     =all                                  -   Enable all experimental extensions listed in this option.
   --use-explicit-namespace=<value>        - Define the namespaces to use explicitly in generated code. The <value> is a comma
                                             separated list. Default: dpct/syclcompat, sycl.

diff --git a/clang/test/dpct/memory_management_restricted.cu b/clang/test/dpct/memory_management_restricted.cu
@@ -2,7 +2,7 @@
 // UNSUPPORTED: system-windows
 // RUN: dpct --format-range=none --usm-level=restricted -out-root %T/memory_management_restricted %s --cuda-include-path="%cuda-path/include" -- -x cuda --cuda-host-only -std=c++11
 // RUN: FileCheck --match-full-lines --input-file %T/memory_management_restricted/memory_management_restricted.dp.cpp %s
-// RUN: %if build_lit %{icpx -c -fsycl %T/memory_management_restricted/memory_management_restricted.dp.cpp -o %T/memory_management_restricted/memory_management_restricted.dp.o %}
+// RUN: %if build_lit %{icpx -c -fsycl -DNO_BUILD_TEST %T/memory_management_restricted/memory_management_restricted.dp.cpp -o %T/memory_management_restricted/memory_management_restricted.dp.o %}
 
 #include <cuda_runtime.h>
 #include <cuda.h>
@@ -407,7 +407,7 @@ __global__ void MyKernel(cudaPitchedPtr devPitchedPtr,
 // CHECK-NEXT:  dpct::pitched_data devPitchedPtr;
 // CHECK-NEXT:  devPitchedPtr = dpct::dpct_malloc(extent);
 // CHECK-NEXT:  /*
-// CHECK-NEXT:  DPCT1049:0: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.
+// CHECK-NEXT:  DPCT1049:{{[0-9]+}}: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.
 // CHECK-NEXT:  */
 // CHECK-NEXT:  dpct::get_in_order_queue().parallel_for(
 // CHECK-NEXT:    sycl::nd_range<3>(sycl::range<3>(1, 1, 100) * sycl::range<3>(1, 1, 512), sycl::range<3>(1, 1, 512)),
@@ -422,3 +422,25 @@ void foo_8() {
   cudaMalloc3D(&devPitchedPtr, extent);
   MyKernel<<<100, 512>>>(devPitchedPtr, width, height, depth);
 }
+
+void foo_9(float *f, cudaStream_t hStream) {
+#ifndef NO_BUILD_TEST
+  // CHECK: cudaMemPool_t memPool;
+  // CHECK-NEXT: /*
+  // CHECK-NEXT: DPCT1007:{{[0-9]+}}: Migration of cudaMallocAsync is not supported.
+  // CHECK-NEXT: */
+  // CHECK-NEXT: cudaMallocAsync(&f, 1024, memPool, hStream);
+  // CHECK-NEXT: /*
+  // CHECK-NEXT: DPCT1119:{{[0-9]+}}: Migration of cudaMallocAsync is not supported, please try to remigrate with option: --use-experimental-features=async_alloc.
+  // CHECK-NEXT: */
+  // CHECK-NEXT: cudaMallocAsync(&f, 1024, hStream);
+  // CHECK-NEXT: /*
+  // CHECK-NEXT: DPCT1119:{{[0-9]+}}: Migration of cudaFreeAsync is not supported, please try to remigrate with option: --use-experimental-features=async_alloc.
+  // CHECK-NEXT: */
+  // CHECK-NEXT: cudaFreeAsync(f, hStream);
+  cudaMemPool_t memPool;
+  cudaMallocAsync(&f, 1024, memPool, hStream);
+  cudaMallocAsync(&f, 1024, hStream);
+  cudaFreeAsync(f, hStream);
+#endif
+}