
Commit 34de5af

[STF] Support dynamic dependencies in the cuda_kernel construct and document cuda_kernel (NVIDIA#4490)
* Support add_deps with cuda_kernel_chain
* forgot a file, and move where we reset the task
* Add documentation for cuda_kernel
* Improvements in the documentation of cuda_kernel
* fix year

File tree

5 files changed (+251, −0 lines changed)

cudax/include/cuda/experimental/__stf/internal/context.cuh

Lines changed: 21 additions & 0 deletions
@@ -96,6 +96,27 @@ class context
      }
    }

+  template <typename... Args>
+  auto& add_deps(Args&&... args)
+  {
+    ::std::visit(
+      [&](auto& self) {
+        self.add_deps(::std::forward<Args>(args)...);
+      },
+      payload);
+    return *this;
+  }
+
+  template <typename T>
+  decltype(auto) get(size_t submitted_index) const
+  {
+    return ::std::visit(
+      [&](auto& self) {
+        return self.template get<T>(submitted_index);
+      },
+      payload);
+  }
+
 private:
   ::std::variant<T1, T2> payload;
 };
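
The two methods added here forward calls through `::std::visit` to whichever backend type the `payload` variant currently holds. Below is a minimal, self-contained sketch of that visit-based forwarding pattern; the `stream_like` and `graph_like` types are hypothetical stand-ins for the two variant alternatives, not CUDASTF classes:

#include <cstdio>
#include <string>
#include <utility>
#include <variant>

// Hypothetical stand-ins for the two context backends.
struct stream_like
{
  void add_deps(const std::string& d) { std::printf("stream dep: %s\n", d.c_str()); }
};

struct graph_like
{
  void add_deps(const std::string& d) { std::printf("graph dep: %s\n", d.c_str()); }
};

class unified
{
public:
  explicit unified(std::variant<stream_like, graph_like> p) : payload(std::move(p)) {}

  // Dispatch to whichever alternative the variant currently holds,
  // mirroring the visit-based forwarding added in this commit.
  template <typename... Args>
  unified& add_deps(Args&&... args)
  {
    std::visit([&](auto& self) { self.add_deps(std::forward<Args>(args)...); }, payload);
    return *this;
  }

private:
  std::variant<stream_like, graph_like> payload;
};

int main()
{
  unified u{graph_like{}};
  u.add_deps("lX.read()"); // prints "graph dep: lX.read()"
}

The dispatch happens at every call, so the wrapper stays agnostic of which backend was chosen at construction time.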

cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh

Lines changed: 39 additions & 0 deletions
@@ -116,6 +116,24 @@ public:
   // move-constructible
   cuda_kernel_scope(cuda_kernel_scope&&) = default;

+  /// Add a set of dependencies
+  template <typename... Pack>
+  void add_deps(task_dep_untyped first, Pack&&... pack)
+  {
+    dynamic_deps.push_back(mv(first));
+    if constexpr (sizeof...(Pack) > 0)
+    {
+      add_deps(::std::forward<Pack>(pack)...);
+    }
+  }
+
+  template <typename T>
+  decltype(auto) get(size_t submitted_index) const
+  {
+    _CCCL_ASSERT(untyped_t.has_value(), "uninitialized task");
+    return untyped_t->template get<T>(submitted_index);
+  }
+
   /**
    * @brief Sets the symbol for this object.
    *
@@ -143,7 +161,18 @@ public:
     // If a place is specified, use it
     auto t = e_place ? ctx.task(e_place.value()) : ctx.task();

+    // So that we can use get to retrieve dynamic dependencies
+    untyped_t = t;
+
     t.add_deps(deps);
+
+    // Append all dynamic deps
+    for (auto& d : dynamic_deps)
+    {
+      t.add_deps(mv(d));
+    }
+    dynamic_deps.clear();
+
     if (!symbol.empty())
     {
       t.set_symbol(symbol);
@@ -186,6 +215,9 @@ public:
       }

       t.clear();
+
+      // Now that we have executed 'f', we do not need to access it anymore
+      untyped_t.reset();
     };

     if constexpr (::std::is_same_v<Ctx, stream_ctx>)
@@ -286,7 +318,14 @@ private:

   ::std::string symbol;
   Ctx& ctx;
+  // Statically defined deps
   task_dep_vector<Deps...> deps;
+
+  // Dependencies added with add_deps
+  ::std::vector<task_dep_untyped> dynamic_deps;
+  // Used to retrieve deps with t.get<>(...)
+  ::std::optional<task> untyped_t;
+
   ::std::optional<exec_place> e_place;
 };
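
A note on the mechanism above: `add_deps` merely accumulates untyped dependencies in `dynamic_deps`; they are replayed onto the freshly created task at submission time, and `untyped_t` keeps that task around so `get<T>` can resolve data instances until the user-provided body has run. A minimal, self-contained sketch of this deferred-registration idiom follows; `dep` and `kernel_scope_like` are hypothetical stand-ins, not CUDASTF types:

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for task_dep_untyped: just a named dependency.
struct dep
{
  std::string name;
};

class kernel_scope_like
{
public:
  // Accept one or more dependencies, recursing over the parameter pack
  // the same way cuda_kernel_scope::add_deps does.
  template <typename... Pack>
  void add_deps(dep first, Pack&&... pack)
  {
    dynamic_deps.push_back(std::move(first));
    if constexpr (sizeof...(Pack) > 0)
    {
      add_deps(std::forward<Pack>(pack)...);
    }
  }

  void submit()
  {
    // In CUDASTF this loop would call t.add_deps(mv(d)) on the freshly
    // created task; here we just print the replayed dependencies.
    for (auto& d : dynamic_deps)
    {
      std::printf("registering dep: %s\n", d.name.c_str());
    }
    dynamic_deps.clear();
  }

private:
  std::vector<dep> dynamic_deps;
};

int main()
{
  kernel_scope_like scope;
  scope.add_deps(dep{"lX.read()"}, dep{"lY.rw()"});
  scope.submit();
}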

cudax/test/stf/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@ set(stf_test_sources
   graph/graph_ctx_low_level.cu
   graph/static_graph_ctx.cu
   hashtable/test.cu
+  interface/cuda_kernel_chain-add_deps.cu
   interface/data_from_device_async.cu
   interface/move_operator.cu
   local_stf/legacy_to_stf.cu
cudax/test/stf/interface/cuda_kernel_chain-add_deps.cu

Lines changed: 85 additions & 0 deletions

@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDASTF in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+/**
+ * @file
+ *
+ * @brief Example of task implementing a chain of CUDA kernels with dynamic dependencies (add_deps)
+ *
+ */
+
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+__global__ void axpy(double a, slice<const double> x, slice<double> y)
+{
+  int tid      = blockIdx.x * blockDim.x + threadIdx.x;
+  int nthreads = gridDim.x * blockDim.x;
+
+  for (int i = tid; i < x.size(); i += nthreads)
+  {
+    y(i) += a * x(i);
+  }
+}
+
+double X0(int i)
+{
+  return sin((double) i);
+}
+
+double Y0(int i)
+{
+  return cos((double) i);
+}
+
+int main()
+{
+  context ctx = graph_ctx();
+  const size_t N = 16;
+  double X[N], Y[N];
+
+  for (size_t i = 0; i < N; i++)
+  {
+    X[i] = X0(i);
+    Y[i] = Y0(i);
+  }
+
+  double alpha = 3.14;
+  double beta  = 4.5;
+  double gamma = -4.1;
+
+  auto lX = ctx.logical_data(X);
+  auto lY = ctx.logical_data(Y);
+
+  /* Compute Y = Y + alpha X, Y = Y + beta X and then Y = Y + gamma X */
+  auto t = ctx.cuda_kernel_chain();
+  t.add_deps(lX.read());
+  t.add_deps(lY.rw());
+  t->*[&]() {
+    auto dX = t.template get<slice<double>>(0);
+    auto dY = t.template get<slice<double>>(1);
+    // clang-format off
+    return std::vector<cuda_kernel_desc> {
+      { axpy, 16, 128, 0, alpha, dX, dY },
+      { axpy, 16, 128, 0, beta,  dX, dY },
+      { axpy, 16, 128, 0, gamma, dX, dY }
+    };
+    // clang-format on
+  };
+
+  ctx.finalize();
+
+  for (size_t i = 0; i < N; i++)
+  {
+    assert(fabs(Y[i] - (Y0(i) + (alpha + beta + gamma) * X0(i))) < 0.0001);
+    assert(fabs(X[i] - X0(i)) < 0.0001);
+  }
+}
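
Each brace-enclosed entry in the vector above is a `cuda_kernel_desc` bundling a kernel pointer, grid and block dimensions, dynamic shared memory size, and the kernel arguments. As a rough sketch of how such a descriptor can type-erase a deferred launch (this is not the CUDASTF implementation; `kernel_desc_like` and `scale` are made up for illustration):

#include <cstddef>
#include <cstdio>
#include <functional>

__global__ void scale(double a, double* v, size_t n)
{
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
  {
    v[i] *= a;
  }
}

struct kernel_desc_like
{
  // Capture the __global__ function pointer, launch configuration and
  // arguments; launch() replays them later on a given stream.
  template <typename Fun, typename... Args>
  kernel_desc_like(Fun func, dim3 grid, dim3 block, size_t shmem, Args... args)
      : launch([=](cudaStream_t stream) {
          func<<<grid, block, shmem, stream>>>(args...);
        })
  {}

  std::function<void(cudaStream_t)> launch;
};

int main()
{
  const size_t n = 1024;
  double* d;
  cudaMalloc(&d, n * sizeof(double));
  cudaMemset(d, 0, n * sizeof(double));

  kernel_desc_like desc{scale, 8, 128, 0, 2.0, d, n};
  desc.launch(nullptr); // replay on the default stream
  cudaDeviceSynchronize();
  cudaFree(d);
}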

docs/cudax/stf.rst

Lines changed: 105 additions & 0 deletions
@@ -1601,6 +1601,111 @@ will differ between the different threads which call `inner()`.
     ...
     th.inner().sync(); // synchronize threads in the same block of the second level of the hierarchy

+``cuda_kernel`` construct
+-------------------------
+
+CUDASTF provides the `cuda_kernel` construct to implement tasks executing a
+CUDA kernel. This construct is especially useful when writing code that may
+be executed using a CUDA graph backend, because the `task` construct then
+relies on a graph capture mechanism which has some overhead, while the
+`cuda_kernel` construct is directly translated to CUDA kernel launch APIs,
+thus avoiding this overhead.
+
+`cuda_kernel` accepts the same arguments as the `task` construct, including an
+execution place and a list of data dependencies. It implements a `->*`
+operator that takes a lambda function as argument. This lambda function must
+return an object of type `cuda_kernel_desc`, describing the CUDA kernel to
+execute. The constructor of the `cuda_kernel_desc` class, shown below, takes
+the CUDA kernel function pointer (i.e., the ``__global__`` function defining
+the kernel), a grid description, the amount of dynamically allocated shared
+memory, and finally all the arguments that must be passed to the CUDA kernel.
+
+.. code:: cpp
+
+    template <typename Fun, typename... Args>
+    cuda_kernel_desc(Fun func,          // Pointer to the CUDA kernel function (__global__)
+                     dim3 gridDim_,     // Dimensions of the grid (number of thread blocks)
+                     dim3 blockDim_,    // Dimensions of each thread block
+                     size_t sharedMem_, // Amount of dynamically allocated shared memory
+                     Args... args)      // Arguments passed to the CUDA kernel
+
+For example, the following piece of code creates a task that launches a CUDA
+kernel accessing two logical data.
+
+.. code:: cpp
+
+    ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) {
+        // calls __global__ void axpy(double a, slice<const double> x, slice<double> y);
+        // similarly to axpy<<<16, 128, 0, ...>>>(alpha, dX, dY)
+        return cuda_kernel_desc{axpy, 16, 128, 0, alpha, dX, dY};
+    };
+
+Similar to the `task` construct, the `cuda_kernel` construct also supports
+specifying dynamic dependencies using the `add_deps` method and retrieving data
+instances using `get`. The previous code can therefore be rewritten as:
+
+.. code:: cpp
+
+    auto t = ctx.cuda_kernel();
+    t.add_deps(lX.read());
+    t.add_deps(lY.rw());
+    t->*[&]() {
+        auto dX = t.template get<slice<double>>(0);
+        auto dY = t.template get<slice<double>>(1);
+        return cuda_kernel_desc{axpy, 16, 128, 0, alpha, dX, dY};
+    };
+
+``cuda_kernel_chain`` construct
+-------------------------------
+
+In addition to `cuda_kernel`, CUDASTF provides the `cuda_kernel_chain`
+construct to execute sequences of CUDA kernels within a single task. Unlike
+`cuda_kernel`, which expects a single kernel descriptor, the lambda passed to
+the `->*` operator of `cuda_kernel_chain` should return a
+`::std::vector<cuda_kernel_desc>` describing multiple kernel launches.
+Kernels specified within the vector are executed sequentially in the order
+they appear.
+
+The following two constructs are therefore equivalent, except that the
+`cuda_kernel_chain` implementation directly translates to efficient, direct
+CUDA kernel launch APIs, while the implementation of the `task` construct may
+rely on graph capture when using a CUDA graph backend.
+
+.. code:: cpp
+
+    /* Compute Y = Y + alpha X, Y = Y + beta X, then Y = Y + gamma X sequentially */
+    ctx.cuda_kernel_chain(lX.read(), lY.rw())->*[&](auto dX, auto dY) {
+        return ::std::vector<cuda_kernel_desc> {
+            { axpy, 16, 128, 0, alpha, dX, dY },
+            { axpy, 16, 128, 0, beta,  dX, dY },
+            { axpy, 16, 128, 0, gamma, dX, dY }
+        };
+    };
+
+    /* Equivalent to the previous construct, but possibly less efficient */
+    ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t stream, auto dX, auto dY) {
+        axpy<<<16, 128, 0, stream>>>(alpha, dX, dY);
+        axpy<<<16, 128, 0, stream>>>(beta, dX, dY);
+        axpy<<<16, 128, 0, stream>>>(gamma, dX, dY);
+    };
+
+Similarly to the `cuda_kernel` construct, dependencies can be set dynamically:
+
+.. code:: cpp
+
+    /* Compute Y = Y + alpha X, Y = Y + beta X, then Y = Y + gamma X sequentially */
+    auto t = ctx.cuda_kernel_chain();
+    t.add_deps(lX.read());
+    t.add_deps(lY.rw());
+    t->*[&]() {
+        auto dX = t.template get<slice<double>>(0);
+        auto dY = t.template get<slice<double>>(1);
+        return ::std::vector<cuda_kernel_desc> {
+            { axpy, 16, 128, 0, alpha, dX, dY },
+            { axpy, 16, 128, 0, beta,  dX, dY },
+            { axpy, 16, 128, 0, gamma, dX, dY }
+        };
+    };
+
 C++ Types of logical data and tasks
 -----------------------------------