
Commit 34de5af

[STF] Support dynamic dependencies in the cuda_kernel construct and document cuda_kernel (NVIDIA#4490)
* Support add_deps with cuda_kernel_chain
* forgot a file, and move where we reset the task
* Add documentation for cuda_kernel
* Improvements in the documentation of cuda_kernel
* fix year

File tree

5 files changed (+251, −0 lines changed)

cudax/include/cuda/experimental/__stf/internal/context.cuh

Lines changed: 21 additions & 0 deletions
@@ -96,6 +96,27 @@ class context
      }
    }

+  template <typename... Args>
+  auto& add_deps(Args&&... args)
+  {
+    ::std::visit(
+      [&](auto& self) {
+        self.add_deps(::std::forward<Args>(args)...);
+      },
+      payload);
+    return *this;
+  }
+
+  template <typename T>
+  decltype(auto) get(size_t submitted_index) const
+  {
+    return ::std::visit(
+      [&](auto& self) {
+        return self.template get<T>(submitted_index);
+      },
+      payload);
+  }
+
 private:
   ::std::variant<T1, T2> payload;
 };
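
The two methods added here forward calls through `::std::visit` to whichever backend type the `payload` variant currently holds. Below is a minimal, self-contained sketch of that visit-based forwarding pattern; the `stream_like` and `graph_like` types are hypothetical stand-ins for the two variant alternatives, not CUDASTF classes:

#include <cstdio>
#include <string>
#include <utility>
#include <variant>

// Hypothetical stand-ins for the two context backends.
struct stream_like
{
  void add_deps(const std::string& d) { std::printf("stream dep: %s\n", d.c_str()); }
};

struct graph_like
{
  void add_deps(const std::string& d) { std::printf("graph dep: %s\n", d.c_str()); }
};

class unified
{
public:
  explicit unified(std::variant<stream_like, graph_like> p) : payload(std::move(p)) {}

  // Dispatch to whichever alternative the variant currently holds,
  // mirroring the visit-based forwarding added in this commit.
  template <typename... Args>
  unified& add_deps(Args&&... args)
  {
    std::visit([&](auto& self) { self.add_deps(std::forward<Args>(args)...); }, payload);
    return *this;
  }

private:
  std::variant<stream_like, graph_like> payload;
};

int main()
{
  unified u{graph_like{}};
  u.add_deps("lX.read()"); // prints "graph dep: lX.read()"
}

The dispatch happens at every call, so the wrapper stays agnostic of which backend was chosen at construction time.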

cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh

Lines changed: 39 additions & 0 deletions
@@ -116,6 +116,24 @@ public:
   // move-constructible
   cuda_kernel_scope(cuda_kernel_scope&&) = default;

+  /// Add a set of dependencies
+  template <typename... Pack>
+  void add_deps(task_dep_untyped first, Pack&&... pack)
+  {
+    dynamic_deps.push_back(mv(first));
+    if constexpr (sizeof...(Pack) > 0)
+    {
+      add_deps(::std::forward<Pack>(pack)...);
+    }
+  }
+
+  template <typename T>
+  decltype(auto) get(size_t submitted_index) const
+  {
+    _CCCL_ASSERT(untyped_t.has_value(), "uninitialized task");
+    return untyped_t->template get<T>(submitted_index);
+  }
+
   /**
    * @brief Sets the symbol for this object.
    *
@@ -143,7 +161,18 @@ public:
     // If a place is specified, use it
     auto t = e_place ? ctx.task(e_place.value()) : ctx.task();

+    // So that we can use get to retrieve dynamic dependencies
+    untyped_t = t;
+
     t.add_deps(deps);
+
+    // Append all dynamic deps
+    for (auto& d : dynamic_deps)
+    {
+      t.add_deps(mv(d));
+    }
+    dynamic_deps.clear();
+
     if (!symbol.empty())
     {
       t.set_symbol(symbol);
@@ -186,6 +215,9 @@ public:
       }

       t.clear();
+
+      // Now that we have executed 'f', we do not need to access it anymore
+      untyped_t.reset();
     };

     if constexpr (::std::is_same_v<Ctx, stream_ctx>)
@@ -286,7 +318,14 @@ private:

   ::std::string symbol;
   Ctx& ctx;
+  // Statically defined deps
   task_dep_vector<Deps...> deps;
+
+  // Dependencies added with add_deps
+  ::std::vector<task_dep_untyped> dynamic_deps;
+  // Used to retrieve deps with t.get<>(...)
+  ::std::optional<task> untyped_t;
+
   ::std::optional<exec_place> e_place;
 };
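
A note on the mechanism above: `add_deps` merely accumulates untyped dependencies in `dynamic_deps`; they are replayed onto the freshly created task at submission time, and `untyped_t` keeps that task around so `get<T>` can resolve data instances until the user-provided body has run. A minimal, self-contained sketch of this deferred-registration idiom follows; `dep` and `kernel_scope_like` are hypothetical stand-ins, not CUDASTF types:

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for task_dep_untyped: just a named dependency.
struct dep
{
  std::string name;
};

class kernel_scope_like
{
public:
  // Accept one or more dependencies, recursing over the parameter pack
  // the same way cuda_kernel_scope::add_deps does.
  template <typename... Pack>
  void add_deps(dep first, Pack&&... pack)
  {
    dynamic_deps.push_back(std::move(first));
    if constexpr (sizeof...(Pack) > 0)
    {
      add_deps(std::forward<Pack>(pack)...);
    }
  }

  void submit()
  {
    // In CUDASTF this loop would call t.add_deps(mv(d)) on the freshly
    // created task; here we just print the replayed dependencies.
    for (auto& d : dynamic_deps)
    {
      std::printf("registering dep: %s\n", d.name.c_str());
    }
    dynamic_deps.clear();
  }

private:
  std::vector<dep> dynamic_deps;
};

int main()
{
  kernel_scope_like scope;
  scope.add_deps(dep{"lX.read()"}, dep{"lY.rw()"});
  scope.submit();
}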

cudax/test/stf/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@ set(stf_test_sources
   graph/graph_ctx_low_level.cu
   graph/static_graph_ctx.cu
   hashtable/test.cu
+  interface/cuda_kernel_chain-add_deps.cu
   interface/data_from_device_async.cu
   interface/move_operator.cu
   local_stf/legacy_to_stf.cu
cudax/test/stf/interface/cuda_kernel_chain-add_deps.cu

Lines changed: 85 additions & 0 deletions

@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDASTF in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+/**
+ * @file
+ *
+ * @brief Example of task implementing a chain of CUDA kernels with dynamic dependencies (add_deps)
+ *
+ */
+
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+__global__ void axpy(double a, slice<const double> x, slice<double> y)
+{
+  int tid      = blockIdx.x * blockDim.x + threadIdx.x;
+  int nthreads = gridDim.x * blockDim.x;
+
+  for (int i = tid; i < x.size(); i += nthreads)
+  {
+    y(i) += a * x(i);
+  }
+}
+
+double X0(int i)
+{
+  return sin((double) i);
+}
+
+double Y0(int i)
+{
+  return cos((double) i);
+}
+
+int main()
+{
+  context ctx = graph_ctx();
+  const size_t N = 16;
+  double X[N], Y[N];
+
+  for (size_t i = 0; i < N; i++)
+  {
+    X[i] = X0(i);
+    Y[i] = Y0(i);
+  }
+
+  double alpha = 3.14;
+  double beta  = 4.5;
+  double gamma = -4.1;
+
+  auto lX = ctx.logical_data(X);
+  auto lY = ctx.logical_data(Y);
+
+  /* Compute Y = Y + alpha X, Y = Y + beta X and then Y = Y + gamma X */
+  auto t = ctx.cuda_kernel_chain();
+  t.add_deps(lX.read());
+  t.add_deps(lY.rw());
+  t->*[&]() {
+    auto dX = t.template get<slice<double>>(0);
+    auto dY = t.template get<slice<double>>(1);
+    // clang-format off
+    return std::vector<cuda_kernel_desc> {
+      { axpy, 16, 128, 0, alpha, dX, dY },
+      { axpy, 16, 128, 0, beta,  dX, dY },
+      { axpy, 16, 128, 0, gamma, dX, dY }
+    };
+    // clang-format on
+  };
+
+  ctx.finalize();
+
+  for (size_t i = 0; i < N; i++)
+  {
+    assert(fabs(Y[i] - (Y0(i) + (alpha + beta + gamma) * X0(i))) < 0.0001);
+    assert(fabs(X[i] - X0(i)) < 0.0001);
+  }
+}
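
Each brace-enclosed entry in the vector above is a `cuda_kernel_desc` bundling a kernel pointer, grid and block dimensions, dynamic shared memory size, and the kernel arguments. As a rough sketch of how such a descriptor can type-erase a deferred launch (this is not the CUDASTF implementation; `kernel_desc_like` and `scale` are made up for illustration):

#include <cstddef>
#include <cstdio>
#include <functional>

__global__ void scale(double a, double* v, size_t n)
{
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
  {
    v[i] *= a;
  }
}

struct kernel_desc_like
{
  // Capture the __global__ function pointer, launch configuration and
  // arguments; launch() replays them later on a given stream.
  template <typename Fun, typename... Args>
  kernel_desc_like(Fun func, dim3 grid, dim3 block, size_t shmem, Args... args)
      : launch([=](cudaStream_t stream) {
          func<<<grid, block, shmem, stream>>>(args...);
        })
  {}

  std::function<void(cudaStream_t)> launch;
};

int main()
{
  const size_t n = 1024;
  double* d;
  cudaMalloc(&d, n * sizeof(double));
  cudaMemset(d, 0, n * sizeof(double));

  kernel_desc_like desc{scale, 8, 128, 0, 2.0, d, n};
  desc.launch(nullptr); // replay on the default stream
  cudaDeviceSynchronize();
  cudaFree(d);
}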

docs/cudax/stf.rst

Lines changed: 105 additions & 0 deletions
@@ -1601,6 +1601,111 @@ will differ between the different threads which call `inner()`.
     ...
     th.inner().sync(); // synchronize threads in the same block of the second level of the hierarchy

+``cuda_kernel`` construct
+-------------------------
+
+CUDASTF provides the `cuda_kernel` construct to implement tasks executing a
+CUDA kernel. This construct is especially useful when writing code that may
+be executed using a CUDA graph backend, because the `task` construct then
+relies on a graph capture mechanism which has some overhead, while the
+`cuda_kernel` construct is directly translated to CUDA kernel launch APIs,
+thus avoiding this overhead.
+
+`cuda_kernel` accepts the same arguments as the `task` construct, including an
+execution place and a list of data dependencies. It implements a `->*`
+operator that takes a lambda function as argument. This lambda function must
+return an object of type `cuda_kernel_desc`, describing the CUDA kernel to
+execute. The constructor of the `cuda_kernel_desc` class, shown below, takes
+the CUDA kernel function pointer (i.e., the ``__global__`` function defining
+the kernel), a grid description, the amount of dynamically allocated shared
+memory, and finally all the arguments that must be passed to the CUDA kernel.
+
+.. code:: cpp
+
+    template <typename Fun, typename... Args>
+    cuda_kernel_desc(Fun func,          // Pointer to the CUDA kernel function (__global__)
+                     dim3 gridDim_,     // Dimensions of the grid (number of thread blocks)
+                     dim3 blockDim_,    // Dimensions of each thread block
+                     size_t sharedMem_, // Amount of dynamically allocated shared memory
+                     Args... args)      // Arguments passed to the CUDA kernel
+
+For example, the following piece of code creates a task that launches a CUDA
+kernel accessing two logical data.
+
+.. code:: cpp
+
+    ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) {
+        // calls __global__ void axpy(double a, slice<const double> x, slice<double> y);
+        // similarly to axpy<<<16, 128, 0, ...>>>(alpha, dX, dY)
+        return cuda_kernel_desc{axpy, 16, 128, 0, alpha, dX, dY};
+    };
+
+Similar to the `task` construct, the `cuda_kernel` construct also supports
+specifying dynamic dependencies using the `add_deps` method and retrieving data
+instances using `get`. The previous code can therefore be rewritten as:
+
+.. code:: cpp
+
+    auto t = ctx.cuda_kernel();
+    t.add_deps(lX.read());
+    t.add_deps(lY.rw());
+    t->*[&]() {
+        auto dX = t.template get<slice<double>>(0);
+        auto dY = t.template get<slice<double>>(1);
+        return cuda_kernel_desc{axpy, 16, 128, 0, alpha, dX, dY};
+    };
+
+``cuda_kernel_chain`` construct
+-------------------------------
+
+In addition to `cuda_kernel`, CUDASTF provides the `cuda_kernel_chain`
+construct to execute sequences of CUDA kernels within a single task. Unlike
+`cuda_kernel`, which expects a single kernel descriptor, the lambda passed to
+the `->*` operator of `cuda_kernel_chain` should return a
+`::std::vector<cuda_kernel_desc>` describing multiple kernel launches.
+Kernels specified within the vector are executed sequentially in the order
+they appear.
+
+The following two constructs are therefore equivalent, except that the
+`cuda_kernel_chain` implementation directly translates to efficient, direct
+CUDA kernel launch APIs, while the implementation of the `task` construct may
+rely on graph capture when using a CUDA graph backend.
+
+.. code:: cpp
+
+    /* Compute Y = Y + alpha X, Y = Y + beta X, then Y = Y + gamma X sequentially */
+    ctx.cuda_kernel_chain(lX.read(), lY.rw())->*[&](auto dX, auto dY) {
+        return ::std::vector<cuda_kernel_desc> {
+            { axpy, 16, 128, 0, alpha, dX, dY },
+            { axpy, 16, 128, 0, beta,  dX, dY },
+            { axpy, 16, 128, 0, gamma, dX, dY }
+        };
+    };
+
+    /* Equivalent to the previous construct, but possibly less efficient */
+    ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t stream, auto dX, auto dY) {
+        axpy<<<16, 128, 0, stream>>>(alpha, dX, dY);
+        axpy<<<16, 128, 0, stream>>>(beta, dX, dY);
+        axpy<<<16, 128, 0, stream>>>(gamma, dX, dY);
+    };
+
+Similarly to the `cuda_kernel` construct, dependencies can be set dynamically:
+
+.. code:: cpp
+
+    /* Compute Y = Y + alpha X, Y = Y + beta X, then Y = Y + gamma X sequentially */
+    auto t = ctx.cuda_kernel_chain();
+    t.add_deps(lX.read());
+    t.add_deps(lY.rw());
+    t->*[&]() {
+        auto dX = t.template get<slice<double>>(0);
+        auto dY = t.template get<slice<double>>(1);
+        return ::std::vector<cuda_kernel_desc> {
+            { axpy, 16, 128, 0, alpha, dX, dY },
+            { axpy, 16, 128, 0, beta,  dX, dY },
+            { axpy, 16, 128, 0, gamma, dX, dY }
+        };
+    };
+
 C++ Types of logical data and tasks
 -----------------------------------