Skip to content

Commit be60b81

Browse files
Merge branch 'main' into dispatch-common-predicate-should-be-runtime-functions
2 parents 963efd4 + bb86c71 commit be60b81

File tree

41 files changed

+684
-147
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+684
-147
lines changed

c/parallel/test/test_scan.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,51 @@ TEST_CASE("Scan works with output iterators", "[scan]")
201201
}
202202
}
203203

204+
TEST_CASE("Scan works with reverse input iterators", "[scan]")
205+
{
206+
const std::size_t num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16)));
207+
operation_t op = make_operation("op", get_reduce_op(get_type_info<int>().type));
208+
iterator_t<int, random_access_iterator_state_t<int>> input_it =
209+
make_reverse_iterator<int>(iterator_kind::INPUT, "int");
210+
std::vector<int> input = generate<int>(num_items);
211+
pointer_t<int> input_ptr(input);
212+
input_it.state.data = input_ptr.ptr + num_items - 1;
213+
pointer_t<int> output_it(num_items);
214+
value_t<int> init{42};
215+
216+
scan(input_it, output_it, num_items, op, init, false);
217+
218+
std::vector<int> expected(num_items);
219+
std::exclusive_scan(input.rbegin(), input.rend(), expected.begin(), init.value);
220+
if (num_items > 0)
221+
{
222+
REQUIRE(expected == std::vector<int>(output_it));
223+
}
224+
}
225+
226+
TEST_CASE("Scan works with reverse output iterators", "[scan]")
227+
{
228+
const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16)));
229+
operation_t op = make_operation("op", get_reduce_op(get_type_info<int>().type));
230+
iterator_t<int, random_access_iterator_state_t<int>> output_it =
231+
make_reverse_iterator<int>(iterator_kind::OUTPUT, "int", "out");
232+
const std::vector<int> input = generate<int>(num_items);
233+
pointer_t<int> input_it(input);
234+
pointer_t<int> inner_output_it(num_items);
235+
output_it.state.data = inner_output_it.ptr + num_items - 1;
236+
value_t<int> init{42};
237+
238+
scan(input_it, output_it, num_items, op, init, false);
239+
240+
std::vector<int> expected(num_items);
241+
std::exclusive_scan(input.begin(), input.end(), expected.rbegin(), init.value);
242+
243+
if (num_items > 0)
244+
{
245+
REQUIRE(expected == std::vector<int>(inner_output_it));
246+
}
247+
}
248+
204249
TEST_CASE("Scan works with input and output iterators", "[scan]")
205250
{
206251
const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16)));

c/parallel/test/test_util.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,46 @@ make_constant_iterator(std::string value_type, std::string prefix = "")
586586
return make_iterator<ValueT, constant_iterator_state_t<ValueT>>(iterator_state, advance, dereference);
587587
}
588588

589+
template <class ValueT>
590+
iterator_t<ValueT, random_access_iterator_state_t<ValueT>>
591+
make_reverse_iterator(iterator_kind kind, std::string value_type, std::string prefix = "", std::string transform = "")
592+
{
593+
std::string iterator_state = std::format("struct state_t {{ {0}* data; }};\n", value_type);
594+
595+
operation_t advance = {
596+
std::format("{0}_advance", prefix),
597+
std::format("extern \"C\" __device__ void {0}_advance(state_t* state, unsigned long long offset) {{\n"
598+
" state->data -= offset;\n"
599+
"}}",
600+
prefix)};
601+
602+
std::string dereference_method;
603+
if (kind == iterator_kind::INPUT)
604+
{
605+
dereference_method = std::format(
606+
"extern \"C\" __device__ {1} {0}_dereference(state_t* state) {{\n"
607+
" return (*state->data){2};\n"
608+
"}}",
609+
prefix,
610+
value_type,
611+
transform);
612+
}
613+
else
614+
{
615+
dereference_method = std::format(
616+
"extern \"C\" __device__ void {0}_dereference(state_t* state, {1} x) {{\n"
617+
" *state->data = x{2};\n"
618+
"}}",
619+
prefix,
620+
value_type,
621+
transform);
622+
}
623+
624+
operation_t dereference = {std::format("{0}_dereference", prefix), dereference_method};
625+
626+
return make_iterator<ValueT, random_access_iterator_state_t<ValueT>>(iterator_state, advance, dereference);
627+
}
628+
589629
template <class T>
590630
struct value_t
591631
{

ci-overview.md

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -26,30 +26,37 @@ The results of every job in the CI pipeline are summarized on the bottom of the
2626

2727
### Special CI Commands
2828

29-
Special commands are provided that can be included in commit messages to direct the CI pipeline execution:
29+
Special commands can be included in the most recent commit message to control which jobs are spawned for the next pull-request CI run.
30+
These commands can be combined with the [override matrix](#temporarily-overriding-the-pull-request-matrix) for even more fine-grained control.
3031

31-
- `[skip ci]`: Skips the entire CI pipeline. Useful for documentation changes or others that don't require CI validation.
32+
- `[skip-<component>]`: Skips a subset of the CI jobs. These commands will block the PR from being merged while present in the last commit message of the branch. Recognized components are:
33+
- `[skip-matrix]`: Skip all build and test jobs specified in `ci/matrix.yaml`.
34+
- `[skip-vdc]`: Skip all "Validate Devcontainer" jobs.
35+
- `[skip-docs]`: Skip the documentation verification build.
36+
- `[skip-rapids]`: Skip all RAPIDS canary builds.
37+
- `[skip-matx]`: Skip all MatX canary builds.
38+
- **Example:** `git commit -m "Fix RAPIDS failures [skip-matrix][skip-vdc][skip-docs][skip-matx]"`
3239

33-
- **Example:** `git commit -m "[skip ci] Update README."`
34-
35-
- `[skip-tests]`: Skips CI jobs that execute tests, but runs all other jobs. Useful to avoid time-consuming tests when changes are unlikely to affect them.
36-
- `[all-projects]`: CI normally skips projects that don't have changes in themselves or their dependencies. This forces all projects to build.
3740
- `[workflow:<workflow>]`: Execute jobs from the named workflow. Example: `[workflow:nightly]` runs all jobs defined in `matrix.yaml`'s `workflows.nightly` list.
3841

39-
Use these commands judiciously. While they offer flexibility, they should be used appropriately to maintain the codebase's integrity and quality.
40-
4142
### Temporarily Overriding the Pull Request Matrix
4243

43-
If a workflow named `override` exists in the matrix.yaml file, this matrix will be used for pull requests instead of the `pull_request` matrix.
44-
This is useful for reducing resource usage when launching many CI workflows from a PR (for example, while testing CI features).
45-
The overridden CI job will be marked as a failure until the override is removed.
44+
If a non-empty workflow named `override` exists in the `ci/matrix.yaml` file, this matrix will be used for pull requests instead of the `pull_request` matrix.
45+
This is useful for reducing resource usage and turn-around time when a full run is not needed, for example:
46+
47+
- Testing changes that only apply to a specific compiler, OS, etc.
48+
- Testing fixes to nightly CI failures by only running the nightly jobs that failed.
49+
- Testing changes to CI infrastructure that only require a few jobs to run.
50+
51+
The PR will be blocked from merging until the override matrix is removed, ensuring that the full CI suite runs before landing the PR.
52+
The override matrix can be combined with the `[skip-<...>]` commands detailed in [Special CI Commands](#special-ci-commands) to reduce unnecessary resource usage even further.
4653

4754
Example:
4855

4956
```
5057
workflows:
5158
override:
52-
- {jobs: ['test'], std: 17, ctk: *ctk_curr, cxx: [*gcc12, *llvm16, *msvc2022]}
59+
- {jobs: ['build'], project: 'cudax', ctk: '12.0', std: 'all', cxx: ['msvc14.39', 'gcc10', 'clang14']}
5360
pull_request:
5461
- <...>
5562
```

ci/matrix.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,13 @@ workflows:
102102
- {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc'] , gpu: 'rtx2080'}
103103
- {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14'], gpu: 'rtx2080'}
104104
- {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang'], gpu: 'rtx2080'}
105+
# Python and c/parallel jobs:
106+
- {jobs: ['test'], project: ['cccl_c_parallel', 'python'], gpu: 'rtx2080'}
107+
# cccl-infra:
108+
- {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14'], gpu: 'rtx2080'}
109+
- {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang'], gpu: 'rtx2080'}
110+
# NVHPC stdpar smoke tests
111+
- {jobs: ['build'], project: 'stdpar', std: 'all', ctk: '12.8', cxx: 'nvhpc', cpu: ['amd64', 'arm64']}
105112

106113
# Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows.
107114
exclude:

cudax/examples/stf/void_data_interface.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ int main()
3131
void_interface sync;
3232
auto token2 = ctx.logical_data(sync);
3333

34-
auto token3 = ctx.logical_token();
34+
auto token3 = ctx.token();
3535
ctx.task(token2.write(), token.read())->*[](cudaStream_t, auto, auto) {
3636

3737
};

cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -952,7 +952,7 @@ public:
952952
return logical_data(make_slice(p, n), mv(dplace));
953953
}
954954

955-
auto logical_token()
955+
auto token()
956956
{
957957
// We do not use a shape because we want the first rw() access to succeed
958958
// without an initial write()

cudax/include/cuda/experimental/__stf/internal/context.cuh

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -362,12 +362,12 @@ public:
362362
}
363363
}
364364

365-
auto logical_token()
365+
auto token()
366366
{
367367
_CCCL_ASSERT(payload.index() != ::std::variant_npos, "Context is not initialized");
368368
return ::std::visit(
369369
[&](auto& self) {
370-
return self.logical_token();
370+
return self.token();
371371
},
372372
payload);
373373
}
@@ -1457,15 +1457,15 @@ UNITTEST("cuda stream place multi-gpu")
14571457
ctx.finalize();
14581458
};
14591459

1460-
// Ensure we can skip logical tokens
1461-
UNITTEST("logical token elision")
1460+
// Ensure we can skip tokens
1461+
UNITTEST("token elision")
14621462
{
14631463
context ctx;
14641464

14651465
int buf[1024];
14661466

1467-
auto lA = ctx.logical_token();
1468-
auto lB = ctx.logical_token();
1467+
auto lA = ctx.token();
1468+
auto lB = ctx.token();
14691469
auto lC = ctx.logical_data(buf);
14701470

14711471
// with all arguments
@@ -1483,6 +1483,26 @@ UNITTEST("logical token elision")
14831483
ctx.finalize();
14841484
};
14851485

1486+
// Use the token type shorthand
1487+
UNITTEST("token vector")
1488+
{
1489+
context ctx;
1490+
1491+
::std::vector<token> tokens(4);
1492+
1493+
for (size_t i = 0; i < 4; i++)
1494+
{
1495+
tokens[i] = ctx.token();
1496+
}
1497+
1498+
ctx.task(tokens[0].write())->*[](cudaStream_t) {};
1499+
ctx.task(tokens[0].read(), tokens[1].write())->*[](cudaStream_t) {};
1500+
ctx.task(tokens[0].read(), tokens[2].write())->*[](cudaStream_t) {};
1501+
ctx.task(tokens[1].read(), tokens[2].read(), tokens[3].write())->*[](cudaStream_t) {};
1502+
1503+
ctx.finalize();
1504+
};
1505+
14861506
#endif // UNITTESTED_FILE
14871507

14881508
} // end namespace cuda::experimental::stf

cudax/include/cuda/experimental/__stf/internal/logical_data.cuh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2439,6 +2439,9 @@ public:
24392439
///@}
24402440
};
24412441

2442+
// Shortcut type for the logical data produced by ctx.token()
2443+
using token = logical_data<void_interface>;
2444+
24422445
/**
24432446
* @brief Reclaims memory from allocated data instances.
24442447
*

cudax/test/stf/dot/sections_2.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ int main()
2222
// TODO (miscco): Make it work for windows
2323
#if !_CCCL_COMPILER(MSVC)
2424
context ctx;
25-
auto lA = ctx.logical_token().set_symbol("A");
26-
auto lB = ctx.logical_token().set_symbol("B");
27-
auto lC = ctx.logical_token().set_symbol("C");
25+
auto lA = ctx.token().set_symbol("A");
26+
auto lB = ctx.token().set_symbol("B");
27+
auto lC = ctx.token().set_symbol("C");
2828

2929
// Begin a top-level section named "foo"
3030
auto s_foo = ctx.dot_section("foo");

cudax/test/stf/freeze/token.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ int main()
2323
{
2424
context ctx;
2525

26-
auto ltoken = ctx.logical_token();
26+
auto ltoken = ctx.token();
2727

2828
auto ftoken = ctx.freeze(ltoken);
2929

cudax/test/stf/local_stf/legacy_to_stf.cu

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -146,12 +146,11 @@ void lib_call_generic(async_resources_handle& handle, cudaStream_t stream, doubl
146146
}
147147

148148
template <typename Ctx_t>
149-
void lib_call_logical_token(
150-
async_resources_handle& handle, cudaStream_t stream, double* d_ptrA, double* d_ptrB, size_t N)
149+
void lib_call_token(async_resources_handle& handle, cudaStream_t stream, double* d_ptrA, double* d_ptrB, size_t N)
151150
{
152151
Ctx_t ctx(stream, handle);
153-
auto lA = ctx.logical_token();
154-
auto lB = ctx.logical_token();
152+
auto lA = ctx.token();
153+
auto lB = ctx.token();
155154
ctx.task(lA.write())->*[=](cudaStream_t s) {
156155
initA<<<128, 32, 0, s>>>(d_ptrA, N);
157156
};
@@ -244,7 +243,7 @@ int main()
244243
nvtx_range r_token("logical token");
245244
for (size_t i = 0; i < NITER; i++)
246245
{
247-
lib_call_logical_token<context>(handle, stream, d_ptrA, d_ptrB, N);
246+
lib_call_token<context>(handle, stream, d_ptrA, d_ptrB, N);
248247
}
249248
cuda_safe_call(cudaStreamSynchronize(stream));
250249
r_token.end();

cudax/test/stf/parallel_for/parallel_for_host.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ int main()
1717
context ctx;
1818

1919
int nqpoints = 3;
20-
auto ltoken = ctx.logical_token();
20+
auto ltoken = ctx.token();
2121

2222
ctx.parallel_for(exec_place::host(), box(5), ltoken.read())->*[nqpoints] __host__(size_t, void_interface) {
2323
_CCCL_ASSERT(nqpoints == 3, "invalid value");

docs/cudax/stf.rst

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1781,7 +1781,7 @@ one may however already manage coherency or enforce dependencies.
17811781

17821782
- The "logical data freezing" mechanism ensures data availability while letting
17831783
the application take care of synchronization.
1784-
- Logical token makes it possible to enforce concurrent execution while
1784+
- Tokens make it possible to enforce concurrent execution while
17851785
letting the application manage data allocations and data transfers.
17861786

17871787
Freezing logical data
@@ -1856,35 +1856,35 @@ depend on the completion of the work in the streams used for any preceding
18561856
It is possible to retrieve the access mode used to freeze a logical data with
18571857
the ``get_access_mode()`` method of the ``frozen_logical_data`` object.
18581858

1859-
Logical token
1860-
^^^^^^^^^^^^^
1859+
Tokens
1860+
^^^^^^
18611861

1862-
A logical token is a specific type of logical data whose only purpose is to
1862+
A token is a specific type of logical data whose only purpose is to
18631863
automate synchronization, while letting the application manage the actual data.
18641864
This can, for example, be useful with user-provided buffers on a single device,
18651865
where no allocations or transfers are required, but where concurrent accesses
18661866
may occur.
18671867

1868-
A logical token internally relies on the ``void_interface`` data interface,
1868+
A token internally relies on the ``void_interface`` data interface,
18691869
which is specifically optimized to skip unnecessary stages in the cache
18701870
coherency protocol (e.g., data allocations or copying data). When appropriate,
1871-
using a logical token rather than a logical data with a full-fledged data
1871+
using a token rather than a logical data with a full-fledged data
18721872
interface therefore minimizes runtime overhead.
18731873

18741874
.. code:: cpp
18751875
1876-
auto token = ctx.logical_token();
1876+
auto token = ctx.token();
18771877
18781878
// A and B are assumed to be two other valid logical data
18791879
ctx.task(token.rw(), A.read(), B.rw())->*[](cudaStream_t stream, auto a, auto b)
18801880
{
18811881
...
18821882
};
18831883
1884-
The example above shows how to create a logical token and how to use it in a
1884+
The example above shows how to create a token and how to use it in a
18851885
task.
18861886

1887-
Since the logical token is only used for synchronization purposes, the
1887+
Since the token is only used for synchronization purposes, the
18881888
corresponding argument may be omitted in the lambda function passed as the
18891889
task’s implementation. Thus, the above task is equivalent to this code:
18901890

@@ -1897,11 +1897,15 @@ To avoid ambiguities, you must either consistently ignore every
18971897
unused. Eliding these token arguments is possible in the ``ctx.task`` and
18981898
``ctx.host_launch`` constructs.
18991899

1900-
Note that the token created by the ``logical_token`` method of the context
1900+
Note that the token created by the ``token`` method of the context
19011901
object is already valid, which means the first access can be either a ``read()``
19021902
or an ``rw()`` access. There is no need to set any content in the token
19031903
(unlike a logical data object created from a shape).
19041904

1905+
A token corresponds to a ``logical_data<void_interface>`` object, so that the
1906+
``token`` type serves as a short-hand for this type. ``ctx.token()`` thus
1907+
returns an object with a ``token`` type.
1908+
19051909
Tools
19061910
-----
19071911

@@ -2022,9 +2026,9 @@ illustrates how to add nested sections:
20222026
.. code:: c++
20232027

20242028
context ctx;
2025-
auto lA = ctx.logical_token().set_symbol("A");
2026-
auto lB = ctx.logical_token().set_symbol("B");
2027-
auto lC = ctx.logical_token().set_symbol("C");
2029+
auto lA = ctx.token().set_symbol("A");
2030+
auto lB = ctx.token().set_symbol("B");
2031+
auto lC = ctx.token().set_symbol("C");
20282032

20292033
// Begin a top-level section named "foo"
20302034
auto s_foo = ctx.dot_section("foo");

libcudacxx/include/cuda/__barrier/barrier_block_scope.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,7 @@ class barrier<thread_scope_block, _CUDA_VSTD::__empty_completion> : public __blo
359359
NV_ANY_TARGET,
360360
(return _CUDA_VSTD::__cccl_thread_poll_with_backoff(
361361
_CUDA_VSTD::__barrier_poll_tester_parity<barrier>(this, __phase_parity), __nanosec);))
362+
_CCCL_UNREACHABLE();
362363
}
363364

364365
public:

0 commit comments

Comments
 (0)