oleksandr-pavlyk
diff --git a/‎c/parallel/test/test_scan.cpp
Lines changed: 45 additions & 0 deletions b/‎c/parallel/test/test_scan.cpp
Lines changed: 45 additions & 0 deletions
diff --git a/‎c/parallel/test/test_util.h
Lines changed: 40 additions & 0 deletions b/‎c/parallel/test/test_util.h
Lines changed: 40 additions & 0 deletions
diff --git a/‎ci-overview.md
Lines changed: 19 additions & 12 deletions b/‎ci-overview.md
Lines changed: 19 additions & 12 deletions
diff --git a/‎ci/matrix.yaml
Lines changed: 7 additions & 0 deletions b/‎ci/matrix.yaml
Lines changed: 7 additions & 0 deletions
diff --git a/‎cudax/examples/stf/void_data_interface.cu
Lines changed: 1 addition & 1 deletion b/‎cudax/examples/stf/void_data_interface.cu
Lines changed: 1 addition & 1 deletion
diff --git a/‎cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
Lines changed: 1 addition & 1 deletion b/‎cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
Lines changed: 1 addition & 1 deletion
diff --git a/‎cudax/include/cuda/experimental/__stf/internal/context.cuh
Lines changed: 26 additions & 6 deletions b/‎cudax/include/cuda/experimental/__stf/internal/context.cuh
Lines changed: 26 additions & 6 deletions
diff --git a/‎cudax/include/cuda/experimental/__stf/internal/logical_data.cuh
Lines changed: 3 additions & 0 deletions b/‎cudax/include/cuda/experimental/__stf/internal/logical_data.cuh
Lines changed: 3 additions & 0 deletions
diff --git a/‎cudax/test/stf/dot/sections_2.cu
Lines changed: 3 additions & 3 deletions b/‎cudax/test/stf/dot/sections_2.cu
Lines changed: 3 additions & 3 deletions
diff --git a/‎cudax/test/stf/freeze/token.cu
Lines changed: 1 addition & 1 deletion b/‎cudax/test/stf/freeze/token.cu
Lines changed: 1 addition & 1 deletion
diff --git a/‎cudax/test/stf/local_stf/legacy_to_stf.cu
Lines changed: 4 additions & 5 deletions b/‎cudax/test/stf/local_stf/legacy_to_stf.cu
Lines changed: 4 additions & 5 deletions
diff --git a/‎cudax/test/stf/parallel_for/parallel_for_host.cu
Lines changed: 1 addition & 1 deletion b/‎cudax/test/stf/parallel_for/parallel_for_host.cu
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/cudax/stf.rst
Lines changed: 17 additions & 13 deletions b/‎docs/cudax/stf.rst
Lines changed: 17 additions & 13 deletions
diff --git a/‎libcudacxx/include/cuda/__barrier/barrier_block_scope.h
Lines changed: 1 addition & 0 deletions b/‎libcudacxx/include/cuda/__barrier/barrier_block_scope.h
Lines changed: 1 addition & 0 deletions
@@ -201,6 +201,51 @@ TEST_CASE("Scan works with output iterators", "[scan]")
   }
 }
 
+TEST_CASE("Scan works with reverse input iterators", "[scan]") // Feeds scan() through a reverse input iterator and compares against std::exclusive_scan over the reversed input.
+{
+  const std::size_t num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); // sizes under test: 1, 42, and four random draws starting at 1 << 12 (all >= 1)
+  operation_t op              = make_operation("op", get_reduce_op(get_type_info<int>().type));
+  iterator_t<int, random_access_iterator_state_t<int>> input_it =
+    make_reverse_iterator<int>(iterator_kind::INPUT, "int"); // no prefix/transform passed, so the defaults ("") are used
+  std::vector<int> input = generate<int>(num_items);
+  pointer_t<int> input_ptr(input);
+  input_it.state.data = input_ptr.ptr + num_items - 1; // start at the last element; the reverse iterator's advance subtracts the offset, walking toward the front
+  pointer_t<int> output_it(num_items);
+  value_t<int> init{42};
+
+  scan(input_it, output_it, num_items, op, init, false); // NOTE(review): trailing `false` presumably selects the exclusive variant (the reference below is std::exclusive_scan) - confirm against scan()
+
+  std::vector<int> expected(num_items);
+  std::exclusive_scan(input.rbegin(), input.rend(), expected.begin(), init.value); // reference result: exclusive scan of the input read back-to-front, seeded with init
+  if (num_items > 0) // always true for the sizes generated above
+  {
+    REQUIRE(expected == std::vector<int>(output_it)); // output_it is converted to std::vector<int> for the comparison
+  }
+}
+
+TEST_CASE("Scan works with reverse output iterators", "[scan]") // Writes scan() results through a reverse output iterator and checks that they land back-to-front in the output buffer.
+{
+  const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); // sizes under test: 1, 42, and four random draws starting at 1 << 12 (all >= 1)
+  operation_t op      = make_operation("op", get_reduce_op(get_type_info<int>().type));
+  iterator_t<int, random_access_iterator_state_t<int>> output_it =
+    make_reverse_iterator<int>(iterator_kind::OUTPUT, "int", "out"); // generated symbols are "out_advance" / "out_dereference"
+  const std::vector<int> input = generate<int>(num_items);
+  pointer_t<int> input_it(input);
+  pointer_t<int> inner_output_it(num_items); // the actual output storage; output_it only wraps a pointer into it
+  output_it.state.data = inner_output_it.ptr + num_items - 1; // start at the last slot; the reverse iterator's advance subtracts the offset, walking toward the front
+  value_t<int> init{42};
+
+  scan(input_it, output_it, num_items, op, init, false); // NOTE(review): trailing `false` presumably selects the exclusive variant (the reference below is std::exclusive_scan) - confirm against scan()
+
+  std::vector<int> expected(num_items);
+  std::exclusive_scan(input.begin(), input.end(), expected.rbegin(), init.value); // reference result is written through rbegin(), i.e. stored in reverse order
+
+  if (num_items > 0) // always true for the sizes generated above
+  {
+    REQUIRE(expected == std::vector<int>(inner_output_it)); // compare against the underlying buffer, converted to std::vector<int>
+  }
+}
+
 TEST_CASE("Scan works with input and output iterators", "[scan]")
 {
   const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16)));
 
@@ -586,6 +586,46 @@ make_constant_iterator(std::string value_type, std::string prefix = "")
   return make_iterator<ValueT, constant_iterator_state_t<ValueT>>(iterator_state, advance, dereference);
 }
 
+template <class ValueT> // Builds an iterator_t whose generated device code walks a raw pointer backwards (advance subtracts the offset).
+iterator_t<ValueT, random_access_iterator_state_t<ValueT>>
+make_reverse_iterator(iterator_kind kind, std::string value_type, std::string prefix = "", std::string transform = "") // NOTE: with the default empty prefix the generated symbols are "_advance" and "_dereference"
+{
+  std::string iterator_state = std::format("struct state_t {{ {0}* data; }};\n", value_type); // source text of the state struct: a single `value_type*` member named data
+
+  operation_t advance = { // operation named "<prefix>_advance"
+    std::format("{0}_advance", prefix),
+    std::format("extern \"C\" __device__ void {0}_advance(state_t* state, unsigned long long offset) {{\n"
+                "  state->data -= offset;\n" // reverse traversal: advancing by `offset` moves the pointer toward lower addresses
+                "}}",
+                prefix)};
+
+  std::string dereference_method;
+  if (kind == iterator_kind::INPUT) // input flavor: dereference returns the pointed-to value, with `transform` appended textually to the expression
+  {
+    dereference_method = std::format(
+      "extern \"C\" __device__ {1} {0}_dereference(state_t* state) {{\n"
+      "  return (*state->data){2};\n"
+      "}}",
+      prefix,
+      value_type,
+      transform);
+  }
+  else // any other kind: dereference takes a value x and stores `x` (with `transform` appended textually) through the pointer
+  {
+    dereference_method = std::format(
+      "extern \"C\" __device__ void {0}_dereference(state_t* state, {1} x) {{\n"
+      "  *state->data = x{2};\n"
+      "}}",
+      prefix,
+      value_type,
+      transform);
+  }
+
+  operation_t dereference = {std::format("{0}_dereference", prefix), dereference_method}; // operation named "<prefix>_dereference" carrying whichever source was selected above
+
+  return make_iterator<ValueT, random_access_iterator_state_t<ValueT>>(iterator_state, advance, dereference); // the caller is expected to set state.data afterwards (the scan tests point it at the last element)
+}
+
 template <class T>
 struct value_t
 {
 
@@ -26,30 +26,37 @@ The results of every job in the CI pipeline are summarized on the bottom of the
 
 ### Special CI Commands
 
-Special commands are provided that can be included in commit messages to direct the CI pipeline execution:
+Special commands can be included in the most recent commit message to control which jobs are spawned for the next pull-request CI run.
+These commands can be combined with the [override matrix](#temporarily-overriding-the-pull-request-matrix) for even more fine-grained control.
 
-- `[skip ci]`: Skips the entire CI pipeline. Useful for documentation changes or others that don't require CI validation.
+- `[skip-<component>]`: Skips a subset of the CI jobs. These commands will block the PR from being merged while present in the last commit message of the branch. Recognized components are:
+  - `[skip-matrix]`: Skip all build and test jobs specified in `ci/matrix.yaml`.
+  - `[skip-vdc]`: Skip all "Validate Devcontainer" jobs.
+  - `[skip-docs]`: Skip the documentation verification build.
+  - `[skip-rapids]`: Skip all RAPIDS canary builds.
+  - `[skip-matx]`: Skip all MatX canary builds.
+  - **Example:** `git commit -m "Fix RAPIDS failures [skip-matrix][skip-vdc][skip-docs][skip-matx]"`
 
-   - **Example:** `git commit -m "[skip ci] Update README."`
-
-- `[skip-tests]`: Skips CI jobs that execute tests, but runs all other jobs. Useful to avoid time-consuming tests when changes are unlikely to affect them.
-- `[all-projects]`: CI normally skips projects that don't have changes in themselves or their dependencies. This forces all projects to build.
 - `[workflow:<workflow>]`:  Execute jobs from the named workflow. Example: `[workflow:nightly]` runs all jobs defined in `matrix.yaml`'s `workflows.nightly` list.
 
-Use these commands judiciously. While they offer flexibility, they should be used appropriately to maintain the codebase's integrity and quality.
-
 ### Temporarily Overriding the Pull Request Matrix
 
-If a workflow named `override` exists in the matrix.yaml file, this matrix will be used for pull requests instead of the `pull_request` matrix.
-This is useful for reducing resource usage when launching many CI workflows from a PR (for example, while testing CI features).
-The overridden CI job will be marked as a failure until the override is removed.
+If a non-empty workflow named `override` exists in the `ci/matrix.yaml` file, this matrix will be used for pull requests instead of the `pull_request` matrix.
+This is useful for reducing resource usage and turn-around time when a full run is not needed, for example:
+
+- Testing changes that only apply to a specific compiler, OS, etc.
+- Testing fixes to nightly CI failures by only running the nightly jobs that failed.
+- Testing changes to CI infrastructure that only require a few jobs to run.
+
+The PR will be blocked from merging until the override matrix is removed, ensuring that the full CI suite runs before landing the PR.
+The override matrix can be combined with the `[skip-<...>]` commands detailed in [Special CI Commands](#special-ci-commands) to reduce unnecessary resource usage even further.
 
 Example:
 
 ```
 workflows:
   override:
-    - {jobs: ['test'], std: 17, ctk: *ctk_curr, cxx: [*gcc12, *llvm16, *msvc2022]}
+    - {jobs: ['build'], project: 'cudax', ctk: '12.0', std: 'all', cxx: ['msvc14.39', 'gcc10', 'clang14']}
   pull_request:
     - <...>
 ```
 
@@ -102,6 +102,13 @@ workflows:
     - {jobs: ['test'],  project: 'cudax', ctk: [        'curr'], std: 'all', cxx: ['gcc']  ,   gpu: 'rtx2080'}
     - {jobs: ['test'],  project: 'cudax', ctk: ['12.0'        ], std: 'all', cxx: ['clang14'], gpu: 'rtx2080'}
     - {jobs: ['test'],  project: 'cudax', ctk: [        'curr'], std: 'all', cxx: ['clang'],   gpu: 'rtx2080'}
+    # Python and c/parallel jobs:
+    - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], gpu: 'rtx2080'}
+    # cccl-infra:
+    - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14'], gpu: 'rtx2080'}
+    - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc',   'clang'],   gpu: 'rtx2080'}
+    # NVHPC stdpar smoke tests
+    - {jobs: ['build'], project: 'stdpar', std: 'all', ctk: '12.8', cxx: 'nvhpc', cpu: ['amd64', 'arm64']}
 
   # Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows.
   exclude:
 
@@ -31,7 +31,7 @@ int main()
   void_interface sync;
   auto token2 = ctx.logical_data(sync);
 
-  auto token3 = ctx.logical_token();
+  auto token3 = ctx.token();
   ctx.task(token2.write(), token.read())->*[](cudaStream_t, auto, auto) {
 
   };
 
@@ -952,7 +952,7 @@ public:
     return logical_data(make_slice(p, n), mv(dplace));
   }
 
-  auto logical_token()
+  auto token()
   {
     // We do not use a shape because we want the first rw() access to succeed
     // without an initial write()
 
@@ -362,12 +362,12 @@ public:
     }
   }
 
-  auto logical_token()
+  auto token() // Forwards to token() of whichever alternative `payload` currently holds and returns its result.
   {
     _CCCL_ASSERT(payload.index() != ::std::variant_npos, "Context is not initialized"); // payload must hold an alternative before dispatching
     return ::std::visit(
       [&](auto& self) {
-        return self.logical_token();
+        return self.token();
       },
       payload);
   }
@@ -1457,15 +1457,15 @@ UNITTEST("cuda stream place multi-gpu")
   ctx.finalize();
 };
 
-// Ensure we can skip logical tokens
-UNITTEST("logical token elision")
+// Ensure we can skip tokens
+UNITTEST("token elision")
 {
   context ctx;
 
   int buf[1024];
 
-  auto lA = ctx.logical_token();
-  auto lB = ctx.logical_token();
+  auto lA = ctx.token();
+  auto lB = ctx.token();
   auto lC = ctx.logical_data(buf);
 
   // with all arguments
@@ -1483,6 +1483,26 @@ UNITTEST("logical token elision")
   ctx.finalize();
 };
 
+// Use the `token` type shorthand to store tokens in a ::std::vector and use them as task dependencies
+UNITTEST("token vector")
+{
+  context ctx;
+
+  ::std::vector<token> tokens(4); // four default-constructed tokens, each assigned in the loop below
+
+  for (size_t i = 0; i < 4; i++)
+  {
+    tokens[i] = ctx.token(); // replace each placeholder with a token created by the context
+  }
+
+  ctx.task(tokens[0].write())->*[](cudaStream_t) {}; // writes token 0; the lambdas take only the stream, no token arguments
+  ctx.task(tokens[0].read(), tokens[1].write())->*[](cudaStream_t) {}; // reads token 0, writes token 1
+  ctx.task(tokens[0].read(), tokens[2].write())->*[](cudaStream_t) {}; // reads token 0, writes token 2
+  ctx.task(tokens[1].read(), tokens[2].read(), tokens[3].write())->*[](cudaStream_t) {}; // reads tokens 1 and 2, writes token 3
+
+  ctx.finalize();
+};
+
 #endif // UNITTESTED_FILE
 
 } // end namespace cuda::experimental::stf
@@ -2439,6 +2439,9 @@ public:
   ///@}
 };
 
+// Shortcut type for the logical data produced by ctx.token(); lets callers name the type explicitly (e.g. as a container element type)
+using token = logical_data<void_interface>;
+
 /**
  * @brief Reclaims memory from allocated data instances.
  *
 
@@ -22,9 +22,9 @@ int main()
 // TODO (miscco): Make it work for windows
 #if !_CCCL_COMPILER(MSVC)
   context ctx;
-  auto lA = ctx.logical_token().set_symbol("A");
-  auto lB = ctx.logical_token().set_symbol("B");
-  auto lC = ctx.logical_token().set_symbol("C");
+  auto lA = ctx.token().set_symbol("A");
+  auto lB = ctx.token().set_symbol("B");
+  auto lC = ctx.token().set_symbol("C");
 
   // Begin a top-level section named "foo"
   auto s_foo = ctx.dot_section("foo");
 
@@ -23,7 +23,7 @@ int main()
 {
   context ctx;
 
-  auto ltoken = ctx.logical_token();
+  auto ltoken = ctx.token();
 
   auto ftoken = ctx.freeze(ltoken);
 
 
@@ -146,12 +146,11 @@ void lib_call_generic(async_resources_handle& handle, cudaStream_t stream, doubl
 }
 
 template <typename Ctx_t>
-void lib_call_logical_token(
-  async_resources_handle& handle, cudaStream_t stream, double* d_ptrA, double* d_ptrB, size_t N)
+void lib_call_token(async_resources_handle& handle, cudaStream_t stream, double* d_ptrA, double* d_ptrB, size_t N)
 {
   Ctx_t ctx(stream, handle);
-  auto lA = ctx.logical_token();
-  auto lB = ctx.logical_token();
+  auto lA = ctx.token();
+  auto lB = ctx.token();
   ctx.task(lA.write())->*[=](cudaStream_t s) {
     initA<<<128, 32, 0, s>>>(d_ptrA, N);
   };
@@ -244,7 +243,7 @@ int main()
   nvtx_range r_token("logical token");
   for (size_t i = 0; i < NITER; i++)
   {
-    lib_call_logical_token<context>(handle, stream, d_ptrA, d_ptrB, N);
+    lib_call_token<context>(handle, stream, d_ptrA, d_ptrB, N);
   }
   cuda_safe_call(cudaStreamSynchronize(stream));
   r_token.end();
 
@@ -17,7 +17,7 @@ int main()
   context ctx;
 
   int nqpoints = 3;
-  auto ltoken  = ctx.logical_token();
+  auto ltoken  = ctx.token();
 
   ctx.parallel_for(exec_place::host(), box(5), ltoken.read())->*[nqpoints] __host__(size_t, void_interface) {
     _CCCL_ASSERT(nqpoints == 3, "invalid value");
 
@@ -1781,7 +1781,7 @@ one may however already manage coherency or enforce dependencies.
 
 - The "logical data freezing" mechanism ensures data availability while letting
   the application take care of synchronization.
-- Logical token makes it possible to enforce concurrent execution while
+- Tokens make it possible to enforce concurrent execution while
   letting the application manage data allocations and data transfers.
 
 Freezing logical data
@@ -1856,35 +1856,35 @@ depend on the completion of the work in the streams used for any preceding
 It is possible to retrieve the access mode used to freeze a logical data with
 the ``get_access_mode()`` method of the ``frozen_logical_data`` object.
 
-Logical token
-^^^^^^^^^^^^^
+Tokens
+^^^^^^
 
-A logical token is a specific type of logical data whose only purpose is to
+A token is a specific type of logical data whose only purpose is to
 automate synchronization, while letting the application manage the actual data.
 This can, for example, be useful with user-provided buffers on a single device,
 where no allocations or transfers are required, but where concurrent accesses
 may occur.
 
-A logical token internally relies on the ``void_interface`` data interface,
+A token internally relies on the ``void_interface`` data interface,
 which is specifically optimized to skip unnecessary stages in the cache
 coherency protocol (e.g., data allocations or copying data). When appropriate,
-using a logical token rather than a logical data with a full-fledged data
+using a token rather than a logical data with a full-fledged data
 interface therefore minimizes runtime overhead.
 
 .. code:: cpp
 
-    auto token = ctx.logical_token();
+    auto token = ctx.token();
 
     // A and B are assumed to be two other valid logical data
     ctx.task(token.rw(), A.read(), B.rw())->*[](cudaStream_t stream, auto a, auto b)
     {
         ...
     };
 
-The example above shows how to create a logical token and how to use it in a
+The example above shows how to create a token and how to use it in a
 task.
 
-Since the logical token is only used for synchronization purposes, the
+Since the token is only used for synchronization purposes, the
 corresponding argument may be omitted in the lambda function passed as the
 task’s implementation. Thus, the above task is equivalent to this code:
 
@@ -1897,11 +1897,15 @@ To avoid ambiguities, you must either consistently ignore every
 unused. Eliding these token arguments is possible in the ``ctx.task`` and
 ``ctx.host_launch`` constructs.
 
-Note that the token created by the ``logical_token`` method of the context
+Note that the token created by the ``token`` method of the context
 object is already valid, which means the first access can be either a ``read()``
 or an ``rw()`` access. There is no need to set any content in the token
 (unlike a logical data object created from a shape).
 
+A token is a ``logical_data<void_interface>`` object, and the ``token`` type
+is a shorthand for that type. ``ctx.token()`` therefore returns an object of
+type ``token``.
+
 Tools
 -----
 
@@ -2022,9 +2026,9 @@ illustrates how to add nested sections:
 .. code:: c++
 
     context ctx;
-    auto lA = ctx.logical_token().set_symbol("A");
-    auto lB = ctx.logical_token().set_symbol("B");
-    auto lC = ctx.logical_token().set_symbol("C");
+    auto lA = ctx.token().set_symbol("A");
+    auto lB = ctx.token().set_symbol("B");
+    auto lC = ctx.token().set_symbol("C");
 
     // Begin a top-level section named "foo"
     auto s_foo = ctx.dot_section("foo");
 
@@ -359,6 +359,7 @@ class barrier<thread_scope_block, _CUDA_VSTD::__empty_completion> : public __blo
       NV_ANY_TARGET,
       (return _CUDA_VSTD::__cccl_thread_poll_with_backoff(
                 _CUDA_VSTD::__barrier_poll_tester_parity<barrier>(this, __phase_parity), __nanosec);))
+    _CCCL_UNREACHABLE();
   }
 
 public:
Original file line number	Diff line number	Diff line change
`@@ -952,7 +952,7 @@ public:`
`952`	`952`	`return logical_data(make_slice(p, n), mv(dplace));`
`953`	`953`	`}`
`954`	`954`
`955`		`- auto logical_token()`
	`955`	`+ auto token()`
`956`	`956`	`{`
`957`	`957`	`// We do not use a shape because we want the first rw() access to succeed`
`958`	`958`	`// without an initial write()`
Original file line number	Diff line number	Diff line change
`@@ -2439,6 +2439,9 @@ public:`
`2439`	`2439`	`///@}`
`2440`	`2440`	`};`
`2441`	`2441`
	`2442`	`+// Shortcut type for the logical data produced by ctx.token()`
	`2443`	`+using token = logical_data<void_interface>;`
	`2444`	`+`
`2442`	`2445`	`/**`
`2443`	`2446`	`* @brief Reclaims memory from allocated data instances.`
`2444`	`2447`	`*`
Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,7 @@ int main()`
`23`	`23`	`{`
`24`	`24`	`context ctx;`
`25`	`25`
`26`		`- auto ltoken = ctx.logical_token();`
	`26`	`+ auto ltoken = ctx.token();`
`27`	`27`
`28`	`28`	`auto ftoken = ctx.freeze(ltoken);`
`29`	`29`
Original file line number	Diff line number	Diff line change
`@@ -146,12 +146,11 @@ void lib_call_generic(async_resources_handle& handle, cudaStream_t stream, doubl`
`146`	`146`	`}`
`147`	`147`
`148`	`148`	`template <typename Ctx_t>`
`149`		`-void lib_call_logical_token(`
`150`		`- async_resources_handle& handle, cudaStream_t stream, double* d_ptrA, double* d_ptrB, size_t N)`
	`149`	`+void lib_call_token(async_resources_handle& handle, cudaStream_t stream, double* d_ptrA, double* d_ptrB, size_t N)`
`151`	`150`	`{`
`152`	`151`	`Ctx_t ctx(stream, handle);`
`153`		`- auto lA = ctx.logical_token();`
`154`		`- auto lB = ctx.logical_token();`
	`152`	`+ auto lA = ctx.token();`
	`153`	`+ auto lB = ctx.token();`
`155`	`154`	`ctx.task(lA.write())->*[=](cudaStream_t s) {`
`156`	`155`	`initA<<<128, 32, 0, s>>>(d_ptrA, N);`
`157`	`156`	`};`
`@@ -244,7 +243,7 @@ int main()`
`244`	`243`	`nvtx_range r_token("logical token");`
`245`	`244`	`for (size_t i = 0; i < NITER; i++)`
`246`	`245`	`{`
`247`		`- lib_call_logical_token<context>(handle, stream, d_ptrA, d_ptrB, N);`
	`246`	`+ lib_call_token<context>(handle, stream, d_ptrA, d_ptrB, N);`
`248`	`247`	`}`
`249`	`248`	`cuda_safe_call(cudaStreamSynchronize(stream));`
`250`	`249`	`r_token.end();`
Original file line number	Diff line number	Diff line change
`@@ -359,6 +359,7 @@ class barrier<thread_scope_block, _CUDA_VSTD::__empty_completion> : public __blo`
`359`	`359`	`NV_ANY_TARGET,`
`360`	`360`	`(return _CUDA_VSTD::__cccl_thread_poll_with_backoff(`
`361`	`361`	`_CUDA_VSTD::__barrier_poll_tester_parity<barrier>(this, __phase_parity), __nanosec);))`
	`362`	`+ _CCCL_UNREACHABLE();`
`362`	`363`	`}`
`363`	`364`
`364`	`365`	`public:`