From a40828341ab55de9f732f52d6508f6a6b56c2974 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Thu, 15 May 2025 15:56:47 -0700
Subject: [PATCH 01/12] use cuda::std::invoke_result_t

---
 cub/cub/block/radix_rank_sort_operations.cuh             | 4 +++-
 cub/cub/detail/type_traits.cuh                           | 3 ---
 cub/cub/device/dispatch/dispatch_adjacent_difference.cuh | 4 +++-
 cub/cub/device/dispatch/dispatch_reduce.cuh              | 3 ++-
 4 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/cub/cub/block/radix_rank_sort_operations.cuh b/cub/cub/block/radix_rank_sort_operations.cuh
index f213a151e76..31217c46df8 100644
--- a/cub/cub/block/radix_rank_sort_operations.cuh
+++ b/cub/cub/block/radix_rank_sort_operations.cuh
@@ -50,6 +50,7 @@
 #include <thrust/type_traits/integer_sequence.h>
 
 #include <cuda/bit>
+#include <cuda/functional>
 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__algorithm/min.h>
 #include <cuda/std/cstdint>
@@ -228,7 +229,8 @@ struct is_tuple_of_references_to_fundamental_types_t< //
 {};
 
 template <class KeyT, class DecomposerT>
-using decomposer_check_t = is_tuple_of_references_to_fundamental_types_t<invoke_result_t<DecomposerT, KeyT&>>;
+using decomposer_check_t =
+  is_tuple_of_references_to_fundamental_types_t<_CUDA_VSTD::invoke_result_t<DecomposerT, KeyT&>>;
 
 template <class T>
 struct bit_ordered_conversion_policy_t
diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh
index 2362e42990a..f83a55cebc4 100644
--- a/cub/cub/detail/type_traits.cuh
+++ b/cub/cub/detail/type_traits.cuh
@@ -57,9 +57,6 @@ CUB_NAMESPACE_BEGIN
 namespace detail
 {
 
-template <typename Invokable, typename... Args>
-using invoke_result_t = _CUDA_VSTD::invoke_result_t<Invokable, Args...>;
-
 template <typename T, typename... TArgs>
 inline constexpr bool is_one_of_v = (_CCCL_TRAIT(_CUDA_VSTD::is_same, T, TArgs) || ...);
 
diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
index 31d85d7b140..56064ba2076 100644
--- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -48,6 +48,8 @@
 
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
+#include <cuda/std/functional>
+
 CUB_NAMESPACE_BEGIN
 
 namespace detail::adjacent_difference
@@ -80,7 +82,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel(
 
   // It is OK to introspect the return type or parameter types of the
   // `operator()` function of `__device__` extended lambda within device code.
-  using OutputT = invoke_result_t<DifferenceOpT, InputT, InputT>;
+  using OutputT = _CUDA_VSTD::invoke_result_t<DifferenceOpT, InputT, InputT>;
 
   using Agent =
     AgentDifference<ActivePolicyT,
diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
index c850fd95301..714d3f07bd0 100644
--- a/cub/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -59,6 +59,7 @@
 #include <cub/util_type.cuh> // for cub::detail::non_void_value_t, cub::detail::value_t
 
 #include <cuda/std/functional>
+#include <cuda/std/iterator>
 
 CUB_NAMESPACE_BEGIN
 
@@ -572,7 +573,7 @@ template <
   typename InitT,
   typename AccumT =
     ::cuda::std::__accumulator_t<ReductionOpT,
-                                 cub::detail::invoke_result_t<TransformOpT, cub::detail::it_value_t<InputIteratorT>>,
+                                 _CUDA_VSTD::invoke_result_t<TransformOpT, _CUDA_VSTD::iter_value_t<InputIteratorT>>,
                                  InitT>,
   typename PolicyHub    = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>,
   typename KernelSource = detail::reduce::DeviceReduceKernelSource<

From aaab0e70cf60f5467bb87bbb71241040ac0383e6 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Thu, 15 May 2025 15:58:00 -0700
Subject: [PATCH 02/12] simplify array_utils.cuh namespaces

---
 cub/cub/detail/array_utils.cuh | 46 +++++++++++++++-------------------
 1 file changed, 20 insertions(+), 26 deletions(-)

diff --git a/cub/cub/detail/array_utils.cuh b/cub/cub/detail/array_utils.cuh
index cd4115a081a..2c93dfe49ba 100644
--- a/cub/cub/detail/array_utils.cuh
+++ b/cub/cub/detail/array_utils.cuh
@@ -1,8 +1,8 @@
-/******************************************************************************
- * Copyright (c) 2011-2024, NVIDIA CORPORATION.  All rights reserved.
+/***********************************************************************************************************************
+ * Copyright (c) 2011-2025, NVIDIA CORPORATION.  All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
+ * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+ * following conditions are met:
  *     * Redistributions of source code must retain the above copyright
  *       notice, this list of conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright
@@ -12,18 +12,15 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- ******************************************************************************/
+ **********************************************************************************************************************/
 
 #pragma once
 
@@ -42,13 +39,11 @@
 
 #include <cuda/std/array> // array
 #include <cuda/std/cstddef> // size_t
-#include <cuda/std/iterator> // cuda::std::iter_value_t
+#include <cuda/std/iterator> // _CUDA_VSTD::iter_value_t
 #include <cuda/std/type_traits> // _If
 #include <cuda/std/utility> // index_sequence
 
 CUB_NAMESPACE_BEGIN
-
-/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
 namespace detail
 {
 
@@ -58,25 +53,24 @@ namespace detail
  * Generic Array-like to Array Conversion
  **********************************************************************************************************************/
 
-template <typename CastType, typename Input, ::cuda::std::size_t... i>
-[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::array<CastType, cub::detail::static_size_v<Input>>
-to_array_impl(const Input& input, ::cuda::std::index_sequence<i...>)
+template <typename CastType, typename Input, _CUDA_VSTD::size_t... i>
+[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE _CUDA_VSTD::array<CastType, static_size_v<Input>>
+to_array_impl(const Input& input, _CUDA_VSTD::index_sequence<i...>)
 {
-  using ArrayType = ::cuda::std::array<CastType, static_size_v<Input>>;
+  using ArrayType = _CUDA_VSTD::array<CastType, static_size_v<Input>>;
   return ArrayType{static_cast<CastType>(input[i])...};
 }
 
 template <typename CastType = void, typename Input>
-[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::array<CastType, cub::detail::static_size_v<Input>>
+[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE _CUDA_VSTD::array<CastType, static_size_v<Input>>
 to_array(const Input& input)
 {
-  using InputType = cuda::std::iter_value_t<Input>;
-  using CastType1 = ::cuda::std::_If<::cuda::std::is_same_v<CastType, void>, InputType, CastType>;
-  return to_array_impl<CastType1>(input, ::cuda::std::make_index_sequence<static_size_v<Input>>{});
+  using InputType = _CUDA_VSTD::iter_value_t<Input>;
+  using CastType1 = _CUDA_VSTD::_If<_CUDA_VSTD::is_same_v<CastType, void>, InputType, CastType>;
+  return to_array_impl<CastType1>(input, _CUDA_VSTD::make_index_sequence<static_size_v<Input>>{});
 }
 
 #endif // !_CCCL_DOXYGEN_INVOKED
 
 } // namespace detail
-
 CUB_NAMESPACE_END

From 5cd48d3428eb50d107fa2c88b48bc2261d3454a3 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Thu, 15 May 2025 15:58:32 -0700
Subject: [PATCH 03/12] move unsafe_bitcast into an independent file

---
 cub/cub/detail/unsafe_bitcast.cuh | 56 +++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 cub/cub/detail/unsafe_bitcast.cuh

diff --git a/cub/cub/detail/unsafe_bitcast.cuh b/cub/cub/detail/unsafe_bitcast.cuh
new file mode 100644
index 00000000000..78f5a8432ad
--- /dev/null
+++ b/cub/cub/detail/unsafe_bitcast.cuh
@@ -0,0 +1,56 @@
+/***********************************************************************************************************************
+ * Copyright (c) 2011-2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+ * following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **********************************************************************************************************************/
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+CUB_NAMESPACE_BEGIN
+namespace detail
+{
+
+#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
+
+// NOTE: bit_cast cannot be always used because __half, __nv_bfloat16, etc. are not trivially copyable
+template <typename Output, typename Input>
+[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE Output unsafe_bitcast(const Input& input)
+{
+  Output output;
+  static_assert(sizeof(input) == sizeof(output), "wrong size");
+  ::memcpy(static_cast<void*>(&output), static_cast<const void*>(&input), sizeof(input));
+  return output;
+}
+
+#endif // !_CCCL_DOXYGEN_INVOKED
+
+} // namespace detail
+CUB_NAMESPACE_END

From 347f7d151ce2d217dd66917c51c61c2a79adc07b Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Thu, 15 May 2025 15:59:34 -0700
Subject: [PATCH 04/12] use more precise names for cuda::std:: operators

---
 cub/cub/thread/thread_operators.cuh | 248 +++++++++++++++++++++++-----
 cub/cub/thread/thread_reduce.cuh    |  85 ++++------
 2 files changed, 239 insertions(+), 94 deletions(-)

diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh
index e8a6c165c94..0dd7bc01e7c 100644
--- a/cub/cub/thread/thread_operators.cuh
+++ b/cub/cub/thread/thread_operators.cuh
@@ -1,9 +1,9 @@
-/******************************************************************************
+/***********************************************************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2025, NVIDIA CORPORATION.  All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
+ * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+ * following conditions are met:
  *     * Redistributions of source code must retain the above copyright
  *       notice, this list of conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright
@@ -13,18 +13,15 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- ******************************************************************************/
+ **********************************************************************************************************************/
 
 /**
  * @file
@@ -39,6 +36,10 @@
 
 #include <cub/config.cuh>
 
+#include "cuda/__functional/maximum.h"
+#include "cuda/std/__functional/operations.h"
+#include "cuda/std/__type_traits/always_false.h"
+
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
 #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -52,7 +53,7 @@
 #include <cuda/functional> // cuda::maximum, cuda::minimum
 #include <cuda/std/cstdint> // cuda::std::uint32_t
 #include <cuda/std/functional> // cuda::std::plus
-#include <cuda/std/utility> // cuda::std::pair
+#include <cuda/std/type_traits> // is_same_v
 
 CUB_NAMESPACE_BEGIN
 
@@ -406,37 +407,196 @@ struct ReduceByKeyOp
 namespace detail
 {
 
-template <typename ReductionOp, typename T>
-inline constexpr bool is_cuda_std_min_max_v =
-  cub::detail::is_one_of_v<ReductionOp, //
-                           ::cuda::minimum<>,
-                           ::cuda::minimum<T>,
-                           ::cuda::maximum<>,
-                           ::cuda::maximum<T>>;
-
-template <typename ReductionOp, typename T>
-inline constexpr bool is_cuda_std_plus_mul_v =
-  cub::detail::is_one_of_v<ReductionOp, //
-                           _CUDA_VSTD::plus<>,
-                           _CUDA_VSTD::plus<T>,
-                           _CUDA_VSTD::multiplies<>,
-                           _CUDA_VSTD::multiplies<T>>;
-
-template <typename ReductionOp, typename T>
+//----------------------------------------------------------------------------------------------------------------------
+// Predefined operators
+
+template <typename, typename = void>
+inline constexpr bool is_cuda_std_plus_v = false;
+
+template <typename T>
+inline constexpr bool is_cuda_std_plus_v<_CUDA_VSTD::plus<T>, void> = true;
+
+template <typename T>
+inline constexpr bool is_cuda_std_plus_v<_CUDA_VSTD::plus<T>, T> = true;
+
+template <typename T>
+inline constexpr bool is_cuda_std_plus_v<_CUDA_VSTD::plus<>, T> = true;
+
+template <>
+inline constexpr bool is_cuda_std_plus_v<_CUDA_VSTD::plus<>, void> = true;
+
+template <typename, typename = void>
+inline constexpr bool is_cuda_std_mul_v = false;
+
+template <typename T>
+inline constexpr bool is_cuda_std_mul_v<_CUDA_VSTD::multiplies<T>, void> = true;
+
+template <typename T>
+inline constexpr bool is_cuda_std_mul_v<_CUDA_VSTD::multiplies<T>, T> = true;
+
+template <typename T>
+inline constexpr bool is_cuda_std_mul_v<_CUDA_VSTD::multiplies<>, T> = true;
+
+template <>
+inline constexpr bool is_cuda_std_mul_v<_CUDA_VSTD::multiplies<>, void> = true;
+
+template <typename, typename = void>
+inline constexpr bool is_cuda_maximum_v = false;
+
+template <typename T>
+inline constexpr bool is_cuda_maximum_v<::cuda::maximum<T>, void> = true;
+
+template <typename T>
+inline constexpr bool is_cuda_maximum_v<::cuda::maximum<T>, T> = true;
+
+template <typename T>
+inline constexpr bool is_cuda_maximum_v<::cuda::maximum<>, T> = true;
+
+template <>
+inline constexpr bool is_cuda_maximum_v<::cuda::maximum<>, void> = true;
+
+template <typename, typename = void>
+inline constexpr bool is_cuda_minimum_v = false;
+
+template <typename T>
+inline constexpr bool is_cuda_minimum_v<::cuda::minimum<T>, void> = true;
+
+template <typename T>
+inline constexpr bool is_cuda_minimum_v<::cuda::minimum<T>, T> = true;
+
+template <typename T>
+inline constexpr bool is_cuda_minimum_v<::cuda::minimum<>, T> = true;
+
+template <>
+inline constexpr bool is_cuda_minimum_v<::cuda::minimum<>, void> = true;
+
+template <typename, typename = void>
+inline constexpr bool is_cuda_std_bit_and_v = false;
+
+template <typename T>
+inline constexpr bool is_cuda_std_bit_and_v<_CUDA_VSTD::bit_and<T>, void> = true;
+
+template <typename T>
+inline constexpr bool is_cuda_std_bit_and_v<_CUDA_VSTD::bit_and<T>, T> = true;
+
+template <typename T>
+inline constexpr bool is_cuda_std_bit_and_v<_CUDA_VSTD::bit_and<>, T> = true;
+
+template <>
+inline constexpr bool is_cuda_std_bit_and_v<_CUDA_VSTD::bit_and<>, void> = true;
+
+template <typename, typename = void>
+inline constexpr bool is_cuda_std_bit_or_v = false;
+
+template <typename T>
+inline constexpr bool is_cuda_std_bit_or_v<_CUDA_VSTD::bit_or<T>, void> = true;
+
+template <typename T>
+inline constexpr bool is_cuda_std_bit_or_v<_CUDA_VSTD::bit_or<T>, T> = true;
+
+template <typename T>
+inline constexpr bool is_cuda_std_bit_or_v<_CUDA_VSTD::bit_or<>, T> = true;
+
+template <>
+inline constexpr bool is_cuda_std_bit_or_v<_CUDA_VSTD::bit_or<>, void> = true;
+
+template <typename, typename = void>
+inline constexpr bool is_cuda_std_bit_xor_v = false;
+
+template <typename T>
+inline constexpr bool is_cuda_std_bit_xor_v<_CUDA_VSTD::bit_xor<T>, void> = true;
+
+template <typename T>
+inline constexpr bool is_cuda_std_bit_xor_v<_CUDA_VSTD::bit_xor<T>, T> = true;
+
+template <typename T>
+inline constexpr bool is_cuda_std_bit_xor_v<_CUDA_VSTD::bit_xor<>, T> = true;
+
+template <>
+inline constexpr bool is_cuda_std_bit_xor_v<_CUDA_VSTD::bit_xor<>, void> = true;
+
+template <typename Op, typename T = void>
+inline constexpr bool is_cuda_minimum_maximum_v = is_cuda_maximum_v<Op, T> || is_cuda_minimum_v<Op, T>;
+
+template <typename Op, typename T = void>
+inline constexpr bool is_cuda_std_plus_mul_v = is_cuda_std_plus_v<Op, T> || is_cuda_std_mul_v<Op, T>;
+
+template <typename Op, typename T = void>
 inline constexpr bool is_cuda_std_bitwise_v =
-  cub::detail::is_one_of_v<ReductionOp,
-                           _CUDA_VSTD::bit_and<>,
-                           _CUDA_VSTD::bit_and<T>,
-                           _CUDA_VSTD::bit_or<>,
-                           _CUDA_VSTD::bit_or<T>,
-                           _CUDA_VSTD::bit_xor<>,
-                           _CUDA_VSTD::bit_xor<T>>;
-
-template <typename ReductionOp, typename T>
-inline constexpr bool is_cuda_std_operator_v =
-  is_cuda_std_min_max_v<ReductionOp, T> || //
-  is_cuda_std_plus_mul_v<ReductionOp, T> || //
-  is_cuda_std_bitwise_v<ReductionOp, T>;
+  is_cuda_std_bit_and_v<Op, T> || is_cuda_std_bit_or_v<Op, T> || is_cuda_std_bit_xor_v<Op, T>;
+
+template <typename Op, typename T = void>
+inline constexpr bool is_cuda_operator_v =
+  is_cuda_minimum_maximum_v<Op, T> || //
+  is_cuda_std_plus_mul_v<Op, T> || //
+  is_cuda_std_bitwise_v<Op, T>;
+
+//----------------------------------------------------------------------------------------------------------------------
+// Generalize Operator
+
+template <typename Op, typename>
+struct GeneralizeOperator
+{
+  static_assert(is_cuda_operator_v<Op>);
+  using type = Op;
+};
+
+template <typename T>
+struct GeneralizeOperator<_CUDA_VSTD::plus<T>, T>
+{
+  using type = _CUDA_VSTD::plus<>;
+};
+
+template <typename T>
+struct GeneralizeOperator<_CUDA_VSTD::bit_and<T>, T>
+{
+  using type = _CUDA_VSTD::bit_and<>;
+};
+
+template <typename T>
+struct GeneralizeOperator<_CUDA_VSTD::bit_or<T>, T>
+{
+  using type = _CUDA_VSTD::bit_or<>;
+};
+
+template <typename T>
+struct GeneralizeOperator<_CUDA_VSTD::bit_xor<T>, T>
+{
+  using type = _CUDA_VSTD::bit_xor<>;
+};
+
+template <typename T>
+struct GeneralizeOperator<::cuda::maximum<T>, T>
+{
+  using type = ::cuda::maximum<>;
+};
+
+template <typename T>
+struct GeneralizeOperator<::cuda::minimum<T>, T>
+{
+  using type = ::cuda::minimum<>;
+};
+
+template <typename Op, typename T>
+using generalize_operator_t = typename GeneralizeOperator<Op, T>::type;
+
+//----------------------------------------------------------------------------------------------------------------------
+// Identity
+
+template <typename Op, typename T>
+inline constexpr T identity_v;
+
+template <typename T>
+inline constexpr T identity_v<::cuda::minimum<>, T> = _CUDA_VSTD::numeric_limits<T>::max();
+
+template <typename T>
+inline constexpr T identity_v<::cuda::minimum<T>, T> = _CUDA_VSTD::numeric_limits<T>::max();
+
+template <typename T>
+inline constexpr T identity_v<::cuda::maximum<>, T> = _CUDA_VSTD::numeric_limits<T>::min();
+
+template <typename T>
+inline constexpr T identity_v<::cuda::maximum<T>, T> = _CUDA_VSTD::numeric_limits<T>::min();
 
 } // namespace detail
 
diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh
index 1ded86cf436..0b4b68dea76 100644
--- a/cub/cub/thread/thread_reduce.cuh
+++ b/cub/cub/thread/thread_reduce.cuh
@@ -1,9 +1,9 @@
-/******************************************************************************
+/***********************************************************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2025, NVIDIA CORPORATION.  All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
+ * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+ * following conditions are met:
  *     * Redistributions of source code must retain the above copyright
  *       notice, this list of conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright
@@ -13,18 +13,15 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- ******************************************************************************/
+ **********************************************************************************************************************/
 
 //! @file
 //! Thread reduction over statically-sized array-like types
@@ -42,6 +39,7 @@
 
 #include <cub/detail/array_utils.cuh> // to_array()
 #include <cub/detail/type_traits.cuh> // are_same()
+#include <cub/detail/unsafe_bitcast.cuh>
 #include <cub/thread/thread_load.cuh> // UnrolledCopy
 #include <cub/thread/thread_operators.cuh>
 #include <cub/thread/thread_simd.cuh>
@@ -225,7 +223,7 @@ namespace detail
 
 template <typename T, typename ReductionOp, int Length>
 inline constexpr bool enable_sm90_simd_reduction_v =
-  cub::detail::is_one_of_v<T, int16_t, uint16_t> && is_cuda_std_min_max_v<ReductionOp, T> && Length >= 10;
+  is_one_of_v<T, int16_t, uint16_t> && is_cuda_minimum_maximum_v<ReductionOp, T> && Length >= 10;
 
 //----------------------------------------------------------------------------------------------------------------------
 // SM80 SIMD
@@ -237,7 +235,7 @@ inline constexpr bool enable_sm80_simd_reduction_v = false;
 
 template <typename ReductionOp, int Length>
 inline constexpr bool enable_sm80_simd_reduction_v<__half, ReductionOp, Length> =
-  (is_cuda_std_min_max_v<ReductionOp, __half> || is_cuda_std_plus_mul_v<ReductionOp, __half>) && Length >= 4;
+  (is_cuda_minimum_maximum_v<ReductionOp, __half> || is_cuda_std_plus_mul_v<ReductionOp, __half>) && Length >= 4;
 
 #  endif // defined(_CCCL_HAS_NVFP16)
 
@@ -245,7 +243,7 @@ inline constexpr bool enable_sm80_simd_reduction_v<__half, ReductionOp, Length>
 
 template <typename ReductionOp, int Length>
 inline constexpr bool enable_sm80_simd_reduction_v<__nv_bfloat16, ReductionOp, Length> =
-  (is_cuda_std_min_max_v<ReductionOp, __nv_bfloat16> || is_cuda_std_plus_mul_v<ReductionOp, __nv_bfloat16>)
+  (is_cuda_minimum_maximum_v<ReductionOp, __nv_bfloat16> || is_cuda_std_plus_mul_v<ReductionOp, __nv_bfloat16>)
   && Length >= 4;
 
 #  endif // _CCCL_HAS_NVBF16()
@@ -272,14 +270,13 @@ inline constexpr bool enable_sm70_simd_reduction_v = false;
 
 template <typename T, typename ReductionOp>
 inline constexpr bool enable_ternary_reduction_sm90_v =
-  cub::detail::is_one_of_v<T, int32_t, uint32_t> && is_cuda_std_min_max_v<ReductionOp, T>;
+  is_one_of_v<T, int32_t, uint32_t> && is_cuda_minimum_maximum_v<ReductionOp, T>;
 
 #  if _CCCL_HAS_NVFP16()
 
 template <typename ReductionOp>
 inline constexpr bool enable_ternary_reduction_sm90_v<__half2, ReductionOp> =
-  is_cuda_std_min_max_v<ReductionOp, __half2>
-  || cub::detail::is_one_of_v<ReductionOp, SimdMin<__half>, SimdMax<__half>>;
+  is_cuda_minimum_maximum_v<ReductionOp, __half2> || is_one_of_v<ReductionOp, SimdMin<__half>, SimdMax<__half>>;
 
 #  endif // _CCCL_HAS_NVFP16()
 
@@ -287,16 +284,15 @@ inline constexpr bool enable_ternary_reduction_sm90_v<__half2, ReductionOp> =
 
 template <typename ReductionOp>
 inline constexpr bool enable_ternary_reduction_sm90_v<__nv_bfloat162, ReductionOp> =
-  is_cuda_std_min_max_v<ReductionOp, __nv_bfloat162>
-  || cub::detail::is_one_of_v<ReductionOp, SimdMin<__nv_bfloat16>, SimdMax<__nv_bfloat16>>;
+  is_cuda_minimum_maximum_v<ReductionOp, __nv_bfloat162>
+  || is_one_of_v<ReductionOp, SimdMin<__nv_bfloat16>, SimdMax<__nv_bfloat16>>;
 
 #  endif // _CCCL_HAS_NVBF16()
 
 template <typename T, typename ReductionOp>
 inline constexpr bool enable_ternary_reduction_sm50_v =
   _CUDA_VSTD::is_integral_v<T> && sizeof(T) <= 4
-  && (cub::detail::is_one_of_v<ReductionOp, _CUDA_VSTD::plus<>, _CUDA_VSTD::plus<T>>
-      || is_cuda_std_bitwise_v<ReductionOp, T>);
+  && (is_cuda_std_plus_v<ReductionOp, T> || is_cuda_std_bitwise_v<ReductionOp, T>);
 
 /***********************************************************************************************************************
  * Internal Reduction Algorithms: Sequential, Binary, Ternary
@@ -307,7 +303,7 @@ template <typename AccumT, typename Input, typename ReductionOp>
 {
   auto retval = static_cast<AccumT>(input[0]);
   _CCCL_PRAGMA_UNROLL_FULL()
-  for (int i = 1; i < cub::detail::static_size_v<Input>; ++i)
+  for (int i = 1; i < static_size_v<Input>; ++i)
   {
     retval = reduction_op(retval, input[i]);
   }
@@ -317,7 +313,7 @@ template <typename AccumT, typename Input, typename ReductionOp>
 template <typename AccumT, typename Input, typename ReductionOp>
 [[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduceBinaryTree(const Input& input, ReductionOp reduction_op)
 {
-  constexpr auto length = cub::detail::static_size_v<Input>;
+  constexpr auto length = static_size_v<Input>;
   auto array            = cub::detail::to_array<AccumT>(input);
   _CCCL_PRAGMA_UNROLL_FULL()
   for (int i = 1; i < length; i *= 2)
@@ -334,7 +330,7 @@ template <typename AccumT, typename Input, typename ReductionOp>
 template <typename AccumT, typename Input, typename ReductionOp>
 [[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduceTernaryTree(const Input& input, ReductionOp reduction_op)
 {
-  constexpr auto length = cub::detail::static_size_v<Input>;
+  constexpr auto length = static_size_v<Input>;
   auto array            = cub::detail::to_array<AccumT>(input);
   _CCCL_PRAGMA_UNROLL_FULL()
   for (int i = 1; i < length; i *= 3)
@@ -353,16 +349,6 @@ template <typename AccumT, typename Input, typename ReductionOp>
  * SIMD Reduction
  **********************************************************************************************************************/
 
-// NOTE: bit_cast cannot be always used because __half, __nv_bfloat16, etc. are not trivially copyable
-template <typename Output, typename Input>
-[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE Output unsafe_bitcast(const Input& input)
-{
-  Output output;
-  static_assert(sizeof(input) == sizeof(output), "wrong size");
-  ::memcpy(&output, &input, sizeof(input));
-  return output;
-}
-
 template <typename Input, typename ReductionOp>
 _CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, ReductionOp)
 {
@@ -370,7 +356,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, Reducti
   using T                       = _CUDA_VSTD::iter_value_t<Input>;
   using SimdReduceOp            = cub_operator_to_simd_operator_t<ReductionOp, T>;
   using SimdType                = simd_type_t<T>;
-  constexpr auto length         = cub::detail::static_size_v<Input>;
+  constexpr auto length         = static_size_v<Input>;
   constexpr auto simd_ratio     = sizeof(SimdType) / sizeof(T);
   constexpr auto length_rounded = ::cuda::round_down(length, simd_ratio);
   using UnpackedType            = _CUDA_VSTD::array<T, simd_ratio>;
@@ -398,7 +384,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, Reducti
 
 template <typename ReductionOp, typename T>
 inline constexpr bool enable_min_max_promotion_v =
-  is_cuda_std_min_max_v<ReductionOp, T> && _CUDA_VSTD::is_integral_v<T> && sizeof(T) <= 2;
+  is_cuda_minimum_maximum_v<ReductionOp, T> && _CUDA_VSTD::is_integral_v<T> && sizeof(T) <= 2;
 
 } // namespace detail
 
@@ -409,25 +395,23 @@ inline constexpr bool enable_min_max_promotion_v =
 template <typename Input, typename ReductionOp, typename ValueT, typename AccumT>
 [[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op)
 {
-  static_assert(detail::is_fixed_size_random_access_range_v<Input>,
+  using namespace cub::detail;
+  static_assert(is_fixed_size_random_access_range_v<Input>,
                 "Input must support the subscript operator[] and have a compile-time size");
-  static_assert(cub::detail::has_binary_call_operator<ReductionOp, ValueT>::value,
+  static_assert(has_binary_call_operator<ReductionOp, ValueT>::value,
                 "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
-  if constexpr (cub::detail::static_size_v<Input> == 1)
+  if constexpr (static_size_v<Input> == 1)
   {
     return static_cast<AccumT>(input[0]);
   }
-  using cub::detail::is_one_of_v;
-  using namespace cub::detail;
   using PromT = _CUDA_VSTD::_If<enable_min_max_promotion_v<ReductionOp, ValueT>, int, AccumT>;
   // TODO: should be part of the tuning policy
-  if constexpr ((!is_cuda_std_operator_v<ReductionOp, ValueT> && !is_simd_operator_v<ReductionOp>)
-                || sizeof(ValueT) >= 8)
+  if constexpr ((!is_cuda_operator_v<ReductionOp, ValueT> && !is_simd_operator_v<ReductionOp>) || sizeof(ValueT) >= 8)
   {
     return ThreadReduceSequential<AccumT>(input, reduction_op);
   }
 
-  constexpr auto length = cub::detail::static_size_v<Input>;
+  constexpr auto length = static_size_v<Input>;
   if constexpr (_CUDA_VSTD::is_same_v<Input, AccumT> && enable_sm90_simd_reduction_v<Input, ReductionOp, length>)
   {
     NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSimd(input, reduction_op);))
@@ -449,7 +433,7 @@ template <typename Input, typename ReductionOp, typename ValueT, typename AccumT
     if constexpr ((is_one_of_v<ReductionOp, _CUDA_VSTD::plus<>, _CUDA_VSTD::plus<PromT>>
                    && is_one_of_v<PromT, int32_t, uint32_t>)
                   // the compiler generates bad code for int8/uint8 and min/max for SM90
-                  || (is_cuda_std_min_max_v<ReductionOp, ValueT> && is_one_of_v<PromT, int8_t, uint8_t>) )
+                  || (is_cuda_minimum_maximum_v<ReductionOp, ValueT> && is_one_of_v<PromT, int8_t, uint8_t>) )
     {
       NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSequential<PromT>(input, reduction_op);));
     }
@@ -496,11 +480,12 @@ template <typename Input,
 [[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT
 ThreadReduce(const Input& input, ReductionOp reduction_op, PrefixT prefix)
 {
-  static_assert(detail::is_fixed_size_random_access_range_v<Input>,
+  using namespace cub::detail;
+  static_assert(is_fixed_size_random_access_range_v<Input>,
                 "Input must support the subscript operator[] and have a compile-time size");
-  static_assert(detail::has_binary_call_operator<ReductionOp, ValueT>::value,
+  static_assert(has_binary_call_operator<ReductionOp, ValueT>::value,
                 "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
-  constexpr int length = cub::detail::static_size_v<Input>;
+  constexpr int length = static_size_v<Input>;
   // copy to a temporary array of type AccumT
   AccumT array[length + 1];
   array[0] = prefix;

From 9073f81afadaba3a5bd448372b1ea2a1e6742c47 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Thu, 15 May 2025 16:00:11 -0700
Subject: [PATCH 05/12] add short2/half[2]/bfloat[2] identification type traits

---
 cub/cub/detail/type_traits.cuh | 116 ++++++++++++++++++++++++++++-----
 1 file changed, 100 insertions(+), 16 deletions(-)

diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh
index f83a55cebc4..704781bd2e5 100644
--- a/cub/cub/detail/type_traits.cuh
+++ b/cub/cub/detail/type_traits.cuh
@@ -1,8 +1,8 @@
-/******************************************************************************
- * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+/***********************************************************************************************************************
+ * Copyright (c) 2011-2025, NVIDIA CORPORATION.  All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
+ * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+ * following conditions are met:
  *     * Redistributions of source code must retain the above copyright
  *       notice, this list of conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright
@@ -12,18 +12,15 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- ******************************************************************************/
+ **********************************************************************************************************************/
 
 /**
  * \file
@@ -51,7 +48,7 @@
 #include <cuda/std/functional>
 #include <cuda/std/mdspan>
 #include <cuda/std/span>
-#include <cuda/std/type_traits>
+#include <cuda/std/type_traits> // is_same_v
 
 CUB_NAMESPACE_BEGIN
 namespace detail
@@ -116,6 +113,93 @@ inline constexpr int static_size_v<_CUDA_VSTD::mdspan<T, E, L, A>> =
 template <typename T>
 using implicit_prom_t = decltype(+T{});
 
-} // namespace detail
+/***********************************************************************************************************************
+ * Extended floating point traits
+ **********************************************************************************************************************/
+// half
+
+template <typename>
+inline constexpr bool is_half_base_v = false;
+
+template <typename>
+inline constexpr bool is_half2_base_v = false;
+
+#if _CCCL_HAS_NVFP16()
+
+template <>
+inline constexpr bool is_half_base_v<__half> = true;
+
+template <>
+inline constexpr bool is_half2_base_v<__half2> = true;
+
+#endif // _CCCL_HAS_NVFP16
+
+template <typename T>
+inline constexpr bool is_half_v = is_half_base_v<_CUDA_VSTD::remove_cv_t<T>>;
+
+template <typename T>
+inline constexpr bool is_half2_v = is_half2_base_v<_CUDA_VSTD::remove_cv_t<T>>;
+
+template <typename T>
+inline constexpr bool is_any_half_v = is_half_base_v<T> || is_half2_base_v<T>;
+
+//----------------------------------------------------------------------------------------------------------------------
+// bfloat16
+
+template <typename>
+inline constexpr bool is_bfloat16_base_v = false;
+
+template <typename>
+inline constexpr bool is_bfloat16x2_base_v = false;
+
+#if _CCCL_HAS_NVBF16()
+
+template <>
+inline constexpr bool is_bfloat16_base_v<__nv_bfloat16> = true;
 
+template <>
+inline constexpr bool is_bfloat16x2_base_v<__nv_bfloat162> = true;
+
+#endif // _CCCL_HAS_NVBF16
+
+template <typename T>
+inline constexpr bool is_bfloat16_v = is_bfloat16_base_v<_CUDA_VSTD::remove_cv_t<T>>;
+
+template <typename T>
+inline constexpr bool is_bfloat16x2_v = is_bfloat16x2_base_v<_CUDA_VSTD::remove_cv_t<T>>;
+
+template <typename T>
+inline constexpr bool is_any_bfloat16_v = is_bfloat16_v<T> || is_bfloat16x2_v<T>;
+
+//----------------------------------------------------------------------------------------------------------------------
+// short2/ushort2
+
+template <typename T>
+inline constexpr bool is_any_short2_base_v = false;
+
+template <>
+inline constexpr bool is_any_short2_base_v<short2> = true;
+
+template <>
+inline constexpr bool is_any_short2_base_v<ushort2> = true;
+
+template <typename T>
+inline constexpr bool is_any_short2_v = is_any_short2_base_v<_CUDA_VSTD::remove_cv_t<T>>;
+
+//----------------------------------------------------------------------------------------------------------------------
+
+// half/bfloat16
+template <typename T>
+inline constexpr bool is_arithmetic_cuda_floating_point_v =
+  is_any_half_v<T> || is_any_bfloat16_v<T> || _CUDA_VSTD::is_floating_point_v<T>;
+
+// - promote small integer types to their corresponding 32-bit promotion type
+// - address the incompatibility between linux/windows for int/long
+template <typename T>
+using normalize_integer_t = _CUDA_VSTD::_If<
+  _CUDA_VSTD::__cccl_is_signed_integer_v<T> && sizeof(T) <= sizeof(int),
+  int,
+  _CUDA_VSTD::_If<_CUDA_VSTD::__cccl_is_unsigned_integer_v<T> && sizeof(T) <= sizeof(uint32_t), uint32_t, T>>;
+
+} // namespace detail
 CUB_NAMESPACE_END

From c0f0f74fd8f7e9520a36929da669ea7b840cba41 Mon Sep 17 00:00:00 2001
From: Federico Busato <50413820+fbusato@users.noreply.github.com>
Date: Mon, 19 May 2025 09:56:12 -0700
Subject: [PATCH 06/12] Update cub/cub/detail/unsafe_bitcast.cuh

Co-authored-by: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
---
 cub/cub/detail/unsafe_bitcast.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/detail/unsafe_bitcast.cuh b/cub/cub/detail/unsafe_bitcast.cuh
index 78f5a8432ad..4d0e0be623c 100644
--- a/cub/cub/detail/unsafe_bitcast.cuh
+++ b/cub/cub/detail/unsafe_bitcast.cuh
@@ -46,7 +46,7 @@ template <typename Output, typename Input>
 {
   Output output;
   static_assert(sizeof(input) == sizeof(output), "wrong size");
-  ::memcpy(static_cast<void*>(&output), static_cast<const void*>(&input), sizeof(input));
+  ::memcpy(&output, &input, sizeof(input));
   return output;
 }
 

From 067530718d3f5e3f6d63b6fcafad629b1c75a147 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Mon, 19 May 2025 10:04:14 -0700
Subject: [PATCH 07/12] fix license

---
 cub/cub/detail/array_utils.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cub/cub/detail/array_utils.cuh b/cub/cub/detail/array_utils.cuh
index 2c93dfe49ba..8051a2ddd6a 100644
--- a/cub/cub/detail/array_utils.cuh
+++ b/cub/cub/detail/array_utils.cuh
@@ -1,5 +1,6 @@
 /***********************************************************************************************************************
- * Copyright (c) 2011-2025, NVIDIA CORPORATION.  All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
  * following conditions are met:

From 577178043d97d1d54c57e9c453363e54df0d4e86 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Mon, 19 May 2025 10:04:34 -0700
Subject: [PATCH 08/12] use impl for type_traits

---
 cub/cub/detail/type_traits.cuh | 39 +++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh
index 704781bd2e5..df68d7c77ac 100644
--- a/cub/cub/detail/type_traits.cuh
+++ b/cub/cub/detail/type_traits.cuh
@@ -1,5 +1,6 @@
 /***********************************************************************************************************************
- * Copyright (c) 2011-2025, NVIDIA CORPORATION.  All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
  * following conditions are met:
@@ -119,72 +120,72 @@ using implicit_prom_t = decltype(+T{});
 // half
 
 template <typename>
-inline constexpr bool is_half_base_v = false;
+inline constexpr bool is_half_impl_v = false;
 
 template <typename>
-inline constexpr bool is_half2_base_v = false;
+inline constexpr bool is_half2_impl_v = false;
 
 #if _CCCL_HAS_NVFP16()
 
 template <>
-inline constexpr bool is_half_base_v<__half> = true;
+inline constexpr bool is_half_impl_v<__half> = true;
 
 template <>
-inline constexpr bool is_half2_base_v<__half2> = true;
+inline constexpr bool is_half2_impl_v<__half2> = true;
 
 #endif // _CCCL_HAS_NVFP16
 
 template <typename T>
-inline constexpr bool is_half_v = is_half_base_v<_CUDA_VSTD::remove_cv_t<T>>;
+inline constexpr bool is_half_v = is_half_impl_v<_CUDA_VSTD::remove_cv_t<T>>;
 
 template <typename T>
-inline constexpr bool is_half2_v = is_half2_base_v<_CUDA_VSTD::remove_cv_t<T>>;
+inline constexpr bool is_half2_v = is_half2_impl_v<_CUDA_VSTD::remove_cv_t<T>>;
 
 template <typename T>
-inline constexpr bool is_any_half_v = is_half_base_v<T> || is_half2_base_v<T>;
+inline constexpr bool is_any_half_v = is_half_impl_v<T> || is_half2_impl_v<T>;
 
 //----------------------------------------------------------------------------------------------------------------------
 // bfloat16
 
 template <typename>
-inline constexpr bool is_bfloat16_base_v = false;
+inline constexpr bool is_bfloat16_impl_v = false;
 
 template <typename>
-inline constexpr bool is_bfloat16x2_base_v = false;
+inline constexpr bool is_bfloat162_impl_v = false;
 
 #if _CCCL_HAS_NVBF16()
 
 template <>
-inline constexpr bool is_bfloat16_base_v<__nv_bfloat16> = true;
+inline constexpr bool is_bfloat16_impl_v<__nv_bfloat16> = true;
 
 template <>
-inline constexpr bool is_bfloat16x2_base_v<__nv_bfloat162> = true;
+inline constexpr bool is_bfloat162_impl_v<__nv_bfloat162> = true;
 
 #endif // _CCCL_HAS_NVBF16
 
 template <typename T>
-inline constexpr bool is_bfloat16_v = is_bfloat16_base_v<_CUDA_VSTD::remove_cv_t<T>>;
+inline constexpr bool is_bfloat16_v = is_bfloat16_impl_v<_CUDA_VSTD::remove_cv_t<T>>;
 
 template <typename T>
-inline constexpr bool is_bfloat16x2_v = is_bfloat16x2_base_v<_CUDA_VSTD::remove_cv_t<T>>;
+inline constexpr bool is_bfloat162_v = is_bfloat162_impl_v<_CUDA_VSTD::remove_cv_t<T>>;
 
 template <typename T>
-inline constexpr bool is_any_bfloat16_v = is_bfloat16_v<T> || is_bfloat16x2_v<T>;
+inline constexpr bool is_any_bfloat16_v = is_bfloat16_v<T> || is_bfloat162_v<T>;
 
 //----------------------------------------------------------------------------------------------------------------------
 // short2/ushort2
 
 template <typename T>
-inline constexpr bool is_any_short2_base_v = false;
+inline constexpr bool is_any_short2_impl_v = false;
 
 template <>
-inline constexpr bool is_any_short2_base_v<short2> = true;
+inline constexpr bool is_any_short2_impl_v<short2> = true;
 
 template <>
-inline constexpr bool is_any_short2_base_v<ushort2> = true;
+inline constexpr bool is_any_short2_impl_v<ushort2> = true;
 
 template <typename T>
-inline constexpr bool is_any_short2_v = is_any_short2_base_v<_CUDA_VSTD::remove_cv_t<T>>;
+inline constexpr bool is_any_short2_v = is_any_short2_impl_v<_CUDA_VSTD::remove_cv_t<T>>;
 
 //----------------------------------------------------------------------------------------------------------------------
 

From 37241d510287ad43c341fa0573a9841654423d25 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Mon, 19 May 2025 10:06:49 -0700
Subject: [PATCH 09/12] fix headers

---
 cub/cub/thread/thread_operators.cuh | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh
index 0dd7bc01e7c..bb110fabe71 100644
--- a/cub/cub/thread/thread_operators.cuh
+++ b/cub/cub/thread/thread_operators.cuh
@@ -36,10 +36,6 @@
 
 #include <cub/config.cuh>
 
-#include "cuda/__functional/maximum.h"
-#include "cuda/std/__functional/operations.h"
-#include "cuda/std/__type_traits/always_false.h"
-
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
 #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -52,7 +48,6 @@
 
 #include <cuda/functional> // cuda::maximum, cuda::minimum
 #include <cuda/std/cstdint> // cuda::std::uint32_t
-#include <cuda/std/functional> // cuda::std::plus
 #include <cuda/std/type_traits> // is_same_v
 
 CUB_NAMESPACE_BEGIN

From 5ddb04b2487554260a2bd9ce14079534a2aa3390 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Mon, 19 May 2025 10:13:06 -0700
Subject: [PATCH 10/12] rename normalized_integer_t

---
 cub/cub/detail/type_traits.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh
index df68d7c77ac..9d0678ba7e2 100644
--- a/cub/cub/detail/type_traits.cuh
+++ b/cub/cub/detail/type_traits.cuh
@@ -197,7 +197,7 @@ inline constexpr bool is_arithmetic_cuda_floating_point_v =
 // - promote small integer types to their corresponding 32-bit promotion type
 // - address the incompatibility between linux/windows for int/long
 template <typename T>
-using normalize_integer_t = _CUDA_VSTD::_If<
+using signed_promotion_t = _CUDA_VSTD::_If<
   _CUDA_VSTD::__cccl_is_signed_integer_v<T> && sizeof(T) <= sizeof(int),
   int,
   _CUDA_VSTD::_If<_CUDA_VSTD::__cccl_is_unsigned_integer_v<T> && sizeof(T) <= sizeof(uint32_t), uint32_t, T>>;

From 1b12b7d7d2ea6bd22f9e67ec5afdaa61d3f829c6 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Tue, 20 May 2025 09:51:43 -0700
Subject: [PATCH 11/12] add logical_and/or

---
 cub/cub/thread/thread_operators.cuh | 33 +++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh
index bb110fabe71..812e8231acb 100644
--- a/cub/cub/thread/thread_operators.cuh
+++ b/cub/cub/thread/thread_operators.cuh
@@ -510,6 +510,36 @@ inline constexpr bool is_cuda_std_bit_xor_v<_CUDA_VSTD::bit_xor<>, T> = true;
 template <>
 inline constexpr bool is_cuda_std_bit_xor_v<_CUDA_VSTD::bit_xor<>, void> = true;
 
+template <typename, typename = void>
+inline constexpr bool is_cuda_std_logical_and_v = false;
+
+template <>
+inline constexpr bool is_cuda_std_logical_and_v<_CUDA_VSTD::logical_and<bool>, void> = true;
+
+template <>
+inline constexpr bool is_cuda_std_logical_and_v<_CUDA_VSTD::logical_and<bool>, bool> = true;
+
+template <>
+inline constexpr bool is_cuda_std_logical_and_v<_CUDA_VSTD::logical_and<>, bool> = true;
+
+template <>
+inline constexpr bool is_cuda_std_logical_and_v<_CUDA_VSTD::logical_and<>, void> = true;
+
+template <typename, typename = void>
+inline constexpr bool is_cuda_std_logical_or_v = false;
+
+template <>
+inline constexpr bool is_cuda_std_logical_or_v<_CUDA_VSTD::logical_or<bool>, void> = true;
+
+template <>
+inline constexpr bool is_cuda_std_logical_or_v<_CUDA_VSTD::logical_or<bool>, bool> = true;
+
+template <>
+inline constexpr bool is_cuda_std_logical_or_v<_CUDA_VSTD::logical_or<>, bool> = true;
+
+template <>
+inline constexpr bool is_cuda_std_logical_or_v<_CUDA_VSTD::logical_or<>, void> = true;
+
 template <typename Op, typename T = void>
 inline constexpr bool is_cuda_minimum_maximum_v = is_cuda_maximum_v<Op, T> || is_cuda_minimum_v<Op, T>;
 
@@ -521,7 +551,7 @@ inline constexpr bool is_cuda_std_bitwise_v =
   is_cuda_std_bit_and_v<Op, T> || is_cuda_std_bit_or_v<Op, T> || is_cuda_std_bit_xor_v<Op, T>;
 
 template <typename Op, typename T = void>
-inline constexpr bool is_cuda_operator_v =
+inline constexpr bool is_simd_enabled_cuda_operator =
   is_cuda_minimum_maximum_v<Op, T> || //
   is_cuda_std_plus_mul_v<Op, T> || //
   is_cuda_std_bitwise_v<Op, T>;
@@ -532,7 +562,6 @@ inline constexpr bool is_cuda_operator_v =
 template <typename Op, typename>
 struct GeneralizeOperator
 {
-  static_assert(is_cuda_operator_v<Op>);
   using type = Op;
 };
 

From 4f4062a1c3264b68aad5781e9674c68facc0b6bb Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Tue, 20 May 2025 09:51:55 -0700
Subject: [PATCH 12/12] use is_simd_enabled_cuda_operator

---
 cub/cub/thread/thread_reduce.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh
index 0b4b68dea76..d2a62c01c6f 100644
--- a/cub/cub/thread/thread_reduce.cuh
+++ b/cub/cub/thread/thread_reduce.cuh
@@ -406,7 +406,8 @@ template <typename Input, typename ReductionOp, typename ValueT, typename AccumT
   }
   using PromT = _CUDA_VSTD::_If<enable_min_max_promotion_v<ReductionOp, ValueT>, int, AccumT>;
   // TODO: should be part of the tuning policy
-  if constexpr ((!is_cuda_operator_v<ReductionOp, ValueT> && !is_simd_operator_v<ReductionOp>) || sizeof(ValueT) >= 8)
+  if constexpr ((!is_simd_enabled_cuda_operator<ReductionOp, ValueT> && !is_simd_operator_v<ReductionOp>)
+                || sizeof(ValueT) >= 8)
   {
     return ThreadReduceSequential<AccumT>(input, reduction_op);
   }