From a40828341ab55de9f732f52d6508f6a6b56c2974 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 15 May 2025 15:56:47 -0700 Subject: [PATCH 01/12] use cuda::std::invoke_result_t --- cub/cub/block/radix_rank_sort_operations.cuh | 4 +++- cub/cub/detail/type_traits.cuh | 3 --- cub/cub/device/dispatch/dispatch_adjacent_difference.cuh | 4 +++- cub/cub/device/dispatch/dispatch_reduce.cuh | 3 ++- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cub/cub/block/radix_rank_sort_operations.cuh b/cub/cub/block/radix_rank_sort_operations.cuh index f213a151e76..31217c46df8 100644 --- a/cub/cub/block/radix_rank_sort_operations.cuh +++ b/cub/cub/block/radix_rank_sort_operations.cuh @@ -50,6 +50,7 @@ #include #include +#include #include #include #include @@ -228,7 +229,8 @@ struct is_tuple_of_references_to_fundamental_types_t< // {}; template -using decomposer_check_t = is_tuple_of_references_to_fundamental_types_t>; +using decomposer_check_t = + is_tuple_of_references_to_fundamental_types_t<_CUDA_VSTD::invoke_result_t>; template struct bit_ordered_conversion_policy_t diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh index 2362e42990a..f83a55cebc4 100644 --- a/cub/cub/detail/type_traits.cuh +++ b/cub/cub/detail/type_traits.cuh @@ -57,9 +57,6 @@ CUB_NAMESPACE_BEGIN namespace detail { -template -using invoke_result_t = _CUDA_VSTD::invoke_result_t; - template inline constexpr bool is_one_of_v = (_CCCL_TRAIT(_CUDA_VSTD::is_same, T, TArgs) || ...); diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index 31d85d7b140..56064ba2076 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -48,6 +48,8 @@ #include +#include + CUB_NAMESPACE_BEGIN namespace detail::adjacent_difference @@ -80,7 +82,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel( // It is OK to introspect the return type or parameter types of the // `operator()` function of `__device__` extended lambda within device code. - using OutputT = invoke_result_t; + using OutputT = _CUDA_VSTD::invoke_result_t; using Agent = AgentDifference // for cub::detail::non_void_value_t, cub::detail::value_t #include +#include CUB_NAMESPACE_BEGIN @@ -572,7 +573,7 @@ template < typename InitT, typename AccumT = ::cuda::std::__accumulator_t>, + _CUDA_VSTD::invoke_result_t>, InitT>, typename PolicyHub = detail::reduce::policy_hub, typename KernelSource = detail::reduce::DeviceReduceKernelSource< From aaab0e70cf60f5467bb87bbb71241040ac0383e6 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 15 May 2025 15:58:00 -0700 Subject: [PATCH 02/12] simplify array_utils.cuh namespaces --- cub/cub/detail/array_utils.cuh | 46 +++++++++++++++------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/cub/cub/detail/array_utils.cuh b/cub/cub/detail/array_utils.cuh index cd4115a081a..2c93dfe49ba 100644 --- a/cub/cub/detail/array_utils.cuh +++ b/cub/cub/detail/array_utils.cuh @@ -1,8 +1,8 @@ -/****************************************************************************** - * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved. +/*********************************************************************************************************************** + * Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: + * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the + * following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright @@ -12,18 +12,15 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - ******************************************************************************/ + **********************************************************************************************************************/ #pragma once @@ -42,13 +39,11 @@ #include // array #include // size_t -#include // cuda::std::iter_value_t +#include // _CUDA_VSTD::iter_value_t #include // _If #include // index_sequence CUB_NAMESPACE_BEGIN - -/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) namespace detail { @@ -58,25 +53,24 @@ namespace detail * Generic Array-like to Array Conversion **********************************************************************************************************************/ -template -[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::array> -to_array_impl(const Input& input, ::cuda::std::index_sequence) +template +[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE _CUDA_VSTD::array> +to_array_impl(const Input& input, _CUDA_VSTD::index_sequence) { - using ArrayType = ::cuda::std::array>; + using ArrayType = _CUDA_VSTD::array>; return ArrayType{static_cast(input[i])...}; } template -[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::array> +[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE _CUDA_VSTD::array> to_array(const Input& input) { - using InputType = cuda::std::iter_value_t; - using CastType1 = ::cuda::std::_If<::cuda::std::is_same_v, InputType, CastType>; - return to_array_impl(input, ::cuda::std::make_index_sequence>{}); + using InputType = _CUDA_VSTD::iter_value_t; + using CastType1 = _CUDA_VSTD::_If<_CUDA_VSTD::is_same_v, InputType, CastType>; + return to_array_impl(input, _CUDA_VSTD::make_index_sequence>{}); } #endif // !_CCCL_DOXYGEN_INVOKED } // namespace detail - CUB_NAMESPACE_END From 5cd48d3428eb50d107fa2c88b48bc2261d3454a3 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 15 May 2025 15:58:32 -0700 Subject: [PATCH 03/12] move unsafe_bitcast into an independent file --- cub/cub/detail/unsafe_bitcast.cuh | 56 +++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 cub/cub/detail/unsafe_bitcast.cuh diff --git a/cub/cub/detail/unsafe_bitcast.cuh b/cub/cub/detail/unsafe_bitcast.cuh new file mode 100644 index 00000000000..78f5a8432ad --- /dev/null +++ b/cub/cub/detail/unsafe_bitcast.cuh @@ -0,0 +1,56 @@ +/*********************************************************************************************************************** + * Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the + * following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **********************************************************************************************************************/ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +CUB_NAMESPACE_BEGIN +namespace detail +{ + +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document + +// NOTE: bit_cast cannot be always used because __half, __nv_bfloat16, etc. are not trivially copyable +template +[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE Output unsafe_bitcast(const Input& input) +{ + Output output; + static_assert(sizeof(input) == sizeof(output), "wrong size"); + ::memcpy(static_cast(&output), static_cast(&input), sizeof(input)); + return output; +} + +#endif // !_CCCL_DOXYGEN_INVOKED + +} // namespace detail +CUB_NAMESPACE_END From 347f7d151ce2d217dd66917c51c61c2a79adc07b Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 15 May 2025 15:59:34 -0700 Subject: [PATCH 04/12] use more precise names for cuda::std:: operators --- cub/cub/thread/thread_operators.cuh | 248 +++++++++++++++++++++++----- cub/cub/thread/thread_reduce.cuh | 85 ++++------ 2 files changed, 239 insertions(+), 94 deletions(-) diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index e8a6c165c94..0dd7bc01e7c 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -1,9 +1,9 @@ -/****************************************************************************** +/*********************************************************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: + * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the + * following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright @@ -13,18 +13,15 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - ******************************************************************************/ + **********************************************************************************************************************/ /** * @file @@ -39,6 +36,10 @@ #include +#include "cuda/__functional/maximum.h" +#include "cuda/std/__functional/operations.h" +#include "cuda/std/__type_traits/always_false.h" + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -52,7 +53,7 @@ #include // cuda::maximum, cuda::minimum #include // cuda::std::uint32_t #include // cuda::std::plus -#include // cuda::std::pair +#include // is_same_v CUB_NAMESPACE_BEGIN @@ -406,37 +407,196 @@ struct ReduceByKeyOp namespace detail { -template -inline constexpr bool is_cuda_std_min_max_v = - cub::detail::is_one_of_v, - ::cuda::minimum, - ::cuda::maximum<>, - ::cuda::maximum>; - -template -inline constexpr bool is_cuda_std_plus_mul_v = - cub::detail::is_one_of_v, - _CUDA_VSTD::plus, - _CUDA_VSTD::multiplies<>, - _CUDA_VSTD::multiplies>; - -template +//---------------------------------------------------------------------------------------------------------------------- +// Predefined operators + +template +inline constexpr bool is_cuda_std_plus_v = false; + +template +inline constexpr bool is_cuda_std_plus_v<_CUDA_VSTD::plus, void> = true; + +template +inline constexpr bool is_cuda_std_plus_v<_CUDA_VSTD::plus, T> = true; + +template +inline constexpr bool is_cuda_std_plus_v<_CUDA_VSTD::plus<>, T> = true; + +template <> +inline constexpr bool is_cuda_std_plus_v<_CUDA_VSTD::plus<>, void> = true; + +template +inline constexpr bool is_cuda_std_mul_v = false; + +template +inline constexpr bool is_cuda_std_mul_v<_CUDA_VSTD::multiplies, void> = true; + +template +inline constexpr bool is_cuda_std_mul_v<_CUDA_VSTD::multiplies, T> = true; + +template +inline constexpr bool is_cuda_std_mul_v<_CUDA_VSTD::multiplies<>, T> = true; + +template <> +inline constexpr bool is_cuda_std_mul_v<_CUDA_VSTD::multiplies<>, void> = true; + +template +inline constexpr bool is_cuda_maximum_v = false; + +template +inline constexpr bool is_cuda_maximum_v<::cuda::maximum, void> = true; + +template +inline constexpr bool is_cuda_maximum_v<::cuda::maximum, T> = true; + +template +inline constexpr bool is_cuda_maximum_v<::cuda::maximum<>, T> = true; + +template <> +inline constexpr bool is_cuda_maximum_v<::cuda::maximum<>, void> = true; + +template +inline constexpr bool is_cuda_minimum_v = false; + +template +inline constexpr bool is_cuda_minimum_v<::cuda::minimum, void> = true; + +template +inline constexpr bool is_cuda_minimum_v<::cuda::minimum, T> = true; + +template +inline constexpr bool is_cuda_minimum_v<::cuda::minimum<>, T> = true; + +template <> +inline constexpr bool is_cuda_minimum_v<::cuda::minimum<>, void> = true; + +template +inline constexpr bool is_cuda_std_bit_and_v = false; + +template +inline constexpr bool is_cuda_std_bit_and_v<_CUDA_VSTD::bit_and, void> = true; + +template +inline constexpr bool is_cuda_std_bit_and_v<_CUDA_VSTD::bit_and, T> = true; + +template +inline constexpr bool is_cuda_std_bit_and_v<_CUDA_VSTD::bit_and<>, T> = true; + +template <> +inline constexpr bool is_cuda_std_bit_and_v<_CUDA_VSTD::bit_and<>, void> = true; + +template +inline constexpr bool is_cuda_std_bit_or_v = false; + +template +inline constexpr bool is_cuda_std_bit_or_v<_CUDA_VSTD::bit_or, void> = true; + +template +inline constexpr bool is_cuda_std_bit_or_v<_CUDA_VSTD::bit_or, T> = true; + +template +inline constexpr bool is_cuda_std_bit_or_v<_CUDA_VSTD::bit_or<>, T> = true; + +template <> +inline constexpr bool is_cuda_std_bit_or_v<_CUDA_VSTD::bit_or<>, void> = true; + +template +inline constexpr bool is_cuda_std_bit_xor_v = false; + +template +inline constexpr bool is_cuda_std_bit_xor_v<_CUDA_VSTD::bit_xor, void> = true; + +template +inline constexpr bool is_cuda_std_bit_xor_v<_CUDA_VSTD::bit_xor, T> = true; + +template +inline constexpr bool is_cuda_std_bit_xor_v<_CUDA_VSTD::bit_xor<>, T> = true; + +template <> +inline constexpr bool is_cuda_std_bit_xor_v<_CUDA_VSTD::bit_xor<>, void> = true; + +template +inline constexpr bool is_cuda_minimum_maximum_v = is_cuda_maximum_v || is_cuda_minimum_v; + +template +inline constexpr bool is_cuda_std_plus_mul_v = is_cuda_std_plus_v || is_cuda_std_mul_v; + +template inline constexpr bool is_cuda_std_bitwise_v = - cub::detail::is_one_of_v, - _CUDA_VSTD::bit_and, - _CUDA_VSTD::bit_or<>, - _CUDA_VSTD::bit_or, - _CUDA_VSTD::bit_xor<>, - _CUDA_VSTD::bit_xor>; - -template -inline constexpr bool is_cuda_std_operator_v = - is_cuda_std_min_max_v || // - is_cuda_std_plus_mul_v || // - is_cuda_std_bitwise_v; + is_cuda_std_bit_and_v || is_cuda_std_bit_or_v || is_cuda_std_bit_xor_v; + +template +inline constexpr bool is_cuda_operator_v = + is_cuda_minimum_maximum_v || // + is_cuda_std_plus_mul_v || // + is_cuda_std_bitwise_v; + +//---------------------------------------------------------------------------------------------------------------------- +// Generalize Operator + +template +struct GeneralizeOperator +{ + static_assert(is_cuda_operator_v); + using type = Op; +}; + +template +struct GeneralizeOperator<_CUDA_VSTD::plus, T> +{ + using type = _CUDA_VSTD::plus<>; +}; + +template +struct GeneralizeOperator<_CUDA_VSTD::bit_and, T> +{ + using type = _CUDA_VSTD::bit_and<>; +}; + +template +struct GeneralizeOperator<_CUDA_VSTD::bit_or, T> +{ + using type = _CUDA_VSTD::bit_or<>; +}; + +template +struct GeneralizeOperator<_CUDA_VSTD::bit_xor, T> +{ + using type = _CUDA_VSTD::bit_xor<>; +}; + +template +struct GeneralizeOperator<::cuda::maximum, T> +{ + using type = ::cuda::maximum<>; +}; + +template +struct GeneralizeOperator<::cuda::minimum, T> +{ + using type = ::cuda::minimum<>; +}; + +template +using generalize_operator_t = typename GeneralizeOperator::type; + +//---------------------------------------------------------------------------------------------------------------------- +// Identity + +template +inline constexpr T identity_v; + +template +inline constexpr T identity_v<::cuda::minimum<>, T> = _CUDA_VSTD::numeric_limits::max(); + +template +inline constexpr T identity_v<::cuda::minimum, T> = _CUDA_VSTD::numeric_limits::max(); + +template +inline constexpr T identity_v<::cuda::maximum<>, T> = _CUDA_VSTD::numeric_limits::min(); + +template +inline constexpr T identity_v<::cuda::maximum, T> = _CUDA_VSTD::numeric_limits::min(); } // namespace detail diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index 1ded86cf436..0b4b68dea76 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -1,9 +1,9 @@ -/****************************************************************************** +/*********************************************************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: + * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the + * following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright @@ -13,18 +13,15 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - ******************************************************************************/ + **********************************************************************************************************************/ //! @file //! Thread reduction over statically-sized array-like types @@ -42,6 +39,7 @@ #include // to_array() #include // are_same() +#include #include // UnrolledCopy #include #include @@ -225,7 +223,7 @@ namespace detail template inline constexpr bool enable_sm90_simd_reduction_v = - cub::detail::is_one_of_v && is_cuda_std_min_max_v && Length >= 10; + is_one_of_v && is_cuda_minimum_maximum_v && Length >= 10; //---------------------------------------------------------------------------------------------------------------------- // SM80 SIMD @@ -237,7 +235,7 @@ inline constexpr bool enable_sm80_simd_reduction_v = false; template inline constexpr bool enable_sm80_simd_reduction_v<__half, ReductionOp, Length> = - (is_cuda_std_min_max_v || is_cuda_std_plus_mul_v) && Length >= 4; + (is_cuda_minimum_maximum_v || is_cuda_std_plus_mul_v) && Length >= 4; # endif // defined(_CCCL_HAS_NVFP16) @@ -245,7 +243,7 @@ inline constexpr bool enable_sm80_simd_reduction_v<__half, ReductionOp, Length> template inline constexpr bool enable_sm80_simd_reduction_v<__nv_bfloat16, ReductionOp, Length> = - (is_cuda_std_min_max_v || is_cuda_std_plus_mul_v) + (is_cuda_minimum_maximum_v || is_cuda_std_plus_mul_v) && Length >= 4; # endif // _CCCL_HAS_NVBF16() @@ -272,14 +270,13 @@ inline constexpr bool enable_sm70_simd_reduction_v = false; template inline constexpr bool enable_ternary_reduction_sm90_v = - cub::detail::is_one_of_v && is_cuda_std_min_max_v; + is_one_of_v && is_cuda_minimum_maximum_v; # if _CCCL_HAS_NVFP16() template inline constexpr bool enable_ternary_reduction_sm90_v<__half2, ReductionOp> = - is_cuda_std_min_max_v - || cub::detail::is_one_of_v, SimdMax<__half>>; + is_cuda_minimum_maximum_v || is_one_of_v, SimdMax<__half>>; # endif // _CCCL_HAS_NVFP16() @@ -287,16 +284,15 @@ inline constexpr bool enable_ternary_reduction_sm90_v<__half2, ReductionOp> = template inline constexpr bool enable_ternary_reduction_sm90_v<__nv_bfloat162, ReductionOp> = - is_cuda_std_min_max_v - || cub::detail::is_one_of_v, SimdMax<__nv_bfloat16>>; + is_cuda_minimum_maximum_v + || is_one_of_v, SimdMax<__nv_bfloat16>>; # endif // _CCCL_HAS_NVBF16() template inline constexpr bool enable_ternary_reduction_sm50_v = _CUDA_VSTD::is_integral_v && sizeof(T) <= 4 - && (cub::detail::is_one_of_v, _CUDA_VSTD::plus> - || is_cuda_std_bitwise_v); + && (is_cuda_std_plus_v || is_cuda_std_bitwise_v); /*********************************************************************************************************************** * Internal Reduction Algorithms: Sequential, Binary, Ternary @@ -307,7 +303,7 @@ template { auto retval = static_cast(input[0]); _CCCL_PRAGMA_UNROLL_FULL() - for (int i = 1; i < cub::detail::static_size_v; ++i) + for (int i = 1; i < static_size_v; ++i) { retval = reduction_op(retval, input[i]); } @@ -317,7 +313,7 @@ template template [[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduceBinaryTree(const Input& input, ReductionOp reduction_op) { - constexpr auto length = cub::detail::static_size_v; + constexpr auto length = static_size_v; auto array = cub::detail::to_array(input); _CCCL_PRAGMA_UNROLL_FULL() for (int i = 1; i < length; i *= 2) @@ -334,7 +330,7 @@ template template [[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduceTernaryTree(const Input& input, ReductionOp reduction_op) { - constexpr auto length = cub::detail::static_size_v; + constexpr auto length = static_size_v; auto array = cub::detail::to_array(input); _CCCL_PRAGMA_UNROLL_FULL() for (int i = 1; i < length; i *= 3) @@ -353,16 +349,6 @@ template * SIMD Reduction **********************************************************************************************************************/ -// NOTE: bit_cast cannot be always used because __half, __nv_bfloat16, etc. are not trivially copyable -template -[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE Output unsafe_bitcast(const Input& input) -{ - Output output; - static_assert(sizeof(input) == sizeof(output), "wrong size"); - ::memcpy(&output, &input, sizeof(input)); - return output; -} - template _CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, ReductionOp) { @@ -370,7 +356,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, Reducti using T = _CUDA_VSTD::iter_value_t; using SimdReduceOp = cub_operator_to_simd_operator_t; using SimdType = simd_type_t; - constexpr auto length = cub::detail::static_size_v; + constexpr auto length = static_size_v; constexpr auto simd_ratio = sizeof(SimdType) / sizeof(T); constexpr auto length_rounded = ::cuda::round_down(length, simd_ratio); using UnpackedType = _CUDA_VSTD::array; @@ -398,7 +384,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, Reducti template inline constexpr bool enable_min_max_promotion_v = - is_cuda_std_min_max_v && _CUDA_VSTD::is_integral_v && sizeof(T) <= 2; + is_cuda_minimum_maximum_v && _CUDA_VSTD::is_integral_v && sizeof(T) <= 2; } // namespace detail @@ -409,25 +395,23 @@ inline constexpr bool enable_min_max_promotion_v = template [[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op) { - static_assert(detail::is_fixed_size_random_access_range_v, + using namespace cub::detail; + static_assert(is_fixed_size_random_access_range_v, "Input must support the subscript operator[] and have a compile-time size"); - static_assert(cub::detail::has_binary_call_operator::value, + static_assert(has_binary_call_operator::value, "ReductionOp must have the binary call operator: operator(ValueT, ValueT)"); - if constexpr (cub::detail::static_size_v == 1) + if constexpr (static_size_v == 1) { return static_cast(input[0]); } - using cub::detail::is_one_of_v; - using namespace cub::detail; using PromT = _CUDA_VSTD::_If, int, AccumT>; // TODO: should be part of the tuning policy - if constexpr ((!is_cuda_std_operator_v && !is_simd_operator_v) - || sizeof(ValueT) >= 8) + if constexpr ((!is_cuda_operator_v && !is_simd_operator_v) || sizeof(ValueT) >= 8) { return ThreadReduceSequential(input, reduction_op); } - constexpr auto length = cub::detail::static_size_v; + constexpr auto length = static_size_v; if constexpr (_CUDA_VSTD::is_same_v && enable_sm90_simd_reduction_v) { NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSimd(input, reduction_op);)) @@ -449,7 +433,7 @@ template , _CUDA_VSTD::plus> && is_one_of_v) // the compiler generates bad code for int8/uint8 and min/max for SM90 - || (is_cuda_std_min_max_v && is_one_of_v) ) + || (is_cuda_minimum_maximum_v && is_one_of_v) ) { NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSequential(input, reduction_op);)); } @@ -496,11 +480,12 @@ template , + using namespace cub::detail; + static_assert(is_fixed_size_random_access_range_v, "Input must support the subscript operator[] and have a compile-time size"); - static_assert(detail::has_binary_call_operator::value, + static_assert(has_binary_call_operator::value, "ReductionOp must have the binary call operator: operator(ValueT, ValueT)"); - constexpr int length = cub::detail::static_size_v; + constexpr int length = static_size_v; // copy to a temporary array of type AccumT AccumT array[length + 1]; array[0] = prefix; From 9073f81afadaba3a5bd448372b1ea2a1e6742c47 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 15 May 2025 16:00:11 -0700 Subject: [PATCH 05/12] add short2/half[2]/bfloat[2] identification type traits --- cub/cub/detail/type_traits.cuh | 116 ++++++++++++++++++++++++++++----- 1 file changed, 100 insertions(+), 16 deletions(-) diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh index f83a55cebc4..704781bd2e5 100644 --- a/cub/cub/detail/type_traits.cuh +++ b/cub/cub/detail/type_traits.cuh @@ -1,8 +1,8 @@ -/****************************************************************************** - * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +/*********************************************************************************************************************** + * Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: + * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the + * following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright @@ -12,18 +12,15 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - ******************************************************************************/ + **********************************************************************************************************************/ /** * \file @@ -51,7 +48,7 @@ #include #include #include -#include +#include // is_same_v CUB_NAMESPACE_BEGIN namespace detail @@ -116,6 +113,93 @@ inline constexpr int static_size_v<_CUDA_VSTD::mdspan> = template using implicit_prom_t = decltype(+T{}); -} // namespace detail +/*********************************************************************************************************************** + * Extended floating point traits + **********************************************************************************************************************/ +// half + +template +inline constexpr bool is_half_base_v = false; + +template +inline constexpr bool is_half2_base_v = false; + +#if _CCCL_HAS_NVFP16() + +template <> +inline constexpr bool is_half_base_v<__half> = true; + +template <> +inline constexpr bool is_half2_base_v<__half2> = true; + +#endif // _CCCL_HAS_NVFP16 + +template +inline constexpr bool is_half_v = is_half_base_v<_CUDA_VSTD::remove_cv_t>; + +template +inline constexpr bool is_half2_v = is_half2_base_v<_CUDA_VSTD::remove_cv_t>; + +template +inline constexpr bool is_any_half_v = is_half_base_v || is_half2_base_v; + +//---------------------------------------------------------------------------------------------------------------------- +// bfloat16 + +template +inline constexpr bool is_bfloat16_base_v = false; + +template +inline constexpr bool is_bfloat16x2_base_v = false; + +#if _CCCL_HAS_NVBF16() + +template <> +inline constexpr bool is_bfloat16_base_v<__nv_bfloat16> = true; +template <> +inline constexpr bool is_bfloat16x2_base_v<__nv_bfloat162> = true; + +#endif // _CCCL_HAS_NVBF16 + +template +inline constexpr bool is_bfloat16_v = is_bfloat16_base_v<_CUDA_VSTD::remove_cv_t>; + +template +inline constexpr bool is_bfloat16x2_v = is_bfloat16x2_base_v<_CUDA_VSTD::remove_cv_t>; + +template +inline constexpr bool is_any_bfloat16_v = is_bfloat16_v || is_bfloat16x2_v; + +//---------------------------------------------------------------------------------------------------------------------- +// short2/ushort2 + +template +inline constexpr bool is_any_short2_base_v = false; + +template <> +inline constexpr bool is_any_short2_base_v = true; + +template <> +inline constexpr bool is_any_short2_base_v = true; + +template +inline constexpr bool is_any_short2_v = is_any_short2_base_v<_CUDA_VSTD::remove_cv_t>; + +//---------------------------------------------------------------------------------------------------------------------- + +// half/bfloat16 +template +inline constexpr bool is_arithmetic_cuda_floating_point_v = + is_any_half_v || is_any_bfloat16_v || _CUDA_VSTD::is_floating_point_v; + +// - promote small integer types to their corresponding 32-bit promotion type +// - address the incompatibility between linux/windows for int/long +template +using normalize_integer_t = _CUDA_VSTD::_If< + _CUDA_VSTD::__cccl_is_signed_integer_v && sizeof(T) <= sizeof(int), + int, + _CUDA_VSTD::_If<_CUDA_VSTD::__cccl_is_unsigned_integer_v && sizeof(T) <= sizeof(uint32_t), uint32_t, T>>; + +} // namespace detail CUB_NAMESPACE_END From c0f0f74fd8f7e9520a36929da669ea7b840cba41 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Mon, 19 May 2025 09:56:12 -0700 Subject: [PATCH 06/12] Update cub/cub/detail/unsafe_bitcast.cuh Co-authored-by: Bernhard Manfred Gruber --- cub/cub/detail/unsafe_bitcast.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/detail/unsafe_bitcast.cuh b/cub/cub/detail/unsafe_bitcast.cuh index 78f5a8432ad..4d0e0be623c 100644 --- a/cub/cub/detail/unsafe_bitcast.cuh +++ b/cub/cub/detail/unsafe_bitcast.cuh @@ -46,7 +46,7 @@ template { Output output; static_assert(sizeof(input) == sizeof(output), "wrong size"); - ::memcpy(static_cast(&output), static_cast(&input), sizeof(input)); + ::memcpy(&output, &input, sizeof(input)); return output; } From 067530718d3f5e3f6d63b6fcafad629b1c75a147 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 19 May 2025 10:04:14 -0700 Subject: [PATCH 07/12] fix license --- cub/cub/detail/array_utils.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cub/cub/detail/array_utils.cuh b/cub/cub/detail/array_utils.cuh index 2c93dfe49ba..8051a2ddd6a 100644 --- a/cub/cub/detail/array_utils.cuh +++ b/cub/cub/detail/array_utils.cuh @@ -1,5 +1,6 @@ /*********************************************************************************************************************** - * Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the * following conditions are met: From 577178043d97d1d54c57e9c453363e54df0d4e86 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 19 May 2025 10:04:34 -0700 Subject: [PATCH 08/12] use impl for type_traits --- cub/cub/detail/type_traits.cuh | 39 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh index 704781bd2e5..df68d7c77ac 100644 --- a/cub/cub/detail/type_traits.cuh +++ b/cub/cub/detail/type_traits.cuh @@ -1,5 +1,6 @@ /*********************************************************************************************************************** - * Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the * following conditions are met: @@ -119,72 +120,72 @@ using implicit_prom_t = decltype(+T{}); // half template -inline constexpr bool is_half_base_v = false; +inline constexpr bool is_half_impl_v = false; template -inline constexpr bool is_half2_base_v = false; +inline constexpr bool is_half2_impl_v = false; #if _CCCL_HAS_NVFP16() template <> -inline constexpr bool is_half_base_v<__half> = true; +inline constexpr bool is_half_impl_v<__half> = true; template <> -inline constexpr bool is_half2_base_v<__half2> = true; +inline constexpr bool is_half2_impl_v<__half2> = true; #endif // _CCCL_HAS_NVFP16 template -inline constexpr bool is_half_v = is_half_base_v<_CUDA_VSTD::remove_cv_t>; +inline constexpr bool is_half_v = is_half_impl_v<_CUDA_VSTD::remove_cv_t>; template -inline constexpr bool is_half2_v = is_half2_base_v<_CUDA_VSTD::remove_cv_t>; +inline constexpr bool is_half2_v = is_half2_impl_v<_CUDA_VSTD::remove_cv_t>; template -inline constexpr bool is_any_half_v = is_half_base_v || is_half2_base_v; +inline constexpr bool is_any_half_v = is_half_impl_v || is_half2_impl_v; //---------------------------------------------------------------------------------------------------------------------- // bfloat16 template -inline constexpr bool is_bfloat16_base_v = false; +inline constexpr bool is_bfloat16_impl_v = false; template -inline constexpr bool is_bfloat16x2_base_v = false; +inline constexpr bool is_bfloat162_impl_v = false; #if _CCCL_HAS_NVBF16() template <> -inline constexpr bool is_bfloat16_base_v<__nv_bfloat16> = true; +inline constexpr bool is_bfloat16_impl_v<__nv_bfloat16> = true; template <> -inline constexpr bool is_bfloat16x2_base_v<__nv_bfloat162> = true; +inline constexpr bool is_bfloat162_impl_v<__nv_bfloat162> = true; #endif // _CCCL_HAS_NVBF16 template -inline constexpr bool is_bfloat16_v = is_bfloat16_base_v<_CUDA_VSTD::remove_cv_t>; +inline constexpr bool is_bfloat16_v = is_bfloat16_impl_v<_CUDA_VSTD::remove_cv_t>; template -inline constexpr bool is_bfloat16x2_v = is_bfloat16x2_base_v<_CUDA_VSTD::remove_cv_t>; +inline constexpr bool is_bfloat162_v = is_bfloat162_impl_v<_CUDA_VSTD::remove_cv_t>; template -inline constexpr bool is_any_bfloat16_v = is_bfloat16_v || is_bfloat16x2_v; +inline constexpr bool is_any_bfloat16_v = is_bfloat16_v || is_bfloat162_v; //---------------------------------------------------------------------------------------------------------------------- // short2/ushort2 template -inline constexpr bool is_any_short2_base_v = false; +inline constexpr bool is_any_short2_impl_v = false; template <> -inline constexpr bool is_any_short2_base_v = true; +inline constexpr bool is_any_short2_impl_v = true; template <> -inline constexpr bool is_any_short2_base_v = true; +inline constexpr bool is_any_short2_impl_v = true; template -inline constexpr bool is_any_short2_v = is_any_short2_base_v<_CUDA_VSTD::remove_cv_t>; +inline constexpr bool is_any_short2_v = is_any_short2_impl_v<_CUDA_VSTD::remove_cv_t>; //---------------------------------------------------------------------------------------------------------------------- From 37241d510287ad43c341fa0573a9841654423d25 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 19 May 2025 10:06:49 -0700 Subject: [PATCH 09/12] fix headers --- cub/cub/thread/thread_operators.cuh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index 0dd7bc01e7c..bb110fabe71 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -36,10 +36,6 @@ #include -#include "cuda/__functional/maximum.h" -#include "cuda/std/__functional/operations.h" -#include "cuda/std/__type_traits/always_false.h" - #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -52,7 +48,6 @@ #include // cuda::maximum, cuda::minimum #include // cuda::std::uint32_t -#include // cuda::std::plus #include // is_same_v CUB_NAMESPACE_BEGIN From 5ddb04b2487554260a2bd9ce14079534a2aa3390 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 19 May 2025 10:13:06 -0700 Subject: [PATCH 10/12] rename normalized_integer_t --- cub/cub/detail/type_traits.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh index df68d7c77ac..9d0678ba7e2 100644 --- a/cub/cub/detail/type_traits.cuh +++ b/cub/cub/detail/type_traits.cuh @@ -197,7 +197,7 @@ inline constexpr bool is_arithmetic_cuda_floating_point_v = // - promote small integer types to their corresponding 32-bit promotion type // - address the incompatibility between linux/windows for int/long template -using normalize_integer_t = _CUDA_VSTD::_If< +using signed_promotion_t = _CUDA_VSTD::_If< _CUDA_VSTD::__cccl_is_signed_integer_v && sizeof(T) <= sizeof(int), int, _CUDA_VSTD::_If<_CUDA_VSTD::__cccl_is_unsigned_integer_v && sizeof(T) <= sizeof(uint32_t), uint32_t, T>>; From 1b12b7d7d2ea6bd22f9e67ec5afdaa61d3f829c6 Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 20 May 2025 09:51:43 -0700 Subject: [PATCH 11/12] add logical_and/or --- cub/cub/thread/thread_operators.cuh | 33 +++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index bb110fabe71..812e8231acb 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -510,6 +510,36 @@ inline constexpr bool is_cuda_std_bit_xor_v<_CUDA_VSTD::bit_xor<>, T> = true; template <> inline constexpr bool is_cuda_std_bit_xor_v<_CUDA_VSTD::bit_xor<>, void> = true; +template +inline constexpr bool is_cuda_std_logical_and_v = false; + +template <> +inline constexpr bool is_cuda_std_logical_and_v<_CUDA_VSTD::logical_and, void> = true; + +template <> +inline constexpr bool is_cuda_std_logical_and_v<_CUDA_VSTD::logical_and, bool> = true; + +template <> +inline constexpr bool is_cuda_std_logical_and_v<_CUDA_VSTD::logical_and<>, bool> = true; + +template <> +inline constexpr bool is_cuda_std_logical_and_v<_CUDA_VSTD::logical_and<>, void> = true; + +template +inline constexpr bool is_cuda_std_logical_or_v = false; + +template <> +inline constexpr bool is_cuda_std_logical_or_v<_CUDA_VSTD::logical_or, void> = true; + +template <> +inline constexpr bool is_cuda_std_logical_or_v<_CUDA_VSTD::logical_or, bool> = true; + +template <> +inline constexpr bool is_cuda_std_logical_or_v<_CUDA_VSTD::logical_or<>, bool> = true; + +template <> +inline constexpr bool is_cuda_std_logical_or_v<_CUDA_VSTD::logical_or<>, void> = true; + template inline constexpr bool is_cuda_minimum_maximum_v = is_cuda_maximum_v || is_cuda_minimum_v; @@ -521,7 +551,7 @@ inline constexpr bool is_cuda_std_bitwise_v = is_cuda_std_bit_and_v || is_cuda_std_bit_or_v || is_cuda_std_bit_xor_v; template -inline constexpr bool is_cuda_operator_v = +inline constexpr bool is_simd_enabled_cuda_operator = is_cuda_minimum_maximum_v || // is_cuda_std_plus_mul_v || // is_cuda_std_bitwise_v; @@ -532,7 +562,6 @@ inline constexpr bool is_cuda_operator_v = template struct GeneralizeOperator { - static_assert(is_cuda_operator_v); using type = Op; }; From 4f4062a1c3264b68aad5781e9674c68facc0b6bb Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 20 May 2025 09:51:55 -0700 Subject: [PATCH 12/12] use is_simd_enabled_cuda_operator --- cub/cub/thread/thread_reduce.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index 0b4b68dea76..d2a62c01c6f 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -406,7 +406,8 @@ template , int, AccumT>; // TODO: should be part of the tuning policy - if constexpr ((!is_cuda_operator_v && !is_simd_operator_v) || sizeof(ValueT) >= 8) + if constexpr ((!is_simd_enabled_cuda_operator && !is_simd_operator_v) + || sizeof(ValueT) >= 8) { return ThreadReduceSequential(input, reduction_op); }