Skip to content

Split Optimize Warp Reduce PR - CUB part #4716

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
May 20, 2025
4 changes: 3 additions & 1 deletion cub/cub/block/radix_rank_sort_operations.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
#include <thrust/type_traits/integer_sequence.h>

#include <cuda/bit>
#include <cuda/functional>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h>
#include <cuda/std/cstdint>
Expand Down Expand Up @@ -228,7 +229,8 @@ struct is_tuple_of_references_to_fundamental_types_t< //
{};

template <class KeyT, class DecomposerT>
using decomposer_check_t = is_tuple_of_references_to_fundamental_types_t<invoke_result_t<DecomposerT, KeyT&>>;
using decomposer_check_t =
is_tuple_of_references_to_fundamental_types_t<_CUDA_VSTD::invoke_result_t<DecomposerT, KeyT&>>;

template <class T>
struct bit_ordered_conversion_policy_t
Expand Down
46 changes: 20 additions & 26 deletions cub/cub/detail/array_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
/******************************************************************************
* Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved.
/***********************************************************************************************************************
* Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
* following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
Expand All @@ -12,18 +12,15 @@
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
**********************************************************************************************************************/

#pragma once

Expand All @@ -42,13 +39,11 @@

#include <cuda/std/array> // array
#include <cuda/std/cstddef> // size_t
#include <cuda/std/iterator> // cuda::std::iter_value_t
#include <cuda/std/iterator> // _CUDA_VSTD::iter_value_t
#include <cuda/std/type_traits> // _If
#include <cuda/std/utility> // index_sequence

CUB_NAMESPACE_BEGIN

/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
namespace detail
{

Expand All @@ -58,25 +53,24 @@ namespace detail
* Generic Array-like to Array Conversion
**********************************************************************************************************************/

template <typename CastType, typename Input, ::cuda::std::size_t... i>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::array<CastType, cub::detail::static_size_v<Input>>
to_array_impl(const Input& input, ::cuda::std::index_sequence<i...>)
template <typename CastType, typename Input, _CUDA_VSTD::size_t... i>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE _CUDA_VSTD::array<CastType, static_size_v<Input>>
to_array_impl(const Input& input, _CUDA_VSTD::index_sequence<i...>)
{
using ArrayType = ::cuda::std::array<CastType, static_size_v<Input>>;
using ArrayType = _CUDA_VSTD::array<CastType, static_size_v<Input>>;
return ArrayType{static_cast<CastType>(input[i])...};
}

template <typename CastType = void, typename Input>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::array<CastType, cub::detail::static_size_v<Input>>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE _CUDA_VSTD::array<CastType, static_size_v<Input>>
to_array(const Input& input)
{
using InputType = cuda::std::iter_value_t<Input>;
using CastType1 = ::cuda::std::_If<::cuda::std::is_same_v<CastType, void>, InputType, CastType>;
return to_array_impl<CastType1>(input, ::cuda::std::make_index_sequence<static_size_v<Input>>{});
using InputType = _CUDA_VSTD::iter_value_t<Input>;
using CastType1 = _CUDA_VSTD::_If<_CUDA_VSTD::is_same_v<CastType, void>, InputType, CastType>;
return to_array_impl<CastType1>(input, _CUDA_VSTD::make_index_sequence<static_size_v<Input>>{});
}

#endif // !_CCCL_DOXYGEN_INVOKED

} // namespace detail

CUB_NAMESPACE_END
119 changes: 100 additions & 19 deletions cub/cub/detail/type_traits.cuh
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
/******************************************************************************
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
/***********************************************************************************************************************
* Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
* following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
Expand All @@ -12,18 +12,15 @@
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
**********************************************************************************************************************/

/**
* \file
Expand Down Expand Up @@ -51,15 +48,12 @@
#include <cuda/std/functional>
#include <cuda/std/mdspan>
#include <cuda/std/span>
#include <cuda/std/type_traits>
#include <cuda/std/type_traits> // is_same_v

CUB_NAMESPACE_BEGIN
namespace detail
{

template <typename Invokable, typename... Args>
using invoke_result_t = _CUDA_VSTD::invoke_result_t<Invokable, Args...>;

template <typename T, typename... TArgs>
inline constexpr bool is_one_of_v = (_CCCL_TRAIT(_CUDA_VSTD::is_same, T, TArgs) || ...);

Expand Down Expand Up @@ -119,6 +113,93 @@ inline constexpr int static_size_v<_CUDA_VSTD::mdspan<T, E, L, A>> =
template <typename T>
using implicit_prom_t = decltype(+T{});

} // namespace detail
/***********************************************************************************************************************
* Extended floating point traits
**********************************************************************************************************************/
// half

template <typename>
inline constexpr bool is_half_base_v = false;

template <typename>
inline constexpr bool is_half2_base_v = false;

#if _CCCL_HAS_NVFP16()

template <>
inline constexpr bool is_half_base_v<__half> = true;

template <>
inline constexpr bool is_half2_base_v<__half2> = true;

#endif // _CCCL_HAS_NVFP16

template <typename T>
inline constexpr bool is_half_v = is_half_base_v<_CUDA_VSTD::remove_cv_t<T>>;

template <typename T>
inline constexpr bool is_half2_v = is_half2_base_v<_CUDA_VSTD::remove_cv_t<T>>;

template <typename T>
inline constexpr bool is_any_half_v = is_half_base_v<T> || is_half2_base_v<T>;

//----------------------------------------------------------------------------------------------------------------------
// bfloat16

template <typename>
inline constexpr bool is_bfloat16_base_v = false;

template <typename>
inline constexpr bool is_bfloat16x2_base_v = false;

#if _CCCL_HAS_NVBF16()

template <>
inline constexpr bool is_bfloat16_base_v<__nv_bfloat16> = true;

template <>
inline constexpr bool is_bfloat16x2_base_v<__nv_bfloat162> = true;

#endif // _CCCL_HAS_NVBF16

template <typename T>
inline constexpr bool is_bfloat16_v = is_bfloat16_base_v<_CUDA_VSTD::remove_cv_t<T>>;

template <typename T>
inline constexpr bool is_bfloat16x2_v = is_bfloat16x2_base_v<_CUDA_VSTD::remove_cv_t<T>>;

template <typename T>
inline constexpr bool is_any_bfloat16_v = is_bfloat16_v<T> || is_bfloat16x2_v<T>;

//----------------------------------------------------------------------------------------------------------------------
// short2/ushort2

template <typename T>
inline constexpr bool is_any_short2_base_v = false;

template <>
inline constexpr bool is_any_short2_base_v<short2> = true;

template <>
inline constexpr bool is_any_short2_base_v<ushort2> = true;

template <typename T>
inline constexpr bool is_any_short2_v = is_any_short2_base_v<_CUDA_VSTD::remove_cv_t<T>>;

//----------------------------------------------------------------------------------------------------------------------

// half/bfloat16
template <typename T>
inline constexpr bool is_arithmetic_cuda_floating_point_v =
is_any_half_v<T> || is_any_bfloat16_v<T> || _CUDA_VSTD::is_floating_point_v<T>;

// - promote small integer types to their corresponding 32-bit promotion type
// - address the incompatibility between linux/windows for int/long
template <typename T>
using normalize_integer_t = _CUDA_VSTD::_If<
_CUDA_VSTD::__cccl_is_signed_integer_v<T> && sizeof(T) <= sizeof(int),
int,
_CUDA_VSTD::_If<_CUDA_VSTD::__cccl_is_unsigned_integer_v<T> && sizeof(T) <= sizeof(uint32_t), uint32_t, T>>;

} // namespace detail
CUB_NAMESPACE_END
56 changes: 56 additions & 0 deletions cub/cub/detail/unsafe_bitcast.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/***********************************************************************************************************************
* Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
* following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**********************************************************************************************************************/

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

CUB_NAMESPACE_BEGIN
namespace detail
{

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document

// NOTE: bit_cast cannot be always used because __half, __nv_bfloat16, etc. are not trivially copyable
template <typename Output, typename Input>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE Output unsafe_bitcast(const Input& input)
{
Output output;
static_assert(sizeof(input) == sizeof(output), "wrong size");
::memcpy(static_cast<void*>(&output), static_cast<const void*>(&input), sizeof(input));
return output;
}

#endif // !_CCCL_DOXYGEN_INVOKED

} // namespace detail
CUB_NAMESPACE_END
4 changes: 3 additions & 1 deletion cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <cuda/std/functional>

CUB_NAMESPACE_BEGIN

namespace detail::adjacent_difference
Expand Down Expand Up @@ -80,7 +82,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel(

// It is OK to introspect the return type or parameter types of the
// `operator()` function of `__device__` extended lambda within device code.
using OutputT = invoke_result_t<DifferenceOpT, InputT, InputT>;
using OutputT = _CUDA_VSTD::invoke_result_t<DifferenceOpT, InputT, InputT>;

using Agent =
AgentDifference<ActivePolicyT,
Expand Down
3 changes: 2 additions & 1 deletion cub/cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
#include <cub/util_type.cuh> // for cub::detail::non_void_value_t, cub::detail::value_t

#include <cuda/std/functional>
#include <cuda/std/iterator>

CUB_NAMESPACE_BEGIN

Expand Down Expand Up @@ -572,7 +573,7 @@ template <
typename InitT,
typename AccumT =
::cuda::std::__accumulator_t<ReductionOpT,
cub::detail::invoke_result_t<TransformOpT, cub::detail::it_value_t<InputIteratorT>>,
_CUDA_VSTD::invoke_result_t<TransformOpT, _CUDA_VSTD::iter_value_t<InputIteratorT>>,
InitT>,
typename PolicyHub = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>,
typename KernelSource = detail::reduce::DeviceReduceKernelSource<
Expand Down
Loading
Loading