|
22 | 22 | #endif // no system header
|
23 | 23 |
|
24 | 24 | #include <cuda/std/__bit/bit_cast.h>
|
25 |
| -#include <cuda/std/__bit/clz.h> |
26 | 25 | #include <cuda/std/__concepts/concept_macros.h>
|
27 | 26 | #include <cuda/std/__type_traits/conditional.h>
|
28 | 27 | #include <cuda/std/__type_traits/is_constant_evaluated.h>
|
| 28 | +#include <cuda/std/__type_traits/is_same.h> |
29 | 29 | #include <cuda/std/__type_traits/is_unsigned_integer.h>
|
30 | 30 | #include <cuda/std/cstdint>
|
31 | 31 | #include <cuda/std/limits>
|
32 | 32 |
|
| 33 | +#if _CCCL_COMPILER(MSVC) |
| 34 | +# include <intrin.h> |
| 35 | +#endif // _CCCL_COMPILER(MSVC) |
| 36 | + |
33 | 37 | _LIBCUDACXX_BEGIN_NAMESPACE_STD
|
34 | 38 |
|
35 |
| -_CCCL_TEMPLATE(class _Tp) |
36 |
| -_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp) _CCCL_AND(sizeof(_Tp) <= sizeof(uint64_t))) |
37 |
| -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __countl_zero(_Tp __t) noexcept |
| 39 | +template <typename _Tp> |
| 40 | +[[nodiscard]] _LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_countl_zero_impl_constexpr(_Tp __v) noexcept |
38 | 41 | {
|
39 |
| - using _Sp = _If<sizeof(_Tp) <= sizeof(uint32_t), uint32_t, uint64_t>; |
40 |
| - constexpr auto __digits_diff = numeric_limits<_Sp>::digits - numeric_limits<_Tp>::digits; |
41 |
| - return _CUDA_VSTD::__cccl_clz(static_cast<_Sp>(__t)) - __digits_diff; |
42 |
| -} |
| 42 | + constexpr auto __digits = numeric_limits<_Tp>::digits; |
43 | 43 |
|
44 |
| -_CCCL_TEMPLATE(class _Tp) |
45 |
| -_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp) _CCCL_AND(sizeof(_Tp) > sizeof(uint64_t))) |
46 |
| -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __countl_zero(_Tp __t) noexcept |
47 |
| -{ |
48 |
| - constexpr int _Ratio = sizeof(_Tp) / sizeof(uint64_t); |
49 |
| - for (int __i = _Ratio - 1; __i >= 0; --__i) |
| 44 | + if (__v == 0) |
| 45 | + { |
| 46 | + return __digits; |
| 47 | + } |
| 48 | + |
| 49 | + if constexpr (sizeof(_Tp) == sizeof(uint32_t)) |
50 | 50 | {
|
51 |
| - auto __value64 = static_cast<uint64_t>(__t >> (__i * numeric_limits<uint64_t>::digits)); |
52 |
| - if (static_cast<uint64_t>(__value64)) |
| 51 | +#if defined(_CCCL_BUILTIN_CLZ) |
| 52 | + return _CCCL_BUILTIN_CLZ(__v); |
| 53 | +#else // ^^^ _CCCL_BUILTIN_CLZ ^^^ // vvv !_CCCL_BUILTIN_CLZ vvv |
| 54 | + uint32_t __res = 0; |
| 55 | + for (uint32_t __i = __digits / 2; __i >= 1; __i /= 2) |
53 | 56 | {
|
54 |
| - return _CUDA_VSTD::__countl_zero(__value64) + (_Ratio - 1 - __i) * numeric_limits<uint64_t>::digits; |
| 57 | + const auto __mark = (~uint32_t{0} >> (__digits - __i)) << __i; |
| 58 | + if (__v & __mark) |
| 59 | + { |
| 60 | + __v >>= __i; |
| 61 | + __res |= __i; |
| 62 | + } |
55 | 63 | }
|
| 64 | + return __digits - 1 - __res; |
| 65 | +#endif // ^^^ !_CCCL_BUILTIN_CLZ ^^^ |
| 66 | + } |
| 67 | + else |
| 68 | + { |
| 69 | +#if defined(_CCCL_BUILTIN_CLZLL) |
| 70 | + return _CCCL_BUILTIN_CLZLL(__v); |
| 71 | +#else // ^^^ _CCCL_BUILTIN_CLZLL ^^^ // vvv !_CCCL_BUILTIN_CLZLL vvv |
| 72 | + const auto __hi = static_cast<uint32_t>(__v >> 32); |
| 73 | + const auto __lo = static_cast<uint32_t>(__v); |
| 74 | + return (__hi != 0) ? _CUDA_VSTD::__cccl_countl_zero_impl_constexpr(__hi) |
| 75 | + : (numeric_limits<uint32_t>::digits + _CUDA_VSTD::__cccl_countl_zero_impl_constexpr(__lo)); |
| 76 | +#endif // ^^^ !_CCCL_BUILTIN_CLZLL ^^^ |
56 | 77 | }
|
57 |
| - return numeric_limits<_Tp>::digits; |
| 78 | +} |
| 79 | + |
| 80 | +#if !_CCCL_COMPILER(NVRTC) |
| 81 | +template <typename _Tp> |
| 82 | +[[nodiscard]] _CCCL_HIDE_FROM_ABI _CCCL_HOST int __cccl_countl_zero_impl_host(_Tp __v) noexcept |
| 83 | +{ |
| 84 | +# if _CCCL_COMPILER(MSVC) |
| 85 | + constexpr auto __digits = numeric_limits<_Tp>::digits; |
| 86 | + unsigned long __where{}; |
| 87 | + const auto __res = sizeof(_Tp) == sizeof(uint32_t) |
| 88 | + ? ::_BitScanReverse(&__where, static_cast<uint32_t>(__v)) |
| 89 | + : ::_BitScanReverse64(&__where, static_cast<uint64_t>(__v)); |
| 90 | + return (__res) ? (__digits - 1 - static_cast<int>(__where)) : __digits; |
| 91 | +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ // vvv !_CCCL_COMPILER(MSVC) vvv |
| 92 | + return _CUDA_VSTD::__cccl_countl_zero_impl_constexpr(__v); |
| 93 | +# endif // ^^^ !_CCCL_COMPILER(MSVC) ^^^ |
| 94 | +} |
| 95 | +#endif // !_CCCL_COMPILER(NVRTC) |
| 96 | + |
| 97 | +#if _CCCL_HAS_CUDA_COMPILER() |
| 98 | +template <typename _Tp> |
| 99 | +[[nodiscard]] _CCCL_HIDE_FROM_ABI _CCCL_DEVICE int __cccl_countl_zero_impl_device(_Tp __v) noexcept |
| 100 | +{ |
| 101 | + return (sizeof(_Tp) == sizeof(uint32_t)) ? ::__clz(static_cast<int>(__v)) : ::__clzll(static_cast<long long>(__v)); |
| 102 | +} |
| 103 | +#endif // _CCCL_HAS_CUDA_COMPILER() |
| 104 | + |
| 105 | +template <typename _Tp> |
| 106 | +[[nodiscard]] _LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_countl_zero_impl(_Tp __v) noexcept |
| 107 | +{ |
| 108 | + static_assert(is_same_v<_Tp, uint32_t> || is_same_v<_Tp, uint64_t>); |
| 109 | + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) |
| 110 | + { |
| 111 | + NV_IF_ELSE_TARGET(NV_IS_HOST, |
| 112 | + (return _CUDA_VSTD::__cccl_countl_zero_impl_host(__v);), |
| 113 | + (return _CUDA_VSTD::__cccl_countl_zero_impl_device(__v);)); |
| 114 | + } |
| 115 | + return _CUDA_VSTD::__cccl_countl_zero_impl_constexpr(__v); |
58 | 116 | }
|
59 | 117 |
|
60 | 118 | _CCCL_TEMPLATE(class _Tp)
|
61 | 119 | _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
|
62 |
| -[[nodiscard]] _LIBCUDACXX_HIDE_FROM_ABI constexpr int countl_zero(_Tp __t) noexcept |
| 120 | +[[nodiscard]] _LIBCUDACXX_HIDE_FROM_ABI constexpr int countl_zero(_Tp __v) noexcept |
63 | 121 | {
|
64 |
| - auto __ret = _CUDA_VSTD::__countl_zero(static_cast<_Tp>(__t)); |
65 |
| - _CCCL_ASSUME(__ret >= 0 && __ret <= numeric_limits<_Tp>::digits); |
66 |
| - return __ret; |
| 122 | + int __count{}; |
| 123 | +#if defined(_CCCL_BUILTIN_CLZG) |
| 124 | + __count = _CCCL_BUILTIN_CLZG(__v, numeric_limits<_Tp>::digits); |
| 125 | +#else // ^^^ _CCCL_BUILTIN_CLZG ^^^ // vvv !_CCCL_BUILTIN_CLZG vvv |
| 126 | + if constexpr (sizeof(_Tp) <= sizeof(uint64_t)) |
| 127 | + { |
| 128 | + using _Sp = _If<sizeof(_Tp) <= sizeof(uint32_t), uint32_t, uint64_t>; |
| 129 | + constexpr auto __digits_diff = numeric_limits<_Sp>::digits - numeric_limits<_Tp>::digits; |
| 130 | + __count = _CUDA_VSTD::__cccl_countl_zero_impl(static_cast<_Sp>(__v)) - __digits_diff; |
| 131 | + } |
| 132 | + else |
| 133 | + { |
| 134 | + constexpr int _Ratio = sizeof(_Tp) / sizeof(uint64_t); |
| 135 | + |
| 136 | + _CCCL_PRAGMA_UNROLL_FULL() |
| 137 | + for (int __i = _Ratio - 1; __i >= 0; --__i) |
| 138 | + { |
| 139 | + const auto __value64 = static_cast<uint64_t>(__v >> (__i * numeric_limits<uint64_t>::digits)); |
| 140 | + if (__value64 != 0) |
| 141 | + { |
| 142 | + __count += _CUDA_VSTD::countl_zero(__value64); |
| 143 | + break; |
| 144 | + } |
| 145 | + __count += numeric_limits<uint64_t>::digits; |
| 146 | + } |
| 147 | + } |
| 148 | +#endif // ^^^ !_CCCL_BUILTIN_CLZG ^^^ |
| 149 | + |
| 150 | + _CCCL_ASSUME(__count >= 0 && __count <= numeric_limits<_Tp>::digits); |
| 151 | + return __count; |
67 | 152 | }
|
68 | 153 |
|
69 | 154 | _CCCL_TEMPLATE(class _Tp)
|
70 | 155 | _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
|
71 |
| -[[nodiscard]] _LIBCUDACXX_HIDE_FROM_ABI constexpr int countl_one(_Tp __t) noexcept |
| 156 | +[[nodiscard]] _LIBCUDACXX_HIDE_FROM_ABI constexpr int countl_one(_Tp __v) noexcept |
72 | 157 | {
|
73 |
| - return _CUDA_VSTD::countl_zero(static_cast<_Tp>(~__t)); |
| 158 | + return _CUDA_VSTD::countl_zero(static_cast<_Tp>(~__v)); |
74 | 159 | }
|
75 | 160 |
|
76 | 161 | _LIBCUDACXX_END_NAMESPACE_STD
|
|
0 commit comments