Skip to content

Commit c9210c6

Browse files
authored
Refactor cuda::std::countl_* (NVIDIA#4469)
* Refactor `cuda::std::countl_*`
* disable `__builtin_clzg` for clang-cuda
* I should have just read the documentation
1 parent bf23f89 commit c9210c6

File tree

4 files changed

+120
-150
lines changed

4 files changed

+120
-150
lines changed

libcudacxx/include/cuda/std/__bit/clz.h

Lines changed: 0 additions & 124 deletions
This file was deleted.

libcudacxx/include/cuda/std/__bit/countl.h

Lines changed: 109 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -22,55 +22,140 @@
2222
#endif // no system header
2323

2424
#include <cuda/std/__bit/bit_cast.h>
25-
#include <cuda/std/__bit/clz.h>
2625
#include <cuda/std/__concepts/concept_macros.h>
2726
#include <cuda/std/__type_traits/conditional.h>
2827
#include <cuda/std/__type_traits/is_constant_evaluated.h>
28+
#include <cuda/std/__type_traits/is_same.h>
2929
#include <cuda/std/__type_traits/is_unsigned_integer.h>
3030
#include <cuda/std/cstdint>
3131
#include <cuda/std/limits>
3232

33+
#if _CCCL_COMPILER(MSVC)
34+
# include <intrin.h>
35+
#endif // _CCCL_COMPILER(MSVC)
36+
3337
_LIBCUDACXX_BEGIN_NAMESPACE_STD
3438

// Diff view of the constant-evaluation helper. Lines that follow an "NN-" marker are the removed `__countl_zero`
// overloads; lines that follow an "NN+" marker make up the new `__cccl_countl_zero_impl_constexpr`.
//
// `__cccl_countl_zero_impl_constexpr(__v)` returns the number of leading zero bits of `__v`, or
// `numeric_limits<_Tp>::digits` when `__v == 0`. It uses `_CCCL_BUILTIN_CLZ` / `_CCCL_BUILTIN_CLZLL` when those
// macros are defined and a portable bit search otherwise. The `else` branch shifts by 32 and therefore assumes a
// 64-bit `_Tp`; the dispatcher `__cccl_countl_zero_impl` in this file static_asserts that `_Tp` is `uint32_t` or
// `uint64_t`.
35-
_CCCL_TEMPLATE(class _Tp)
36-
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp) _CCCL_AND(sizeof(_Tp) <= sizeof(uint64_t)))
37-
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __countl_zero(_Tp __t) noexcept
39+
template <typename _Tp>
40+
[[nodiscard]] _LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_countl_zero_impl_constexpr(_Tp __v) noexcept
3841
{
39-
using _Sp = _If<sizeof(_Tp) <= sizeof(uint32_t), uint32_t, uint64_t>;
40-
constexpr auto __digits_diff = numeric_limits<_Sp>::digits - numeric_limits<_Tp>::digits;
41-
return _CUDA_VSTD::__cccl_clz(static_cast<_Sp>(__t)) - __digits_diff;
42-
}
42+
constexpr auto __digits = numeric_limits<_Tp>::digits;
4343

44-
_CCCL_TEMPLATE(class _Tp)
45-
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp) _CCCL_AND(sizeof(_Tp) > sizeof(uint64_t)))
46-
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __countl_zero(_Tp __t) noexcept
47-
{
48-
constexpr int _Ratio = sizeof(_Tp) / sizeof(uint64_t);
49-
for (int __i = _Ratio - 1; __i >= 0; --__i)
// Zero has no set bit: report the full width here so that 0 never reaches the clz builtins below
// (GCC documents the result of __builtin_clz(0) as undefined).
44+
if (__v == 0)
45+
{
46+
return __digits;
47+
}
48+
49+
if constexpr (sizeof(_Tp) == sizeof(uint32_t))
5050
{
51-
auto __value64 = static_cast<uint64_t>(__t >> (__i * numeric_limits<uint64_t>::digits));
52-
if (static_cast<uint64_t>(__value64))
51+
#if defined(_CCCL_BUILTIN_CLZ)
52+
return _CCCL_BUILTIN_CLZ(__v);
53+
#else // ^^^ _CCCL_BUILTIN_CLZ ^^^ // vvv !_CCCL_BUILTIN_CLZ vvv
// Portable fallback: binary search for the highest set bit. With __i = 16, 8, 4, 2, 1 the mask __mark selects
// bits [__i, 2 * __i); if any of them is set, the value is shifted down by __i and __i is recorded in __res.
// After the loop __res is the index of the highest set bit, so the leading-zero count is __digits - 1 - __res.
54+
uint32_t __res = 0;
55+
for (uint32_t __i = __digits / 2; __i >= 1; __i /= 2)
5356
{
54-
return _CUDA_VSTD::__countl_zero(__value64) + (_Ratio - 1 - __i) * numeric_limits<uint64_t>::digits;
57+
const auto __mark = (~uint32_t{0} >> (__digits - __i)) << __i;
58+
if (__v & __mark)
59+
{
60+
__v >>= __i;
61+
__res |= __i;
62+
}
5563
}
64+
return __digits - 1 - __res;
65+
#endif // ^^^ !_CCCL_BUILTIN_CLZ ^^^
66+
}
67+
else
68+
{
69+
#if defined(_CCCL_BUILTIN_CLZLL)
70+
return _CCCL_BUILTIN_CLZLL(__v);
71+
#else // ^^^ _CCCL_BUILTIN_CLZLL ^^^ // vvv !_CCCL_BUILTIN_CLZLL vvv
// Portable fallback: split into two 32-bit halves and recurse into the 32-bit branch. If the high half is
// non-zero it alone determines the result; otherwise all 32 high bits are zero and the low half is counted.
72+
const auto __hi = static_cast<uint32_t>(__v >> 32);
73+
const auto __lo = static_cast<uint32_t>(__v);
74+
return (__hi != 0) ? _CUDA_VSTD::__cccl_countl_zero_impl_constexpr(__hi)
75+
: (numeric_limits<uint32_t>::digits + _CUDA_VSTD::__cccl_countl_zero_impl_constexpr(__lo));
76+
#endif // ^^^ !_CCCL_BUILTIN_CLZLL ^^^
5677
}
57-
return numeric_limits<_Tp>::digits;
78+
}
79+
// Host-only run-time implementation of count-leading-zeros; compiled out entirely under NVRTC.
// On MSVC it uses the _BitScanReverse / _BitScanReverse64 intrinsics, selected by sizeof(_Tp); on every other
// compiler it forwards to __cccl_countl_zero_impl_constexpr.
// Returns numeric_limits<_Tp>::digits when the intrinsic reports that no bit is set.
80+
#if !_CCCL_COMPILER(NVRTC)
81+
template <typename _Tp>
82+
[[nodiscard]] _CCCL_HIDE_FROM_ABI _CCCL_HOST int __cccl_countl_zero_impl_host(_Tp __v) noexcept
83+
{
84+
# if _CCCL_COMPILER(MSVC)
85+
constexpr auto __digits = numeric_limits<_Tp>::digits;
// __where receives the bit index of the highest set bit; the intrinsic's return value is non-zero only when
// a set bit was found (per the MSVC intrinsic documentation).
86+
unsigned long __where{};
87+
const auto __res = sizeof(_Tp) == sizeof(uint32_t)
88+
? ::_BitScanReverse(&__where, static_cast<uint32_t>(__v))
89+
: ::_BitScanReverse64(&__where, static_cast<uint64_t>(__v));
// Convert "index of highest set bit" into "number of zero bits above it"; zero input yields the full width.
90+
return (__res) ? (__digits - 1 - static_cast<int>(__where)) : __digits;
91+
# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ // vvv !_CCCL_COMPILER(MSVC) vvv
92+
return _CUDA_VSTD::__cccl_countl_zero_impl_constexpr(__v);
93+
# endif // ^^^ !_CCCL_COMPILER(MSVC) ^^^
94+
}
95+
#endif // !_CCCL_COMPILER(NVRTC)
96+
// Device-only run-time implementation of count-leading-zeros; only declared when a CUDA compiler is in use.
// Selects the CUDA integer intrinsic by operand size: ::__clz for 32-bit values, ::__clzll otherwise.
// The casts to int / long long adapt the unsigned argument to the signed parameter types those intrinsics take.
97+
#if _CCCL_HAS_CUDA_COMPILER()
98+
template <typename _Tp>
99+
[[nodiscard]] _CCCL_HIDE_FROM_ABI _CCCL_DEVICE int __cccl_countl_zero_impl_device(_Tp __v) noexcept
100+
{
101+
return (sizeof(_Tp) == sizeof(uint32_t)) ? ::__clz(static_cast<int>(__v)) : ::__clzll(static_cast<long long>(__v));
102+
}
103+
#endif // _CCCL_HAS_CUDA_COMPILER()
104+
// Dispatcher for the 32-bit and 64-bit count-leading-zeros implementations.
// Accepts only uint32_t or uint64_t (enforced by the static_assert). During constant evaluation it uses
// __cccl_countl_zero_impl_constexpr; at run time it picks the host or device implementation for the current
// compilation target.
105+
template <typename _Tp>
106+
[[nodiscard]] _LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_countl_zero_impl(_Tp __v) noexcept
107+
{
108+
static_assert(is_same_v<_Tp, uint32_t> || is_same_v<_Tp, uint64_t>);
// Run-time path: both NV_IF_ELSE_TARGET arms return, so the fall-through below is the constant-evaluated path.
109+
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
110+
{
111+
NV_IF_ELSE_TARGET(NV_IS_HOST,
112+
(return _CUDA_VSTD::__cccl_countl_zero_impl_host(__v);),
113+
(return _CUDA_VSTD::__cccl_countl_zero_impl_device(__v);));
114+
}
115+
return _CUDA_VSTD::__cccl_countl_zero_impl_constexpr(__v);
58116
}
59117

// Diff view of the public `countl_zero`. Lines after an "NN-" marker are the removed body, lines after an "NN+"
// marker are the new body.
//
// countl_zero(__v): number of consecutive zero bits of `__v` starting from the most significant bit, for any type
// satisfying __cccl_is_unsigned_integer. The result lies in [0, numeric_limits<_Tp>::digits], as stated by the
// _CCCL_ASSUME at the end.
60118
_CCCL_TEMPLATE(class _Tp)
61119
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
62-
[[nodiscard]] _LIBCUDACXX_HIDE_FROM_ABI constexpr int countl_zero(_Tp __t) noexcept
120+
[[nodiscard]] _LIBCUDACXX_HIDE_FROM_ABI constexpr int countl_zero(_Tp __v) noexcept
63121
{
64-
auto __ret = _CUDA_VSTD::__countl_zero(static_cast<_Tp>(__t));
65-
_CCCL_ASSUME(__ret >= 0 && __ret <= numeric_limits<_Tp>::digits);
66-
return __ret;
122+
int __count{};
// Preferred path: the type-generic builtin; its second argument is the value it yields for a zero input.
123+
#if defined(_CCCL_BUILTIN_CLZG)
124+
__count = _CCCL_BUILTIN_CLZG(__v, numeric_limits<_Tp>::digits);
125+
#else // ^^^ _CCCL_BUILTIN_CLZG ^^^ // vvv !_CCCL_BUILTIN_CLZG vvv
// Types up to 64 bits: widen to uint32_t or uint64_t, count there, then subtract the zero bits that the
// widening itself introduced (__digits_diff).
126+
if constexpr (sizeof(_Tp) <= sizeof(uint64_t))
127+
{
128+
using _Sp = _If<sizeof(_Tp) <= sizeof(uint32_t), uint32_t, uint64_t>;
129+
constexpr auto __digits_diff = numeric_limits<_Sp>::digits - numeric_limits<_Tp>::digits;
130+
__count = _CUDA_VSTD::__cccl_countl_zero_impl(static_cast<_Sp>(__v)) - __digits_diff;
131+
}
// Wider types: walk the 64-bit chunks from most to least significant. Each all-zero chunk adds 64; the first
// non-zero chunk adds its own leading-zero count and ends the scan. If every chunk is zero the total equals the
// full width of _Tp.
132+
else
133+
{
134+
constexpr int _Ratio = sizeof(_Tp) / sizeof(uint64_t);
135+
136+
_CCCL_PRAGMA_UNROLL_FULL()
137+
for (int __i = _Ratio - 1; __i >= 0; --__i)
138+
{
139+
const auto __value64 = static_cast<uint64_t>(__v >> (__i * numeric_limits<uint64_t>::digits));
140+
if (__value64 != 0)
141+
{
142+
__count += _CUDA_VSTD::countl_zero(__value64);
143+
break;
144+
}
145+
__count += numeric_limits<uint64_t>::digits;
146+
}
147+
}
148+
#endif // ^^^ !_CCCL_BUILTIN_CLZG ^^^
149+
// Tell the optimizer the valid result range.
150+
_CCCL_ASSUME(__count >= 0 && __count <= numeric_limits<_Tp>::digits);
151+
return __count;
67152
}
68153

// Diff view of the public `countl_one`; the only change is the parameter rename from `__t` ("NN-") to `__v` ("NN+").
//
// countl_one(__v): number of consecutive one bits of `__v` starting from the most significant bit, computed as the
// leading-zero count of the bitwise complement. The static_cast back to _Tp matters for types narrower than int,
// where `~` operates on the promoted value and would otherwise set bits above the width of _Tp.
69154
_CCCL_TEMPLATE(class _Tp)
70155
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
71-
[[nodiscard]] _LIBCUDACXX_HIDE_FROM_ABI constexpr int countl_one(_Tp __t) noexcept
156+
[[nodiscard]] _LIBCUDACXX_HIDE_FROM_ABI constexpr int countl_one(_Tp __v) noexcept
72157
{
73-
return _CUDA_VSTD::countl_zero(static_cast<_Tp>(~__t));
158+
return _CUDA_VSTD::countl_zero(static_cast<_Tp>(~__v));
74159
}
75160

76161
_LIBCUDACXX_END_NAMESPACE_STD

libcudacxx/include/cuda/std/__cccl/builtin.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@
207207
# define _CCCL_BUILTIN_POPCOUNTG(...) __builtin_popcountg(__VA_ARGS__)
208208
#endif // _CCCL_CHECK_BUILTIN(builtin_popcountg)
209209

210-
// NVCC cannot handle __builtin_popcountg
210+
// nvcc cannot handle __builtin_popcountg
211211
#if _CCCL_CUDA_COMPILER(NVCC)
212212
# undef _CCCL_BUILTIN_POPCOUNTG
213213
#endif // _CCCL_CUDA_COMPILER(NVCC)
@@ -217,6 +217,15 @@
217217
# define _CCCL_BUILTIN_CLZLL(...) __builtin_clzll(__VA_ARGS__)
218218
#endif // _CCCL_CHECK_BUILTIN(builtin_clz)
219219

// _CCCL_BUILTIN_CLZG wraps the type-generic __builtin_clzg when the compiler reports support for it.
// cuda::std::countl_zero (countl.h) calls it with two arguments: the value and numeric_limits<_Tp>::digits.
// NOTE(review): the commit message mentions disabling __builtin_clzg for clang-cuda, but only the NVCC case is
// undefined below - confirm whether that is intended.
220+
#if _CCCL_CHECK_BUILTIN(builtin_clzg)
221+
# define _CCCL_BUILTIN_CLZG(...) __builtin_clzg(__VA_ARGS__)
222+
#endif // _CCCL_CHECK_BUILTIN(builtin_clzg)
223+
224+
// nvcc cannot handle __builtin_clzg
225+
#if _CCCL_CUDA_COMPILER(NVCC)
226+
# undef _CCCL_BUILTIN_CLZG
227+
#endif // _CCCL_CUDA_COMPILER(NVCC)
228+
220229
#if _CCCL_CHECK_BUILTIN(builtin_ctz) || _CCCL_COMPILER(GCC, <, 10) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC)
221230
# define _CCCL_BUILTIN_CTZ(...) __builtin_ctz(__VA_ARGS__)
222231
# define _CCCL_BUILTIN_CTZLL(...) __builtin_ctzll(__VA_ARGS__)

libcudacxx/include/cuda/std/detail/libcxx/include/algorithm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1070,7 +1070,7 @@ typename uniform_int_distribution<_IntType>::result_type uniform_int_distributio
10701070
{
10711071
return static_cast<result_type>(_Eng(__g, _Dt)());
10721072
}
1073-
size_t __w = _Dt - __cccl_clz(_Rp) - 1;
1073+
size_t __w = _Dt - _CUDA_VSTD::countl_zero(_Rp) - 1;
10741074
if ((_Rp & (std::numeric_limits<_UIntType>::max() >> (_Dt - __w))) != 0)
10751075
{
10761076
++__w;

0 commit comments

Comments
 (0)