Add ceil_ilog2 (NVIDIA#4485)

fbusato · web-flow · commit c6923df4c5fa · 2025-04-22T08:29:12.000+02:00
diff --git a/docs/libcudacxx/extended_api/math/ilog.rst b/docs/libcudacxx/extended_api/math/ilog.rst
@@ -3,27 +3,34 @@
 ``cuda::ilog2`` and ``cuda::ilog10``
 ====================================
 
-.. code:: cpp
+.. code:: cuda
 
    template <typename T>
    [[nodiscard]] __host__ __device__ inline constexpr
    int ilog2(T value) noexcept;
 
+.. code:: cuda
+
+   template <typename T>
+   [[nodiscard]] __host__ __device__ inline constexpr
+   int ceil_ilog2(T value) noexcept;
+
 .. code:: cuda
 
    template <typename T>
    [[nodiscard]] __host__ __device__ inline constexpr
    int ilog10(T value) noexcept;
 
-The functions compute the logarithm to the base 2 and 10 respectively of an integer value.
+``ilog2`` and ``ceil_ilog2`` compute the base-2 logarithm of an integer value, rounded down and up respectively; ``ilog10`` computes the base-10 logarithm, rounded down.
 
 **Parameters**
 
 - ``value``: The input value.
 
 **Return value**
 
-- The logarithm to the base 2 and 10 respectively, rounded down to the nearest integer.
+- ``ilog2``, ``ceil_ilog2``: The logarithm to the base 2, rounded down and up to the nearest integer respectively.
+- ``ilog10``: The logarithm to the base 10, rounded down to the nearest integer.
 
 **Constraints**
 
@@ -38,6 +45,7 @@ The functions compute the logarithm to the base 2 and 10 respectively of an inte
 The function performs the following operations in device code:
 
 - ``ilog2``: ``FLO``
+- ``ceil_ilog2``: ``FLO``, ``POPC``, ``ADD``, comparison
 - ``ilog10``: ``FLO``, ``FMUL``, ``F2I``, constant memory lookup, ``SEL`` + ``IADD`` only if ``T == uint32_t`` or ``T == __uint128_t``
 
 Example
@@ -50,7 +58,9 @@ Example
 
     __global__ void ilog_kernel() {
         assert(cuda::ilog2(20) == 4);
+        assert(cuda::ceil_ilog2(20) == 5);
         assert(cuda::ilog2(32) == 5);
+        assert(cuda::ceil_ilog2(32) == 5);
         assert(cuda::ilog10(100) == 2);
         assert(cuda::ilog10(2000) == 3);
     }
@@ -61,4 +71,4 @@ Example
         return 0;
     }
 
-`See it on Godbolt 🔗 <https://godbolt.org/z/nndYnTWer>`_
+`See it on Godbolt 🔗 <https://godbolt.org/z/nqrYvrGTq>`_
diff --git a/libcudacxx/include/cuda/__cmath/ilog.h b/libcudacxx/include/cuda/__cmath/ilog.h
@@ -21,6 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/std/__bit/has_single_bit.h>
 #include <cuda/std/__bit/integral.h>
 #include <cuda/std/__cmath/rounding_functions.h>
 #include <cuda/std/__concepts/concept_macros.h>
@@ -41,9 +42,17 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int ilog2(_Tp __t) noexcept
 {
   using _Up = _CUDA_VSTD::make_unsigned_t<_Tp>;
   _CCCL_ASSERT(__t > 0, "ilog2() argument must be strictly positive");
-  auto __log10_approx = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t));
-  _CCCL_ASSUME(__log10_approx <= _CUDA_VSTD::numeric_limits<_Tp>::digits);
-  return __log10_approx;
+  auto __log2_approx = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t));
+  _CCCL_ASSUME(__log2_approx <= _CUDA_VSTD::numeric_limits<_Tp>::digits);
+  return __log2_approx;
+}
+
+_CCCL_TEMPLATE(typename _Tp) // ceil(log2(__t)): ilog2(__t), plus one unless __t has a single bit set
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_cv_integer, _Tp))
+[[nodiscard]] _LIBCUDACXX_HIDE_FROM_ABI constexpr int ceil_ilog2(_Tp __t) noexcept
+{
+  using _Up = _CUDA_VSTD::make_unsigned_t<_Tp>; // has_single_bit is applied to the unsigned value
+  return ::cuda::ilog2(__t) + !_CUDA_VSTD::has_single_bit(static_cast<_Up>(__t)); // ilog2 asserts __t > 0
 }
 
 [[nodiscard]] _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::array<uint32_t, 10> __power_of_10_32bit() noexcept
diff --git a/libcudacxx/test/libcudacxx/cuda/cmath/ilog.pass.cpp b/libcudacxx/test/libcudacxx/cuda/cmath/ilog.pass.cpp
@@ -14,7 +14,6 @@
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
 
-#include "cuda/std/__type_traits/is_constant_evaluated.h"
 #include "test_macros.h"
 
 template <class T>
@@ -24,16 +23,33 @@ __host__ __device__ constexpr void test_log2()
   for (T value = 1; value <= cuda::std::numeric_limits<T>::max() / 2; value *= 2)
   {
     assert(cuda::ilog2(value) == i);
-    if (i >= 1)
+    if (value > 1)
     {
       assert(cuda::ilog2(static_cast<T>(value - 1)) == i - 1);
-      assert(cuda::ilog2(static_cast<T>(value + 1)) == i); // not true if value == 1
+      assert(cuda::ilog2(static_cast<T>(value + 1)) == i);
     }
     i++;
   }
   assert(cuda::ilog2(cuda::std::numeric_limits<T>::max()) == cuda::std::numeric_limits<T>::digits - 1);
 }
 
+template <class T>
+__host__ __device__ constexpr void test_ceil_log2()
+{
+  using limits = cuda::std::numeric_limits<T>;
+  int exponent = 0;
+  for (T pow2 = 1; pow2 <= limits::max() / 2; pow2 *= 2, ++exponent)
+  {
+    // an exact power of two maps to its exponent; one past it rounds up to the next exponent
+    assert(cuda::ceil_ilog2(pow2) == exponent);
+    assert(cuda::ceil_ilog2(static_cast<T>(pow2 + 1)) == exponent + 1);
+    // pow2 - 1 is only checked for pow2 > 2: it would be 0 (invalid) or 1 (whose result is 0, not 1)
+    assert(pow2 <= 2 || cuda::ceil_ilog2(static_cast<T>(pow2 - 1)) == exponent);
+  }
+  // max() equals 2^digits - 1, so its logarithm rounds up to digits
+  assert(cuda::ceil_ilog2(limits::max()) == limits::digits);
+}
+
 template <class T>
 __host__ __device__ constexpr void test_log10()
 {
@@ -62,6 +78,7 @@ template <class T>
 __host__ __device__ constexpr void test()
 {
   test_log2<T>();
+  test_ceil_log2<T>();
   test_log10<T>();
 }