181 changes: 181 additions & 0 deletions common/cuda_hip/components/bitvector.hpp
@@ -0,0 +1,181 @@
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_
#define GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_

#include <thrust/functional.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
#include <thrust/scatter.h>
#include <thrust/sort.h>

#include <ginkgo/core/base/intrinsics.hpp>

#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/components/cooperative_groups.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"
#include "core/components/bitvector.hpp"
#include "core/components/prefix_sum_kernels.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace bitvector {


constexpr auto default_block_size = 512;


namespace kernel {


template <typename IndexType, typename DevicePredicate>
__global__ __launch_bounds__(default_block_size) void from_predicate(
IndexType size,
typename device_bitvector<IndexType>::storage_type* __restrict__ bits,
IndexType* __restrict__ popcounts, DevicePredicate predicate)
{
constexpr auto block_size = device_bitvector<IndexType>::block_size;
static_assert(block_size <= config::warp_size);
const auto subwarp_id = thread::get_subwarp_id_flat<block_size>();
const auto subwarp_base = subwarp_id * block_size;
if (subwarp_base >= size) {
return;
}
const auto subwarp =
group::tiled_partition<block_size>(group::this_thread_block());
const auto i = static_cast<IndexType>(subwarp_base + subwarp.thread_rank());
const auto bit = i < size ? predicate(i) : false;
const auto mask = subwarp.ballot(bit);
if (subwarp.thread_rank() == 0) {
bits[subwarp_id] = mask;
popcounts[subwarp_id] = gko::detail::popcount(mask);
}
}
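
For reference, a sequential sketch of the work one 32-thread tile performs in the kernel above (variable names are reused from the kernel for illustration only; this is not additional PR code):

// Each tile covers the bit positions [subwarp_base, subwarp_base + block_size).
// ballot packs the per-lane predicate results into one storage word, and lane 0
// writes that word together with its popcount.
using storage_type = typename device_bitvector<IndexType>::storage_type;
storage_type mask = 0;
for (int lane = 0; lane < block_size; lane++) {
    const auto i = static_cast<IndexType>(subwarp_base + lane);
    if (i < size && predicate(i)) {
        mask |= storage_type{1} << lane;
    }
}
bits[subwarp_id] = mask;
popcounts[subwarp_id] = gko::detail::popcount(mask);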


} // namespace kernel


template <typename IndexType, typename DevicePredicate>
gko::bitvector<IndexType> from_predicate(
std::shared_ptr<const DefaultExecutor> exec, IndexType size,
DevicePredicate device_predicate)
{
constexpr auto block_size = device_bitvector<IndexType>::block_size;
const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size));
array<uint32> bits{exec, num_blocks};
array<IndexType> ranks{exec, num_blocks};
if (num_blocks > 0) {
const auto num_threadblocks =
ceildiv(num_blocks, default_block_size / block_size);
kernel::from_predicate<<<num_threadblocks, default_block_size, 0,
exec->get_stream()>>>(
size, bits.get_data(), ranks.get_data(), device_predicate);
components::prefix_sum_nonnegative(exec, ranks.get_data(), num_blocks);
}

return gko::bitvector<IndexType>{std::move(bits), std::move(ranks), size};
}
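
A hypothetical call site for illustration (the lambda and sizes are made up and not part of this PR; with CUDA, a __device__ lambda in host code requires extended-lambda support, and the call is assumed to happen where the bitvector namespace is visible):

// Build a bitvector over [0, n) that marks every even index.
const gko::int32 n = 1000;
auto bv = from_predicate(exec, n, [] __device__(gko::int32 i) {
    return i % 2 == 0;
});
// bv.get_size() == 1000; the rank of index i equals the number of even
// indices smaller than i.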


template <typename IndexType>
struct bitvector_bit_functor {
using storage_type = typename device_bitvector<IndexType>::storage_type;
__device__ storage_type operator()(IndexType i)
{
return device_bitvector<IndexType>::get_block_and_mask(i).second;
}
};


template <typename IndexType>
struct bitvector_or_functor {
using storage_type = typename device_bitvector<IndexType>::storage_type;
__device__ storage_type operator()(storage_type a, storage_type b)
{
// https://github.com/ROCm/rocThrust/issues/352
#ifndef GKO_COMPILING_HIP
// there must not be any duplicate indices
assert((a & b) == 0);
#endif
return a | b;
}
};


template <typename IndexType>
struct bitvector_block_functor {
// workaround for ROCm 4.5 bug
using result_type = IndexType;
constexpr static auto block_size = device_bitvector<IndexType>::block_size;
__device__ IndexType operator()(IndexType i)
{
assert(i >= 0);
assert(i < size);
return i / block_size;
}

IndexType size;
};


template <typename IndexType>
struct bitvector_popcnt_functor {
using storage_type = typename device_bitvector<IndexType>::storage_type;
__device__ IndexType operator()(storage_type mask)
{
return gko::detail::popcount(mask);
}
};


template <typename IndexIterator>
gko::bitvector<typename std::iterator_traits<IndexIterator>::value_type>
from_sorted_indices(
std::shared_ptr<const DefaultExecutor> exec, IndexIterator it,
typename std::iterator_traits<IndexIterator>::difference_type count,
typename std::iterator_traits<IndexIterator>::value_type size)
{
using index_type = typename std::iterator_traits<IndexIterator>::value_type;
using storage_type = typename device_bitvector<index_type>::storage_type;
constexpr auto block_size = device_bitvector<index_type>::block_size;
const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size));
const auto policy = thrust_policy(exec);
array<storage_type> bits_compact{exec, num_blocks};
array<index_type> bits_position{exec, num_blocks};
array<storage_type> bits{exec, num_blocks};
array<index_type> ranks{exec, num_blocks};
Comment on lines +149 to +152

Member: Compared to the others, this uses twice the memory allocation. Does it give better performance?

Member Author (@upsj, Apr 28, 2025): It has better worst-case behavior, since it doesn't suffer from atomic collisions. If we implemented our own kernel for it, we could also get rid of the second pair of arrays, but reduce_by_key has an interface that prevents us from scattering the results directly into the output.

const auto block_it = thrust::make_transform_iterator(
it, bitvector_block_functor<index_type>{size});
const auto bit_it = thrust::make_transform_iterator(
it, bitvector_bit_functor<index_type>{});
auto out_pos_it = bits_position.get_data();
auto out_bit_it = bits_compact.get_data();
auto [out_pos_end, out_bit_end] = thrust::reduce_by_key(
policy, block_it, block_it + count, bit_it, out_pos_it, out_bit_it,
thrust::equal_to<index_type>{}, bitvector_or_functor<storage_type>{});
assert(thrust::is_sorted(policy, out_pos_it, out_pos_end));
const auto out_size = out_pos_end - out_pos_it;
thrust::fill_n(policy, bits.get_data(), num_blocks, 0);
thrust::scatter(policy, out_bit_it, out_bit_it + out_size, out_pos_it,
bits.get_data());
const auto rank_it = thrust::make_transform_iterator(
bits.get_const_data(), bitvector_popcnt_functor<index_type>{});
thrust::exclusive_scan(policy, rank_it, rank_it + num_blocks,
ranks.get_data(), index_type{});

return gko::bitvector<index_type>{std::move(bits), std::move(ranks), size};
}
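
For the reduce_by_key discussion above, a host-side reference of the same computation may help; this is a minimal sketch with illustrative names, not part of the PR:

#include <cstdint>
#include <utility>
#include <vector>

// Group sorted indices by their 32-bit block, OR the single-bit masks of each
// group, then exclusive-scan the per-block popcounts into ranks.
std::pair<std::vector<std::uint32_t>, std::vector<int>> reference_bitvector(
    const std::vector<int>& sorted_indices, int size)
{
    constexpr int block_size = 32;
    const int num_blocks = (size + block_size - 1) / block_size;
    std::vector<std::uint32_t> bits(num_blocks, 0u);
    for (const auto i : sorted_indices) {
        bits[i / block_size] |= std::uint32_t{1} << (i % block_size);
    }
    std::vector<int> ranks(num_blocks, 0);
    int partial = 0;
    for (int block = 0; block < num_blocks; block++) {
        ranks[block] = partial;
        partial += __builtin_popcount(bits[block]);  // GCC/Clang builtin
    }
    return {bits, ranks};
}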


} // namespace bitvector
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko


#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_
23 changes: 23 additions & 0 deletions common/unified/components/bitvector.hpp
Member: This is still a blocker for me. I just don't see why we should put kernel headers (that are not used exclusively in common/unified) into common/unified.

Member Author (@upsj): With a uniform transform_iterator implementation, I can also implement #1832 in common/unified. Would that be sufficient justification for putting it there?

Member: My issue is not that it includes non-uniform implementations from different backends, but that I don't really understand how this header is going to be used. If the kernels are only called from kernels implemented in common/unified, then it's fine with me. If this file is going to be included anywhere else, then I think it should be put under core. In any case, I hope I have explained my reservation well enough, and I will not hold up the PR because of it anymore. However, if the header gets included outside common/unified, I will bring it up again.

Member Author (@upsj, Apr 28, 2025): For tests and kernels implemented in unified, I would use this header; for other tests and kernels, I would use the backend-specific one. Your reservations definitely make sense and made me reevaluate how I am using the headers.

@@ -0,0 +1,23 @@
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_
#define GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_


#include "common/unified/base/kernel_launch.hpp"


#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)
#include "common/cuda_hip/components/bitvector.hpp"
#elif defined(GKO_COMPILING_OMP)
#include "omp/components/bitvector.hpp"
#elif defined(GKO_COMPILING_DPCPP)
#include "dpcpp/components/bitvector.dp.hpp"
#else
#error "This file should only be used inside Ginkgo device compilation"
#endif


#endif // GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_
1 change: 1 addition & 0 deletions core/base/array.cpp
@@ -91,6 +91,7 @@ ValueType reduce_add(const array<ValueType>& input_arr,
#define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value)

GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL);
template GKO_DECLARE_ARRAY_FILL(bool);
template GKO_DECLARE_ARRAY_FILL(uint16);
template GKO_DECLARE_ARRAY_FILL(uint32);
#ifndef GKO_SIZE_T_IS_UINT64_T
6 changes: 4 additions & 2 deletions core/components/bit_packed_storage.hpp
@@ -239,11 +239,13 @@ class bit_packed_span {
* @tparam num_bits The number of bits necessary to store a single value in the
* array. Values need to be in the range [0, 2^num_bits).
* @tparam size The number of values to store in the array.
* @tparam StorageType the underlying storage type to use for each individual
* word
*/
-template <int num_bits, int size>
+template <int num_bits, int size, typename StorageType = uint32>
class bit_packed_array {
public:
-    using storage_type = uint32;
+    using storage_type = StorageType;
constexpr static int bits_per_word = sizeof(storage_type) * CHAR_BIT;
constexpr static int bits_per_value = round_up_pow2_constexpr(num_bits);
constexpr static int values_per_word = bits_per_word / bits_per_value;
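
As a quick illustration of the new template parameter, a hypothetical instantiation derived only from the constants shown above (namespace qualification assumed):

using packed = gko::bit_packed_array<3, 10, gko::uint16>;
static_assert(packed::bits_per_word == 16);   // 16-bit storage words
static_assert(packed::bits_per_value == 4);   // 3 bits rounded up to a power of two
static_assert(packed::values_per_word == 4);  // 16 / 4 values per word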
167 changes: 167 additions & 0 deletions core/components/bitvector.hpp
@@ -0,0 +1,167 @@
// SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_CORE_COMPONENTS_BITVECTOR_HPP_
#define GKO_CORE_COMPONENTS_BITVECTOR_HPP_

#include <ginkgo/core/base/array.hpp>
#include <ginkgo/core/base/intrinsics.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>

namespace gko {


template <typename IndexType>
class device_bitvector {
public:
using index_type = IndexType;
using storage_type = uint32;
constexpr static int block_size = CHAR_BIT * sizeof(storage_type);

/**
* Returns the block index and bitmask belonging to a specific bit index.
*
* @param i the bit index
* @returns a pair consisting of the block index and bitmask for this bit.
*/
constexpr static std::pair<index_type, storage_type> get_block_and_mask(
index_type i)
{
return std::make_pair(i / block_size,
storage_type{1} << (i % block_size));
}

/**
* Constructs a device_bitvector from its underlying data.
*
* @param bits the bitmask array
* @param rank the rank array, it must be the prefix sum over
* `popcount(bits[i])`.
* @param size the number of bits we consider part of this bitvector.
*/
constexpr device_bitvector(const storage_type* bits,
const index_type* ranks, index_type size)
: size_{size}, bits_{bits}, ranks_{ranks}
{}

/** Returns the number of bits stored in this bitvector. */
constexpr index_type size() const { return size_; }

/** Returns the number of words (of type storage_type) in this bitvector. */
constexpr index_type num_blocks() const
{
return (this->size() + block_size - 1) / block_size;
}

/**
* Returns whether the bit at the given index is set.
*
* @param i the index in range [0, size())
* @return true if the bit is set, false otherwise.
*/
constexpr bool get(index_type i) const
{
assert(i >= 0);
assert(i < size());
const auto block = i / block_size;
const auto local = i % block_size;
return bool((bits_[block] >> local) & 1);
Member: Suggested change:
-return bool((bits_[block] >> local) & 1);
+return ((bits_[block] >> local) & 1) != 0;

Member Author (@upsj): This is intentional: with a bool cast there is less risk of operator precedence issues (I've had many issues with & binding more loosely than !=, where missing parentheses cause bugs).

Member: Maybe use static_cast<bool>?

Member Author (@upsj): I think static_cast makes sense to highlight places where casts play an important role, e.g. casting between different integer types, but here it is unnecessarily verbose to me.

}
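
To illustrate the precedence pitfall mentioned in the discussion above, a small standalone sketch (not part of the PR):

#include <cassert>
#include <cstdint>

int main()
{
    const std::uint32_t word = 0b10;
    const std::uint32_t mask = 0b10;
    // Without parentheses this parses as word & (mask != 0), i.e. word & 1,
    // which is 0 here, not as the intended (word & mask) != 0.
    const bool unparenthesized = word & mask != 0;
    const bool intended = (word & mask) != 0;
    assert(!unparenthesized);
    assert(intended);
}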

/**
* Returns the rank of the given index.
*
* @param i the index in range [0, size())
* @return the rank of the given index, i.e. the number of 1 bits set
* before the corresponding bit (exclusive).
*/
constexpr index_type rank(index_type i) const
{
assert(i >= 0);
assert(i < size());
const auto [block, mask] = get_block_and_mask(i);
const auto prefix_mask = mask - 1;
return ranks_[block] + detail::popcount(prefix_mask & bits_[block]);
}

/**
* Returns the inclusive rank of the given index.
*
* @param i the index in range [0, size())
* @return the rank of the given index, i.e. the number of 1 bits set
* up to and including the corresponding bit (inclusive).
*/
constexpr index_type rank_inclusive(index_type i) const
{
assert(i >= 0);
assert(i < size());
const auto [block, mask] = get_block_and_mask(i);
const auto prefix_mask = (mask - 1) | mask;
return ranks_[block] + detail::popcount(prefix_mask & bits_[block]);
}

private:
index_type size_;
const storage_type* bits_;
const index_type* ranks_;
};
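
A small worked example of the query semantics (hypothetical host-side usage, assuming gko::detail::popcount from ginkgo/core/base/intrinsics.hpp is host-callable):

// Bits 0, 2, 3 and 5 of the single storage block are set; the size is 6.
const gko::uint32 word = 0b101101;
const gko::int32 rank0 = 0;  // exclusive prefix sum over popcounts (one block)
const gko::device_bitvector<gko::int32> bv{&word, &rank0, 6};
assert(bv.get(3));                  // bit 3 is set
assert(bv.rank(3) == 2);            // bits 0 and 2 lie before index 3
assert(bv.rank_inclusive(3) == 3);  // bits 0, 2 and 3 up to and including 3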


/**
* Bitvector with rank support. It supports bit queries (whether a bit is set)
* and rank queries (how many bits are set before a specific index).
*
* @tparam IndexType the type of indices used in the input and rank array.
*/
template <typename IndexType>
class bitvector {
public:
using index_type = IndexType;
using view_type = device_bitvector<index_type>;
using storage_type = typename view_type::storage_type;
constexpr static int block_size = CHAR_BIT * sizeof(storage_type);

static index_type get_num_blocks(index_type size)
{
return (size + block_size - 1) / block_size;
}

view_type device_view() const
{
return view_type{this->get_bits(), this->get_ranks(), this->get_size()};
}

std::shared_ptr<const Executor> get_executor() const
{
return bits_.get_executor();
}

const storage_type* get_bits() const { return bits_.get_const_data(); }

const index_type* get_ranks() const { return ranks_.get_const_data(); }

index_type get_size() const { return size_; }

index_type get_num_blocks() const { return get_num_blocks(get_size()); }

bitvector(array<storage_type> bits, array<index_type> ranks,
index_type size)
: size_{size}, bits_{std::move(bits)}, ranks_{std::move(ranks)}
{
GKO_ASSERT(bits_.get_executor() == ranks_.get_executor());
GKO_ASSERT(this->get_num_blocks() == bits_.get_size());
GKO_ASSERT(this->get_num_blocks() == ranks_.get_size());
}

private:
index_type size_;
array<storage_type> bits_;
array<index_type> ranks_;
};
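
A hypothetical construction for illustration (the values are made up; this assumes the usual initializer-list constructor of gko::array):

// 40 bits need ceil(40 / 32) = 2 storage blocks; bits 1 and 35 are set.
auto exec = gko::ReferenceExecutor::create();
gko::array<gko::uint32> bit_words{exec, {0b10u, 0b1000u}};
gko::array<gko::int32> block_ranks{exec, {0, 1}};  // exclusive popcount prefix sum
gko::bitvector<gko::int32> bv{std::move(bit_words), std::move(block_ranks), 40};
auto view = bv.device_view();  // non-owning view usable inside kernels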


} // namespace gko

#endif // GKO_CORE_COMPONENTS_BITVECTOR_HPP_