181 changes: 181 additions & 0 deletions common/cuda_hip/components/bitvector.hpp
@@ -0,0 +1,181 @@
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_
#define GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_

#include <thrust/functional.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
#include <thrust/scatter.h>
#include <thrust/sort.h>

#include <ginkgo/core/base/intrinsics.hpp>

#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/components/cooperative_groups.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"
#include "core/components/bitvector.hpp"
#include "core/components/prefix_sum_kernels.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace bitvector {


constexpr auto default_block_size = 512;


namespace kernel {


template <typename IndexType, typename DevicePredicate>
__global__ __launch_bounds__(default_block_size) void from_predicate(
IndexType size,
typename device_bitvector<IndexType>::storage_type* __restrict__ bits,
IndexType* __restrict__ popcounts, DevicePredicate predicate)
{
constexpr auto block_size = device_bitvector<IndexType>::block_size;
static_assert(block_size <= config::warp_size);
const auto subwarp_id = thread::get_subwarp_id_flat<block_size>();
const auto subwarp_base = subwarp_id * block_size;
if (subwarp_base >= size) {
return;
}
const auto subwarp =
group::tiled_partition<block_size>(group::this_thread_block());
const auto i = static_cast<IndexType>(subwarp_base + subwarp.thread_rank());
const auto bit = i < size ? predicate(i) : false;
const auto mask = subwarp.ballot(bit);
if (subwarp.thread_rank() == 0) {
bits[subwarp_id] = mask;
popcounts[subwarp_id] = gko::detail::popcount(mask);
}
}
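
For reference, a sequential sketch of the work one 32-thread tile performs in the kernel above (variable names are reused from the kernel for illustration only; this is not additional PR code):

// Each tile covers the bit positions [subwarp_base, subwarp_base + block_size).
// ballot packs the per-lane predicate results into one storage word, and lane 0
// writes that word together with its popcount.
using storage_type = typename device_bitvector<IndexType>::storage_type;
storage_type mask = 0;
for (int lane = 0; lane < block_size; lane++) {
    const auto i = static_cast<IndexType>(subwarp_base + lane);
    if (i < size && predicate(i)) {
        mask |= storage_type{1} << lane;
    }
}
bits[subwarp_id] = mask;
popcounts[subwarp_id] = gko::detail::popcount(mask);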


} // namespace kernel


template <typename IndexType, typename DevicePredicate>
gko::bitvector<IndexType> from_predicate(
std::shared_ptr<const DefaultExecutor> exec, IndexType size,
DevicePredicate device_predicate)
{
constexpr auto block_size = device_bitvector<IndexType>::block_size;
const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size));
array<uint32> bits{exec, num_blocks};
array<IndexType> ranks{exec, num_blocks};
if (num_blocks > 0) {
const auto num_threadblocks =
ceildiv(num_blocks, default_block_size / block_size);
kernel::from_predicate<<<num_threadblocks, default_block_size, 0,
exec->get_stream()>>>(
size, bits.get_data(), ranks.get_data(), device_predicate);
components::prefix_sum_nonnegative(exec, ranks.get_data(), num_blocks);
}

return gko::bitvector<IndexType>{std::move(bits), std::move(ranks), size};
}
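
A hypothetical call site for illustration (the lambda and sizes are made up and not part of this PR; with CUDA, a __device__ lambda in host code requires extended-lambda support, and the call is assumed to happen where the bitvector namespace is visible):

// Build a bitvector over [0, n) that marks every even index.
const gko::int32 n = 1000;
auto bv = from_predicate(exec, n, [] __device__(gko::int32 i) {
    return i % 2 == 0;
});
// bv.get_size() == 1000; the rank of index i equals the number of even
// indices smaller than i.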


template <typename IndexType>
struct bitvector_bit_functor {
using storage_type = typename device_bitvector<IndexType>::storage_type;
__device__ storage_type operator()(IndexType i)
{
return device_bitvector<IndexType>::get_block_and_mask(i).second;
}
};


template <typename IndexType>
struct bitvector_or_functor {
using storage_type = typename device_bitvector<IndexType>::storage_type;
__device__ storage_type operator()(storage_type a, storage_type b)
{
// https://github.com/ROCm/rocThrust/issues/352
#ifndef GKO_COMPILING_HIP
// there must not be any duplicate indices
assert((a & b) == 0);
#endif
return a | b;
}
};


template <typename IndexType>
struct bitvector_block_functor {
// workaround for ROCm 4.5 bug
using result_type = IndexType;
constexpr static auto block_size = device_bitvector<IndexType>::block_size;
__device__ IndexType operator()(IndexType i)
{
assert(i >= 0);
assert(i < size);
return i / block_size;
}

IndexType size;
};


template <typename IndexType>
struct bitvector_popcnt_functor {
using storage_type = typename device_bitvector<IndexType>::storage_type;
__device__ IndexType operator()(storage_type mask)
{
return gko::detail::popcount(mask);
}
};


template <typename IndexIterator>
gko::bitvector<typename std::iterator_traits<IndexIterator>::value_type>
from_sorted_indices(
std::shared_ptr<const DefaultExecutor> exec, IndexIterator it,
typename std::iterator_traits<IndexIterator>::difference_type count,
typename std::iterator_traits<IndexIterator>::value_type size)
{
using index_type = typename std::iterator_traits<IndexIterator>::value_type;
using storage_type = typename device_bitvector<index_type>::storage_type;
constexpr auto block_size = device_bitvector<index_type>::block_size;
const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size));
const auto policy = thrust_policy(exec);
array<storage_type> bits_compact{exec, num_blocks};
array<index_type> bits_position{exec, num_blocks};
array<storage_type> bits{exec, num_blocks};
array<index_type> ranks{exec, num_blocks};
Comment on lines +149 to +152

Member: Compared to the others, this uses twice the memory allocation. Does it give better performance?

Member Author (@upsj, Apr 28, 2025): It has better worst-case behavior, since it doesn't suffer from atomic collisions. If we implemented our own kernel for it, we could also get rid of the second pair of arrays, but reduce_by_key has an interface that prevents us from scattering the results directly into the output.

const auto block_it = thrust::make_transform_iterator(
it, bitvector_block_functor<index_type>{size});
const auto bit_it = thrust::make_transform_iterator(
it, bitvector_bit_functor<index_type>{});
auto out_pos_it = bits_position.get_data();
auto out_bit_it = bits_compact.get_data();
auto [out_pos_end, out_bit_end] = thrust::reduce_by_key(
policy, block_it, block_it + count, bit_it, out_pos_it, out_bit_it,
thrust::equal_to<index_type>{}, bitvector_or_functor<storage_type>{});
assert(thrust::is_sorted(policy, out_pos_it, out_pos_end));
const auto out_size = out_pos_end - out_pos_it;
thrust::fill_n(policy, bits.get_data(), num_blocks, 0);
thrust::scatter(policy, out_bit_it, out_bit_it + out_size, out_pos_it,
bits.get_data());
const auto rank_it = thrust::make_transform_iterator(
bits.get_const_data(), bitvector_popcnt_functor<index_type>{});
thrust::exclusive_scan(policy, rank_it, rank_it + num_blocks,
ranks.get_data(), index_type{});

return gko::bitvector<index_type>{std::move(bits), std::move(ranks), size};
}
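
For the reduce_by_key discussion above, a host-side reference of the same computation may help; this is a minimal sketch with illustrative names, not part of the PR:

#include <cstdint>
#include <utility>
#include <vector>

// Group sorted indices by their 32-bit block, OR the single-bit masks of each
// group, then exclusive-scan the per-block popcounts into ranks.
std::pair<std::vector<std::uint32_t>, std::vector<int>> reference_bitvector(
    const std::vector<int>& sorted_indices, int size)
{
    constexpr int block_size = 32;
    const int num_blocks = (size + block_size - 1) / block_size;
    std::vector<std::uint32_t> bits(num_blocks, 0u);
    for (const auto i : sorted_indices) {
        bits[i / block_size] |= std::uint32_t{1} << (i % block_size);
    }
    std::vector<int> ranks(num_blocks, 0);
    int partial = 0;
    for (int block = 0; block < num_blocks; block++) {
        ranks[block] = partial;
        partial += __builtin_popcount(bits[block]);  // GCC/Clang builtin
    }
    return {bits, ranks};
}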


} // namespace bitvector
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko


#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_
23 changes: 23 additions & 0 deletions common/unified/components/bitvector.hpp
Member: This is still a blocker for me. I just don't see why we should put kernel headers (that are not used exclusively in common/unified) into common/unified.

Member Author (@upsj): With a uniform transform_iterator implementation, I can also implement #1832 in common/unified. Would that be sufficient justification for putting it there?

Member: My issue is not that it includes non-uniform implementations from different backends, but that I don't really understand how this header is going to be used. If the kernels are only called from kernels implemented in common/unified, then it's fine with me. If this file is going to be included anywhere else, then I think it should be put under core. In any case, I hope I have explained my reservation well enough, and I will not hold up the PR because of it anymore. However, if the header gets included outside common/unified, I will bring it up again.

Member Author (@upsj, Apr 28, 2025): For tests and kernels implemented in unified, I would use this header; for other tests and kernels, I would use the backend-specific one. Your reservations definitely make sense and made me reevaluate how I am using the headers.

@@ -0,0 +1,23 @@
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_
#define GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_


#include "common/unified/base/kernel_launch.hpp"


#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)
#include "common/cuda_hip/components/bitvector.hpp"
#elif defined(GKO_COMPILING_OMP)
#include "omp/components/bitvector.hpp"
#elif defined(GKO_COMPILING_DPCPP)
#include "dpcpp/components/bitvector.dp.hpp"
#else
#error "This file should only be used inside Ginkgo device compilation"
#endif


#endif // GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_
1 change: 1 addition & 0 deletions core/base/array.cpp
@@ -91,6 +91,7 @@ ValueType reduce_add(const array<ValueType>& input_arr,
#define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value)

GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL);
template GKO_DECLARE_ARRAY_FILL(bool);
template GKO_DECLARE_ARRAY_FILL(uint16);
template GKO_DECLARE_ARRAY_FILL(uint32);
#ifndef GKO_SIZE_T_IS_UINT64_T
6 changes: 4 additions & 2 deletions core/components/bit_packed_storage.hpp
@@ -239,11 +239,13 @@ class bit_packed_span {
* @tparam num_bits The number of bits necessary to store a single value in the
* array. Values need to be in the range [0, 2^num_bits).
* @tparam size The number of values to store in the array.
* @tparam StorageType the underlying storage type to use for each individual
* word
*/
-template <int num_bits, int size>
+template <int num_bits, int size, typename StorageType = uint32>
class bit_packed_array {
public:
-    using storage_type = uint32;
+    using storage_type = StorageType;
constexpr static int bits_per_word = sizeof(storage_type) * CHAR_BIT;
constexpr static int bits_per_value = round_up_pow2_constexpr(num_bits);
constexpr static int values_per_word = bits_per_word / bits_per_value;
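
As a quick illustration of the new template parameter, a hypothetical instantiation derived only from the constants shown above (namespace qualification assumed):

using packed = gko::bit_packed_array<3, 10, gko::uint16>;
static_assert(packed::bits_per_word == 16);   // 16-bit storage words
static_assert(packed::bits_per_value == 4);   // 3 bits rounded up to a power of two
static_assert(packed::values_per_word == 4);  // 16 / 4 values per word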
167 changes: 167 additions & 0 deletions core/components/bitvector.hpp
@@ -0,0 +1,167 @@
// SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_CORE_COMPONENTS_BITVECTOR_HPP_
#define GKO_CORE_COMPONENTS_BITVECTOR_HPP_

#include <ginkgo/core/base/array.hpp>
#include <ginkgo/core/base/intrinsics.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>

namespace gko {


template <typename IndexType>
class device_bitvector {
public:
using index_type = IndexType;
using storage_type = uint32;
constexpr static int block_size = CHAR_BIT * sizeof(storage_type);

/**
* Returns the block index and bitmask belonging to a specific bit index.
*
* @param i the bit index
* @returns a pair consisting of the block index and bitmask for this bit.
*/
constexpr static std::pair<index_type, storage_type> get_block_and_mask(
index_type i)
{
return std::make_pair(i / block_size,
storage_type{1} << (i % block_size));
}

/**
* Constructs a device_bitvector from its underlying data.
*
* @param bits the bitmask array
* @param rank the rank array, it must be the prefix sum over
* `popcount(bits[i])`.
* @param size the number of bits we consider part of this bitvector.
*/
constexpr device_bitvector(const storage_type* bits,
const index_type* ranks, index_type size)
: size_{size}, bits_{bits}, ranks_{ranks}
{}

/** Returns the number of bits stored in this bitvector. */
constexpr index_type size() const { return size_; }

/** Returns the number of words (of type storage_type) in this bitvector. */
constexpr index_type num_blocks() const
{
return (this->size() + block_size - 1) / block_size;
}

/**
* Returns whether the bit at the given index is set.
*
* @param i the index in range [0, size())
* @return true if the bit is set, false otherwise.
*/
constexpr bool get(index_type i) const
{
assert(i >= 0);
assert(i < size());
const auto block = i / block_size;
const auto local = i % block_size;
return bool((bits_[block] >> local) & 1);
Member: Suggested change:
-return bool((bits_[block] >> local) & 1);
+return ((bits_[block] >> local) & 1) != 0;

Member Author (@upsj): This is intentional: with a bool cast there is less risk of operator precedence issues (I've had many issues with & binding more loosely than !=, where missing parentheses cause bugs).

Member: Maybe use static_cast<bool>?

Member Author (@upsj): I think static_cast makes sense to highlight places where casts play an important role, e.g. casting between different integer types, but here it is unnecessarily verbose to me.

}
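
To illustrate the precedence pitfall mentioned in the discussion above, a small standalone sketch (not part of the PR):

#include <cassert>
#include <cstdint>

int main()
{
    const std::uint32_t word = 0b10;
    const std::uint32_t mask = 0b10;
    // Without parentheses this parses as word & (mask != 0), i.e. word & 1,
    // which is 0 here, not as the intended (word & mask) != 0.
    const bool unparenthesized = word & mask != 0;
    const bool intended = (word & mask) != 0;
    assert(!unparenthesized);
    assert(intended);
}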

/**
* Returns the rank of the given index.
*
* @param i the index in range [0, size())
* @return the rank of the given index, i.e. the number of 1 bits set
* before the corresponding bit (exclusive).
*/
constexpr index_type rank(index_type i) const
{
assert(i >= 0);
assert(i < size());
const auto [block, mask] = get_block_and_mask(i);
const auto prefix_mask = mask - 1;
return ranks_[block] + detail::popcount(prefix_mask & bits_[block]);
}

/**
* Returns the inclusive rank of the given index.
*
* @param i the index in range [0, size())
* @return the rank of the given index, i.e. the number of 1 bits set
* up to and including the corresponding bit (inclusive).
*/
constexpr index_type rank_inclusive(index_type i) const
{
assert(i >= 0);
assert(i < size());
const auto [block, mask] = get_block_and_mask(i);
const auto prefix_mask = (mask - 1) | mask;
return ranks_[block] + detail::popcount(prefix_mask & bits_[block]);
}

private:
index_type size_;
const storage_type* bits_;
const index_type* ranks_;
};
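
A small worked example of the query semantics (hypothetical host-side usage, assuming gko::detail::popcount from ginkgo/core/base/intrinsics.hpp is host-callable):

// Bits 0, 2, 3 and 5 of the single storage block are set; the size is 6.
const gko::uint32 word = 0b101101;
const gko::int32 rank0 = 0;  // exclusive prefix sum over popcounts (one block)
const gko::device_bitvector<gko::int32> bv{&word, &rank0, 6};
assert(bv.get(3));                  // bit 3 is set
assert(bv.rank(3) == 2);            // bits 0 and 2 lie before index 3
assert(bv.rank_inclusive(3) == 3);  // bits 0, 2 and 3 up to and including 3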


/**
* Bitvector with rank support. It supports bit queries (whether a bit is set)
* and rank queries (how many bits are set before a specific index).
*
* @tparam IndexType the type of indices used in the input and rank array.
*/
template <typename IndexType>
class bitvector {
public:
using index_type = IndexType;
using view_type = device_bitvector<index_type>;
using storage_type = typename view_type::storage_type;
constexpr static int block_size = CHAR_BIT * sizeof(storage_type);

static index_type get_num_blocks(index_type size)
{
return (size + block_size - 1) / block_size;
}

view_type device_view() const
{
return view_type{this->get_bits(), this->get_ranks(), this->get_size()};
}

std::shared_ptr<const Executor> get_executor() const
{
return bits_.get_executor();
}

const storage_type* get_bits() const { return bits_.get_const_data(); }

const index_type* get_ranks() const { return ranks_.get_const_data(); }

index_type get_size() const { return size_; }

index_type get_num_blocks() const { return get_num_blocks(get_size()); }

bitvector(array<storage_type> bits, array<index_type> ranks,
index_type size)
: size_{size}, bits_{std::move(bits)}, ranks_{std::move(ranks)}
{
GKO_ASSERT(bits_.get_executor() == ranks_.get_executor());
GKO_ASSERT(this->get_num_blocks() == bits_.get_size());
GKO_ASSERT(this->get_num_blocks() == ranks_.get_size());
}

private:
index_type size_;
array<storage_type> bits_;
array<index_type> ranks_;
};
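
A hypothetical construction for illustration (the values are made up; this assumes the usual initializer-list constructor of gko::array):

// 40 bits need ceil(40 / 32) = 2 storage blocks; bits 1 and 35 are set.
auto exec = gko::ReferenceExecutor::create();
gko::array<gko::uint32> bit_words{exec, {0b10u, 0b1000u}};
gko::array<gko::int32> block_ranks{exec, {0, 1}};  // exclusive popcount prefix sum
gko::bitvector<gko::int32> bv{std::move(bit_words), std::move(block_ranks), 40};
auto view = bv.device_view();  // non-owning view usable inside kernels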


} // namespace gko

#endif // GKO_CORE_COMPONENTS_BITVECTOR_HPP_