Skip to content
182 changes: 182 additions & 0 deletions common/cuda_hip/components/bitvector.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_
#define GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_

#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <thrust/sort.h>

#include <ginkgo/core/base/intrinsics.hpp>

#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/components/cooperative_groups.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"
#include "core/components/bitvector.hpp"
#include "core/components/prefix_sum_kernels.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace bitvector {


constexpr auto default_block_size = 512;


namespace kernel {


/**
 * Kernel that fills the storage words and per-word popcounts of a bitvector
 * from a device-side predicate.
 *
 * Each subwarp of `device_bitvector<IndexType>::block_size` threads produces
 * one storage word: every lane evaluates the predicate for one index, the
 * lanes' results are combined into a bit mask via a subwarp ballot, and
 * lane 0 writes both the mask and its popcount.
 *
 * Launch with `default_block_size` threads per block; threads whose subwarp
 * lies entirely past `size` exit early, and lanes past `size` within the
 * last subwarp contribute a 0 bit.
 *
 * @param size       number of indices covered by the bitvector
 * @param bits       output array of storage words (one per subwarp)
 * @param popcounts  output array of per-word set-bit counts (one per subwarp)
 * @param predicate  device functor mapping an IndexType index to bool
 */
template <typename IndexType, typename DevicePredicate>
__global__ __launch_bounds__(default_block_size) void from_predicate(
    IndexType size,
    typename device_bitvector<IndexType>::storage_type* __restrict__ bits,
    IndexType* __restrict__ popcounts, DevicePredicate predicate)
{
    constexpr auto block_size = device_bitvector<IndexType>::block_size;
    // the ballot below requires that a storage word fits into a single warp
    static_assert(block_size <= config::warp_size);
    const auto subwarp_id = thread::get_subwarp_id_flat<block_size>();
    const auto subwarp_base = subwarp_id * block_size;
    if (subwarp_base >= size) {
        return;
    }
    const auto subwarp =
        group::tiled_partition<block_size>(group::this_thread_block());
    const auto i = static_cast<IndexType>(subwarp_base + subwarp.thread_rank());
    // out-of-range lanes in the last subwarp must not set their bit
    const auto bit = i < size ? predicate(i) : false;
    const auto mask = subwarp.ballot(bit);
    // a single lane writes the combined word and its popcount
    if (subwarp.thread_rank() == 0) {
        bits[subwarp_id] = mask;
        popcounts[subwarp_id] = gko::detail::popcount(mask);
    }
}


} // namespace kernel


/**
 * Builds a gko::bitvector by evaluating a device predicate for every index
 * in `[0, size)`.
 *
 * Launches kernel::from_predicate to fill the storage words and per-word
 * popcounts, then turns the popcounts into ranks via an exclusive prefix sum.
 *
 * @param exec              the executor to run on
 * @param size              number of indices covered by the bitvector
 * @param device_predicate  device functor mapping an IndexType index to bool
 *
 * @return the assembled bitvector (empty arrays when size is 0)
 */
template <typename IndexType, typename DevicePredicate>
gko::bitvector<IndexType> from_predicate(
    std::shared_ptr<const DefaultExecutor> exec, IndexType size,
    DevicePredicate device_predicate)
{
    using storage_type = typename device_bitvector<IndexType>::storage_type;
    constexpr auto bits_per_word = device_bitvector<IndexType>::block_size;
    const auto num_words = static_cast<size_type>(ceildiv(size, bits_per_word));
    array<storage_type> bit_words{exec, num_words};
    array<IndexType> word_ranks{exec, num_words};
    if (num_words > 0) {
        // each threadblock handles several storage words at once
        constexpr auto words_per_threadblock =
            default_block_size / bits_per_word;
        const auto grid_dim = ceildiv(num_words, words_per_threadblock);
        kernel::from_predicate<<<grid_dim, default_block_size, 0,
                                 exec->get_stream()>>>(
            size, bit_words.get_data(), word_ranks.get_data(),
            device_predicate);
        // in-place exclusive prefix sum turns popcounts into ranks
        components::prefix_sum_nonnegative(exec, word_ranks.get_data(),
                                           num_words);
    }

    return gko::bitvector<IndexType>{std::move(bit_words),
                                     std::move(word_ranks), size};
}


template <typename IndexType>
struct bitvector_bit_functor {
    using storage_type = typename device_bitvector<IndexType>::storage_type;

    // Maps an index to the single-bit mask marking its position inside the
    // storage word it belongs to.
    constexpr storage_type operator()(IndexType idx) const
    {
        const auto block_and_mask =
            device_bitvector<IndexType>::get_block_and_mask(idx);
        return block_and_mask.second;
    }
};


template <typename IndexType>
struct bitvector_or_functor {
    using storage_type = typename device_bitvector<IndexType>::storage_type;

    // Combines the bit masks of two index sets belonging to the same storage
    // word. Since the input indices are unique, the masks must be disjoint.
    constexpr storage_type operator()(storage_type a, storage_type b) const
    {
        // https://github.com/ROCm/rocThrust/issues/352
#ifndef GKO_COMPILING_HIP
        // there must not be any duplicate indices, i.e. the masks are
        // disjoint. Note: the previous `assert(a ^ b == 0)` parsed as
        // `a ^ (b == 0)` due to operator precedence (== binds tighter
        // than ^) and did not check this invariant.
        assert((a & b) == 0);
#endif
        return a | b;
    }
};


template <typename IndexType>
struct bitvector_block_functor {
    // explicit result_type as a workaround for a ROCm 4.5 bug
    using result_type = IndexType;
    constexpr static auto block_size = device_bitvector<IndexType>::block_size;

    // Maps an index to the storage word (block) containing it.
    constexpr IndexType operator()(IndexType idx) const
    {
        // the index must lie inside the bitvector's range
        assert(idx >= 0);
        assert(idx < size);
        return idx / block_size;
    }

    IndexType size;
};


template <typename IndexType>
struct bitvector_popcnt_functor {
    using storage_type = typename device_bitvector<IndexType>::storage_type;

    // Counts the set bits in a storage word, i.e. how many indices the
    // corresponding bitvector block contains.
    constexpr IndexType operator()(storage_type mask) const
    {
        const auto num_set_bits = gko::detail::popcount(mask);
        return num_set_bits;
    }
};


/**
 * Builds a gko::bitvector from a sorted range of unique indices.
 *
 * The indices are grouped by the storage word they fall into
 * (thrust::reduce_by_key), the single-bit masks inside each group are OR-ed
 * together, the resulting compact words are scattered into a zero-initialized
 * dense array, and the ranks are computed as an exclusive prefix sum over the
 * per-word popcounts.
 *
 * NOTE(review): compared to the atomic-based variants this uses a second pair
 * of temporary arrays (`bits_compact`/`bits_position`), trading memory for
 * better worst-case behavior without atomic collisions.
 *
 * @param exec   the executor to run on
 * @param it     iterator to the sorted, duplicate-free device indices
 * @param count  number of indices in the range
 * @param size   number of indices covered by the bitvector; all input indices
 *               must lie in [0, size)
 *
 * @return the assembled bitvector
 */
template <typename IndexIterator>
gko::bitvector<typename std::iterator_traits<IndexIterator>::value_type>
from_sorted_indices(
    std::shared_ptr<const DefaultExecutor> exec, IndexIterator it,
    typename std::iterator_traits<IndexIterator>::difference_type count,
    typename std::iterator_traits<IndexIterator>::value_type size)
{
    using index_type = typename std::iterator_traits<IndexIterator>::value_type;
    using storage_type = typename device_bitvector<index_type>::storage_type;
    constexpr auto block_size = device_bitvector<index_type>::block_size;
    const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size));
    const auto policy = thrust_policy(exec);
    // compact intermediate output: words/positions of non-empty blocks only
    array<storage_type> bits_compact{exec, num_blocks};
    array<index_type> bits_position{exec, num_blocks};
    array<storage_type> bits{exec, num_blocks};
    array<index_type> ranks{exec, num_blocks};
    const auto block_it = thrust::make_transform_iterator(
        it, bitvector_block_functor<index_type>{size});
    const auto bit_it = thrust::make_transform_iterator(
        it, bitvector_bit_functor<index_type>{});
    auto out_pos_it = bits_position.get_data();
    auto out_bit_it = bits_compact.get_data();
    // OR together the single-bit masks of all indices in the same block
    auto [out_pos_end, out_bit_end] = thrust::reduce_by_key(
        policy, block_it, block_it + count, bit_it, out_pos_it, out_bit_it,
        thrust::equal_to<index_type>{}, bitvector_or_functor<storage_type>{});
    // sorted input implies sorted block positions
    assert(thrust::is_sorted(policy, out_pos_it, out_pos_end));
    const auto out_size = out_pos_end - out_pos_it;
    // scatter non-empty words to their final positions, empty words stay 0
    thrust::fill_n(policy, bits.get_data(), num_blocks, 0);
    thrust::scatter(policy, out_bit_it, out_bit_it + out_size, out_pos_it,
                    bits.get_data());
    // ranks = exclusive prefix sum over the per-word popcounts
    const auto rank_it = thrust::make_transform_iterator(
        bits.get_const_data(), bitvector_popcnt_functor<index_type>{});
    thrust::exclusive_scan(policy, rank_it, rank_it + num_blocks,
                           ranks.get_data(), index_type{});

    return gko::bitvector<index_type>{std::move(bits), std::move(ranks), size};
}


} // namespace bitvector
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko


#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_
23 changes: 23 additions & 0 deletions common/unified/components/bitvector.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_
#define GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_


#include "common/unified/base/kernel_launch.hpp"


#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)
#include "common/cuda_hip/components/bitvector.hpp"
#elif defined(GKO_COMPILING_OMP)
#include "omp/components/bitvector.hpp"
#elif defined(GKO_COMPILING_DPCPP)
#include "dpcpp/components/bitvector.dp.hpp"
#else
#error "This file should only be used inside Ginkgo device compilation"
#endif


#endif // GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_
1 change: 1 addition & 0 deletions core/base/array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ ValueType reduce_add(const array<ValueType>& input_arr,
#define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value)

GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL);
template GKO_DECLARE_ARRAY_FILL(bool);
template GKO_DECLARE_ARRAY_FILL(uint16);
template GKO_DECLARE_ARRAY_FILL(uint32);
#ifndef GKO_SIZE_T_IS_UINT64_T
Expand Down
6 changes: 4 additions & 2 deletions core/components/bit_packed_storage.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,11 +239,13 @@ class bit_packed_span {
* @tparam num_bits The number of bits necessary to store a single value in the
* array. Values need to be in the range [0, 2^num_bits).
* @tparam size The number of values to store in the array.
* @tparam StorageType the underlying storage type to use for each individual
word
*/
template <int num_bits, int size>
template <int num_bits, int size, typename StorageType = uint32>
class bit_packed_array {
public:
using storage_type = uint32;
using storage_type = StorageType;
constexpr static int bits_per_word = sizeof(storage_type) * CHAR_BIT;
constexpr static int bits_per_value = round_up_pow2_constexpr(num_bits);
constexpr static int values_per_word = bits_per_word / bits_per_value;
Expand Down
Loading
Loading