diff --git a/common/cuda_hip/components/bitvector.hpp b/common/cuda_hip/components/bitvector.hpp new file mode 100644 index 00000000000..1b29eaad62a --- /dev/null +++ b/common/cuda_hip/components/bitvector.hpp @@ -0,0 +1,182 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_ + +#include +#include +#include +#include +#include + +#include + +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/bitvector.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace bitvector { + + +constexpr auto default_block_size = 512; + + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void from_predicate( + IndexType size, + typename device_bitvector::storage_type* __restrict__ bits, + IndexType* __restrict__ popcounts, DevicePredicate predicate) +{ + constexpr auto block_size = device_bitvector::block_size; + static_assert(block_size <= config::warp_size); + const auto subwarp_id = thread::get_subwarp_id_flat(); + const auto subwarp_base = subwarp_id * block_size; + if (subwarp_base >= size) { + return; + } + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + const auto i = static_cast(subwarp_base + subwarp.thread_rank()); + const auto bit = i < size ? predicate(i) : false; + const auto mask = subwarp.ballot(bit); + if (subwarp.thread_rank() == 0) { + bits[subwarp_id] = mask; + popcounts[subwarp_id] = gko::detail::popcount(mask); + } +} + + +} // namespace kernel + + +template +gko::bitvector from_predicate( + std::shared_ptr exec, IndexType size, + DevicePredicate device_predicate) +{ + using storage_type = typename device_bitvector::storage_type; + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bits{exec, num_blocks}; + array ranks{exec, num_blocks}; + if (num_blocks > 0) { + const auto num_threadblocks = + ceildiv(num_blocks, default_block_size / block_size); + kernel::from_predicate<<get_stream()>>>( + size, bits.get_data(), ranks.get_data(), device_predicate); + components::prefix_sum_nonnegative(exec, ranks.get_data(), num_blocks); + } + + return gko::bitvector{std::move(bits), std::move(ranks), size}; +} + + +template +struct bitvector_bit_functor { + using storage_type = typename device_bitvector::storage_type; + constexpr storage_type operator()(IndexType i) const + { + return device_bitvector::get_block_and_mask(i).second; + } +}; + + +template +struct bitvector_or_functor { + using storage_type = typename device_bitvector::storage_type; + constexpr storage_type operator()(storage_type a, storage_type b) const + { + // https://github.com/ROCm/rocThrust/issues/352 +#ifndef GKO_COMPILING_HIP + // there must not be any duplicate indices + assert(a ^ b == 0); +#endif + return a | b; + } +}; + + +template +struct bitvector_block_functor { + // workaround for ROCm 4.5 bug + using result_type = IndexType; + constexpr static auto block_size = device_bitvector::block_size; + constexpr IndexType operator()(IndexType i) const + { + assert(i >= 0); + assert(i < size); + return i / block_size; + } + + IndexType size; +}; + + +template +struct bitvector_popcnt_functor { + using storage_type = typename device_bitvector::storage_type; + constexpr IndexType operator()(storage_type mask) const + { + return gko::detail::popcount(mask); + } +}; + + +template +gko::bitvector::value_type> +from_sorted_indices( + std::shared_ptr exec, IndexIterator it, + typename std::iterator_traits::difference_type count, + typename std::iterator_traits::value_type size) +{ + using index_type = typename std::iterator_traits::value_type; + using storage_type = typename device_bitvector::storage_type; + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + const auto policy = thrust_policy(exec); + array bits_compact{exec, num_blocks}; + array bits_position{exec, num_blocks}; + array bits{exec, num_blocks}; + array ranks{exec, num_blocks}; + const auto block_it = thrust::make_transform_iterator( + it, bitvector_block_functor{size}); + const auto bit_it = thrust::make_transform_iterator( + it, bitvector_bit_functor{}); + auto out_pos_it = bits_position.get_data(); + auto out_bit_it = bits_compact.get_data(); + auto [out_pos_end, out_bit_end] = thrust::reduce_by_key( + policy, block_it, block_it + count, bit_it, out_pos_it, out_bit_it, + thrust::equal_to{}, bitvector_or_functor{}); + assert(thrust::is_sorted(policy, out_pos_it, out_pos_end)); + const auto out_size = out_pos_end - out_pos_it; + thrust::fill_n(policy, bits.get_data(), num_blocks, 0); + thrust::scatter(policy, out_bit_it, out_bit_it + out_size, out_pos_it, + bits.get_data()); + const auto rank_it = thrust::make_transform_iterator( + bits.get_const_data(), bitvector_popcnt_functor{}); + thrust::exclusive_scan(policy, rank_it, rank_it + num_blocks, + ranks.get_data(), index_type{}); + + return gko::bitvector{std::move(bits), std::move(ranks), size}; +} + + +} // namespace bitvector +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_ diff --git a/common/unified/components/bitvector.hpp b/common/unified/components/bitvector.hpp new file mode 100644 index 00000000000..ce289c44dce --- /dev/null +++ b/common/unified/components/bitvector.hpp @@ -0,0 +1,23 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_ +#define GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_ + + +#include "common/unified/base/kernel_launch.hpp" + + +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) +#include "common/cuda_hip/components/bitvector.hpp" +#elif defined(GKO_COMPILING_OMP) +#include "omp/components/bitvector.hpp" +#elif defined(GKO_COMPILING_DPCPP) +#include "dpcpp/components/bitvector.dp.hpp" +#else +#error "This file should only be used inside Ginkgo device compilation" +#endif + + +#endif // GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_ diff --git a/core/base/array.cpp b/core/base/array.cpp index 51fa4b34bb1..da1accc1ca1 100644 --- a/core/base/array.cpp +++ b/core/base/array.cpp @@ -91,6 +91,7 @@ ValueType reduce_add(const array& input_arr, #define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value) GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL); +template GKO_DECLARE_ARRAY_FILL(bool); template GKO_DECLARE_ARRAY_FILL(uint16); template GKO_DECLARE_ARRAY_FILL(uint32); #ifndef GKO_SIZE_T_IS_UINT64_T diff --git a/core/components/bit_packed_storage.hpp b/core/components/bit_packed_storage.hpp index 78777c1d2d2..2654b7ecb2a 100644 --- a/core/components/bit_packed_storage.hpp +++ b/core/components/bit_packed_storage.hpp @@ -239,11 +239,13 @@ class bit_packed_span { * @tparam num_bits The number of bits necessary to store a single value in the * array. Values need to be in the range [0, 2^num_bits). * @tparam size The number of values to store in the array. + * @tparam StorageType the underlying storage type to use for each individual + word */ -template +template class bit_packed_array { public: - using storage_type = uint32; + using storage_type = StorageType; constexpr static int bits_per_word = sizeof(storage_type) * CHAR_BIT; constexpr static int bits_per_value = round_up_pow2_constexpr(num_bits); constexpr static int values_per_word = bits_per_word / bits_per_value; diff --git a/core/components/bitvector.hpp b/core/components/bitvector.hpp new file mode 100644 index 00000000000..13b5e47e295 --- /dev/null +++ b/core/components/bitvector.hpp @@ -0,0 +1,171 @@ +// SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CORE_COMPONENTS_BITVECTOR_HPP_ +#define GKO_CORE_COMPONENTS_BITVECTOR_HPP_ + +#include +#include +#include +#include + +namespace gko { + + +template +class device_bitvector { +public: + using index_type = IndexType; + using storage_type = uint32; + constexpr static int block_size = CHAR_BIT * sizeof(storage_type); + + /** + * Returns the block index and bitmask belonging to a specific bit index. + * + * @param i the bit index + * + * @returns a pair consisting of the block index and bitmask for this bit. + */ + constexpr static std::pair get_block_and_mask( + index_type i) + { + return std::make_pair(i / block_size, + storage_type{1} << (i % block_size)); + } + + /** + * Constructs a device_bitvector from its underlying data. + * + * @param bits the bitmask array + * @param rank the rank array, it must be the prefix sum over + * `popcount(bits[i])`. + * @param size the number of bits we consider part of this bitvector. + */ + constexpr device_bitvector(const storage_type* bits, + const index_type* ranks, index_type size) + : size_{size}, bits_{bits}, ranks_{ranks} + {} + + /** Returns the number of bits stored in this bitvector. */ + constexpr index_type get_size() const { return size_; } + + /** Returns the number of words (of type storage_type) in this bitvector. */ + constexpr index_type get_num_blocks() const + { + return (this->get_size() + block_size - 1) / block_size; + } + + /** + * Returns whether the bit at the given index is set. + * + * @param i the index in range [0, size()) + * + * @return true if the bit is set, false otherwise. + */ + constexpr bool operator[](index_type i) const + { + assert(i >= 0); + assert(i < get_size()); + const auto block = i / block_size; + const auto local = i % block_size; + return bool((bits_[block] >> local) & 1); + } + + /** + * Returns the rank of the given index. + * + * @param i the index in range [0, size()) + * + * @return the rank of the given index, i.e. the number of 1 bits set + * before the corresponding bit (exclusive). + */ + constexpr index_type get_rank(index_type i) const + { + assert(i >= 0); + assert(i < get_size()); + const auto [block, mask] = get_block_and_mask(i); + const auto prefix_mask = mask - 1; + return ranks_[block] + detail::popcount(prefix_mask & bits_[block]); + } + + /** + * Returns the inclusive rank of the given index. + * + * @param i the index in range [0, size()) + * + * @return the rank of the given index, i.e. the number of 1 bits set + * up to and including the corresponding bit (inclusive). + */ + constexpr index_type get_rank_inclusive(index_type i) const + { + assert(i >= 0); + assert(i < get_size()); + const auto [block, mask] = get_block_and_mask(i); + const auto prefix_mask = mask - 1 | mask; + return ranks_[block] + detail::popcount(prefix_mask & bits_[block]); + } + +private: + index_type size_; + const storage_type* bits_; + const index_type* ranks_; +}; + + +/** + * Bitvector with rank support. It supports bit queries (whether a bit is set) + * and rank queries (how many bits are set before a specific index). + * + * @tparam IndexType the type of indices used in the input and rank array. + */ +template +class bitvector { +public: + using index_type = IndexType; + using view_type = device_bitvector; + using storage_type = typename view_type::storage_type; + constexpr static int block_size = CHAR_BIT * sizeof(storage_type); + + static index_type get_num_blocks(index_type size) + { + return (size + block_size - 1) / block_size; + } + + view_type device_view() const + { + return view_type{this->get_bits(), this->get_ranks(), this->get_size()}; + } + + std::shared_ptr get_executor() const + { + return bits_.get_executor(); + } + + const storage_type* get_bits() const { return bits_.get_const_data(); } + + const index_type* get_ranks() const { return ranks_.get_const_data(); } + + index_type get_size() const { return size_; } + + index_type get_num_blocks() const { return get_num_blocks(get_size()); } + + bitvector(array bits, array ranks, + index_type size) + : size_{size}, bits_{std::move(bits)}, ranks_{std::move(ranks)} + { + GKO_ASSERT(bits_.get_executor() == ranks_.get_executor()); + GKO_ASSERT(this->get_num_blocks() == bits_.get_size()); + GKO_ASSERT(this->get_num_blocks() == ranks_.get_size()); + } + +private: + index_type size_; + array bits_; + array ranks_; +}; + + +} // namespace gko + +#endif // GKO_CORE_COMPONENTS_BITVECTOR_HPP_ diff --git a/core/components/range_minimum_query.cpp b/core/components/range_minimum_query.cpp index 07cb69fcdff..970a2ef739e 100644 --- a/core/components/range_minimum_query.cpp +++ b/core/components/range_minimum_query.cpp @@ -68,10 +68,10 @@ range_minimum_query::get() const } -#define GKO_DEFINE_DEVICE_RANGE_MINIMUM_QUERY(IndexType) \ +#define GKO_DEFINE_RANGE_MINIMUM_QUERY(IndexType) \ class range_minimum_query -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DEFINE_DEVICE_RANGE_MINIMUM_QUERY); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DEFINE_RANGE_MINIMUM_QUERY); } // namespace gko diff --git a/dpcpp/components/bitvector.dp.hpp b/dpcpp/components/bitvector.dp.hpp new file mode 100644 index 00000000000..2c329941eaa --- /dev/null +++ b/dpcpp/components/bitvector.dp.hpp @@ -0,0 +1,115 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_DPCPP_COMPONENTS_BITVECTOR_DP_HPP_ +#define GKO_DPCPP_COMPONENTS_BITVECTOR_DP_HPP_ + + +#include + +#include + +#include "core/components/bitvector.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace bitvector { + + +template +gko::bitvector from_predicate( + std::shared_ptr exec, IndexType size, + DevicePredicate device_predicate) +{ + using storage_type = typename device_bitvector::storage_type; + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bit_array{exec, num_blocks}; + array rank_array{exec, num_blocks}; + const auto bits = bit_array.get_data(); + const auto ranks = rank_array.get_data(); + const auto queue = exec->get_queue(); + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_blocks, [=](sycl::id<1> block_i) { + const auto base_i = static_cast(block_i) * block_size; + storage_type mask{}; + const auto local_op = [&](int local_i) { + const storage_type bit = + device_predicate(base_i + local_i) ? 1 : 0; + mask |= bit << local_i; + }; + if (base_i + block_size <= size) { +#pragma unroll + for (int local_i = 0; local_i < block_size; local_i++) { + local_op(local_i); + } + } else { + for (int local_i = 0; base_i + local_i < size; local_i++) { + local_op(local_i); + } + } + bits[block_i] = mask; + ranks[block_i] = gko::detail::popcount(mask); + }); + }); + components::prefix_sum_nonnegative(exec, ranks, num_blocks); + + return gko::bitvector{std::move(bit_array), + std::move(rank_array), size}; +} + + +template +gko::bitvector::value_type> +from_sorted_indices( + std::shared_ptr exec, IndexIterator it, + typename std::iterator_traits::difference_type count, + typename std::iterator_traits::value_type size) +{ + using index_type = typename std::iterator_traits::value_type; + using storage_type = typename device_bitvector::storage_type; + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bit_array{exec, num_blocks}; + array rank_array{exec, num_blocks}; + components::fill_array(exec, bit_array.get_data(), num_blocks, + storage_type{}); + const auto bits = bit_array.get_data(); + const auto ranks = rank_array.get_data(); + const auto queue = exec->get_queue(); + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(count, [=](sycl::id<1> i) { + auto value = it[i]; + const auto [block, mask] = + device_bitvector::get_block_and_mask(value); + sycl::atomic_ref + atomic(bits[block]); + atomic.fetch_or(mask); + }); + }); + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_blocks, [=](sycl::id<1> i) { + ranks[i] = gko::detail::popcount(bits[i]); + }); + }); + components::prefix_sum_nonnegative(exec, ranks, num_blocks); + + return gko::bitvector{std::move(bit_array), + std::move(rank_array), size}; +} + + +} // namespace bitvector +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_COMPONENTS_BITVECTOR_DP_HPP_ diff --git a/omp/components/bitvector.hpp b/omp/components/bitvector.hpp new file mode 100644 index 00000000000..9b87a990b3c --- /dev/null +++ b/omp/components/bitvector.hpp @@ -0,0 +1,125 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include + +#include "core/base/index_range.hpp" +#include "core/components/bitvector.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +namespace bitvector { + + +template +gko::bitvector from_predicate( + std::shared_ptr exec, IndexType size, + DevicePredicate device_predicate) +{ + using storage_type = typename device_bitvector::storage_type; + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bit_array{exec, num_blocks}; + array rank_array{exec, num_blocks}; + const auto bits = bit_array.get_data(); + const auto ranks = rank_array.get_data(); +#pragma omp parallel for + for (IndexType block_i = 0; block_i < num_blocks; block_i++) { + const auto base_i = block_i * block_size; + storage_type mask{}; + const auto local_op = [&](int local_i) { + const storage_type bit = device_predicate(base_i + local_i) ? 1 : 0; + mask |= bit << local_i; + }; + if (base_i + block_size <= size) { +#pragma unroll + for (int local_i = 0; local_i < block_size; local_i++) { + local_op(local_i); + } + } else { + for (int local_i = 0; base_i + local_i < size; local_i++) { + local_op(local_i); + } + } + bits[block_i] = mask; + ranks[block_i] = gko::detail::popcount(mask); + } + components::prefix_sum_nonnegative(exec, ranks, num_blocks); + + return gko::bitvector{std::move(bit_array), + std::move(rank_array), size}; +} + + +template +gko::bitvector::value_type> +from_sorted_indices( + std::shared_ptr exec, IndexIterator it, + typename std::iterator_traits::difference_type count, + typename std::iterator_traits::value_type size) +{ + using index_type = typename std::iterator_traits::value_type; + using bv = device_bitvector; + using storage_type = typename bv::storage_type; + constexpr auto block_size = bv::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bit_array{exec, num_blocks}; + array rank_array{exec, num_blocks}; + const auto bits = bit_array.get_data(); + const auto ranks = rank_array.get_data(); + components::fill_array(exec, bits, num_blocks, storage_type{}); + const auto num_threads = omp_get_max_threads(); + const auto work_per_thread = ceildiv(count, num_threads); + assert(std::is_sorted(it, it + count)); +#pragma omp parallel num_threads(num_threads) + { + const auto tid = omp_get_thread_num(); + const auto begin = std::min(tid * work_per_thread, count); + const auto end = std::min(begin + work_per_thread, count); + if (begin < end) { + const auto first_block = it[begin] / block_size; + const auto last_block = it[end - 1] / block_size; + storage_type mask{0}; + auto block = first_block; + for (auto i : irange{begin, end}) { + const auto value = it[i]; + const auto new_block = value / block_size; + const auto local = value % block_size; + if (new_block != block) { + assert(new_block > block); + if (block == first_block) { +#pragma omp atomic + bits[block] |= mask; + } else { + bits[block] = mask; + } + mask = 0; + block = new_block; + } + mask |= storage_type{1} << local; + } +#pragma omp atomic + bits[last_block] |= mask; + } + } +#pragma omp parallel for + for (size_type i = 0; i < num_blocks; i++) { + ranks[i] = gko::detail::popcount(bits[i]); + } + components::prefix_sum_nonnegative(exec, ranks, num_blocks); + return gko::bitvector{std::move(bit_array), + std::move(rank_array), size}; +} + + +} // namespace bitvector +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/reference/components/bitvector.hpp b/reference/components/bitvector.hpp new file mode 100644 index 00000000000..d35c2b2766b --- /dev/null +++ b/reference/components/bitvector.hpp @@ -0,0 +1,82 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_REFERENCE_COMPONENTS_BITVECTOR_HPP_ +#define GKO_REFERENCE_COMPONENTS_BITVECTOR_HPP_ + +#include "core/base/index_range.hpp" +#include "core/components/bitvector.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +namespace bitvector { + + +template +gko::bitvector from_predicate( + std::shared_ptr exec, IndexType size, + DevicePredicate device_predicate) +{ + using storage_type = typename device_bitvector::storage_type; + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bits{exec, num_blocks}; + array ranks{exec, num_blocks}; + std::fill_n(bits.get_data(), num_blocks, 0); + std::fill_n(ranks.get_data(), num_blocks, 0); + for (auto i : irange{size}) { + if (device_predicate(i)) { + bits.get_data()[i / block_size] |= storage_type{1} + << (i % block_size); + ranks.get_data()[i / block_size]++; + } + } + components::prefix_sum_nonnegative(exec, ranks.get_data(), num_blocks); + + return gko::bitvector{std::move(bits), std::move(ranks), size}; +} + + +template +gko::bitvector::value_type> +from_sorted_indices( + std::shared_ptr exec, IndexIterator it, + typename std::iterator_traits::difference_type count, + typename std::iterator_traits::value_type size) +{ + using index_type = typename std::iterator_traits::value_type; + using storage_type = typename device_bitvector::storage_type; + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bits{exec, num_blocks}; + array ranks{exec, num_blocks}; + std::fill_n(bits.get_data(), num_blocks, 0); + assert(std::is_sorted(it, it + count)); + for (auto i : irange{count}) { + const auto value = it[i]; + const auto [block, mask] = + device_bitvector::get_block_and_mask(value); + assert((bits.get_data()[block] & mask) == 0); + bits.get_data()[block] |= mask; + } + index_type rank{0}; + for (auto i : irange{static_cast(num_blocks)}) { + ranks.get_data()[i] = rank; + rank += gko::detail::popcount(bits.get_const_data()[i]); + } + + return gko::bitvector{std::move(bits), std::move(ranks), size}; +} + + +} // namespace bitvector +} // namespace reference +} // namespace kernels +} // namespace gko + + +#endif // GKO_REFERENCE_COMPONENTS_BITVECTOR_HPP_ diff --git a/reference/test/components/CMakeLists.txt b/reference/test/components/CMakeLists.txt index b17880ab32d..47998d184df 100644 --- a/reference/test/components/CMakeLists.txt +++ b/reference/test/components/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_test(absolute_array_kernels) +ginkgo_create_test(bitvector) ginkgo_create_test(fill_array_kernels) ginkgo_create_test(format_conversion_kernels) ginkgo_create_test(precision_conversion_kernels) diff --git a/reference/test/components/bitvector.cpp b/reference/test/components/bitvector.cpp new file mode 100644 index 00000000000..8c5b428bb39 --- /dev/null +++ b/reference/test/components/bitvector.cpp @@ -0,0 +1,102 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/bitvector.hpp" + +#include +#include +#include + +#include + +#include "core/base/index_range.hpp" +#include "core/components/bitvector.hpp" +#include "core/test/utils.hpp" +#include "reference/components/bitvector.hpp" + + +template +class Bitvector : public ::testing::Test { +protected: + using index_type = IndexType; + using device_type = gko::bitvector; + using storage_type = typename device_type::storage_type; + constexpr static auto block_size = device_type::block_size; + Bitvector() + : ref{gko::ReferenceExecutor::create()}, rng{67593}, sizes{0, 1, + 2, 16, + 31, 32, + 33, 40, + 63, 64, + 65, 127, + 128, 129, + 1000, 1024, + 2000} + {} + + std::vector create_random_values(index_type num_values, + index_type size) + { + assert(num_values <= size); + std::vector values(size); + std::iota(values.begin(), values.end(), index_type{}); + std::shuffle(values.begin(), values.end(), rng); + values.resize(num_values); + std::sort(values.begin(), values.end()); + return values; + } + + std::shared_ptr ref; + std::default_random_engine rng; + std::vector sizes; +}; + +TYPED_TEST_SUITE(Bitvector, gko::test::IndexTypes, TypenameNameGenerator); + + +TYPED_TEST(Bitvector, ComputeBitsAndRanks) +{ + using index_type = typename TestFixture::index_type; + using storage_type = typename TestFixture::storage_type; + constexpr auto block_size = TestFixture::block_size; + for (auto size : this->sizes) { + SCOPED_TRACE(size); + for (auto num_values : + {index_type{0}, size / 10, size / 4, size / 2, size}) { + SCOPED_TRACE(num_values); + auto values = this->create_random_values(num_values, size); + num_values = values.size(); + auto num_blocks = gko::ceildiv(size, block_size); + + auto bv = gko::kernels::reference::bitvector::from_sorted_indices( + this->ref, values.data(), num_values, size); + auto dbv = bv.device_view(); + + // check bits and ranks are correct + ASSERT_EQ(bv.get_size(), size); + ASSERT_EQ(dbv.get_size(), size); + ASSERT_EQ(bv.get_num_blocks(), num_blocks); + ASSERT_EQ(dbv.get_num_blocks(), num_blocks); + auto it = values.begin(); + index_type rank{}; + for (auto i : gko::irange{size}) { + const auto block = i / block_size; + const auto local = i % block_size; + ASSERT_EQ(dbv.get_rank(i), rank); + if (it != values.end() && *it == i) { + ASSERT_TRUE(bool(bv.get_bits()[block] & + (storage_type{1} << local))); + ASSERT_TRUE(dbv[i]); + ++rank; + ++it; + } else { + ASSERT_FALSE(bool(bv.get_bits()[block] & + (storage_type{1} << local))); + ASSERT_FALSE(dbv[i]); + } + ASSERT_EQ(dbv.get_rank_inclusive(i), rank); + } + } + } +} diff --git a/test/components/CMakeLists.txt b/test/components/CMakeLists.txt index 12e738d8eaa..2793f3af575 100644 --- a/test/components/CMakeLists.txt +++ b/test/components/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_common_test(absolute_array_kernels) +ginkgo_create_common_device_test(bitvector) ginkgo_create_common_test(fill_array_kernels) ginkgo_create_common_test(format_conversion_kernels) ginkgo_create_common_test(precision_conversion_kernels) diff --git a/test/components/bitvector.cpp b/test/components/bitvector.cpp new file mode 100644 index 00000000000..dfed105e6ec --- /dev/null +++ b/test/components/bitvector.cpp @@ -0,0 +1,207 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +/*@GKO_PREPROCESSOR_FILENAME_HELPER@*/ + +#include "core/components/bitvector.hpp" + +#include +#include +#include +#include + +#include + +#include +#include + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/components/bitvector.hpp" +#include "core/base/index_range.hpp" +#include "core/test/utils.hpp" +#include "reference/components/bitvector.hpp" +#include "test/utils/common_fixture.hpp" + + +// workaround for cudafe 11.0 bug +using gko::irange; + + +template +class Bitvector : public CommonTestFixture { +protected: + using index_type = T; + using bitvector = gko::bitvector; + using device_bitvector = gko::device_bitvector; + using storage_type = typename bitvector::storage_type; + constexpr static auto block_size = bitvector::block_size; + + Bitvector() + : rng{67193}, sizes{0, 1, 2, 16, 31, 32, 33, + 40, 63, 64, 65, 127, 128, 129, + 1000, 1024, 2000, 10000, 100000} + {} + + gko::array create_random_values(index_type num_values, + index_type size) + { + assert(num_values <= size); + std::vector values(size); + std::iota(values.begin(), values.end(), index_type{}); + std::shuffle(values.begin(), values.end(), rng); + values.resize(num_values); + std::sort(values.begin(), values.end()); + return gko::array{this->ref, values.begin(), values.end()}; + } + + void assert_bitvector_equal(const bitvector& bv, const bitvector& dbv) + { + ASSERT_EQ(bv.get_size(), dbv.get_size()); + const auto num_blocks = + static_cast(bv.get_num_blocks()); + const auto bits = gko::make_const_array_view(bv.get_executor(), + num_blocks, bv.get_bits()); + const auto dbits = gko::make_const_array_view( + dbv.get_executor(), num_blocks, dbv.get_bits()); + const auto ranks = gko::make_const_array_view( + bv.get_executor(), num_blocks, bv.get_ranks()); + const auto dranks = gko::make_const_array_view( + dbv.get_executor(), num_blocks, dbv.get_ranks()); + GKO_ASSERT_ARRAY_EQ(bits, dbits); + GKO_ASSERT_ARRAY_EQ(ranks, dranks); + } + + std::default_random_engine rng; + std::vector sizes; +}; + +TYPED_TEST_SUITE(Bitvector, gko::test::IndexTypes, TypenameNameGenerator); + + +TYPED_TEST(Bitvector, BuildFromIndicesIsEquivalentToRef) +{ + using index_type = typename TestFixture::index_type; + using bitvector = typename TestFixture::bitvector; + for (auto size : this->sizes) { + SCOPED_TRACE(size); + for (auto num_values : + {index_type{}, size / 10, size / 4, size / 2, size}) { + SCOPED_TRACE(num_values); + auto values = this->create_random_values(num_values, size); + gko::array dvalues{this->exec, values}; + + auto bv = gko::kernels::reference::bitvector::from_sorted_indices( + this->ref, values.get_data(), values.get_size(), size); + auto dbv = gko::kernels::GKO_DEVICE_NAMESPACE::bitvector:: + from_sorted_indices(this->exec, dvalues.get_data(), + dvalues.get_size(), size); + + this->assert_bitvector_equal(bv, dbv); + } + } +} + + +// nvcc doesn't like device lambdas inside class member functions +template +std::pair, gko::bitvector> run_predicate( + std::shared_ptr ref, + std::shared_ptr exec, IndexType size, int stride) +{ + return std::make_pair( + gko::kernels::reference::bitvector::from_predicate( + ref, size, [stride](int i) { return i % stride == 0; }), + gko::kernels::GKO_DEVICE_NAMESPACE::bitvector::from_predicate( + exec, size, + [stride] GKO_KERNEL(int i) { return i % stride == 0; })); +} + + +TYPED_TEST(Bitvector, BuildFromPredicateIsEquivalentToFromIndices) +{ + using index_type = typename TestFixture::index_type; + using bitvector = typename TestFixture::bitvector; + for (auto size : this->sizes) { + SCOPED_TRACE(size); + for (auto stride : {1, 2, 3, 4, 5, 65}) { + SCOPED_TRACE(stride); + std::vector indices; + for (index_type i = 0; i < size; i += stride) { + indices.push_back(i); + } + gko::array values{this->ref, indices.begin(), + indices.end()}; + + auto [bv, dbv] = run_predicate(this->ref, this->exec, size, stride); + + auto ref_bv = + gko::kernels::reference::bitvector::from_sorted_indices( + this->ref, values.get_data(), values.get_size(), size); + this->assert_bitvector_equal(bv, dbv); + this->assert_bitvector_equal(ref_bv, dbv); + } + } +} + + +// nvcc doesn't like device lambdas inside class member functions +template +void run_device(std::shared_ptr exec, + const gko::device_bitvector bv, + const gko::device_bitvector dbv, + gko::array& output_bools, + gko::array& output_ranks, + gko::array& doutput_bools, + gko::array& doutput_ranks) +{ + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( + exec, + [] GKO_KERNEL(auto i, auto bv, auto output_bool, auto output_rank) { + output_bool[i] = bv[i]; + output_rank[i] = bv.get_rank(i); + }, + dbv.get_size(), dbv, doutput_bools, doutput_ranks); + for (auto i : gko::irange{bv.get_size()}) { + output_bools.get_data()[i] = bv[i]; + output_ranks.get_data()[i] = bv.get_rank(i); + } +} + + +TYPED_TEST(Bitvector, AccessIsEquivalentToRef) +{ + using index_type = typename TestFixture::index_type; + using storage_type = typename TestFixture::storage_type; + constexpr auto block_size = TestFixture::block_size; + for (auto size : this->sizes) { + SCOPED_TRACE(size); + for (auto num_values : + {index_type{}, size / 10, size / 4, size / 2, size}) { + SCOPED_TRACE(num_values); + auto values = this->create_random_values(num_values, size); + num_values = values.get_size(); + gko::array dvalues{this->exec, values}; + + auto bv = gko::kernels::reference::bitvector::from_sorted_indices( + this->ref, values.get_const_data(), values.get_size(), size); + auto dbv = gko::kernels::GKO_DEVICE_NAMESPACE::bitvector:: + from_sorted_indices(this->exec, dvalues.get_const_data(), + dvalues.get_size(), size); + + const auto usize = static_cast(size); + gko::array output_bools{this->ref, usize}; + gko::array output_ranks{this->ref, usize}; + gko::array doutput_bools{this->exec, usize}; + gko::array doutput_ranks{this->exec, usize}; + doutput_bools.fill(true); + doutput_ranks.fill(-1); + run_device(this->exec, bv.device_view(), dbv.device_view(), + output_bools, output_ranks, doutput_bools, + doutput_ranks); + + GKO_ASSERT_ARRAY_EQ(doutput_bools, output_bools); + GKO_ASSERT_ARRAY_EQ(doutput_ranks, output_ranks); + } + } +}