-
Notifications
You must be signed in to change notification settings - Fork 99
Add bitvector data structure #1820
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
a8be380
d9d91dd
60c441c
a0f0c3e
26cde3a
f934c25
a9daa2e
629400c
add25b8
59abaff
4c974d4
94eee10
93f4c38
090969e
3c3a66b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,181 @@ | ||
| // SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors | ||
| // | ||
| // SPDX-License-Identifier: BSD-3-Clause | ||
|
|
||
| #ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_ | ||
| #define GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_ | ||
|
|
||
| #include <thrust/functional.h> | ||
| #include <thrust/iterator/transform_iterator.h> | ||
| #include <thrust/reduce.h> | ||
| #include <thrust/scatter.h> | ||
| #include <thrust/sort.h> | ||
|
|
||
| #include <ginkgo/core/base/intrinsics.hpp> | ||
|
|
||
| #include "common/cuda_hip/base/thrust.hpp" | ||
| #include "common/cuda_hip/components/cooperative_groups.hpp" | ||
| #include "common/cuda_hip/components/thread_ids.hpp" | ||
| #include "core/components/bitvector.hpp" | ||
| #include "core/components/prefix_sum_kernels.hpp" | ||
|
|
||
|
|
||
| namespace gko { | ||
| namespace kernels { | ||
| namespace GKO_DEVICE_NAMESPACE { | ||
| namespace bitvector { | ||
|
|
||
|
|
||
| constexpr auto default_block_size = 512; | ||
|
|
||
|
|
||
| namespace kernel { | ||
|
|
||
|
|
||
| template <typename IndexType, typename DevicePredicate> | ||
| __global__ __launch_bounds__(default_block_size) void from_predicate( | ||
| IndexType size, | ||
| typename device_bitvector<IndexType>::storage_type* __restrict__ bits, | ||
| IndexType* __restrict__ popcounts, DevicePredicate predicate) | ||
| { | ||
| constexpr auto block_size = device_bitvector<IndexType>::block_size; | ||
| static_assert(block_size <= config::warp_size); | ||
| const auto subwarp_id = thread::get_subwarp_id_flat<block_size>(); | ||
| const auto subwarp_base = subwarp_id * block_size; | ||
| if (subwarp_base >= size) { | ||
| return; | ||
| } | ||
| const auto subwarp = | ||
| group::tiled_partition<block_size>(group::this_thread_block()); | ||
| const auto i = static_cast<IndexType>(subwarp_base + subwarp.thread_rank()); | ||
| const auto bit = i < size ? predicate(i) : false; | ||
| const auto mask = subwarp.ballot(bit); | ||
| if (subwarp.thread_rank() == 0) { | ||
| bits[subwarp_id] = mask; | ||
| popcounts[subwarp_id] = gko::detail::popcount(mask); | ||
| } | ||
| } | ||
|
|
||
|
|
||
| } // namespace kernel | ||
|
|
||
|
|
||
| template <typename IndexType, typename DevicePredicate> | ||
| gko::bitvector<IndexType> from_predicate( | ||
| std::shared_ptr<const DefaultExecutor> exec, IndexType size, | ||
| DevicePredicate device_predicate) | ||
| { | ||
| constexpr auto block_size = device_bitvector<IndexType>::block_size; | ||
| const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size)); | ||
| array<uint32> bits{exec, num_blocks}; | ||
upsj marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| array<IndexType> ranks{exec, num_blocks}; | ||
| if (num_blocks > 0) { | ||
| const auto num_threadblocks = | ||
| ceildiv(num_blocks, default_block_size / block_size); | ||
| kernel::from_predicate<<<num_threadblocks, default_block_size, 0, | ||
| exec->get_stream()>>>( | ||
| size, bits.get_data(), ranks.get_data(), device_predicate); | ||
| components::prefix_sum_nonnegative(exec, ranks.get_data(), num_blocks); | ||
| } | ||
|
|
||
| return gko::bitvector<IndexType>{std::move(bits), std::move(ranks), size}; | ||
| } | ||
|
|
||
|
|
||
| template <typename IndexType> | ||
| struct bitvector_bit_functor { | ||
| using storage_type = typename device_bitvector<IndexType>::storage_type; | ||
| __device__ storage_type operator()(IndexType i) | ||
| { | ||
| return device_bitvector<IndexType>::get_block_and_mask(i).second; | ||
| } | ||
| }; | ||
|
|
||
|
|
||
| template <typename IndexType> | ||
| struct bitvector_or_functor { | ||
| using storage_type = typename device_bitvector<IndexType>::storage_type; | ||
| __device__ storage_type operator()(storage_type a, storage_type b) | ||
| { | ||
| // https://github.com/ROCm/rocThrust/issues/352 | ||
| #ifndef GKO_COMPILING_HIP | ||
| // there must not be any duplicate indices | ||
| assert(a ^ b == 0); | ||
| #endif | ||
| return a | b; | ||
| } | ||
| }; | ||
|
|
||
|
|
||
| template <typename IndexType> | ||
| struct bitvector_block_functor { | ||
| // workaround for ROCm 4.5 bug | ||
| using result_type = IndexType; | ||
| constexpr static auto block_size = device_bitvector<IndexType>::block_size; | ||
| __device__ IndexType operator()(IndexType i) | ||
| { | ||
| assert(i >= 0); | ||
| assert(i < size); | ||
| return i / block_size; | ||
| } | ||
|
|
||
| IndexType size; | ||
| }; | ||
|
|
||
|
|
||
| template <typename IndexType> | ||
| struct bitvector_popcnt_functor { | ||
| using storage_type = typename device_bitvector<IndexType>::storage_type; | ||
| __device__ IndexType operator()(storage_type mask) | ||
| { | ||
| return gko::detail::popcount(mask); | ||
| } | ||
| }; | ||
|
|
||
|
|
||
| template <typename IndexIterator> | ||
yhmtsai marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| gko::bitvector<typename std::iterator_traits<IndexIterator>::value_type> | ||
| from_sorted_indices( | ||
| std::shared_ptr<const DefaultExecutor> exec, IndexIterator it, | ||
| typename std::iterator_traits<IndexIterator>::difference_type count, | ||
| typename std::iterator_traits<IndexIterator>::value_type size) | ||
| { | ||
| using index_type = typename std::iterator_traits<IndexIterator>::value_type; | ||
| using storage_type = typename device_bitvector<index_type>::storage_type; | ||
| constexpr auto block_size = device_bitvector<index_type>::block_size; | ||
| const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size)); | ||
| const auto policy = thrust_policy(exec); | ||
| array<storage_type> bits_compact{exec, num_blocks}; | ||
| array<index_type> bits_position{exec, num_blocks}; | ||
| array<storage_type> bits{exec, num_blocks}; | ||
| array<index_type> ranks{exec, num_blocks}; | ||
|
Comment on lines
+149
to
+152
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. compared to the others, it uses double memory allocation. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It has a better worst-case behavior, since it doesn't suffer from atomic collisions. If we implemented our own kernel for it, we could also get rid of the second pair of arrays, but |
||
| const auto block_it = thrust::make_transform_iterator( | ||
| it, bitvector_block_functor<index_type>{size}); | ||
| const auto bit_it = thrust::make_transform_iterator( | ||
| it, bitvector_bit_functor<index_type>{}); | ||
| auto out_pos_it = bits_position.get_data(); | ||
| auto out_bit_it = bits_compact.get_data(); | ||
| auto [out_pos_end, out_bit_end] = thrust::reduce_by_key( | ||
| policy, block_it, block_it + count, bit_it, out_pos_it, out_bit_it, | ||
| thrust::equal_to<index_type>{}, bitvector_or_functor<storage_type>{}); | ||
| assert(thrust::is_sorted(policy, out_pos_it, out_pos_end)); | ||
| const auto out_size = out_pos_end - out_pos_it; | ||
| thrust::fill_n(policy, bits.get_data(), num_blocks, 0); | ||
| thrust::scatter(policy, out_bit_it, out_bit_it + out_size, out_pos_it, | ||
| bits.get_data()); | ||
| const auto rank_it = thrust::make_transform_iterator( | ||
| bits.get_const_data(), bitvector_popcnt_functor<index_type>{}); | ||
| thrust::exclusive_scan(policy, rank_it, rank_it + num_blocks, | ||
| ranks.get_data(), index_type{}); | ||
|
|
||
| return gko::bitvector<index_type>{std::move(bits), std::move(ranks), size}; | ||
| } | ||
|
|
||
|
|
||
| } // namespace bitvector | ||
| } // namespace GKO_DEVICE_NAMESPACE | ||
| } // namespace kernels | ||
| } // namespace gko | ||
|
|
||
|
|
||
| #endif // GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_ | ||
upsj marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is still a blocker for me. I just don't see that we should put kernel headers (that are not used exclusively in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. With a uniform There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My issue is not that it includes non-uniform implementations from different backends, but that I don't really understand how this header is going to be used. If the kernels are only called from kernels implemented in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For tests and kernels implemented in |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| // SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors | ||
| // | ||
| // SPDX-License-Identifier: BSD-3-Clause | ||
|
|
||
| #ifndef GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_ | ||
| #define GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_ | ||
|
|
||
|
|
||
| #include "common/unified/base/kernel_launch.hpp" | ||
|
|
||
|
|
||
| #if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) | ||
| #include "common/cuda_hip/components/bitvector.hpp" | ||
| #elif defined(GKO_COMPILING_OMP) | ||
| #include "omp/components/bitvector.hpp" | ||
| #elif defined(GKO_COMPILING_DPCPP) | ||
| #include "dpcpp/components/bitvector.dp.hpp" | ||
| #else | ||
| #error "This file should only be used inside Ginkgo device compilation" | ||
| #endif | ||
|
|
||
|
|
||
| #endif // GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_ |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,167 @@ | ||||||
| // SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors | ||||||
| // | ||||||
| // SPDX-License-Identifier: BSD-3-Clause | ||||||
|
|
||||||
| #ifndef GKO_CORE_COMPONENTS_BITVECTOR_HPP_ | ||||||
| #define GKO_CORE_COMPONENTS_BITVECTOR_HPP_ | ||||||
|
|
||||||
| #include <ginkgo/core/base/array.hpp> | ||||||
| #include <ginkgo/core/base/intrinsics.hpp> | ||||||
| #include <ginkgo/core/base/math.hpp> | ||||||
| #include <ginkgo/core/base/types.hpp> | ||||||
|
|
||||||
| namespace gko { | ||||||
|
|
||||||
|
|
||||||
| template <typename IndexType> | ||||||
| class device_bitvector { | ||||||
| public: | ||||||
| using index_type = IndexType; | ||||||
| using storage_type = uint32; | ||||||
upsj marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| constexpr static int block_size = CHAR_BIT * sizeof(storage_type); | ||||||
|
|
||||||
| /** | ||||||
| * Returns the block index and bitmask belonging to a specific bit index. | ||||||
| * | ||||||
| * @param i the bit index | ||||||
| * @returns a pair consisting of the block index and bitmask for this bit. | ||||||
upsj marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| */ | ||||||
| constexpr static std::pair<index_type, storage_type> get_block_and_mask( | ||||||
| index_type i) | ||||||
| { | ||||||
| return std::make_pair(i / block_size, | ||||||
| storage_type{1} << (i % block_size)); | ||||||
| } | ||||||
|
|
||||||
| /** | ||||||
| * Constructs a device_bitvector from its underlying data. | ||||||
| * | ||||||
| * @param bits the bitmask array | ||||||
| * @param rank the rank array, it must be the prefix sum over | ||||||
| * `popcount(bits[i])`. | ||||||
| * @param size the number of bits we consider part of this bitvector. | ||||||
| */ | ||||||
| constexpr device_bitvector(const storage_type* bits, | ||||||
| const index_type* ranks, index_type size) | ||||||
| : size_{size}, bits_{bits}, ranks_{ranks} | ||||||
| {} | ||||||
|
|
||||||
| /** Returns the number of bits stored in this bitvector. */ | ||||||
| constexpr index_type size() const { return size_; } | ||||||
upsj marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
|
|
||||||
| /** Returns the number of words (of type storage_type) in this bitvector. */ | ||||||
| constexpr index_type num_blocks() const | ||||||
| { | ||||||
| return (this->size() + block_size - 1) / block_size; | ||||||
| } | ||||||
|
|
||||||
| /** | ||||||
| * Returns whether the bit at the given index is set. | ||||||
| * | ||||||
| * @param i the index in range [0, size()) | ||||||
| * @return true if the bit is set, false otherwise. | ||||||
upsj marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| */ | ||||||
| constexpr bool get(index_type i) const | ||||||
upsj marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| { | ||||||
| assert(i >= 0); | ||||||
| assert(i < size()); | ||||||
| const auto block = i / block_size; | ||||||
| const auto local = i % block_size; | ||||||
| return bool((bits_[block] >> local) & 1); | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This intentional, as with a bool cast, there is less risk of operator associativity issues (I've had many issues with & having lower precedence than !=, so missing parentheses causes issues) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think static_cast makes sense to highlight places where casts play an important role, e.g. casting between different integer types. But here to me this is unnecessarily verbose. |
||||||
| } | ||||||
|
|
||||||
| /** | ||||||
| * Returns the rank of the given index. | ||||||
| * | ||||||
| * @param i the index in range [0, size()) | ||||||
| * @return the rank of the given index, i.e. the number of 1 bits set | ||||||
upsj marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| * before the corresponding bit (exclusive). | ||||||
| */ | ||||||
| constexpr index_type rank(index_type i) const | ||||||
upsj marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
upsj marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| { | ||||||
| assert(i >= 0); | ||||||
| assert(i < size()); | ||||||
| const auto [block, mask] = get_block_and_mask(i); | ||||||
| const auto prefix_mask = mask - 1; | ||||||
| return ranks_[block] + detail::popcount(prefix_mask & bits_[block]); | ||||||
| } | ||||||
|
|
||||||
| /** | ||||||
| * Returns the inclusive rank of the given index. | ||||||
| * | ||||||
| * @param i the index in range [0, size()) | ||||||
| * @return the rank of the given index, i.e. the number of 1 bits set | ||||||
| * up to and including the corresponding bit (inclusive). | ||||||
| */ | ||||||
| constexpr index_type rank_inclusive(index_type i) const | ||||||
| { | ||||||
| assert(i >= 0); | ||||||
| assert(i < size()); | ||||||
| const auto [block, mask] = get_block_and_mask(i); | ||||||
| const auto prefix_mask = mask - 1 | mask; | ||||||
| return ranks_[block] + detail::popcount(prefix_mask & bits_[block]); | ||||||
| } | ||||||
|
|
||||||
| private: | ||||||
| index_type size_; | ||||||
| const storage_type* bits_; | ||||||
| const index_type* ranks_; | ||||||
| }; | ||||||
|
|
||||||
|
|
||||||
| /** | ||||||
| * Bitvector with rank support. It supports bit queries (whether a bit is set) | ||||||
| * and rank queries (how many bits are set before a specific index). | ||||||
| * | ||||||
| * @tparam IndexType the type of indices used in the input and rank array. | ||||||
| */ | ||||||
| template <typename IndexType> | ||||||
| class bitvector { | ||||||
| public: | ||||||
| using index_type = IndexType; | ||||||
| using view_type = device_bitvector<index_type>; | ||||||
| using storage_type = typename view_type::storage_type; | ||||||
| constexpr static int block_size = CHAR_BIT * sizeof(storage_type); | ||||||
|
|
||||||
| static index_type get_num_blocks(index_type size) | ||||||
| { | ||||||
| return (size + block_size - 1) / block_size; | ||||||
| } | ||||||
|
|
||||||
| view_type device_view() const | ||||||
| { | ||||||
| return view_type{this->get_bits(), this->get_ranks(), this->get_size()}; | ||||||
| } | ||||||
|
|
||||||
| std::shared_ptr<const Executor> get_executor() const | ||||||
| { | ||||||
| return bits_.get_executor(); | ||||||
| } | ||||||
|
|
||||||
| const storage_type* get_bits() const { return bits_.get_const_data(); } | ||||||
|
|
||||||
| const index_type* get_ranks() const { return ranks_.get_const_data(); } | ||||||
|
|
||||||
| index_type get_size() const { return size_; } | ||||||
|
|
||||||
| index_type get_num_blocks() const { return get_num_blocks(get_size()); } | ||||||
|
|
||||||
| bitvector(array<storage_type> bits, array<index_type> ranks, | ||||||
| index_type size) | ||||||
| : size_{size}, bits_{std::move(bits)}, ranks_{std::move(ranks)} | ||||||
| { | ||||||
| GKO_ASSERT(bits_.get_executor() == ranks_.get_executor()); | ||||||
| GKO_ASSERT(this->get_num_blocks() == bits_.get_size()); | ||||||
| GKO_ASSERT(this->get_num_blocks() == ranks_.get_size()); | ||||||
| } | ||||||
|
|
||||||
| private: | ||||||
| index_type size_; | ||||||
| array<storage_type> bits_; | ||||||
| array<index_type> ranks_; | ||||||
| }; | ||||||
|
|
||||||
|
|
||||||
| } // namespace gko | ||||||
|
|
||||||
| #endif // GKO_CORE_COMPONENTS_BITVECTOR_HPP_ | ||||||
Uh oh!
There was an error while loading. Please reload this page.