-
Notifications
You must be signed in to change notification settings - Fork 99
Add bitvector data structure #1820
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a8be380
d9d91dd
60c441c
a0f0c3e
26cde3a
f934c25
a9daa2e
629400c
add25b8
59abaff
4c974d4
94eee10
93f4c38
090969e
3c3a66b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors | ||
// | ||
// SPDX-License-Identifier: BSD-3-Clause | ||
|
||
#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_ | ||
#define GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_ | ||
|
||
#include <thrust/functional.h> | ||
#include <thrust/iterator/transform_iterator.h> | ||
#include <thrust/reduce.h> | ||
#include <thrust/scatter.h> | ||
#include <thrust/sort.h> | ||
|
||
#include <ginkgo/core/base/intrinsics.hpp> | ||
|
||
#include "common/cuda_hip/base/thrust.hpp" | ||
#include "common/cuda_hip/components/cooperative_groups.hpp" | ||
#include "common/cuda_hip/components/thread_ids.hpp" | ||
#include "core/components/bitvector.hpp" | ||
#include "core/components/prefix_sum_kernels.hpp" | ||
|
||
|
||
namespace gko { | ||
namespace kernels { | ||
namespace GKO_DEVICE_NAMESPACE { | ||
namespace bitvector { | ||
|
||
|
||
constexpr auto default_block_size = 512; | ||
|
||
|
||
namespace kernel { | ||
|
||
|
||
template <typename IndexType, typename DevicePredicate> | ||
__global__ __launch_bounds__(default_block_size) void from_predicate( | ||
IndexType size, | ||
typename device_bitvector<IndexType>::storage_type* __restrict__ bits, | ||
IndexType* __restrict__ popcounts, DevicePredicate predicate) | ||
{ | ||
constexpr auto block_size = device_bitvector<IndexType>::block_size; | ||
static_assert(block_size <= config::warp_size); | ||
const auto subwarp_id = thread::get_subwarp_id_flat<block_size>(); | ||
const auto subwarp_base = subwarp_id * block_size; | ||
if (subwarp_base >= size) { | ||
return; | ||
} | ||
const auto subwarp = | ||
group::tiled_partition<block_size>(group::this_thread_block()); | ||
const auto i = static_cast<IndexType>(subwarp_base + subwarp.thread_rank()); | ||
const auto bit = i < size ? predicate(i) : false; | ||
const auto mask = subwarp.ballot(bit); | ||
if (subwarp.thread_rank() == 0) { | ||
bits[subwarp_id] = mask; | ||
popcounts[subwarp_id] = gko::detail::popcount(mask); | ||
} | ||
} | ||
|
||
|
||
} // namespace kernel | ||
|
||
|
||
template <typename IndexType, typename DevicePredicate> | ||
gko::bitvector<IndexType> from_predicate( | ||
std::shared_ptr<const DefaultExecutor> exec, IndexType size, | ||
DevicePredicate device_predicate) | ||
{ | ||
using storage_type = typename device_bitvector<IndexType>::storage_type; | ||
constexpr auto block_size = device_bitvector<IndexType>::block_size; | ||
const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size)); | ||
array<storage_type> bits{exec, num_blocks}; | ||
array<IndexType> ranks{exec, num_blocks}; | ||
if (num_blocks > 0) { | ||
const auto num_threadblocks = | ||
ceildiv(num_blocks, default_block_size / block_size); | ||
kernel::from_predicate<<<num_threadblocks, default_block_size, 0, | ||
exec->get_stream()>>>( | ||
size, bits.get_data(), ranks.get_data(), device_predicate); | ||
components::prefix_sum_nonnegative(exec, ranks.get_data(), num_blocks); | ||
} | ||
|
||
return gko::bitvector<IndexType>{std::move(bits), std::move(ranks), size}; | ||
} | ||
|
||
|
||
template <typename IndexType> | ||
struct bitvector_bit_functor { | ||
using storage_type = typename device_bitvector<IndexType>::storage_type; | ||
constexpr storage_type operator()(IndexType i) const | ||
{ | ||
return device_bitvector<IndexType>::get_block_and_mask(i).second; | ||
} | ||
}; | ||
|
||
|
||
template <typename IndexType> | ||
struct bitvector_or_functor { | ||
using storage_type = typename device_bitvector<IndexType>::storage_type; | ||
constexpr storage_type operator()(storage_type a, storage_type b) const | ||
{ | ||
// https://github.com/ROCm/rocThrust/issues/352 | ||
#ifndef GKO_COMPILING_HIP | ||
// there must not be any duplicate indices | ||
assert(a ^ b == 0); | ||
#endif | ||
return a | b; | ||
} | ||
}; | ||
|
||
|
||
template <typename IndexType> | ||
struct bitvector_block_functor { | ||
// workaround for ROCm 4.5 bug | ||
using result_type = IndexType; | ||
constexpr static auto block_size = device_bitvector<IndexType>::block_size; | ||
constexpr IndexType operator()(IndexType i) const | ||
{ | ||
assert(i >= 0); | ||
assert(i < size); | ||
return i / block_size; | ||
} | ||
|
||
IndexType size; | ||
}; | ||
|
||
|
||
template <typename IndexType> | ||
struct bitvector_popcnt_functor { | ||
using storage_type = typename device_bitvector<IndexType>::storage_type; | ||
constexpr IndexType operator()(storage_type mask) const | ||
{ | ||
return gko::detail::popcount(mask); | ||
} | ||
}; | ||
|
||
|
||
template <typename IndexIterator> | ||
yhmtsai marked this conversation as resolved.
Show resolved
Hide resolved
|
||
gko::bitvector<typename std::iterator_traits<IndexIterator>::value_type> | ||
from_sorted_indices( | ||
std::shared_ptr<const DefaultExecutor> exec, IndexIterator it, | ||
typename std::iterator_traits<IndexIterator>::difference_type count, | ||
typename std::iterator_traits<IndexIterator>::value_type size) | ||
{ | ||
using index_type = typename std::iterator_traits<IndexIterator>::value_type; | ||
using storage_type = typename device_bitvector<index_type>::storage_type; | ||
constexpr auto block_size = device_bitvector<index_type>::block_size; | ||
const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size)); | ||
const auto policy = thrust_policy(exec); | ||
array<storage_type> bits_compact{exec, num_blocks}; | ||
array<index_type> bits_position{exec, num_blocks}; | ||
array<storage_type> bits{exec, num_blocks}; | ||
array<index_type> ranks{exec, num_blocks}; | ||
Comment on lines
+149
to
+152
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. compared to the others, it uses double memory allocation. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It has a better worst-case behavior, since it doesn't suffer from atomic collisions. If we implemented our own kernel for it, we could also get rid of the second pair of arrays, but |
||
const auto block_it = thrust::make_transform_iterator( | ||
it, bitvector_block_functor<index_type>{size}); | ||
const auto bit_it = thrust::make_transform_iterator( | ||
it, bitvector_bit_functor<index_type>{}); | ||
auto out_pos_it = bits_position.get_data(); | ||
auto out_bit_it = bits_compact.get_data(); | ||
auto [out_pos_end, out_bit_end] = thrust::reduce_by_key( | ||
policy, block_it, block_it + count, bit_it, out_pos_it, out_bit_it, | ||
thrust::equal_to<index_type>{}, bitvector_or_functor<storage_type>{}); | ||
assert(thrust::is_sorted(policy, out_pos_it, out_pos_end)); | ||
const auto out_size = out_pos_end - out_pos_it; | ||
thrust::fill_n(policy, bits.get_data(), num_blocks, 0); | ||
thrust::scatter(policy, out_bit_it, out_bit_it + out_size, out_pos_it, | ||
bits.get_data()); | ||
const auto rank_it = thrust::make_transform_iterator( | ||
bits.get_const_data(), bitvector_popcnt_functor<index_type>{}); | ||
thrust::exclusive_scan(policy, rank_it, rank_it + num_blocks, | ||
ranks.get_data(), index_type{}); | ||
|
||
return gko::bitvector<index_type>{std::move(bits), std::move(ranks), size}; | ||
} | ||
|
||
|
||
} // namespace bitvector | ||
} // namespace GKO_DEVICE_NAMESPACE | ||
} // namespace kernels | ||
} // namespace gko | ||
|
||
|
||
#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_ |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors | ||
// | ||
// SPDX-License-Identifier: BSD-3-Clause | ||
|
||
#ifndef GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_ | ||
#define GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_ | ||
|
||
|
||
#include "common/unified/base/kernel_launch.hpp" | ||
|
||
|
||
#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) | ||
#include "common/cuda_hip/components/bitvector.hpp" | ||
#elif defined(GKO_COMPILING_OMP) | ||
#include "omp/components/bitvector.hpp" | ||
#elif defined(GKO_COMPILING_DPCPP) | ||
#include "dpcpp/components/bitvector.dp.hpp" | ||
#else | ||
#error "This file should only be used inside Ginkgo device compilation" | ||
#endif | ||
|
||
|
||
#endif // GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_ |
Uh oh!
There was an error while loading. Please reload this page.