From a8be3803b3fe18cb7e3026b3d21c750e02720618 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 8 Mar 2025 01:39:05 +0100 Subject: [PATCH 01/15] add bitvector functionality --- common/cuda_hip/CMakeLists.txt | 1 + .../cuda_hip/components/bitvector_kernels.cpp | 55 +++++++ core/CMakeLists.txt | 1 + core/base/array.cpp | 1 + core/components/bit_packed_storage.hpp | 6 +- core/components/bitvector.cpp | 60 +++++++ core/components/bitvector.hpp | 90 ++++++++++ core/components/bitvector_kernels.hpp | 46 ++++++ core/components/range_minimum_query.cpp | 4 +- core/device_hooks/common_kernels.inc.cpp | 10 ++ dpcpp/CMakeLists.txt | 1 + dpcpp/components/bitvector_kernels.dp.cpp | 30 ++++ omp/CMakeLists.txt | 1 + omp/components/bitvector_kernels.cpp | 54 ++++++ reference/CMakeLists.txt | 1 + reference/components/bitvector_kernels.cpp | 49 ++++++ reference/test/components/CMakeLists.txt | 1 + .../test/components/bitvector_kernels.cpp | 103 ++++++++++++ test/components/CMakeLists.txt | 1 + test/components/bitvector_kernels.cpp | 155 ++++++++++++++++++ 20 files changed, 666 insertions(+), 4 deletions(-) create mode 100644 common/cuda_hip/components/bitvector_kernels.cpp create mode 100644 core/components/bitvector.cpp create mode 100644 core/components/bitvector.hpp create mode 100644 core/components/bitvector_kernels.hpp create mode 100644 dpcpp/components/bitvector_kernels.dp.cpp create mode 100644 omp/components/bitvector_kernels.cpp create mode 100644 reference/components/bitvector_kernels.cpp create mode 100644 reference/test/components/bitvector_kernels.cpp create mode 100644 test/components/bitvector_kernels.cpp diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index 10f3b857d82..0b69873b7a9 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -3,6 +3,7 @@ set(CUDA_HIP_SOURCES base/batch_multi_vector_kernels.cpp base/device_matrix_data_kernels.cpp base/index_set_kernels.cpp + components/bitvector_kernels.cpp components/prefix_sum_kernels.cpp distributed/assembly_kernels.cpp distributed/index_map_kernels.cpp diff --git a/common/cuda_hip/components/bitvector_kernels.cpp b/common/cuda_hip/components/bitvector_kernels.cpp new file mode 100644 index 00000000000..36178daacfa --- /dev/null +++ b/common/cuda_hip/components/bitvector_kernels.cpp @@ -0,0 +1,55 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/bitvector_kernels.hpp" + +#include +#include +#include +#include + +#include + +#include "common/cuda_hip/base/thrust.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace bitvector { + + +template +void compute_bits_and_ranks( + std::shared_ptr exec, const IndexType* indices, + IndexType num_indices, IndexType size, + typename device_bitvector::storage_type* bits, IndexType* ranks) +{ + const auto policy = thrust_policy(exec); + using bv = device_bitvector; + using storage_type = typename bv::storage_type; + const auto num_blocks = ceildiv(size, bv::block_size); + thrust::fill_n(policy, bits, num_blocks, 0u); + thrust::for_each_n( + policy, indices, num_indices, [bits] __device__(IndexType idx) { + constexpr auto block_size = device_bitvector::block_size; + const auto block = idx / block_size; + const auto local = idx % block_size; + atomicOr(bits + block, storage_type{1} << local); + }); + const auto it = thrust::make_transform_iterator( + bits, [] __device__(storage_type word) -> IndexType { + return gko::detail::popcount(word); + }); + thrust::exclusive_scan(policy, it, it + num_blocks, ranks, IndexType{}); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_BITVECTOR_COMPUTE_BITS_AND_RANKS_KERNEL); + + +} // namespace bitvector +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 815545e5f8b..c05f7d39c09 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -43,6 +43,7 @@ target_sources( base/segmented_array.cpp base/timer.cpp base/version.cpp + components/bitvector.cpp components/range_minimum_query.cpp config/config.cpp config/config_helper.cpp diff --git a/core/base/array.cpp b/core/base/array.cpp index 51fa4b34bb1..da1accc1ca1 100644 --- a/core/base/array.cpp +++ b/core/base/array.cpp @@ -91,6 +91,7 @@ ValueType reduce_add(const array& input_arr, #define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value) GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL); +template GKO_DECLARE_ARRAY_FILL(bool); template GKO_DECLARE_ARRAY_FILL(uint16); template GKO_DECLARE_ARRAY_FILL(uint32); #ifndef GKO_SIZE_T_IS_UINT64_T diff --git a/core/components/bit_packed_storage.hpp b/core/components/bit_packed_storage.hpp index 78777c1d2d2..2654b7ecb2a 100644 --- a/core/components/bit_packed_storage.hpp +++ b/core/components/bit_packed_storage.hpp @@ -239,11 +239,13 @@ class bit_packed_span { * @tparam num_bits The number of bits necessary to store a single value in the * array. Values need to be in the range [0, 2^num_bits). * @tparam size The number of values to store in the array. + * @tparam StorageType the underlying storage type to use for each individual + word */ -template +template class bit_packed_array { public: - using storage_type = uint32; + using storage_type = StorageType; constexpr static int bits_per_word = sizeof(storage_type) * CHAR_BIT; constexpr static int bits_per_value = round_up_pow2_constexpr(num_bits); constexpr static int values_per_word = bits_per_word / bits_per_value; diff --git a/core/components/bitvector.cpp b/core/components/bitvector.cpp new file mode 100644 index 00000000000..562ed0099ac --- /dev/null +++ b/core/components/bitvector.cpp @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/bitvector.hpp" + +#include + +#include "core/components/bitvector_kernels.hpp" + + +namespace gko { +namespace { + + +GKO_REGISTER_OPERATION(compute_bits_and_ranks, + bitvector::compute_bits_and_ranks); + + +} // namespace + +template +device_bitvector bitvector::device_view() const +{ + return device_bitvector{bits_.get_const_data(), + ranks_.get_const_data(), size_}; +} + + +template +bitvector::bitvector(std::shared_ptr exec, + index_type size) + : size_{size}, + bits_{exec, static_cast(ceildiv(size, block_size))}, + ranks_{exec, static_cast(ceildiv(size, block_size))} +{ + bits_.fill(storage_type{}); + ranks_.fill(0); +} + + +template +bitvector bitvector::from_sorted_indices( + const array& indices, index_type size) +{ + const auto exec = indices.get_executor(); + bitvector result{exec, size}; + exec->run(make_compute_bits_and_ranks( + indices.get_const_data(), static_cast(indices.get_size()), + size, result.bits_.get_data(), result.ranks_.get_data())); + return result; +} + + +#define GKO_DEFINE_BITVECTOR(IndexType) class bitvector + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DEFINE_BITVECTOR); + + +} // namespace gko diff --git a/core/components/bitvector.hpp b/core/components/bitvector.hpp new file mode 100644 index 00000000000..fa50eb38ff5 --- /dev/null +++ b/core/components/bitvector.hpp @@ -0,0 +1,90 @@ +// SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CORE_COMPONENTS_BITVECTOR_HPP_ +#define GKO_CORE_COMPONENTS_BITVECTOR_HPP_ + +#include +#include +#include +#include + +namespace gko { + + +template +class device_bitvector { +public: + using index_type = IndexType; + using storage_type = uint32; + constexpr static int block_size = CHAR_BIT * sizeof(storage_type); + + constexpr device_bitvector(const storage_type* bits, + const index_type* ranks, index_type size) + : bits_{bits}, ranks_{ranks}, size_{size} + {} + + constexpr index_type size() const { return size_; } + + constexpr index_type num_blocks() const + { + return (size() + block_size - 1) / block_size; + } + + constexpr bool get(index_type i) const + { + assert(i >= 0); + assert(i < size()); + const auto block = i / block_size; + const auto local = i % block_size; + return bool((bits_[block] >> local) & 1); + } + + constexpr index_type rank(index_type i) const + { + assert(i >= 0); + assert(i < size()); + const auto block = i / block_size; + const auto local = i % block_size; + const auto prefix_mask = (storage_type{1} << local) - 1; + return ranks_[block] + detail::popcount(prefix_mask & bits_[block]); + } + +private: + const index_type* ranks_; + const storage_type* bits_; + index_type size_; +}; + + +/** + * Bitvector with rank support. It supports bit queries (whether a bit is set) + * and rank queries (how many bits are set before a specific index). + * + * @tparam IndexType the type of indices used in the input and rank array. + */ +template +class bitvector { +public: + using index_type = IndexType; + using storage_type = uint32; + constexpr static int block_size = CHAR_BIT * sizeof(storage_type); + + device_bitvector device_view() const; + + static bitvector from_sorted_indices(const array& indices, + index_type size); + +private: + bitvector(std::shared_ptr exec, index_type size); + + index_type size_; + array bits_; + array ranks_; +}; + + +} // namespace gko + +#endif // GKO_CORE_COMPONENTS_BITVECTOR_HPP_ diff --git a/core/components/bitvector_kernels.hpp b/core/components/bitvector_kernels.hpp new file mode 100644 index 00000000000..5cbdf9cd6ad --- /dev/null +++ b/core/components/bitvector_kernels.hpp @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CORE_COMPONENTS_BITVECTOR_KERNELS_HPP_ +#define GKO_CORE_COMPONENTS_BITVECTOR_KERNELS_HPP_ + + +#include "core/components/bitvector.hpp" + +#include + +#include +#include + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_BITVECTOR_COMPUTE_BITS_AND_RANKS_KERNEL(IndexType) \ + void compute_bits_and_ranks( \ + std::shared_ptr exec, const IndexType* indices, \ + IndexType num_indices, IndexType size, \ + typename device_bitvector::storage_type* bits, \ + IndexType* ranks) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_BITVECTOR_COMPUTE_BITS_AND_RANKS_KERNEL(IndexType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(bitvector, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + +#endif // GKO_CORE_COMPONENTS_BITVECTOR_KERNELS_HPP_ diff --git a/core/components/range_minimum_query.cpp b/core/components/range_minimum_query.cpp index 07cb69fcdff..970a2ef739e 100644 --- a/core/components/range_minimum_query.cpp +++ b/core/components/range_minimum_query.cpp @@ -68,10 +68,10 @@ range_minimum_query::get() const } -#define GKO_DEFINE_DEVICE_RANGE_MINIMUM_QUERY(IndexType) \ +#define GKO_DEFINE_RANGE_MINIMUM_QUERY(IndexType) \ class range_minimum_query -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DEFINE_DEVICE_RANGE_MINIMUM_QUERY); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DEFINE_RANGE_MINIMUM_QUERY); } // namespace gko diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 1abe27e9558..d6d1c1d1b4b 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -13,6 +13,7 @@ #include "core/base/index_set_kernels.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/absolute_array_kernels.hpp" +#include "core/components/bitvector_kernels.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" #include "core/components/precision_conversion_kernels.hpp" @@ -290,6 +291,15 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES); } // namespace components +namespace bitvector { + + +GKO_STUB_INDEX_TYPE(GKO_DECLARE_BITVECTOR_COMPUTE_BITS_AND_RANKS_KERNEL); + + +} // namespace bitvector + + namespace range_minimum_query { diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index b8b4be789d1..acb1b4c804d 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -20,6 +20,7 @@ target_sources( base/scoped_device_id.dp.cpp base/timer.dp.cpp base/version.dp.cpp + components/bitvector_kernels.dp.cpp components/prefix_sum_kernels.dp.cpp distributed/assembly_kernels.dp.cpp distributed/index_map_kernels.dp.cpp diff --git a/dpcpp/components/bitvector_kernels.dp.cpp b/dpcpp/components/bitvector_kernels.dp.cpp new file mode 100644 index 00000000000..b71f475ad43 --- /dev/null +++ b/dpcpp/components/bitvector_kernels.dp.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/bitvector_kernels.hpp" + +#include + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace bitvector { + + +template +void compute_bits_and_ranks( + std::shared_ptr exec, const IndexType* indices, + IndexType num_indices, IndexType size, + typename device_bitvector::storage_type* bits, + IndexType* ranks) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_BITVECTOR_COMPUTE_BITS_AND_RANKS_KERNEL); + + +} // namespace bitvector +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 6e3d0ec2b49..e74f943a36d 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -14,6 +14,7 @@ target_sources( base/index_set_kernels.cpp base/scoped_device_id.cpp base/version.cpp + components/bitvector_kernels.cpp components/prefix_sum_kernels.cpp distributed/assembly_kernels.cpp distributed/index_map_kernels.cpp diff --git a/omp/components/bitvector_kernels.cpp b/omp/components/bitvector_kernels.cpp new file mode 100644 index 00000000000..7c2db95ab71 --- /dev/null +++ b/omp/components/bitvector_kernels.cpp @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/bitvector_kernels.hpp" + +#include + +#include "core/base/index_range.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +namespace bitvector { + + +template +void compute_bits_and_ranks( + std::shared_ptr exec, const IndexType* indices, + IndexType num_indices, IndexType size, + typename device_bitvector::storage_type* bits, IndexType* ranks) +{ + using bv = device_bitvector; + using storage_type = typename bv::storage_type; + const auto num_blocks = ceildiv(size, bv::block_size); +#pragma omp parallel for + for (IndexType i = 0; i < num_blocks; i++) { + bits[i] = 0; + } +#pragma omp parallel for + for (IndexType i = 0; i < num_indices; i++) { + const auto index = indices[i]; + const auto block_idx = index / bv::block_size; + const auto mask = storage_type{1} << index % bv::block_size; +#pragma omp atomic + bits[block_idx] |= mask; + } +#pragma omp parallel for + for (IndexType i = 0; i < num_blocks; i++) { + ranks[i] = gko::detail::popcount(bits[i]); + } + components::prefix_sum_nonnegative(exec, ranks, num_blocks); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_BITVECTOR_COMPUTE_BITS_AND_RANKS_KERNEL); + + +} // namespace bitvector +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 87858d18812..ab31c1942d9 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -8,6 +8,7 @@ target_sources( base/scoped_device_id.cpp base/version.cpp components/absolute_array_kernels.cpp + components/bitvector_kernels.cpp components/fill_array_kernels.cpp components/format_conversion_kernels.cpp components/range_minimum_query_kernels.cpp diff --git a/reference/components/bitvector_kernels.cpp b/reference/components/bitvector_kernels.cpp new file mode 100644 index 00000000000..8bbb7ff7679 --- /dev/null +++ b/reference/components/bitvector_kernels.cpp @@ -0,0 +1,49 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/bitvector_kernels.hpp" + +#include + +#include "core/base/index_range.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +namespace bitvector { + + +template +void compute_bits_and_ranks( + std::shared_ptr exec, const IndexType* indices, + IndexType num_indices, IndexType size, + typename device_bitvector::storage_type* bits, IndexType* ranks) +{ + using bv = device_bitvector; + using storage_type = typename bv::storage_type; + const auto num_blocks = ceildiv(size, bv::block_size); + std::fill_n(bits, num_blocks, 0u); + for (auto i : irange{num_indices}) { + const auto index = indices[i]; + assert(index >= 0); + assert(index < size); + bits[index / bv::block_size] |= storage_type{1} + << index % bv::block_size; + } + IndexType rank{}; + for (auto i : irange{num_blocks}) { + ranks[i] = rank; + rank += gko::detail::popcount(bits[i]); + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_BITVECTOR_COMPUTE_BITS_AND_RANKS_KERNEL); + + +} // namespace bitvector +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/test/components/CMakeLists.txt b/reference/test/components/CMakeLists.txt index b17880ab32d..a4e36a3439a 100644 --- a/reference/test/components/CMakeLists.txt +++ b/reference/test/components/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_test(absolute_array_kernels) +ginkgo_create_test(bitvector_kernels) ginkgo_create_test(fill_array_kernels) ginkgo_create_test(format_conversion_kernels) ginkgo_create_test(precision_conversion_kernels) diff --git a/reference/test/components/bitvector_kernels.cpp b/reference/test/components/bitvector_kernels.cpp new file mode 100644 index 00000000000..d8140188bee --- /dev/null +++ b/reference/test/components/bitvector_kernels.cpp @@ -0,0 +1,103 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/bitvector.hpp" + +#include +#include +#include + +#include + +#include "core/base/index_range.hpp" +#include "core/components/bitvector_kernels.hpp" +#include "core/test/utils.hpp" + + +template +class Bitvector : public ::testing::Test { +protected: + using index_type = IndexType; + using device_type = gko::bitvector; + using storage_type = typename device_type::storage_type; + constexpr static auto block_size = device_type::block_size; + Bitvector() + : ref{gko::ReferenceExecutor::create()}, rng{67593}, sizes{0, 1, + 2, 16, + 31, 32, + 33, 40, + 63, 64, + 65, 127, + 128, 129, + 1000, 1024, + 2000} + {} + + std::vector create_random_values(index_type num_values, + index_type size) + { + std::vector values(num_values); + std::uniform_int_distribution dist( + 0, std::max(size - 1, index_type{})); + for (auto& value : values) { + value = dist(this->rng); + } + std::sort(values.begin(), values.end()); + values.erase(std::unique(values.begin(), values.end()), values.end()); + return values; + } + + std::shared_ptr ref; + std::default_random_engine rng; + std::vector sizes; +}; + +TYPED_TEST_SUITE(Bitvector, gko::test::IndexTypes, TypenameNameGenerator); + + +TYPED_TEST(Bitvector, ComputeBitsAndRanks) +{ + using index_type = typename TestFixture::index_type; + using storage_type = typename TestFixture::storage_type; + constexpr auto block_size = TestFixture::block_size; + for (auto size : this->sizes) { + SCOPED_TRACE(size); + for (auto num_values : + {index_type{}, size / 10, size / 4, size / 2, size}) { + SCOPED_TRACE(num_values); + auto values = this->create_random_values(num_values, size); + num_values = values.size(); + const auto num_blocks = (size + block_size - 1) / block_size; + std::vector bits(num_blocks, ~storage_type{}); + std::vector ranks(num_blocks, -1); + + gko::kernels::reference::bitvector::compute_bits_and_ranks( + this->ref, values.data(), num_values, size, bits.data(), + ranks.data()); + + // check bits and ranks are correct + gko::device_bitvector bv(bits.data(), ranks.data(), + size); + ASSERT_EQ(bv.size(), size); + ASSERT_EQ(bv.num_blocks(), num_blocks); + auto it = values.begin(); + index_type rank{}; + for (auto i : gko::irange{size}) { + const auto block = i / block_size; + const auto local = i % block_size; + ASSERT_EQ(bv.rank(i), rank); + if (it != values.end() && *it == i) { + ASSERT_TRUE(bool(bits[block] & (storage_type{1} << local))); + ASSERT_TRUE(bv.get(i)); + ++rank; + ++it; + } else { + ASSERT_FALSE( + bool(bits[block] & (storage_type{1} << local))); + ASSERT_FALSE(bv.get(i)); + } + } + } + } +} diff --git a/test/components/CMakeLists.txt b/test/components/CMakeLists.txt index 12e738d8eaa..bcc300d5d99 100644 --- a/test/components/CMakeLists.txt +++ b/test/components/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_common_test(absolute_array_kernels) +ginkgo_create_common_device_test(bitvector_kernels DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(fill_array_kernels) ginkgo_create_common_test(format_conversion_kernels) ginkgo_create_common_test(precision_conversion_kernels) diff --git a/test/components/bitvector_kernels.cpp b/test/components/bitvector_kernels.cpp new file mode 100644 index 00000000000..b513307b1fa --- /dev/null +++ b/test/components/bitvector_kernels.cpp @@ -0,0 +1,155 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/bitvector_kernels.hpp" + +#include +#include +#include +#include +#include + +#include + +#include + +#include "common/unified/base/kernel_launch.hpp" +#include "core/base/index_range.hpp" +#include "core/test/utils.hpp" +#include "test/utils/common_fixture.hpp" + + +// workaround for cudafe 11.0 bug +using gko::irange; + + +template +class Bitvector : public CommonTestFixture { +protected: + using index_type = T; + using device_type = gko::bitvector; + using storage_type = typename device_type::storage_type; + constexpr static auto block_size = device_type::block_size; + + Bitvector() + : rng{67193}, sizes{0, 1, 2, 16, 31, 32, 33, + 40, 63, 64, 65, 127, 128, 129, + 1000, 1024, 2000, 10000, 100000} + {} + + gko::array create_random_values(index_type num_values, + index_type size) + { + std::vector values(num_values); + std::uniform_int_distribution dist( + 0, std::max(size - 1, index_type{})); + for (auto& value : values) { + value = dist(this->rng); + } + std::sort(values.begin(), values.end()); + values.erase(std::unique(values.begin(), values.end()), values.end()); + return gko::array{this->ref, values.begin(), values.end()}; + } + + std::default_random_engine rng; + std::vector sizes; +}; + +TYPED_TEST_SUITE(Bitvector, gko::test::IndexTypes, TypenameNameGenerator); + + +// nvcc doesn't like device lambdas inside class member functions + +template +void run_device(std::shared_ptr exec, + const gko::device_bitvector bv, + const gko::device_bitvector dbv, + gko::array& output_bools, + gko::array& output_ranks, + gko::array& doutput_bools, + gko::array& doutput_ranks) +{ + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( + exec, + [] GKO_KERNEL(auto i, auto bv, auto output_bool, auto output_rank) { + output_bool[i] = bv.get(i); + output_rank[i] = bv.rank(i); + }, + dbv.size(), dbv, doutput_bools, doutput_ranks); + for (auto i : gko::irange{bv.size()}) { + output_bools.get_data()[i] = bv.get(i); + output_ranks.get_data()[i] = bv.rank(i); + } +} + + +TYPED_TEST(Bitvector, ComputeBitsAndRanksIsEquivalentToRef) +{ + using index_type = typename TestFixture::index_type; + using storage_type = typename TestFixture::storage_type; + constexpr auto block_size = TestFixture::block_size; + for (auto size : this->sizes) { + SCOPED_TRACE(size); + for (auto num_values : + {index_type{}, size / 10, size / 4, size / 2, size}) { + SCOPED_TRACE(num_values); + auto values = this->create_random_values(num_values, size); + num_values = values.get_size(); + gko::array dvalues{this->exec, values}; + const auto num_blocks = + static_cast(gko::ceildiv(size, block_size)); + gko::array ranks{this->ref, num_blocks}; + gko::array dranks{this->exec, num_blocks}; + gko::array bits{this->ref, num_blocks}; + gko::array dbits{this->exec, num_blocks}; + dranks.fill(-1); + dbits.fill(~storage_type{}); + + gko::kernels::reference::bitvector::compute_bits_and_ranks( + this->ref, values.get_const_data(), num_values, size, + bits.get_data(), ranks.get_data()); + gko::kernels::GKO_DEVICE_NAMESPACE::bitvector:: + compute_bits_and_ranks(this->exec, dvalues.get_const_data(), + num_values, size, dbits.get_data(), + dranks.get_data()); + + GKO_ASSERT_ARRAY_EQ(bits, dbits); + GKO_ASSERT_ARRAY_EQ(ranks, dranks); + } + } +} + + +TYPED_TEST(Bitvector, AccessIsEquivalentToRef) +{ + using index_type = typename TestFixture::index_type; + using storage_type = typename TestFixture::storage_type; + constexpr auto block_size = TestFixture::block_size; + for (auto size : this->sizes) { + SCOPED_TRACE(size); + for (auto num_values : + {index_type{}, size / 10, size / 4, size / 2, size}) { + SCOPED_TRACE(num_values); + auto values = this->create_random_values(num_values, size); + num_values = values.get_size(); + gko::array dvalues{this->exec, values}; + + auto bv = + gko::bitvector::from_sorted_indices(values, size); + auto dbv = + gko::bitvector::from_sorted_indices(dvalues, size); + + const auto usize = static_cast(size); + gko::array output_bools{this->ref, usize}; + gko::array output_ranks{this->ref, usize}; + gko::array doutput_bools{this->exec, usize}; + gko::array doutput_ranks{this->exec, usize}; + doutput_bools.fill(true); + doutput_ranks.fill(-1); + run_device(this->exec, bv.device_view(), dbv.device_view(), + output_bools, output_ranks, doutput_bools, + doutput_ranks); + } + } +} From d9d91dd4a4aabc7cd0f0e41efa4ad22497015070 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Apr 2025 11:39:11 +0200 Subject: [PATCH 02/15] add bitvector generation from predicate --- .../cuda_hip/components/bitvector_kernels.hpp | 84 ++++++++++++ .../components/bitvector_kernels.generic.hpp | 65 +++++++++ .../unified/components/bitvector_kernels.hpp | 21 +++ core/components/bitvector.cpp | 56 +++++++- core/components/bitvector.hpp | 13 ++ reference/components/bitvector_kernels.hpp | 51 +++++++ test/components/bitvector_kernels.cpp | 128 ++++++++++++------ 7 files changed, 374 insertions(+), 44 deletions(-) create mode 100644 common/cuda_hip/components/bitvector_kernels.hpp create mode 100644 common/unified/components/bitvector_kernels.generic.hpp create mode 100644 common/unified/components/bitvector_kernels.hpp create mode 100644 reference/components/bitvector_kernels.hpp diff --git a/common/cuda_hip/components/bitvector_kernels.hpp b/common/cuda_hip/components/bitvector_kernels.hpp new file mode 100644 index 00000000000..e523e9dc6f3 --- /dev/null +++ b/common/cuda_hip/components/bitvector_kernels.hpp @@ -0,0 +1,84 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_KERNELS_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_KERNELS_HPP_ + +#include "core/components/bitvector.hpp" + +#include + +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace bitvector { + + +constexpr auto default_block_size = 512; + + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void bitvector_from_predicate( + IndexType size, + typename device_bitvector::storage_type* __restrict__ bits, + IndexType* __restrict__ popcounts, DevicePredicate predicate) +{ + constexpr auto block_size = device_bitvector::block_size; + const auto subwarp_id = thread::get_subwarp_id_flat(); + const auto subwarp_base = subwarp_id * block_size; + if (subwarp_base >= size) { + return; + } + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + const auto i = static_cast(subwarp_base + subwarp.thread_rank()); + const auto bit = i < size ? predicate(i) : false; + const auto mask = subwarp.ballot(bit); + if (subwarp.thread_rank() == 0) { + bits[subwarp_id] = mask; + popcounts[subwarp_id] = gko::detail::popcount(mask); + } +} + + +} // namespace kernel + + +template +gko::bitvector bitvector_from_predicate( + std::shared_ptr exec, IndexType size, + DevicePredicate device_predicate) +{ + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bits{exec, num_blocks}; + array ranks{exec, num_blocks}; + if (num_blocks > 0) { + const auto num_threadblocks = + ceildiv(num_blocks, default_block_size / block_size); + kernel::bitvector_from_predicate<<get_stream()>>>( + size, bits.get_data(), ranks.get_data(), device_predicate); + components::prefix_sum_nonnegative(exec, ranks.get_data(), num_blocks); + } + + return gko::bitvector{std::move(bits), std::move(ranks), size}; +} + + +} // namespace bitvector +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_KERNELS_HPP_ diff --git a/common/unified/components/bitvector_kernels.generic.hpp b/common/unified/components/bitvector_kernels.generic.hpp new file mode 100644 index 00000000000..f9a94d90e8e --- /dev/null +++ b/common/unified/components/bitvector_kernels.generic.hpp @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_KERNELS_GENERIC_HPP_ +#define GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_KERNELS_GENERIC_HPP_ + +#include "common/unified/base/kernel_launch.hpp" +#include "core/components/bitvector.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace bitvector { + + +template +gko::bitvector bitvector_from_predicate( + std::shared_ptr exec, IndexType size, + DevicePredicate device_predicate) +{ + using storage_type = typename device_bitvector::storage_type; + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bits{exec, num_blocks}; + array ranks{exec, num_blocks}; + run_kernel( + exec, + [] GKO_KERNEL(auto block_i, auto size, auto device_predicate, auto bits, + auto ranks) { + const auto base_i = block_i * block_size; + storage_type mask{}; + if (base_i + block_size <= size) { + for (int local_i = 0; local_i < block_size; local_i++) { + const storage_type bit = + device_predicate(base_i + local_i) ? 1 : 0; + mask |= bit << local_i; + } + } else { + int local_i = 0; + for (int local_i = 0; base_i + local_i < size; local_i++) { + const storage_type bit = + device_predicate(base_i + local_i) ? 1 : 0; + mask |= bit << local_i; + } + } + bits[block_i] = mask; + ranks[block_i] = gko::detail::popcount(mask); + }, + num_blocks, size, device_predicate, bits, ranks); + components::prefix_sum_nonnegative(exec, ranks.get_data(), num_blocks); + + return gko::bitvector{std::move(bits), std::move(ranks), size}; +} + + +} // namespace bitvector +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_KERNELS_GENERIC_HPP_ diff --git a/common/unified/components/bitvector_kernels.hpp b/common/unified/components/bitvector_kernels.hpp new file mode 100644 index 00000000000..115b6eae7ba --- /dev/null +++ b/common/unified/components/bitvector_kernels.hpp @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_KERNELS_HPP_ +#define GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_KERNELS_HPP_ + + +#include "common/unified/base/kernel_launch.hpp" + + +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) +#include "common/cuda_hip/components/bitvector_kernels.hpp" +#elif defined(GKO_COMPILING_DPCPP) || defined(GKO_COMPILING_OMP) +#include "common/unified/components/bitvector_kernels.generic.hpp" +#else +#error "This file should only be used inside Ginkgo device compilation" +#endif + + +#endif // GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_KERNELS_HPP_ diff --git a/core/components/bitvector.cpp b/core/components/bitvector.cpp index 562ed0099ac..cc3b338bb79 100644 --- a/core/components/bitvector.cpp +++ b/core/components/bitvector.cpp @@ -19,11 +19,59 @@ GKO_REGISTER_OPERATION(compute_bits_and_ranks, } // namespace + template device_bitvector bitvector::device_view() const { - return device_bitvector{bits_.get_const_data(), - ranks_.get_const_data(), size_}; + return device_bitvector{this->get_bits(), this->get_ranks(), + this->get_size()}; +} + + +template +std::shared_ptr bitvector::get_executor() const +{ + return bits_.get_executor(); +} + + +template +const typename bitvector::storage_type* +bitvector::get_bits() const +{ + return bits_.get_const_data(); +} + + +template +const IndexType* bitvector::get_ranks() const +{ + return ranks_.get_const_data(); +} + + +template +IndexType bitvector::get_size() const +{ + return size_; +} + + +template +IndexType bitvector::get_num_blocks() const +{ + return static_cast(ceildiv(this->get_size(), block_size)); +} + + +template +bitvector::bitvector(array bits, + array ranks, index_type size) + : size_{size}, bits_{std::move(bits)}, ranks_{std::move(ranks)} +{ + GKO_ASSERT(bits_.get_executor() == ranks_.get_executor()); + GKO_ASSERT(this->get_num_blocks() == bits_.get_size()); + GKO_ASSERT(this->get_num_blocks() == ranks_.get_size()); } @@ -31,8 +79,8 @@ template bitvector::bitvector(std::shared_ptr exec, index_type size) : size_{size}, - bits_{exec, static_cast(ceildiv(size, block_size))}, - ranks_{exec, static_cast(ceildiv(size, block_size))} + bits_{exec, static_cast(this->get_num_blocks())}, + ranks_{exec, static_cast(this->get_num_blocks())} { bits_.fill(storage_type{}); ranks_.fill(0); diff --git a/core/components/bitvector.hpp b/core/components/bitvector.hpp index fa50eb38ff5..9df4d0ef34a 100644 --- a/core/components/bitvector.hpp +++ b/core/components/bitvector.hpp @@ -76,6 +76,19 @@ class bitvector { static bitvector from_sorted_indices(const array& indices, index_type size); + std::shared_ptr get_executor() const; + + const storage_type* get_bits() const; + + const index_type* get_ranks() const; + + index_type get_size() const; + + index_type get_num_blocks() const; + + bitvector(array bits, array ranks, + index_type size); + private: bitvector(std::shared_ptr exec, index_type size); diff --git a/reference/components/bitvector_kernels.hpp b/reference/components/bitvector_kernels.hpp new file mode 100644 index 00000000000..a17686311f1 --- /dev/null +++ b/reference/components/bitvector_kernels.hpp @@ -0,0 +1,51 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_REFERENCE_COMPONENTS_BITVECTOR_KERNELS_GENERIC_HPP_ +#define GKO_REFERENCE_COMPONENTS_BITVECTOR_KERNELS_GENERIC_HPP_ + +#include "core/components/bitvector.hpp" + +#include "core/base/index_range.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +namespace bitvector { + + +template +gko::bitvector bitvector_from_predicate( + std::shared_ptr exec, IndexType size, + DevicePredicate device_predicate) +{ + using storage_type = typename device_bitvector::storage_type; + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bits{exec, num_blocks}; + array ranks{exec, num_blocks}; + std::fill_n(bits.get_data(), num_blocks, 0); + std::fill_n(ranks.get_data(), num_blocks, 0); + for (auto i : irange{size}) { + if (device_predicate(i)) { + bits.get_data()[i / block_size] |= storage_type{1} + << (i % block_size); + ranks.get_data()[i / block_size]++; + } + } + components::prefix_sum_nonnegative(exec, ranks.get_data(), num_blocks); + + return gko::bitvector{std::move(bits), std::move(ranks), size}; +} + + +} // namespace bitvector +} // namespace reference +} // namespace kernels +} // namespace gko + + +#endif // GKO_REFERENCE_COMPONENTS_BITVECTOR_KERNELS_GENERIC_HPP_ diff --git a/test/components/bitvector_kernels.cpp b/test/components/bitvector_kernels.cpp index b513307b1fa..1a672815cb1 100644 --- a/test/components/bitvector_kernels.cpp +++ b/test/components/bitvector_kernels.cpp @@ -13,10 +13,13 @@ #include #include +#include #include "common/unified/base/kernel_launch.hpp" +#include "common/unified/components/bitvector_kernels.hpp" #include "core/base/index_range.hpp" #include "core/test/utils.hpp" +#include "reference/components/bitvector_kernels.hpp" #include "test/utils/common_fixture.hpp" @@ -28,9 +31,10 @@ template class Bitvector : public CommonTestFixture { protected: using index_type = T; - using device_type = gko::bitvector; - using storage_type = typename device_type::storage_type; - constexpr static auto block_size = device_type::block_size; + using bitvector = gko::bitvector; + using device_bitvector = gko::device_bitvector; + using storage_type = typename bitvector::storage_type; + constexpr static auto block_size = bitvector::block_size; Bitvector() : rng{67193}, sizes{0, 1, 2, 16, 31, 32, 33, @@ -52,6 +56,27 @@ class Bitvector : public CommonTestFixture { return gko::array{this->ref, values.begin(), values.end()}; } + void assert_bitvector_equal(const bitvector& bv, const bitvector& dbv) + { + ASSERT_EQ(bv.get_size(), dbv.get_size()); + const auto num_blocks = + static_cast(bv.get_num_blocks()); + const auto bits = + gko::detail::array_const_cast(gko::make_const_array_view( + bv.get_executor(), num_blocks, bv.get_bits())); + const auto dbits = + gko::detail::array_const_cast(gko::make_const_array_view( + dbv.get_executor(), num_blocks, dbv.get_bits())); + const auto ranks = + gko::detail::array_const_cast(gko::make_const_array_view( + bv.get_executor(), num_blocks, bv.get_ranks())); + const auto dranks = + gko::detail::array_const_cast(gko::make_const_array_view( + dbv.get_executor(), num_blocks, dbv.get_ranks())); + GKO_ASSERT_ARRAY_EQ(bits, dbits); + GKO_ASSERT_ARRAY_EQ(ranks, dranks); + } + std::default_random_engine rng; std::vector sizes; }; @@ -59,6 +84,66 @@ class Bitvector : public CommonTestFixture { TYPED_TEST_SUITE(Bitvector, gko::test::IndexTypes, TypenameNameGenerator); +TYPED_TEST(Bitvector, BuildFromIndicesIsEquivalentToRef) +{ + using index_type = typename TestFixture::index_type; + using bitvector = typename TestFixture::bitvector; + for (auto size : this->sizes) { + SCOPED_TRACE(size); + for (auto num_values : + {index_type{}, size / 10, size / 4, size / 2, size}) { + SCOPED_TRACE(num_values); + auto values = this->create_random_values(num_values, size); + gko::array dvalues{this->exec, values}; + + auto bv = bitvector::from_sorted_indices(values, size); + auto dbv = bitvector::from_sorted_indices(dvalues, size); + + this->assert_bitvector_equal(bv, dbv); + } + } +} + + +template +std::pair, gko::bitvector> run_predicate( + std::shared_ptr ref, + std::shared_ptr exec, IndexType size, int stride) +{ + return std::make_pair( + gko::kernels::reference::bitvector::bitvector_from_predicate( + ref, size, [stride](int i) { return i % stride == 0; }), + gko::kernels::GKO_DEVICE_NAMESPACE::bitvector::bitvector_from_predicate( + exec, size, + [stride] GKO_KERNEL(int i) { return i % stride == 0; })); +} + + +TYPED_TEST(Bitvector, BuildFromPredicateIsEquivalentToFromIndices) +{ + using index_type = typename TestFixture::index_type; + using bitvector = typename TestFixture::bitvector; + for (auto size : this->sizes) { + SCOPED_TRACE(size); + for (auto stride : {1, 2, 3, 4, 5, 65}) { + SCOPED_TRACE(stride); + std::vector indices; + for (index_type i = 0; i < size; i += stride) { + indices.push_back(i); + } + gko::array values{this->ref, indices.begin(), + indices.end()}; + + auto [bv, dbv] = run_predicate(this->ref, this->exec, size, stride); + + auto ref_bv = bitvector::from_sorted_indices(values, size); + this->assert_bitvector_equal(bv, dbv); + this->assert_bitvector_equal(ref_bv, dbv); + } + } +} + + // nvcc doesn't like device lambdas inside class member functions template @@ -84,43 +169,6 @@ void run_device(std::shared_ptr exec, } -TYPED_TEST(Bitvector, ComputeBitsAndRanksIsEquivalentToRef) -{ - using index_type = typename TestFixture::index_type; - using storage_type = typename TestFixture::storage_type; - constexpr auto block_size = TestFixture::block_size; - for (auto size : this->sizes) { - SCOPED_TRACE(size); - for (auto num_values : - {index_type{}, size / 10, size / 4, size / 2, size}) { - SCOPED_TRACE(num_values); - auto values = this->create_random_values(num_values, size); - num_values = values.get_size(); - gko::array dvalues{this->exec, values}; - const auto num_blocks = - static_cast(gko::ceildiv(size, block_size)); - gko::array ranks{this->ref, num_blocks}; - gko::array dranks{this->exec, num_blocks}; - gko::array bits{this->ref, num_blocks}; - gko::array dbits{this->exec, num_blocks}; - dranks.fill(-1); - dbits.fill(~storage_type{}); - - gko::kernels::reference::bitvector::compute_bits_and_ranks( - this->ref, values.get_const_data(), num_values, size, - bits.get_data(), ranks.get_data()); - gko::kernels::GKO_DEVICE_NAMESPACE::bitvector:: - compute_bits_and_ranks(this->exec, dvalues.get_const_data(), - num_values, size, dbits.get_data(), - dranks.get_data()); - - GKO_ASSERT_ARRAY_EQ(bits, dbits); - GKO_ASSERT_ARRAY_EQ(ranks, dranks); - } - } -} - - TYPED_TEST(Bitvector, AccessIsEquivalentToRef) { using index_type = typename TestFixture::index_type; From 60c441c9ce4fde704e2f165d67a2c6048fd719e5 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 20 Apr 2025 15:52:53 +0200 Subject: [PATCH 03/15] use sortedness in bitvector kernels --- .../cuda_hip/components/bitvector_kernels.hpp | 88 +++++++++++++++++++ test/components/bitvector_kernels.cpp | 29 ++++++ 2 files changed, 117 insertions(+) diff --git a/common/cuda_hip/components/bitvector_kernels.hpp b/common/cuda_hip/components/bitvector_kernels.hpp index e523e9dc6f3..39d15e139f5 100644 --- a/common/cuda_hip/components/bitvector_kernels.hpp +++ b/common/cuda_hip/components/bitvector_kernels.hpp @@ -7,8 +7,15 @@ #include "core/components/bitvector.hpp" +#include +#include +#include +#include +#include + #include +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -75,6 +82,87 @@ gko::bitvector bitvector_from_predicate( } +template +struct bitvector_bit_functor { + using storage_type = typename device_bitvector::storage_type; + constexpr static auto block_size = device_bitvector::block_size; + __device__ storage_type operator()(IndexType i) + { + return storage_type{1} << (i % block_size); + } + + __device__ storage_type operator()(storage_type a, storage_type b) + { + // there must not be any duplicate indices + assert(a ^ b == 0); + return a | b; + } +}; + + +template +struct bitvector_block_functor { + constexpr static auto block_size = device_bitvector::block_size; + __device__ IndexType operator()(IndexType i) + { + assert(i >= 0); + assert(i < size); + return i / block_size; + } + + IndexType size; +}; + + +template +struct bitvector_popcnt_functor { + using storage_type = typename device_bitvector::storage_type; + __device__ IndexType operator()(storage_type mask) + { + return gko::detail::popcount(mask); + } +}; + + +template +gko::bitvector::value_type> +bitvector_from_sorted_indices( + std::shared_ptr exec, IndexIterator it, + typename std::iterator_traits::difference_type count, + typename std::iterator_traits::value_type size) +{ + using index_type = typename std::iterator_traits::value_type; + using storage_type = typename device_bitvector::storage_type; + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + const auto policy = thrust_policy(exec); + array bits_compact{exec, num_blocks}; + array bits_position{exec, num_blocks}; + array bits{exec, num_blocks}; + array ranks{exec, num_blocks}; + const auto block_it = thrust::make_transform_iterator( + it, bitvector_block_functor{size}); + const auto bit_it = thrust::make_transform_iterator( + it, bitvector_bit_functor{}); + auto out_pos_it = bits_position.get_data(); + auto out_bit_it = bits_compact.get_data(); + auto [out_pos_end, out_bit_end] = thrust::reduce_by_key( + policy, block_it, block_it + count, bit_it, out_pos_it, out_bit_it, + thrust::equal_to{}, bitvector_bit_functor{}); + assert(thrust::is_sorted(policy, out_pos_it, out_pos_end)); + const auto out_size = out_pos_end - out_pos_it; + thrust::fill_n(policy, bits.get_data(), num_blocks, 0); + thrust::scatter(policy, out_bit_it, out_bit_it + out_size, out_pos_it, + bits.get_data()); + const auto rank_it = thrust::make_transform_iterator( + bits.get_const_data(), bitvector_popcnt_functor{}); + thrust::exclusive_scan(policy, rank_it, rank_it + num_blocks, + ranks.get_data(), index_type{}); + + return gko::bitvector{std::move(bits), std::move(ranks), size}; +} + + } // namespace bitvector } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/test/components/bitvector_kernels.cpp b/test/components/bitvector_kernels.cpp index 1a672815cb1..f7920daced3 100644 --- a/test/components/bitvector_kernels.cpp +++ b/test/components/bitvector_kernels.cpp @@ -84,6 +84,35 @@ class Bitvector : public CommonTestFixture { TYPED_TEST_SUITE(Bitvector, gko::test::IndexTypes, TypenameNameGenerator); +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) + + +TYPED_TEST(Bitvector, BuildFromIndicesDeviceIsEquivalentToRef) +{ + using index_type = typename TestFixture::index_type; + using bitvector = typename TestFixture::bitvector; + for (auto size : this->sizes) { + SCOPED_TRACE(size); + for (auto num_values : + {index_type{}, size / 10, size / 4, size / 2, size}) { + SCOPED_TRACE(num_values); + auto values = this->create_random_values(num_values, size); + gko::array dvalues{this->exec, values}; + + auto bv = bitvector::from_sorted_indices(values, size); + auto dbv = gko::kernels::GKO_DEVICE_NAMESPACE::bitvector:: + bitvector_from_sorted_indices(this->exec, dvalues.get_data(), + dvalues.get_size(), size); + + this->assert_bitvector_equal(bv, dbv); + } + } +} + + +#endif + + TYPED_TEST(Bitvector, BuildFromIndicesIsEquivalentToRef) { using index_type = typename TestFixture::index_type; From a0f0c3e310d96f29dc895b7980443991c2a8559b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 20 Apr 2025 18:45:00 +0200 Subject: [PATCH 04/15] add bitvector support for all backends --- common/cuda_hip/CMakeLists.txt | 1 - .../{bitvector_kernels.hpp => bitvector.hpp} | 20 ++-- .../cuda_hip/components/bitvector_kernels.cpp | 55 --------- ...nels.generic.hpp => bitvector.generic.hpp} | 9 +- common/unified/components/bitvector.hpp | 26 +++++ .../unified/components/bitvector_kernels.hpp | 21 ---- core/CMakeLists.txt | 1 - core/components/bitvector.cpp | 108 ------------------ core/components/bitvector.hpp | 85 +++++++++++--- core/components/bitvector_kernels.hpp | 46 -------- core/device_hooks/common_kernels.inc.cpp | 10 -- dpcpp/CMakeLists.txt | 1 - dpcpp/components/bitvector.dp.hpp | 72 ++++++++++++ dpcpp/components/bitvector_kernels.dp.cpp | 30 ----- omp/CMakeLists.txt | 1 - omp/components/bitvector.hpp | 85 ++++++++++++++ omp/components/bitvector_kernels.cpp | 54 --------- reference/CMakeLists.txt | 1 - reference/components/bitvector.hpp | 83 ++++++++++++++ reference/components/bitvector_kernels.hpp | 51 --------- reference/test/components/CMakeLists.txt | 2 +- .../{bitvector_kernels.cpp => bitvector.cpp} | 34 +++--- test/components/CMakeLists.txt | 2 +- .../{bitvector_kernels.cpp => bitvector.cpp} | 64 ++++------- 24 files changed, 390 insertions(+), 472 deletions(-) rename common/cuda_hip/components/{bitvector_kernels.hpp => bitvector.hpp} (91%) delete mode 100644 common/cuda_hip/components/bitvector_kernels.cpp rename common/unified/components/{bitvector_kernels.generic.hpp => bitvector.generic.hpp} (87%) create mode 100644 common/unified/components/bitvector.hpp delete mode 100644 common/unified/components/bitvector_kernels.hpp delete mode 100644 core/components/bitvector.cpp delete mode 100644 core/components/bitvector_kernels.hpp create mode 100644 dpcpp/components/bitvector.dp.hpp delete mode 100644 dpcpp/components/bitvector_kernels.dp.cpp create mode 100644 omp/components/bitvector.hpp delete mode 100644 omp/components/bitvector_kernels.cpp create mode 100644 reference/components/bitvector.hpp delete mode 100644 reference/components/bitvector_kernels.hpp rename reference/test/components/{bitvector_kernels.cpp => bitvector.cpp} (75%) rename test/components/{bitvector_kernels.cpp => bitvector.cpp} (80%) diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index 0b69873b7a9..10f3b857d82 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -3,7 +3,6 @@ set(CUDA_HIP_SOURCES base/batch_multi_vector_kernels.cpp base/device_matrix_data_kernels.cpp base/index_set_kernels.cpp - components/bitvector_kernels.cpp components/prefix_sum_kernels.cpp distributed/assembly_kernels.cpp distributed/index_map_kernels.cpp diff --git a/common/cuda_hip/components/bitvector_kernels.hpp b/common/cuda_hip/components/bitvector.hpp similarity index 91% rename from common/cuda_hip/components/bitvector_kernels.hpp rename to common/cuda_hip/components/bitvector.hpp index 39d15e139f5..f1a04084408 100644 --- a/common/cuda_hip/components/bitvector_kernels.hpp +++ b/common/cuda_hip/components/bitvector.hpp @@ -2,10 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_KERNELS_HPP_ -#define GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_KERNELS_HPP_ - -#include "core/components/bitvector.hpp" +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_ #include #include @@ -18,6 +16,7 @@ #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/bitvector.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -34,12 +33,13 @@ namespace kernel { template -__global__ __launch_bounds__(default_block_size) void bitvector_from_predicate( +__global__ __launch_bounds__(default_block_size) void from_predicate( IndexType size, typename device_bitvector::storage_type* __restrict__ bits, IndexType* __restrict__ popcounts, DevicePredicate predicate) { constexpr auto block_size = device_bitvector::block_size; + static_assert(block_size <= config::warp_size); const auto subwarp_id = thread::get_subwarp_id_flat(); const auto subwarp_base = subwarp_id * block_size; if (subwarp_base >= size) { @@ -61,7 +61,7 @@ __global__ __launch_bounds__(default_block_size) void bitvector_from_predicate( template -gko::bitvector bitvector_from_predicate( +gko::bitvector from_predicate( std::shared_ptr exec, IndexType size, DevicePredicate device_predicate) { @@ -72,8 +72,8 @@ gko::bitvector bitvector_from_predicate( if (num_blocks > 0) { const auto num_threadblocks = ceildiv(num_blocks, default_block_size / block_size); - kernel::bitvector_from_predicate<<get_stream()>>>( + kernel::from_predicate<<get_stream()>>>( size, bits.get_data(), ranks.get_data(), device_predicate); components::prefix_sum_nonnegative(exec, ranks.get_data(), num_blocks); } @@ -126,7 +126,7 @@ struct bitvector_popcnt_functor { template gko::bitvector::value_type> -bitvector_from_sorted_indices( +from_sorted_indices( std::shared_ptr exec, IndexIterator it, typename std::iterator_traits::difference_type count, typename std::iterator_traits::value_type size) @@ -169,4 +169,4 @@ bitvector_from_sorted_indices( } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_KERNELS_HPP_ +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_ diff --git a/common/cuda_hip/components/bitvector_kernels.cpp b/common/cuda_hip/components/bitvector_kernels.cpp deleted file mode 100644 index 36178daacfa..00000000000 --- a/common/cuda_hip/components/bitvector_kernels.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/components/bitvector_kernels.hpp" - -#include -#include -#include -#include - -#include - -#include "common/cuda_hip/base/thrust.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -namespace bitvector { - - -template -void compute_bits_and_ranks( - std::shared_ptr exec, const IndexType* indices, - IndexType num_indices, IndexType size, - typename device_bitvector::storage_type* bits, IndexType* ranks) -{ - const auto policy = thrust_policy(exec); - using bv = device_bitvector; - using storage_type = typename bv::storage_type; - const auto num_blocks = ceildiv(size, bv::block_size); - thrust::fill_n(policy, bits, num_blocks, 0u); - thrust::for_each_n( - policy, indices, num_indices, [bits] __device__(IndexType idx) { - constexpr auto block_size = device_bitvector::block_size; - const auto block = idx / block_size; - const auto local = idx % block_size; - atomicOr(bits + block, storage_type{1} << local); - }); - const auto it = thrust::make_transform_iterator( - bits, [] __device__(storage_type word) -> IndexType { - return gko::detail::popcount(word); - }); - thrust::exclusive_scan(policy, it, it + num_blocks, ranks, IndexType{}); -} - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_BITVECTOR_COMPUTE_BITS_AND_RANKS_KERNEL); - - -} // namespace bitvector -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko diff --git a/common/unified/components/bitvector_kernels.generic.hpp b/common/unified/components/bitvector.generic.hpp similarity index 87% rename from common/unified/components/bitvector_kernels.generic.hpp rename to common/unified/components/bitvector.generic.hpp index f9a94d90e8e..c2831c8445d 100644 --- a/common/unified/components/bitvector_kernels.generic.hpp +++ b/common/unified/components/bitvector.generic.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_KERNELS_GENERIC_HPP_ -#define GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_KERNELS_GENERIC_HPP_ +#ifndef GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_GENERIC_HPP_ +#define GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_GENERIC_HPP_ #include "common/unified/base/kernel_launch.hpp" #include "core/components/bitvector.hpp" @@ -17,7 +17,7 @@ namespace bitvector { template -gko::bitvector bitvector_from_predicate( +gko::bitvector from_predicate( std::shared_ptr exec, IndexType size, DevicePredicate device_predicate) { @@ -39,7 +39,6 @@ gko::bitvector bitvector_from_predicate( mask |= bit << local_i; } } else { - int local_i = 0; for (int local_i = 0; base_i + local_i < size; local_i++) { const storage_type bit = device_predicate(base_i + local_i) ? 1 : 0; @@ -62,4 +61,4 @@ gko::bitvector bitvector_from_predicate( } // namespace gko -#endif // GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_KERNELS_GENERIC_HPP_ +#endif // GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_GENERIC_HPP_ diff --git a/common/unified/components/bitvector.hpp b/common/unified/components/bitvector.hpp new file mode 100644 index 00000000000..2467e768238 --- /dev/null +++ b/common/unified/components/bitvector.hpp @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_ +#define GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_ + + +#include "common/unified/base/kernel_launch.hpp" + + +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) +#include "common/cuda_hip/components/bitvector.hpp" +#elif defined(GKO_COMPILING_DPCPP) || defined(GKO_COMPILING_OMP) +#include "common/unified/components/bitvector.generic.hpp" +#ifdef GKO_COMPILING_OMP +#include "omp/components/bitvector.hpp" +#else +#include "dpcpp/components/bitvector.dp.hpp" +#endif +#else +#error "This file should only be used inside Ginkgo device compilation" +#endif + + +#endif // GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_ diff --git a/common/unified/components/bitvector_kernels.hpp b/common/unified/components/bitvector_kernels.hpp deleted file mode 100644 index 115b6eae7ba..00000000000 --- a/common/unified/components/bitvector_kernels.hpp +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_KERNELS_HPP_ -#define GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_KERNELS_HPP_ - - -#include "common/unified/base/kernel_launch.hpp" - - -#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) -#include "common/cuda_hip/components/bitvector_kernels.hpp" -#elif defined(GKO_COMPILING_DPCPP) || defined(GKO_COMPILING_OMP) -#include "common/unified/components/bitvector_kernels.generic.hpp" -#else -#error "This file should only be used inside Ginkgo device compilation" -#endif - - -#endif // GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_KERNELS_HPP_ diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index c05f7d39c09..815545e5f8b 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -43,7 +43,6 @@ target_sources( base/segmented_array.cpp base/timer.cpp base/version.cpp - components/bitvector.cpp components/range_minimum_query.cpp config/config.cpp config/config_helper.cpp diff --git a/core/components/bitvector.cpp b/core/components/bitvector.cpp deleted file mode 100644 index cc3b338bb79..00000000000 --- a/core/components/bitvector.cpp +++ /dev/null @@ -1,108 +0,0 @@ -// SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/components/bitvector.hpp" - -#include - -#include "core/components/bitvector_kernels.hpp" - - -namespace gko { -namespace { - - -GKO_REGISTER_OPERATION(compute_bits_and_ranks, - bitvector::compute_bits_and_ranks); - - -} // namespace - - -template -device_bitvector bitvector::device_view() const -{ - return device_bitvector{this->get_bits(), this->get_ranks(), - this->get_size()}; -} - - -template -std::shared_ptr bitvector::get_executor() const -{ - return bits_.get_executor(); -} - - -template -const typename bitvector::storage_type* -bitvector::get_bits() const -{ - return bits_.get_const_data(); -} - - -template -const IndexType* bitvector::get_ranks() const -{ - return ranks_.get_const_data(); -} - - -template -IndexType bitvector::get_size() const -{ - return size_; -} - - -template -IndexType bitvector::get_num_blocks() const -{ - return static_cast(ceildiv(this->get_size(), block_size)); -} - - -template -bitvector::bitvector(array bits, - array ranks, index_type size) - : size_{size}, bits_{std::move(bits)}, ranks_{std::move(ranks)} -{ - GKO_ASSERT(bits_.get_executor() == ranks_.get_executor()); - GKO_ASSERT(this->get_num_blocks() == bits_.get_size()); - GKO_ASSERT(this->get_num_blocks() == ranks_.get_size()); -} - - -template -bitvector::bitvector(std::shared_ptr exec, - index_type size) - : size_{size}, - bits_{exec, static_cast(this->get_num_blocks())}, - ranks_{exec, static_cast(this->get_num_blocks())} -{ - bits_.fill(storage_type{}); - ranks_.fill(0); -} - - -template -bitvector bitvector::from_sorted_indices( - const array& indices, index_type size) -{ - const auto exec = indices.get_executor(); - bitvector result{exec, size}; - exec->run(make_compute_bits_and_ranks( - indices.get_const_data(), static_cast(indices.get_size()), - size, result.bits_.get_data(), result.ranks_.get_data())); - return result; -} - - -#define GKO_DEFINE_BITVECTOR(IndexType) class bitvector - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DEFINE_BITVECTOR); - - -} // namespace gko diff --git a/core/components/bitvector.hpp b/core/components/bitvector.hpp index 9df4d0ef34a..7887dc434a8 100644 --- a/core/components/bitvector.hpp +++ b/core/components/bitvector.hpp @@ -20,18 +20,34 @@ class device_bitvector { using storage_type = uint32; constexpr static int block_size = CHAR_BIT * sizeof(storage_type); + /** + * Constructs a device_bitvector from its underlying data. + * + * @param bits the bitmask array + * @param rank the rank array, it must be the prefix sum over + * `popcount(bits[i])`. + * @param size the number of bits we consider part of this bitvector. + */ constexpr device_bitvector(const storage_type* bits, const index_type* ranks, index_type size) - : bits_{bits}, ranks_{ranks}, size_{size} + : size_{size}, bits_{bits}, ranks_{ranks} {} + /** Returns the number of bits stored in this bitvector. */ constexpr index_type size() const { return size_; } + /** Returns the number of words (of type storage_type) in this bitvector. */ constexpr index_type num_blocks() const { - return (size() + block_size - 1) / block_size; + return (this->size() + block_size - 1) / block_size; } + /** + * Returns whether the bit at the given index is set. + * + * @param i the index in range [0, size()) + * @return true if the bit is set, false otherwise. + */ constexpr bool get(index_type i) const { assert(i >= 0); @@ -41,6 +57,13 @@ class device_bitvector { return bool((bits_[block] >> local) & 1); } + /** + * Returns the rank of the given index. + * + * @param i the index in range [0, size()) + * @return the rank of the given index, i.e. the number of 1 bits set + * before the corresponding bit (exclusive). + */ constexpr index_type rank(index_type i) const { assert(i >= 0); @@ -51,10 +74,28 @@ class device_bitvector { return ranks_[block] + detail::popcount(prefix_mask & bits_[block]); } + /** + * Returns the inclusive rank of the given index. + * + * @param i the index in range [0, size()) + * @return the rank of the given index, i.e. the number of 1 bits set + * up to and including the corresponding bit (inclusive). + */ + constexpr index_type rank_inclusive(index_type i) const + { + assert(i >= 0); + assert(i < size()); + const auto block = i / block_size; + const auto local = i % block_size; + const auto mask = storage_type{1} << local; + const auto prefix_mask = mask - 1 | mask; + return ranks_[block] + detail::popcount(prefix_mask & bits_[block]); + } + private: - const index_type* ranks_; - const storage_type* bits_; index_type size_; + const storage_type* bits_; + const index_type* ranks_; }; @@ -68,30 +109,46 @@ template class bitvector { public: using index_type = IndexType; - using storage_type = uint32; + using view_type = device_bitvector; + using storage_type = typename view_type::storage_type; constexpr static int block_size = CHAR_BIT * sizeof(storage_type); - device_bitvector device_view() const; + static index_type get_num_blocks(index_type size) + { + return (size + block_size - 1) / block_size; + } + + view_type device_view() const + { + return view_type{this->get_bits(), this->get_ranks(), this->get_size()}; + } static bitvector from_sorted_indices(const array& indices, index_type size); - std::shared_ptr get_executor() const; + std::shared_ptr get_executor() const + { + return bits_.get_executor(); + } - const storage_type* get_bits() const; + const storage_type* get_bits() const { return bits_.get_const_data(); } - const index_type* get_ranks() const; + const index_type* get_ranks() const { return ranks_.get_const_data(); } - index_type get_size() const; + index_type get_size() const { return size_; } - index_type get_num_blocks() const; + index_type get_num_blocks() const { return get_num_blocks(get_size()); } bitvector(array bits, array ranks, - index_type size); + index_type size) + : size_{size}, bits_{std::move(bits)}, ranks_{std::move(ranks)} + { + GKO_ASSERT(bits_.get_executor() == ranks_.get_executor()); + GKO_ASSERT(this->get_num_blocks() == bits_.get_size()); + GKO_ASSERT(this->get_num_blocks() == ranks_.get_size()); + } private: - bitvector(std::shared_ptr exec, index_type size); - index_type size_; array bits_; array ranks_; diff --git a/core/components/bitvector_kernels.hpp b/core/components/bitvector_kernels.hpp deleted file mode 100644 index 5cbdf9cd6ad..00000000000 --- a/core/components/bitvector_kernels.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CORE_COMPONENTS_BITVECTOR_KERNELS_HPP_ -#define GKO_CORE_COMPONENTS_BITVECTOR_KERNELS_HPP_ - - -#include "core/components/bitvector.hpp" - -#include - -#include -#include - -#include "core/base/kernel_declaration.hpp" - - -namespace gko { -namespace kernels { - - -#define GKO_DECLARE_BITVECTOR_COMPUTE_BITS_AND_RANKS_KERNEL(IndexType) \ - void compute_bits_and_ranks( \ - std::shared_ptr exec, const IndexType* indices, \ - IndexType num_indices, IndexType size, \ - typename device_bitvector::storage_type* bits, \ - IndexType* ranks) - - -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_BITVECTOR_COMPUTE_BITS_AND_RANKS_KERNEL(IndexType) - - -GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(bitvector, - GKO_DECLARE_ALL_AS_TEMPLATES); - - -#undef GKO_DECLARE_ALL_AS_TEMPLATES - - -} // namespace kernels -} // namespace gko - -#endif // GKO_CORE_COMPONENTS_BITVECTOR_KERNELS_HPP_ diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index d6d1c1d1b4b..1abe27e9558 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -13,7 +13,6 @@ #include "core/base/index_set_kernels.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/absolute_array_kernels.hpp" -#include "core/components/bitvector_kernels.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" #include "core/components/precision_conversion_kernels.hpp" @@ -291,15 +290,6 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES); } // namespace components -namespace bitvector { - - -GKO_STUB_INDEX_TYPE(GKO_DECLARE_BITVECTOR_COMPUTE_BITS_AND_RANKS_KERNEL); - - -} // namespace bitvector - - namespace range_minimum_query { diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index acb1b4c804d..b8b4be789d1 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -20,7 +20,6 @@ target_sources( base/scoped_device_id.dp.cpp base/timer.dp.cpp base/version.dp.cpp - components/bitvector_kernels.dp.cpp components/prefix_sum_kernels.dp.cpp distributed/assembly_kernels.dp.cpp distributed/index_map_kernels.dp.cpp diff --git a/dpcpp/components/bitvector.dp.hpp b/dpcpp/components/bitvector.dp.hpp new file mode 100644 index 00000000000..670806da495 --- /dev/null +++ b/dpcpp/components/bitvector.dp.hpp @@ -0,0 +1,72 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_DPCPP_COMPONENTS_BITVECTOR_DP_HPP_ +#define GKO_DPCPP_COMPONENTS_BITVECTOR_DP_HPP_ + + +#include + +#include + +#include "core/components/bitvector.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace bitvector { + + +template +gko::bitvector::value_type> +from_sorted_indices( + std::shared_ptr exec, IndexIterator it, + typename std::iterator_traits::difference_type count, + typename std::iterator_traits::value_type size) +{ + using index_type = typename std::iterator_traits::value_type; + using storage_type = typename device_bitvector::storage_type; + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bit_array{exec, num_blocks}; + array rank_array{exec, num_blocks}; + components::fill_array(exec, bit_array.get_data(), num_blocks, + storage_type{}); + const auto bits = bit_array.get_data(); + const auto ranks = rank_array.get_data(); + const auto queue = exec->get_queue(); + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(count, [=](sycl::id<1> i) { + auto value = it[i]; + const auto block = value / block_size; + const auto local = value % block_size; + sycl::atomic_ref + atomic(bits[block]); + atomic.fetch_or(storage_type{1} << local); + }); + }); + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_blocks, [=](sycl::id<1> i) { + ranks[i] = gko::detail::popcount(bits[i]); + }); + }); + components::prefix_sum_nonnegative(exec, ranks, num_blocks); + + return gko::bitvector{std::move(bit_array), + std::move(rank_array), size}; +} + + +} // namespace bitvector +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_COMPONENTS_BITVECTOR_DP_HPP_ diff --git a/dpcpp/components/bitvector_kernels.dp.cpp b/dpcpp/components/bitvector_kernels.dp.cpp deleted file mode 100644 index b71f475ad43..00000000000 --- a/dpcpp/components/bitvector_kernels.dp.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/components/bitvector_kernels.hpp" - -#include - - -namespace gko { -namespace kernels { -namespace dpcpp { -namespace bitvector { - - -template -void compute_bits_and_ranks( - std::shared_ptr exec, const IndexType* indices, - IndexType num_indices, IndexType size, - typename device_bitvector::storage_type* bits, - IndexType* ranks) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_BITVECTOR_COMPUTE_BITS_AND_RANKS_KERNEL); - - -} // namespace bitvector -} // namespace dpcpp -} // namespace kernels -} // namespace gko diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index e74f943a36d..6e3d0ec2b49 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -14,7 +14,6 @@ target_sources( base/index_set_kernels.cpp base/scoped_device_id.cpp base/version.cpp - components/bitvector_kernels.cpp components/prefix_sum_kernels.cpp distributed/assembly_kernels.cpp distributed/index_map_kernels.cpp diff --git a/omp/components/bitvector.hpp b/omp/components/bitvector.hpp new file mode 100644 index 00000000000..146aee992bd --- /dev/null +++ b/omp/components/bitvector.hpp @@ -0,0 +1,85 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include + +#include "core/base/index_range.hpp" +#include "core/components/bitvector.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +namespace bitvector { + + +template +gko::bitvector::value_type> +from_sorted_indices( + std::shared_ptr exec, IndexIterator it, + typename std::iterator_traits::difference_type count, + typename std::iterator_traits::value_type size) +{ + using index_type = typename std::iterator_traits::value_type; + using bv = device_bitvector; + using storage_type = typename bv::storage_type; + constexpr auto block_size = bv::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bit_array{exec, num_blocks}; + array rank_array{exec, num_blocks}; + const auto bits = bit_array.get_data(); + const auto ranks = rank_array.get_data(); + components::fill_array(exec, bits, num_blocks, storage_type{}); + const auto num_threads = omp_get_max_threads(); + const auto work_per_thread = ceildiv(count, num_threads); + assert(std::is_sorted(it, it + count)); +#pragma omp parallel num_threads(num_threads) + { + const auto tid = omp_get_thread_num(); + const auto begin = std::min(tid * work_per_thread, count); + const auto end = std::min(begin + work_per_thread, count); + if (begin < end) { + const auto first_block = it[begin] / block_size; + const auto last_block = it[end - 1] / block_size; + storage_type word{}; + auto block = first_block; + for (auto i : irange{begin, end}) { + const auto value = it[i]; + const auto new_block = value / block_size; + const auto local = value % block_size; + if (new_block != block) { + assert(new_block > block); + if (block == first_block) { +#pragma omp atomic + bits[block] |= word; + } else { + bits[block] = word; + } + word = 0; + block = new_block; + } + word |= storage_type{1} << local; + } +#pragma omp atomic + bits[last_block] |= word; + } + } +#pragma omp parallel for + for (size_type i = 0; i < num_blocks; i++) { + ranks[i] = gko::detail::popcount(bits[i]); + } + components::prefix_sum_nonnegative(exec, ranks, num_blocks); + return gko::bitvector{std::move(bit_array), + std::move(rank_array), size}; +} + + +} // namespace bitvector +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/omp/components/bitvector_kernels.cpp b/omp/components/bitvector_kernels.cpp deleted file mode 100644 index 7c2db95ab71..00000000000 --- a/omp/components/bitvector_kernels.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/components/bitvector_kernels.hpp" - -#include - -#include "core/base/index_range.hpp" -#include "core/components/prefix_sum_kernels.hpp" - - -namespace gko { -namespace kernels { -namespace omp { -namespace bitvector { - - -template -void compute_bits_and_ranks( - std::shared_ptr exec, const IndexType* indices, - IndexType num_indices, IndexType size, - typename device_bitvector::storage_type* bits, IndexType* ranks) -{ - using bv = device_bitvector; - using storage_type = typename bv::storage_type; - const auto num_blocks = ceildiv(size, bv::block_size); -#pragma omp parallel for - for (IndexType i = 0; i < num_blocks; i++) { - bits[i] = 0; - } -#pragma omp parallel for - for (IndexType i = 0; i < num_indices; i++) { - const auto index = indices[i]; - const auto block_idx = index / bv::block_size; - const auto mask = storage_type{1} << index % bv::block_size; -#pragma omp atomic - bits[block_idx] |= mask; - } -#pragma omp parallel for - for (IndexType i = 0; i < num_blocks; i++) { - ranks[i] = gko::detail::popcount(bits[i]); - } - components::prefix_sum_nonnegative(exec, ranks, num_blocks); -} - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_BITVECTOR_COMPUTE_BITS_AND_RANKS_KERNEL); - - -} // namespace bitvector -} // namespace omp -} // namespace kernels -} // namespace gko diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index ab31c1942d9..87858d18812 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -8,7 +8,6 @@ target_sources( base/scoped_device_id.cpp base/version.cpp components/absolute_array_kernels.cpp - components/bitvector_kernels.cpp components/fill_array_kernels.cpp components/format_conversion_kernels.cpp components/range_minimum_query_kernels.cpp diff --git a/reference/components/bitvector.hpp b/reference/components/bitvector.hpp new file mode 100644 index 00000000000..83675a2edc3 --- /dev/null +++ b/reference/components/bitvector.hpp @@ -0,0 +1,83 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_REFERENCE_COMPONENTS_BITVECTOR_HPP_ +#define GKO_REFERENCE_COMPONENTS_BITVECTOR_HPP_ + +#include "core/base/index_range.hpp" +#include "core/components/bitvector.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +namespace bitvector { + + +template +gko::bitvector from_predicate( + std::shared_ptr exec, IndexType size, + DevicePredicate device_predicate) +{ + using storage_type = typename device_bitvector::storage_type; + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bits{exec, num_blocks}; + array ranks{exec, num_blocks}; + std::fill_n(bits.get_data(), num_blocks, 0); + std::fill_n(ranks.get_data(), num_blocks, 0); + for (auto i : irange{size}) { + if (device_predicate(i)) { + bits.get_data()[i / block_size] |= storage_type{1} + << (i % block_size); + ranks.get_data()[i / block_size]++; + } + } + components::prefix_sum_nonnegative(exec, ranks.get_data(), num_blocks); + + return gko::bitvector{std::move(bits), std::move(ranks), size}; +} + + +template +gko::bitvector::value_type> +from_sorted_indices( + std::shared_ptr exec, IndexIterator it, + typename std::iterator_traits::difference_type count, + typename std::iterator_traits::value_type size) +{ + using index_type = typename std::iterator_traits::value_type; + using storage_type = typename device_bitvector::storage_type; + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = ceildiv(size, block_size); + array bits{exec, static_cast(num_blocks)}; + array ranks{exec, static_cast(num_blocks)}; + std::fill_n(bits.get_data(), num_blocks, 0); + assert(std::is_sorted(it, it + count)); + for (auto i : irange{count}) { + const auto value = it[i]; + const auto block = value / block_size; + const auto local = value % block_size; + const auto mask = storage_type{1} << local; + assert((bits.get_data()[block] & mask) == 0); + bits.get_data()[block] |= mask; + } + index_type rank{}; + for (auto i : irange{num_blocks}) { + ranks.get_data()[i] = rank; + rank += gko::detail::popcount(bits.get_const_data()[i]); + } + + return gko::bitvector{std::move(bits), std::move(ranks), size}; +} + + +} // namespace bitvector +} // namespace reference +} // namespace kernels +} // namespace gko + + +#endif // GKO_REFERENCE_COMPONENTS_BITVECTOR_HPP_ diff --git a/reference/components/bitvector_kernels.hpp b/reference/components/bitvector_kernels.hpp deleted file mode 100644 index a17686311f1..00000000000 --- a/reference/components/bitvector_kernels.hpp +++ /dev/null @@ -1,51 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_REFERENCE_COMPONENTS_BITVECTOR_KERNELS_GENERIC_HPP_ -#define GKO_REFERENCE_COMPONENTS_BITVECTOR_KERNELS_GENERIC_HPP_ - -#include "core/components/bitvector.hpp" - -#include "core/base/index_range.hpp" -#include "core/components/prefix_sum_kernels.hpp" - - -namespace gko { -namespace kernels { -namespace reference { -namespace bitvector { - - -template -gko::bitvector bitvector_from_predicate( - std::shared_ptr exec, IndexType size, - DevicePredicate device_predicate) -{ - using storage_type = typename device_bitvector::storage_type; - constexpr auto block_size = device_bitvector::block_size; - const auto num_blocks = static_cast(ceildiv(size, block_size)); - array bits{exec, num_blocks}; - array ranks{exec, num_blocks}; - std::fill_n(bits.get_data(), num_blocks, 0); - std::fill_n(ranks.get_data(), num_blocks, 0); - for (auto i : irange{size}) { - if (device_predicate(i)) { - bits.get_data()[i / block_size] |= storage_type{1} - << (i % block_size); - ranks.get_data()[i / block_size]++; - } - } - components::prefix_sum_nonnegative(exec, ranks.get_data(), num_blocks); - - return gko::bitvector{std::move(bits), std::move(ranks), size}; -} - - -} // namespace bitvector -} // namespace reference -} // namespace kernels -} // namespace gko - - -#endif // GKO_REFERENCE_COMPONENTS_BITVECTOR_KERNELS_GENERIC_HPP_ diff --git a/reference/test/components/CMakeLists.txt b/reference/test/components/CMakeLists.txt index a4e36a3439a..47998d184df 100644 --- a/reference/test/components/CMakeLists.txt +++ b/reference/test/components/CMakeLists.txt @@ -1,5 +1,5 @@ ginkgo_create_test(absolute_array_kernels) -ginkgo_create_test(bitvector_kernels) +ginkgo_create_test(bitvector) ginkgo_create_test(fill_array_kernels) ginkgo_create_test(format_conversion_kernels) ginkgo_create_test(precision_conversion_kernels) diff --git a/reference/test/components/bitvector_kernels.cpp b/reference/test/components/bitvector.cpp similarity index 75% rename from reference/test/components/bitvector_kernels.cpp rename to reference/test/components/bitvector.cpp index d8140188bee..4176fe2ba4c 100644 --- a/reference/test/components/bitvector_kernels.cpp +++ b/reference/test/components/bitvector.cpp @@ -11,8 +11,9 @@ #include #include "core/base/index_range.hpp" -#include "core/components/bitvector_kernels.hpp" +#include "core/components/bitvector.hpp" #include "core/test/utils.hpp" +#include "reference/components/bitvector.hpp" template @@ -68,34 +69,33 @@ TYPED_TEST(Bitvector, ComputeBitsAndRanks) SCOPED_TRACE(num_values); auto values = this->create_random_values(num_values, size); num_values = values.size(); - const auto num_blocks = (size + block_size - 1) / block_size; - std::vector bits(num_blocks, ~storage_type{}); - std::vector ranks(num_blocks, -1); + auto num_blocks = gko::ceildiv(size, block_size); - gko::kernels::reference::bitvector::compute_bits_and_ranks( - this->ref, values.data(), num_values, size, bits.data(), - ranks.data()); + auto bv = gko::kernels::reference::bitvector::from_sorted_indices( + this->ref, values.data(), num_values, size); + auto dbv = bv.device_view(); // check bits and ranks are correct - gko::device_bitvector bv(bits.data(), ranks.data(), - size); - ASSERT_EQ(bv.size(), size); - ASSERT_EQ(bv.num_blocks(), num_blocks); + ASSERT_EQ(bv.get_size(), size); + ASSERT_EQ(dbv.size(), size); + ASSERT_EQ(bv.get_num_blocks(), num_blocks); + ASSERT_EQ(dbv.num_blocks(), num_blocks); auto it = values.begin(); index_type rank{}; for (auto i : gko::irange{size}) { const auto block = i / block_size; const auto local = i % block_size; - ASSERT_EQ(bv.rank(i), rank); + ASSERT_EQ(dbv.rank(i), rank); if (it != values.end() && *it == i) { - ASSERT_TRUE(bool(bits[block] & (storage_type{1} << local))); - ASSERT_TRUE(bv.get(i)); + ASSERT_TRUE(bool(bv.get_bits()[block] & + (storage_type{1} << local))); + ASSERT_TRUE(dbv.get(i)); ++rank; ++it; } else { - ASSERT_FALSE( - bool(bits[block] & (storage_type{1} << local))); - ASSERT_FALSE(bv.get(i)); + ASSERT_FALSE(bool(bv.get_bits()[block] & + (storage_type{1} << local))); + ASSERT_FALSE(dbv.get(i)); } } } diff --git a/test/components/CMakeLists.txt b/test/components/CMakeLists.txt index bcc300d5d99..2793f3af575 100644 --- a/test/components/CMakeLists.txt +++ b/test/components/CMakeLists.txt @@ -1,5 +1,5 @@ ginkgo_create_common_test(absolute_array_kernels) -ginkgo_create_common_device_test(bitvector_kernels DISABLE_EXECUTORS dpcpp) +ginkgo_create_common_device_test(bitvector) ginkgo_create_common_test(fill_array_kernels) ginkgo_create_common_test(format_conversion_kernels) ginkgo_create_common_test(precision_conversion_kernels) diff --git a/test/components/bitvector_kernels.cpp b/test/components/bitvector.cpp similarity index 80% rename from test/components/bitvector_kernels.cpp rename to test/components/bitvector.cpp index f7920daced3..f061f563aa8 100644 --- a/test/components/bitvector_kernels.cpp +++ b/test/components/bitvector.cpp @@ -2,12 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/components/bitvector_kernels.hpp" +/*@GKO_PREPROCESSOR_FILENAME_HELPER@*/ + +#include "core/components/bitvector.hpp" -#include #include #include -#include #include #include @@ -16,10 +16,10 @@ #include #include "common/unified/base/kernel_launch.hpp" -#include "common/unified/components/bitvector_kernels.hpp" +#include "common/unified/components/bitvector.hpp" #include "core/base/index_range.hpp" #include "core/test/utils.hpp" -#include "reference/components/bitvector_kernels.hpp" +#include "reference/components/bitvector.hpp" #include "test/utils/common_fixture.hpp" @@ -84,35 +84,6 @@ class Bitvector : public CommonTestFixture { TYPED_TEST_SUITE(Bitvector, gko::test::IndexTypes, TypenameNameGenerator); -#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) - - -TYPED_TEST(Bitvector, BuildFromIndicesDeviceIsEquivalentToRef) -{ - using index_type = typename TestFixture::index_type; - using bitvector = typename TestFixture::bitvector; - for (auto size : this->sizes) { - SCOPED_TRACE(size); - for (auto num_values : - {index_type{}, size / 10, size / 4, size / 2, size}) { - SCOPED_TRACE(num_values); - auto values = this->create_random_values(num_values, size); - gko::array dvalues{this->exec, values}; - - auto bv = bitvector::from_sorted_indices(values, size); - auto dbv = gko::kernels::GKO_DEVICE_NAMESPACE::bitvector:: - bitvector_from_sorted_indices(this->exec, dvalues.get_data(), - dvalues.get_size(), size); - - this->assert_bitvector_equal(bv, dbv); - } - } -} - - -#endif - - TYPED_TEST(Bitvector, BuildFromIndicesIsEquivalentToRef) { using index_type = typename TestFixture::index_type; @@ -125,8 +96,11 @@ TYPED_TEST(Bitvector, BuildFromIndicesIsEquivalentToRef) auto values = this->create_random_values(num_values, size); gko::array dvalues{this->exec, values}; - auto bv = bitvector::from_sorted_indices(values, size); - auto dbv = bitvector::from_sorted_indices(dvalues, size); + auto bv = gko::kernels::reference::bitvector::from_sorted_indices( + this->ref, values.get_data(), values.get_size(), size); + auto dbv = gko::kernels::GKO_DEVICE_NAMESPACE::bitvector:: + from_sorted_indices(this->exec, dvalues.get_data(), + dvalues.get_size(), size); this->assert_bitvector_equal(bv, dbv); } @@ -140,9 +114,9 @@ std::pair, gko::bitvector> run_predicate( std::shared_ptr exec, IndexType size, int stride) { return std::make_pair( - gko::kernels::reference::bitvector::bitvector_from_predicate( + gko::kernels::reference::bitvector::from_predicate( ref, size, [stride](int i) { return i % stride == 0; }), - gko::kernels::GKO_DEVICE_NAMESPACE::bitvector::bitvector_from_predicate( + gko::kernels::GKO_DEVICE_NAMESPACE::bitvector::from_predicate( exec, size, [stride] GKO_KERNEL(int i) { return i % stride == 0; })); } @@ -165,7 +139,9 @@ TYPED_TEST(Bitvector, BuildFromPredicateIsEquivalentToFromIndices) auto [bv, dbv] = run_predicate(this->ref, this->exec, size, stride); - auto ref_bv = bitvector::from_sorted_indices(values, size); + auto ref_bv = + gko::kernels::reference::bitvector::from_sorted_indices( + this->ref, values.get_data(), values.get_size(), size); this->assert_bitvector_equal(bv, dbv); this->assert_bitvector_equal(ref_bv, dbv); } @@ -174,7 +150,6 @@ TYPED_TEST(Bitvector, BuildFromPredicateIsEquivalentToFromIndices) // nvcc doesn't like device lambdas inside class member functions - template void run_device(std::shared_ptr exec, const gko::device_bitvector bv, @@ -212,10 +187,11 @@ TYPED_TEST(Bitvector, AccessIsEquivalentToRef) num_values = values.get_size(); gko::array dvalues{this->exec, values}; - auto bv = - gko::bitvector::from_sorted_indices(values, size); - auto dbv = - gko::bitvector::from_sorted_indices(dvalues, size); + auto bv = gko::kernels::reference::bitvector::from_sorted_indices( + this->ref, values.get_const_data(), values.get_size(), size); + auto dbv = gko::kernels::GKO_DEVICE_NAMESPACE::bitvector:: + from_sorted_indices(this->exec, dvalues.get_const_data(), + dvalues.get_size(), size); const auto usize = static_cast(size); gko::array output_bools{this->ref, usize}; From 26cde3a32889473ca964f894901b886010072bf8 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 20 Apr 2025 23:01:21 +0200 Subject: [PATCH 05/15] fix HIP execution Related to a Thrust bug https://github.com/ROCm/rocThrust/issues/352# --- common/cuda_hip/components/bitvector.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common/cuda_hip/components/bitvector.hpp b/common/cuda_hip/components/bitvector.hpp index f1a04084408..585f0cbaf4f 100644 --- a/common/cuda_hip/components/bitvector.hpp +++ b/common/cuda_hip/components/bitvector.hpp @@ -93,8 +93,11 @@ struct bitvector_bit_functor { __device__ storage_type operator()(storage_type a, storage_type b) { + // https://github.com/ROCm/rocThrust/issues/352 +#ifndef GKO_COMPILING_HIP // there must not be any duplicate indices assert(a ^ b == 0); +#endif return a | b; } }; From f934c25e9c1e5159e245da0d14f0700ec88c5d1d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 20 Apr 2025 23:03:02 +0200 Subject: [PATCH 06/15] add native support for OpenMP and SYCL --- .../unified/components/bitvector.generic.hpp | 64 ------------------- common/unified/components/bitvector.hpp | 7 +- dpcpp/components/bitvector.dp.hpp | 41 ++++++++++++ omp/components/bitvector.hpp | 39 +++++++++++ 4 files changed, 82 insertions(+), 69 deletions(-) delete mode 100644 common/unified/components/bitvector.generic.hpp diff --git a/common/unified/components/bitvector.generic.hpp b/common/unified/components/bitvector.generic.hpp deleted file mode 100644 index c2831c8445d..00000000000 --- a/common/unified/components/bitvector.generic.hpp +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_GENERIC_HPP_ -#define GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_GENERIC_HPP_ - -#include "common/unified/base/kernel_launch.hpp" -#include "core/components/bitvector.hpp" -#include "core/components/prefix_sum_kernels.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -namespace bitvector { - - -template -gko::bitvector from_predicate( - std::shared_ptr exec, IndexType size, - DevicePredicate device_predicate) -{ - using storage_type = typename device_bitvector::storage_type; - constexpr auto block_size = device_bitvector::block_size; - const auto num_blocks = static_cast(ceildiv(size, block_size)); - array bits{exec, num_blocks}; - array ranks{exec, num_blocks}; - run_kernel( - exec, - [] GKO_KERNEL(auto block_i, auto size, auto device_predicate, auto bits, - auto ranks) { - const auto base_i = block_i * block_size; - storage_type mask{}; - if (base_i + block_size <= size) { - for (int local_i = 0; local_i < block_size; local_i++) { - const storage_type bit = - device_predicate(base_i + local_i) ? 1 : 0; - mask |= bit << local_i; - } - } else { - for (int local_i = 0; base_i + local_i < size; local_i++) { - const storage_type bit = - device_predicate(base_i + local_i) ? 1 : 0; - mask |= bit << local_i; - } - } - bits[block_i] = mask; - ranks[block_i] = gko::detail::popcount(mask); - }, - num_blocks, size, device_predicate, bits, ranks); - components::prefix_sum_nonnegative(exec, ranks.get_data(), num_blocks); - - return gko::bitvector{std::move(bits), std::move(ranks), size}; -} - - -} // namespace bitvector -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko - - -#endif // GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_GENERIC_HPP_ diff --git a/common/unified/components/bitvector.hpp b/common/unified/components/bitvector.hpp index 2467e768238..ce289c44dce 100644 --- a/common/unified/components/bitvector.hpp +++ b/common/unified/components/bitvector.hpp @@ -11,13 +11,10 @@ #if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) #include "common/cuda_hip/components/bitvector.hpp" -#elif defined(GKO_COMPILING_DPCPP) || defined(GKO_COMPILING_OMP) -#include "common/unified/components/bitvector.generic.hpp" -#ifdef GKO_COMPILING_OMP +#elif defined(GKO_COMPILING_OMP) #include "omp/components/bitvector.hpp" -#else +#elif defined(GKO_COMPILING_DPCPP) #include "dpcpp/components/bitvector.dp.hpp" -#endif #else #error "This file should only be used inside Ginkgo device compilation" #endif diff --git a/dpcpp/components/bitvector.dp.hpp b/dpcpp/components/bitvector.dp.hpp index 670806da495..1acc1ec2aeb 100644 --- a/dpcpp/components/bitvector.dp.hpp +++ b/dpcpp/components/bitvector.dp.hpp @@ -21,6 +21,47 @@ namespace GKO_DEVICE_NAMESPACE { namespace bitvector { +template +gko::bitvector from_predicate( + std::shared_ptr exec, IndexType size, + DevicePredicate device_predicate) +{ + using storage_type = typename device_bitvector::storage_type; + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bit_array{exec, num_blocks}; + array rank_array{exec, num_blocks}; + const auto bits = bit_array.get_data(); + const auto ranks = rank_array.get_data(); + const auto queue = exec->get_queue(); + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_blocks, [=](sycl::id<1> block_i) { + const auto base_i = static_cast(block_i) * block_size; + storage_type mask{}; + if (base_i + block_size <= size) { + for (int local_i = 0; local_i < block_size; local_i++) { + const storage_type bit = + device_predicate(base_i + local_i) ? 1 : 0; + mask |= bit << local_i; + } + } else { + for (int local_i = 0; base_i + local_i < size; local_i++) { + const storage_type bit = + device_predicate(base_i + local_i) ? 1 : 0; + mask |= bit << local_i; + } + } + bits[block_i] = mask; + ranks[block_i] = gko::detail::popcount(mask); + }); + }); + components::prefix_sum_nonnegative(exec, ranks, num_blocks); + + return gko::bitvector{std::move(bit_array), + std::move(rank_array), size}; +} + + template gko::bitvector::value_type> from_sorted_indices( diff --git a/omp/components/bitvector.hpp b/omp/components/bitvector.hpp index 146aee992bd..93e8fd6c1d4 100644 --- a/omp/components/bitvector.hpp +++ b/omp/components/bitvector.hpp @@ -18,6 +18,45 @@ namespace omp { namespace bitvector { +template +gko::bitvector from_predicate( + std::shared_ptr exec, IndexType size, + DevicePredicate device_predicate) +{ + using storage_type = typename device_bitvector::storage_type; + constexpr auto block_size = device_bitvector::block_size; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bit_array{exec, num_blocks}; + array rank_array{exec, num_blocks}; + const auto bits = bit_array.get_data(); + const auto ranks = rank_array.get_data(); +#pragma omp parallel for + for (IndexType block_i = 0; block_i < num_blocks; block_i++) { + const auto base_i = block_i * block_size; + storage_type mask{}; + if (base_i + block_size <= size) { + for (int local_i = 0; local_i < block_size; local_i++) { + const storage_type bit = + device_predicate(base_i + local_i) ? 1 : 0; + mask |= bit << local_i; + } + } else { + for (int local_i = 0; base_i + local_i < size; local_i++) { + const storage_type bit = + device_predicate(base_i + local_i) ? 1 : 0; + mask |= bit << local_i; + } + } + bits[block_i] = mask; + ranks[block_i] = gko::detail::popcount(mask); + } + components::prefix_sum_nonnegative(exec, ranks, num_blocks); + + return gko::bitvector{std::move(bit_array), + std::move(rank_array), size}; +} + + template gko::bitvector::value_type> from_sorted_indices( From a9daa2e91b71ad422de12b68cfa669d1a7c21a67 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 22 Apr 2025 13:12:25 +0200 Subject: [PATCH 07/15] review update Co-authored-by: Marcel Koch --- common/cuda_hip/components/bitvector.hpp | 2 +- core/components/bitvector.hpp | 25 +++++++++++++++--------- dpcpp/components/bitvector.dp.hpp | 19 +++++++++--------- omp/components/bitvector.hpp | 12 ++++++------ reference/components/bitvector.hpp | 5 ++--- reference/test/components/bitvector.cpp | 12 +++++------- test/components/bitvector.cpp | 17 +++++++++------- 7 files changed, 50 insertions(+), 42 deletions(-) diff --git a/common/cuda_hip/components/bitvector.hpp b/common/cuda_hip/components/bitvector.hpp index 585f0cbaf4f..aa26d4cee28 100644 --- a/common/cuda_hip/components/bitvector.hpp +++ b/common/cuda_hip/components/bitvector.hpp @@ -88,7 +88,7 @@ struct bitvector_bit_functor { constexpr static auto block_size = device_bitvector::block_size; __device__ storage_type operator()(IndexType i) { - return storage_type{1} << (i % block_size); + return device_bitvector::get_block_and_mask(i).second; } __device__ storage_type operator()(storage_type a, storage_type b) diff --git a/core/components/bitvector.hpp b/core/components/bitvector.hpp index 7887dc434a8..efc6ffc7570 100644 --- a/core/components/bitvector.hpp +++ b/core/components/bitvector.hpp @@ -20,6 +20,19 @@ class device_bitvector { using storage_type = uint32; constexpr static int block_size = CHAR_BIT * sizeof(storage_type); + /** + * Returns the block index and bitmask belonging to a specific bit index. + * + * @param i the bit index + * @returns a pair consisting of the block index and bitmask for this bit. + */ + constexpr static std::pair get_block_and_mask( + index_type i) + { + return std::make_pair(i / block_size, + storage_type{1} << (i % block_size)); + } + /** * Constructs a device_bitvector from its underlying data. * @@ -68,9 +81,8 @@ class device_bitvector { { assert(i >= 0); assert(i < size()); - const auto block = i / block_size; - const auto local = i % block_size; - const auto prefix_mask = (storage_type{1} << local) - 1; + const auto [block, mask] = get_block_and_mask(i); + const auto prefix_mask = mask - 1; return ranks_[block] + detail::popcount(prefix_mask & bits_[block]); } @@ -85,9 +97,7 @@ class device_bitvector { { assert(i >= 0); assert(i < size()); - const auto block = i / block_size; - const auto local = i % block_size; - const auto mask = storage_type{1} << local; + const auto [block, mask] = get_block_and_mask(i); const auto prefix_mask = mask - 1 | mask; return ranks_[block] + detail::popcount(prefix_mask & bits_[block]); } @@ -123,9 +133,6 @@ class bitvector { return view_type{this->get_bits(), this->get_ranks(), this->get_size()}; } - static bitvector from_sorted_indices(const array& indices, - index_type size); - std::shared_ptr get_executor() const { return bits_.get_executor(); diff --git a/dpcpp/components/bitvector.dp.hpp b/dpcpp/components/bitvector.dp.hpp index 1acc1ec2aeb..962d99129aa 100644 --- a/dpcpp/components/bitvector.dp.hpp +++ b/dpcpp/components/bitvector.dp.hpp @@ -38,17 +38,18 @@ gko::bitvector from_predicate( cgh.parallel_for(num_blocks, [=](sycl::id<1> block_i) { const auto base_i = static_cast(block_i) * block_size; storage_type mask{}; + const auto local_op = [&](int local_i) { + const storage_type bit = + device_predicate(base_i + local_i) ? 1 : 0; + mask |= bit << local_i; + }; if (base_i + block_size <= size) { for (int local_i = 0; local_i < block_size; local_i++) { - const storage_type bit = - device_predicate(base_i + local_i) ? 1 : 0; - mask |= bit << local_i; + local_op(local_i); } } else { for (int local_i = 0; base_i + local_i < size; local_i++) { - const storage_type bit = - device_predicate(base_i + local_i) ? 1 : 0; - mask |= bit << local_i; + local_op(local_i); } } bits[block_i] = mask; @@ -83,13 +84,13 @@ from_sorted_indices( queue->submit([&](sycl::handler& cgh) { cgh.parallel_for(count, [=](sycl::id<1> i) { auto value = it[i]; - const auto block = value / block_size; - const auto local = value % block_size; + const auto [block, mask] = + device_bitvector::get_block_and_mask(value); sycl::atomic_ref atomic(bits[block]); - atomic.fetch_or(storage_type{1} << local); + atomic.fetch_or(mask); }); }); queue->submit([&](sycl::handler& cgh) { diff --git a/omp/components/bitvector.hpp b/omp/components/bitvector.hpp index 93e8fd6c1d4..7a3d8bc3a9b 100644 --- a/omp/components/bitvector.hpp +++ b/omp/components/bitvector.hpp @@ -34,17 +34,17 @@ gko::bitvector from_predicate( for (IndexType block_i = 0; block_i < num_blocks; block_i++) { const auto base_i = block_i * block_size; storage_type mask{}; + const auto local_op = [&](int local_i) { + const storage_type bit = device_predicate(base_i + local_i) ? 1 : 0; + mask |= bit << local_i; + }; if (base_i + block_size <= size) { for (int local_i = 0; local_i < block_size; local_i++) { - const storage_type bit = - device_predicate(base_i + local_i) ? 1 : 0; - mask |= bit << local_i; + local_op(local_i); } } else { for (int local_i = 0; base_i + local_i < size; local_i++) { - const storage_type bit = - device_predicate(base_i + local_i) ? 1 : 0; - mask |= bit << local_i; + local_op(local_i); } } bits[block_i] = mask; diff --git a/reference/components/bitvector.hpp b/reference/components/bitvector.hpp index 83675a2edc3..30858224a87 100644 --- a/reference/components/bitvector.hpp +++ b/reference/components/bitvector.hpp @@ -58,9 +58,8 @@ from_sorted_indices( assert(std::is_sorted(it, it + count)); for (auto i : irange{count}) { const auto value = it[i]; - const auto block = value / block_size; - const auto local = value % block_size; - const auto mask = storage_type{1} << local; + const auto [block, mask] = + device_bitvector::get_block_and_mask(value); assert((bits.get_data()[block] & mask) == 0); bits.get_data()[block] |= mask; } diff --git a/reference/test/components/bitvector.cpp b/reference/test/components/bitvector.cpp index 4176fe2ba4c..8217e32d55c 100644 --- a/reference/test/components/bitvector.cpp +++ b/reference/test/components/bitvector.cpp @@ -38,14 +38,12 @@ class Bitvector : public ::testing::Test { std::vector create_random_values(index_type num_values, index_type size) { - std::vector values(num_values); - std::uniform_int_distribution dist( - 0, std::max(size - 1, index_type{})); - for (auto& value : values) { - value = dist(this->rng); - } + assert(num_values <= size); + std::vector values(size); + std::iota(values.begin(), values.end(), index_type{}); + std::shuffle(values.begin(), values.end(), rng); + values.resize(num_values); std::sort(values.begin(), values.end()); - values.erase(std::unique(values.begin(), values.end()), values.end()); return values; } diff --git a/test/components/bitvector.cpp b/test/components/bitvector.cpp index f061f563aa8..f3aec9faac0 100644 --- a/test/components/bitvector.cpp +++ b/test/components/bitvector.cpp @@ -6,6 +6,7 @@ #include "core/components/bitvector.hpp" +#include #include #include #include @@ -45,14 +46,12 @@ class Bitvector : public CommonTestFixture { gko::array create_random_values(index_type num_values, index_type size) { - std::vector values(num_values); - std::uniform_int_distribution dist( - 0, std::max(size - 1, index_type{})); - for (auto& value : values) { - value = dist(this->rng); - } + assert(num_values <= size); + std::vector values(size); + std::iota(values.begin(), values.end(), index_type{}); + std::shuffle(values.begin(), values.end(), rng); + values.resize(num_values); std::sort(values.begin(), values.end()); - values.erase(std::unique(values.begin(), values.end()), values.end()); return gko::array{this->ref, values.begin(), values.end()}; } @@ -108,6 +107,7 @@ TYPED_TEST(Bitvector, BuildFromIndicesIsEquivalentToRef) } +// nvcc doesn't like device lambdas inside class member functions template std::pair, gko::bitvector> run_predicate( std::shared_ptr ref, @@ -203,6 +203,9 @@ TYPED_TEST(Bitvector, AccessIsEquivalentToRef) run_device(this->exec, bv.device_view(), dbv.device_view(), output_bools, output_ranks, doutput_bools, doutput_ranks); + + GKO_ASSERT_ARRAY_EQ(doutput_bools, output_bools); + GKO_ASSERT_ARRAY_EQ(doutput_ranks, output_ranks); } } } From 629400cce96de31a98cd735e8e664b1c62359c5c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 27 Apr 2025 12:40:24 +0200 Subject: [PATCH 08/15] fix OpenMP compilation on macOS --- omp/components/bitvector.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/omp/components/bitvector.hpp b/omp/components/bitvector.hpp index 7a3d8bc3a9b..5f2bf9fd9eb 100644 --- a/omp/components/bitvector.hpp +++ b/omp/components/bitvector.hpp @@ -80,8 +80,8 @@ from_sorted_indices( #pragma omp parallel num_threads(num_threads) { const auto tid = omp_get_thread_num(); - const auto begin = std::min(tid * work_per_thread, count); - const auto end = std::min(begin + work_per_thread, count); + const auto begin = std::min(tid * work_per_thread, count); + const auto end = std::min(begin + work_per_thread, count); if (begin < end) { const auto first_block = it[begin] / block_size; const auto last_block = it[end - 1] / block_size; From add25b8eccd1910e852cab7d06d5f2f98af7757b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 27 Apr 2025 12:41:21 +0200 Subject: [PATCH 09/15] fix compilation for DPC++ --- dpcpp/components/bitvector.dp.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpcpp/components/bitvector.dp.hpp b/dpcpp/components/bitvector.dp.hpp index 962d99129aa..278112fa02b 100644 --- a/dpcpp/components/bitvector.dp.hpp +++ b/dpcpp/components/bitvector.dp.hpp @@ -85,7 +85,7 @@ from_sorted_indices( cgh.parallel_for(count, [=](sycl::id<1> i) { auto value = it[i]; const auto [block, mask] = - device_bitvector::get_block_and_mask(value); + device_bitvector::get_block_and_mask(value); sycl::atomic_ref From 59abaff875f3b7296ce732b874c60de256f44d59 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 27 Apr 2025 12:42:59 +0200 Subject: [PATCH 10/15] attempt fixing HIP compilation --- common/cuda_hip/components/bitvector.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/common/cuda_hip/components/bitvector.hpp b/common/cuda_hip/components/bitvector.hpp index aa26d4cee28..592748d5894 100644 --- a/common/cuda_hip/components/bitvector.hpp +++ b/common/cuda_hip/components/bitvector.hpp @@ -85,12 +85,16 @@ gko::bitvector from_predicate( template struct bitvector_bit_functor { using storage_type = typename device_bitvector::storage_type; - constexpr static auto block_size = device_bitvector::block_size; __device__ storage_type operator()(IndexType i) { return device_bitvector::get_block_and_mask(i).second; } +}; + +template +struct bitvector_or_functor { + using storage_type = typename device_bitvector::storage_type; __device__ storage_type operator()(storage_type a, storage_type b) { // https://github.com/ROCm/rocThrust/issues/352 @@ -151,7 +155,7 @@ from_sorted_indices( auto out_bit_it = bits_compact.get_data(); auto [out_pos_end, out_bit_end] = thrust::reduce_by_key( policy, block_it, block_it + count, bit_it, out_pos_it, out_bit_it, - thrust::equal_to{}, bitvector_bit_functor{}); + thrust::equal_to{}, bitvector_or_functor{}); assert(thrust::is_sorted(policy, out_pos_it, out_pos_end)); const auto out_size = out_pos_end - out_pos_it; thrust::fill_n(policy, bits.get_data(), num_blocks, 0); From 4c974d48858e2ca475c2e495b64ea217aacbd27f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 27 Apr 2025 14:13:50 +0200 Subject: [PATCH 11/15] fix ROCm 4.5 issue --- common/cuda_hip/components/bitvector.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/cuda_hip/components/bitvector.hpp b/common/cuda_hip/components/bitvector.hpp index 592748d5894..b1586c2d912 100644 --- a/common/cuda_hip/components/bitvector.hpp +++ b/common/cuda_hip/components/bitvector.hpp @@ -109,6 +109,8 @@ struct bitvector_or_functor { template struct bitvector_block_functor { + // workaround for ROCm 4.5 bug + using result_type = IndexType; constexpr static auto block_size = device_bitvector::block_size; __device__ IndexType operator()(IndexType i) { From 94eee10ee67c9e2625ffa0495b26e62dac1653f9 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 30 Apr 2025 15:36:13 +0200 Subject: [PATCH 12/15] review updates - reformat documentation - remove explicit uint32 usages - rename member functions with get_ - unroll loops explicitly Co-authored-by: Marcel Koch Co-authored-by: Yu-Hsiang M. Tsai --- common/cuda_hip/components/bitvector.hpp | 3 ++- core/components/bitvector.hpp | 22 +++++++++------- dpcpp/components/bitvector.dp.hpp | 1 + omp/components/bitvector.hpp | 15 +++++------ reference/test/components/bitvector.cpp | 13 +++++----- test/components/bitvector.cpp | 32 +++++++++++------------- 6 files changed, 45 insertions(+), 41 deletions(-) diff --git a/common/cuda_hip/components/bitvector.hpp b/common/cuda_hip/components/bitvector.hpp index b1586c2d912..f753604982e 100644 --- a/common/cuda_hip/components/bitvector.hpp +++ b/common/cuda_hip/components/bitvector.hpp @@ -65,9 +65,10 @@ gko::bitvector from_predicate( std::shared_ptr exec, IndexType size, DevicePredicate device_predicate) { + using storage_type = typename device_bitvector::storage_type; constexpr auto block_size = device_bitvector::block_size; const auto num_blocks = static_cast(ceildiv(size, block_size)); - array bits{exec, num_blocks}; + array bits{exec, num_blocks}; array ranks{exec, num_blocks}; if (num_blocks > 0) { const auto num_threadblocks = diff --git a/core/components/bitvector.hpp b/core/components/bitvector.hpp index efc6ffc7570..13b5e47e295 100644 --- a/core/components/bitvector.hpp +++ b/core/components/bitvector.hpp @@ -24,6 +24,7 @@ class device_bitvector { * Returns the block index and bitmask belonging to a specific bit index. * * @param i the bit index + * * @returns a pair consisting of the block index and bitmask for this bit. */ constexpr static std::pair get_block_and_mask( @@ -47,24 +48,25 @@ class device_bitvector { {} /** Returns the number of bits stored in this bitvector. */ - constexpr index_type size() const { return size_; } + constexpr index_type get_size() const { return size_; } /** Returns the number of words (of type storage_type) in this bitvector. */ - constexpr index_type num_blocks() const + constexpr index_type get_num_blocks() const { - return (this->size() + block_size - 1) / block_size; + return (this->get_size() + block_size - 1) / block_size; } /** * Returns whether the bit at the given index is set. * * @param i the index in range [0, size()) + * * @return true if the bit is set, false otherwise. */ - constexpr bool get(index_type i) const + constexpr bool operator[](index_type i) const { assert(i >= 0); - assert(i < size()); + assert(i < get_size()); const auto block = i / block_size; const auto local = i % block_size; return bool((bits_[block] >> local) & 1); @@ -74,13 +76,14 @@ class device_bitvector { * Returns the rank of the given index. * * @param i the index in range [0, size()) + * * @return the rank of the given index, i.e. the number of 1 bits set * before the corresponding bit (exclusive). */ - constexpr index_type rank(index_type i) const + constexpr index_type get_rank(index_type i) const { assert(i >= 0); - assert(i < size()); + assert(i < get_size()); const auto [block, mask] = get_block_and_mask(i); const auto prefix_mask = mask - 1; return ranks_[block] + detail::popcount(prefix_mask & bits_[block]); @@ -90,13 +93,14 @@ class device_bitvector { * Returns the inclusive rank of the given index. * * @param i the index in range [0, size()) + * * @return the rank of the given index, i.e. the number of 1 bits set * up to and including the corresponding bit (inclusive). */ - constexpr index_type rank_inclusive(index_type i) const + constexpr index_type get_rank_inclusive(index_type i) const { assert(i >= 0); - assert(i < size()); + assert(i < get_size()); const auto [block, mask] = get_block_and_mask(i); const auto prefix_mask = mask - 1 | mask; return ranks_[block] + detail::popcount(prefix_mask & bits_[block]); diff --git a/dpcpp/components/bitvector.dp.hpp b/dpcpp/components/bitvector.dp.hpp index 278112fa02b..2c329941eaa 100644 --- a/dpcpp/components/bitvector.dp.hpp +++ b/dpcpp/components/bitvector.dp.hpp @@ -44,6 +44,7 @@ gko::bitvector from_predicate( mask |= bit << local_i; }; if (base_i + block_size <= size) { +#pragma unroll for (int local_i = 0; local_i < block_size; local_i++) { local_op(local_i); } diff --git a/omp/components/bitvector.hpp b/omp/components/bitvector.hpp index 5f2bf9fd9eb..9b87a990b3c 100644 --- a/omp/components/bitvector.hpp +++ b/omp/components/bitvector.hpp @@ -26,7 +26,7 @@ gko::bitvector from_predicate( using storage_type = typename device_bitvector::storage_type; constexpr auto block_size = device_bitvector::block_size; const auto num_blocks = static_cast(ceildiv(size, block_size)); - array bit_array{exec, num_blocks}; + array bit_array{exec, num_blocks}; array rank_array{exec, num_blocks}; const auto bits = bit_array.get_data(); const auto ranks = rank_array.get_data(); @@ -39,6 +39,7 @@ gko::bitvector from_predicate( mask |= bit << local_i; }; if (base_i + block_size <= size) { +#pragma unroll for (int local_i = 0; local_i < block_size; local_i++) { local_op(local_i); } @@ -85,7 +86,7 @@ from_sorted_indices( if (begin < end) { const auto first_block = it[begin] / block_size; const auto last_block = it[end - 1] / block_size; - storage_type word{}; + storage_type mask{0}; auto block = first_block; for (auto i : irange{begin, end}) { const auto value = it[i]; @@ -95,17 +96,17 @@ from_sorted_indices( assert(new_block > block); if (block == first_block) { #pragma omp atomic - bits[block] |= word; + bits[block] |= mask; } else { - bits[block] = word; + bits[block] = mask; } - word = 0; + mask = 0; block = new_block; } - word |= storage_type{1} << local; + mask |= storage_type{1} << local; } #pragma omp atomic - bits[last_block] |= word; + bits[last_block] |= mask; } } #pragma omp parallel for diff --git a/reference/test/components/bitvector.cpp b/reference/test/components/bitvector.cpp index 8217e32d55c..8c5b428bb39 100644 --- a/reference/test/components/bitvector.cpp +++ b/reference/test/components/bitvector.cpp @@ -63,7 +63,7 @@ TYPED_TEST(Bitvector, ComputeBitsAndRanks) for (auto size : this->sizes) { SCOPED_TRACE(size); for (auto num_values : - {index_type{}, size / 10, size / 4, size / 2, size}) { + {index_type{0}, size / 10, size / 4, size / 2, size}) { SCOPED_TRACE(num_values); auto values = this->create_random_values(num_values, size); num_values = values.size(); @@ -75,26 +75,27 @@ TYPED_TEST(Bitvector, ComputeBitsAndRanks) // check bits and ranks are correct ASSERT_EQ(bv.get_size(), size); - ASSERT_EQ(dbv.size(), size); + ASSERT_EQ(dbv.get_size(), size); ASSERT_EQ(bv.get_num_blocks(), num_blocks); - ASSERT_EQ(dbv.num_blocks(), num_blocks); + ASSERT_EQ(dbv.get_num_blocks(), num_blocks); auto it = values.begin(); index_type rank{}; for (auto i : gko::irange{size}) { const auto block = i / block_size; const auto local = i % block_size; - ASSERT_EQ(dbv.rank(i), rank); + ASSERT_EQ(dbv.get_rank(i), rank); if (it != values.end() && *it == i) { ASSERT_TRUE(bool(bv.get_bits()[block] & (storage_type{1} << local))); - ASSERT_TRUE(dbv.get(i)); + ASSERT_TRUE(dbv[i]); ++rank; ++it; } else { ASSERT_FALSE(bool(bv.get_bits()[block] & (storage_type{1} << local))); - ASSERT_FALSE(dbv.get(i)); + ASSERT_FALSE(dbv[i]); } + ASSERT_EQ(dbv.get_rank_inclusive(i), rank); } } } diff --git a/test/components/bitvector.cpp b/test/components/bitvector.cpp index f3aec9faac0..dfed105e6ec 100644 --- a/test/components/bitvector.cpp +++ b/test/components/bitvector.cpp @@ -60,18 +60,14 @@ class Bitvector : public CommonTestFixture { ASSERT_EQ(bv.get_size(), dbv.get_size()); const auto num_blocks = static_cast(bv.get_num_blocks()); - const auto bits = - gko::detail::array_const_cast(gko::make_const_array_view( - bv.get_executor(), num_blocks, bv.get_bits())); - const auto dbits = - gko::detail::array_const_cast(gko::make_const_array_view( - dbv.get_executor(), num_blocks, dbv.get_bits())); - const auto ranks = - gko::detail::array_const_cast(gko::make_const_array_view( - bv.get_executor(), num_blocks, bv.get_ranks())); - const auto dranks = - gko::detail::array_const_cast(gko::make_const_array_view( - dbv.get_executor(), num_blocks, dbv.get_ranks())); + const auto bits = gko::make_const_array_view(bv.get_executor(), + num_blocks, bv.get_bits()); + const auto dbits = gko::make_const_array_view( + dbv.get_executor(), num_blocks, dbv.get_bits()); + const auto ranks = gko::make_const_array_view( + bv.get_executor(), num_blocks, bv.get_ranks()); + const auto dranks = gko::make_const_array_view( + dbv.get_executor(), num_blocks, dbv.get_ranks()); GKO_ASSERT_ARRAY_EQ(bits, dbits); GKO_ASSERT_ARRAY_EQ(ranks, dranks); } @@ -162,13 +158,13 @@ void run_device(std::shared_ptr exec, gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto i, auto bv, auto output_bool, auto output_rank) { - output_bool[i] = bv.get(i); - output_rank[i] = bv.rank(i); + output_bool[i] = bv[i]; + output_rank[i] = bv.get_rank(i); }, - dbv.size(), dbv, doutput_bools, doutput_ranks); - for (auto i : gko::irange{bv.size()}) { - output_bools.get_data()[i] = bv.get(i); - output_ranks.get_data()[i] = bv.rank(i); + dbv.get_size(), dbv, doutput_bools, doutput_ranks); + for (auto i : gko::irange{bv.get_size()}) { + output_bools.get_data()[i] = bv[i]; + output_ranks.get_data()[i] = bv.get_rank(i); } } From 93f4c3846682977e7ebc14fa225af474083b0a6f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 30 Apr 2025 17:44:58 +0200 Subject: [PATCH 13/15] fix ROCm 4.5 compilation --- common/cuda_hip/components/bitvector.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/cuda_hip/components/bitvector.hpp b/common/cuda_hip/components/bitvector.hpp index f753604982e..1b29eaad62a 100644 --- a/common/cuda_hip/components/bitvector.hpp +++ b/common/cuda_hip/components/bitvector.hpp @@ -86,7 +86,7 @@ gko::bitvector from_predicate( template struct bitvector_bit_functor { using storage_type = typename device_bitvector::storage_type; - __device__ storage_type operator()(IndexType i) + constexpr storage_type operator()(IndexType i) const { return device_bitvector::get_block_and_mask(i).second; } @@ -96,7 +96,7 @@ struct bitvector_bit_functor { template struct bitvector_or_functor { using storage_type = typename device_bitvector::storage_type; - __device__ storage_type operator()(storage_type a, storage_type b) + constexpr storage_type operator()(storage_type a, storage_type b) const { // https://github.com/ROCm/rocThrust/issues/352 #ifndef GKO_COMPILING_HIP @@ -113,7 +113,7 @@ struct bitvector_block_functor { // workaround for ROCm 4.5 bug using result_type = IndexType; constexpr static auto block_size = device_bitvector::block_size; - __device__ IndexType operator()(IndexType i) + constexpr IndexType operator()(IndexType i) const { assert(i >= 0); assert(i < size); @@ -127,7 +127,7 @@ struct bitvector_block_functor { template struct bitvector_popcnt_functor { using storage_type = typename device_bitvector::storage_type; - __device__ IndexType operator()(storage_type mask) + constexpr IndexType operator()(storage_type mask) const { return gko::detail::popcount(mask); } From 090969eb8e79e4c322561ec34225efc8e01ed4f7 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 30 Apr 2025 17:49:15 +0200 Subject: [PATCH 14/15] remove file --- reference/components/bitvector_kernels.cpp | 49 ---------------------- 1 file changed, 49 deletions(-) delete mode 100644 reference/components/bitvector_kernels.cpp diff --git a/reference/components/bitvector_kernels.cpp b/reference/components/bitvector_kernels.cpp deleted file mode 100644 index 8bbb7ff7679..00000000000 --- a/reference/components/bitvector_kernels.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/components/bitvector_kernels.hpp" - -#include - -#include "core/base/index_range.hpp" - - -namespace gko { -namespace kernels { -namespace reference { -namespace bitvector { - - -template -void compute_bits_and_ranks( - std::shared_ptr exec, const IndexType* indices, - IndexType num_indices, IndexType size, - typename device_bitvector::storage_type* bits, IndexType* ranks) -{ - using bv = device_bitvector; - using storage_type = typename bv::storage_type; - const auto num_blocks = ceildiv(size, bv::block_size); - std::fill_n(bits, num_blocks, 0u); - for (auto i : irange{num_indices}) { - const auto index = indices[i]; - assert(index >= 0); - assert(index < size); - bits[index / bv::block_size] |= storage_type{1} - << index % bv::block_size; - } - IndexType rank{}; - for (auto i : irange{num_blocks}) { - ranks[i] = rank; - rank += gko::detail::popcount(bits[i]); - } -} - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_BITVECTOR_COMPUTE_BITS_AND_RANKS_KERNEL); - - -} // namespace bitvector -} // namespace reference -} // namespace kernels -} // namespace gko From 3c3a66b701853cf8bdf610e5cf85782a9b10bb34 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 30 Apr 2025 18:04:36 +0200 Subject: [PATCH 15/15] review updates --- reference/components/bitvector.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/reference/components/bitvector.hpp b/reference/components/bitvector.hpp index 30858224a87..d35c2b2766b 100644 --- a/reference/components/bitvector.hpp +++ b/reference/components/bitvector.hpp @@ -51,9 +51,9 @@ from_sorted_indices( using index_type = typename std::iterator_traits::value_type; using storage_type = typename device_bitvector::storage_type; constexpr auto block_size = device_bitvector::block_size; - const auto num_blocks = ceildiv(size, block_size); - array bits{exec, static_cast(num_blocks)}; - array ranks{exec, static_cast(num_blocks)}; + const auto num_blocks = static_cast(ceildiv(size, block_size)); + array bits{exec, num_blocks}; + array ranks{exec, num_blocks}; std::fill_n(bits.get_data(), num_blocks, 0); assert(std::is_sorted(it, it + count)); for (auto i : irange{count}) { @@ -63,8 +63,8 @@ from_sorted_indices( assert((bits.get_data()[block] & mask) == 0); bits.get_data()[block] |= mask; } - index_type rank{}; - for (auto i : irange{num_blocks}) { + index_type rank{0}; + for (auto i : irange{static_cast(num_blocks)}) { ranks.get_data()[i] = rank; rank += gko::detail::popcount(bits.get_const_data()[i]); }