diff --git a/common/cuda_hip/components/bitvector.hpp b/common/cuda_hip/components/bitvector.hpp
new file mode 100644
index 00000000000..1b29eaad62a
--- /dev/null
+++ b/common/cuda_hip/components/bitvector.hpp
@@ -0,0 +1,182 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_
+
+#include <thrust/functional.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/scatter.h>
+#include <thrust/sort.h>
+
+#include <ginkgo/core/base/intrinsics.hpp>
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/bitvector.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace bitvector {
+
+
+constexpr auto default_block_size = 512;
+
+
+namespace kernel {
+
+
+template <typename IndexType, typename DevicePredicate>
+__global__ __launch_bounds__(default_block_size) void from_predicate(
+    IndexType size,
+    typename device_bitvector<IndexType>::storage_type* __restrict__ bits,
+    IndexType* __restrict__ popcounts, DevicePredicate predicate)
+{
+    constexpr auto block_size = device_bitvector<IndexType>::block_size;
+    static_assert(block_size <= config::warp_size);
+    const auto subwarp_id = thread::get_subwarp_id_flat<block_size>();
+    const auto subwarp_base = subwarp_id * block_size;
+    if (subwarp_base >= size) {
+        return;
+    }
+    const auto subwarp =
+        group::tiled_partition<block_size>(group::this_thread_block());
+    const auto i = static_cast<IndexType>(subwarp_base + subwarp.thread_rank());
+    const auto bit = i < size ? predicate(i) : false;
+    const auto mask = subwarp.ballot(bit);
+    if (subwarp.thread_rank() == 0) {
+        bits[subwarp_id] = mask;
+        popcounts[subwarp_id] = gko::detail::popcount(mask);
+    }
+}
+
+
+}  // namespace kernel
+
+
+template <typename IndexType, typename DevicePredicate>
+gko::bitvector<IndexType> from_predicate(
+    std::shared_ptr<const DefaultExecutor> exec, IndexType size,
+    DevicePredicate device_predicate)
+{
+    using storage_type = typename device_bitvector<IndexType>::storage_type;
+    constexpr auto block_size = device_bitvector<IndexType>::block_size;
+    const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size));
+    array<storage_type> bits{exec, num_blocks};
+    array<IndexType> ranks{exec, num_blocks};
+    if (num_blocks > 0) {
+        const auto num_threadblocks =
+            ceildiv(num_blocks, default_block_size / block_size);
+        kernel::from_predicate<<<num_threadblocks, default_block_size, 0,
+                                 exec->get_stream()>>>(
+            size, bits.get_data(), ranks.get_data(), device_predicate);
+        components::prefix_sum_nonnegative(exec, ranks.get_data(), num_blocks);
+    }
+
+    return gko::bitvector<IndexType>{std::move(bits), std::move(ranks), size};
+}
+
+
+template <typename IndexType>
+struct bitvector_bit_functor {
+    using storage_type = typename device_bitvector<IndexType>::storage_type;
+    constexpr storage_type operator()(IndexType i) const
+    {
+        return device_bitvector<IndexType>::get_block_and_mask(i).second;
+    }
+};
+
+
+template <typename IndexType>
+struct bitvector_or_functor {
+    using storage_type = typename device_bitvector<IndexType>::storage_type;
+    constexpr storage_type operator()(storage_type a, storage_type b) const
+    {
+        // https://github.com/ROCm/rocThrust/issues/352
+#ifndef GKO_COMPILING_HIP
+        // there must not be any duplicate indices
+        assert(a ^ b == 0);
+#endif
+        return a | b;
+    }
+};
+
+
+template <typename IndexType>
+struct bitvector_block_functor {
+    // workaround for ROCm 4.5 bug
+    using result_type = IndexType;
+    constexpr static auto block_size = device_bitvector<IndexType>::block_size;
+    constexpr IndexType operator()(IndexType i) const
+    {
+        assert(i >= 0);
+        assert(i < size);
+        return i / block_size;
+    }
+
+    IndexType size;
+};
+
+
+template <typename IndexType>
+struct bitvector_popcnt_functor {
+    using storage_type = typename device_bitvector<IndexType>::storage_type;
+    constexpr IndexType operator()(storage_type mask) const
+    {
+        return gko::detail::popcount(mask);
+    }
+};
+
+
+template <typename IndexIterator>
+gko::bitvector<typename std::iterator_traits<IndexIterator>::value_type>
+from_sorted_indices(
+    std::shared_ptr<const DefaultExecutor> exec, IndexIterator it,
+    typename std::iterator_traits<IndexIterator>::difference_type count,
+    typename std::iterator_traits<IndexIterator>::value_type size)
+{
+    using index_type = typename std::iterator_traits<IndexIterator>::value_type;
+    using storage_type = typename device_bitvector<index_type>::storage_type;
+    constexpr auto block_size = device_bitvector<index_type>::block_size;
+    const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size));
+    const auto policy = thrust_policy(exec);
+    array<storage_type> bits_compact{exec, num_blocks};
+    array<index_type> bits_position{exec, num_blocks};
+    array<storage_type> bits{exec, num_blocks};
+    array<index_type> ranks{exec, num_blocks};
+    const auto block_it = thrust::make_transform_iterator(
+        it, bitvector_block_functor<index_type>{size});
+    const auto bit_it = thrust::make_transform_iterator(
+        it, bitvector_bit_functor<index_type>{});
+    auto out_pos_it = bits_position.get_data();
+    auto out_bit_it = bits_compact.get_data();
+    auto [out_pos_end, out_bit_end] = thrust::reduce_by_key(
+        policy, block_it, block_it + count, bit_it, out_pos_it, out_bit_it,
+        thrust::equal_to<index_type>{}, bitvector_or_functor<storage_type>{});
+    assert(thrust::is_sorted(policy, out_pos_it, out_pos_end));
+    const auto out_size = out_pos_end - out_pos_it;
+    thrust::fill_n(policy, bits.get_data(), num_blocks, 0);
+    thrust::scatter(policy, out_bit_it, out_bit_it + out_size, out_pos_it,
+                    bits.get_data());
+    const auto rank_it = thrust::make_transform_iterator(
+        bits.get_const_data(), bitvector_popcnt_functor<index_type>{});
+    thrust::exclusive_scan(policy, rank_it, rank_it + num_blocks,
+                           ranks.get_data(), index_type{});
+
+    return gko::bitvector<index_type>{std::move(bits), std::move(ranks), size};
+}
+
+
+}  // namespace bitvector
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_BITVECTOR_HPP_
diff --git a/common/unified/components/bitvector.hpp b/common/unified/components/bitvector.hpp
new file mode 100644
index 00000000000..ce289c44dce
--- /dev/null
+++ b/common/unified/components/bitvector.hpp
@@ -0,0 +1,23 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_
+#define GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_
+
+
+#include "common/unified/base/kernel_launch.hpp"
+
+
+#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)
+#include "common/cuda_hip/components/bitvector.hpp"
+#elif defined(GKO_COMPILING_OMP)
+#include "omp/components/bitvector.hpp"
+#elif defined(GKO_COMPILING_DPCPP)
+#include "dpcpp/components/bitvector.dp.hpp"
+#else
+#error "This file should only be used inside Ginkgo device compilation"
+#endif
+
+
+#endif  // GKO_COMMON_UNIFIED_COMPONENTS_BITVECTOR_HPP_
diff --git a/core/base/array.cpp b/core/base/array.cpp
index 51fa4b34bb1..da1accc1ca1 100644
--- a/core/base/array.cpp
+++ b/core/base/array.cpp
@@ -91,6 +91,7 @@ ValueType reduce_add(const array<ValueType>& input_arr,
 #define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value)
 
 GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL);
+template GKO_DECLARE_ARRAY_FILL(bool);
 template GKO_DECLARE_ARRAY_FILL(uint16);
 template GKO_DECLARE_ARRAY_FILL(uint32);
 #ifndef GKO_SIZE_T_IS_UINT64_T
diff --git a/core/components/bit_packed_storage.hpp b/core/components/bit_packed_storage.hpp
index 78777c1d2d2..2654b7ecb2a 100644
--- a/core/components/bit_packed_storage.hpp
+++ b/core/components/bit_packed_storage.hpp
@@ -239,11 +239,13 @@ class bit_packed_span {
  * @tparam num_bits  The number of bits necessary to store a single value in the
  *                   array. Values need to be in the range [0, 2^num_bits).
  * @tparam size  The number of values to store in the array.
+ * @tparam StorageType  the underlying storage type to use for each individual
+ word
  */
-template <int num_bits, int size>
+template <int num_bits, int size, typename StorageType = uint32>
 class bit_packed_array {
 public:
-    using storage_type = uint32;
+    using storage_type = StorageType;
     constexpr static int bits_per_word = sizeof(storage_type) * CHAR_BIT;
     constexpr static int bits_per_value = round_up_pow2_constexpr(num_bits);
     constexpr static int values_per_word = bits_per_word / bits_per_value;
diff --git a/core/components/bitvector.hpp b/core/components/bitvector.hpp
new file mode 100644
index 00000000000..13b5e47e295
--- /dev/null
+++ b/core/components/bitvector.hpp
@@ -0,0 +1,171 @@
+// SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_CORE_COMPONENTS_BITVECTOR_HPP_
+#define GKO_CORE_COMPONENTS_BITVECTOR_HPP_
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/intrinsics.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+namespace gko {
+
+
+template <typename IndexType>
+class device_bitvector {
+public:
+    using index_type = IndexType;
+    using storage_type = uint32;
+    constexpr static int block_size = CHAR_BIT * sizeof(storage_type);
+
+    /**
+     * Returns the block index and bitmask belonging to a specific bit index.
+     *
+     * @param i  the bit index
+     *
+     * @returns a pair consisting of the block index and bitmask for this bit.
+     */
+    constexpr static std::pair<index_type, storage_type> get_block_and_mask(
+        index_type i)
+    {
+        return std::make_pair(i / block_size,
+                              storage_type{1} << (i % block_size));
+    }
+
+    /**
+     * Constructs a device_bitvector from its underlying data.
+     *
+     * @param bits  the bitmask array
+     * @param rank  the rank array, it must be the prefix sum over
+     *              `popcount(bits[i])`.
+     * @param size  the number of bits we consider part of this bitvector.
+     */
+    constexpr device_bitvector(const storage_type* bits,
+                               const index_type* ranks, index_type size)
+        : size_{size}, bits_{bits}, ranks_{ranks}
+    {}
+
+    /** Returns the number of bits stored in this bitvector. */
+    constexpr index_type get_size() const { return size_; }
+
+    /** Returns the number of words (of type storage_type) in this bitvector. */
+    constexpr index_type get_num_blocks() const
+    {
+        return (this->get_size() + block_size - 1) / block_size;
+    }
+
+    /**
+     * Returns whether the bit at the given index is set.
+     *
+     * @param i  the index in range [0, size())
+     *
+     * @return true if the bit is set, false otherwise.
+     */
+    constexpr bool operator[](index_type i) const
+    {
+        assert(i >= 0);
+        assert(i < get_size());
+        const auto block = i / block_size;
+        const auto local = i % block_size;
+        return bool((bits_[block] >> local) & 1);
+    }
+
+    /**
+     * Returns the rank of the given index.
+     *
+     * @param i  the index in range [0, size())
+     *
+     * @return the rank of the given index, i.e. the number of 1 bits set
+     *         before the corresponding bit (exclusive).
+     */
+    constexpr index_type get_rank(index_type i) const
+    {
+        assert(i >= 0);
+        assert(i < get_size());
+        const auto [block, mask] = get_block_and_mask(i);
+        const auto prefix_mask = mask - 1;
+        return ranks_[block] + detail::popcount(prefix_mask & bits_[block]);
+    }
+
+    /**
+     * Returns the inclusive rank of the given index.
+     *
+     * @param i  the index in range [0, size())
+     *
+     * @return the rank of the given index, i.e. the number of 1 bits set
+     *         up to and including the corresponding bit (inclusive).
+     */
+    constexpr index_type get_rank_inclusive(index_type i) const
+    {
+        assert(i >= 0);
+        assert(i < get_size());
+        const auto [block, mask] = get_block_and_mask(i);
+        const auto prefix_mask = mask - 1 | mask;
+        return ranks_[block] + detail::popcount(prefix_mask & bits_[block]);
+    }
+
+private:
+    index_type size_;
+    const storage_type* bits_;
+    const index_type* ranks_;
+};
+
+
+/**
+ * Bitvector with rank support. It supports bit queries (whether a bit is set)
+ * and rank queries (how many bits are set before a specific index).
+ *
+ * @tparam IndexType  the type of indices used in the input and rank array.
+ */
+template <typename IndexType>
+class bitvector {
+public:
+    using index_type = IndexType;
+    using view_type = device_bitvector<index_type>;
+    using storage_type = typename view_type::storage_type;
+    constexpr static int block_size = CHAR_BIT * sizeof(storage_type);
+
+    static index_type get_num_blocks(index_type size)
+    {
+        return (size + block_size - 1) / block_size;
+    }
+
+    view_type device_view() const
+    {
+        return view_type{this->get_bits(), this->get_ranks(), this->get_size()};
+    }
+
+    std::shared_ptr<const Executor> get_executor() const
+    {
+        return bits_.get_executor();
+    }
+
+    const storage_type* get_bits() const { return bits_.get_const_data(); }
+
+    const index_type* get_ranks() const { return ranks_.get_const_data(); }
+
+    index_type get_size() const { return size_; }
+
+    index_type get_num_blocks() const { return get_num_blocks(get_size()); }
+
+    bitvector(array<storage_type> bits, array<index_type> ranks,
+              index_type size)
+        : size_{size}, bits_{std::move(bits)}, ranks_{std::move(ranks)}
+    {
+        GKO_ASSERT(bits_.get_executor() == ranks_.get_executor());
+        GKO_ASSERT(this->get_num_blocks() == bits_.get_size());
+        GKO_ASSERT(this->get_num_blocks() == ranks_.get_size());
+    }
+
+private:
+    index_type size_;
+    array<storage_type> bits_;
+    array<index_type> ranks_;
+};
+
+
+}  // namespace gko
+
+#endif  // GKO_CORE_COMPONENTS_BITVECTOR_HPP_
diff --git a/core/components/range_minimum_query.cpp b/core/components/range_minimum_query.cpp
index 07cb69fcdff..970a2ef739e 100644
--- a/core/components/range_minimum_query.cpp
+++ b/core/components/range_minimum_query.cpp
@@ -68,10 +68,10 @@ range_minimum_query<IndexType>::get() const
 }
 
 
-#define GKO_DEFINE_DEVICE_RANGE_MINIMUM_QUERY(IndexType) \
+#define GKO_DEFINE_RANGE_MINIMUM_QUERY(IndexType) \
     class range_minimum_query<IndexType>
 
-GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DEFINE_DEVICE_RANGE_MINIMUM_QUERY);
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DEFINE_RANGE_MINIMUM_QUERY);
 
 
 }  // namespace gko
diff --git a/dpcpp/components/bitvector.dp.hpp b/dpcpp/components/bitvector.dp.hpp
new file mode 100644
index 00000000000..2c329941eaa
--- /dev/null
+++ b/dpcpp/components/bitvector.dp.hpp
@@ -0,0 +1,115 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_DPCPP_COMPONENTS_BITVECTOR_DP_HPP_
+#define GKO_DPCPP_COMPONENTS_BITVECTOR_DP_HPP_
+
+
+#include <sycl/sycl.hpp>
+
+#include <ginkgo/core/base/intrinsics.hpp>
+
+#include "core/components/bitvector.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace bitvector {
+
+
+template <typename IndexType, typename DevicePredicate>
+gko::bitvector<IndexType> from_predicate(
+    std::shared_ptr<const DefaultExecutor> exec, IndexType size,
+    DevicePredicate device_predicate)
+{
+    using storage_type = typename device_bitvector<IndexType>::storage_type;
+    constexpr auto block_size = device_bitvector<IndexType>::block_size;
+    const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size));
+    array<uint32> bit_array{exec, num_blocks};
+    array<IndexType> rank_array{exec, num_blocks};
+    const auto bits = bit_array.get_data();
+    const auto ranks = rank_array.get_data();
+    const auto queue = exec->get_queue();
+    queue->submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(num_blocks, [=](sycl::id<1> block_i) {
+            const auto base_i = static_cast<IndexType>(block_i) * block_size;
+            storage_type mask{};
+            const auto local_op = [&](int local_i) {
+                const storage_type bit =
+                    device_predicate(base_i + local_i) ? 1 : 0;
+                mask |= bit << local_i;
+            };
+            if (base_i + block_size <= size) {
+#pragma unroll
+                for (int local_i = 0; local_i < block_size; local_i++) {
+                    local_op(local_i);
+                }
+            } else {
+                for (int local_i = 0; base_i + local_i < size; local_i++) {
+                    local_op(local_i);
+                }
+            }
+            bits[block_i] = mask;
+            ranks[block_i] = gko::detail::popcount(mask);
+        });
+    });
+    components::prefix_sum_nonnegative(exec, ranks, num_blocks);
+
+    return gko::bitvector<IndexType>{std::move(bit_array),
+                                     std::move(rank_array), size};
+}
+
+
+template <typename IndexIterator>
+gko::bitvector<typename std::iterator_traits<IndexIterator>::value_type>
+from_sorted_indices(
+    std::shared_ptr<const DefaultExecutor> exec, IndexIterator it,
+    typename std::iterator_traits<IndexIterator>::difference_type count,
+    typename std::iterator_traits<IndexIterator>::value_type size)
+{
+    using index_type = typename std::iterator_traits<IndexIterator>::value_type;
+    using storage_type = typename device_bitvector<index_type>::storage_type;
+    constexpr auto block_size = device_bitvector<index_type>::block_size;
+    const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size));
+    array<storage_type> bit_array{exec, num_blocks};
+    array<index_type> rank_array{exec, num_blocks};
+    components::fill_array(exec, bit_array.get_data(), num_blocks,
+                           storage_type{});
+    const auto bits = bit_array.get_data();
+    const auto ranks = rank_array.get_data();
+    const auto queue = exec->get_queue();
+    queue->submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(count, [=](sycl::id<1> i) {
+            auto value = it[i];
+            const auto [block, mask] =
+                device_bitvector<index_type>::get_block_and_mask(value);
+            sycl::atomic_ref<storage_type, sycl::memory_order::relaxed,
+                             sycl::memory_scope::device,
+                             sycl::access::address_space::global_space>
+                atomic(bits[block]);
+            atomic.fetch_or(mask);
+        });
+    });
+    queue->submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(num_blocks, [=](sycl::id<1> i) {
+            ranks[i] = gko::detail::popcount(bits[i]);
+        });
+    });
+    components::prefix_sum_nonnegative(exec, ranks, num_blocks);
+
+    return gko::bitvector<index_type>{std::move(bit_array),
+                                      std::move(rank_array), size};
+}
+
+
+}  // namespace bitvector
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_DPCPP_COMPONENTS_BITVECTOR_DP_HPP_
diff --git a/omp/components/bitvector.hpp b/omp/components/bitvector.hpp
new file mode 100644
index 00000000000..9b87a990b3c
--- /dev/null
+++ b/omp/components/bitvector.hpp
@@ -0,0 +1,125 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <omp.h>
+
+#include <ginkgo/core/base/intrinsics.hpp>
+
+#include "core/base/index_range.hpp"
+#include "core/components/bitvector.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+namespace bitvector {
+
+
+template <typename IndexType, typename DevicePredicate>
+gko::bitvector<IndexType> from_predicate(
+    std::shared_ptr<const DefaultExecutor> exec, IndexType size,
+    DevicePredicate device_predicate)
+{
+    using storage_type = typename device_bitvector<IndexType>::storage_type;
+    constexpr auto block_size = device_bitvector<IndexType>::block_size;
+    const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size));
+    array<storage_type> bit_array{exec, num_blocks};
+    array<IndexType> rank_array{exec, num_blocks};
+    const auto bits = bit_array.get_data();
+    const auto ranks = rank_array.get_data();
+#pragma omp parallel for
+    for (IndexType block_i = 0; block_i < num_blocks; block_i++) {
+        const auto base_i = block_i * block_size;
+        storage_type mask{};
+        const auto local_op = [&](int local_i) {
+            const storage_type bit = device_predicate(base_i + local_i) ? 1 : 0;
+            mask |= bit << local_i;
+        };
+        if (base_i + block_size <= size) {
+#pragma unroll
+            for (int local_i = 0; local_i < block_size; local_i++) {
+                local_op(local_i);
+            }
+        } else {
+            for (int local_i = 0; base_i + local_i < size; local_i++) {
+                local_op(local_i);
+            }
+        }
+        bits[block_i] = mask;
+        ranks[block_i] = gko::detail::popcount(mask);
+    }
+    components::prefix_sum_nonnegative(exec, ranks, num_blocks);
+
+    return gko::bitvector<IndexType>{std::move(bit_array),
+                                     std::move(rank_array), size};
+}
+
+
+template <typename IndexIterator>
+gko::bitvector<typename std::iterator_traits<IndexIterator>::value_type>
+from_sorted_indices(
+    std::shared_ptr<const DefaultExecutor> exec, IndexIterator it,
+    typename std::iterator_traits<IndexIterator>::difference_type count,
+    typename std::iterator_traits<IndexIterator>::value_type size)
+{
+    using index_type = typename std::iterator_traits<IndexIterator>::value_type;
+    using bv = device_bitvector<index_type>;
+    using storage_type = typename bv::storage_type;
+    constexpr auto block_size = bv::block_size;
+    const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size));
+    array<storage_type> bit_array{exec, num_blocks};
+    array<index_type> rank_array{exec, num_blocks};
+    const auto bits = bit_array.get_data();
+    const auto ranks = rank_array.get_data();
+    components::fill_array(exec, bits, num_blocks, storage_type{});
+    const auto num_threads = omp_get_max_threads();
+    const auto work_per_thread = ceildiv(count, num_threads);
+    assert(std::is_sorted(it, it + count));
+#pragma omp parallel num_threads(num_threads)
+    {
+        const auto tid = omp_get_thread_num();
+        const auto begin = std::min<index_type>(tid * work_per_thread, count);
+        const auto end = std::min<index_type>(begin + work_per_thread, count);
+        if (begin < end) {
+            const auto first_block = it[begin] / block_size;
+            const auto last_block = it[end - 1] / block_size;
+            storage_type mask{0};
+            auto block = first_block;
+            for (auto i : irange{begin, end}) {
+                const auto value = it[i];
+                const auto new_block = value / block_size;
+                const auto local = value % block_size;
+                if (new_block != block) {
+                    assert(new_block > block);
+                    if (block == first_block) {
+#pragma omp atomic
+                        bits[block] |= mask;
+                    } else {
+                        bits[block] = mask;
+                    }
+                    mask = 0;
+                    block = new_block;
+                }
+                mask |= storage_type{1} << local;
+            }
+#pragma omp atomic
+            bits[last_block] |= mask;
+        }
+    }
+#pragma omp parallel for
+    for (size_type i = 0; i < num_blocks; i++) {
+        ranks[i] = gko::detail::popcount(bits[i]);
+    }
+    components::prefix_sum_nonnegative(exec, ranks, num_blocks);
+    return gko::bitvector<index_type>{std::move(bit_array),
+                                      std::move(rank_array), size};
+}
+
+
+}  // namespace bitvector
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/components/bitvector.hpp b/reference/components/bitvector.hpp
new file mode 100644
index 00000000000..d35c2b2766b
--- /dev/null
+++ b/reference/components/bitvector.hpp
@@ -0,0 +1,82 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_REFERENCE_COMPONENTS_BITVECTOR_HPP_
+#define GKO_REFERENCE_COMPONENTS_BITVECTOR_HPP_
+
+#include "core/base/index_range.hpp"
+#include "core/components/bitvector.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+namespace bitvector {
+
+
+template <typename IndexType, typename DevicePredicate>
+gko::bitvector<IndexType> from_predicate(
+    std::shared_ptr<const DefaultExecutor> exec, IndexType size,
+    DevicePredicate device_predicate)
+{
+    using storage_type = typename device_bitvector<IndexType>::storage_type;
+    constexpr auto block_size = device_bitvector<IndexType>::block_size;
+    const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size));
+    array<storage_type> bits{exec, num_blocks};
+    array<IndexType> ranks{exec, num_blocks};
+    std::fill_n(bits.get_data(), num_blocks, 0);
+    std::fill_n(ranks.get_data(), num_blocks, 0);
+    for (auto i : irange{size}) {
+        if (device_predicate(i)) {
+            bits.get_data()[i / block_size] |= storage_type{1}
+                                               << (i % block_size);
+            ranks.get_data()[i / block_size]++;
+        }
+    }
+    components::prefix_sum_nonnegative(exec, ranks.get_data(), num_blocks);
+
+    return gko::bitvector<IndexType>{std::move(bits), std::move(ranks), size};
+}
+
+
+template <typename IndexIterator>
+gko::bitvector<typename std::iterator_traits<IndexIterator>::value_type>
+from_sorted_indices(
+    std::shared_ptr<const DefaultExecutor> exec, IndexIterator it,
+    typename std::iterator_traits<IndexIterator>::difference_type count,
+    typename std::iterator_traits<IndexIterator>::value_type size)
+{
+    using index_type = typename std::iterator_traits<IndexIterator>::value_type;
+    using storage_type = typename device_bitvector<index_type>::storage_type;
+    constexpr auto block_size = device_bitvector<index_type>::block_size;
+    const auto num_blocks = static_cast<size_type>(ceildiv(size, block_size));
+    array<storage_type> bits{exec, num_blocks};
+    array<index_type> ranks{exec, num_blocks};
+    std::fill_n(bits.get_data(), num_blocks, 0);
+    assert(std::is_sorted(it, it + count));
+    for (auto i : irange{count}) {
+        const auto value = it[i];
+        const auto [block, mask] =
+            device_bitvector<index_type>::get_block_and_mask(value);
+        assert((bits.get_data()[block] & mask) == 0);
+        bits.get_data()[block] |= mask;
+    }
+    index_type rank{0};
+    for (auto i : irange{static_cast<index_type>(num_blocks)}) {
+        ranks.get_data()[i] = rank;
+        rank += gko::detail::popcount(bits.get_const_data()[i]);
+    }
+
+    return gko::bitvector<index_type>{std::move(bits), std::move(ranks), size};
+}
+
+
+}  // namespace bitvector
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_REFERENCE_COMPONENTS_BITVECTOR_HPP_
diff --git a/reference/test/components/CMakeLists.txt b/reference/test/components/CMakeLists.txt
index b17880ab32d..47998d184df 100644
--- a/reference/test/components/CMakeLists.txt
+++ b/reference/test/components/CMakeLists.txt
@@ -1,4 +1,5 @@
 ginkgo_create_test(absolute_array_kernels)
+ginkgo_create_test(bitvector)
 ginkgo_create_test(fill_array_kernels)
 ginkgo_create_test(format_conversion_kernels)
 ginkgo_create_test(precision_conversion_kernels)
diff --git a/reference/test/components/bitvector.cpp b/reference/test/components/bitvector.cpp
new file mode 100644
index 00000000000..8c5b428bb39
--- /dev/null
+++ b/reference/test/components/bitvector.cpp
@@ -0,0 +1,102 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/components/bitvector.hpp"
+
+#include <memory>
+#include <random>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "core/base/index_range.hpp"
+#include "core/components/bitvector.hpp"
+#include "core/test/utils.hpp"
+#include "reference/components/bitvector.hpp"
+
+
+template <typename IndexType>
+class Bitvector : public ::testing::Test {
+protected:
+    using index_type = IndexType;
+    using device_type = gko::bitvector<index_type>;
+    using storage_type = typename device_type::storage_type;
+    constexpr static auto block_size = device_type::block_size;
+    Bitvector()
+        : ref{gko::ReferenceExecutor::create()}, rng{67593}, sizes{0,    1,
+                                                                   2,    16,
+                                                                   31,   32,
+                                                                   33,   40,
+                                                                   63,   64,
+                                                                   65,   127,
+                                                                   128,  129,
+                                                                   1000, 1024,
+                                                                   2000}
+    {}
+
+    std::vector<index_type> create_random_values(index_type num_values,
+                                                 index_type size)
+    {
+        assert(num_values <= size);
+        std::vector<index_type> values(size);
+        std::iota(values.begin(), values.end(), index_type{});
+        std::shuffle(values.begin(), values.end(), rng);
+        values.resize(num_values);
+        std::sort(values.begin(), values.end());
+        return values;
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::default_random_engine rng;
+    std::vector<index_type> sizes;
+};
+
+TYPED_TEST_SUITE(Bitvector, gko::test::IndexTypes, TypenameNameGenerator);
+
+
+TYPED_TEST(Bitvector, ComputeBitsAndRanks)
+{
+    using index_type = typename TestFixture::index_type;
+    using storage_type = typename TestFixture::storage_type;
+    constexpr auto block_size = TestFixture::block_size;
+    for (auto size : this->sizes) {
+        SCOPED_TRACE(size);
+        for (auto num_values :
+             {index_type{0}, size / 10, size / 4, size / 2, size}) {
+            SCOPED_TRACE(num_values);
+            auto values = this->create_random_values(num_values, size);
+            num_values = values.size();
+            auto num_blocks = gko::ceildiv(size, block_size);
+
+            auto bv = gko::kernels::reference::bitvector::from_sorted_indices(
+                this->ref, values.data(), num_values, size);
+            auto dbv = bv.device_view();
+
+            // check bits and ranks are correct
+            ASSERT_EQ(bv.get_size(), size);
+            ASSERT_EQ(dbv.get_size(), size);
+            ASSERT_EQ(bv.get_num_blocks(), num_blocks);
+            ASSERT_EQ(dbv.get_num_blocks(), num_blocks);
+            auto it = values.begin();
+            index_type rank{};
+            for (auto i : gko::irange{size}) {
+                const auto block = i / block_size;
+                const auto local = i % block_size;
+                ASSERT_EQ(dbv.get_rank(i), rank);
+                if (it != values.end() && *it == i) {
+                    ASSERT_TRUE(bool(bv.get_bits()[block] &
+                                     (storage_type{1} << local)));
+                    ASSERT_TRUE(dbv[i]);
+                    ++rank;
+                    ++it;
+                } else {
+                    ASSERT_FALSE(bool(bv.get_bits()[block] &
+                                      (storage_type{1} << local)));
+                    ASSERT_FALSE(dbv[i]);
+                }
+                ASSERT_EQ(dbv.get_rank_inclusive(i), rank);
+            }
+        }
+    }
+}
diff --git a/test/components/CMakeLists.txt b/test/components/CMakeLists.txt
index 12e738d8eaa..2793f3af575 100644
--- a/test/components/CMakeLists.txt
+++ b/test/components/CMakeLists.txt
@@ -1,4 +1,5 @@
 ginkgo_create_common_test(absolute_array_kernels)
+ginkgo_create_common_device_test(bitvector)
 ginkgo_create_common_test(fill_array_kernels)
 ginkgo_create_common_test(format_conversion_kernels)
 ginkgo_create_common_test(precision_conversion_kernels)
diff --git a/test/components/bitvector.cpp b/test/components/bitvector.cpp
new file mode 100644
index 00000000000..dfed105e6ec
--- /dev/null
+++ b/test/components/bitvector.cpp
@@ -0,0 +1,207 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+/*@GKO_PREPROCESSOR_FILENAME_HELPER@*/
+
+#include "core/components/bitvector.hpp"
+
+#include <cassert>
+#include <memory>
+#include <random>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+#include "common/unified/base/kernel_launch.hpp"
+#include "common/unified/components/bitvector.hpp"
+#include "core/base/index_range.hpp"
+#include "core/test/utils.hpp"
+#include "reference/components/bitvector.hpp"
+#include "test/utils/common_fixture.hpp"
+
+
+// workaround for cudafe 11.0 bug
+using gko::irange;
+
+
+template <typename T>
+class Bitvector : public CommonTestFixture {
+protected:
+    using index_type = T;
+    using bitvector = gko::bitvector<index_type>;
+    using device_bitvector = gko::device_bitvector<index_type>;
+    using storage_type = typename bitvector::storage_type;
+    constexpr static auto block_size = bitvector::block_size;
+
+    Bitvector()
+        : rng{67193}, sizes{0,    1,    2,    16,    31,    32,  33,
+                            40,   63,   64,   65,    127,   128, 129,
+                            1000, 1024, 2000, 10000, 100000}
+    {}
+
+    gko::array<index_type> create_random_values(index_type num_values,
+                                                index_type size)
+    {
+        assert(num_values <= size);
+        std::vector<index_type> values(size);
+        std::iota(values.begin(), values.end(), index_type{});
+        std::shuffle(values.begin(), values.end(), rng);
+        values.resize(num_values);
+        std::sort(values.begin(), values.end());
+        return gko::array<index_type>{this->ref, values.begin(), values.end()};
+    }
+
+    void assert_bitvector_equal(const bitvector& bv, const bitvector& dbv)
+    {
+        ASSERT_EQ(bv.get_size(), dbv.get_size());
+        const auto num_blocks =
+            static_cast<gko::size_type>(bv.get_num_blocks());
+        const auto bits = gko::make_const_array_view(bv.get_executor(),
+                                                     num_blocks, bv.get_bits());
+        const auto dbits = gko::make_const_array_view(
+            dbv.get_executor(), num_blocks, dbv.get_bits());
+        const auto ranks = gko::make_const_array_view(
+            bv.get_executor(), num_blocks, bv.get_ranks());
+        const auto dranks = gko::make_const_array_view(
+            dbv.get_executor(), num_blocks, dbv.get_ranks());
+        GKO_ASSERT_ARRAY_EQ(bits, dbits);
+        GKO_ASSERT_ARRAY_EQ(ranks, dranks);
+    }
+
+    std::default_random_engine rng;
+    std::vector<index_type> sizes;
+};
+
+TYPED_TEST_SUITE(Bitvector, gko::test::IndexTypes, TypenameNameGenerator);
+
+
+TYPED_TEST(Bitvector, BuildFromIndicesIsEquivalentToRef)
+{
+    using index_type = typename TestFixture::index_type;
+    using bitvector = typename TestFixture::bitvector;
+    for (auto size : this->sizes) {
+        SCOPED_TRACE(size);
+        for (auto num_values :
+             {index_type{}, size / 10, size / 4, size / 2, size}) {
+            SCOPED_TRACE(num_values);
+            auto values = this->create_random_values(num_values, size);
+            gko::array<index_type> dvalues{this->exec, values};
+
+            auto bv = gko::kernels::reference::bitvector::from_sorted_indices(
+                this->ref, values.get_data(), values.get_size(), size);
+            auto dbv = gko::kernels::GKO_DEVICE_NAMESPACE::bitvector::
+                from_sorted_indices(this->exec, dvalues.get_data(),
+                                    dvalues.get_size(), size);
+
+            this->assert_bitvector_equal(bv, dbv);
+        }
+    }
+}
+
+
+// nvcc doesn't like device lambdas inside class member functions
+template <typename IndexType>
+std::pair<gko::bitvector<IndexType>, gko::bitvector<IndexType>> run_predicate(
+    std::shared_ptr<const gko::ReferenceExecutor> ref,
+    std::shared_ptr<const gko::EXEC_TYPE> exec, IndexType size, int stride)
+{
+    return std::make_pair(
+        gko::kernels::reference::bitvector::from_predicate(
+            ref, size, [stride](int i) { return i % stride == 0; }),
+        gko::kernels::GKO_DEVICE_NAMESPACE::bitvector::from_predicate(
+            exec, size,
+            [stride] GKO_KERNEL(int i) { return i % stride == 0; }));
+}
+
+
+TYPED_TEST(Bitvector, BuildFromPredicateIsEquivalentToFromIndices)
+{
+    using index_type = typename TestFixture::index_type;
+    using bitvector = typename TestFixture::bitvector;
+    for (auto size : this->sizes) {
+        SCOPED_TRACE(size);
+        for (auto stride : {1, 2, 3, 4, 5, 65}) {
+            SCOPED_TRACE(stride);
+            std::vector<index_type> indices;
+            for (index_type i = 0; i < size; i += stride) {
+                indices.push_back(i);
+            }
+            gko::array<index_type> values{this->ref, indices.begin(),
+                                          indices.end()};
+
+            auto [bv, dbv] = run_predicate(this->ref, this->exec, size, stride);
+
+            auto ref_bv =
+                gko::kernels::reference::bitvector::from_sorted_indices(
+                    this->ref, values.get_data(), values.get_size(), size);
+            this->assert_bitvector_equal(bv, dbv);
+            this->assert_bitvector_equal(ref_bv, dbv);
+        }
+    }
+}
+
+
+// nvcc doesn't like device lambdas inside class member functions
+template <typename IndexType>
+void run_device(std::shared_ptr<const gko::EXEC_TYPE> exec,
+                const gko::device_bitvector<IndexType> bv,
+                const gko::device_bitvector<IndexType> dbv,
+                gko::array<bool>& output_bools,
+                gko::array<IndexType>& output_ranks,
+                gko::array<bool>& doutput_bools,
+                gko::array<IndexType>& doutput_ranks)
+{
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto bv, auto output_bool, auto output_rank) {
+            output_bool[i] = bv[i];
+            output_rank[i] = bv.get_rank(i);
+        },
+        dbv.get_size(), dbv, doutput_bools, doutput_ranks);
+    for (auto i : gko::irange{bv.get_size()}) {
+        output_bools.get_data()[i] = bv[i];
+        output_ranks.get_data()[i] = bv.get_rank(i);
+    }
+}
+
+
+TYPED_TEST(Bitvector, AccessIsEquivalentToRef)
+{
+    using index_type = typename TestFixture::index_type;
+    using storage_type = typename TestFixture::storage_type;
+    constexpr auto block_size = TestFixture::block_size;
+    for (auto size : this->sizes) {
+        SCOPED_TRACE(size);
+        for (auto num_values :
+             {index_type{}, size / 10, size / 4, size / 2, size}) {
+            SCOPED_TRACE(num_values);
+            auto values = this->create_random_values(num_values, size);
+            num_values = values.get_size();
+            gko::array<index_type> dvalues{this->exec, values};
+
+            auto bv = gko::kernels::reference::bitvector::from_sorted_indices(
+                this->ref, values.get_const_data(), values.get_size(), size);
+            auto dbv = gko::kernels::GKO_DEVICE_NAMESPACE::bitvector::
+                from_sorted_indices(this->exec, dvalues.get_const_data(),
+                                    dvalues.get_size(), size);
+
+            const auto usize = static_cast<gko::size_type>(size);
+            gko::array<bool> output_bools{this->ref, usize};
+            gko::array<index_type> output_ranks{this->ref, usize};
+            gko::array<bool> doutput_bools{this->exec, usize};
+            gko::array<index_type> doutput_ranks{this->exec, usize};
+            doutput_bools.fill(true);
+            doutput_ranks.fill(-1);
+            run_device(this->exec, bv.device_view(), dbv.device_view(),
+                       output_bools, output_ranks, doutput_bools,
+                       doutput_ranks);
+
+            GKO_ASSERT_ARRAY_EQ(doutput_bools, output_bools);
+            GKO_ASSERT_ARRAY_EQ(doutput_ranks, output_ranks);
+        }
+    }
+}