diff --git a/common/cuda_hip/components/load_balancing.hpp b/common/cuda_hip/components/load_balancing.hpp
new file mode 100644
index 00000000000..ff053ac42cf
--- /dev/null
+++ b/common/cuda_hip/components/load_balancing.hpp
@@ -0,0 +1,122 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_LOAD_BALANCING_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_LOAD_BALANCING_HPP_
+
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
+/**
+ * @internal
+ * Distributes work evenly inside a subwarp, ensuring that no thread sits idle
+ * as long as work items remain. The work is described by chunks
+ * (id, work_count), where work_count is the number of work items in that
+ * chunk.
+ *
+ * @param chunk_count  the total number of work chunks
+ * @param work_count  functor such that work_count(i) returns the number of
+ *                    work items in the i'th chunk.
+ *                    It is evaluated exactly once for each i.
+ * @param op  the operation to execute for a single work item, called via
+ *            op(chunk_id, work_item_id, global_work_item_id), where
+ *            work_item_id is the position inside the chunk and
+ *            global_work_item_id is the position in the overall sequence of
+ *            work items.
+ *
+ * @tparam subwarp_size  the size of the subwarp executing this function
+ * @tparam IndexType  the index type used to address individual chunks and
+ *                    work items. It needs to be able to represent the total
+ *                    number of work items without overflow.
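+ *
+ * A usage sketch (with hypothetical CSR-style names row_ptrs and
+ * process_entry, meant purely as an illustration): every thread of the
+ * subwarp calls
+ * @code
+ * load_balance_subwarp<config::warp_size>(
+ *     num_rows,
+ *     [&](int row) { return row_ptrs[row + 1] - row_ptrs[row]; },
+ *     [&](int row, int nz_in_row, int nz) { process_entry(row, nz_in_row); });
+ * @endcode
+ * which invokes process_entry once per nonzero, with the nonzeros distributed
+ * evenly over the subwarp regardless of how they are distributed over rows.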
+ */
+template <int subwarp_size, typename IndexType, typename WorkCountFunctor,
+          typename OpFunctor>
+__forceinline__ __device__ void load_balance_subwarp(
+    IndexType chunk_count, WorkCountFunctor work_count, OpFunctor op)
+{
+    const auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    static_assert(
+        std::is_same_v<std::invoke_result_t<WorkCountFunctor, IndexType>,
+                       IndexType>,
+        "work_count needs to return IndexType");
+    const auto lane = subwarp.thread_rank();
+    IndexType chunk_base{};
+    IndexType work_base{};
+    IndexType local_work =
+        lane < chunk_count ? work_count(lane) : IndexType{1};
+    // inclusive prefix sum over work tells us where each chunk begins
+    IndexType work_prefix_sum{};
+    subwarp_prefix_sum<true>(local_work, work_prefix_sum, subwarp);
+    while (chunk_base < chunk_count) {
+        assert(local_work >= 0);
+        // binary search over this prefix sum tells us which chunk each thread
+        // works in
+        const auto local_work_pos = work_base + lane;
+        const auto local_chunk =
+            synchronous_fixed_binary_search<subwarp_size>([&](int i) {
+                return local_work_pos < subwarp.shfl(work_prefix_sum, i);
+            });
+        // guard against out-of-bounds shuffle
+        auto local_chunk_work_base = subwarp.shfl(
+            work_prefix_sum - local_work, min(local_chunk, subwarp_size - 1));
+        const auto chunk = chunk_base + local_chunk;
+        // do the work inside this chunk
+        if (chunk < chunk_count && local_chunk < subwarp_size) {
+            op(chunk, local_work_pos - local_chunk_work_base, local_work_pos);
+        }
+        const auto last_local_chunk =
+            subwarp.shfl(local_chunk, subwarp_size - 1);
+        // find the last value of the prefix sum and remember it for later
+        const auto work_prefix_sum_end =
+            subwarp.shfl(work_prefix_sum, subwarp_size - 1);
+        IndexType work_prefix_sum_add{};
+        if (last_local_chunk == subwarp_size) {
+            // if we didn't have enough work to do: all local chunks are
+            // completed
+            chunk_base += subwarp_size;
+            work_prefix_sum = work_prefix_sum_end;
+            const auto in_chunk = chunk_base + lane;
+            local_work =
+                in_chunk < chunk_count ? work_count(in_chunk) : IndexType{};
+            work_prefix_sum_add = local_work;
+        } else {
+            const auto last_local_chunk_end =
+                subwarp.shfl(work_prefix_sum, last_local_chunk);
+            // how many chunks have we completed? The last one is completed if
+            // its end matches work_base after the update
+            const auto chunk_advance =
+                last_local_chunk + (last_local_chunk_end == work_base ? 1 : 0);
+            chunk_base += chunk_advance;
+            // shift down local_work and work_prefix_sum,
+            // adding new values when necessary
+            local_work = subwarp.shfl_down(local_work, chunk_advance);
+            // this shuffle leaves the trailing elements unchanged, we need to
+            // overwrite them later
+            work_prefix_sum = subwarp.shfl_down(work_prefix_sum, chunk_advance);
+            if (lane >= subwarp_size - chunk_advance) {
+                const auto in_chunk = chunk_base + lane;
+                // load new work counters at the end
+                local_work =
+                    in_chunk < chunk_count ? work_count(in_chunk) : IndexType{};
+                work_prefix_sum_add = local_work;
+                // fill the trailing work_prefix_sum with the last element
+                work_prefix_sum = work_prefix_sum_end;
+            }
+        }
+        work_base += popcnt(subwarp.ballot(local_chunk < subwarp_size));
+        // compute a prefix sum over new chunks and add to the prefix sum
+        subwarp_prefix_sum<true>(work_prefix_sum_add, work_prefix_sum_add,
+                                 subwarp);
+        work_prefix_sum += work_prefix_sum_add;
+    }
+}
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_LOAD_BALANCING_HPP_
diff --git a/core/base/array.cpp b/core/base/array.cpp
index 51fa4b34bb1..da1accc1ca1 100644
--- a/core/base/array.cpp
+++ b/core/base/array.cpp
@@ -91,6 +91,7 @@ ValueType reduce_add(const array<ValueType>& input_arr,
 #define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value)
 GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL);
+template GKO_DECLARE_ARRAY_FILL(bool);
 template GKO_DECLARE_ARRAY_FILL(uint16);
 template GKO_DECLARE_ARRAY_FILL(uint32);
 #ifndef GKO_SIZE_T_IS_UINT64_T
diff --git a/cuda/test/components/CMakeLists.txt b/cuda/test/components/CMakeLists.txt
index 89e374431c4..6fe81ea0635 100644
--- a/cuda/test/components/CMakeLists.txt
+++ b/cuda/test/components/CMakeLists.txt
@@ -1,4 +1,5 @@
 ginkgo_create_cuda_test(cooperative_groups)
+ginkgo_create_cuda_test(load_balancing)
 ginkgo_create_cuda_test(merging)
 ginkgo_create_cuda_test(searching)
 ginkgo_create_cuda_test(sorting)
diff --git a/cuda/test/components/load_balancing.cu b/cuda/test/components/load_balancing.cu
new file mode 100644
index 00000000000..c871f286fec
--- /dev/null
+++ b/cuda/test/components/load_balancing.cu
@@ -0,0 +1,303 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "common/cuda_hip/components/load_balancing.hpp"
+
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+
+#include "core/base/index_range.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "cuda/test/utils.hpp"
+
+
+namespace {
+
+
+class LoadBalancing : public CudaTestFixture {
+protected:
+};
+
+
+__global__ void test_1_work_per_chunk(int* output, int* size_once, bool* err,
+                                      int size)
+{
+    gko::kernels::cuda::load_balance_subwarp<
+        gko::kernels::cuda::config::warp_size>(
+        size,
+        [&](int chunk) {
+            if (chunk < 0 || chunk >= size) {
+                printf("oob %d\n", chunk);
+                *err = true;
+            }
+            int result = atomicCAS(size_once + chunk, 0, 1);
+            if (result != 0) {
+                printf("duplicate size %d\n", chunk);
+                *err = true;
+            }
+            return 1;
+        },
+        [&](int chunk, int work, int global_pos) {
+            if 
(chunk < 0 || chunk >= size || work != 0) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + if (global_pos != chunk) { + printf("incorrect global_pos %d %d\n", global_pos, chunk); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + chunk, -1, chunk); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith1WorkPerChunk) +{ + for (const auto size : + {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, usize}; + gko::kernels::cuda::components::fill_seq_array( + exec, ref_data.get_data(), usize); + + test_1_work_per_chunk<<<1, gko::kernels::cuda::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_3_work_per_chunk(int* output, int* size_once, bool* err, + int size) +{ + gko::kernels::cuda::load_balance_subwarp< + gko::kernels::cuda::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return 3; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || work >= 3) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + global_pos, -1, 3 * chunk + work); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith3WorkPerChunk) +{ + for (const auto size : {0, 1, 2, 3, 10, 11, 19, 20, 21, 30, 31, 32, 33, 34, + 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, 3 * usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, 3 * usize}; + gko::kernels::cuda::components::fill_seq_array( + exec, ref_data.get_data(), 3 * usize); + + test_3_work_per_chunk<<<1, gko::kernels::cuda::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_dynamic_work_per_chunk(const int* chunk_sizes, int size, + int* output_chunk, int* output_work, + int* size_once, bool* err) +{ + gko::kernels::cuda::load_balance_subwarp< + gko::kernels::cuda::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return chunk_sizes[chunk]; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || + work >= chunk_sizes[chunk]) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output_chunk + global_pos, -1, 
chunk); + int prev2 = atomicCAS(output_work + global_pos, -1, work); + if (prev != -1 || prev2 != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunkNonempty) +{ + std::default_random_engine rng{123468}; + std::uniform_int_distribution dist{1, 100}; + for (const auto size : + {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 1000}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array chunk_sizes{this->ref, usize}; + std::generate_n(chunk_sizes.get_data(), size, + [&] { return dist(rng); }); + const auto total_work = static_cast(std::reduce( + chunk_sizes.get_const_data(), chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, usize}; + gko::array dchunk_sizes{this->exec, chunk_sizes}; + ddata_chunk.fill(-1); + ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : gko::irange{chunk_size}) { + data_chunk.get_data()[i] = chunk; + data_work.get_data()[i] = work; + i++; + } + } + + test_dynamic_work_per_chunk<<<1, + gko::kernels::cuda::config::warp_size>>>( + dchunk_sizes.get_data(), size, ddata_chunk.get_data(), + ddata_work.get_data(), dsize_once.get_data(), derr.get_data()); + + GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk); + GKO_ASSERT_ARRAY_EQ(ddata_work, data_work); + ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data())); + } +} + + +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunkEmptyAllowed) +{ + // empty chunks aligned with warp size + std::vector empty_aligned(32, 0); + // empty chunks not aligned with warp size + std::vector empty_unaligned(33, 0); + // some empty chunks, not enough work initially + std::vector incomplete{0, 0, 31, 0, 0}; + // some empty chunks, not enough work in second iteration + std::vector incomplete2{0, 0, 33, 0, 0}; + // large gaps, never enough work + std::vector gaps_incomplete(128); + gaps_incomplete[4] = 31; + gaps_incomplete[4 + 32] = 31; + gaps_incomplete[4 + 64] = 31; + gaps_incomplete[4 + 96] = 31; + // large gaps, enough work + std::vector gaps_complete(128); + gaps_incomplete[4] = 32; + gaps_incomplete[4 + 32] = 32; + gaps_incomplete[4 + 64] = 32; + gaps_incomplete[4 + 96] = 32; + // large gaps, chunks overlap between iterations + std::vector gaps_overlap(96); + gaps_overlap[31] = 63; + gaps_overlap[31 + 32] = 32; + gaps_overlap[31 + 64] = 33; + // large gaps, work at the end of the chunk + std::vector gaps_complete_end(96); + gaps_overlap[31] = 64; + gaps_overlap[31 + 32] = 32; + gaps_overlap[31 + 64] = 32; + + for (auto input : + {empty_aligned, empty_unaligned, incomplete, incomplete2}) { + gko::array chunk_sizes{this->ref, input.begin(), input.end()}; + auto size = static_cast(input.size()); + const auto total_work = static_cast(std::reduce( + chunk_sizes.get_const_data(), chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, static_cast(size)}; + gko::array dchunk_sizes{this->exec, 
chunk_sizes}; + ddata_chunk.fill(-1); + ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : gko::irange{chunk_size}) { + data_chunk.get_data()[i] = chunk; + data_work.get_data()[i] = work; + i++; + } + } + + test_dynamic_work_per_chunk<<<1, + gko::kernels::cuda::config::warp_size>>>( + dchunk_sizes.get_data(), size, ddata_chunk.get_data(), + ddata_work.get_data(), dsize_once.get_data(), derr.get_data()); + + GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk); + GKO_ASSERT_ARRAY_EQ(ddata_work, data_work); + ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data())); + } +} + + +} // namespace diff --git a/hip/test/components/CMakeLists.txt b/hip/test/components/CMakeLists.txt index 9bbd49608a7..780b0076c7f 100644 --- a/hip/test/components/CMakeLists.txt +++ b/hip/test/components/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_hip_test(cooperative_groups) +ginkgo_create_hip_test(load_balancing) ginkgo_create_hip_test(merging) ginkgo_create_hip_test(searching) ginkgo_create_hip_test(sorting) diff --git a/hip/test/components/load_balancing.hip.cpp b/hip/test/components/load_balancing.hip.cpp new file mode 100644 index 00000000000..3ea38b09b50 --- /dev/null +++ b/hip/test/components/load_balancing.hip.cpp @@ -0,0 +1,295 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "common/cuda_hip/components/load_balancing.hpp" + +#include +#include +#include + +#include + +#include +#include + +#include "core/base/index_range.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class LoadBalancing : public HipTestFixture { +protected: +}; + + +__global__ void test_1_work_per_chunk(int* output, int* size_once, bool* err, + int size) +{ + gko::kernels::hip::load_balance_subwarp< + gko::kernels::hip::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return 1; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work != 0) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + if (global_pos != chunk) { + printf("incorrect global_pos %d %d\n", global_pos, chunk); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + chunk, -1, chunk); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith1WorkPerChunk) +{ + for (const auto size : {0, 1, 2, 3, 7, 8, 63, 64, 65, 127, 128, 129, 200}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, usize}; + gko::kernels::hip::components::fill_seq_array(exec, ref_data.get_data(), + usize); + + test_1_work_per_chunk<<<1, gko::kernels::hip::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_3_work_per_chunk(int* output, int* size_once, bool* err, 
+ int size) +{ + gko::kernels::hip::load_balance_subwarp< + gko::kernels::hip::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return 3; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || work >= 3) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + global_pos, -1, 3 * chunk + work); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith3WorkPerChunk) +{ + for (const auto size : {0, 1, 2, 3, 21, 22, 42, 43, 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, 3 * usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, 3 * usize}; + gko::kernels::hip::components::fill_seq_array(exec, ref_data.get_data(), + 3 * usize); + + test_3_work_per_chunk<<<1, gko::kernels::hip::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_dynamic_work_per_chunk(const int* chunk_sizes, int size, + int* output_chunk, int* output_work, + int* size_once, bool* err) +{ + gko::kernels::hip::load_balance_subwarp< + gko::kernels::hip::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return chunk_sizes[chunk]; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || + work >= chunk_sizes[chunk]) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output_chunk + global_pos, -1, chunk); + int prev2 = atomicCAS(output_work + global_pos, -1, work); + if (prev != -1 || prev2 != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunk) +{ + std::default_random_engine rng{123468}; + std::uniform_int_distribution dist{1, 100}; + for (const auto size : + {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array chunk_sizes{this->ref, usize}; + std::generate_n(chunk_sizes.get_data(), size, + [&] { return dist(rng); }); + const auto total_work = static_cast(std::reduce( + chunk_sizes.get_const_data(), chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, usize}; + gko::array dchunk_sizes{this->exec, chunk_sizes}; + ddata_chunk.fill(-1); + ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : 
gko::irange{chunk_size}) { + data_chunk.get_data()[i] = chunk; + data_work.get_data()[i] = work; + i++; + } + } + + test_dynamic_work_per_chunk<<<1, + gko::kernels::hip::config::warp_size>>>( + dchunk_sizes.get_data(), size, ddata_chunk.get_data(), + ddata_work.get_data(), dsize_once.get_data(), derr.get_data()); + + GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk); + GKO_ASSERT_ARRAY_EQ(ddata_work, data_work); + ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data())); + } +} + + +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunkEmptyAllowed) +{ + // empty chunks aligned with warp size + std::vector empty_aligned(64, 0); + // empty chunks not aligned with warp size + std::vector empty_unaligned(65, 0); + // some empty chunks, not enough work initially + std::vector incomplete{0, 0, 63, 0, 0}; + // some empty chunks, not enough work in second iteration + std::vector incomplete2{0, 0, 65, 0, 0}; + // large gaps, never enough work + std::vector gaps_incomplete(128); + gaps_incomplete[4] = 31; + gaps_incomplete[4 + 64] = 31; + // large gaps, enough work + std::vector gaps_complete(128); + gaps_incomplete[4] = 64; + gaps_incomplete[4 + 64] = 64; + // large gaps, chunks overlap between iterations + std::vector gaps_overlap(96); + gaps_overlap[31] = 127; + gaps_overlap[31 + 64] = 64; + // large gaps, work at the end of the chunk + std::vector gaps_complete_end(96); + gaps_overlap[31] = 128; + gaps_overlap[31 + 64] = 64; + + for (auto input : + {empty_aligned, empty_unaligned, incomplete, incomplete2}) { + gko::array chunk_sizes{this->ref, input.begin(), input.end()}; + auto size = static_cast(input.size()); + const auto total_work = static_cast(std::reduce( + chunk_sizes.get_const_data(), chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, static_cast(size)}; + gko::array dchunk_sizes{this->exec, chunk_sizes}; + ddata_chunk.fill(-1); + ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : gko::irange{chunk_size}) { + data_chunk.get_data()[i] = chunk; + data_work.get_data()[i] = work; + i++; + } + } + + test_dynamic_work_per_chunk<<<1, + gko::kernels::hip::config::warp_size>>>( + dchunk_sizes.get_data(), size, ddata_chunk.get_data(), + ddata_work.get_data(), dsize_once.get_data(), derr.get_data()); + + GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk); + GKO_ASSERT_ARRAY_EQ(ddata_work, data_work); + ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data())); + } +} + + +} // namespace
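
For context, this is how a kernel might use the new primitive outside the unit tests. The sketch below is hypothetical and not part of this PR: row_ptrs, col_idxs, visit_count and count_column_visits are made-up names, and the kernel is assumed to be launched with exactly config::warp_size threads in a single block, as in the tests above, since every lane of the subwarp must participate in the call.

// Sketch only: one warp visits every nonzero of a CSR-like structure, with
// the nonzeros spread evenly over the lanes regardless of row lengths.
__global__ void count_column_visits(const int* row_ptrs, const int* col_idxs,
                                    int num_rows, int* visit_count)
{
    gko::kernels::GKO_DEVICE_NAMESPACE::load_balance_subwarp<
        gko::kernels::GKO_DEVICE_NAMESPACE::config::warp_size>(
        num_rows,
        // chunk i is row i, its work count is the number of nonzeros in it
        [&](int row) { return row_ptrs[row + 1] - row_ptrs[row]; },
        // called once per nonzero: work is the position inside the row,
        // global_pos the position in the whole nonzero sequence
        [&](int row, int work, int global_pos) {
            atomicAdd(visit_count + col_idxs[row_ptrs[row] + work], 1);
        });
}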