From 490015af5430c7058ad17183abdcfc095465dd24 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 29 Jan 2025 13:51:10 +0100 Subject: [PATCH 1/3] wip load balancing abstraction --- common/cuda_hip/components/load_balancing.hpp | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 common/cuda_hip/components/load_balancing.hpp diff --git a/common/cuda_hip/components/load_balancing.hpp b/common/cuda_hip/components/load_balancing.hpp new file mode 100644 index 00000000000..ba62e3e09b3 --- /dev/null +++ b/common/cuda_hip/components/load_balancing.hpp @@ -0,0 +1,84 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_LOAD_BALANCING_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_LOAD_BALANCING_HPP_ + + +#include + +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + +/** + * @internal + * Distributes work evenly inside a subwarp, making sure that no thread is idle + * as long as possible. The work is described by chunks (id, work_count), where + * work_count describes the number of work items in this chunk. + * + * @param chunk_count how many total chunks of work are there? + * @param work_count functor work_count(i) should return how many work items + * are in the i'th chunk, at least 1. + * It will only be executed once for each i. + * @param op the operation to execute for a single work item, called + * via op(chunk_id, work_item_id) + * @param subwarp the subwarp executing this function + * + * @tparam the index type used to address individual chunks and work items. + * it needs to be able to represent the total number of work items + * without overflow. + */ +template +__forceinline__ __device__ void load_balance_swarp(IndexType chunk_count, + WorkCountFunctor work_count, + Op op) +{ + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + static_assert(std::is_same_v, + "work_count needs to return IndexType"); + IndexType chunk_base{}; + IndexType work_base{}; + const auto lane = subwarp.threadd_rank(); + while (chunk_base < chunk_count) { + const auto in_chunk = chunk_base + lane; + const auto local_work = + in_chunk < chunk_count ? 
work_count(in_chunk) : IndexType{}; + assert(local_work > 0); + IndexType work_prefix_sum{}; + // inclusive prefix sum over work tells us where each chunk begins + subwarp_prefix_sum(local_work, work_prefix_sum, subwarp); + // binary search over this prefix sum tells us which chunk each thread + // works in + const auto local_work_pos = work_base + lane; + const auto local_chunk = + synchronous_fixed_binary_search([&](int i) { + return local_work_pos < subwarp.shfl(work_prefix_sum, i); + }); + auto local_chunk_work_base = + subwarp.shfl(work_prefix_sum - local_work, local_chunk); + const auto chunk = chunk_base + local_chunk; + op(chunk, local_work_pos - local_chunk_work_base); + const auto last_chunk = subwarp.shfl(in_chunk, subwarp_size - 1); + const auto last_chunk_size = + subwarp.shfl(work_prefix_sum, subwarp_size - 1); + work_base += subwarp_size; + } +} + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_ From 1f28b410ae3d3bcc82370fe50c6c3aded17be974 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 29 Jan 2025 23:28:23 +0100 Subject: [PATCH 2/3] add warp load balancing primitive --- common/cuda_hip/components/load_balancing.hpp | 65 +++-- core/base/array.cpp | 1 + cuda/test/components/CMakeLists.txt | 1 + cuda/test/components/load_balancing.cu | 231 ++++++++++++++++++ hip/test/components/CMakeLists.txt | 1 + hip/test/components/load_balancing.hip.cpp | 229 +++++++++++++++++ 6 files changed, 510 insertions(+), 18 deletions(-) create mode 100644 cuda/test/components/load_balancing.cu create mode 100644 hip/test/components/load_balancing.hip.cpp diff --git a/common/cuda_hip/components/load_balancing.hpp b/common/cuda_hip/components/load_balancing.hpp index ba62e3e09b3..e48d94c0f63 100644 --- a/common/cuda_hip/components/load_balancing.hpp +++ b/common/cuda_hip/components/load_balancing.hpp @@ -5,9 +5,6 @@ #ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_LOAD_BALANCING_HPP_ #define GKO_COMMON_CUDA_HIP_COMPONENTS_LOAD_BALANCING_HPP_ - -#include - #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/prefix_sum.hpp" #include "common/cuda_hip/components/searching.hpp" @@ -37,26 +34,23 @@ namespace GKO_DEVICE_NAMESPACE { * without overflow. */ template -__forceinline__ __device__ void load_balance_swarp(IndexType chunk_count, - WorkCountFunctor work_count, - Op op) + typename OpFunctor> +__forceinline__ __device__ void load_balance_subwarp_nonempty( + IndexType chunk_count, WorkCountFunctor work_count, OpFunctor op) { const auto subwarp = group::tiled_partition(group::this_thread_block()); static_assert(std::is_same_v, "work_count needs to return IndexType"); + const auto lane = subwarp.thread_rank(); IndexType chunk_base{}; IndexType work_base{}; - const auto lane = subwarp.threadd_rank(); + IndexType local_work = lane < chunk_count ? work_count(lane) : IndexType{1}; + // inclusive prefix sum over work tells us where each chunk begins + IndexType work_prefix_sum{}; + subwarp_prefix_sum(local_work, work_prefix_sum, subwarp); while (chunk_base < chunk_count) { - const auto in_chunk = chunk_base + lane; - const auto local_work = - in_chunk < chunk_count ? 
work_count(in_chunk) : IndexType{}; assert(local_work > 0); - IndexType work_prefix_sum{}; - // inclusive prefix sum over work tells us where each chunk begins - subwarp_prefix_sum(local_work, work_prefix_sum, subwarp); // binary search over this prefix sum tells us which chunk each thread // works in const auto local_work_pos = work_base + lane; @@ -64,14 +58,49 @@ __forceinline__ __device__ void load_balance_swarp(IndexType chunk_count, synchronous_fixed_binary_search([&](int i) { return local_work_pos < subwarp.shfl(work_prefix_sum, i); }); + assert(local_chunk < subwarp_size); auto local_chunk_work_base = subwarp.shfl(work_prefix_sum - local_work, local_chunk); const auto chunk = chunk_base + local_chunk; - op(chunk, local_work_pos - local_chunk_work_base); - const auto last_chunk = subwarp.shfl(in_chunk, subwarp_size - 1); - const auto last_chunk_size = - subwarp.shfl(work_prefix_sum, subwarp_size - 1); + // do the work inside this chunk + if (chunk < chunk_count) { + op(chunk, local_work_pos - local_chunk_work_base, local_work_pos); + } + const auto last_local_chunk = + subwarp.shfl(local_chunk, subwarp_size - 1); + const auto last_local_chunk_end = + subwarp.shfl(work_prefix_sum, last_local_chunk); + assert(last_local_chunk < subwarp_size); + assert(last_local_chunk_end > local_work_pos); work_base += subwarp_size; + // how many chunks have we completed? The last one is completed if its + // end matches work_base after the update + const auto chunk_advance = + last_local_chunk + (last_local_chunk_end == work_base ? 1 : 0); + chunk_base += chunk_advance; + // shift down local_work and work_prefix_sum, + // adding new values when necessary + local_work = subwarp.shfl_down(local_work, chunk_advance); + // find the last value of the prefix sum and remember it for later + const auto work_prefix_sum_end = + subwarp.shfl(work_prefix_sum, subwarp_size - 1); + // this shuffle leaves the trailing elements unchaged, we need to + // overwrite them later + work_prefix_sum = subwarp.shfl_down(work_prefix_sum, chunk_advance); + IndexType work_prefix_sum_add{}; + if (lane >= subwarp_size - chunk_advance) { + const auto in_chunk = chunk_base + lane; + // load new work counters at the end + local_work = + in_chunk < chunk_count ? 
work_count(in_chunk) : IndexType{1}; + work_prefix_sum_add = local_work; + // fill the trailing work_prefix_sum with the last element + work_prefix_sum = work_prefix_sum_end; + } + // compute a prefix sum over new chunks and add to the prefix sum + subwarp_prefix_sum(work_prefix_sum_add, work_prefix_sum_add, + subwarp); + work_prefix_sum += work_prefix_sum_add; } } diff --git a/core/base/array.cpp b/core/base/array.cpp index 51fa4b34bb1..da1accc1ca1 100644 --- a/core/base/array.cpp +++ b/core/base/array.cpp @@ -91,6 +91,7 @@ ValueType reduce_add(const array& input_arr, #define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value) GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL); +template GKO_DECLARE_ARRAY_FILL(bool); template GKO_DECLARE_ARRAY_FILL(uint16); template GKO_DECLARE_ARRAY_FILL(uint32); #ifndef GKO_SIZE_T_IS_UINT64_T diff --git a/cuda/test/components/CMakeLists.txt b/cuda/test/components/CMakeLists.txt index 89e374431c4..6fe81ea0635 100644 --- a/cuda/test/components/CMakeLists.txt +++ b/cuda/test/components/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_cuda_test(cooperative_groups) +ginkgo_create_cuda_test(load_balancing) ginkgo_create_cuda_test(merging) ginkgo_create_cuda_test(searching) ginkgo_create_cuda_test(sorting) diff --git a/cuda/test/components/load_balancing.cu b/cuda/test/components/load_balancing.cu new file mode 100644 index 00000000000..7ea5930278f --- /dev/null +++ b/cuda/test/components/load_balancing.cu @@ -0,0 +1,231 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "common/cuda_hip/components/load_balancing.hpp" + +#include +#include +#include + +#include + +#include +#include + +#include "core/base/index_range.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "cuda/test/utils.hpp" + + +namespace { + + +class LoadBalancing : public CudaTestFixture { +protected: +}; + + +__global__ void test_1_work_per_chunk(int* output, int* size_once, bool* err, + int size) +{ + gko::kernels::cuda::load_balance_subwarp_nonempty< + gko::kernels::cuda::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return 1; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work != 0) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + if (global_pos != chunk) { + printf("incorrect global_pos %d %d\n", global_pos, chunk); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + chunk, -1, chunk); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith1WorkPerChunk) +{ + for (const auto size : + {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, usize}; + gko::kernels::cuda::components::fill_seq_array( + exec, ref_data.get_data(), usize); + + test_1_work_per_chunk<<<1, gko::kernels::cuda::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + 
ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_3_work_per_chunk(int* output, int* size_once, bool* err, + int size) +{ + gko::kernels::cuda::load_balance_subwarp_nonempty< + gko::kernels::cuda::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return 3; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || work >= 3) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + global_pos, -1, 3 * chunk + work); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith3WorkPerChunk) +{ + for (const auto size : {0, 1, 2, 3, 10, 11, 19, 20, 21, 30, 31, 32, 33, 34, + 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, 3 * usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, 3 * usize}; + gko::kernels::cuda::components::fill_seq_array( + exec, ref_data.get_data(), 3 * usize); + + test_3_work_per_chunk<<<1, gko::kernels::cuda::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_dynamic_work_per_chunk(const int* chunk_sizes, int size, + int* output_chunk, int* output_work, + int* size_once, bool* err) +{ + gko::kernels::cuda::load_balance_subwarp_nonempty< + gko::kernels::cuda::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return chunk_sizes[chunk]; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || + work >= chunk_sizes[chunk]) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output_chunk + global_pos, -1, chunk); + int prev2 = atomicCAS(output_work + global_pos, -1, work); + if (prev != -1 || prev2 != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunk) +{ + std::default_random_engine rng{123468}; + std::uniform_int_distribution dist{1, 100}; + for (const auto size : + {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array chunk_sizes{this->ref, usize}; + std::generate_n(chunk_sizes.get_data(), size, + [&] { return dist(rng); }); + const auto total_work = static_cast(std::reduce( + chunk_sizes.get_const_data(), chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, usize}; + gko::array dchunk_sizes{this->exec, chunk_sizes}; + ddata_chunk.fill(-1); + 
ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : gko::irange{chunk_size}) { + data_chunk.get_data()[i] = chunk; + data_work.get_data()[i] = work; + i++; + } + } + + test_dynamic_work_per_chunk<<<1, + gko::kernels::cuda::config::warp_size>>>( + dchunk_sizes.get_data(), size, ddata_chunk.get_data(), + ddata_work.get_data(), dsize_once.get_data(), derr.get_data()); + + GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk); + GKO_ASSERT_ARRAY_EQ(ddata_work, data_work); + ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data())); + } +} + + +} // namespace diff --git a/hip/test/components/CMakeLists.txt b/hip/test/components/CMakeLists.txt index 9bbd49608a7..780b0076c7f 100644 --- a/hip/test/components/CMakeLists.txt +++ b/hip/test/components/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_hip_test(cooperative_groups) +ginkgo_create_hip_test(load_balancing) ginkgo_create_hip_test(merging) ginkgo_create_hip_test(searching) ginkgo_create_hip_test(sorting) diff --git a/hip/test/components/load_balancing.hip.cpp b/hip/test/components/load_balancing.hip.cpp new file mode 100644 index 00000000000..daedff51af3 --- /dev/null +++ b/hip/test/components/load_balancing.hip.cpp @@ -0,0 +1,229 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "common/cuda_hip/components/load_balancing.hpp" + +#include +#include +#include + +#include + +#include +#include + +#include "core/base/index_range.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class LoadBalancing : public HipTestFixture { +protected: +}; + + +__global__ void test_1_work_per_chunk(int* output, int* size_once, bool* err, + int size) +{ + gko::kernels::hip::load_balance_subwarp_nonempty< + gko::kernels::hip::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return 1; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work != 0) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + if (global_pos != chunk) { + printf("incorrect global_pos %d %d\n", global_pos, chunk); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + chunk, -1, chunk); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith1WorkPerChunk) +{ + for (const auto size : {0, 1, 2, 3, 7, 8, 63, 64, 65, 127, 128, 129, 200}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, usize}; + gko::kernels::hip::components::fill_seq_array(exec, ref_data.get_data(), + usize); + + test_1_work_per_chunk<<<1, gko::kernels::hip::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_3_work_per_chunk(int* output, int* size_once, bool* err, + int size) +{ + 
gko::kernels::hip::load_balance_subwarp_nonempty< + gko::kernels::hip::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return 3; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || work >= 3) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + global_pos, -1, 3 * chunk + work); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith3WorkPerChunk) +{ + for (const auto size : {0, 1, 2, 3, 21, 22, 42, 43, 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, 3 * usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, 3 * usize}; + gko::kernels::hip::components::fill_seq_array(exec, ref_data.get_data(), + 3 * usize); + + test_3_work_per_chunk<<<1, gko::kernels::hip::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_dynamic_work_per_chunk(const int* chunk_sizes, int size, + int* output_chunk, int* output_work, + int* size_once, bool* err) +{ + gko::kernels::hip::load_balance_subwarp_nonempty< + gko::kernels::hip::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return chunk_sizes[chunk]; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || + work >= chunk_sizes[chunk]) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output_chunk + global_pos, -1, chunk); + int prev2 = atomicCAS(output_work + global_pos, -1, work); + if (prev != -1 || prev2 != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunk) +{ + std::default_random_engine rng{123468}; + std::uniform_int_distribution dist{1, 100}; + for (const auto size : + {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array chunk_sizes{this->ref, usize}; + std::generate_n(chunk_sizes.get_data(), size, + [&] { return dist(rng); }); + const auto total_work = static_cast(std::reduce( + chunk_sizes.get_const_data(), chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, usize}; + gko::array dchunk_sizes{this->exec, chunk_sizes}; + ddata_chunk.fill(-1); + ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : 
gko::irange{chunk_size}) {
+                data_chunk.get_data()[i] = chunk;
+                data_work.get_data()[i] = work;
+                i++;
+            }
+        }
+
+        test_dynamic_work_per_chunk<<<1,
+                                     gko::kernels::hip::config::warp_size>>>(
+            dchunk_sizes.get_data(), size, ddata_chunk.get_data(),
+            ddata_work.get_data(), dsize_once.get_data(), derr.get_data());
+
+        GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk);
+        GKO_ASSERT_ARRAY_EQ(ddata_work, data_work);
+        ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data()));
+    }
+}
+
+
+}  // namespace

From 7085f1322f2344a53508ba506483971d01d9ab33 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Fri, 18 Apr 2025 16:24:37 +0200
Subject: [PATCH 3/3] allow empty inputs in warp load balancing

---
 common/cuda_hip/components/load_balancing.hpp | 65 ++++++++------
 cuda/test/components/load_balancing.cu        | 82 +++++++++++++++++--
 hip/test/components/load_balancing.hip.cpp    | 72 +++++++++++++++-
 3 files changed, 183 insertions(+), 36 deletions(-)

diff --git a/common/cuda_hip/components/load_balancing.hpp b/common/cuda_hip/components/load_balancing.hpp
index e48d94c0f63..ff053ac42cf 100644
--- a/common/cuda_hip/components/load_balancing.hpp
+++ b/common/cuda_hip/components/load_balancing.hpp
@@ -23,7 +23,7 @@ namespace GKO_DEVICE_NAMESPACE {
  *
  * @param chunk_count how many total chunks of work are there?
  * @param work_count functor work_count(i) should return how many work items
- *                   are in the i'th chunk, at least 1.
+ *                   are in the i'th chunk.
  *                   It will only be executed once for each i.
  * @param op the operation to execute for a single work item, called
  *           via op(chunk_id, work_item_id)
@@ -35,7 +35,7 @@ namespace GKO_DEVICE_NAMESPACE {
  */
 template <int subwarp_size, typename IndexType, typename WorkCountFunctor,
           typename OpFunctor>
-__forceinline__ __device__ void load_balance_subwarp_nonempty(
+__forceinline__ __device__ void load_balance_subwarp(
    IndexType chunk_count, WorkCountFunctor work_count, OpFunctor op)
 {
     const auto subwarp =
@@ -50,7 +50,7 @@ __forceinline__ __device__ void load_balance_subwarp_nonempty(
     IndexType work_prefix_sum{};
     subwarp_prefix_sum(local_work, work_prefix_sum, subwarp);
     while (chunk_base < chunk_count) {
-        assert(local_work > 0);
+        assert(local_work >= 0);
         // binary search over this prefix sum tells us which chunk each thread
         // works in
         const auto local_work_pos = work_base + lane;
@@ -58,45 +58,54 @@ __forceinline__ __device__ void load_balance_subwarp_nonempty(
             synchronous_fixed_binary_search([&](int i) {
                 return local_work_pos < subwarp.shfl(work_prefix_sum, i);
             });
-        assert(local_chunk < subwarp_size);
-        auto local_chunk_work_base =
-            subwarp.shfl(work_prefix_sum - local_work, local_chunk);
+        // guard against out-of-bounds shuffle
+        auto local_chunk_work_base = subwarp.shfl(
+            work_prefix_sum - local_work, min(local_chunk, subwarp_size - 1));
         const auto chunk = chunk_base + local_chunk;
         // do the work inside this chunk
-        if (chunk < chunk_count) {
+        if (chunk < chunk_count && local_chunk < subwarp_size) {
             op(chunk, local_work_pos - local_chunk_work_base, local_work_pos);
         }
         const auto last_local_chunk =
             subwarp.shfl(local_chunk, subwarp_size - 1);
-        const auto last_local_chunk_end =
-            subwarp.shfl(work_prefix_sum, last_local_chunk);
-        assert(last_local_chunk < subwarp_size);
-        assert(last_local_chunk_end > local_work_pos);
-        work_base += subwarp_size;
-        // how many chunks have we completed? The last one is completed if its
-        // end matches work_base after the update
-        const auto chunk_advance =
-            last_local_chunk + (last_local_chunk_end == work_base ? 1 : 0);
-        chunk_base += chunk_advance;
-        // shift down local_work and work_prefix_sum,
-        // adding new values when necessary
-        local_work = subwarp.shfl_down(local_work, chunk_advance);
         // find the last value of the prefix sum and remember it for later
         const auto work_prefix_sum_end =
             subwarp.shfl(work_prefix_sum, subwarp_size - 1);
-        // this shuffle leaves the trailing elements unchaged, we need to
-        // overwrite them later
-        work_prefix_sum = subwarp.shfl_down(work_prefix_sum, chunk_advance);
         IndexType work_prefix_sum_add{};
-        if (lane >= subwarp_size - chunk_advance) {
+        if (last_local_chunk == subwarp_size) {
+            // if we didn't have enough work to do: all local chunks are
+            // completed
+            chunk_base += subwarp_size;
+            work_prefix_sum = work_prefix_sum_end;
             const auto in_chunk = chunk_base + lane;
-            // load new work counters at the end
             local_work =
-                in_chunk < chunk_count ? work_count(in_chunk) : IndexType{1};
+                in_chunk < chunk_count ? work_count(in_chunk) : IndexType{};
             work_prefix_sum_add = local_work;
-            // fill the trailing work_prefix_sum with the last element
-            work_prefix_sum = work_prefix_sum_end;
+        } else {
+            const auto last_local_chunk_end =
+                subwarp.shfl(work_prefix_sum, last_local_chunk);
+            // how many chunks have we completed? The last one is completed if
+            // its end matches work_base after the update
+            const auto chunk_advance =
+                last_local_chunk + (last_local_chunk_end == work_base ? 1 : 0);
+            chunk_base += chunk_advance;
+            // shift down local_work and work_prefix_sum,
+            // adding new values when necessary
+            local_work = subwarp.shfl_down(local_work, chunk_advance);
+            // this shuffle leaves the trailing elements unchanged, we need to
+            // overwrite them later
+            work_prefix_sum = subwarp.shfl_down(work_prefix_sum, chunk_advance);
+            if (lane >= subwarp_size - chunk_advance) {
+                const auto in_chunk = chunk_base + lane;
+                // load new work counters at the end
+                local_work =
+                    in_chunk < chunk_count ?
work_count(in_chunk) : IndexType{}; + work_prefix_sum_add = local_work; + // fill the trailing work_prefix_sum with the last element + work_prefix_sum = work_prefix_sum_end; + } } + work_base += popcnt(subwarp.ballot(local_chunk < subwarp_size)); // compute a prefix sum over new chunks and add to the prefix sum subwarp_prefix_sum(work_prefix_sum_add, work_prefix_sum_add, subwarp); diff --git a/cuda/test/components/load_balancing.cu b/cuda/test/components/load_balancing.cu index 7ea5930278f..c871f286fec 100644 --- a/cuda/test/components/load_balancing.cu +++ b/cuda/test/components/load_balancing.cu @@ -29,7 +29,7 @@ protected: __global__ void test_1_work_per_chunk(int* output, int* size_once, bool* err, int size) { - gko::kernels::cuda::load_balance_subwarp_nonempty< + gko::kernels::cuda::load_balance_subwarp< gko::kernels::cuda::config::warp_size>( size, [&](int chunk) { @@ -91,7 +91,7 @@ TEST_F(LoadBalancing, WorksWith1WorkPerChunk) __global__ void test_3_work_per_chunk(int* output, int* size_once, bool* err, int size) { - gko::kernels::cuda::load_balance_subwarp_nonempty< + gko::kernels::cuda::load_balance_subwarp< gko::kernels::cuda::config::warp_size>( size, [&](int chunk) { @@ -150,7 +150,7 @@ __global__ void test_dynamic_work_per_chunk(const int* chunk_sizes, int size, int* output_chunk, int* output_work, int* size_once, bool* err) { - gko::kernels::cuda::load_balance_subwarp_nonempty< + gko::kernels::cuda::load_balance_subwarp< gko::kernels::cuda::config::warp_size>( size, [&](int chunk) { @@ -182,12 +182,12 @@ __global__ void test_dynamic_work_per_chunk(const int* chunk_sizes, int size, } -TEST_F(LoadBalancing, WorksWithRandomWorkPerChunk) +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunkNonempty) { std::default_random_engine rng{123468}; std::uniform_int_distribution dist{1, 100}; for (const auto size : - {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 100}) { + {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 1000}) { SCOPED_TRACE(size); const auto usize = static_cast(size); gko::array chunk_sizes{this->ref, usize}; @@ -228,4 +228,76 @@ TEST_F(LoadBalancing, WorksWithRandomWorkPerChunk) } +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunkEmptyAllowed) +{ + // empty chunks aligned with warp size + std::vector empty_aligned(32, 0); + // empty chunks not aligned with warp size + std::vector empty_unaligned(33, 0); + // some empty chunks, not enough work initially + std::vector incomplete{0, 0, 31, 0, 0}; + // some empty chunks, not enough work in second iteration + std::vector incomplete2{0, 0, 33, 0, 0}; + // large gaps, never enough work + std::vector gaps_incomplete(128); + gaps_incomplete[4] = 31; + gaps_incomplete[4 + 32] = 31; + gaps_incomplete[4 + 64] = 31; + gaps_incomplete[4 + 96] = 31; + // large gaps, enough work + std::vector gaps_complete(128); + gaps_incomplete[4] = 32; + gaps_incomplete[4 + 32] = 32; + gaps_incomplete[4 + 64] = 32; + gaps_incomplete[4 + 96] = 32; + // large gaps, chunks overlap between iterations + std::vector gaps_overlap(96); + gaps_overlap[31] = 63; + gaps_overlap[31 + 32] = 32; + gaps_overlap[31 + 64] = 33; + // large gaps, work at the end of the chunk + std::vector gaps_complete_end(96); + gaps_overlap[31] = 64; + gaps_overlap[31 + 32] = 32; + gaps_overlap[31 + 64] = 32; + + for (auto input : + {empty_aligned, empty_unaligned, incomplete, incomplete2}) { + gko::array chunk_sizes{this->ref, input.begin(), input.end()}; + auto size = static_cast(input.size()); + const auto total_work = static_cast(std::reduce( + chunk_sizes.get_const_data(), 
chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, static_cast(size)}; + gko::array dchunk_sizes{this->exec, chunk_sizes}; + ddata_chunk.fill(-1); + ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : gko::irange{chunk_size}) { + data_chunk.get_data()[i] = chunk; + data_work.get_data()[i] = work; + i++; + } + } + + test_dynamic_work_per_chunk<<<1, + gko::kernels::cuda::config::warp_size>>>( + dchunk_sizes.get_data(), size, ddata_chunk.get_data(), + ddata_work.get_data(), dsize_once.get_data(), derr.get_data()); + + GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk); + GKO_ASSERT_ARRAY_EQ(ddata_work, data_work); + ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data())); + } +} + + } // namespace diff --git a/hip/test/components/load_balancing.hip.cpp b/hip/test/components/load_balancing.hip.cpp index daedff51af3..3ea38b09b50 100644 --- a/hip/test/components/load_balancing.hip.cpp +++ b/hip/test/components/load_balancing.hip.cpp @@ -29,7 +29,7 @@ class LoadBalancing : public HipTestFixture { __global__ void test_1_work_per_chunk(int* output, int* size_once, bool* err, int size) { - gko::kernels::hip::load_balance_subwarp_nonempty< + gko::kernels::hip::load_balance_subwarp< gko::kernels::hip::config::warp_size>( size, [&](int chunk) { @@ -90,7 +90,7 @@ TEST_F(LoadBalancing, WorksWith1WorkPerChunk) __global__ void test_3_work_per_chunk(int* output, int* size_once, bool* err, int size) { - gko::kernels::hip::load_balance_subwarp_nonempty< + gko::kernels::hip::load_balance_subwarp< gko::kernels::hip::config::warp_size>( size, [&](int chunk) { @@ -148,7 +148,7 @@ __global__ void test_dynamic_work_per_chunk(const int* chunk_sizes, int size, int* output_chunk, int* output_work, int* size_once, bool* err) { - gko::kernels::hip::load_balance_subwarp_nonempty< + gko::kernels::hip::load_balance_subwarp< gko::kernels::hip::config::warp_size>( size, [&](int chunk) { @@ -226,4 +226,70 @@ TEST_F(LoadBalancing, WorksWithRandomWorkPerChunk) } +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunkEmptyAllowed) +{ + // empty chunks aligned with warp size + std::vector empty_aligned(64, 0); + // empty chunks not aligned with warp size + std::vector empty_unaligned(65, 0); + // some empty chunks, not enough work initially + std::vector incomplete{0, 0, 63, 0, 0}; + // some empty chunks, not enough work in second iteration + std::vector incomplete2{0, 0, 65, 0, 0}; + // large gaps, never enough work + std::vector gaps_incomplete(128); + gaps_incomplete[4] = 31; + gaps_incomplete[4 + 64] = 31; + // large gaps, enough work + std::vector gaps_complete(128); + gaps_incomplete[4] = 64; + gaps_incomplete[4 + 64] = 64; + // large gaps, chunks overlap between iterations + std::vector gaps_overlap(96); + gaps_overlap[31] = 127; + gaps_overlap[31 + 64] = 64; + // large gaps, work at the end of the chunk + std::vector gaps_complete_end(96); + gaps_overlap[31] = 128; + gaps_overlap[31 + 64] = 64; + + for (auto input : + {empty_aligned, empty_unaligned, incomplete, incomplete2}) { + gko::array chunk_sizes{this->ref, input.begin(), input.end()}; + auto size = static_cast(input.size()); + const auto total_work = 
static_cast(std::reduce( + chunk_sizes.get_const_data(), chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, static_cast(size)}; + gko::array dchunk_sizes{this->exec, chunk_sizes}; + ddata_chunk.fill(-1); + ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : gko::irange{chunk_size}) { + data_chunk.get_data()[i] = chunk; + data_work.get_data()[i] = work; + i++; + } + } + + test_dynamic_work_per_chunk<<<1, + gko::kernels::hip::config::warp_size>>>( + dchunk_sizes.get_data(), size, ddata_chunk.get_data(), + ddata_work.get_data(), dsize_once.get_data(), derr.get_data()); + + GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk); + GKO_ASSERT_ARRAY_EQ(ddata_work, data_work); + ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data())); + } +} + + } // namespace
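
As a usage illustration (not part of the three patches above, and assuming the usual Ginkgo CUDA build setup that also provides config::warp_size): a single warp expands CSR-style row pointers into one row index per nonzero, one row per chunk, with empty rows allowed. The kernel name and the row_ptrs/out_row_idxs buffers are invented for this sketch; load_balance_subwarp and the op(chunk_id, work_item_id, global_work_item_id) call shape are taken from the header and tests in this series.

#include "common/cuda_hip/components/load_balancing.hpp"

// Hypothetical kernel: writes the row index of every nonzero, distributing
// the nonzeros of all rows evenly across a single warp.
__global__ void row_indices_kernel(const int* row_ptrs, int num_rows,
                                   int* out_row_idxs)
{
    constexpr auto warp_size = gko::kernels::cuda::config::warp_size;
    gko::kernels::cuda::load_balance_subwarp<warp_size>(
        num_rows,
        // work_count(row): number of nonzeros in this row, may be zero
        [&](int row) { return row_ptrs[row + 1] - row_ptrs[row]; },
        // op(chunk, work item, global work item): called once per nonzero
        [&](int row, int nz_in_row, int nz) { out_row_idxs[nz] = row; });
}

// launched with a single warp, analogous to the test kernels above:
// row_indices_kernel<<<1, gko::kernels::cuda::config::warp_size>>>(...);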
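
The mapping each warp-sized window computes (an inclusive prefix sum over the chunk sizes, then for every work item a search for the first prefix sum entry exceeding its position) can also be written as a short host-side model. This is likewise only an illustrative sketch, not code from the PR; it lists the (chunk, local work item, global work item) triples that op receives and shows why empty chunks are never selected.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Scalar model of the mapping computed by load_balance_subwarp: an inclusive
// prefix sum over the chunk sizes, then for every global work item the first
// chunk whose prefix sum exceeds its position. Empty chunks repeat the
// previous prefix sum value and can therefore never be selected.
void for_each_work_item(const std::vector<int>& chunk_sizes)
{
    std::vector<int> prefix_sum(chunk_sizes.size());
    int total{};
    for (std::size_t i = 0; i < chunk_sizes.size(); ++i) {
        total += chunk_sizes[i];
        prefix_sum[i] = total;
    }
    for (int pos = 0; pos < total; ++pos) {
        const auto it =
            std::upper_bound(prefix_sum.begin(), prefix_sum.end(), pos);
        const auto chunk = static_cast<int>(it - prefix_sum.begin());
        const auto chunk_begin = chunk == 0 ? 0 : prefix_sum[chunk - 1];
        // corresponds to op(chunk, pos - chunk_begin, pos) on the device
        std::cout << "chunk " << chunk << " work item " << pos - chunk_begin
                  << " global " << pos << '\n';
    }
}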