diff --git a/common/cuda_hip/components/load_balancing.hpp b/common/cuda_hip/components/load_balancing.hpp
new file mode 100644
index 00000000000..ff053ac42cf
--- /dev/null
+++ b/common/cuda_hip/components/load_balancing.hpp
@@ -0,0 +1,122 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_LOAD_BALANCING_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_LOAD_BALANCING_HPP_
+
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
+/**
+ * @internal
+ * Distributes work evenly inside a subwarp, ensuring that no thread sits idle
+ * as long as work items remain. The work is described by chunks
+ * (id, work_count), where work_count is the number of work items in that
+ * chunk.
+ *
+ * @param chunk_count  the total number of work chunks
+ * @param work_count  functor such that work_count(i) returns the number of
+ *                    work items in the i'th chunk.
+ *                    It is evaluated exactly once for each i.
+ * @param op  the operation to execute for a single work item, called via
+ *            op(chunk_id, work_item_id, global_work_item_id), where
+ *            work_item_id is the position inside the chunk and
+ *            global_work_item_id is the position in the overall sequence of
+ *            work items.
+ *
+ * @tparam subwarp_size  the size of the subwarp executing this function
+ * @tparam IndexType  the index type used to address individual chunks and
+ *                    work items. It needs to be able to represent the total
+ *                    number of work items without overflow.
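+ *
+ * A usage sketch (with hypothetical CSR-style names row_ptrs and
+ * process_entry, meant purely as an illustration): every thread of the
+ * subwarp calls
+ * @code
+ * load_balance_subwarp<config::warp_size>(
+ *     num_rows,
+ *     [&](int row) { return row_ptrs[row + 1] - row_ptrs[row]; },
+ *     [&](int row, int nz_in_row, int nz) { process_entry(row, nz_in_row); });
+ * @endcode
+ * which invokes process_entry once per nonzero, with the nonzeros distributed
+ * evenly over the subwarp regardless of how they are distributed over rows.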
+ */
+template <int subwarp_size, typename IndexType, typename WorkCountFunctor,
+          typename OpFunctor>
+__forceinline__ __device__ void load_balance_subwarp(
+    IndexType chunk_count, WorkCountFunctor work_count, OpFunctor op)
+{
+    const auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    static_assert(
+        std::is_same_v<std::invoke_result_t<WorkCountFunctor, IndexType>,
+                       IndexType>,
+        "work_count needs to return IndexType");
+    const auto lane = subwarp.thread_rank();
+    IndexType chunk_base{};
+    IndexType work_base{};
+    IndexType local_work =
+        lane < chunk_count ? work_count(lane) : IndexType{1};
+    // inclusive prefix sum over work tells us where each chunk begins
+    IndexType work_prefix_sum{};
+    subwarp_prefix_sum<true>(local_work, work_prefix_sum, subwarp);
+    while (chunk_base < chunk_count) {
+        assert(local_work >= 0);
+        // binary search over this prefix sum tells us which chunk each thread
+        // works in
+        const auto local_work_pos = work_base + lane;
+        const auto local_chunk =
+            synchronous_fixed_binary_search<subwarp_size>([&](int i) {
+                return local_work_pos < subwarp.shfl(work_prefix_sum, i);
+            });
+        // guard against out-of-bounds shuffle
+        auto local_chunk_work_base = subwarp.shfl(
+            work_prefix_sum - local_work, min(local_chunk, subwarp_size - 1));
+        const auto chunk = chunk_base + local_chunk;
+        // do the work inside this chunk
+        if (chunk < chunk_count && local_chunk < subwarp_size) {
+            op(chunk, local_work_pos - local_chunk_work_base, local_work_pos);
+        }
+        const auto last_local_chunk =
+            subwarp.shfl(local_chunk, subwarp_size - 1);
+        // find the last value of the prefix sum and remember it for later
+        const auto work_prefix_sum_end =
+            subwarp.shfl(work_prefix_sum, subwarp_size - 1);
+        IndexType work_prefix_sum_add{};
+        if (last_local_chunk == subwarp_size) {
+            // if we didn't have enough work to do: all local chunks are
+            // completed
+            chunk_base += subwarp_size;
+            work_prefix_sum = work_prefix_sum_end;
+            const auto in_chunk = chunk_base + lane;
+            local_work =
+                in_chunk < chunk_count ? work_count(in_chunk) : IndexType{};
+            work_prefix_sum_add = local_work;
+        } else {
+            const auto last_local_chunk_end =
+                subwarp.shfl(work_prefix_sum, last_local_chunk);
+            // how many chunks have we completed? The last one is completed if
+            // its end matches work_base after the update
+            const auto chunk_advance =
+                last_local_chunk + (last_local_chunk_end == work_base ? 1 : 0);
+            chunk_base += chunk_advance;
+            // shift down local_work and work_prefix_sum,
+            // adding new values when necessary
+            local_work = subwarp.shfl_down(local_work, chunk_advance);
+            // this shuffle leaves the trailing elements unchanged, we need to
+            // overwrite them later
+            work_prefix_sum = subwarp.shfl_down(work_prefix_sum, chunk_advance);
+            if (lane >= subwarp_size - chunk_advance) {
+                const auto in_chunk = chunk_base + lane;
+                // load new work counters at the end
+                local_work =
+                    in_chunk < chunk_count ? work_count(in_chunk) : IndexType{};
+                work_prefix_sum_add = local_work;
+                // fill the trailing work_prefix_sum with the last element
+                work_prefix_sum = work_prefix_sum_end;
+            }
+        }
+        work_base += popcnt(subwarp.ballot(local_chunk < subwarp_size));
+        // compute a prefix sum over new chunks and add to the prefix sum
+        subwarp_prefix_sum<true>(work_prefix_sum_add, work_prefix_sum_add,
+                                 subwarp);
+        work_prefix_sum += work_prefix_sum_add;
+    }
+}
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_LOAD_BALANCING_HPP_
diff --git a/core/base/array.cpp b/core/base/array.cpp
index 51fa4b34bb1..da1accc1ca1 100644
--- a/core/base/array.cpp
+++ b/core/base/array.cpp
@@ -91,6 +91,7 @@ ValueType reduce_add(const array<ValueType>& input_arr,
 #define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value)
 GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL);
+template GKO_DECLARE_ARRAY_FILL(bool);
 template GKO_DECLARE_ARRAY_FILL(uint16);
 template GKO_DECLARE_ARRAY_FILL(uint32);
 #ifndef GKO_SIZE_T_IS_UINT64_T
diff --git a/cuda/test/components/CMakeLists.txt b/cuda/test/components/CMakeLists.txt
index 89e374431c4..6fe81ea0635 100644
--- a/cuda/test/components/CMakeLists.txt
+++ b/cuda/test/components/CMakeLists.txt
@@ -1,4 +1,5 @@
 ginkgo_create_cuda_test(cooperative_groups)
+ginkgo_create_cuda_test(load_balancing)
 ginkgo_create_cuda_test(merging)
 ginkgo_create_cuda_test(searching)
 ginkgo_create_cuda_test(sorting)
diff --git a/cuda/test/components/load_balancing.cu b/cuda/test/components/load_balancing.cu
new file mode 100644
index 00000000000..c871f286fec
--- /dev/null
+++ b/cuda/test/components/load_balancing.cu
@@ -0,0 +1,303 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "common/cuda_hip/components/load_balancing.hpp"
+
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+
+#include "core/base/index_range.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "cuda/test/utils.hpp"
+
+
+namespace {
+
+
+class LoadBalancing : public CudaTestFixture {
+protected:
+};
+
+
+__global__ void test_1_work_per_chunk(int* output, int* size_once, bool* err,
+                                      int size)
+{
+    gko::kernels::cuda::load_balance_subwarp<
+        gko::kernels::cuda::config::warp_size>(
+        size,
+        [&](int chunk) {
+            if (chunk < 0 || chunk >= size) {
+                printf("oob %d\n", chunk);
+                *err = true;
+            }
+            int result = atomicCAS(size_once + chunk, 0, 1);
+            if (result != 0) {
+                printf("duplicate size %d\n", chunk);
+                *err = true;
+            }
+            return 1;
+        },
+        [&](int chunk, int work, int global_pos) {
+            if 
(chunk < 0 || chunk >= size || work != 0) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + if (global_pos != chunk) { + printf("incorrect global_pos %d %d\n", global_pos, chunk); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + chunk, -1, chunk); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith1WorkPerChunk) +{ + for (const auto size : + {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, usize}; + gko::kernels::cuda::components::fill_seq_array( + exec, ref_data.get_data(), usize); + + test_1_work_per_chunk<<<1, gko::kernels::cuda::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_3_work_per_chunk(int* output, int* size_once, bool* err, + int size) +{ + gko::kernels::cuda::load_balance_subwarp< + gko::kernels::cuda::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return 3; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || work >= 3) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + global_pos, -1, 3 * chunk + work); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith3WorkPerChunk) +{ + for (const auto size : {0, 1, 2, 3, 10, 11, 19, 20, 21, 30, 31, 32, 33, 34, + 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, 3 * usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, 3 * usize}; + gko::kernels::cuda::components::fill_seq_array( + exec, ref_data.get_data(), 3 * usize); + + test_3_work_per_chunk<<<1, gko::kernels::cuda::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_dynamic_work_per_chunk(const int* chunk_sizes, int size, + int* output_chunk, int* output_work, + int* size_once, bool* err) +{ + gko::kernels::cuda::load_balance_subwarp< + gko::kernels::cuda::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return chunk_sizes[chunk]; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || + work >= chunk_sizes[chunk]) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output_chunk + global_pos, -1, 
chunk); + int prev2 = atomicCAS(output_work + global_pos, -1, work); + if (prev != -1 || prev2 != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunkNonempty) +{ + std::default_random_engine rng{123468}; + std::uniform_int_distribution dist{1, 100}; + for (const auto size : + {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 1000}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array chunk_sizes{this->ref, usize}; + std::generate_n(chunk_sizes.get_data(), size, + [&] { return dist(rng); }); + const auto total_work = static_cast(std::reduce( + chunk_sizes.get_const_data(), chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, usize}; + gko::array dchunk_sizes{this->exec, chunk_sizes}; + ddata_chunk.fill(-1); + ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : gko::irange{chunk_size}) { + data_chunk.get_data()[i] = chunk; + data_work.get_data()[i] = work; + i++; + } + } + + test_dynamic_work_per_chunk<<<1, + gko::kernels::cuda::config::warp_size>>>( + dchunk_sizes.get_data(), size, ddata_chunk.get_data(), + ddata_work.get_data(), dsize_once.get_data(), derr.get_data()); + + GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk); + GKO_ASSERT_ARRAY_EQ(ddata_work, data_work); + ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data())); + } +} + + +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunkEmptyAllowed) +{ + // empty chunks aligned with warp size + std::vector empty_aligned(32, 0); + // empty chunks not aligned with warp size + std::vector empty_unaligned(33, 0); + // some empty chunks, not enough work initially + std::vector incomplete{0, 0, 31, 0, 0}; + // some empty chunks, not enough work in second iteration + std::vector incomplete2{0, 0, 33, 0, 0}; + // large gaps, never enough work + std::vector gaps_incomplete(128); + gaps_incomplete[4] = 31; + gaps_incomplete[4 + 32] = 31; + gaps_incomplete[4 + 64] = 31; + gaps_incomplete[4 + 96] = 31; + // large gaps, enough work + std::vector gaps_complete(128); + gaps_incomplete[4] = 32; + gaps_incomplete[4 + 32] = 32; + gaps_incomplete[4 + 64] = 32; + gaps_incomplete[4 + 96] = 32; + // large gaps, chunks overlap between iterations + std::vector gaps_overlap(96); + gaps_overlap[31] = 63; + gaps_overlap[31 + 32] = 32; + gaps_overlap[31 + 64] = 33; + // large gaps, work at the end of the chunk + std::vector gaps_complete_end(96); + gaps_overlap[31] = 64; + gaps_overlap[31 + 32] = 32; + gaps_overlap[31 + 64] = 32; + + for (auto input : + {empty_aligned, empty_unaligned, incomplete, incomplete2}) { + gko::array chunk_sizes{this->ref, input.begin(), input.end()}; + auto size = static_cast(input.size()); + const auto total_work = static_cast(std::reduce( + chunk_sizes.get_const_data(), chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, static_cast(size)}; + gko::array dchunk_sizes{this->exec, 
chunk_sizes}; + ddata_chunk.fill(-1); + ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : gko::irange{chunk_size}) { + data_chunk.get_data()[i] = chunk; + data_work.get_data()[i] = work; + i++; + } + } + + test_dynamic_work_per_chunk<<<1, + gko::kernels::cuda::config::warp_size>>>( + dchunk_sizes.get_data(), size, ddata_chunk.get_data(), + ddata_work.get_data(), dsize_once.get_data(), derr.get_data()); + + GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk); + GKO_ASSERT_ARRAY_EQ(ddata_work, data_work); + ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data())); + } +} + + +} // namespace diff --git a/hip/test/components/CMakeLists.txt b/hip/test/components/CMakeLists.txt index 9bbd49608a7..780b0076c7f 100644 --- a/hip/test/components/CMakeLists.txt +++ b/hip/test/components/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_hip_test(cooperative_groups) +ginkgo_create_hip_test(load_balancing) ginkgo_create_hip_test(merging) ginkgo_create_hip_test(searching) ginkgo_create_hip_test(sorting) diff --git a/hip/test/components/load_balancing.hip.cpp b/hip/test/components/load_balancing.hip.cpp new file mode 100644 index 00000000000..3ea38b09b50 --- /dev/null +++ b/hip/test/components/load_balancing.hip.cpp @@ -0,0 +1,295 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "common/cuda_hip/components/load_balancing.hpp" + +#include +#include +#include + +#include + +#include +#include + +#include "core/base/index_range.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class LoadBalancing : public HipTestFixture { +protected: +}; + + +__global__ void test_1_work_per_chunk(int* output, int* size_once, bool* err, + int size) +{ + gko::kernels::hip::load_balance_subwarp< + gko::kernels::hip::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return 1; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work != 0) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + if (global_pos != chunk) { + printf("incorrect global_pos %d %d\n", global_pos, chunk); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + chunk, -1, chunk); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith1WorkPerChunk) +{ + for (const auto size : {0, 1, 2, 3, 7, 8, 63, 64, 65, 127, 128, 129, 200}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, usize}; + gko::kernels::hip::components::fill_seq_array(exec, ref_data.get_data(), + usize); + + test_1_work_per_chunk<<<1, gko::kernels::hip::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_3_work_per_chunk(int* output, int* size_once, bool* err, 
+ int size) +{ + gko::kernels::hip::load_balance_subwarp< + gko::kernels::hip::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return 3; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || work >= 3) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + global_pos, -1, 3 * chunk + work); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith3WorkPerChunk) +{ + for (const auto size : {0, 1, 2, 3, 21, 22, 42, 43, 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, 3 * usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, 3 * usize}; + gko::kernels::hip::components::fill_seq_array(exec, ref_data.get_data(), + 3 * usize); + + test_3_work_per_chunk<<<1, gko::kernels::hip::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_dynamic_work_per_chunk(const int* chunk_sizes, int size, + int* output_chunk, int* output_work, + int* size_once, bool* err) +{ + gko::kernels::hip::load_balance_subwarp< + gko::kernels::hip::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return chunk_sizes[chunk]; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || + work >= chunk_sizes[chunk]) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output_chunk + global_pos, -1, chunk); + int prev2 = atomicCAS(output_work + global_pos, -1, work); + if (prev != -1 || prev2 != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunk) +{ + std::default_random_engine rng{123468}; + std::uniform_int_distribution dist{1, 100}; + for (const auto size : + {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array chunk_sizes{this->ref, usize}; + std::generate_n(chunk_sizes.get_data(), size, + [&] { return dist(rng); }); + const auto total_work = static_cast(std::reduce( + chunk_sizes.get_const_data(), chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, usize}; + gko::array dchunk_sizes{this->exec, chunk_sizes}; + ddata_chunk.fill(-1); + ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : 
gko::irange{chunk_size}) { + data_chunk.get_data()[i] = chunk; + data_work.get_data()[i] = work; + i++; + } + } + + test_dynamic_work_per_chunk<<<1, + gko::kernels::hip::config::warp_size>>>( + dchunk_sizes.get_data(), size, ddata_chunk.get_data(), + ddata_work.get_data(), dsize_once.get_data(), derr.get_data()); + + GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk); + GKO_ASSERT_ARRAY_EQ(ddata_work, data_work); + ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data())); + } +} + + +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunkEmptyAllowed) +{ + // empty chunks aligned with warp size + std::vector empty_aligned(64, 0); + // empty chunks not aligned with warp size + std::vector empty_unaligned(65, 0); + // some empty chunks, not enough work initially + std::vector incomplete{0, 0, 63, 0, 0}; + // some empty chunks, not enough work in second iteration + std::vector incomplete2{0, 0, 65, 0, 0}; + // large gaps, never enough work + std::vector gaps_incomplete(128); + gaps_incomplete[4] = 31; + gaps_incomplete[4 + 64] = 31; + // large gaps, enough work + std::vector gaps_complete(128); + gaps_incomplete[4] = 64; + gaps_incomplete[4 + 64] = 64; + // large gaps, chunks overlap between iterations + std::vector gaps_overlap(96); + gaps_overlap[31] = 127; + gaps_overlap[31 + 64] = 64; + // large gaps, work at the end of the chunk + std::vector gaps_complete_end(96); + gaps_overlap[31] = 128; + gaps_overlap[31 + 64] = 64; + + for (auto input : + {empty_aligned, empty_unaligned, incomplete, incomplete2}) { + gko::array chunk_sizes{this->ref, input.begin(), input.end()}; + auto size = static_cast(input.size()); + const auto total_work = static_cast(std::reduce( + chunk_sizes.get_const_data(), chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, static_cast(size)}; + gko::array dchunk_sizes{this->exec, chunk_sizes}; + ddata_chunk.fill(-1); + ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : gko::irange{chunk_size}) { + data_chunk.get_data()[i] = chunk; + data_work.get_data()[i] = work; + i++; + } + } + + test_dynamic_work_per_chunk<<<1, + gko::kernels::hip::config::warp_size>>>( + dchunk_sizes.get_data(), size, ddata_chunk.get_data(), + ddata_work.get_data(), dsize_once.get_data(), derr.get_data()); + + GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk); + GKO_ASSERT_ARRAY_EQ(ddata_work, data_work); + ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data())); + } +} + + +} // namespace
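
For context, this is how a kernel might use the new primitive outside the unit tests. The sketch below is hypothetical and not part of this PR: row_ptrs, col_idxs, visit_count and count_column_visits are made-up names, and the kernel is assumed to be launched with exactly config::warp_size threads in a single block, as in the tests above, since every lane of the subwarp must participate in the call.

// Sketch only: one warp visits every nonzero of a CSR-like structure, with
// the nonzeros spread evenly over the lanes regardless of row lengths.
__global__ void count_column_visits(const int* row_ptrs, const int* col_idxs,
                                    int num_rows, int* visit_count)
{
    gko::kernels::GKO_DEVICE_NAMESPACE::load_balance_subwarp<
        gko::kernels::GKO_DEVICE_NAMESPACE::config::warp_size>(
        num_rows,
        // chunk i is row i, its work count is the number of nonzeros in it
        [&](int row) { return row_ptrs[row + 1] - row_ptrs[row]; },
        // called once per nonzero: work is the position inside the row,
        // global_pos the position in the whole nonzero sequence
        [&](int row, int work, int global_pos) {
            atomicAdd(visit_count + col_idxs[row_ptrs[row] + work], 1);
        });
}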