From 490015af5430c7058ad17183abdcfc095465dd24 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 29 Jan 2025 13:51:10 +0100 Subject: [PATCH 1/3] wip load balancing abstraction --- common/cuda_hip/components/load_balancing.hpp | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 common/cuda_hip/components/load_balancing.hpp diff --git a/common/cuda_hip/components/load_balancing.hpp b/common/cuda_hip/components/load_balancing.hpp new file mode 100644 index 00000000000..ba62e3e09b3 --- /dev/null +++ b/common/cuda_hip/components/load_balancing.hpp @@ -0,0 +1,84 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_LOAD_BALANCING_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_LOAD_BALANCING_HPP_ + + +#include + +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + +/** + * @internal + * Distributes work evenly inside a subwarp, making sure that no thread is idle + * as long as possible. The work is described by chunks (id, work_count), where + * work_count describes the number of work items in this chunk. + * + * @param chunk_count how many total chunks of work are there? + * @param work_count functor work_count(i) should return how many work items + * are in the i'th chunk, at least 1. + * It will only be executed once for each i. + * @param op the operation to execute for a single work item, called + * via op(chunk_id, work_item_id) + * @param subwarp the subwarp executing this function + * + * @tparam the index type used to address individual chunks and work items. + * it needs to be able to represent the total number of work items + * without overflow. + */ +template +__forceinline__ __device__ void load_balance_swarp(IndexType chunk_count, + WorkCountFunctor work_count, + Op op) +{ + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + static_assert(std::is_same_v, + "work_count needs to return IndexType"); + IndexType chunk_base{}; + IndexType work_base{}; + const auto lane = subwarp.threadd_rank(); + while (chunk_base < chunk_count) { + const auto in_chunk = chunk_base + lane; + const auto local_work = + in_chunk < chunk_count ? 
work_count(in_chunk) : IndexType{}; + assert(local_work > 0); + IndexType work_prefix_sum{}; + // inclusive prefix sum over work tells us where each chunk begins + subwarp_prefix_sum(local_work, work_prefix_sum, subwarp); + // binary search over this prefix sum tells us which chunk each thread + // works in + const auto local_work_pos = work_base + lane; + const auto local_chunk = + synchronous_fixed_binary_search([&](int i) { + return local_work_pos < subwarp.shfl(work_prefix_sum, i); + }); + auto local_chunk_work_base = + subwarp.shfl(work_prefix_sum - local_work, local_chunk); + const auto chunk = chunk_base + local_chunk; + op(chunk, local_work_pos - local_chunk_work_base); + const auto last_chunk = subwarp.shfl(in_chunk, subwarp_size - 1); + const auto last_chunk_size = + subwarp.shfl(work_prefix_sum, subwarp_size - 1); + work_base += subwarp_size; + } +} + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_ From 1f28b410ae3d3bcc82370fe50c6c3aded17be974 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 29 Jan 2025 23:28:23 +0100 Subject: [PATCH 2/3] add warp load balancing primitive --- common/cuda_hip/components/load_balancing.hpp | 65 +++-- core/base/array.cpp | 1 + cuda/test/components/CMakeLists.txt | 1 + cuda/test/components/load_balancing.cu | 231 ++++++++++++++++++ hip/test/components/CMakeLists.txt | 1 + hip/test/components/load_balancing.hip.cpp | 229 +++++++++++++++++ 6 files changed, 510 insertions(+), 18 deletions(-) create mode 100644 cuda/test/components/load_balancing.cu create mode 100644 hip/test/components/load_balancing.hip.cpp diff --git a/common/cuda_hip/components/load_balancing.hpp b/common/cuda_hip/components/load_balancing.hpp index ba62e3e09b3..e48d94c0f63 100644 --- a/common/cuda_hip/components/load_balancing.hpp +++ b/common/cuda_hip/components/load_balancing.hpp @@ -5,9 +5,6 @@ #ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_LOAD_BALANCING_HPP_ #define GKO_COMMON_CUDA_HIP_COMPONENTS_LOAD_BALANCING_HPP_ - -#include - #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/prefix_sum.hpp" #include "common/cuda_hip/components/searching.hpp" @@ -37,26 +34,23 @@ namespace GKO_DEVICE_NAMESPACE { * without overflow. */ template -__forceinline__ __device__ void load_balance_swarp(IndexType chunk_count, - WorkCountFunctor work_count, - Op op) + typename OpFunctor> +__forceinline__ __device__ void load_balance_subwarp_nonempty( + IndexType chunk_count, WorkCountFunctor work_count, OpFunctor op) { const auto subwarp = group::tiled_partition(group::this_thread_block()); static_assert(std::is_same_v, "work_count needs to return IndexType"); + const auto lane = subwarp.thread_rank(); IndexType chunk_base{}; IndexType work_base{}; - const auto lane = subwarp.threadd_rank(); + IndexType local_work = lane < chunk_count ? work_count(lane) : IndexType{1}; + // inclusive prefix sum over work tells us where each chunk begins + IndexType work_prefix_sum{}; + subwarp_prefix_sum(local_work, work_prefix_sum, subwarp); while (chunk_base < chunk_count) { - const auto in_chunk = chunk_base + lane; - const auto local_work = - in_chunk < chunk_count ? 
work_count(in_chunk) : IndexType{}; assert(local_work > 0); - IndexType work_prefix_sum{}; - // inclusive prefix sum over work tells us where each chunk begins - subwarp_prefix_sum(local_work, work_prefix_sum, subwarp); // binary search over this prefix sum tells us which chunk each thread // works in const auto local_work_pos = work_base + lane; @@ -64,14 +58,49 @@ __forceinline__ __device__ void load_balance_swarp(IndexType chunk_count, synchronous_fixed_binary_search([&](int i) { return local_work_pos < subwarp.shfl(work_prefix_sum, i); }); + assert(local_chunk < subwarp_size); auto local_chunk_work_base = subwarp.shfl(work_prefix_sum - local_work, local_chunk); const auto chunk = chunk_base + local_chunk; - op(chunk, local_work_pos - local_chunk_work_base); - const auto last_chunk = subwarp.shfl(in_chunk, subwarp_size - 1); - const auto last_chunk_size = - subwarp.shfl(work_prefix_sum, subwarp_size - 1); + // do the work inside this chunk + if (chunk < chunk_count) { + op(chunk, local_work_pos - local_chunk_work_base, local_work_pos); + } + const auto last_local_chunk = + subwarp.shfl(local_chunk, subwarp_size - 1); + const auto last_local_chunk_end = + subwarp.shfl(work_prefix_sum, last_local_chunk); + assert(last_local_chunk < subwarp_size); + assert(last_local_chunk_end > local_work_pos); work_base += subwarp_size; + // how many chunks have we completed? The last one is completed if its + // end matches work_base after the update + const auto chunk_advance = + last_local_chunk + (last_local_chunk_end == work_base ? 1 : 0); + chunk_base += chunk_advance; + // shift down local_work and work_prefix_sum, + // adding new values when necessary + local_work = subwarp.shfl_down(local_work, chunk_advance); + // find the last value of the prefix sum and remember it for later + const auto work_prefix_sum_end = + subwarp.shfl(work_prefix_sum, subwarp_size - 1); + // this shuffle leaves the trailing elements unchaged, we need to + // overwrite them later + work_prefix_sum = subwarp.shfl_down(work_prefix_sum, chunk_advance); + IndexType work_prefix_sum_add{}; + if (lane >= subwarp_size - chunk_advance) { + const auto in_chunk = chunk_base + lane; + // load new work counters at the end + local_work = + in_chunk < chunk_count ? 
work_count(in_chunk) : IndexType{1}; + work_prefix_sum_add = local_work; + // fill the trailing work_prefix_sum with the last element + work_prefix_sum = work_prefix_sum_end; + } + // compute a prefix sum over new chunks and add to the prefix sum + subwarp_prefix_sum(work_prefix_sum_add, work_prefix_sum_add, + subwarp); + work_prefix_sum += work_prefix_sum_add; } } diff --git a/core/base/array.cpp b/core/base/array.cpp index 51fa4b34bb1..da1accc1ca1 100644 --- a/core/base/array.cpp +++ b/core/base/array.cpp @@ -91,6 +91,7 @@ ValueType reduce_add(const array& input_arr, #define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value) GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL); +template GKO_DECLARE_ARRAY_FILL(bool); template GKO_DECLARE_ARRAY_FILL(uint16); template GKO_DECLARE_ARRAY_FILL(uint32); #ifndef GKO_SIZE_T_IS_UINT64_T diff --git a/cuda/test/components/CMakeLists.txt b/cuda/test/components/CMakeLists.txt index 89e374431c4..6fe81ea0635 100644 --- a/cuda/test/components/CMakeLists.txt +++ b/cuda/test/components/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_cuda_test(cooperative_groups) +ginkgo_create_cuda_test(load_balancing) ginkgo_create_cuda_test(merging) ginkgo_create_cuda_test(searching) ginkgo_create_cuda_test(sorting) diff --git a/cuda/test/components/load_balancing.cu b/cuda/test/components/load_balancing.cu new file mode 100644 index 00000000000..7ea5930278f --- /dev/null +++ b/cuda/test/components/load_balancing.cu @@ -0,0 +1,231 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "common/cuda_hip/components/load_balancing.hpp" + +#include +#include +#include + +#include + +#include +#include + +#include "core/base/index_range.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "cuda/test/utils.hpp" + + +namespace { + + +class LoadBalancing : public CudaTestFixture { +protected: +}; + + +__global__ void test_1_work_per_chunk(int* output, int* size_once, bool* err, + int size) +{ + gko::kernels::cuda::load_balance_subwarp_nonempty< + gko::kernels::cuda::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return 1; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work != 0) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + if (global_pos != chunk) { + printf("incorrect global_pos %d %d\n", global_pos, chunk); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + chunk, -1, chunk); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith1WorkPerChunk) +{ + for (const auto size : + {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, usize}; + gko::kernels::cuda::components::fill_seq_array( + exec, ref_data.get_data(), usize); + + test_1_work_per_chunk<<<1, gko::kernels::cuda::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + 
ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_3_work_per_chunk(int* output, int* size_once, bool* err, + int size) +{ + gko::kernels::cuda::load_balance_subwarp_nonempty< + gko::kernels::cuda::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return 3; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || work >= 3) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + global_pos, -1, 3 * chunk + work); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith3WorkPerChunk) +{ + for (const auto size : {0, 1, 2, 3, 10, 11, 19, 20, 21, 30, 31, 32, 33, 34, + 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, 3 * usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, 3 * usize}; + gko::kernels::cuda::components::fill_seq_array( + exec, ref_data.get_data(), 3 * usize); + + test_3_work_per_chunk<<<1, gko::kernels::cuda::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_dynamic_work_per_chunk(const int* chunk_sizes, int size, + int* output_chunk, int* output_work, + int* size_once, bool* err) +{ + gko::kernels::cuda::load_balance_subwarp_nonempty< + gko::kernels::cuda::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return chunk_sizes[chunk]; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || + work >= chunk_sizes[chunk]) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output_chunk + global_pos, -1, chunk); + int prev2 = atomicCAS(output_work + global_pos, -1, work); + if (prev != -1 || prev2 != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunk) +{ + std::default_random_engine rng{123468}; + std::uniform_int_distribution dist{1, 100}; + for (const auto size : + {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array chunk_sizes{this->ref, usize}; + std::generate_n(chunk_sizes.get_data(), size, + [&] { return dist(rng); }); + const auto total_work = static_cast(std::reduce( + chunk_sizes.get_const_data(), chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, usize}; + gko::array dchunk_sizes{this->exec, chunk_sizes}; + ddata_chunk.fill(-1); + 
ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : gko::irange{chunk_size}) { + data_chunk.get_data()[i] = chunk; + data_work.get_data()[i] = work; + i++; + } + } + + test_dynamic_work_per_chunk<<<1, + gko::kernels::cuda::config::warp_size>>>( + dchunk_sizes.get_data(), size, ddata_chunk.get_data(), + ddata_work.get_data(), dsize_once.get_data(), derr.get_data()); + + GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk); + GKO_ASSERT_ARRAY_EQ(ddata_work, data_work); + ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data())); + } +} + + +} // namespace diff --git a/hip/test/components/CMakeLists.txt b/hip/test/components/CMakeLists.txt index 9bbd49608a7..780b0076c7f 100644 --- a/hip/test/components/CMakeLists.txt +++ b/hip/test/components/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_hip_test(cooperative_groups) +ginkgo_create_hip_test(load_balancing) ginkgo_create_hip_test(merging) ginkgo_create_hip_test(searching) ginkgo_create_hip_test(sorting) diff --git a/hip/test/components/load_balancing.hip.cpp b/hip/test/components/load_balancing.hip.cpp new file mode 100644 index 00000000000..daedff51af3 --- /dev/null +++ b/hip/test/components/load_balancing.hip.cpp @@ -0,0 +1,229 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "common/cuda_hip/components/load_balancing.hpp" + +#include +#include +#include + +#include + +#include +#include + +#include "core/base/index_range.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class LoadBalancing : public HipTestFixture { +protected: +}; + + +__global__ void test_1_work_per_chunk(int* output, int* size_once, bool* err, + int size) +{ + gko::kernels::hip::load_balance_subwarp_nonempty< + gko::kernels::hip::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return 1; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work != 0) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + if (global_pos != chunk) { + printf("incorrect global_pos %d %d\n", global_pos, chunk); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + chunk, -1, chunk); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith1WorkPerChunk) +{ + for (const auto size : {0, 1, 2, 3, 7, 8, 63, 64, 65, 127, 128, 129, 200}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, usize}; + gko::kernels::hip::components::fill_seq_array(exec, ref_data.get_data(), + usize); + + test_1_work_per_chunk<<<1, gko::kernels::hip::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_3_work_per_chunk(int* output, int* size_once, bool* err, + int size) +{ + 
gko::kernels::hip::load_balance_subwarp_nonempty< + gko::kernels::hip::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return 3; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || work >= 3) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output + global_pos, -1, 3 * chunk + work); + if (prev != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWith3WorkPerChunk) +{ + for (const auto size : {0, 1, 2, 3, 21, 22, 42, 43, 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array data{this->exec, 3 * usize}; + data.fill(-1); + gko::array err{this->exec, 1}; + err.fill(false); + gko::array size_once{this->exec, usize}; + size_once.fill(0); + gko::array ref_data{this->exec, 3 * usize}; + gko::kernels::hip::components::fill_seq_array(exec, ref_data.get_data(), + 3 * usize); + + test_3_work_per_chunk<<<1, gko::kernels::hip::config::warp_size>>>( + data.get_data(), size_once.get_data(), err.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(data, ref_data); + ASSERT_FALSE(this->exec->copy_val_to_host(err.get_const_data())); + } +} + + +__global__ void test_dynamic_work_per_chunk(const int* chunk_sizes, int size, + int* output_chunk, int* output_work, + int* size_once, bool* err) +{ + gko::kernels::hip::load_balance_subwarp_nonempty< + gko::kernels::hip::config::warp_size>( + size, + [&](int chunk) { + if (chunk < 0 || chunk >= size) { + printf("oob %d\n", chunk); + *err = true; + } + int result = atomicCAS(size_once + chunk, 0, 1); + if (result != 0) { + printf("duplicate size %d\n", chunk); + *err = true; + } + return chunk_sizes[chunk]; + }, + [&](int chunk, int work, int global_pos) { + if (chunk < 0 || chunk >= size || work < 0 || + work >= chunk_sizes[chunk]) { + printf("oob %d %d\n", chunk, work); + *err = true; + } + // make sure every work item happens only once + int prev = atomicCAS(output_chunk + global_pos, -1, chunk); + int prev2 = atomicCAS(output_work + global_pos, -1, work); + if (prev != -1 || prev2 != -1) { + printf("duplicate %d %d\n", chunk, work); + *err = true; + } + }); +} + + +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunk) +{ + std::default_random_engine rng{123468}; + std::uniform_int_distribution dist{1, 100}; + for (const auto size : + {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 100}) { + SCOPED_TRACE(size); + const auto usize = static_cast(size); + gko::array chunk_sizes{this->ref, usize}; + std::generate_n(chunk_sizes.get_data(), size, + [&] { return dist(rng); }); + const auto total_work = static_cast(std::reduce( + chunk_sizes.get_const_data(), chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, usize}; + gko::array dchunk_sizes{this->exec, chunk_sizes}; + ddata_chunk.fill(-1); + ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : 
gko::irange{chunk_size}) {
+                data_chunk.get_data()[i] = chunk;
+                data_work.get_data()[i] = work;
+                i++;
+            }
+        }
+
+        test_dynamic_work_per_chunk<<<1,
+                                     gko::kernels::hip::config::warp_size>>>(
+            dchunk_sizes.get_data(), size, ddata_chunk.get_data(),
+            ddata_work.get_data(), dsize_once.get_data(), derr.get_data());
+
+        GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk);
+        GKO_ASSERT_ARRAY_EQ(ddata_work, data_work);
+        ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data()));
+    }
+}
+
+
+}  // namespace

From 7085f1322f2344a53508ba506483971d01d9ab33 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Fri, 18 Apr 2025 16:24:37 +0200
Subject: [PATCH 3/3] allow empty inputs in warp load balancing

---
 common/cuda_hip/components/load_balancing.hpp | 65 ++++++++------
 cuda/test/components/load_balancing.cu        | 82 +++++++++++++++++--
 hip/test/components/load_balancing.hip.cpp    | 72 +++++++++++++++-
 3 files changed, 183 insertions(+), 36 deletions(-)

diff --git a/common/cuda_hip/components/load_balancing.hpp b/common/cuda_hip/components/load_balancing.hpp
index e48d94c0f63..ff053ac42cf 100644
--- a/common/cuda_hip/components/load_balancing.hpp
+++ b/common/cuda_hip/components/load_balancing.hpp
@@ -23,7 +23,7 @@ namespace GKO_DEVICE_NAMESPACE {
  *
  * @param chunk_count how many total chunks of work are there?
  * @param work_count functor work_count(i) should return how many work items
- *                   are in the i'th chunk, at least 1.
+ *                   are in the i'th chunk.
  *                   It will only be executed once for each i.
  * @param op the operation to execute for a single work item, called
  *           via op(chunk_id, work_item_id)
@@ -35,7 +35,7 @@ namespace GKO_DEVICE_NAMESPACE {
  */
 template <int subwarp_size, typename IndexType, typename WorkCountFunctor,
           typename OpFunctor>
-__forceinline__ __device__ void load_balance_subwarp_nonempty(
+__forceinline__ __device__ void load_balance_subwarp(
    IndexType chunk_count, WorkCountFunctor work_count, OpFunctor op)
 {
     const auto subwarp =
@@ -50,7 +50,7 @@ __forceinline__ __device__ void load_balance_subwarp_nonempty(
     IndexType work_prefix_sum{};
     subwarp_prefix_sum(local_work, work_prefix_sum, subwarp);
     while (chunk_base < chunk_count) {
-        assert(local_work > 0);
+        assert(local_work >= 0);
         // binary search over this prefix sum tells us which chunk each thread
         // works in
         const auto local_work_pos = work_base + lane;
@@ -58,45 +58,54 @@ __forceinline__ __device__ void load_balance_subwarp_nonempty(
             synchronous_fixed_binary_search([&](int i) {
                 return local_work_pos < subwarp.shfl(work_prefix_sum, i);
             });
-        assert(local_chunk < subwarp_size);
-        auto local_chunk_work_base =
-            subwarp.shfl(work_prefix_sum - local_work, local_chunk);
+        // guard against out-of-bounds shuffle
+        auto local_chunk_work_base = subwarp.shfl(
+            work_prefix_sum - local_work, min(local_chunk, subwarp_size - 1));
         const auto chunk = chunk_base + local_chunk;
         // do the work inside this chunk
-        if (chunk < chunk_count) {
+        if (chunk < chunk_count && local_chunk < subwarp_size) {
             op(chunk, local_work_pos - local_chunk_work_base, local_work_pos);
         }
         const auto last_local_chunk =
             subwarp.shfl(local_chunk, subwarp_size - 1);
-        const auto last_local_chunk_end =
-            subwarp.shfl(work_prefix_sum, last_local_chunk);
-        assert(last_local_chunk < subwarp_size);
-        assert(last_local_chunk_end > local_work_pos);
-        work_base += subwarp_size;
-        // how many chunks have we completed? The last one is completed if its
-        // end matches work_base after the update
-        const auto chunk_advance =
-            last_local_chunk + (last_local_chunk_end == work_base ? 1 : 0);
-        chunk_base += chunk_advance;
-        // shift down local_work and work_prefix_sum,
-        // adding new values when necessary
-        local_work = subwarp.shfl_down(local_work, chunk_advance);
         // find the last value of the prefix sum and remember it for later
         const auto work_prefix_sum_end =
             subwarp.shfl(work_prefix_sum, subwarp_size - 1);
-        // this shuffle leaves the trailing elements unchaged, we need to
-        // overwrite them later
-        work_prefix_sum = subwarp.shfl_down(work_prefix_sum, chunk_advance);
         IndexType work_prefix_sum_add{};
-        if (lane >= subwarp_size - chunk_advance) {
+        if (last_local_chunk == subwarp_size) {
+            // if we didn't have enough work to do: all local chunks are
+            // completed
+            chunk_base += subwarp_size;
+            work_prefix_sum = work_prefix_sum_end;
             const auto in_chunk = chunk_base + lane;
-            // load new work counters at the end
             local_work =
-                in_chunk < chunk_count ? work_count(in_chunk) : IndexType{1};
+                in_chunk < chunk_count ? work_count(in_chunk) : IndexType{};
             work_prefix_sum_add = local_work;
-            // fill the trailing work_prefix_sum with the last element
-            work_prefix_sum = work_prefix_sum_end;
+        } else {
+            const auto last_local_chunk_end =
+                subwarp.shfl(work_prefix_sum, last_local_chunk);
+            // how many chunks have we completed? The last one is completed if
+            // its end matches work_base after the update
+            const auto chunk_advance =
+                last_local_chunk + (last_local_chunk_end == work_base ? 1 : 0);
+            chunk_base += chunk_advance;
+            // shift down local_work and work_prefix_sum,
+            // adding new values when necessary
+            local_work = subwarp.shfl_down(local_work, chunk_advance);
+            // this shuffle leaves the trailing elements unchanged, we need to
+            // overwrite them later
+            work_prefix_sum = subwarp.shfl_down(work_prefix_sum, chunk_advance);
+            if (lane >= subwarp_size - chunk_advance) {
+                const auto in_chunk = chunk_base + lane;
+                // load new work counters at the end
+                local_work =
+                    in_chunk < chunk_count ?
work_count(in_chunk) : IndexType{}; + work_prefix_sum_add = local_work; + // fill the trailing work_prefix_sum with the last element + work_prefix_sum = work_prefix_sum_end; + } } + work_base += popcnt(subwarp.ballot(local_chunk < subwarp_size)); // compute a prefix sum over new chunks and add to the prefix sum subwarp_prefix_sum(work_prefix_sum_add, work_prefix_sum_add, subwarp); diff --git a/cuda/test/components/load_balancing.cu b/cuda/test/components/load_balancing.cu index 7ea5930278f..c871f286fec 100644 --- a/cuda/test/components/load_balancing.cu +++ b/cuda/test/components/load_balancing.cu @@ -29,7 +29,7 @@ protected: __global__ void test_1_work_per_chunk(int* output, int* size_once, bool* err, int size) { - gko::kernels::cuda::load_balance_subwarp_nonempty< + gko::kernels::cuda::load_balance_subwarp< gko::kernels::cuda::config::warp_size>( size, [&](int chunk) { @@ -91,7 +91,7 @@ TEST_F(LoadBalancing, WorksWith1WorkPerChunk) __global__ void test_3_work_per_chunk(int* output, int* size_once, bool* err, int size) { - gko::kernels::cuda::load_balance_subwarp_nonempty< + gko::kernels::cuda::load_balance_subwarp< gko::kernels::cuda::config::warp_size>( size, [&](int chunk) { @@ -150,7 +150,7 @@ __global__ void test_dynamic_work_per_chunk(const int* chunk_sizes, int size, int* output_chunk, int* output_work, int* size_once, bool* err) { - gko::kernels::cuda::load_balance_subwarp_nonempty< + gko::kernels::cuda::load_balance_subwarp< gko::kernels::cuda::config::warp_size>( size, [&](int chunk) { @@ -182,12 +182,12 @@ __global__ void test_dynamic_work_per_chunk(const int* chunk_sizes, int size, } -TEST_F(LoadBalancing, WorksWithRandomWorkPerChunk) +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunkNonempty) { std::default_random_engine rng{123468}; std::uniform_int_distribution dist{1, 100}; for (const auto size : - {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 100}) { + {0, 1, 2, 3, 7, 8, 31, 32, 33, 34, 63, 64, 65, 1000}) { SCOPED_TRACE(size); const auto usize = static_cast(size); gko::array chunk_sizes{this->ref, usize}; @@ -228,4 +228,76 @@ TEST_F(LoadBalancing, WorksWithRandomWorkPerChunk) } +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunkEmptyAllowed) +{ + // empty chunks aligned with warp size + std::vector empty_aligned(32, 0); + // empty chunks not aligned with warp size + std::vector empty_unaligned(33, 0); + // some empty chunks, not enough work initially + std::vector incomplete{0, 0, 31, 0, 0}; + // some empty chunks, not enough work in second iteration + std::vector incomplete2{0, 0, 33, 0, 0}; + // large gaps, never enough work + std::vector gaps_incomplete(128); + gaps_incomplete[4] = 31; + gaps_incomplete[4 + 32] = 31; + gaps_incomplete[4 + 64] = 31; + gaps_incomplete[4 + 96] = 31; + // large gaps, enough work + std::vector gaps_complete(128); + gaps_incomplete[4] = 32; + gaps_incomplete[4 + 32] = 32; + gaps_incomplete[4 + 64] = 32; + gaps_incomplete[4 + 96] = 32; + // large gaps, chunks overlap between iterations + std::vector gaps_overlap(96); + gaps_overlap[31] = 63; + gaps_overlap[31 + 32] = 32; + gaps_overlap[31 + 64] = 33; + // large gaps, work at the end of the chunk + std::vector gaps_complete_end(96); + gaps_overlap[31] = 64; + gaps_overlap[31 + 32] = 32; + gaps_overlap[31 + 64] = 32; + + for (auto input : + {empty_aligned, empty_unaligned, incomplete, incomplete2}) { + gko::array chunk_sizes{this->ref, input.begin(), input.end()}; + auto size = static_cast(input.size()); + const auto total_work = static_cast(std::reduce( + chunk_sizes.get_const_data(), 
chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, static_cast(size)}; + gko::array dchunk_sizes{this->exec, chunk_sizes}; + ddata_chunk.fill(-1); + ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : gko::irange{chunk_size}) { + data_chunk.get_data()[i] = chunk; + data_work.get_data()[i] = work; + i++; + } + } + + test_dynamic_work_per_chunk<<<1, + gko::kernels::cuda::config::warp_size>>>( + dchunk_sizes.get_data(), size, ddata_chunk.get_data(), + ddata_work.get_data(), dsize_once.get_data(), derr.get_data()); + + GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk); + GKO_ASSERT_ARRAY_EQ(ddata_work, data_work); + ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data())); + } +} + + } // namespace diff --git a/hip/test/components/load_balancing.hip.cpp b/hip/test/components/load_balancing.hip.cpp index daedff51af3..3ea38b09b50 100644 --- a/hip/test/components/load_balancing.hip.cpp +++ b/hip/test/components/load_balancing.hip.cpp @@ -29,7 +29,7 @@ class LoadBalancing : public HipTestFixture { __global__ void test_1_work_per_chunk(int* output, int* size_once, bool* err, int size) { - gko::kernels::hip::load_balance_subwarp_nonempty< + gko::kernels::hip::load_balance_subwarp< gko::kernels::hip::config::warp_size>( size, [&](int chunk) { @@ -90,7 +90,7 @@ TEST_F(LoadBalancing, WorksWith1WorkPerChunk) __global__ void test_3_work_per_chunk(int* output, int* size_once, bool* err, int size) { - gko::kernels::hip::load_balance_subwarp_nonempty< + gko::kernels::hip::load_balance_subwarp< gko::kernels::hip::config::warp_size>( size, [&](int chunk) { @@ -148,7 +148,7 @@ __global__ void test_dynamic_work_per_chunk(const int* chunk_sizes, int size, int* output_chunk, int* output_work, int* size_once, bool* err) { - gko::kernels::hip::load_balance_subwarp_nonempty< + gko::kernels::hip::load_balance_subwarp< gko::kernels::hip::config::warp_size>( size, [&](int chunk) { @@ -226,4 +226,70 @@ TEST_F(LoadBalancing, WorksWithRandomWorkPerChunk) } +TEST_F(LoadBalancing, WorksWithRandomWorkPerChunkEmptyAllowed) +{ + // empty chunks aligned with warp size + std::vector empty_aligned(64, 0); + // empty chunks not aligned with warp size + std::vector empty_unaligned(65, 0); + // some empty chunks, not enough work initially + std::vector incomplete{0, 0, 63, 0, 0}; + // some empty chunks, not enough work in second iteration + std::vector incomplete2{0, 0, 65, 0, 0}; + // large gaps, never enough work + std::vector gaps_incomplete(128); + gaps_incomplete[4] = 31; + gaps_incomplete[4 + 64] = 31; + // large gaps, enough work + std::vector gaps_complete(128); + gaps_incomplete[4] = 64; + gaps_incomplete[4 + 64] = 64; + // large gaps, chunks overlap between iterations + std::vector gaps_overlap(96); + gaps_overlap[31] = 127; + gaps_overlap[31 + 64] = 64; + // large gaps, work at the end of the chunk + std::vector gaps_complete_end(96); + gaps_overlap[31] = 128; + gaps_overlap[31 + 64] = 64; + + for (auto input : + {empty_aligned, empty_unaligned, incomplete, incomplete2}) { + gko::array chunk_sizes{this->ref, input.begin(), input.end()}; + auto size = static_cast(input.size()); + const auto total_work = 
static_cast(std::reduce( + chunk_sizes.get_const_data(), chunk_sizes.get_const_data() + size)); + gko::array data_chunk{this->ref, total_work}; + gko::array data_work{this->ref, total_work}; + gko::array ddata_chunk{this->exec, total_work}; + gko::array ddata_work{this->exec, total_work}; + gko::array derr{this->exec, 1}; + gko::array dsize_once{this->exec, static_cast(size)}; + gko::array dchunk_sizes{this->exec, chunk_sizes}; + ddata_chunk.fill(-1); + ddata_work.fill(-1); + derr.fill(false); + dsize_once.fill(0); + int i{}; + for (const auto chunk : gko::irange{size}) { + const auto chunk_size = chunk_sizes.get_const_data()[chunk]; + for (const auto work : gko::irange{chunk_size}) { + data_chunk.get_data()[i] = chunk; + data_work.get_data()[i] = work; + i++; + } + } + + test_dynamic_work_per_chunk<<<1, + gko::kernels::hip::config::warp_size>>>( + dchunk_sizes.get_data(), size, ddata_chunk.get_data(), + ddata_work.get_data(), dsize_once.get_data(), derr.get_data()); + + GKO_ASSERT_ARRAY_EQ(ddata_chunk, data_chunk); + GKO_ASSERT_ARRAY_EQ(ddata_work, data_work); + ASSERT_FALSE(this->exec->copy_val_to_host(derr.get_const_data())); + } +} + + } // namespace
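
As a usage illustration (not part of the three patches above, and assuming the usual Ginkgo CUDA build setup that also provides config::warp_size): a single warp expands CSR-style row pointers into one row index per nonzero, one row per chunk, with empty rows allowed. The kernel name and the row_ptrs/out_row_idxs buffers are invented for this sketch; load_balance_subwarp and the op(chunk_id, work_item_id, global_work_item_id) call shape are taken from the header and tests in this series.

#include "common/cuda_hip/components/load_balancing.hpp"

// Hypothetical kernel: writes the row index of every nonzero, distributing
// the nonzeros of all rows evenly across a single warp.
__global__ void row_indices_kernel(const int* row_ptrs, int num_rows,
                                   int* out_row_idxs)
{
    constexpr auto warp_size = gko::kernels::cuda::config::warp_size;
    gko::kernels::cuda::load_balance_subwarp<warp_size>(
        num_rows,
        // work_count(row): number of nonzeros in this row, may be zero
        [&](int row) { return row_ptrs[row + 1] - row_ptrs[row]; },
        // op(chunk, work item, global work item): called once per nonzero
        [&](int row, int nz_in_row, int nz) { out_row_idxs[nz] = row; });
}

// launched with a single warp, analogous to the test kernels above:
// row_indices_kernel<<<1, gko::kernels::cuda::config::warp_size>>>(...);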
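
The mapping each warp-sized window computes (an inclusive prefix sum over the chunk sizes, then for every work item a search for the first prefix sum entry exceeding its position) can also be written as a short host-side model. This is likewise only an illustrative sketch, not code from the PR; it lists the (chunk, local work item, global work item) triples that op receives and shows why empty chunks are never selected.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Scalar model of the mapping computed by load_balance_subwarp: an inclusive
// prefix sum over the chunk sizes, then for every global work item the first
// chunk whose prefix sum exceeds its position. Empty chunks repeat the
// previous prefix sum value and can therefore never be selected.
void for_each_work_item(const std::vector<int>& chunk_sizes)
{
    std::vector<int> prefix_sum(chunk_sizes.size());
    int total{};
    for (std::size_t i = 0; i < chunk_sizes.size(); ++i) {
        total += chunk_sizes[i];
        prefix_sum[i] = total;
    }
    for (int pos = 0; pos < total; ++pos) {
        const auto it =
            std::upper_bound(prefix_sum.begin(), prefix_sum.end(), pos);
        const auto chunk = static_cast<int>(it - prefix_sum.begin());
        const auto chunk_begin = chunk == 0 ? 0 : prefix_sum[chunk - 1];
        // corresponds to op(chunk, pos - chunk_begin, pos) on the device
        std::cout << "chunk " << chunk << " work item " << pos - chunk_begin
                  << " global " << pos << '\n';
    }
}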