From b6338354b5d3dbe3d09a2d3cf3e50204e8451242 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Apr 2025 18:06:43 +0200 Subject: [PATCH] add bucketsort kernels --- common/cuda_hip/components/sorting.hpp | 133 ++++++++++++++++++++++++- cuda/test/components/sorting.cu | 57 ++++++++++- hip/test/components/sorting.hip.cpp | 57 ++++++++++- omp/components/sorting.hpp | 79 +++++++++++++++ omp/test/CMakeLists.txt | 1 + omp/test/components/CMakeLists.txt | 1 + omp/test/components/sorting.cpp | 63 ++++++++++++ 7 files changed, 388 insertions(+), 3 deletions(-) create mode 100644 omp/components/sorting.hpp create mode 100644 omp/test/components/CMakeLists.txt create mode 100644 omp/test/components/sorting.cpp diff --git a/common/cuda_hip/components/sorting.hpp b/common/cuda_hip/components/sorting.hpp index 76694541c2d..1eaa3281d03 100644 --- a/common/cuda_hip/components/sorting.hpp +++ b/common/cuda_hip/components/sorting.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -8,6 +8,8 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "core/base/index_range.hpp" +#include "core/components/prefix_sum_kernels.hpp" namespace gko { @@ -310,6 +312,135 @@ __forceinline__ __device__ void bitonic_sort(ValueType* local_elements, } +namespace kernel { + + +template +struct bucket_sort_config { + constexpr static int num_buckets = NumBuckets; + constexpr static int items_per_thread = 16; + constexpr static int threadblock_size = 512; + constexpr static int items_per_threadblock = + items_per_thread * threadblock_size; +}; + + +template +__global__ __launch_bounds__(Config::threadblock_size) void bucket_sort_count( + InputIterator begin, IndexType size, BucketIndexOp bucket_op, + IndexType* counters) +{ + constexpr auto num_buckets = Config::num_buckets; + constexpr auto threadblock_size = Config::threadblock_size; + const auto block_id = static_cast(blockIdx.x); + __shared__ int sh_counters[num_buckets]; + for (int i = threadIdx.x; i < num_buckets; i += threadblock_size) { + sh_counters[i] = 0; + } + __syncthreads(); + const auto base_i = Config::items_per_threadblock * block_id; + const auto end = min(base_i + Config::items_per_threadblock, size); + for (IndexType i = base_i + threadIdx.x; i < end; i += threadblock_size) { + const auto bucket = bucket_op(*(begin + i)); + assert(bucket >= 0 && bucket < num_buckets); + atomicAdd(sh_counters + bucket, 1); + } + __syncthreads(); + for (int i = threadIdx.x; i < num_buckets; i += threadblock_size) { + counters[i + num_buckets * block_id] = sh_counters[i]; + } +} + + +template +__global__ __launch_bounds__(Config::num_buckets) void bucket_sort_prefixsum( + IndexType* offsets, IndexType num_blocks) +{ + constexpr auto num_buckets = Config::num_buckets; + const auto i = threadIdx.x; + if (i >= num_buckets) { + return; + } + IndexType sum{}; + for (IndexType block : irange{num_blocks}) { + const auto idx = i + block * num_buckets; + auto new_value = offsets[idx]; + offsets[idx] = sum; + sum += new_value; + } + offsets[i + num_blocks * num_buckets] = sum; +} + + +template +__global__ +__launch_bounds__(Config::threadblock_size) void bucket_sort_distribute( + InputIterator begin, OutputIterator out_begin, IndexType size, + IndexType num_blocks, BucketIndexOp bucket_op, const IndexType* offsets) +{ + constexpr auto num_buckets = Config::num_buckets; + constexpr auto threadblock_size = Config::threadblock_size; + const auto block_id = static_cast(blockIdx.x); + __shared__ IndexType sh_counters[num_buckets]; + for (int i = threadIdx.x; i < num_buckets; i += threadblock_size) { + sh_counters[i] = offsets[i + num_buckets * block_id] + + offsets[i + num_buckets * num_blocks]; + } + __syncthreads(); + const auto base_i = Config::items_per_threadblock * block_id; + const auto end = min(base_i + Config::items_per_threadblock, size); + for (IndexType i = base_i + threadIdx.x; i < end; i += threadblock_size) { + const auto value = *(begin + i); + const auto bucket = bucket_op(value); + assert(bucket >= 0 && bucket < num_buckets); + auto out_pos = atomicAdd(sh_counters + bucket, 1); + *(out_begin + out_pos) = value; + } +} + + +} // namespace kernel + + +template +std::array bucket_sort( + std::shared_ptr exec, InputIterator begin, + InputIterator end, OutputIterator out_begin, BucketIndexOp bucket_op, + array& tmp) +{ + using config = kernel::bucket_sort_config; + const auto size = static_cast(end - begin); + std::array offsets{}; + offsets.back() = size; + + if (size > 0) { + const auto num_blocks = static_cast( + ceildiv(size, config::items_per_threadblock)); + const auto tmp_size = (num_blocks + 1) * config::num_buckets; + if (tmp.get_size() < tmp_size) { + tmp.resize_and_reset(tmp_size); + } + kernel::bucket_sort_count + <<get_stream()>>>( + begin, size, bucket_op, tmp.get_data()); + kernel::bucket_sort_prefixsum + <<<1, config::num_buckets, 0, exec->get_stream()>>>(tmp.get_data(), + num_blocks); + const auto global_offsets = tmp.get_data() + num_buckets * num_blocks; + components::prefix_sum_nonnegative(exec, global_offsets, num_buckets); + kernel::bucket_sort_distribute + <<get_stream()>>>( + begin, out_begin, size, num_blocks, bucket_op, tmp.get_data()); + exec->get_master()->copy_from(exec, num_buckets, global_offsets, + offsets.data()); + } + return offsets; +} + + } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/cuda/test/components/sorting.cu b/cuda/test/components/sorting.cu index 0cc54e5904e..ba5a925b764 100644 --- a/cuda/test/components/sorting.cu +++ b/cuda/test/components/sorting.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -12,6 +12,7 @@ #include #include +#include "core/base/index_range.hpp" #include "cuda/test/utils.hpp" @@ -103,4 +104,58 @@ TEST_F(Sorting, CudaBitonicSortShared) } +constexpr auto num_buckets = 10; + +std::array run_bucketsort( + std::shared_ptr exec, const int* input, int size, + int* output, gko::array& tmp) +{ + return gko::kernels::cuda::bucket_sort( + exec, input, input + size, output, + [] __device__(int i) { return i / 100; }, tmp); +} + +TEST_F(Sorting, BucketSort) +{ + for (int size : {0, 1, 10, 100, 1000, 10000, 100000}) { + SCOPED_TRACE(size); + const auto proj = [](auto i) { return i / 100; }; + const auto comp = [&proj](auto a, auto b) { return proj(a) < proj(b); }; + gko::array data{ref, static_cast(size)}; + std::uniform_int_distribution dist{0, num_buckets * 100 - 1}; + for (auto i : gko::irange{size}) { + data.get_data()[i] = dist(rng); + } + data.set_executor(exec); + gko::array out_data{exec, static_cast(size)}; + gko::array tmp{exec}; + + auto offsets = run_bucketsort(exec, data.get_const_data(), size, + out_data.get_data(), tmp); + + data.set_executor(ref); + out_data.set_executor(ref); + const auto out_data_ptr = out_data.get_data(); + const auto data_ptr = data.get_data(); + // the output must be sorted by bucket + ASSERT_TRUE(std::is_sorted(out_data_ptr, out_data_ptr + size, comp)); + // the output offsets must describe the bucket ranges + for (int bucket = 0; bucket < num_buckets; bucket++) { + const auto bucket_begin = offsets[bucket]; + const auto bucket_end = offsets[bucket + 1]; + ASSERT_LE(bucket_begin, bucket_end); + for (const auto i : gko::irange{bucket_begin, bucket_end}) { + ASSERT_EQ(proj(out_data_ptr[i]), bucket); + } + } + // inside each bucket, the input and output data must be the same + std::sort(data_ptr, data_ptr + size); + std::sort(out_data_ptr, out_data_ptr + size); + std::stable_sort(data_ptr, data_ptr + size, comp); + std::stable_sort(out_data_ptr, out_data_ptr + size, comp); + GKO_ASSERT_ARRAY_EQ(data, out_data); + } +} + + } // namespace diff --git a/hip/test/components/sorting.hip.cpp b/hip/test/components/sorting.hip.cpp index 653a0f536eb..173cf40385f 100644 --- a/hip/test/components/sorting.hip.cpp +++ b/hip/test/components/sorting.hip.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -12,6 +12,7 @@ #include #include +#include "core/base/index_range.hpp" #include "hip/test/utils.hip.hpp" @@ -104,4 +105,58 @@ TEST_F(Sorting, HipBitonicSortShared) } +constexpr auto num_buckets = 10; + +std::array run_bucketsort( + std::shared_ptr exec, const int* input, int size, + int* output, gko::array& tmp) +{ + return gko::kernels::hip::bucket_sort( + exec, input, input + size, output, + [] __device__(int i) { return i / 100; }, tmp); +} + +TEST_F(Sorting, BucketSort) +{ + for (int size : {0, 1, 10, 100, 1000, 10000, 100000}) { + SCOPED_TRACE(size); + const auto proj = [](auto i) { return i / 100; }; + const auto comp = [&proj](auto a, auto b) { return proj(a) < proj(b); }; + gko::array data{ref, static_cast(size)}; + std::uniform_int_distribution dist{0, num_buckets * 100 - 1}; + for (auto i : gko::irange{size}) { + data.get_data()[i] = dist(rng); + } + data.set_executor(exec); + gko::array out_data{exec, static_cast(size)}; + gko::array tmp{exec}; + + auto offsets = run_bucketsort(exec, data.get_const_data(), size, + out_data.get_data(), tmp); + + data.set_executor(ref); + out_data.set_executor(ref); + const auto out_data_ptr = out_data.get_data(); + const auto data_ptr = data.get_data(); + // the output must be sorted by bucket + ASSERT_TRUE(std::is_sorted(out_data_ptr, out_data_ptr + size, comp)); + // the output offsets must describe the bucket ranges + for (int bucket = 0; bucket < num_buckets; bucket++) { + const auto bucket_begin = offsets[bucket]; + const auto bucket_end = offsets[bucket + 1]; + ASSERT_LE(bucket_begin, bucket_end); + for (const auto i : gko::irange{bucket_begin, bucket_end}) { + ASSERT_EQ(proj(out_data_ptr[i]), bucket); + } + } + // inside each bucket, the input and output data must be the same + std::sort(data_ptr, data_ptr + size); + std::sort(out_data_ptr, out_data_ptr + size); + std::stable_sort(data_ptr, data_ptr + size, comp); + std::stable_sort(out_data_ptr, out_data_ptr + size, comp); + GKO_ASSERT_ARRAY_EQ(data, out_data); + } +} + + } // namespace diff --git a/omp/components/sorting.hpp b/omp/components/sorting.hpp new file mode 100644 index 00000000000..56277110fe2 --- /dev/null +++ b/omp/components/sorting.hpp @@ -0,0 +1,79 @@ +// SPDX-FileCopyrightText: 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include + +#include +#include + +namespace gko { +namespace kernels { +namespace omp { + + +template +std::array bucket_sort(InputIterator begin, + InputIterator end, + OutputIterator out_begin, + BucketIndexOp bucket_op, + gko::array& tmp) +{ + using index_type = + typename std::iterator_traits::difference_type; + const auto size = end - begin; + const auto num_threads = omp_get_max_threads(); + const auto tmp_size = num_threads * num_buckets; + if (tmp.get_size() < tmp_size) { + tmp.resize_and_reset(tmp_size); + } + const auto sums = tmp.get_data(); + std::fill_n(sums, tmp_size, 0); + std::array global_offsets{}; +#pragma omp parallel + { + const auto tid = omp_get_thread_num(); + auto counts = sums + tid * num_buckets; +#pragma omp for + for (index_type i = 0; i < size; i++) { + const auto value = *(begin + i); + const auto bucket = bucket_op(value); + assert(bucket >= 0); + assert(bucket < num_buckets); + counts[bucket]++; + } +#pragma omp barrier +#pragma omp single + { + std::array offsets{}; + for (int tid = 0; tid < num_threads; tid++) { + for (int i = 0; i < num_buckets; i++) { + const auto value = sums[tid * num_buckets + i]; + sums[tid * num_buckets + i] = offsets[i]; + offsets[i] += value; + } + } + std::copy_n(offsets.begin(), num_buckets, global_offsets.begin()); + std::exclusive_scan(global_offsets.begin(), global_offsets.end(), + global_offsets.begin(), index_type{}); + } +#pragma omp for + for (index_type i = 0; i < size; i++) { + const auto value = *(begin + i); + const auto bucket = bucket_op(value); + assert(bucket >= 0); + assert(bucket < num_buckets); + const auto output_pos = counts[bucket]++ + global_offsets[bucket]; + *(out_begin + output_pos) = value; + } + } + return global_offsets; +} + + +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/omp/test/CMakeLists.txt b/omp/test/CMakeLists.txt index b16882cfaf6..2102026009f 100644 --- a/omp/test/CMakeLists.txt +++ b/omp/test/CMakeLists.txt @@ -1,4 +1,5 @@ include(${PROJECT_SOURCE_DIR}/cmake/create_test.cmake) add_subdirectory(base) +add_subdirectory(components) add_subdirectory(matrix) diff --git a/omp/test/components/CMakeLists.txt b/omp/test/components/CMakeLists.txt new file mode 100644 index 00000000000..191a7cdc392 --- /dev/null +++ b/omp/test/components/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_create_omp_test(sorting) diff --git a/omp/test/components/sorting.cpp b/omp/test/components/sorting.cpp new file mode 100644 index 00000000000..9931320b0b6 --- /dev/null +++ b/omp/test/components/sorting.cpp @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "omp/components/sorting.hpp" + +#include +#include + +#include + +#include +#include + +#include "core/base/index_range.hpp" + + +class Sorting : public ::testing::Test { +protected: + Sorting() : exec(gko::OmpExecutor::create()), rng{729854} {} + + std::shared_ptr exec; + std::default_random_engine rng; +}; + + +TEST_F(Sorting, BucketSort) +{ + constexpr auto num_buckets = 10; + for (int size : {0, 1, 10, 100, 1000, 10000, 100000}) { + SCOPED_TRACE(size); + const auto proj = [](auto i) { return i / 100; }; + const auto comp = [&proj](auto a, auto b) { return proj(a) < proj(b); }; + std::vector data(size); + std::uniform_int_distribution dist{0, num_buckets * 100 - 1}; + for (auto& value : data) { + value = dist(rng); + } + std::vector out_data(size); + gko::array tmp{exec}; + + auto offsets = gko::kernels::omp::bucket_sort( + data.cbegin(), data.cend(), out_data.begin(), proj, tmp); + + // the output must be sorted by bucket + ASSERT_TRUE(std::is_sorted(out_data.begin(), out_data.end(), comp)); + // the output offsets must describe the bucket ranges + for (int bucket = 0; bucket < num_buckets; bucket++) { + const auto bucket_begin = offsets[bucket]; + const auto bucket_end = offsets[bucket + 1]; + ASSERT_LE(bucket_begin, bucket_end); + for (const auto i : gko::irange{bucket_begin, bucket_end}) { + ASSERT_EQ(proj(out_data[i]), bucket); + } + } + // inside each bucket, the input and output data must be the same + std::sort(data.begin(), data.end()); + std::sort(out_data.begin(), out_data.end()); + std::stable_sort(data.begin(), data.end(), comp); + std::stable_sort(out_data.begin(), out_data.end(), comp); + ASSERT_EQ(data, out_data); + } +}