From b6338354b5d3dbe3d09a2d3cf3e50204e8451242 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 2 Apr 2025 18:06:43 +0200
Subject: [PATCH] add bucketsort kernels

---
 common/cuda_hip/components/sorting.hpp | 133 ++++++++++++++++++++++++-
 cuda/test/components/sorting.cu        |  57 ++++++++++-
 hip/test/components/sorting.hip.cpp    |  57 ++++++++++-
 omp/components/sorting.hpp             |  79 +++++++++++++++
 omp/test/CMakeLists.txt                |   1 +
 omp/test/components/CMakeLists.txt     |   1 +
 omp/test/components/sorting.cpp        |  63 ++++++++++++
 7 files changed, 388 insertions(+), 3 deletions(-)
 create mode 100644 omp/components/sorting.hpp
 create mode 100644 omp/test/components/CMakeLists.txt
 create mode 100644 omp/test/components/sorting.cpp
diff --git a/common/cuda_hip/components/sorting.hpp b/common/cuda_hip/components/sorting.hpp
index 76694541c2d..1eaa3281d03 100644
--- a/common/cuda_hip/components/sorting.hpp
+++ b/common/cuda_hip/components/sorting.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -8,6 +8,8 @@
 
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "core/base/index_range.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
 
 
 namespace gko {
@@ -310,6 +312,135 @@ __forceinline__ __device__ void bitonic_sort(ValueType* local_elements,
 }
 
 
+namespace kernel {
+
+
+template <int NumBuckets>
+struct bucket_sort_config {
+    constexpr static int num_buckets = NumBuckets;
+    constexpr static int items_per_thread = 16;
+    constexpr static int threadblock_size = 512;
+    constexpr static int items_per_threadblock =
+        items_per_thread * threadblock_size;
+};
+
+
+template <typename Config, typename IndexType, typename InputIterator,
+          typename BucketIndexOp>
+__global__ __launch_bounds__(Config::threadblock_size) void bucket_sort_count(
+    InputIterator begin, IndexType size, BucketIndexOp bucket_op,
+    IndexType* counters)
+{
+    constexpr auto num_buckets = Config::num_buckets;
+    constexpr auto threadblock_size = Config::threadblock_size;
+    const auto block_id = static_cast<IndexType>(blockIdx.x);
+    __shared__ int sh_counters[num_buckets];
+    for (int i = threadIdx.x; i < num_buckets; i += threadblock_size) {
+        sh_counters[i] = 0;
+    }
+    __syncthreads();
+    const auto base_i = Config::items_per_threadblock * block_id;
+    const auto end = min(base_i + Config::items_per_threadblock, size);
+    for (IndexType i = base_i + threadIdx.x; i < end; i += threadblock_size) {
+        const auto bucket = bucket_op(*(begin + i));
+        assert(bucket >= 0 && bucket < num_buckets);
+        atomicAdd(sh_counters + bucket, 1);
+    }
+    __syncthreads();
+    for (int i = threadIdx.x; i < num_buckets; i += threadblock_size) {
+        counters[i + num_buckets * block_id] = sh_counters[i];
+    }
+}
+
+
+template <typename Config, typename IndexType>
+__global__ __launch_bounds__(Config::num_buckets) void bucket_sort_prefixsum(
+    IndexType* offsets, IndexType num_blocks)
+{
+    constexpr auto num_buckets = Config::num_buckets;
+    const auto i = threadIdx.x;
+    if (i >= num_buckets) {
+        return;
+    }
+    IndexType sum{};
+    for (IndexType block : irange{num_blocks}) {
+        const auto idx = i + block * num_buckets;
+        auto new_value = offsets[idx];
+        offsets[idx] = sum;
+        sum += new_value;
+    }
+    offsets[i + num_blocks * num_buckets] = sum;
+}
+
+
+template <typename Config, typename IndexType, typename InputIterator,
+          typename OutputIterator, typename BucketIndexOp>
+__global__
+__launch_bounds__(Config::threadblock_size) void bucket_sort_distribute(
+    InputIterator begin, OutputIterator out_begin, IndexType size,
+    IndexType num_blocks, BucketIndexOp bucket_op, const IndexType* offsets)
+{
+    constexpr auto num_buckets = Config::num_buckets;
+    constexpr auto threadblock_size = Config::threadblock_size;
+    const auto block_id = static_cast<IndexType>(blockIdx.x);
+    __shared__ IndexType sh_counters[num_buckets];
+    for (int i = threadIdx.x; i < num_buckets; i += threadblock_size) {
+        sh_counters[i] = offsets[i + num_buckets * block_id] +
+                         offsets[i + num_buckets * num_blocks];
+    }
+    __syncthreads();
+    const auto base_i = Config::items_per_threadblock * block_id;
+    const auto end = min(base_i + Config::items_per_threadblock, size);
+    for (IndexType i = base_i + threadIdx.x; i < end; i += threadblock_size) {
+        const auto value = *(begin + i);
+        const auto bucket = bucket_op(value);
+        assert(bucket >= 0 && bucket < num_buckets);
+        auto out_pos = atomicAdd(sh_counters + bucket, 1);
+        *(out_begin + out_pos) = value;
+    }
+}
+
+
+}  // namespace kernel
+
+
+template <int num_buckets, typename IndexType, typename InputIterator,
+          typename OutputIterator, typename BucketIndexOp>
+std::array<IndexType, num_buckets + 1> bucket_sort(
+    std::shared_ptr<const DefaultExecutor> exec, InputIterator begin,
+    InputIterator end, OutputIterator out_begin, BucketIndexOp bucket_op,
+    array<IndexType>& tmp)
+{
+    using config = kernel::bucket_sort_config<num_buckets>;
+    const auto size = static_cast<IndexType>(end - begin);
+    std::array<IndexType, num_buckets + 1> offsets{};
+    offsets.back() = size;
+
+    if (size > 0) {
+        const auto num_blocks = static_cast<IndexType>(
+            ceildiv(size, config::items_per_threadblock));
+        const auto tmp_size = (num_blocks + 1) * config::num_buckets;
+        if (tmp.get_size() < tmp_size) {
+            tmp.resize_and_reset(tmp_size);
+        }
+        kernel::bucket_sort_count<config>
+            <<<num_blocks, config::threadblock_size, 0, exec->get_stream()>>>(
+                begin, size, bucket_op, tmp.get_data());
+        kernel::bucket_sort_prefixsum<config, IndexType>
+            <<<1, config::num_buckets, 0, exec->get_stream()>>>(tmp.get_data(),
+                                                                num_blocks);
+        const auto global_offsets = tmp.get_data() + num_buckets * num_blocks;
+        components::prefix_sum_nonnegative(exec, global_offsets, num_buckets);
+        kernel::bucket_sort_distribute<config, IndexType>
+            <<<num_blocks, config::threadblock_size, 0, exec->get_stream()>>>(
+                begin, out_begin, size, num_blocks, bucket_op, tmp.get_data());
+        exec->get_master()->copy_from(exec, num_buckets, global_offsets,
+                                      offsets.data());
+    }
+    return offsets;
+}
+
+
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/test/components/sorting.cu b/cuda/test/components/sorting.cu
index 0cc54e5904e..ba5a925b764 100644
--- a/cuda/test/components/sorting.cu
+++ b/cuda/test/components/sorting.cu
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -12,6 +12,7 @@
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
+#include "core/base/index_range.hpp"
 #include "cuda/test/utils.hpp"
 
 
@@ -103,4 +104,58 @@ TEST_F(Sorting, CudaBitonicSortShared)
 }
 
 
+constexpr auto num_buckets = 10;
+
+std::array<int, num_buckets + 1> run_bucketsort(
+    std::shared_ptr<const gko::CudaExecutor> exec, const int* input, int size,
+    int* output, gko::array<int>& tmp)
+{
+    return gko::kernels::cuda::bucket_sort<num_buckets>(
+        exec, input, input + size, output,
+        [] __device__(int i) { return i / 100; }, tmp);
+}
+
+TEST_F(Sorting, BucketSort)
+{
+    for (int size : {0, 1, 10, 100, 1000, 10000, 100000}) {
+        SCOPED_TRACE(size);
+        const auto proj = [](auto i) { return i / 100; };
+        const auto comp = [&proj](auto a, auto b) { return proj(a) < proj(b); };
+        gko::array<int> data{ref, static_cast<gko::size_type>(size)};
+        std::uniform_int_distribution<int> dist{0, num_buckets * 100 - 1};
+        for (auto i : gko::irange{size}) {
+            data.get_data()[i] = dist(rng);
+        }
+        data.set_executor(exec);
+        gko::array<int> out_data{exec, static_cast<gko::size_type>(size)};
+        gko::array<int> tmp{exec};
+
+        auto offsets = run_bucketsort(exec, data.get_const_data(), size,
+                                      out_data.get_data(), tmp);
+
+        data.set_executor(ref);
+        out_data.set_executor(ref);
+        const auto out_data_ptr = out_data.get_data();
+        const auto data_ptr = data.get_data();
+        // the output must be sorted by bucket
+        ASSERT_TRUE(std::is_sorted(out_data_ptr, out_data_ptr + size, comp));
+        // the output offsets must describe the bucket ranges
+        for (int bucket = 0; bucket < num_buckets; bucket++) {
+            const auto bucket_begin = offsets[bucket];
+            const auto bucket_end = offsets[bucket + 1];
+            ASSERT_LE(bucket_begin, bucket_end);
+            for (const auto i : gko::irange{bucket_begin, bucket_end}) {
+                ASSERT_EQ(proj(out_data_ptr[i]), bucket);
+            }
+        }
+        // inside each bucket, the input and output data must be the same
+        std::sort(data_ptr, data_ptr + size);
+        std::sort(out_data_ptr, out_data_ptr + size);
+        std::stable_sort(data_ptr, data_ptr + size, comp);
+        std::stable_sort(out_data_ptr, out_data_ptr + size, comp);
+        GKO_ASSERT_ARRAY_EQ(data, out_data);
+    }
+}
+
+
 }  // namespace
diff --git a/hip/test/components/sorting.hip.cpp b/hip/test/components/sorting.hip.cpp
index 653a0f536eb..173cf40385f 100644
--- a/hip/test/components/sorting.hip.cpp
+++ b/hip/test/components/sorting.hip.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -12,6 +12,7 @@
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
+#include "core/base/index_range.hpp"
 #include "hip/test/utils.hip.hpp"
 
 
@@ -104,4 +105,58 @@ TEST_F(Sorting, HipBitonicSortShared)
 }
 
 
+constexpr auto num_buckets = 10;
+
+std::array<int, num_buckets + 1> run_bucketsort(
+    std::shared_ptr<const gko::HipExecutor> exec, const int* input, int size,
+    int* output, gko::array<int>& tmp)
+{
+    return gko::kernels::hip::bucket_sort<num_buckets>(
+        exec, input, input + size, output,
+        [] __device__(int i) { return i / 100; }, tmp);
+}
+
+TEST_F(Sorting, BucketSort)
+{
+    for (int size : {0, 1, 10, 100, 1000, 10000, 100000}) {
+        SCOPED_TRACE(size);
+        const auto proj = [](auto i) { return i / 100; };
+        const auto comp = [&proj](auto a, auto b) { return proj(a) < proj(b); };
+        gko::array<int> data{ref, static_cast<gko::size_type>(size)};
+        std::uniform_int_distribution<int> dist{0, num_buckets * 100 - 1};
+        for (auto i : gko::irange{size}) {
+            data.get_data()[i] = dist(rng);
+        }
+        data.set_executor(exec);
+        gko::array<int> out_data{exec, static_cast<gko::size_type>(size)};
+        gko::array<int> tmp{exec};
+
+        auto offsets = run_bucketsort(exec, data.get_const_data(), size,
+                                      out_data.get_data(), tmp);
+
+        data.set_executor(ref);
+        out_data.set_executor(ref);
+        const auto out_data_ptr = out_data.get_data();
+        const auto data_ptr = data.get_data();
+        // the output must be sorted by bucket
+        ASSERT_TRUE(std::is_sorted(out_data_ptr, out_data_ptr + size, comp));
+        // the output offsets must describe the bucket ranges
+        for (int bucket = 0; bucket < num_buckets; bucket++) {
+            const auto bucket_begin = offsets[bucket];
+            const auto bucket_end = offsets[bucket + 1];
+            ASSERT_LE(bucket_begin, bucket_end);
+            for (const auto i : gko::irange{bucket_begin, bucket_end}) {
+                ASSERT_EQ(proj(out_data_ptr[i]), bucket);
+            }
+        }
+        // inside each bucket, the input and output data must be the same
+        std::sort(data_ptr, data_ptr + size);
+        std::sort(out_data_ptr, out_data_ptr + size);
+        std::stable_sort(data_ptr, data_ptr + size, comp);
+        std::stable_sort(out_data_ptr, out_data_ptr + size, comp);
+        GKO_ASSERT_ARRAY_EQ(data, out_data);
+    }
+}
+
+
 }  // namespace
diff --git a/omp/components/sorting.hpp b/omp/components/sorting.hpp
new file mode 100644
index 00000000000..56277110fe2
--- /dev/null
+++ b/omp/components/sorting.hpp
@@ -0,0 +1,79 @@
+// SPDX-FileCopyrightText: 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <numeric>
+
+#include <omp.h>
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+namespace gko {
+namespace kernels {
+namespace omp {
+
+
+template <int num_buckets, typename InputIterator, typename OutputIterator,
+          typename BucketIndexOp>
+std::array<int64, num_buckets + 1> bucket_sort(InputIterator begin,
+                                               InputIterator end,
+                                               OutputIterator out_begin,
+                                               BucketIndexOp bucket_op,
+                                               gko::array<int64>& tmp)
+{
+    using index_type =
+        typename std::iterator_traits<InputIterator>::difference_type;
+    const auto size = end - begin;
+    const auto num_threads = omp_get_max_threads();
+    const auto tmp_size = num_threads * num_buckets;
+    if (tmp.get_size() < tmp_size) {
+        tmp.resize_and_reset(tmp_size);
+    }
+    const auto sums = tmp.get_data();
+    std::fill_n(sums, tmp_size, 0);
+    std::array<int64, num_buckets + 1> global_offsets{};
+#pragma omp parallel
+    {
+        const auto tid = omp_get_thread_num();
+        auto counts = sums + tid * num_buckets;
+#pragma omp for
+        for (index_type i = 0; i < size; i++) {
+            const auto value = *(begin + i);
+            const auto bucket = bucket_op(value);
+            assert(bucket >= 0);
+            assert(bucket < num_buckets);
+            counts[bucket]++;
+        }
+#pragma omp barrier
+#pragma omp single
+        {
+            std::array<int64, num_buckets> offsets{};
+            for (int tid = 0; tid < num_threads; tid++) {
+                for (int i = 0; i < num_buckets; i++) {
+                    const auto value = sums[tid * num_buckets + i];
+                    sums[tid * num_buckets + i] = offsets[i];
+                    offsets[i] += value;
+                }
+            }
+            std::copy_n(offsets.begin(), num_buckets, global_offsets.begin());
+            std::exclusive_scan(global_offsets.begin(), global_offsets.end(),
+                                global_offsets.begin(), index_type{});
+        }
+#pragma omp for
+        for (index_type i = 0; i < size; i++) {
+            const auto value = *(begin + i);
+            const auto bucket = bucket_op(value);
+            assert(bucket >= 0);
+            assert(bucket < num_buckets);
+            const auto output_pos = counts[bucket]++ + global_offsets[bucket];
+            *(out_begin + output_pos) = value;
+        }
+    }
+    return global_offsets;
+}
+
+
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/test/CMakeLists.txt b/omp/test/CMakeLists.txt
index b16882cfaf6..2102026009f 100644
--- a/omp/test/CMakeLists.txt
+++ b/omp/test/CMakeLists.txt
@@ -1,4 +1,5 @@
 include(${PROJECT_SOURCE_DIR}/cmake/create_test.cmake)
 
 add_subdirectory(base)
+add_subdirectory(components)
 add_subdirectory(matrix)
diff --git a/omp/test/components/CMakeLists.txt b/omp/test/components/CMakeLists.txt
new file mode 100644
index 00000000000..191a7cdc392
--- /dev/null
+++ b/omp/test/components/CMakeLists.txt
@@ -0,0 +1 @@
+ginkgo_create_omp_test(sorting)
diff --git a/omp/test/components/sorting.cpp b/omp/test/components/sorting.cpp
new file mode 100644
index 00000000000..9931320b0b6
--- /dev/null
+++ b/omp/test/components/sorting.cpp
@@ -0,0 +1,63 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "omp/components/sorting.hpp"
+
+#include <memory>
+#include <random>
+
+#include <gtest/gtest.h>
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+#include "core/base/index_range.hpp"
+
+
+class Sorting : public ::testing::Test {
+protected:
+    Sorting() : exec(gko::OmpExecutor::create()), rng{729854} {}
+
+    std::shared_ptr<const gko::OmpExecutor> exec;
+    std::default_random_engine rng;
+};
+
+
+TEST_F(Sorting, BucketSort)
+{
+    constexpr auto num_buckets = 10;
+    for (int size : {0, 1, 10, 100, 1000, 10000, 100000}) {
+        SCOPED_TRACE(size);
+        const auto proj = [](auto i) { return i / 100; };
+        const auto comp = [&proj](auto a, auto b) { return proj(a) < proj(b); };
+        std::vector<int> data(size);
+        std::uniform_int_distribution<int> dist{0, num_buckets * 100 - 1};
+        for (auto& value : data) {
+            value = dist(rng);
+        }
+        std::vector<int> out_data(size);
+        gko::array<gko::int64> tmp{exec};
+
+        auto offsets = gko::kernels::omp::bucket_sort<num_buckets>(
+            data.cbegin(), data.cend(), out_data.begin(), proj, tmp);
+
+        // the output must be sorted by bucket
+        ASSERT_TRUE(std::is_sorted(out_data.begin(), out_data.end(), comp));
+        // the output offsets must describe the bucket ranges
+        for (int bucket = 0; bucket < num_buckets; bucket++) {
+            const auto bucket_begin = offsets[bucket];
+            const auto bucket_end = offsets[bucket + 1];
+            ASSERT_LE(bucket_begin, bucket_end);
+            for (const auto i : gko::irange{bucket_begin, bucket_end}) {
+                ASSERT_EQ(proj(out_data[i]), bucket);
+            }
+        }
+        // inside each bucket, the input and output data must be the same
+        std::sort(data.begin(), data.end());
+        std::sort(out_data.begin(), out_data.end());
+        std::stable_sort(data.begin(), data.end(), comp);
+        std::stable_sort(out_data.begin(), out_data.end(), comp);
+        ASSERT_EQ(data, out_data);
+    }
+}