Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 132 additions & 1 deletion common/cuda_hip/components/sorting.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

Expand All @@ -8,6 +8,8 @@

#include "common/cuda_hip/base/config.hpp"
#include "common/cuda_hip/components/cooperative_groups.hpp"
#include "core/base/index_range.hpp"
#include "core/components/prefix_sum_kernels.hpp"


namespace gko {
Expand Down Expand Up @@ -310,6 +312,135 @@ __forceinline__ __device__ void bitonic_sort(ValueType* local_elements,
}


namespace kernel {


template <int NumBuckets>
struct bucket_sort_config {
constexpr static int num_buckets = NumBuckets;
constexpr static int items_per_thread = 16;
constexpr static int threadblock_size = 512;
constexpr static int items_per_threadblock =
items_per_thread * threadblock_size;
};


template <typename Config, typename IndexType, typename InputIterator,
typename BucketIndexOp>
__global__ __launch_bounds__(Config::threadblock_size) void bucket_sort_count(
InputIterator begin, IndexType size, BucketIndexOp bucket_op,
IndexType* counters)
{
constexpr auto num_buckets = Config::num_buckets;
constexpr auto threadblock_size = Config::threadblock_size;
const auto block_id = static_cast<IndexType>(blockIdx.x);
__shared__ int sh_counters[num_buckets];
for (int i = threadIdx.x; i < num_buckets; i += threadblock_size) {
sh_counters[i] = 0;
}
__syncthreads();
const auto base_i = Config::items_per_threadblock * block_id;
const auto end = min(base_i + Config::items_per_threadblock, size);
for (IndexType i = base_i + threadIdx.x; i < end; i += threadblock_size) {
const auto bucket = bucket_op(*(begin + i));
assert(bucket >= 0 && bucket < num_buckets);
atomicAdd(sh_counters + bucket, 1);
}
__syncthreads();
for (int i = threadIdx.x; i < num_buckets; i += threadblock_size) {
counters[i + num_buckets * block_id] = sh_counters[i];
}
}


template <typename Config, typename IndexType>
__global__ __launch_bounds__(Config::num_buckets) void bucket_sort_prefixsum(
IndexType* offsets, IndexType num_blocks)
{
constexpr auto num_buckets = Config::num_buckets;
const auto i = threadIdx.x;
if (i >= num_buckets) {
return;
}
IndexType sum{};
for (IndexType block : irange{num_blocks}) {
const auto idx = i + block * num_buckets;
auto new_value = offsets[idx];
offsets[idx] = sum;
sum += new_value;
}
offsets[i + num_blocks * num_buckets] = sum;
}


template <typename Config, typename IndexType, typename InputIterator,
typename OutputIterator, typename BucketIndexOp>
__global__
__launch_bounds__(Config::threadblock_size) void bucket_sort_distribute(
InputIterator begin, OutputIterator out_begin, IndexType size,
IndexType num_blocks, BucketIndexOp bucket_op, const IndexType* offsets)
{
constexpr auto num_buckets = Config::num_buckets;
constexpr auto threadblock_size = Config::threadblock_size;
const auto block_id = static_cast<IndexType>(blockIdx.x);
__shared__ IndexType sh_counters[num_buckets];
for (int i = threadIdx.x; i < num_buckets; i += threadblock_size) {
sh_counters[i] = offsets[i + num_buckets * block_id] +
offsets[i + num_buckets * num_blocks];
}
__syncthreads();
const auto base_i = Config::items_per_threadblock * block_id;
const auto end = min(base_i + Config::items_per_threadblock, size);
for (IndexType i = base_i + threadIdx.x; i < end; i += threadblock_size) {
const auto value = *(begin + i);
const auto bucket = bucket_op(value);
assert(bucket >= 0 && bucket < num_buckets);
auto out_pos = atomicAdd(sh_counters + bucket, 1);
*(out_begin + out_pos) = value;
}
}


} // namespace kernel


template <int num_buckets, typename IndexType, typename InputIterator,
typename OutputIterator, typename BucketIndexOp>
std::array<IndexType, num_buckets + 1> bucket_sort(
std::shared_ptr<const DefaultExecutor> exec, InputIterator begin,
InputIterator end, OutputIterator out_begin, BucketIndexOp bucket_op,
array<IndexType>& tmp)
{
using config = kernel::bucket_sort_config<num_buckets>;
const auto size = static_cast<IndexType>(end - begin);
std::array<IndexType, num_buckets + 1> offsets{};
offsets.back() = size;

if (size > 0) {
const auto num_blocks = static_cast<IndexType>(
ceildiv(size, config::items_per_threadblock));
const auto tmp_size = (num_blocks + 1) * config::num_buckets;
if (tmp.get_size() < tmp_size) {
tmp.resize_and_reset(tmp_size);
}
kernel::bucket_sort_count<config>
<<<num_blocks, config::threadblock_size, 0, exec->get_stream()>>>(
begin, size, bucket_op, tmp.get_data());
kernel::bucket_sort_prefixsum<config, IndexType>
<<<1, config::num_buckets, 0, exec->get_stream()>>>(tmp.get_data(),
num_blocks);
const auto global_offsets = tmp.get_data() + num_buckets * num_blocks;
components::prefix_sum_nonnegative(exec, global_offsets, num_buckets);
kernel::bucket_sort_distribute<config, IndexType>
<<<num_blocks, config::threadblock_size, 0, exec->get_stream()>>>(
begin, out_begin, size, num_blocks, bucket_op, tmp.get_data());
exec->get_master()->copy_from(exec, num_buckets, global_offsets,
offsets.data());
}
return offsets;
}


} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
Expand Down
57 changes: 56 additions & 1 deletion cuda/test/components/sorting.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

Expand All @@ -12,6 +12,7 @@
#include <ginkgo/core/base/array.hpp>
#include <ginkgo/core/base/executor.hpp>

#include "core/base/index_range.hpp"
#include "cuda/test/utils.hpp"


Expand Down Expand Up @@ -103,4 +104,58 @@ TEST_F(Sorting, CudaBitonicSortShared)
}


constexpr auto num_buckets = 10;

std::array<int, num_buckets + 1> run_bucketsort(
std::shared_ptr<const gko::CudaExecutor> exec, const int* input, int size,
int* output, gko::array<int>& tmp)
{
return gko::kernels::cuda::bucket_sort<num_buckets>(
exec, input, input + size, output,
[] __device__(int i) { return i / 100; }, tmp);
}

TEST_F(Sorting, BucketSort)
{
for (int size : {0, 1, 10, 100, 1000, 10000, 100000}) {
SCOPED_TRACE(size);
const auto proj = [](auto i) { return i / 100; };
const auto comp = [&proj](auto a, auto b) { return proj(a) < proj(b); };
gko::array<int> data{ref, static_cast<gko::size_type>(size)};
std::uniform_int_distribution<int> dist{0, num_buckets * 100 - 1};
for (auto i : gko::irange{size}) {
data.get_data()[i] = dist(rng);
}
data.set_executor(exec);
gko::array<int> out_data{exec, static_cast<gko::size_type>(size)};
gko::array<int> tmp{exec};

auto offsets = run_bucketsort(exec, data.get_const_data(), size,
out_data.get_data(), tmp);

data.set_executor(ref);
out_data.set_executor(ref);
const auto out_data_ptr = out_data.get_data();
const auto data_ptr = data.get_data();
// the output must be sorted by bucket
ASSERT_TRUE(std::is_sorted(out_data_ptr, out_data_ptr + size, comp));
// the output offsets must describe the bucket ranges
for (int bucket = 0; bucket < num_buckets; bucket++) {
const auto bucket_begin = offsets[bucket];
const auto bucket_end = offsets[bucket + 1];
ASSERT_LE(bucket_begin, bucket_end);
for (const auto i : gko::irange{bucket_begin, bucket_end}) {
ASSERT_EQ(proj(out_data_ptr[i]), bucket);
}
}
// inside each bucket, the input and output data must be the same
std::sort(data_ptr, data_ptr + size);
std::sort(out_data_ptr, out_data_ptr + size);
std::stable_sort(data_ptr, data_ptr + size, comp);
std::stable_sort(out_data_ptr, out_data_ptr + size, comp);
GKO_ASSERT_ARRAY_EQ(data, out_data);
}
}


} // namespace
57 changes: 56 additions & 1 deletion hip/test/components/sorting.hip.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

Expand All @@ -12,6 +12,7 @@
#include <ginkgo/core/base/array.hpp>
#include <ginkgo/core/base/executor.hpp>

#include "core/base/index_range.hpp"
#include "hip/test/utils.hip.hpp"


Expand Down Expand Up @@ -104,4 +105,58 @@ TEST_F(Sorting, HipBitonicSortShared)
}


constexpr auto num_buckets = 10;

std::array<int, num_buckets + 1> run_bucketsort(
std::shared_ptr<const gko::HipExecutor> exec, const int* input, int size,
int* output, gko::array<int>& tmp)
{
return gko::kernels::hip::bucket_sort<num_buckets>(
exec, input, input + size, output,
[] __device__(int i) { return i / 100; }, tmp);
}

TEST_F(Sorting, BucketSort)
{
for (int size : {0, 1, 10, 100, 1000, 10000, 100000}) {
SCOPED_TRACE(size);
const auto proj = [](auto i) { return i / 100; };
const auto comp = [&proj](auto a, auto b) { return proj(a) < proj(b); };
gko::array<int> data{ref, static_cast<gko::size_type>(size)};
std::uniform_int_distribution<int> dist{0, num_buckets * 100 - 1};
for (auto i : gko::irange{size}) {
data.get_data()[i] = dist(rng);
}
data.set_executor(exec);
gko::array<int> out_data{exec, static_cast<gko::size_type>(size)};
gko::array<int> tmp{exec};

auto offsets = run_bucketsort(exec, data.get_const_data(), size,
out_data.get_data(), tmp);

data.set_executor(ref);
out_data.set_executor(ref);
const auto out_data_ptr = out_data.get_data();
const auto data_ptr = data.get_data();
// the output must be sorted by bucket
ASSERT_TRUE(std::is_sorted(out_data_ptr, out_data_ptr + size, comp));
// the output offsets must describe the bucket ranges
for (int bucket = 0; bucket < num_buckets; bucket++) {
const auto bucket_begin = offsets[bucket];
const auto bucket_end = offsets[bucket + 1];
ASSERT_LE(bucket_begin, bucket_end);
for (const auto i : gko::irange{bucket_begin, bucket_end}) {
ASSERT_EQ(proj(out_data_ptr[i]), bucket);
}
}
// inside each bucket, the input and output data must be the same
std::sort(data_ptr, data_ptr + size);
std::sort(out_data_ptr, out_data_ptr + size);
std::stable_sort(data_ptr, data_ptr + size, comp);
std::stable_sort(out_data_ptr, out_data_ptr + size, comp);
GKO_ASSERT_ARRAY_EQ(data, out_data);
}
}


} // namespace
79 changes: 79 additions & 0 deletions omp/components/sorting.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// SPDX-FileCopyrightText: 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include <numeric>

#include <omp.h>

#include <ginkgo/core/base/array.hpp>
#include <ginkgo/core/base/types.hpp>

namespace gko {
namespace kernels {
namespace omp {


template <int num_buckets, typename InputIterator, typename OutputIterator,
typename BucketIndexOp>
std::array<int64, num_buckets + 1> bucket_sort(InputIterator begin,
InputIterator end,
OutputIterator out_begin,
BucketIndexOp bucket_op,
gko::array<int64>& tmp)
{
using index_type =
typename std::iterator_traits<InputIterator>::difference_type;
const auto size = end - begin;
const auto num_threads = omp_get_max_threads();
const auto tmp_size = num_threads * num_buckets;
if (tmp.get_size() < tmp_size) {
tmp.resize_and_reset(tmp_size);
}
const auto sums = tmp.get_data();
std::fill_n(sums, tmp_size, 0);
std::array<int64, num_buckets + 1> global_offsets{};
#pragma omp parallel
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#pragma omp parallel
#pragma omp parallel num_threads(num_threads)

{
const auto tid = omp_get_thread_num();
auto counts = sums + tid * num_buckets;
#pragma omp for
for (index_type i = 0; i < size; i++) {
const auto value = *(begin + i);
const auto bucket = bucket_op(value);
assert(bucket >= 0);
assert(bucket < num_buckets);
counts[bucket]++;
}
#pragma omp barrier
#pragma omp single
{
std::array<int64, num_buckets> offsets{};
for (int tid = 0; tid < num_threads; tid++) {
for (int i = 0; i < num_buckets; i++) {
const auto value = sums[tid * num_buckets + i];
sums[tid * num_buckets + i] = offsets[i];
offsets[i] += value;
}
}
std::copy_n(offsets.begin(), num_buckets, global_offsets.begin());
std::exclusive_scan(global_offsets.begin(), global_offsets.end(),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to be unsupported in one of the libstdc++ versions we are using (ROCm 4.5 job), but I think I would rather require a fully C++17 compliant compiler than work around this.

global_offsets.begin(), index_type{});
}
#pragma omp for
for (index_type i = 0; i < size; i++) {
const auto value = *(begin + i);
const auto bucket = bucket_op(value);
assert(bucket >= 0);
assert(bucket < num_buckets);
const auto output_pos = counts[bucket]++ + global_offsets[bucket];
*(out_begin + output_pos) = value;
}
}
return global_offsets;
}


} // namespace omp
} // namespace kernels
} // namespace gko
1 change: 1 addition & 0 deletions omp/test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
include(${PROJECT_SOURCE_DIR}/cmake/create_test.cmake)

add_subdirectory(base)
add_subdirectory(components)
add_subdirectory(matrix)
1 change: 1 addition & 0 deletions omp/test/components/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ginkgo_create_omp_test(sorting)
Loading
Loading