Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 1 addition & 14 deletions cpp/include/cudf/detail/cuco_helpers.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -29,17 +29,4 @@ static cudf::size_type constexpr CUDF_SIZE_TYPE_SENTINEL = -1;
/// Default load factor for cuco data structures
static double constexpr CUCO_DESIRED_LOAD_FACTOR = 0.5;

/**
* @brief Stream-ordered allocator adaptor used for cuco data structures
*
* The stream-ordered `rmm::mr::polymorphic_allocator` cannot be used in `cuco` directly since the
* later expects a standard C++ `Allocator` interface. This allocator helper provides a simple way
* to handle cuco memory allocation/deallocation with the given `stream` and the rmm default memory
* resource.
*
* @tparam T The allocator's value type.
*/
template <typename T>
using cuco_allocator = rmm::mr::stream_allocator_adaptor<rmm::mr::polymorphic_allocator<T>>;

} // namespace cudf::detail
2 changes: 1 addition & 1 deletion cpp/include/cudf/detail/join/distinct_hash_join.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ class distinct_hash_join {
cuda::thread_scope_device,
always_not_equal,
probing_scheme_type,
cudf::detail::cuco_allocator<char>,
rmm::mr::polymorphic_allocator<char>,
cuco_storage_type>;

bool _has_nested_columns; ///< True if nested columns are present in build and probe tables
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/cudf/detail/join/filtered_join.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ class filtered_join {
cuco::bucket_storage<key,
1, /// fixing bucket size to be 1 i.e each thread handles one slot
cuco::extent<cudf::size_type>,
cudf::detail::cuco_allocator<char>>;
rmm::mr::polymorphic_allocator<char>>;

// Hasher for primitive row types
using primitive_row_hasher =
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/cudf/detail/join/hash_join.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ struct hash_join {
cuda::thread_scope_device,
always_not_equal,
cuco::double_hashing<DEFAULT_JOIN_CG_SIZE, hasher1, hasher2>,
cudf::detail::cuco_allocator<char>,
rmm::mr::polymorphic_allocator<char>,
cuco::storage<2>>;

hash_join() = delete;
Expand Down
20 changes: 10 additions & 10 deletions cpp/src/groupby/hash/compute_groupby.cu
Original file line number Diff line number Diff line change
Expand Up @@ -109,16 +109,16 @@ std::unique_ptr<table> compute_groupby(table_view const& keys,
return hashes;
}();

auto set = cuco::static_set{
cuco::extent<int64_t>{static_cast<int64_t>(num_keys)},
cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% load factor
cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
d_row_equal,
probing_scheme_t{row_hasher_with_cache_t{d_row_hash, cached_hashes.data()}},
cuco::thread_scope_device,
cuco::storage<GROUPBY_BUCKET_SIZE>{},
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
stream.value()};
auto set =
cuco::static_set{cuco::extent<int64_t>{static_cast<int64_t>(num_keys)},
cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% load factor
cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
d_row_equal,
probing_scheme_t{row_hasher_with_cache_t{d_row_hash, cached_hashes.data()}},
cuco::thread_scope_device,
cuco::storage<GROUPBY_BUCKET_SIZE>{},
rmm::mr::polymorphic_allocator<char>{},
stream.value()};

auto const gather_keys = [&](auto const& gather_map) {
return cudf::detail::gather(keys,
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/groupby/hash/helpers.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -98,15 +98,15 @@ using global_set_t = cuco::static_set<cudf::size_type,
cuda::thread_scope_device,
row_comparator_t,
probing_scheme_t,
cudf::detail::cuco_allocator<char>,
rmm::mr::polymorphic_allocator<char>,
cuco::storage<GROUPBY_BUCKET_SIZE>>;

using nullable_global_set_t = cuco::static_set<cudf::size_type,
cuco::extent<int64_t>,
cuda::thread_scope_device,
nullable_row_comparator_t,
probing_scheme_t,
cudf::detail::cuco_allocator<char>,
rmm::mr::polymorphic_allocator<char>,
cuco::storage<GROUPBY_BUCKET_SIZE>>;

template <typename Op>
Expand Down
54 changes: 26 additions & 28 deletions cpp/src/io/json/json_tree.cu
Original file line number Diff line number Diff line change
Expand Up @@ -553,15 +553,14 @@ std::pair<size_t, rmm::device_uvector<size_type>> remapped_field_nodes_after_uni

using hasher_type = decltype(d_hasher);
constexpr size_type empty_node_index_sentinel = -1;
auto key_set = cuco::static_set{
cuco::extent{compute_hash_table_size(num_keys)},
cuco::empty_key{empty_node_index_sentinel},
d_equal,
cuco::linear_probing<1, hasher_type>{d_hasher},
{},
{},
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
stream.value()};
auto key_set = cuco::static_set{cuco::extent{compute_hash_table_size(num_keys)},
cuco::empty_key{empty_node_index_sentinel},
d_equal,
cuco::linear_probing<1, hasher_type>{d_hasher},
{},
{},
rmm::mr::polymorphic_allocator<char>{},
stream.value()};
auto const counting_iter = thrust::make_counting_iterator<size_type>(0);
rmm::device_uvector<size_type> found_keys(num_keys, stream);
key_set.insert_and_find_async(counting_iter,
Expand Down Expand Up @@ -629,15 +628,15 @@ rmm::device_uvector<size_type> hash_node_type_with_field_name(device_span<Symbol

using hasher_type = decltype(d_hasher);
constexpr size_type empty_node_index_sentinel = -1;
auto key_set = cuco::static_set{
cuco::extent{compute_hash_table_size(num_fields, 40)}, // 40% occupancy
cuco::empty_key{empty_node_index_sentinel},
d_equal,
cuco::linear_probing<1, hasher_type>{d_hasher},
{},
{},
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
stream.value()};
auto key_set =
cuco::static_set{cuco::extent{compute_hash_table_size(num_fields, 40)}, // 40% occupancy
cuco::empty_key{empty_node_index_sentinel},
d_equal,
cuco::linear_probing<1, hasher_type>{d_hasher},
{},
{},
rmm::mr::polymorphic_allocator<char>{},
stream.value()};
key_set.insert_if_async(counting_iter,
counting_iter + num_nodes,
thrust::counting_iterator<size_type>(0), // stencil
Expand All @@ -659,7 +658,7 @@ rmm::device_uvector<size_type> hash_node_type_with_field_name(device_span<Symbol
cuco::linear_probing<1, hasher_type3>{hasher_type3{}},
{},
{},
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
rmm::mr::polymorphic_allocator<char>{},
stream.value()};
};
if (!is_enabled_experimental) { return std::pair{false, make_map(0)}; }
Expand Down Expand Up @@ -864,15 +863,14 @@ std::pair<rmm::device_uvector<size_type>, rmm::device_uvector<size_type>> hash_n
constexpr size_type empty_node_index_sentinel = -1;
using hasher_type = decltype(d_hashed_cache);

auto key_set = cuco::static_set{
cuco::extent{compute_hash_table_size(num_nodes)},
cuco::empty_key<cudf::size_type>{empty_node_index_sentinel},
d_equal,
cuco::linear_probing<1, hasher_type>{d_hashed_cache},
{},
{},
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
stream.value()};
auto key_set = cuco::static_set{cuco::extent{compute_hash_table_size(num_nodes)},
cuco::empty_key<cudf::size_type>{empty_node_index_sentinel},
d_equal,
cuco::linear_probing<1, hasher_type>{d_hashed_cache},
{},
{},
rmm::mr::polymorphic_allocator<char>{},
stream.value()};

// insert and convert node ids to unique set ids
auto nodes_itr = thrust::make_counting_iterator<size_type>(0);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/orc/orc_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ auto constexpr occupancy_factor = 1.43f; ///< cuCollections suggests using a ha
using storage_type = cuco::bucket_storage<slot_type,
bucket_size,
cuco::extent<std::size_t>,
cudf::detail::cuco_allocator<char>>;
rmm::mr::polymorphic_allocator<char>>;
using storage_ref_type = typename storage_type::ref_type;

auto constexpr KEY_SENTINEL = size_type{-1};
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/io/orc/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2166,8 +2166,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table,

// Create a single bulk storage to use for all sub-dictionaries
auto map_storage = std::make_unique<storage_type>(
total_map_storage_size,
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream});
total_map_storage_size, rmm::mr::polymorphic_allocator<char>{}, stream.value());

// Initialize stripe dictionaries
for (auto col_idx : orc_table.string_column_indices) {
Expand Down
16 changes: 6 additions & 10 deletions cpp/src/io/parquet/experimental/deletion_vectors.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
* limitations under the License.
*/

#include <cudf/detail/cuco_helpers.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/io/experimental/deletion_vectors.hpp>
#include <cudf/stream_compaction.hpp>
Expand All @@ -25,6 +24,7 @@
#include <rmm/device_buffer.hpp>
#include <rmm/exec_policy.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/polymorphic_allocator.hpp>

#include <cuco/roaring_bitmap.cuh>
#include <cuda/functional>
Expand All @@ -40,7 +40,7 @@ namespace cudf::io::parquet::experimental {

// Type alias for the cuco 64-bit roaring bitmap
using roaring_bitmap_type =
cuco::experimental::roaring_bitmap<cuda::std::uint64_t, cudf::detail::cuco_allocator<char>>;
cuco::experimental::roaring_bitmap<cuda::std::uint64_t, rmm::mr::polymorphic_allocator<char>>;

namespace {

Expand Down Expand Up @@ -299,9 +299,7 @@ chunked_parquet_reader::chunked_parquet_reader(

if (not serialized_roaring64.empty()) {
_deletion_vector = std::make_unique<roaring_bitmap_impl>(
serialized_roaring64.data(),
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, _stream.value()},
_stream);
serialized_roaring64.data(), rmm::mr::polymorphic_allocator<char>{}, _stream);
}
}

Expand All @@ -311,7 +309,7 @@ chunked_parquet_reader::chunked_parquet_reader(
struct chunked_parquet_reader::roaring_bitmap_impl {
roaring_bitmap_type roaring_bitmap;
roaring_bitmap_impl(cuda::std::byte const* const serialized_roaring64_data,
cudf::detail::cuco_allocator<char> const& allocator,
rmm::mr::polymorphic_allocator<char> const& allocator,
rmm::cuda_stream_view stream)
: roaring_bitmap(serialized_roaring64_data, allocator, stream)
{
Expand Down Expand Up @@ -445,10 +443,8 @@ table_with_metadata read_parquet(parquet_reader_options const& options,
}

// Filter the table using the deletion vector
auto deletion_vector = roaring_bitmap_type(
serialized_roaring64.data(),
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream.value()},
stream);
auto deletion_vector = roaring_bitmap_type{
serialized_roaring64.data(), rmm::mr::polymorphic_allocator<char>{}, stream};
auto row_mask = build_row_mask_column(table_with_index->get_column(0).view(),
deletion_vector,
num_rows,
Expand Down
7 changes: 3 additions & 4 deletions cpp/src/io/parquet/experimental/dictionary_page_filter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ using hasher_type = cudf::hashing::detail::MurmurHash3_x86_32<T>;
using storage_type = cuco::bucket_storage<slot_type,
BUCKET_SIZE,
cuco::extent<std::size_t>,
cudf::detail::cuco_allocator<char>>;
rmm::mr::polymorphic_allocator<char>>;
using storage_ref_type = typename storage_type::ref_type;

/**
Expand Down Expand Up @@ -1105,9 +1105,8 @@ struct dictionary_caster {
auto const total_num_literals = static_cast<cudf::size_type>(literals.size());

// Create a single bulk storage used by all cuco hash sets
auto set_storage = storage_type{
total_set_storage_size,
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream}};
auto set_storage =
storage_type{total_set_storage_size, rmm::mr::polymorphic_allocator<char>{}, stream.value()};

// Initialize storage with the empty key sentinel
set_storage.initialize_async(EMPTY_KEY_SENTINEL, {stream.value()});
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/parquet_gpu.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ auto constexpr SCOPE = cuda::thread_scope_block;
using storage_type = cuco::bucket_storage<slot_type,
bucket_size,
cuco::extent<std::size_t>,
cudf::detail::cuco_allocator<char>>;
rmm::mr::polymorphic_allocator<char>>;
using storage_ref_type = typename storage_type::ref_type;

/**
Expand Down
5 changes: 2 additions & 3 deletions cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1346,9 +1346,8 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
if (total_map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; }

// Create a single bulk storage used by all sub-dictionaries
auto map_storage = storage_type{
total_map_storage_size,
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream}};
auto map_storage =
storage_type{total_map_storage_size, rmm::mr::polymorphic_allocator<char>{}, stream.value()};
// Create a span of non-const map_storage as map_storage_ref takes in a non-const pointer.
device_span<slot_type> const map_storage_data{map_storage.data(), total_map_storage_size};

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/join/distinct_hash_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ distinct_hash_join::distinct_hash_join(cudf::table_view const& build,
{},
cuco::thread_scope_device,
cuco_storage_type{},
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
rmm::mr::polymorphic_allocator<char>{},
stream.value()}
{
CUDF_FUNC_RANGE();
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/join/filtered_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,8 @@ filtered_join::filtered_join(cudf::table_view const& build,
_build{build},
_preprocessed_build{cudf::detail::row::equality::preprocessed_table::create(_build, stream)},
_bucket_storage{cuco::extent<cudf::size_type>{compute_bucket_storage_size(build, load_factor)},
cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream.value()}}
rmm::mr::polymorphic_allocator<char>{},
stream.value()}
{
_bucket_storage.initialize(empty_sentinel_key, stream);
}
Expand Down
21 changes: 10 additions & 11 deletions cpp/src/join/hash_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -516,17 +516,16 @@ hash_join<Hasher>::hash_join(cudf::table_view const& build,
: _has_nulls(has_nulls),
_is_empty{build.num_rows() == 0},
_nulls_equal{compare_nulls},
_hash_table{
cuco::extent{static_cast<size_t>(build.num_rows())},
load_factor,
cuco::empty_key{
cuco::pair{std::numeric_limits<hash_value_type>::max(), cudf::detail::JoinNoneValue}},
{},
{},
{},
{},
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream.value()},
stream.value()},
_hash_table{cuco::extent{static_cast<size_t>(build.num_rows())},
load_factor,
cuco::empty_key{cuco::pair{std::numeric_limits<hash_value_type>::max(),
cudf::detail::JoinNoneValue}},
{},
{},
{},
{},
rmm::mr::polymorphic_allocator<char>{},
stream.value()},
_build{build},
_preprocessed_build{cudf::detail::row::equality::preprocessed_table::create(_build, stream)}
{
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/join/join_common_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ using mixed_multiset_type =
cuda::thread_scope_device,
mixed_join_always_not_equal,
cuco::double_hashing<1, mixed_join_hasher1, mixed_join_hasher2>,
cudf::detail::cuco_allocator<char>,
rmm::mr::polymorphic_allocator<char>,
cuco::storage<2>>;

bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/join/mixed_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ mixed_join_setup_data setup_mixed_join_common(table_view const& left_equality,
{},
{},
{},
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
rmm::mr::polymorphic_allocator<char>{},
stream.value()};

// TODO: To add support for nested columns we will need to flatten in many
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/join/mixed_join_common_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ using hash_set_type =
cuda::thread_scope_device,
double_row_equality_comparator,
cuco::linear_probing<DEFAULT_MIXED_SEMI_JOIN_CG_SIZE, row_hash>,
cudf::detail::cuco_allocator<char>,
rmm::mr::polymorphic_allocator<char>,
cuco::storage<1>>;

// The hash_set_ref_type used by mixed_semi_join kernels for probing.
Expand Down
17 changes: 8 additions & 9 deletions cpp/src/join/mixed_join_semi.cu
Original file line number Diff line number Diff line change
Expand Up @@ -153,15 +153,14 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
auto const equality_build_conditional =
row_comparator_conditional_build.equal_to<false>(build_nulls, compare_nulls);

hash_set_type row_set{
{compute_hash_table_size(build.num_rows())},
cuco::empty_key{JoinNoneValue},
{equality_build_equality, equality_build_conditional},
{row_hash_build.device_hasher(build_nulls)},
{},
{},
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
{stream.value()}};
hash_set_type row_set{{compute_hash_table_size(build.num_rows())},
cuco::empty_key{JoinNoneValue},
{equality_build_equality, equality_build_conditional},
{row_hash_build.device_hasher(build_nulls)},
{},
{},
rmm::mr::polymorphic_allocator<char>{},
{stream.value()}};

auto iter = thrust::make_counting_iterator(0);

Expand Down
Loading