|
| 1 | +/* |
| 2 | + * Copyright (c) 2025, NVIDIA CORPORATION. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | + |
| 17 | +#include "hybrid_scan_impl.hpp" |
| 18 | + |
| 19 | +#include <cudf/io/experimental/hybrid_scan.hpp> |
| 20 | +#include <cudf/utilities/error.hpp> |
| 21 | + |
| 22 | +#include <thrust/host_vector.h> |
| 23 | + |
| 24 | +namespace cudf::io::parquet::experimental { |
| 25 | + |
| 26 | +hybrid_scan_reader::hybrid_scan_reader(cudf::host_span<uint8_t const> footer_bytes, |
| 27 | + parquet_reader_options const& options) |
| 28 | + : _impl{std::make_unique<detail::hybrid_scan_reader_impl>(footer_bytes, options)} |
| 29 | +{ |
| 30 | +} |
| 31 | + |
| 32 | +hybrid_scan_reader::~hybrid_scan_reader() = default; |
| 33 | + |
| 34 | +[[nodiscard]] text::byte_range_info hybrid_scan_reader::page_index_byte_range() const |
| 35 | +{ |
| 36 | + return _impl->page_index_byte_range(); |
| 37 | +} |
| 38 | + |
| 39 | +[[nodiscard]] FileMetaData hybrid_scan_reader::parquet_metadata() const |
| 40 | +{ |
| 41 | + return _impl->parquet_metadata(); |
| 42 | +} |
| 43 | + |
| 44 | +void hybrid_scan_reader::setup_page_index(cudf::host_span<uint8_t const> page_index_bytes) const |
| 45 | +{ |
| 46 | + return _impl->setup_page_index(page_index_bytes); |
| 47 | +} |
| 48 | + |
| 49 | +std::vector<cudf::size_type> hybrid_scan_reader::all_row_groups( |
| 50 | + parquet_reader_options const& options) const |
| 51 | +{ |
| 52 | + CUDF_EXPECTS(options.get_row_groups().size() <= 1, |
| 53 | + "Encountered invalid size of row group indices in parquet reader options"); |
| 54 | + |
| 55 | + // If row groups are specified in parquet reader options, return them as is |
| 56 | + if (options.get_row_groups().size() == 1) { return options.get_row_groups().front(); } |
| 57 | + |
| 58 | + return _impl->all_row_groups(options); |
| 59 | +} |
| 60 | + |
| 61 | +std::vector<cudf::size_type> hybrid_scan_reader::filter_row_groups_with_stats( |
| 62 | + cudf::host_span<size_type const> row_group_indices, |
| 63 | + parquet_reader_options const& options, |
| 64 | + rmm::cuda_stream_view stream) const |
| 65 | +{ |
| 66 | + // Temporary vector with row group indices from the first source |
| 67 | + auto const input_row_group_indices = |
| 68 | + std::vector<std::vector<size_type>>{{row_group_indices.begin(), row_group_indices.end()}}; |
| 69 | + |
| 70 | + return _impl->filter_row_groups_with_stats(input_row_group_indices, options, stream).front(); |
| 71 | +} |
| 72 | + |
| 73 | +std::pair<std::vector<text::byte_range_info>, std::vector<text::byte_range_info>> |
| 74 | +hybrid_scan_reader::secondary_filters_byte_ranges( |
| 75 | + cudf::host_span<size_type const> row_group_indices, parquet_reader_options const& options) const |
| 76 | +{ |
| 77 | + // Temporary vector with row group indices from the first source |
| 78 | + auto const input_row_group_indices = |
| 79 | + std::vector<std::vector<size_type>>{{row_group_indices.begin(), row_group_indices.end()}}; |
| 80 | + |
| 81 | + return _impl->secondary_filters_byte_ranges(input_row_group_indices, options); |
| 82 | +} |
| 83 | + |
| 84 | +std::vector<cudf::size_type> hybrid_scan_reader::filter_row_groups_with_dictionary_pages( |
| 85 | + cudf::host_span<rmm::device_buffer> dictionary_page_data, |
| 86 | + cudf::host_span<size_type const> row_group_indices, |
| 87 | + parquet_reader_options const& options, |
| 88 | + rmm::cuda_stream_view stream) const |
| 89 | +{ |
| 90 | + CUDF_EXPECTS(row_group_indices.size() == dictionary_page_data.size(), |
| 91 | + "Mismatch in size of input row group indices and dictionary page device buffers"); |
| 92 | + |
| 93 | + // Temporary vector with row group indices from the first source |
| 94 | + auto const input_row_group_indices = |
| 95 | + std::vector<std::vector<size_type>>{{row_group_indices.begin(), row_group_indices.end()}}; |
| 96 | + |
| 97 | + return _impl |
| 98 | + ->filter_row_groups_with_dictionary_pages( |
| 99 | + dictionary_page_data, input_row_group_indices, options, stream) |
| 100 | + .front(); |
| 101 | +} |
| 102 | + |
| 103 | +std::vector<cudf::size_type> hybrid_scan_reader::filter_row_groups_with_bloom_filters( |
| 104 | + cudf::host_span<rmm::device_buffer> bloom_filter_data, |
| 105 | + cudf::host_span<size_type const> row_group_indices, |
| 106 | + parquet_reader_options const& options, |
| 107 | + rmm::cuda_stream_view stream) const |
| 108 | +{ |
| 109 | + CUDF_EXPECTS(row_group_indices.size() == bloom_filter_data.size(), |
| 110 | + "Mismatch in size of input row group indices and bloom filter device buffers"); |
| 111 | + |
| 112 | + // Temporary vector with row group indices from the first source |
| 113 | + auto const input_row_group_indices = |
| 114 | + std::vector<std::vector<size_type>>{{row_group_indices.begin(), row_group_indices.end()}}; |
| 115 | + |
| 116 | + return _impl |
| 117 | + ->filter_row_groups_with_bloom_filters( |
| 118 | + bloom_filter_data, input_row_group_indices, options, stream) |
| 119 | + .front(); |
| 120 | +} |
| 121 | + |
| 122 | +std::pair<std::unique_ptr<cudf::column>, std::vector<thrust::host_vector<bool>>> |
| 123 | +hybrid_scan_reader::filter_data_pages_with_stats(cudf::host_span<size_type const> row_group_indices, |
| 124 | + parquet_reader_options const& options, |
| 125 | + rmm::cuda_stream_view stream, |
| 126 | + rmm::device_async_resource_ref mr) const |
| 127 | +{ |
| 128 | + // Temporary vector with row group indices from the first source |
| 129 | + auto const input_row_group_indices = |
| 130 | + std::vector<std::vector<size_type>>{{row_group_indices.begin(), row_group_indices.end()}}; |
| 131 | + |
| 132 | + return _impl->filter_data_pages_with_stats(input_row_group_indices, options, stream, mr); |
| 133 | +} |
| 134 | + |
| 135 | +[[nodiscard]] std::vector<text::byte_range_info> |
| 136 | +hybrid_scan_reader::filter_column_chunks_byte_ranges( |
| 137 | + cudf::host_span<size_type const> row_group_indices, parquet_reader_options const& options) const |
| 138 | +{ |
| 139 | + // Temporary vector with row group indices from the first source |
| 140 | + auto const input_row_group_indices = |
| 141 | + std::vector<std::vector<size_type>>{{row_group_indices.begin(), row_group_indices.end()}}; |
| 142 | + |
| 143 | + return _impl->filter_column_chunks_byte_ranges(input_row_group_indices, options).first; |
| 144 | +} |
| 145 | + |
| 146 | +table_with_metadata hybrid_scan_reader::materialize_filter_columns( |
| 147 | + cudf::host_span<thrust::host_vector<bool> const> data_page_mask, |
| 148 | + cudf::host_span<size_type const> row_group_indices, |
| 149 | + std::vector<rmm::device_buffer> column_chunk_buffers, |
| 150 | + cudf::mutable_column_view row_mask, |
| 151 | + parquet_reader_options const& options, |
| 152 | + rmm::cuda_stream_view stream) const |
| 153 | +{ |
| 154 | + // Temporary vector with row group indices from the first source |
| 155 | + auto const input_row_group_indices = |
| 156 | + std::vector<std::vector<size_type>>{{row_group_indices.begin(), row_group_indices.end()}}; |
| 157 | + |
| 158 | + return _impl->materialize_filter_columns(data_page_mask, |
| 159 | + input_row_group_indices, |
| 160 | + std::move(column_chunk_buffers), |
| 161 | + row_mask, |
| 162 | + options, |
| 163 | + stream); |
| 164 | +} |
| 165 | + |
| 166 | +[[nodiscard]] std::vector<text::byte_range_info> |
| 167 | +hybrid_scan_reader::payload_column_chunks_byte_ranges( |
| 168 | + cudf::host_span<size_type const> row_group_indices, parquet_reader_options const& options) const |
| 169 | +{ |
| 170 | + auto const input_row_group_indices = |
| 171 | + std::vector<std::vector<size_type>>{{row_group_indices.begin(), row_group_indices.end()}}; |
| 172 | + |
| 173 | + return _impl->payload_column_chunks_byte_ranges(input_row_group_indices, options).first; |
| 174 | +} |
| 175 | + |
| 176 | +table_with_metadata hybrid_scan_reader::materialize_payload_columns( |
| 177 | + cudf::host_span<size_type const> row_group_indices, |
| 178 | + std::vector<rmm::device_buffer> column_chunk_buffers, |
| 179 | + cudf::column_view row_mask, |
| 180 | + parquet_reader_options const& options, |
| 181 | + rmm::cuda_stream_view stream) const |
| 182 | +{ |
| 183 | + // Temporary vector with row group indices from the first source |
| 184 | + auto const input_row_group_indices = |
| 185 | + std::vector<std::vector<size_type>>{{row_group_indices.begin(), row_group_indices.end()}}; |
| 186 | + |
| 187 | + return _impl->materialize_payload_columns( |
| 188 | + input_row_group_indices, std::move(column_chunk_buffers), row_mask, options, stream); |
| 189 | +} |
| 190 | + |
| 191 | +} // namespace cudf::io::parquet::experimental |
0 commit comments