From cdc36851b35e60572e68f9b4ba0753a53c2feb01 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 26 Sep 2025 22:50:27 +0000 Subject: [PATCH 01/22] Passthrough unary ops through Parquet predicate pushdown --- cpp/src/io/parquet/bloom_filter_reader.cu | 25 ++-- .../parquet/experimental/page_index_filter.cu | 11 ++ cpp/src/io/parquet/predicate_pushdown.cpp | 3 + cpp/src/io/parquet/stats_filter_helpers.cpp | 123 ++++++++++-------- cpp/src/io/parquet/stats_filter_helpers.hpp | 2 + cpp/tests/io/parquet_reader_test.cpp | 63 +++++++++ 6 files changed, 163 insertions(+), 64 deletions(-) diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu index 7abceb3ad4d..68fe88dae57 100644 --- a/cpp/src/io/parquet/bloom_filter_reader.cu +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -210,10 +210,12 @@ class bloom_filter_expression_converter : public equality_literals_collector { auto const op = expr.get_operator(); if (auto* v = dynamic_cast(&operands[0].get())) { - // First operand should be column reference, second should be literal. - CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2, - "Only binary operations are supported on column reference"); - CUDF_EXPECTS(dynamic_cast(&operands[1].get()) != nullptr, + // First operand should be column reference, second (if binary operation)should be literal. + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1 or + cudf::ast::detail::ast_operator_arity(op) == 2, + "Only unary and binary operations are supported on column reference"); + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1 or + dynamic_cast(&operands[1].get()) != nullptr, "Second operand of binary operation with column reference must be a literal"); v->accept(*this); @@ -622,17 +624,20 @@ std::reference_wrapper equality_literals_collector::visit auto const op = expr.get_operator(); if (auto* v = dynamic_cast(&operands[0].get())) { - // First operand should be column reference, second should be literal. - CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2, - "Only binary operations are supported on column reference"); - auto const literal_ptr = dynamic_cast(&operands[1].get()); - CUDF_EXPECTS(literal_ptr != nullptr, + // First operand should be column reference, second (if binary operation)should be literal. + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1 or + cudf::ast::detail::ast_operator_arity(op) == 2, + "Only unary and binary operations are supported on column reference"); + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1 or + dynamic_cast(&operands[1].get()) != nullptr, "Second operand of binary operation with column reference must be a literal"); + v->accept(*this); // Push to the corresponding column's literals list iff equality predicate is seen if (op == ast_operator::EQUAL) { - auto const col_idx = v->get_column_index(); + auto const literal_ptr = dynamic_cast(&operands[1].get()); + auto const col_idx = v->get_column_index(); _literals[col_idx].emplace_back(const_cast(literal_ptr)); } } else { diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 216d9a23f55..9f8f31bfaff 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -627,6 +627,17 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag static_cast(output_dtypes.size())} .get_stats_columns_mask(); + // Return early if no columns will participate in stats based page filtering + if (stats_columns_mask.empty()) { + auto row_mask = cudf::make_numeric_column( + data_type{cudf::type_id::BOOL8}, total_rows, rmm::device_buffer{}, 0, stream, mr); + thrust::fill(rmm::exec_policy_nosync(stream), + row_mask->mutable_view().begin(), + row_mask->mutable_view().end(), + true); + return row_mask; + } + // Convert page statistics to a table // where min(col[i]) = columns[i*2], max(col[i])=columns[i*2+1] // For each column, it contains total number of rows from all row groups. diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 01c09f9306e..2695a88c346 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -126,6 +126,9 @@ std::optional>> aggregate_reader_metadata::ap stats_columns_collector{filter.get(), static_cast(output_dtypes.size())} .get_stats_columns_mask(); + // Return early if no columns will participate in stats based filtering + if (stats_columns_mask.empty()) { return std::nullopt; } + // Converts Column chunk statistics to a table // where min(col[i]) = columns[i*2], max(col[i])=columns[i*2+1] // For each column, it contains #sources * #column_chunks_per_src rows diff --git a/cpp/src/io/parquet/stats_filter_helpers.cpp b/cpp/src/io/parquet/stats_filter_helpers.cpp index 7277150314b..5cbbce234b5 100644 --- a/cpp/src/io/parquet/stats_filter_helpers.cpp +++ b/cpp/src/io/parquet/stats_filter_helpers.cpp @@ -63,19 +63,25 @@ std::reference_wrapper stats_columns_collector::visit( auto const op = expr.get_operator(); if (auto* v = dynamic_cast(&operands[0].get())) { - // First operand should be column reference, second should be literal. - CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2, - "Only binary operations are supported on column reference"); - CUDF_EXPECTS(dynamic_cast(&operands[1].get()) != nullptr, + // First operand should be column reference, second (if binary operation)should be literal. + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1 or + cudf::ast::detail::ast_operator_arity(op) == 2, + "Only unary and binary operations are supported on column reference"); + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1 or + dynamic_cast(&operands[1].get()) != nullptr, "Second operand of binary operation with column reference must be a literal"); v->accept(*this); - // If this is a supported operation, mark the column as needed + + // Return early if this is a unary operation + if (cudf::ast::detail::ast_operator_arity(op) == 1) { return expr; } + + // Else if this is a supported binary operation, mark the column as needed if (op == ast_operator::EQUAL or op == ast_operator::NOT_EQUAL or op == ast_operator::LESS or op == ast_operator::LESS_EQUAL or op == ast_operator::GREATER or op == ast_operator::GREATER_EQUAL) { _columns_mask[v->get_column_index()] = true; } else { - CUDF_FAIL("Unsupported operation in Statistics AST"); + CUDF_FAIL("Unsupported binary operation in Statistics AST"); } } else { // Visit the operands and ignore any output as we only want to build the column mask @@ -117,56 +123,65 @@ std::reference_wrapper stats_expression_converter::visit( auto const op = expr.get_operator(); if (auto* v = dynamic_cast(&operands[0].get())) { - // First operand should be column reference, second should be literal. - CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2, - "Only binary operations are supported on column reference"); - CUDF_EXPECTS(dynamic_cast(&operands[1].get()) != nullptr, + // First operand should be column reference, second (if binary operation)should be literal. + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1 or + cudf::ast::detail::ast_operator_arity(op) == 2, + "Only unary and binary operations are supported on column reference"); + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1 or + dynamic_cast(&operands[1].get()) != nullptr, "Second operand of binary operation with column reference must be a literal"); v->accept(*this); - // Push literal into the ast::tree - auto const& literal = _stats_expr.push(*dynamic_cast(&operands[1].get())); - auto const col_index = v->get_column_index(); - switch (op) { - /* transform to stats conditions. op(col, literal) - col1 == val --> vmin <= val && vmax >= val - col1 != val --> !(vmin == val && vmax == val) - col1 > val --> vmax > val - col1 < val --> vmin < val - col1 >= val --> vmax >= val - col1 <= val --> vmin <= val - */ - case ast_operator::EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); - _stats_expr.push(ast::operation{ - ast::ast_operator::LOGICAL_AND, - _stats_expr.push(ast::operation{ast_operator::GREATER_EQUAL, vmax, literal}), - _stats_expr.push(ast::operation{ast_operator::LESS_EQUAL, vmin, literal})}); - break; - } - case ast_operator::NOT_EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); - _stats_expr.push( - ast::operation{ast_operator::LOGICAL_OR, - _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmin, vmax}), - _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmax, literal})}); - break; - } - case ast_operator::LESS: [[fallthrough]]; - case ast_operator::LESS_EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - _stats_expr.push(ast::operation{op, vmin, literal}); - break; - } - case ast_operator::GREATER: [[fallthrough]]; - case ast_operator::GREATER_EQUAL: { - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); - _stats_expr.push(ast::operation{op, vmax, literal}); - break; - } - default: CUDF_FAIL("Unsupported operation in Statistics AST"); - }; + + // For unary operators, push an always true expression + if (cudf::ast::detail::ast_operator_arity(op) == 1) { + _stats_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); + } else { + // Push literal into the ast::tree + auto const& literal = + _stats_expr.push(*dynamic_cast(&operands[1].get())); + auto const col_index = v->get_column_index(); + switch (op) { + /* transform to stats conditions. op(col, literal) + col1 == val --> vmin <= val && vmax >= val + col1 != val --> !(vmin == val && vmax == val) + col1 > val --> vmax > val + col1 < val --> vmin < val + col1 >= val --> vmax >= val + col1 <= val --> vmin <= val + */ + case ast_operator::EQUAL: { + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{ + ast::ast_operator::LOGICAL_AND, + _stats_expr.push(ast::operation{ast_operator::GREATER_EQUAL, vmax, literal}), + _stats_expr.push(ast::operation{ast_operator::LESS_EQUAL, vmin, literal})}); + break; + } + case ast_operator::NOT_EQUAL: { + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{ + ast_operator::LOGICAL_OR, + _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmin, vmax}), + _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmax, literal})}); + break; + } + case ast_operator::LESS: [[fallthrough]]; + case ast_operator::LESS_EQUAL: { + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + _stats_expr.push(ast::operation{op, vmin, literal}); + break; + } + case ast_operator::GREATER: [[fallthrough]]; + case ast_operator::GREATER_EQUAL: { + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{op, vmax, literal}); + break; + } + default: CUDF_FAIL("Unsupported binary operation in Statistics AST"); + }; + } } else { auto new_operands = visit_operands(operands); if (cudf::ast::detail::ast_operator_arity(op) == 2) { diff --git a/cpp/src/io/parquet/stats_filter_helpers.hpp b/cpp/src/io/parquet/stats_filter_helpers.hpp index e91467fd761..efc35535320 100644 --- a/cpp/src/io/parquet/stats_filter_helpers.hpp +++ b/cpp/src/io/parquet/stats_filter_helpers.hpp @@ -311,6 +311,8 @@ class stats_expression_converter : public stats_columns_collector { private: ast::tree _stats_expr; + cudf::numeric_scalar _always_true_scalar{true}; + ast::literal const _always_true{_always_true_scalar}; }; } // namespace cudf::io::parquet::detail diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index e44eb47503c..26e77c25b14 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -2842,6 +2842,69 @@ void filter_typed_test() test_predicate_pushdown( filter_expression, ref_filter, expected_total_row_groups, expected_stats_filtered_row_groups); } + + // Unary operation `IS_NULL` should not filter any row groups + { + auto constexpr expected_total_row_groups = 4; + auto constexpr expected_stats_filtered_row_groups = 4; + + auto const filter_expression = + cudf::ast::operation(cudf::ast::ast_operator::IS_NULL, col_name_0); + auto const ref_filter = cudf::ast::operation(cudf::ast::ast_operator::IS_NULL, col_ref_0); + test_predicate_pushdown( + filter_expression, ref_filter, expected_total_row_groups, expected_stats_filtered_row_groups); + } + + // Unary operation `IS_NULL` should not affect anything when ANDing with another expression, and + // should short circuit when ORing with another expression + { + auto constexpr expected_total_row_groups = 4; + + // Filtering AST + auto literal_value = []() { + if constexpr (cudf::is_timestamp()) { + // table[0] < 10000 timestamp days/seconds/milliseconds/microseconds/nanoseconds + return cudf::timestamp_scalar(T(typename T::duration(10000))); // i (0-20,000) + } else if constexpr (cudf::is_duration()) { + // table[0] < 10000 day/seconds/milliseconds/microseconds/nanoseconds + return cudf::duration_scalar(T(10000)); // i (0-20,000) + } else if constexpr (std::is_same_v) { + // table[0] < "000010000" + return cudf::string_scalar("000010000"); // i (0-20,000) + } else { + // table[0] < 0 or 100u + return cudf::numeric_scalar( + (100 - 100 * std::is_signed_v)); // i/100 (-100-100/ 0-200) + } + }(); + + auto const literal = cudf::ast::literal(literal_value); + auto const expr1 = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_name_0, literal); + auto const expr2 = cudf::ast::operation(cudf::ast::ast_operator::IS_NULL, col_name_0); + + auto const ref_expr1 = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal); + auto const ref_expr2 = cudf::ast::operation(cudf::ast::ast_operator::IS_NULL, col_ref_0); + + // Filter expression AND unary operation + auto filter_expression = + cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr1, expr2); + auto ref_filter = + cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, ref_expr1, ref_expr2); + auto constexpr expected_filtered_row_groups_with_unary_and = 2; + test_predicate_pushdown(filter_expression, + ref_filter, + expected_total_row_groups, + expected_filtered_row_groups_with_unary_and); + + // Filter expression OR unary operation + filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_OR, expr1, expr2); + ref_filter = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_OR, ref_expr1, ref_expr2); + auto constexpr expected_filtered_row_groups_with_unary_or = 4; + test_predicate_pushdown(filter_expression, + ref_filter, + expected_total_row_groups, + expected_filtered_row_groups_with_unary_or); + } } TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTyped) From 0d55bd495a0e96fda23a938c7f02c2cda7c737f0 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 29 Sep 2025 18:29:58 +0000 Subject: [PATCH 02/22] Update gtests --- cpp/tests/io/parquet_reader_test.cpp | 101 ++++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 3 deletions(-) diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index 26e77c25b14..fe5ea8ca3e1 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -2842,8 +2842,97 @@ void filter_typed_test() test_predicate_pushdown( filter_expression, ref_filter, expected_total_row_groups, expected_stats_filtered_row_groups); } +} + +template +void filter_unary_operation_typed_test() +{ + std::mt19937 gen(0xd00dL); + auto [src, filepath, null_count] = [&]() { + auto constexpr num_rows = num_ordered_rows; + auto constexpr row_group_size_rows = num_rows / 4; + auto _col0 = testdata::ascending(); + // Add nulls to col0 + std::bernoulli_distribution bn(0.7f); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); + auto col0 = cudf::purge_nonempty_nulls(*_col0[0]); + auto col1 = testdata::descending(); + auto col2 = testdata::unordered(); + auto const written_table = table_view{{col0, col1, col2}}; + auto const filepath = temp_env->get_temp_filepath("FilterUnaryOperationTyped.parquet"); + { + cudf::io::table_input_metadata expected_metadata(written_table); + expected_metadata.column_metadata[0].set_name("col0"); + expected_metadata.column_metadata[1].set_name("col1"); + expected_metadata.column_metadata[2].set_name("col2"); + + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, written_table) + .metadata(std::move(expected_metadata)) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .row_group_size_rows(row_group_size_rows); + cudf::io::write_parquet(out_opts); + } + + std::vector> columns; + columns.push_back(col0.release()); + columns.push_back(col1.release()); + columns.push_back(col2.release()); + + return std::pair{cudf::table{std::move(columns)}, filepath, null_count}; + }(); + + auto const written_table = src.view(); + auto const col_name_0 = cudf::ast::column_name_reference("col0"); + auto const col_ref_0 = cudf::ast::column_reference(0); + auto const test_predicate_pushdown = [&](cudf::ast::operation const& filter_expression, + cudf::ast::operation const& ref_filter, + cudf::size_type expected_total_row_groups, + cudf::size_type expected_stats_filtered_row_groups, + std::optional expected_num_rows = + std::nullopt) { + // Expected result + auto const predicate = cudf::compute_column(written_table, ref_filter); + EXPECT_EQ(predicate->view().type().id(), cudf::type_id::BOOL8) + << "Predicate filter should return a boolean"; + auto const expected = cudf::apply_boolean_mask(written_table, *predicate); + + // JIT does not support nullness-dependent operators such as IS_NULL + auto constexpr use_jit = false; + + // Reading with Predicate Pushdown + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .filter(filter_expression) + .use_jit_filter(use_jit); + auto const result = cudf::io::read_parquet(read_opts); + auto const result_table = result.tbl->view(); - // Unary operation `IS_NULL` should not filter any row groups + // Tests + if (expected_num_rows.has_value()) { + EXPECT_EQ(expected->num_rows(), expected_num_rows.value()); + EXPECT_EQ(result_table.num_rows(), expected_num_rows.value()); + } + EXPECT_EQ(static_cast(written_table.column(0).type().id()), + static_cast(result_table.column(0).type().id())) + << "col0 type mismatch"; + // To make sure AST filters out some elements if row groups must be filtered + if (expected_stats_filtered_row_groups < expected_total_row_groups) { + EXPECT_LT(expected->num_rows(), written_table.num_rows()); + } else { + EXPECT_LE(expected->num_rows(), written_table.num_rows()); + } + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result_table); + EXPECT_EQ(result.metadata.num_input_row_groups, expected_total_row_groups); + EXPECT_TRUE(result.metadata.num_row_groups_after_stats_filter.has_value()); + EXPECT_EQ(result.metadata.num_row_groups_after_stats_filter.value(), + expected_stats_filtered_row_groups); + EXPECT_FALSE(result.metadata.num_row_groups_after_bloom_filter.has_value()); + }; + + // Unary operation `IS_NULL` should not filter any row groups and yield exactly `null_count` rows { auto constexpr expected_total_row_groups = 4; auto constexpr expected_stats_filtered_row_groups = 4; @@ -2851,8 +2940,11 @@ void filter_typed_test() auto const filter_expression = cudf::ast::operation(cudf::ast::ast_operator::IS_NULL, col_name_0); auto const ref_filter = cudf::ast::operation(cudf::ast::ast_operator::IS_NULL, col_ref_0); - test_predicate_pushdown( - filter_expression, ref_filter, expected_total_row_groups, expected_stats_filtered_row_groups); + test_predicate_pushdown(filter_expression, + ref_filter, + expected_total_row_groups, + expected_stats_filtered_row_groups, + null_count); } // Unary operation `IS_NULL` should not affect anything when ANDing with another expression, and @@ -2910,11 +3002,14 @@ void filter_typed_test() TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTyped) { filter_typed_test(); + filter_unary_operation_typed_test(); } TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTypedJIT) { filter_typed_test(); + // JIT does not support nullness-dependent operators such as IS_NULL so we can't call + // `filter_unary_operation_typed_test` } TEST_P(ParquetDecompressionTest, RoundTripBasic) From 204d746dcfae4710bb9b17e146c3df3266244853 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 29 Sep 2025 21:18:30 +0000 Subject: [PATCH 03/22] Propagate `_always_true` to parent expression when expr can't be evaluated --- cpp/src/io/parquet/bloom_filter_reader.cu | 16 ++++--- .../experimental/dictionary_page_filter.cu | 12 +++++- cpp/src/io/parquet/stats_filter_helpers.cpp | 12 +++++- cpp/tests/io/parquet_reader_test.cpp | 42 ++++++++++++++----- 4 files changed, 62 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu index 68fe88dae57..528cc026b0d 100644 --- a/cpp/src/io/parquet/bloom_filter_reader.cu +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -234,18 +234,24 @@ class bloom_filter_expression_converter : public equality_literals_collector { auto const& value = _bloom_filter_expr.push(ast::column_reference{col_literal_offset}); _bloom_filter_expr.push(ast::operation{ast_operator::IDENTITY, value}); } - // For all other expressions, push an always true expression + // For all other expressions, push and return the `_always_true` expression else { - _bloom_filter_expr.push( - ast::operation{ast_operator::NOT, - _bloom_filter_expr.push(ast::operation{ast_operator::NOT, _always_true})}); + _bloom_filter_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); + // This is important as we need to propagate the `_always_true` expression to its parent + return _always_true; } } else { auto new_operands = visit_operands(operands); if (cudf::ast::detail::ast_operator_arity(op) == 2) { _bloom_filter_expr.push(ast::operation{op, new_operands.front(), new_operands.back()}); } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { - _bloom_filter_expr.push(ast::operation{op, new_operands.front()}); + // If the new_operands is just a `_always_true` literal, propagate it here + if (auto* lit = dynamic_cast(&new_operands.front().get()); + lit == &_always_true) { + _bloom_filter_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); + } else { + _bloom_filter_expr.push(ast::operation{op, new_operands.front()}); + } } } return _bloom_filter_expr.back(); diff --git a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu index a03c1ae3032..b5f8c7a8cca 100644 --- a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu +++ b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu @@ -1440,16 +1440,24 @@ class dictionary_expression_converter : public equality_literals_collector { _dictionary_expr.push(ast::operation{ast_operator::IDENTITY, value}); } } - // For all other expressions, push the `always true` expression + // For all other expressions, push and return the `_always_true` expression else { _dictionary_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); + // This is important as we need to propagate the `_always_true` expression to its parent + return _always_true; } } else { auto new_operands = visit_operands(operands); if (cudf::ast::detail::ast_operator_arity(op) == 2) { _dictionary_expr.push(ast::operation{op, new_operands.front(), new_operands.back()}); } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { - _dictionary_expr.push(ast::operation{op, new_operands.front()}); + // If the new_operands is just a `_always_true` literal, propagate it here + if (auto* lit = dynamic_cast(&new_operands.front().get()); + lit == &_always_true) { + _dictionary_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); + } else { + _dictionary_expr.push(ast::operation{op, new_operands.front()}); + } } } return _dictionary_expr.back(); diff --git a/cpp/src/io/parquet/stats_filter_helpers.cpp b/cpp/src/io/parquet/stats_filter_helpers.cpp index 5cbbce234b5..72ab5f97ba7 100644 --- a/cpp/src/io/parquet/stats_filter_helpers.cpp +++ b/cpp/src/io/parquet/stats_filter_helpers.cpp @@ -132,9 +132,11 @@ std::reference_wrapper stats_expression_converter::visit( "Second operand of binary operation with column reference must be a literal"); v->accept(*this); - // For unary operators, push an always true expression + // For unary operators, push and return the `_always_true` expression if (cudf::ast::detail::ast_operator_arity(op) == 1) { _stats_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); + // This is important as we need to propagate the `_always_true` expression to its parent + return _always_true; } else { // Push literal into the ast::tree auto const& literal = @@ -187,7 +189,13 @@ std::reference_wrapper stats_expression_converter::visit( if (cudf::ast::detail::ast_operator_arity(op) == 2) { _stats_expr.push(ast::operation{op, new_operands.front(), new_operands.back()}); } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { - _stats_expr.push(ast::operation{op, new_operands.front()}); + // If the new_operands is just a `_always_true` literal, propagate it here + if (auto* lit = dynamic_cast(&new_operands.front().get()); + lit == &_always_true) { + _stats_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); + } else { + _stats_expr.push(ast::operation{op, new_operands.front()}); + } } } return _stats_expr.back(); diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index fe5ea8ca3e1..f5654cc6261 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -2851,17 +2851,18 @@ void filter_unary_operation_typed_test() auto [src, filepath, null_count] = [&]() { auto constexpr num_rows = num_ordered_rows; auto constexpr row_group_size_rows = num_rows / 4; - auto _col0 = testdata::ascending(); + auto _col0 = testdata::ascending().release(); // Add nulls to col0 std::bernoulli_distribution bn(0.7f); auto valids = cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); - auto col0 = cudf::purge_nonempty_nulls(*_col0[0]); - auto col1 = testdata::descending(); - auto col2 = testdata::unordered(); - auto const written_table = table_view{{col0, col1, col2}}; - auto const filepath = temp_env->get_temp_filepath("FilterUnaryOperationTyped.parquet"); + _col0->set_null_mask(std::move(null_mask), null_count); + auto col0 = cudf::purge_nonempty_nulls(_col0->view()); + auto col1 = testdata::descending(); + auto col2 = testdata::unordered(); + auto const written_table = table_view{{col0->view(), col1, col2}}; + auto const filepath = temp_env->get_temp_filepath("FilterUnaryOperationTyped.parquet"); { cudf::io::table_input_metadata expected_metadata(written_table); expected_metadata.column_metadata[0].set_name("col0"); @@ -2877,16 +2878,14 @@ void filter_unary_operation_typed_test() } std::vector> columns; - columns.push_back(col0.release()); + columns.push_back(std::move(col0)); columns.push_back(col1.release()); columns.push_back(col2.release()); - return std::pair{cudf::table{std::move(columns)}, filepath, null_count}; + return std::tuple{cudf::table{std::move(columns)}, filepath, null_count}; }(); auto const written_table = src.view(); - auto const col_name_0 = cudf::ast::column_name_reference("col0"); - auto const col_ref_0 = cudf::ast::column_reference(0); auto const test_predicate_pushdown = [&](cudf::ast::operation const& filter_expression, cudf::ast::operation const& ref_filter, cudf::size_type expected_total_row_groups, @@ -2932,6 +2931,9 @@ void filter_unary_operation_typed_test() EXPECT_FALSE(result.metadata.num_row_groups_after_bloom_filter.has_value()); }; + auto const col_name_0 = cudf::ast::column_name_reference("col0"); + auto const col_ref_0 = cudf::ast::column_reference(0); + // Unary operation `IS_NULL` should not filter any row groups and yield exactly `null_count` rows { auto constexpr expected_total_row_groups = 4; @@ -2947,6 +2949,24 @@ void filter_unary_operation_typed_test() null_count); } + // Unary operation `NOT(IS_NULL)` should not filter any row groups and yield exactly `num_rows - + // null_count` rows + { + auto constexpr expected_total_row_groups = 4; + auto constexpr expected_stats_filtered_row_groups = 4; + + auto const is_null_expr = cudf::ast::operation(cudf::ast::ast_operator::IS_NULL, col_name_0); + auto const filter_expression = cudf::ast::operation(cudf::ast::ast_operator::NOT, is_null_expr); + auto const is_null_ref_expr = cudf::ast::operation(cudf::ast::ast_operator::IS_NULL, col_ref_0); + auto const ref_filter = cudf::ast::operation(cudf::ast::ast_operator::NOT, is_null_ref_expr); + + test_predicate_pushdown(filter_expression, + ref_filter, + expected_total_row_groups, + expected_stats_filtered_row_groups, + num_ordered_rows - null_count); + } + // Unary operation `IS_NULL` should not affect anything when ANDing with another expression, and // should short circuit when ORing with another expression { @@ -3002,7 +3022,7 @@ void filter_unary_operation_typed_test() TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTyped) { filter_typed_test(); - filter_unary_operation_typed_test(); + filter_unary_operation_typed_test(); } TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTypedJIT) From 06389224b6cb6787603a8b78f0802f33c5e07d80 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 29 Sep 2025 22:47:18 +0000 Subject: [PATCH 04/22] Small update --- cpp/src/io/parquet/bloom_filter_reader.cu | 3 ++- cpp/src/io/parquet/experimental/dictionary_page_filter.cu | 2 +- cpp/src/io/parquet/stats_filter_helpers.cpp | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu index 528cc026b0d..566ef90e2ba 100644 --- a/cpp/src/io/parquet/bloom_filter_reader.cu +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -248,7 +248,8 @@ class bloom_filter_expression_converter : public equality_literals_collector { // If the new_operands is just a `_always_true` literal, propagate it here if (auto* lit = dynamic_cast(&new_operands.front().get()); lit == &_always_true) { - _bloom_filter_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); + _bloom_filter_expr.push( + ast::operation{ast_operator::IDENTITY, _bloom_filter_expr.back()}); } else { _bloom_filter_expr.push(ast::operation{op, new_operands.front()}); } diff --git a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu index b5f8c7a8cca..533d0f747bd 100644 --- a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu +++ b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu @@ -1454,7 +1454,7 @@ class dictionary_expression_converter : public equality_literals_collector { // If the new_operands is just a `_always_true` literal, propagate it here if (auto* lit = dynamic_cast(&new_operands.front().get()); lit == &_always_true) { - _dictionary_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); + _dictionary_expr.push(ast::operation{ast_operator::IDENTITY, _dictionary_expr.back()}); } else { _dictionary_expr.push(ast::operation{op, new_operands.front()}); } diff --git a/cpp/src/io/parquet/stats_filter_helpers.cpp b/cpp/src/io/parquet/stats_filter_helpers.cpp index 72ab5f97ba7..ad264ff7f26 100644 --- a/cpp/src/io/parquet/stats_filter_helpers.cpp +++ b/cpp/src/io/parquet/stats_filter_helpers.cpp @@ -192,7 +192,7 @@ std::reference_wrapper stats_expression_converter::visit( // If the new_operands is just a `_always_true` literal, propagate it here if (auto* lit = dynamic_cast(&new_operands.front().get()); lit == &_always_true) { - _stats_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); + _stats_expr.push(ast::operation{ast_operator::IDENTITY, _stats_expr.back()}); } else { _stats_expr.push(ast::operation{op, new_operands.front()}); } From 0733944a7e1668fd128c76a21ccccbcca0b9f08b Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 29 Sep 2025 23:01:18 +0000 Subject: [PATCH 05/22] Minor updates --- cpp/src/io/parquet/bloom_filter_reader.cu | 11 ++- .../experimental/dictionary_page_filter.cu | 11 ++- cpp/src/io/parquet/stats_filter_helpers.cpp | 94 +++++++++---------- 3 files changed, 63 insertions(+), 53 deletions(-) diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu index 566ef90e2ba..f508c231439 100644 --- a/cpp/src/io/parquet/bloom_filter_reader.cu +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -219,6 +219,12 @@ class bloom_filter_expression_converter : public equality_literals_collector { "Second operand of binary operation with column reference must be a literal"); v->accept(*this); + // Propagate the `_always_true` as expression to its unary operator parent + if (cudf::ast::detail::ast_operator_arity(op) == 1) { + _bloom_filter_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); + return _always_true; + } + if (op == ast_operator::EQUAL) { // Search the literal in this input column's equality literals list and add to the offset. auto const col_idx = v->get_column_index(); @@ -234,11 +240,10 @@ class bloom_filter_expression_converter : public equality_literals_collector { auto const& value = _bloom_filter_expr.push(ast::column_reference{col_literal_offset}); _bloom_filter_expr.push(ast::operation{ast_operator::IDENTITY, value}); } - // For all other expressions, push and return the `_always_true` expression + // For all other expressions, push the `_always_true` expression else { _bloom_filter_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); - // This is important as we need to propagate the `_always_true` expression to its parent - return _always_true; + return _bloom_filter_expr.back(); } } else { auto new_operands = visit_operands(operands); diff --git a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu index 533d0f747bd..b51acb29150 100644 --- a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu +++ b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu @@ -1414,6 +1414,12 @@ class dictionary_expression_converter : public equality_literals_collector { "Second operand of binary operation with column reference must be a literal"); v->accept(*this); + // Propagate the `_always_true` as expression to its unary operator parent + if (cudf::ast::detail::ast_operator_arity(op) == 1) { + _dictionary_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); + return _always_true; + } + if (op == ast_operator::EQUAL or op == ast::ast_operator::NOT_EQUAL) { // Search the literal in this input column's equality literals list and add to // the offset. @@ -1440,11 +1446,10 @@ class dictionary_expression_converter : public equality_literals_collector { _dictionary_expr.push(ast::operation{ast_operator::IDENTITY, value}); } } - // For all other expressions, push and return the `_always_true` expression + // For all other expressions, push the `_always_true` expression else { _dictionary_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); - // This is important as we need to propagate the `_always_true` expression to its parent - return _always_true; + return _dictionary_expr.back(); } } else { auto new_operands = visit_operands(operands); diff --git a/cpp/src/io/parquet/stats_filter_helpers.cpp b/cpp/src/io/parquet/stats_filter_helpers.cpp index ad264ff7f26..5cc6549861a 100644 --- a/cpp/src/io/parquet/stats_filter_helpers.cpp +++ b/cpp/src/io/parquet/stats_filter_helpers.cpp @@ -135,55 +135,55 @@ std::reference_wrapper stats_expression_converter::visit( // For unary operators, push and return the `_always_true` expression if (cudf::ast::detail::ast_operator_arity(op) == 1) { _stats_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); - // This is important as we need to propagate the `_always_true` expression to its parent + // Propagate the `_always_true` as expression to its unary operator parent return _always_true; - } else { - // Push literal into the ast::tree - auto const& literal = - _stats_expr.push(*dynamic_cast(&operands[1].get())); - auto const col_index = v->get_column_index(); - switch (op) { - /* transform to stats conditions. op(col, literal) - col1 == val --> vmin <= val && vmax >= val - col1 != val --> !(vmin == val && vmax == val) - col1 > val --> vmax > val - col1 < val --> vmin < val - col1 >= val --> vmax >= val - col1 <= val --> vmin <= val - */ - case ast_operator::EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); - _stats_expr.push(ast::operation{ - ast::ast_operator::LOGICAL_AND, - _stats_expr.push(ast::operation{ast_operator::GREATER_EQUAL, vmax, literal}), - _stats_expr.push(ast::operation{ast_operator::LESS_EQUAL, vmin, literal})}); - break; - } - case ast_operator::NOT_EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); - _stats_expr.push(ast::operation{ - ast_operator::LOGICAL_OR, - _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmin, vmax}), - _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmax, literal})}); - break; - } - case ast_operator::LESS: [[fallthrough]]; - case ast_operator::LESS_EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - _stats_expr.push(ast::operation{op, vmin, literal}); - break; - } - case ast_operator::GREATER: [[fallthrough]]; - case ast_operator::GREATER_EQUAL: { - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); - _stats_expr.push(ast::operation{op, vmax, literal}); - break; - } - default: CUDF_FAIL("Unsupported binary operation in Statistics AST"); - }; } + + // Push literal into the ast::tree + auto const& literal = _stats_expr.push(*dynamic_cast(&operands[1].get())); + auto const col_index = v->get_column_index(); + switch (op) { + /* transform to stats conditions. op(col, literal) + col1 == val --> vmin <= val && vmax >= val + col1 != val --> !(vmin == val && vmax == val) + col1 > val --> vmax > val + col1 < val --> vmin < val + col1 >= val --> vmax >= val + col1 <= val --> vmin <= val + */ + case ast_operator::EQUAL: { + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{ + ast::ast_operator::LOGICAL_AND, + _stats_expr.push(ast::operation{ast_operator::GREATER_EQUAL, vmax, literal}), + _stats_expr.push(ast::operation{ast_operator::LESS_EQUAL, vmin, literal})}); + break; + } + case ast_operator::NOT_EQUAL: { + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push( + ast::operation{ast_operator::LOGICAL_OR, + _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmin, vmax}), + _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmax, literal})}); + break; + } + case ast_operator::LESS: [[fallthrough]]; + case ast_operator::LESS_EQUAL: { + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + _stats_expr.push(ast::operation{op, vmin, literal}); + break; + } + case ast_operator::GREATER: [[fallthrough]]; + case ast_operator::GREATER_EQUAL: { + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{op, vmax, literal}); + break; + } + default: CUDF_FAIL("Unsupported binary operation in Statistics AST"); + }; + } else { auto new_operands = visit_operands(operands); if (cudf::ast::detail::ast_operator_arity(op) == 2) { From c18d5dcda421ae93c8082dab62eb1921e52a5f74 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 29 Sep 2025 23:03:47 +0000 Subject: [PATCH 06/22] Comments update --- cpp/src/io/parquet/bloom_filter_reader.cu | 2 +- .../io/parquet/experimental/dictionary_page_filter.cu | 10 ++++++---- cpp/src/io/parquet/stats_filter_helpers.cpp | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu index f508c231439..057590128fd 100644 --- a/cpp/src/io/parquet/bloom_filter_reader.cu +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -210,7 +210,7 @@ class bloom_filter_expression_converter : public equality_literals_collector { auto const op = expr.get_operator(); if (auto* v = dynamic_cast(&operands[0].get())) { - // First operand should be column reference, second (if binary operation)should be literal. + // First operand should be column reference, second (if binary operation) should be literal. CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1 or cudf::ast::detail::ast_operator_arity(op) == 2, "Only unary and binary operations are supported on column reference"); diff --git a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu index b51acb29150..7816956c7de 100644 --- a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu +++ b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu @@ -1407,10 +1407,12 @@ class dictionary_expression_converter : public equality_literals_collector { auto const op = expr.get_operator(); if (auto* v = dynamic_cast(&operands[0].get())) { - // First operand should be column reference, second should be literal. - CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2, - "Only binary operations are supported on column reference"); - CUDF_EXPECTS(dynamic_cast(&operands[1].get()) != nullptr, + // First operand should be column reference, second (if binary operation) should be literal. + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1 or + cudf::ast::detail::ast_operator_arity(op) == 2, + "Only unary and binary operations are supported on column reference"); + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1 or + dynamic_cast(&operands[1].get()) != nullptr, "Second operand of binary operation with column reference must be a literal"); v->accept(*this); diff --git a/cpp/src/io/parquet/stats_filter_helpers.cpp b/cpp/src/io/parquet/stats_filter_helpers.cpp index 5cc6549861a..aae762515ef 100644 --- a/cpp/src/io/parquet/stats_filter_helpers.cpp +++ b/cpp/src/io/parquet/stats_filter_helpers.cpp @@ -123,7 +123,7 @@ std::reference_wrapper stats_expression_converter::visit( auto const op = expr.get_operator(); if (auto* v = dynamic_cast(&operands[0].get())) { - // First operand should be column reference, second (if binary operation)should be literal. + // First operand should be column reference, second (if binary operation) should be literal. CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1 or cudf::ast::detail::ast_operator_arity(op) == 2, "Only unary and binary operations are supported on column reference"); From 88be6f4495694cd77ce07a6363e11941ea331bb2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 29 Sep 2025 23:10:11 +0000 Subject: [PATCH 07/22] Similar changes in dictionary and bloom filter `collectors` --- cpp/src/io/parquet/bloom_filter_reader.cu | 6 ++++-- .../experimental/dictionary_page_filter.cu | 18 +++++++++++------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu index 057590128fd..586edf11781 100644 --- a/cpp/src/io/parquet/bloom_filter_reader.cu +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -243,7 +243,6 @@ class bloom_filter_expression_converter : public equality_literals_collector { // For all other expressions, push the `_always_true` expression else { _bloom_filter_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); - return _bloom_filter_expr.back(); } } else { auto new_operands = visit_operands(operands); @@ -636,7 +635,7 @@ std::reference_wrapper equality_literals_collector::visit auto const op = expr.get_operator(); if (auto* v = dynamic_cast(&operands[0].get())) { - // First operand should be column reference, second (if binary operation)should be literal. + // First operand should be column reference, second (if binary operation) should be literal. CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1 or cudf::ast::detail::ast_operator_arity(op) == 2, "Only unary and binary operations are supported on column reference"); @@ -646,6 +645,9 @@ std::reference_wrapper equality_literals_collector::visit v->accept(*this); + // Return early if this is a unary operation + if (cudf::ast::detail::ast_operator_arity(op) == 1) { return expr; } + // Push to the corresponding column's literals list iff equality predicate is seen if (op == ast_operator::EQUAL) { auto const literal_ptr = dynamic_cast(&operands[1].get()); diff --git a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu index 7816956c7de..debfc3c0713 100644 --- a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu +++ b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu @@ -1451,7 +1451,6 @@ class dictionary_expression_converter : public equality_literals_collector { // For all other expressions, push the `_always_true` expression else { _dictionary_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); - return _dictionary_expr.back(); } } else { auto new_operands = visit_operands(operands); @@ -1594,18 +1593,23 @@ std::reference_wrapper dictionary_literals_collector::vis auto const op = expr.get_operator(); if (auto* v = dynamic_cast(&operands[0].get())) { - // First operand should be column reference, second should be literal. - CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2, - "Only binary operations are supported on column reference"); - auto const literal_ptr = dynamic_cast(&operands[1].get()); - CUDF_EXPECTS(literal_ptr != nullptr, + // First operand should be column reference, second (if binary operation) should be literal. + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1 or + cudf::ast::detail::ast_operator_arity(op) == 2, + "Only unary and binary operations are supported on column reference"); + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1 or + dynamic_cast(&operands[1].get()) != nullptr, "Second operand of binary operation with column reference must be a literal"); v->accept(*this); + // Return early if this is a unary operation + if (cudf::ast::detail::ast_operator_arity(op) == 1) { return expr; } + // Push to the corresponding column's literals and operators list iff EQUAL or NOT_EQUAL // operator is seen if (op == ast_operator::EQUAL or op == ast::ast_operator::NOT_EQUAL) { - auto const col_idx = v->get_column_index(); + auto const literal_ptr = dynamic_cast(&operands[1].get()); + auto const col_idx = v->get_column_index(); _literals[col_idx].emplace_back(const_cast(literal_ptr)); _operators[col_idx].emplace_back(op); } From bd2b32c43982aaaab62c78bfd041ec9c7227d5b4 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 30 Sep 2025 02:16:29 +0000 Subject: [PATCH 08/22] Evaluate `IS_NULL` at rg and page levels in parquet filtering --- .../parquet/experimental/page_index_filter.cu | 85 ++++++++++++++----- cpp/src/io/parquet/predicate_pushdown.cpp | 29 ++++++- cpp/src/io/parquet/stats_filter_helpers.cpp | 47 ++++++---- cpp/src/io/parquet/stats_filter_helpers.hpp | 3 +- cpp/tests/io/parquet_reader_test.cpp | 28 +++--- 5 files changed, 132 insertions(+), 60 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 9f8f31bfaff..5af8a4d29ee 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -443,7 +443,7 @@ struct page_stats_caster : public stats_caster_base { * @return A pair of device columns with min and max value from page statistics for each row */ template - std::pair, std::unique_ptr> operator()( + std::tuple, std::unique_ptr, std::unique_ptr> operator()( cudf::size_type schema_idx, cudf::data_type dtype, rmm::cuda_stream_view stream, @@ -466,6 +466,7 @@ struct page_stats_caster : public stats_caster_base { // Create host columns with page-level min, max values host_column min(total_pages, stream); host_column max(total_pages, stream); + host_column is_null(total_pages, stream); // Populate the host columns with page-level min, max statistics from the page index auto page_offset_idx = 0; @@ -491,18 +492,34 @@ struct page_stats_caster : public stats_caster_base { auto const page_offset_in_colchunk = col_chunk_page_offsets[page_offset_idx++]; // For all pages in this column chunk - std::for_each( - thrust::counting_iterator(0), - thrust::counting_iterator(num_pages_in_colchunk), - [&](auto page_idx) { - auto const& min_value = column_index.min_values[page_idx]; - auto const& max_value = column_index.max_values[page_idx]; - // Translate binary data to Type then to - min.set_index( - page_offset_in_colchunk + page_idx, min_value, colchunk.meta_data.type); - max.set_index( - page_offset_in_colchunk + page_idx, max_value, colchunk.meta_data.type); - }); + std::for_each(thrust::counting_iterator(0), + thrust::counting_iterator(num_pages_in_colchunk), + [&](auto page_idx) { + auto const& min_value = column_index.min_values[page_idx]; + auto const& max_value = column_index.max_values[page_idx]; + auto const column_page_idx = page_offset_in_colchunk + page_idx; + // Translate binary data to Type then to + min.set_index(column_page_idx, min_value, colchunk.meta_data.type); + max.set_index(column_page_idx, max_value, colchunk.meta_data.type); + if (column_index.null_pages[page_idx]) { + is_null.val[column_page_idx] = true; + return; + } + if (not column_index.null_counts.has_value()) { + is_null.set_index(column_page_idx, std::nullopt, {}); + return; + } + auto const page_row_count = page_row_offsets[column_page_idx + 1] - + page_row_offsets[column_page_idx]; + auto const& null_count = column_index.null_counts.value()[page_idx]; + if (null_count == page_row_count) { + is_null.val[column_page_idx] = false; + } else if (null_count > 0 and null_count < page_row_count) { + is_null.set_index(column_page_idx, std::nullopt, {}); + } else { + CUDF_FAIL("Invalid null count"); + } + }); }); }); @@ -514,37 +531,49 @@ struct page_stats_caster : public stats_caster_base { // row-level. if constexpr (not cuda::std::is_same_v) { // Move host columns to device - auto mincol = min.to_device(dtype, stream, mr); - auto maxcol = max.to_device(dtype, stream, mr); + auto mincol = min.to_device(dtype, stream, mr); + auto maxcol = max.to_device(dtype, stream, mr); + auto is_nullcol = is_null.to_device(cudf::data_type{cudf::type_id::BOOL8}, stream, mr); // Convert page-level min and max columns to row-level min and max columns by gathering // values based on page-level row offsets - auto [min_data, min_bitmask] = build_data_and_nullmask(mincol->mutable_view(), + auto [min_data, min_bitmask] = build_data_and_nullmask(mincol->mutable_view(), min.null_mask.data(), page_indices, page_row_offsets, dtype, stream, mr); - auto [max_data, max_bitmask] = build_data_and_nullmask(maxcol->mutable_view(), + auto [max_data, max_bitmask] = build_data_and_nullmask(maxcol->mutable_view(), max.null_mask.data(), page_indices, page_row_offsets, dtype, stream, mr); + auto [null_data, null_bitmask] = build_data_and_nullmask(is_nullcol->mutable_view(), + is_null.null_mask.data(), + page_indices, + page_row_offsets, + dtype, + stream, + mr); // Count nulls in min and max columns auto const min_nulls = cudf::detail::null_count( reinterpret_cast(min_bitmask.data()), 0, total_rows, stream); auto const max_nulls = cudf::detail::null_count( reinterpret_cast(max_bitmask.data()), 0, total_rows, stream); + auto const null_nulls = cudf::detail::null_count( + reinterpret_cast(null_bitmask.data()), 0, total_rows, stream); - // Return min and max device columns + // Return min, max and is_null device columns return {std::make_unique( dtype, total_rows, std::move(min_data), std::move(min_bitmask), min_nulls), std::make_unique( - dtype, total_rows, std::move(max_data), std::move(max_bitmask), max_nulls)}; + dtype, total_rows, std::move(max_data), std::move(max_bitmask), max_nulls), + std::make_unique( + dtype, total_rows, std::move(null_data), std::move(null_bitmask), null_nulls)}; } // For strings columns, gather the page-level string offsets and bitmask to row-level // directly and gather string chars using a batched memcpy. @@ -553,14 +582,19 @@ struct page_stats_caster : public stats_caster_base { min.val, min.chars, min.null_mask.data(), page_indices, page_row_offsets, stream, mr); auto [max_data, max_offsets, max_nullmask] = build_string_data_and_nullmask( max.val, max.chars, max.null_mask.data(), page_indices, page_row_offsets, stream, mr); + auto is_nullcol = is_null.to_device(cudf::data_type{cudf::type_id::BOOL8}, stream, mr); + auto [null_data, null_bitmask] = build_data_and_nullmask( + is_nullcol->mutable_view(), {}, page_indices, page_row_offsets, dtype, stream, mr); // Count nulls in min and max columns auto const min_nulls = cudf::detail::null_count( reinterpret_cast(min_nullmask.data()), 0, total_rows, stream); auto const max_nulls = cudf::detail::null_count( reinterpret_cast(max_nullmask.data()), 0, total_rows, stream); + auto const null_nulls = cudf::detail::null_count( + reinterpret_cast(null_bitmask.data()), 0, total_rows, stream); - // Return min and max device strings columns + // Return min, max and is_null device strings columns return { cudf::make_strings_column( total_rows, @@ -573,7 +607,9 @@ struct page_stats_caster : public stats_caster_base { std::make_unique(std::move(max_offsets), rmm::device_buffer{0, stream, mr}, 0), std::move(max_data), max_nulls, - std::move(max_nullmask))}; + std::move(max_nullmask)), + std::make_unique( + dtype, total_rows, std::move(null_data), std::move(null_bitmask), null_nulls)}; } } } @@ -622,7 +658,7 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag auto const num_columns = output_dtypes.size(); // Get a boolean mask indicating which columns will participate in stats based filtering - auto const stats_columns_mask = + auto const [stats_columns_mask, has_is_null_operator] = parquet::detail::stats_columns_collector{filter.get(), static_cast(output_dtypes.size())} .get_stats_columns_mask(); @@ -659,12 +695,15 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag data_type{cudf::type_id::BOOL8}, total_rows, rmm::device_buffer{}, 0, stream, mr)); columns.push_back(cudf::make_numeric_column( data_type{cudf::type_id::BOOL8}, total_rows, rmm::device_buffer{}, 0, stream, mr)); + columns.push_back(cudf::make_numeric_column( + data_type{cudf::type_id::BOOL8}, total_rows, rmm::device_buffer{}, 0, stream, mr)); return; } - auto [min_col, max_col] = cudf::type_dispatcher( + auto [min_col, max_col, is_null_col] = cudf::type_dispatcher( dtype, stats_col, schema_idx, dtype, stream, mr); columns.push_back(std::move(min_col)); columns.push_back(std::move(max_col)); + columns.push_back(std::move(is_null_col)); }); auto stats_table = cudf::table(std::move(columns)); diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 2695a88c346..5ccd1f29d3d 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -64,7 +64,7 @@ struct row_group_stats_caster : public stats_caster_base { // Creates device columns from column statistics (min, max) template - std::pair, std::unique_ptr> operator()( + std::tuple, std::unique_ptr, std::unique_ptr> operator()( int schema_idx, cudf::data_type dtype, rmm::cuda_stream_view stream, @@ -76,6 +76,8 @@ struct row_group_stats_caster : public stats_caster_base { } else { host_column min(total_row_groups, stream); host_column max(total_row_groups, stream); + host_column is_null(total_row_groups, stream); + size_type stats_idx = 0; for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) { for (auto const rg_idx : row_group_indices[src_idx]) { @@ -96,15 +98,31 @@ struct row_group_stats_caster : public stats_caster_base { // translate binary data to Type then to min.set_index(stats_idx, min_value, colchunk.meta_data.type); max.set_index(stats_idx, max_value, colchunk.meta_data.type); + // Check the nullability of this column chunk + if (colchunk.meta_data.statistics.null_count.has_value()) { + auto const& null_count = colchunk.meta_data.statistics.null_count.value(); + if (null_count == 0) { + is_null.val[stats_idx] = false; + } else if (null_count < colchunk.meta_data.num_values) { + is_null.set_index(stats_idx, std::nullopt, {}); + } else if (null_count == colchunk.meta_data.num_values) { + is_null.val[stats_idx] = true; + } else { + CUDF_FAIL("Invalid null count"); + } + } } else { // Marking it null, if column present in row group min.set_index(stats_idx, std::nullopt, {}); max.set_index(stats_idx, std::nullopt, {}); + is_null.set_index(stats_idx, std::nullopt, {}); } stats_idx++; } }; - return {min.to_device(dtype, stream, mr), max.to_device(dtype, stream, mr)}; + return {min.to_device(dtype, stream, mr), + max.to_device(dtype, stream, mr), + is_null.to_device(data_type{cudf::type_id::BOOL8}, stream, mr)}; } } }; @@ -122,7 +140,7 @@ std::optional>> aggregate_reader_metadata::ap auto mr = cudf::get_current_device_resource_ref(); // Get a boolean mask indicating which columns can participate in stats based filtering - auto const stats_columns_mask = + auto const [stats_columns_mask, has_is_null_operator] = stats_columns_collector{filter.get(), static_cast(output_dtypes.size())} .get_stats_columns_mask(); @@ -146,12 +164,15 @@ std::optional>> aggregate_reader_metadata::ap data_type{cudf::type_id::BOOL8}, total_row_groups, rmm::device_buffer{}, 0, stream, mr)); columns.push_back(cudf::make_numeric_column( data_type{cudf::type_id::BOOL8}, total_row_groups, rmm::device_buffer{}, 0, stream, mr)); + columns.push_back(cudf::make_numeric_column( + data_type{cudf::type_id::BOOL8}, total_row_groups, rmm::device_buffer{}, 0, stream, mr)); continue; } - auto [min_col, max_col] = + auto [min_col, max_col, is_null_col] = cudf::type_dispatcher(dtype, stats_col, schema_idx, dtype, stream, mr); columns.push_back(std::move(min_col)); columns.push_back(std::move(max_col)); + columns.push_back(std::move(is_null_col)); } auto stats_table = cudf::table(std::move(columns)); diff --git a/cpp/src/io/parquet/stats_filter_helpers.cpp b/cpp/src/io/parquet/stats_filter_helpers.cpp index aae762515ef..78ee1c39a53 100644 --- a/cpp/src/io/parquet/stats_filter_helpers.cpp +++ b/cpp/src/io/parquet/stats_filter_helpers.cpp @@ -73,15 +73,18 @@ std::reference_wrapper stats_columns_collector::visit( v->accept(*this); // Return early if this is a unary operation - if (cudf::ast::detail::ast_operator_arity(op) == 1) { return expr; } + if (cudf::ast::detail::ast_operator_arity(op) == 1 and op != ast_operator::IS_NULL) { + return expr; + } // Else if this is a supported binary operation, mark the column as needed if (op == ast_operator::EQUAL or op == ast_operator::NOT_EQUAL or op == ast_operator::LESS or op == ast_operator::LESS_EQUAL or op == ast_operator::GREATER or - op == ast_operator::GREATER_EQUAL) { + op == ast_operator::GREATER_EQUAL or op == ast_operator::IS_NULL) { _columns_mask[v->get_column_index()] = true; + if (op == ast_operator::IS_NULL) { _has_is_null_operator = true; } } else { - CUDF_FAIL("Unsupported binary operation in Statistics AST"); + CUDF_FAIL("Unsupported operation in Statistics AST"); } } else { // Visit the operands and ignore any output as we only want to build the column mask @@ -91,9 +94,9 @@ std::reference_wrapper stats_columns_collector::visit( return expr; } -thrust::host_vector stats_columns_collector::get_stats_columns_mask() && +std::pair, bool> stats_columns_collector::get_stats_columns_mask() && { - return std::move(_columns_mask); + return {std::move(_columns_mask), _has_is_null_operator}; } std::vector> stats_columns_collector::visit_operands( @@ -132,16 +135,24 @@ std::reference_wrapper stats_expression_converter::visit( "Second operand of binary operation with column reference must be a literal"); v->accept(*this); - // For unary operators, push and return the `_always_true` expression + auto const col_index = v->get_column_index(); + if (cudf::ast::detail::ast_operator_arity(op) == 1) { - _stats_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); - // Propagate the `_always_true` as expression to its unary operator parent - return _always_true; + // Evaluate IS_NULL unary operator + if (op == ast_operator::IS_NULL) { + auto const& vnull = _stats_expr.push(ast::column_reference{col_index * 3 + 2}); + _stats_expr.push(ast::operation{ast_operator::IDENTITY, vnull}); + return _stats_expr.back(); + } // For all other unary operators, push and return the `_always_true` expression + else { + _stats_expr.push(ast::operation{ast_operator::IDENTITY, _always_true}); + // Propagate the `_always_true` as expression to its unary operator parent + return _always_true; + } } // Push literal into the ast::tree - auto const& literal = _stats_expr.push(*dynamic_cast(&operands[1].get())); - auto const col_index = v->get_column_index(); + auto const& literal = _stats_expr.push(*dynamic_cast(&operands[1].get())); switch (op) { /* transform to stats conditions. op(col, literal) col1 == val --> vmin <= val && vmax >= val @@ -152,8 +163,8 @@ std::reference_wrapper stats_expression_converter::visit( col1 <= val --> vmin <= val */ case ast_operator::EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 3}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 3 + 1}); _stats_expr.push(ast::operation{ ast::ast_operator::LOGICAL_AND, _stats_expr.push(ast::operation{ast_operator::GREATER_EQUAL, vmax, literal}), @@ -161,8 +172,8 @@ std::reference_wrapper stats_expression_converter::visit( break; } case ast_operator::NOT_EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 3}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 3 + 1}); _stats_expr.push( ast::operation{ast_operator::LOGICAL_OR, _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmin, vmax}), @@ -171,17 +182,17 @@ std::reference_wrapper stats_expression_converter::visit( } case ast_operator::LESS: [[fallthrough]]; case ast_operator::LESS_EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 3}); _stats_expr.push(ast::operation{op, vmin, literal}); break; } case ast_operator::GREATER: [[fallthrough]]; case ast_operator::GREATER_EQUAL: { - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 3 + 1}); _stats_expr.push(ast::operation{op, vmax, literal}); break; } - default: CUDF_FAIL("Unsupported binary operation in Statistics AST"); + default: CUDF_FAIL("Unsupported operation in Statistics AST"); }; } else { diff --git a/cpp/src/io/parquet/stats_filter_helpers.hpp b/cpp/src/io/parquet/stats_filter_helpers.hpp index efc35535320..4c3dae0e195 100644 --- a/cpp/src/io/parquet/stats_filter_helpers.hpp +++ b/cpp/src/io/parquet/stats_filter_helpers.hpp @@ -266,7 +266,7 @@ class stats_columns_collector : public ast::detail::expression_transformer { * * @return Boolean vector indicating input columns that can participate in stats based filtering */ - thrust::host_vector get_stats_columns_mask() &&; + std::pair, bool> get_stats_columns_mask() &&; protected: std::vector> visit_operands( @@ -276,6 +276,7 @@ class stats_columns_collector : public ast::detail::expression_transformer { private: thrust::host_vector _columns_mask; + bool _has_is_null_operator = false; }; /** diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index f5654cc6261..3fdd0bd0833 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -2853,9 +2853,10 @@ void filter_unary_operation_typed_test() auto constexpr row_group_size_rows = num_rows / 4; auto _col0 = testdata::ascending().release(); // Add nulls to col0 - std::bernoulli_distribution bn(0.7f); - auto valids = - cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); + [[maybe_unused]] std::bernoulli_distribution bn(0.7f); + auto valids = cudf::detail::make_counting_transform_iterator(0, [&](int index) { + return (index >= 2 * row_group_size_rows and index < 3 * row_group_size_rows) ? false : true; + }); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); _col0->set_null_mask(std::move(null_mask), null_count); auto col0 = cudf::purge_nonempty_nulls(_col0->view()); @@ -2934,10 +2935,11 @@ void filter_unary_operation_typed_test() auto const col_name_0 = cudf::ast::column_name_reference("col0"); auto const col_ref_0 = cudf::ast::column_reference(0); - // Unary operation `IS_NULL` should not filter any row groups and yield exactly `null_count` rows + // Unary operation `IS_NULL` should filter all but one row group and yield exactly `null_count` + // rows { auto constexpr expected_total_row_groups = 4; - auto constexpr expected_stats_filtered_row_groups = 4; + auto constexpr expected_stats_filtered_row_groups = 1; auto const filter_expression = cudf::ast::operation(cudf::ast::ast_operator::IS_NULL, col_name_0); @@ -2949,11 +2951,11 @@ void filter_unary_operation_typed_test() null_count); } - // Unary operation `NOT(IS_NULL)` should not filter any row groups and yield exactly `num_rows - - // null_count` rows + // Unary operation `NOT(IS_NULL)` should filter all but one row group and yield exactly `num_rows + // - null_count` rows { auto constexpr expected_total_row_groups = 4; - auto constexpr expected_stats_filtered_row_groups = 4; + auto constexpr expected_stats_filtered_row_groups = 3; auto const is_null_expr = cudf::ast::operation(cudf::ast::ast_operator::IS_NULL, col_name_0); auto const filter_expression = cudf::ast::operation(cudf::ast::ast_operator::NOT, is_null_expr); @@ -2967,8 +2969,6 @@ void filter_unary_operation_typed_test() num_ordered_rows - null_count); } - // Unary operation `IS_NULL` should not affect anything when ANDing with another expression, and - // should short circuit when ORing with another expression { auto constexpr expected_total_row_groups = 4; @@ -2997,21 +2997,21 @@ void filter_unary_operation_typed_test() auto const ref_expr1 = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal); auto const ref_expr2 = cudf::ast::operation(cudf::ast::ast_operator::IS_NULL, col_ref_0); - // Filter expression AND unary operation + // col0 < 100 AND IS_NULL(col0) auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr1, expr2); auto ref_filter = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, ref_expr1, ref_expr2); - auto constexpr expected_filtered_row_groups_with_unary_and = 2; + auto constexpr expected_filtered_row_groups_with_unary_and = 1; test_predicate_pushdown(filter_expression, ref_filter, expected_total_row_groups, expected_filtered_row_groups_with_unary_and); - // Filter expression OR unary operation + // col0 < 100 OR IS_NULL(col0) filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_OR, expr1, expr2); ref_filter = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_OR, ref_expr1, ref_expr2); - auto constexpr expected_filtered_row_groups_with_unary_or = 4; + auto constexpr expected_filtered_row_groups_with_unary_or = 3; test_predicate_pushdown(filter_expression, ref_filter, expected_total_row_groups, From b41d806458bcf53a0631518292b0fd0b9fc09383 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 30 Sep 2025 22:38:17 +0000 Subject: [PATCH 09/22] Propagate `_always_true` up to the expression tree --- cpp/src/io/parquet/bloom_filter_reader.cu | 1 + cpp/src/io/parquet/experimental/dictionary_page_filter.cu | 1 + cpp/src/io/parquet/stats_filter_helpers.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu index 586edf11781..26e38bc9d6c 100644 --- a/cpp/src/io/parquet/bloom_filter_reader.cu +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -254,6 +254,7 @@ class bloom_filter_expression_converter : public equality_literals_collector { lit == &_always_true) { _bloom_filter_expr.push( ast::operation{ast_operator::IDENTITY, _bloom_filter_expr.back()}); + return _always_true; } else { _bloom_filter_expr.push(ast::operation{op, new_operands.front()}); } diff --git a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu index debfc3c0713..2a5254af7b6 100644 --- a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu +++ b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu @@ -1461,6 +1461,7 @@ class dictionary_expression_converter : public equality_literals_collector { if (auto* lit = dynamic_cast(&new_operands.front().get()); lit == &_always_true) { _dictionary_expr.push(ast::operation{ast_operator::IDENTITY, _dictionary_expr.back()}); + return _always_true; } else { _dictionary_expr.push(ast::operation{op, new_operands.front()}); } diff --git a/cpp/src/io/parquet/stats_filter_helpers.cpp b/cpp/src/io/parquet/stats_filter_helpers.cpp index 78ee1c39a53..4e734c6984d 100644 --- a/cpp/src/io/parquet/stats_filter_helpers.cpp +++ b/cpp/src/io/parquet/stats_filter_helpers.cpp @@ -204,6 +204,7 @@ std::reference_wrapper stats_expression_converter::visit( if (auto* lit = dynamic_cast(&new_operands.front().get()); lit == &_always_true) { _stats_expr.push(ast::operation{ast_operator::IDENTITY, _stats_expr.back()}); + return _always_true; } else { _stats_expr.push(ast::operation{op, new_operands.front()}); } From b56d27a8900346152a57ec27122cd9fd15364138 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 2 Oct 2025 21:00:26 +0000 Subject: [PATCH 10/22] Avoid dynamic cast and directly compare `new_operands` and `_always_true` ptrs --- cpp/src/io/parquet/bloom_filter_reader.cu | 3 +-- cpp/src/io/parquet/experimental/dictionary_page_filter.cu | 3 +-- cpp/src/io/parquet/stats_filter_helpers.cpp | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu index 26e38bc9d6c..063bf72b7c7 100644 --- a/cpp/src/io/parquet/bloom_filter_reader.cu +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -250,8 +250,7 @@ class bloom_filter_expression_converter : public equality_literals_collector { _bloom_filter_expr.push(ast::operation{op, new_operands.front(), new_operands.back()}); } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { // If the new_operands is just a `_always_true` literal, propagate it here - if (auto* lit = dynamic_cast(&new_operands.front().get()); - lit == &_always_true) { + if (&new_operands.front().get() == &_always_true) { _bloom_filter_expr.push( ast::operation{ast_operator::IDENTITY, _bloom_filter_expr.back()}); return _always_true; diff --git a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu index 2a5254af7b6..853d4eda963 100644 --- a/cpp/src/io/parquet/experimental/dictionary_page_filter.cu +++ b/cpp/src/io/parquet/experimental/dictionary_page_filter.cu @@ -1458,8 +1458,7 @@ class dictionary_expression_converter : public equality_literals_collector { _dictionary_expr.push(ast::operation{op, new_operands.front(), new_operands.back()}); } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { // If the new_operands is just a `_always_true` literal, propagate it here - if (auto* lit = dynamic_cast(&new_operands.front().get()); - lit == &_always_true) { + if (&new_operands.front().get() == &_always_true) { _dictionary_expr.push(ast::operation{ast_operator::IDENTITY, _dictionary_expr.back()}); return _always_true; } else { diff --git a/cpp/src/io/parquet/stats_filter_helpers.cpp b/cpp/src/io/parquet/stats_filter_helpers.cpp index 4e734c6984d..35c678962c4 100644 --- a/cpp/src/io/parquet/stats_filter_helpers.cpp +++ b/cpp/src/io/parquet/stats_filter_helpers.cpp @@ -201,8 +201,7 @@ std::reference_wrapper stats_expression_converter::visit( _stats_expr.push(ast::operation{op, new_operands.front(), new_operands.back()}); } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { // If the new_operands is just a `_always_true` literal, propagate it here - if (auto* lit = dynamic_cast(&new_operands.front().get()); - lit == &_always_true) { + if (&new_operands.front().get() == &_always_true) { _stats_expr.push(ast::operation{ast_operator::IDENTITY, _stats_expr.back()}); return _always_true; } else { From 4719b73a14ba3fd69ceb31b784e13276eec81d65 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 9 Oct 2025 17:58:05 +0000 Subject: [PATCH 11/22] Minor comment update --- cpp/tests/io/experimental/hybrid_scan_common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/io/experimental/hybrid_scan_common.cpp b/cpp/tests/io/experimental/hybrid_scan_common.cpp index b0d6422c110..e126b0cbf61 100644 --- a/cpp/tests/io/experimental/hybrid_scan_common.cpp +++ b/cpp/tests/io/experimental/hybrid_scan_common.cpp @@ -170,7 +170,7 @@ auto apply_parquet_filters(cudf::host_span file_buffer_span, std::vector dictionary_page_buffers = fetch_byte_ranges(file_buffer_span, dict_page_byte_ranges, stream, mr); - // NOT YET IMPLEMENTED - Filter row groups with dictionary pages + // Filter row groups with dictionary pages dictionary_page_filtered_row_group_indices = reader->filter_row_groups_with_dictionary_pages( dictionary_page_buffers, current_row_group_indices, options, stream); From 9e96d8fb13581090758f84e5f34774489d001a6b Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 14 Oct 2025 00:24:27 +0000 Subject: [PATCH 12/22] Make `is_null` evaluation optional --- .../parquet/experimental/page_index_filter.cu | 158 +++++++++++------- cpp/src/io/parquet/predicate_pushdown.cpp | 71 ++++---- cpp/src/io/parquet/stats_filter_helpers.cpp | 27 ++- cpp/src/io/parquet/stats_filter_helpers.hpp | 7 +- 4 files changed, 157 insertions(+), 106 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 6fa0843ee89..5d75ee86642 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -249,15 +249,7 @@ struct page_stats_caster : public stats_caster_base { cudf::size_type total_rows; cudf::host_span per_file_metadata; cudf::host_span const> row_group_indices; - - page_stats_caster(size_type total_rows, - cudf::host_span per_file_metadata, - cudf::host_span const> row_group_indices) - : total_rows{total_rows}, - per_file_metadata{per_file_metadata}, - row_group_indices{row_group_indices} - { - } + bool const has_is_null_operator; /** * @brief Transforms a page-level stats column to a row-level stats column for non-string types @@ -316,6 +308,42 @@ struct page_stats_caster : public stats_caster_base { return {std::move(output_data), std::move(output_nullmask)}; } + /** + * @brief Builds a device column containing each page's `is_null` statistic at + * respectively of a column at each row index. + * + * @param is_null Host column storing the page-level is_null statistics + * @param page_indices Device vector containing the page index for each row index + * @param page_row_offsets Host vector row offsets of each page + * @param stream CUDA stream + * @param mr Device memory resource + * + * @return A pair containing the output data buffer and nullmask + */ + [[nodiscard]] std::optional> build_is_null_device_column( + std::optional>& is_null, + cudf::device_span page_indices, + cudf::host_span page_row_offsets, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const + { + CUDF_EXPECTS(has_is_null_operator and is_null.has_value(), + "is_null host column must be present"); + auto const dtype = cudf::data_type{cudf::type_id::BOOL8}; + auto is_nullcol = is_null->to_device(dtype, stream, mr); + auto [null_data, null_bitmask] = build_data_and_nullmask(is_nullcol->mutable_view(), + is_null->null_mask.data(), + page_indices, + page_row_offsets, + dtype, + stream, + mr); + auto const null_nulls = cudf::detail::null_count( + reinterpret_cast(null_bitmask.data()), 0, total_rows, stream); + return std::make_optional(std::make_unique( + dtype, total_rows, std::move(null_data), std::move(null_bitmask), null_nulls)); + } + /** * @brief Transforms a page-level stats column to a row-level stats column for string type * @@ -440,14 +468,16 @@ struct page_stats_caster : public stats_caster_base { * @param stream CUDA stream * @param mr Device memory resource * - * @return A pair of device columns with min and max value from page statistics for each row + * @return A tuple of device columns with min, max and optionally is_null value from page + * statistics for each row */ template - std::tuple, std::unique_ptr, std::unique_ptr> operator()( - cudf::size_type schema_idx, - cudf::data_type dtype, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const + std:: + tuple, std::unique_ptr, std::optional>> + operator()(cudf::size_type schema_idx, + cudf::data_type dtype, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { // List, Struct, Dictionary types are not supported if constexpr (cudf::is_compound() and not cuda::std::is_same_v) { @@ -463,10 +493,11 @@ struct page_stats_caster : public stats_caster_base { auto const total_pages = col_chunk_page_offsets.back(); - // Create host columns with page-level min, max values + // Create host columns with page-level min, max and optionally is_null statistics host_column min(total_pages, stream); host_column max(total_pages, stream); - host_column is_null(total_pages, stream); + std::optional> is_null; + if (has_is_null_operator) { is_null = host_column(total_pages, stream); } // Populate the host columns with page-level min, max statistics from the page index auto page_offset_idx = 0; @@ -501,23 +532,28 @@ struct page_stats_caster : public stats_caster_base { // Translate binary data to Type then to min.set_index(column_page_idx, min_value, colchunk.meta_data.type); max.set_index(column_page_idx, max_value, colchunk.meta_data.type); - if (column_index.null_pages[page_idx]) { - is_null.val[column_page_idx] = true; - return; - } - if (not column_index.null_counts.has_value()) { - is_null.set_index(column_page_idx, std::nullopt, {}); - return; - } - auto const page_row_count = page_row_offsets[column_page_idx + 1] - - page_row_offsets[column_page_idx]; - auto const& null_count = column_index.null_counts.value()[page_idx]; - if (null_count == page_row_count) { - is_null.val[column_page_idx] = false; - } else if (null_count > 0 and null_count < page_row_count) { - is_null.set_index(column_page_idx, std::nullopt, {}); - } else { - CUDF_FAIL("Invalid null count"); + if (has_is_null_operator) { + // Check if the page is completely null + if (column_index.null_pages[page_idx]) { + is_null->val[column_page_idx] = true; + return; + } + // Check if the page doesn't have a null count + if (not column_index.null_counts.has_value()) { + is_null->set_index(column_page_idx, std::nullopt, {}); + return; + } + // Use the null count to determine if the page is completely null + auto const page_row_count = page_row_offsets[column_page_idx + 1] - + page_row_offsets[column_page_idx]; + auto const& null_count = column_index.null_counts.value()[page_idx]; + if (null_count == page_row_count) { + is_null->val[column_page_idx] = false; + } else if (null_count > 0 and null_count < page_row_count) { + is_null->set_index(column_page_idx, std::nullopt, {}); + } else { + CUDF_FAIL("Invalid null count"); + } } }); }); @@ -530,50 +566,41 @@ struct page_stats_caster : public stats_caster_base { // For non-strings columns, directly gather the page-level column data and bitmask to the // row-level. if constexpr (not cuda::std::is_same_v) { - // Move host columns to device - auto mincol = min.to_device(dtype, stream, mr); - auto maxcol = max.to_device(dtype, stream, mr); - auto is_nullcol = is_null.to_device(cudf::data_type{cudf::type_id::BOOL8}, stream, mr); + // Move host min/max columns to device + auto mincol = min.to_device(dtype, stream, mr); + auto maxcol = max.to_device(dtype, stream, mr); // Convert page-level min and max columns to row-level min and max columns by gathering // values based on page-level row offsets - auto [min_data, min_bitmask] = build_data_and_nullmask(mincol->mutable_view(), + auto [min_data, min_bitmask] = build_data_and_nullmask(mincol->mutable_view(), min.null_mask.data(), page_indices, page_row_offsets, dtype, stream, mr); - auto [max_data, max_bitmask] = build_data_and_nullmask(maxcol->mutable_view(), + auto [max_data, max_bitmask] = build_data_and_nullmask(maxcol->mutable_view(), max.null_mask.data(), page_indices, page_row_offsets, dtype, stream, mr); - auto [null_data, null_bitmask] = build_data_and_nullmask(is_nullcol->mutable_view(), - is_null.null_mask.data(), - page_indices, - page_row_offsets, - dtype, - stream, - mr); // Count nulls in min and max columns auto const min_nulls = cudf::detail::null_count( reinterpret_cast(min_bitmask.data()), 0, total_rows, stream); auto const max_nulls = cudf::detail::null_count( reinterpret_cast(max_bitmask.data()), 0, total_rows, stream); - auto const null_nulls = cudf::detail::null_count( - reinterpret_cast(null_bitmask.data()), 0, total_rows, stream); // Return min, max and is_null device columns return {std::make_unique( dtype, total_rows, std::move(min_data), std::move(min_bitmask), min_nulls), std::make_unique( dtype, total_rows, std::move(max_data), std::move(max_bitmask), max_nulls), - std::make_unique( - dtype, total_rows, std::move(null_data), std::move(null_bitmask), null_nulls)}; + has_is_null_operator + ? build_is_null_device_column(is_null, page_indices, page_row_offsets, stream, mr) + : std::nullopt}; } // For strings columns, gather the page-level string offsets and bitmask to row-level // directly and gather string chars using a batched memcpy. @@ -582,17 +609,12 @@ struct page_stats_caster : public stats_caster_base { min.val, min.chars, min.null_mask.data(), page_indices, page_row_offsets, stream, mr); auto [max_data, max_offsets, max_nullmask] = build_string_data_and_nullmask( max.val, max.chars, max.null_mask.data(), page_indices, page_row_offsets, stream, mr); - auto is_nullcol = is_null.to_device(cudf::data_type{cudf::type_id::BOOL8}, stream, mr); - auto [null_data, null_bitmask] = build_data_and_nullmask( - is_nullcol->mutable_view(), {}, page_indices, page_row_offsets, dtype, stream, mr); // Count nulls in min and max columns auto const min_nulls = cudf::detail::null_count( reinterpret_cast(min_nullmask.data()), 0, total_rows, stream); auto const max_nulls = cudf::detail::null_count( reinterpret_cast(max_nullmask.data()), 0, total_rows, stream); - auto const null_nulls = cudf::detail::null_count( - reinterpret_cast(null_bitmask.data()), 0, total_rows, stream); // Return min, max and is_null device strings columns return { @@ -608,8 +630,9 @@ struct page_stats_caster : public stats_caster_base { std::move(max_data), max_nulls, std::move(max_nullmask)), - std::make_unique( - dtype, total_rows, std::move(null_data), std::move(null_bitmask), null_nulls)}; + has_is_null_operator + ? build_is_null_device_column(is_null, page_indices, page_row_offsets, stream, mr) + : std::nullopt}; } } } @@ -672,8 +695,10 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag // Convert page statistics to a table // where min(col[i]) = columns[i*2], max(col[i])=columns[i*2+1] // For each column, it contains total number of rows from all row groups. - page_stats_caster const stats_col{ - static_cast(total_rows), per_file_metadata, row_group_indices}; + page_stats_caster const stats_col{.total_rows = static_cast(total_rows), + .per_file_metadata = per_file_metadata, + .row_group_indices = row_group_indices, + .has_is_null_operator = has_is_null_operator}; std::vector> columns; std::for_each( @@ -690,22 +715,27 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag data_type{cudf::type_id::BOOL8}, total_rows, rmm::device_buffer{}, 0, stream, mr)); columns.push_back(cudf::make_numeric_column( data_type{cudf::type_id::BOOL8}, total_rows, rmm::device_buffer{}, 0, stream, mr)); - columns.push_back(cudf::make_numeric_column( - data_type{cudf::type_id::BOOL8}, total_rows, rmm::device_buffer{}, 0, stream, mr)); + if (has_is_null_operator) { + columns.push_back(cudf::make_numeric_column( + data_type{cudf::type_id::BOOL8}, total_rows, rmm::device_buffer{}, 0, stream, mr)); + } return; } auto [min_col, max_col, is_null_col] = cudf::type_dispatcher( dtype, stats_col, schema_idx, dtype, stream, mr); columns.push_back(std::move(min_col)); columns.push_back(std::move(max_col)); - columns.push_back(std::move(is_null_col)); + if (has_is_null_operator) { + CUDF_EXPECTS(is_null_col.has_value(), "is_null host column must be present"); + columns.push_back(std::move(is_null_col.value())); + } }); auto stats_table = cudf::table(std::move(columns)); // Converts AST to StatsAST with reference to min, max columns in above `stats_table`. parquet::detail::stats_expression_converter const stats_expr{ - filter.get(), static_cast(output_dtypes.size()), stream}; + filter.get(), static_cast(output_dtypes.size()), has_is_null_operator, stream}; // Filter the input table using AST expression and return the (BOOL8) predicate column. return cudf::detail::compute_column(stats_table, stats_expr.get_stats_expr().get(), stream, mr); diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index ed37e002837..9f418e2d33f 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -52,23 +52,16 @@ struct row_group_stats_caster : public stats_caster_base { size_type total_row_groups; std::vector const& per_file_metadata; host_span const> row_group_indices; - - row_group_stats_caster(size_type total_row_groups, - std::vector const& per_file_metadata, - host_span const> row_group_indices) - : total_row_groups{total_row_groups}, - per_file_metadata{per_file_metadata}, - row_group_indices{row_group_indices} - { - } + bool has_is_null_operator; // Creates device columns from column statistics (min, max) template - std::tuple, std::unique_ptr, std::unique_ptr> operator()( - int schema_idx, - cudf::data_type dtype, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const + std:: + tuple, std::unique_ptr, std::optional>> + operator()(int schema_idx, + cudf::data_type dtype, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { // List, Struct, Dictionary types are not supported if constexpr (cudf::is_compound() && !std::is_same_v) { @@ -76,7 +69,8 @@ struct row_group_stats_caster : public stats_caster_base { } else { host_column min(total_row_groups, stream); host_column max(total_row_groups, stream); - host_column is_null(total_row_groups, stream); + std::optional> is_null; + if (has_is_null_operator) { is_null = host_column(total_row_groups, stream); } size_type stats_idx = 0; for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) { @@ -99,30 +93,34 @@ struct row_group_stats_caster : public stats_caster_base { min.set_index(stats_idx, min_value, colchunk.meta_data.type); max.set_index(stats_idx, max_value, colchunk.meta_data.type); // Check the nullability of this column chunk - if (colchunk.meta_data.statistics.null_count.has_value()) { - auto const& null_count = colchunk.meta_data.statistics.null_count.value(); - if (null_count == 0) { - is_null.val[stats_idx] = false; - } else if (null_count < colchunk.meta_data.num_values) { - is_null.set_index(stats_idx, std::nullopt, {}); - } else if (null_count == colchunk.meta_data.num_values) { - is_null.val[stats_idx] = true; - } else { - CUDF_FAIL("Invalid null count"); + if (has_is_null_operator) { + if (colchunk.meta_data.statistics.null_count.has_value()) { + auto const& null_count = colchunk.meta_data.statistics.null_count.value(); + if (null_count == 0) { + is_null->val[stats_idx] = false; + } else if (null_count < colchunk.meta_data.num_values) { + is_null->set_index(stats_idx, std::nullopt, {}); + } else if (null_count == colchunk.meta_data.num_values) { + is_null->val[stats_idx] = true; + } else { + CUDF_FAIL("Invalid null count"); + } } } } else { // Marking it null, if column present in row group min.set_index(stats_idx, std::nullopt, {}); max.set_index(stats_idx, std::nullopt, {}); - is_null.set_index(stats_idx, std::nullopt, {}); + if (has_is_null_operator) { is_null->set_index(stats_idx, std::nullopt, {}); } } stats_idx++; } }; return {min.to_device(dtype, stream, mr), max.to_device(dtype, stream, mr), - is_null.to_device(data_type{cudf::type_id::BOOL8}, stream, mr)}; + has_is_null_operator ? std::make_optional(is_null->to_device( + data_type{cudf::type_id::BOOL8}, stream, mr)) + : std::nullopt}; } } }; @@ -152,7 +150,11 @@ std::optional>> aggregate_reader_metadata::ap // For each column, it contains #sources * #column_chunks_per_src rows std::vector> columns; row_group_stats_caster const stats_col{ - static_cast(total_row_groups), per_file_metadata, input_row_group_indices}; + .total_row_groups = static_cast(total_row_groups), + .per_file_metadata = per_file_metadata, + .row_group_indices = input_row_group_indices, + .has_is_null_operator = has_is_null_operator}; + for (size_t col_idx = 0; col_idx < output_dtypes.size(); col_idx++) { auto const schema_idx = output_column_schemas[col_idx]; auto const& dtype = output_dtypes[col_idx]; @@ -164,21 +166,26 @@ std::optional>> aggregate_reader_metadata::ap data_type{cudf::type_id::BOOL8}, total_row_groups, rmm::device_buffer{}, 0, stream, mr)); columns.push_back(cudf::make_numeric_column( data_type{cudf::type_id::BOOL8}, total_row_groups, rmm::device_buffer{}, 0, stream, mr)); - columns.push_back(cudf::make_numeric_column( - data_type{cudf::type_id::BOOL8}, total_row_groups, rmm::device_buffer{}, 0, stream, mr)); + if (has_is_null_operator) { + columns.push_back(cudf::make_numeric_column( + data_type{cudf::type_id::BOOL8}, total_row_groups, rmm::device_buffer{}, 0, stream, mr)); + } continue; } auto [min_col, max_col, is_null_col] = cudf::type_dispatcher(dtype, stats_col, schema_idx, dtype, stream, mr); columns.push_back(std::move(min_col)); columns.push_back(std::move(max_col)); - columns.push_back(std::move(is_null_col)); + if (has_is_null_operator) { + CUDF_EXPECTS(is_null_col.has_value(), "is_null column must be present"); + columns.push_back(std::move(is_null_col.value())); + } } auto stats_table = cudf::table(std::move(columns)); // Converts AST to StatsAST with reference to min, max columns in above `stats_table`. stats_expression_converter const stats_expr{ - filter.get(), static_cast(output_dtypes.size()), stream}; + filter.get(), static_cast(output_dtypes.size()), has_is_null_operator, stream}; // Filter stats table with StatsAST expression and collect filtered row group indices return collect_filtered_row_group_indices( diff --git a/cpp/src/io/parquet/stats_filter_helpers.cpp b/cpp/src/io/parquet/stats_filter_helpers.cpp index 3ae4c0e554d..5b1da5feb1d 100644 --- a/cpp/src/io/parquet/stats_filter_helpers.cpp +++ b/cpp/src/io/parquet/stats_filter_helpers.cpp @@ -111,11 +111,13 @@ std::vector> stats_columns_collect stats_expression_converter::stats_expression_converter(ast::expression const& expr, size_type num_columns, + bool has_is_null_operator, rmm::cuda_stream_view stream) : _always_true_scalar{std::make_unique>(true, true, stream)}, _always_true{std::make_unique(*_always_true_scalar)} { - _num_columns = num_columns; + _stats_cols_per_column = has_is_null_operator ? 3 : 2; + _num_columns = num_columns; expr.accept(*this); } @@ -141,7 +143,10 @@ std::reference_wrapper stats_expression_converter::visit( if (operator_arity == 1) { // Evaluate IS_NULL unary operator if (op == ast_operator::IS_NULL) { - auto const& vnull = _stats_expr.push(ast::column_reference{col_index * 3 + 2}); + CUDF_EXPECTS(std::cmp_equal(_stats_cols_per_column, 3), + "IS_NULL operator cannot be evaluated without nullability information column"); + auto const& vnull = + _stats_expr.push(ast::column_reference{col_index * _stats_cols_per_column + 2}); _stats_expr.push(ast::operation{ast_operator::IDENTITY, vnull}); return _stats_expr.back(); } // For all other unary operators, push and return the `_always_true` expression @@ -164,8 +169,10 @@ std::reference_wrapper stats_expression_converter::visit( col1 <= val --> vmin <= val */ case ast_operator::EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 3}); - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 3 + 1}); + auto const& vmin = + _stats_expr.push(ast::column_reference{col_index * _stats_cols_per_column}); + auto const& vmax = + _stats_expr.push(ast::column_reference{col_index * _stats_cols_per_column + 1}); _stats_expr.push(ast::operation{ ast::ast_operator::LOGICAL_AND, _stats_expr.push(ast::operation{ast_operator::GREATER_EQUAL, vmax, literal}), @@ -173,8 +180,10 @@ std::reference_wrapper stats_expression_converter::visit( break; } case ast_operator::NOT_EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 3}); - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 3 + 1}); + auto const& vmin = + _stats_expr.push(ast::column_reference{col_index * _stats_cols_per_column}); + auto const& vmax = + _stats_expr.push(ast::column_reference{col_index * _stats_cols_per_column + 1}); _stats_expr.push( ast::operation{ast_operator::LOGICAL_OR, _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmin, vmax}), @@ -183,13 +192,15 @@ std::reference_wrapper stats_expression_converter::visit( } case ast_operator::LESS: [[fallthrough]]; case ast_operator::LESS_EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 3}); + auto const& vmin = + _stats_expr.push(ast::column_reference{col_index * _stats_cols_per_column}); _stats_expr.push(ast::operation{op, vmin, literal}); break; } case ast_operator::GREATER: [[fallthrough]]; case ast_operator::GREATER_EQUAL: { - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 3 + 1}); + auto const& vmax = + _stats_expr.push(ast::column_reference{col_index * _stats_cols_per_column + 1}); _stats_expr.push(ast::operation{op, vmax, literal}); break; } diff --git a/cpp/src/io/parquet/stats_filter_helpers.hpp b/cpp/src/io/parquet/stats_filter_helpers.hpp index 8a5a0f0ba5c..f481a186e79 100644 --- a/cpp/src/io/parquet/stats_filter_helpers.hpp +++ b/cpp/src/io/parquet/stats_filter_helpers.hpp @@ -283,13 +283,15 @@ class stats_columns_collector : public ast::detail::expression_transformer { * @brief Converts AST expression to StatsAST for comparing with column statistics * * This is used in row group filtering based on predicate. - * statistics min value of a column is referenced by column_index*2 - * statistics max value of a column is referenced by column_index*2+1 + * statistics min value of a column is referenced by column_index*3 + * statistics max value of a column is referenced by column_index*3+1 + * statistics is_null value of a column is referenced by column_index*3+2 */ class stats_expression_converter : public stats_columns_collector { public: stats_expression_converter(ast::expression const& expr, size_type num_columns, + bool has_is_null_operator, rmm::cuda_stream_view stream); // Bring all overrides of `visit` from stats_columns_collector into scope @@ -314,6 +316,7 @@ class stats_expression_converter : public stats_columns_collector { private: ast::tree _stats_expr; + cudf::size_type _stats_cols_per_column; std::unique_ptr> _always_true_scalar; std::unique_ptr _always_true; }; From de9a887d73b2fb8766b9f8417a6e85788f1a1efa Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 20 Oct 2025 20:59:11 +0000 Subject: [PATCH 13/22] Address reviews --- .../parquet/experimental/page_index_filter.cu | 28 ++++++++++--------- cpp/src/io/parquet/stats_filter_helpers.hpp | 2 +- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 5d75ee86642..da2974b1967 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -320,19 +320,20 @@ struct page_stats_caster : public stats_caster_base { * * @return A pair containing the output data buffer and nullmask */ - [[nodiscard]] std::optional> build_is_null_device_column( - std::optional>& is_null, + [[nodiscard]] std::unique_ptr build_is_null_device_column( + host_column const& is_null, cudf::device_span page_indices, cudf::host_span page_row_offsets, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - CUDF_EXPECTS(has_is_null_operator and is_null.has_value(), - "is_null host column must be present"); - auto const dtype = cudf::data_type{cudf::type_id::BOOL8}; - auto is_nullcol = is_null->to_device(dtype, stream, mr); + CUDF_EXPECTS( + has_is_null_operator, + "The filter expression must have an IS_NULL operator to build is_null device column"); + auto const dtype = cudf::data_type{cudf::type_id::BOOL8}; + auto is_nullcol = is_null.to_device(dtype, stream, cudf::get_current_device_resource_ref()); auto [null_data, null_bitmask] = build_data_and_nullmask(is_nullcol->mutable_view(), - is_null->null_mask.data(), + is_null.null_mask.data(), page_indices, page_row_offsets, dtype, @@ -340,8 +341,8 @@ struct page_stats_caster : public stats_caster_base { mr); auto const null_nulls = cudf::detail::null_count( reinterpret_cast(null_bitmask.data()), 0, total_rows, stream); - return std::make_optional(std::make_unique( - dtype, total_rows, std::move(null_data), std::move(null_bitmask), null_nulls)); + return std::make_unique( + dtype, total_rows, std::move(null_data), std::move(null_bitmask), null_nulls); } /** @@ -599,7 +600,8 @@ struct page_stats_caster : public stats_caster_base { std::make_unique( dtype, total_rows, std::move(max_data), std::move(max_bitmask), max_nulls), has_is_null_operator - ? build_is_null_device_column(is_null, page_indices, page_row_offsets, stream, mr) + ? std::make_optional(build_is_null_device_column( + is_null.value(), page_indices, page_row_offsets, stream, mr)) : std::nullopt}; } // For strings columns, gather the page-level string offsets and bitmask to row-level @@ -630,9 +632,9 @@ struct page_stats_caster : public stats_caster_base { std::move(max_data), max_nulls, std::move(max_nullmask)), - has_is_null_operator - ? build_is_null_device_column(is_null, page_indices, page_row_offsets, stream, mr) - : std::nullopt}; + has_is_null_operator ? std::make_optional(build_is_null_device_column( + is_null.value(), page_indices, page_row_offsets, stream, mr)) + : std::nullopt}; } } } diff --git a/cpp/src/io/parquet/stats_filter_helpers.hpp b/cpp/src/io/parquet/stats_filter_helpers.hpp index f481a186e79..004f8203679 100644 --- a/cpp/src/io/parquet/stats_filter_helpers.hpp +++ b/cpp/src/io/parquet/stats_filter_helpers.hpp @@ -206,7 +206,7 @@ class stats_caster_base { std::unique_ptr inline to_device(cudf::data_type dtype, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) + rmm::device_async_resource_ref mr) const { if constexpr (std::is_same_v) { auto [d_chars, d_offsets, _] = make_strings_children(val, chars, stream, mr); From a5cc17444f455f0eecc22c078432cf368b21b75c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 20 Oct 2025 15:04:34 -0700 Subject: [PATCH 14/22] Add [[nodiscard]] attribute to to_device method --- cpp/src/io/parquet/stats_filter_helpers.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/stats_filter_helpers.hpp b/cpp/src/io/parquet/stats_filter_helpers.hpp index 004f8203679..cb3b23e1864 100644 --- a/cpp/src/io/parquet/stats_filter_helpers.hpp +++ b/cpp/src/io/parquet/stats_filter_helpers.hpp @@ -204,9 +204,9 @@ class stats_caster_base { return {std::move(d_chars), std::move(d_offsets), std::move(d_sizes)}; } - std::unique_ptr inline to_device(cudf::data_type dtype, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const + [[nodiscard]] std::unique_ptr inline to_device(cudf::data_type dtype, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { if constexpr (std::is_same_v) { auto [d_chars, d_offsets, _] = make_strings_children(val, chars, stream, mr); From e6d52215f4a631c8b14994ecae3fa6e65d4165f2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 21 Oct 2025 15:32:17 -0700 Subject: [PATCH 15/22] Apply suggestion from @mhaseeb123 --- cpp/src/io/parquet/experimental/page_index_filter.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index da2974b1967..cbe8c606708 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -461,7 +461,8 @@ struct page_stats_caster : public stats_caster_base { /** * @brief Builds two device columns storing the corresponding page-level statistics (min, max) - * respectively of a column at each row index. + * @brief Builds three device columns storing the corresponding page-level statistics + * (min, max, is_null) respectively of a column at each row index * * @tparam T underlying type of the column * @param schema_idx Column schema index From f6dc421c312696fac1f285051b6d195d976bb595 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 21 Oct 2025 15:32:33 -0700 Subject: [PATCH 16/22] Apply suggestion from @mhaseeb123 --- cpp/src/io/parquet/experimental/page_index_filter.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index cbe8c606708..abd6feef4e7 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -460,7 +460,6 @@ struct page_stats_caster : public stats_caster_base { } /** - * @brief Builds two device columns storing the corresponding page-level statistics (min, max) * @brief Builds three device columns storing the corresponding page-level statistics * (min, max, is_null) respectively of a column at each row index * From 106d25297bd4e9cef7e7b8263357c9050c6e5e6d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 21 Oct 2025 22:34:23 +0000 Subject: [PATCH 17/22] style plz --- cpp/src/io/parquet/experimental/page_index_filter.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index abd6feef4e7..decf7089787 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -460,8 +460,8 @@ struct page_stats_caster : public stats_caster_base { } /** - * @brief Builds three device columns storing the corresponding page-level statistics - * (min, max, is_null) respectively of a column at each row index + * @brief Builds three device columns storing the corresponding page-level statistics + * (min, max, is_null) respectively of a column at each row index * * @tparam T underlying type of the column * @param schema_idx Column schema index From f0fdf8b866fa835c56d5d1a6d0b471713af1d46e Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 22 Oct 2025 01:21:48 +0000 Subject: [PATCH 18/22] Use correct `mr` for internals --- .../parquet/experimental/page_index_filter.cu | 51 ++++++++++++------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index decf7089787..4902b8e8506 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -373,7 +373,8 @@ struct page_stats_caster : public stats_caster_base { // Construct device vectors containing page-level (input) string data, and offsets and sizes auto [page_str_chars, page_str_offsets, page_str_sizes] = - host_column::make_strings_children(host_strings, host_chars, stream, mr); + host_column::make_strings_children( + host_strings, host_chars, stream, cudf::get_current_device_resource_ref()); // Buffer for row-level string sizes (output). auto row_str_sizes = rmm::device_uvector(total_rows, stream, mr); @@ -568,8 +569,8 @@ struct page_stats_caster : public stats_caster_base { // row-level. if constexpr (not cuda::std::is_same_v) { // Move host min/max columns to device - auto mincol = min.to_device(dtype, stream, mr); - auto maxcol = max.to_device(dtype, stream, mr); + auto mincol = min.to_device(dtype, stream, cudf::get_current_device_resource_ref()); + auto maxcol = max.to_device(dtype, stream, cudf::get_current_device_resource_ref()); // Convert page-level min and max columns to row-level min and max columns by gathering // values based on page-level row offsets @@ -695,14 +696,14 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag } // Convert page statistics to a table - // where min(col[i]) = columns[i*2], max(col[i])=columns[i*2+1] + // where min(col[i]) = columns[i*3], max(col[i])=columns[i*3+1], is_null(col[i])=columns[i*3+2] // For each column, it contains total number of rows from all row groups. page_stats_caster const stats_col{.total_rows = static_cast(total_rows), .per_file_metadata = per_file_metadata, .row_group_indices = row_group_indices, .has_is_null_operator = has_is_null_operator}; - std::vector> columns; + std::vector> page_stats_columns; std::for_each( thrust::counting_iterator(0), thrust::counting_iterator(num_columns), @@ -713,34 +714,50 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag if (not stats_columns_mask[col_idx] or (cudf::is_compound(dtype) && dtype.id() != cudf::type_id::STRING)) { // Placeholder for unsupported types and non-participating columns - columns.push_back(cudf::make_numeric_column( - data_type{cudf::type_id::BOOL8}, total_rows, rmm::device_buffer{}, 0, stream, mr)); - columns.push_back(cudf::make_numeric_column( - data_type{cudf::type_id::BOOL8}, total_rows, rmm::device_buffer{}, 0, stream, mr)); + page_stats_columns.push_back( + cudf::make_numeric_column(data_type{cudf::type_id::BOOL8}, + total_rows, + rmm::device_buffer{}, + 0, + stream, + cudf::get_current_device_resource_ref())); + page_stats_columns.push_back( + cudf::make_numeric_column(data_type{cudf::type_id::BOOL8}, + total_rows, + rmm::device_buffer{}, + 0, + stream, + cudf::get_current_device_resource_ref())); if (has_is_null_operator) { - columns.push_back(cudf::make_numeric_column( - data_type{cudf::type_id::BOOL8}, total_rows, rmm::device_buffer{}, 0, stream, mr)); + page_stats_columns.push_back( + cudf::make_numeric_column(data_type{cudf::type_id::BOOL8}, + total_rows, + rmm::device_buffer{}, + 0, + stream, + cudf::get_current_device_resource_ref())); } return; } auto [min_col, max_col, is_null_col] = cudf::type_dispatcher( - dtype, stats_col, schema_idx, dtype, stream, mr); - columns.push_back(std::move(min_col)); - columns.push_back(std::move(max_col)); + dtype, stats_col, schema_idx, dtype, stream, cudf::get_current_device_resource_ref()); + page_stats_columns.push_back(std::move(min_col)); + page_stats_columns.push_back(std::move(max_col)); if (has_is_null_operator) { CUDF_EXPECTS(is_null_col.has_value(), "is_null host column must be present"); - columns.push_back(std::move(is_null_col.value())); + page_stats_columns.push_back(std::move(is_null_col.value())); } }); - auto stats_table = cudf::table(std::move(columns)); + auto page_stats_table = cudf::table(std::move(page_stats_columns)); // Converts AST to StatsAST with reference to min, max columns in above `stats_table`. parquet::detail::stats_expression_converter const stats_expr{ filter.get(), static_cast(output_dtypes.size()), has_is_null_operator, stream}; // Filter the input table using AST expression and return the (BOOL8) predicate column. - return cudf::detail::compute_column(stats_table, stats_expr.get_stats_expr().get(), stream, mr); + return cudf::detail::compute_column( + page_stats_table, stats_expr.get_stats_expr().get(), stream, mr); } std::vector> aggregate_reader_metadata::compute_data_page_mask( From 7e496af873e6cd3767548412932f5ad1203fcefc Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 22 Oct 2025 01:53:10 +0000 Subject: [PATCH 19/22] fix docstring --- cpp/src/io/parquet/experimental/page_index_filter.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 4902b8e8506..988311c7356 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -242,8 +242,8 @@ namespace { }; /** - * @brief Converts page-level statistics of a column to 2 device columns - min, max values. Each - * column has number of rows equal to the total rows in all row groups. + * @brief Converts page-level statistics of a column to 3 device columns - min, max and is_null + * values. Each column has number of rows equal to the total rows in all row groups. */ struct page_stats_caster : public stats_caster_base { cudf::size_type total_rows; From 54812f122c36656b9ec46a579788b64d1d3e7792 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 22 Oct 2025 02:05:10 +0000 Subject: [PATCH 20/22] Optimize for no nulls in page stats --- .../parquet/experimental/page_index_filter.cu | 177 +++++++++++------- 1 file changed, 108 insertions(+), 69 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 988311c7356..fccf2c78041 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -289,21 +289,25 @@ struct page_stats_caster : public stats_caster_base { input_column.template begin(), reinterpret_cast(output_data.data())); - // Buffer for output bitmask. Set all bits valid - auto output_nullmask = cudf::create_null_mask(total_rows, mask_state::ALL_VALID, stream, mr); - - // For each input page, invalidate the null mask for corresponding rows if needed. - std::for_each(thrust::counting_iterator(0), - thrust::counting_iterator(total_pages), - [&](auto const page_idx) { - if (not bit_is_set(page_nullmask, page_idx)) { - cudf::set_null_mask(static_cast(output_nullmask.data()), - page_row_offsets[page_idx], - page_row_offsets[page_idx + 1], - false, - stream); - } - }); + // Buffer for output bitmask + auto output_nullmask = rmm::device_buffer{}; + if (input_column.null_count()) { + // Set all bits in output nullmask to valid + output_nullmask = cudf::create_null_mask(total_rows, mask_state::ALL_VALID, stream, mr); + + // For each input page, invalidate the null mask for corresponding rows if needed. + std::for_each(thrust::counting_iterator(0), + thrust::counting_iterator(total_pages), + [&](auto const page_idx) { + if (not bit_is_set(page_nullmask, page_idx)) { + cudf::set_null_mask(static_cast(output_nullmask.data()), + page_row_offsets[page_idx], + page_row_offsets[page_idx + 1], + false, + stream); + } + }); + } return {std::move(output_data), std::move(output_nullmask)}; } @@ -332,17 +336,21 @@ struct page_stats_caster : public stats_caster_base { "The filter expression must have an IS_NULL operator to build is_null device column"); auto const dtype = cudf::data_type{cudf::type_id::BOOL8}; auto is_nullcol = is_null.to_device(dtype, stream, cudf::get_current_device_resource_ref()); - auto [null_data, null_bitmask] = build_data_and_nullmask(is_nullcol->mutable_view(), - is_null.null_mask.data(), - page_indices, - page_row_offsets, - dtype, - stream, - mr); - auto const null_nulls = cudf::detail::null_count( - reinterpret_cast(null_bitmask.data()), 0, total_rows, stream); + auto [is_null_data, is_null_nullmask] = + build_data_and_nullmask(is_nullcol->mutable_view(), + is_null.null_mask.data(), + page_indices, + page_row_offsets, + dtype, + stream, + mr); + auto const null_nulls = + is_nullcol->null_count() + ? cudf::detail::null_count( + reinterpret_cast(is_null_nullmask.data()), 0, total_rows, stream) + : 0; return std::make_unique( - dtype, total_rows, std::move(null_data), std::move(null_bitmask), null_nulls); + dtype, total_rows, std::move(is_null_data), std::move(is_null_nullmask), null_nulls); } /** @@ -351,6 +359,7 @@ struct page_stats_caster : public stats_caster_base { * @param host_strings Host span of cudf::string_view values in the input page-level host column * @param host_chars Host span of string data of the input page-level host column * @param host_nullmask Nullmask of the input page-level host column + * @param input_null_count Number of nulls in the input page-level host column * @param page_indices Device vector containing the page index for each row index * @param page_row_offsets Host vector row offsets of each page * @param stream CUDA stream @@ -363,6 +372,7 @@ struct page_stats_caster : public stats_caster_base { build_string_data_and_nullmask(cudf::host_span host_strings, cudf::host_span host_chars, bitmask_type const* host_page_nullmask, + size_type input_null_count, cudf::device_span page_indices, cudf::host_span page_row_offsets, rmm::cuda_stream_view stream, @@ -399,21 +409,25 @@ struct page_stats_caster : public stats_caster_base { // page-level strings nullmask (input) auto const input_nullmask = host_page_nullmask; - // Buffer for row-level strings nullmask (output). Initialize to all bits set. - auto output_nullmask = cudf::create_null_mask(total_rows, mask_state::ALL_VALID, stream, mr); - - // For each input page, invalidate the null mask for corresponding rows if needed. - std::for_each(thrust::counting_iterator(0), - thrust::counting_iterator(total_pages), - [&](auto const page_idx) { - if (not bit_is_set(input_nullmask, page_idx)) { - cudf::set_null_mask(static_cast(output_nullmask.data()), - page_row_offsets[page_idx], - page_row_offsets[page_idx + 1], - false, - stream); - } - }); + // Buffer for row-level strings nullmask (output) + auto output_nullmask = rmm::device_buffer{}; + if (input_null_count) { + // Initialize output nullmask to all bits set. + output_nullmask = cudf::create_null_mask(total_rows, mask_state::ALL_VALID, stream, mr); + + // For each input page, invalidate the null mask for corresponding rows if needed. + std::for_each(thrust::counting_iterator(0), + thrust::counting_iterator(total_pages), + [&](auto const page_idx) { + if (not bit_is_set(input_nullmask, page_idx)) { + cudf::set_null_mask(static_cast(output_nullmask.data()), + page_row_offsets[page_idx], + page_row_offsets[page_idx + 1], + false, + stream); + } + }); + } // Buffer for row-level string offsets (output). auto row_str_offsets = @@ -574,32 +588,37 @@ struct page_stats_caster : public stats_caster_base { // Convert page-level min and max columns to row-level min and max columns by gathering // values based on page-level row offsets - auto [min_data, min_bitmask] = build_data_and_nullmask(mincol->mutable_view(), - min.null_mask.data(), - page_indices, - page_row_offsets, - dtype, - stream, - mr); - auto [max_data, max_bitmask] = build_data_and_nullmask(maxcol->mutable_view(), - max.null_mask.data(), - page_indices, - page_row_offsets, - dtype, - stream, - mr); + auto [min_data, min_nullmask] = build_data_and_nullmask(mincol->mutable_view(), + min.null_mask.data(), + page_indices, + page_row_offsets, + dtype, + stream, + mr); + auto [max_data, max_nullmask] = build_data_and_nullmask(maxcol->mutable_view(), + max.null_mask.data(), + page_indices, + page_row_offsets, + dtype, + stream, + mr); // Count nulls in min and max columns - auto const min_nulls = cudf::detail::null_count( - reinterpret_cast(min_bitmask.data()), 0, total_rows, stream); - auto const max_nulls = cudf::detail::null_count( - reinterpret_cast(max_bitmask.data()), 0, total_rows, stream); - + auto const min_nulls = + mincol->null_count() + ? cudf::detail::null_count( + reinterpret_cast(min_nullmask.data()), 0, total_rows, stream) + : 0; + auto const max_nulls = + maxcol->null_count() + ? cudf::detail::null_count( + reinterpret_cast(max_nullmask.data()), 0, total_rows, stream) + : 0; // Return min, max and is_null device columns return {std::make_unique( - dtype, total_rows, std::move(min_data), std::move(min_bitmask), min_nulls), + dtype, total_rows, std::move(min_data), std::move(min_nullmask), min_nulls), std::make_unique( - dtype, total_rows, std::move(max_data), std::move(max_bitmask), max_nulls), + dtype, total_rows, std::move(max_data), std::move(max_nullmask), max_nulls), has_is_null_operator ? std::make_optional(build_is_null_device_column( is_null.value(), page_indices, page_row_offsets, stream, mr)) @@ -608,16 +627,36 @@ struct page_stats_caster : public stats_caster_base { // For strings columns, gather the page-level string offsets and bitmask to row-level // directly and gather string chars using a batched memcpy. else { - auto [min_data, min_offsets, min_nullmask] = build_string_data_and_nullmask( - min.val, min.chars, min.null_mask.data(), page_indices, page_row_offsets, stream, mr); - auto [max_data, max_offsets, max_nullmask] = build_string_data_and_nullmask( - max.val, max.chars, max.null_mask.data(), page_indices, page_row_offsets, stream, mr); + auto [min_data, min_offsets, min_nullmask] = + build_string_data_and_nullmask(min.val, + min.chars, + min.null_mask.data(), + min.null_count, + page_indices, + page_row_offsets, + stream, + mr); + auto [max_data, max_offsets, max_nullmask] = + build_string_data_and_nullmask(max.val, + max.chars, + max.null_mask.data(), + max.null_count, + page_indices, + page_row_offsets, + stream, + mr); // Count nulls in min and max columns - auto const min_nulls = cudf::detail::null_count( - reinterpret_cast(min_nullmask.data()), 0, total_rows, stream); - auto const max_nulls = cudf::detail::null_count( - reinterpret_cast(max_nullmask.data()), 0, total_rows, stream); + auto const min_nulls = + min.null_count + ? cudf::detail::null_count( + reinterpret_cast(min_nullmask.data()), 0, total_rows, stream) + : 0; + auto const max_nulls = + max.null_count + ? cudf::detail::null_count( + reinterpret_cast(max_nullmask.data()), 0, total_rows, stream) + : 0; // Return min, max and is_null device strings columns return { From 7cd8c2686e0f20801548bb53638306c784a92eca Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 22 Oct 2025 02:13:04 +0000 Subject: [PATCH 21/22] Minor docstring update --- cpp/src/io/parquet/experimental/page_index_filter.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index fccf2c78041..0b4a269e991 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -294,7 +294,6 @@ struct page_stats_caster : public stats_caster_base { if (input_column.null_count()) { // Set all bits in output nullmask to valid output_nullmask = cudf::create_null_mask(total_rows, mask_state::ALL_VALID, stream, mr); - // For each input page, invalidate the null mask for corresponding rows if needed. std::for_each(thrust::counting_iterator(0), thrust::counting_iterator(total_pages), @@ -412,9 +411,8 @@ struct page_stats_caster : public stats_caster_base { // Buffer for row-level strings nullmask (output) auto output_nullmask = rmm::device_buffer{}; if (input_null_count) { - // Initialize output nullmask to all bits set. + // Set all bits in output nullmask to valid output_nullmask = cudf::create_null_mask(total_rows, mask_state::ALL_VALID, stream, mr); - // For each input page, invalidate the null mask for corresponding rows if needed. std::for_each(thrust::counting_iterator(0), thrust::counting_iterator(total_pages), From 34372f93ff458311ea231d2a0b21fe32981c3119 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 22 Oct 2025 19:02:49 +0000 Subject: [PATCH 22/22] Minor: make var names consistent --- cpp/src/io/parquet/experimental/page_index_filter.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 0b4a269e991..bed1e24d309 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -343,13 +343,13 @@ struct page_stats_caster : public stats_caster_base { dtype, stream, mr); - auto const null_nulls = + auto const is_null_nulls = is_nullcol->null_count() ? cudf::detail::null_count( reinterpret_cast(is_null_nullmask.data()), 0, total_rows, stream) : 0; return std::make_unique( - dtype, total_rows, std::move(is_null_data), std::move(is_null_nullmask), null_nulls); + dtype, total_rows, std::move(is_null_data), std::move(is_null_nullmask), is_null_nulls); } /** @@ -358,7 +358,7 @@ struct page_stats_caster : public stats_caster_base { * @param host_strings Host span of cudf::string_view values in the input page-level host column * @param host_chars Host span of string data of the input page-level host column * @param host_nullmask Nullmask of the input page-level host column - * @param input_null_count Number of nulls in the input page-level host column + * @param host_null_count Number of nulls in the input page-level host column * @param page_indices Device vector containing the page index for each row index * @param page_row_offsets Host vector row offsets of each page * @param stream CUDA stream @@ -371,7 +371,7 @@ struct page_stats_caster : public stats_caster_base { build_string_data_and_nullmask(cudf::host_span host_strings, cudf::host_span host_chars, bitmask_type const* host_page_nullmask, - size_type input_null_count, + size_type host_null_count, cudf::device_span page_indices, cudf::host_span page_row_offsets, rmm::cuda_stream_view stream, @@ -410,7 +410,7 @@ struct page_stats_caster : public stats_caster_base { // Buffer for row-level strings nullmask (output) auto output_nullmask = rmm::device_buffer{}; - if (input_null_count) { + if (host_null_count) { // Set all bits in output nullmask to valid output_nullmask = cudf::create_null_mask(total_rows, mask_state::ALL_VALID, stream, mr); // For each input page, invalidate the null mask for corresponding rows if needed.