Skip to content

Commit 4968568

Browse files
Ziy1-Tan and acezen
authored
[C++] Support property filter pushdown by utilizing payload file formats (#178)
--------- Signed-off-by: Ziy1-Tan <ajb459684460@gmail.com> Co-authored-by: Weibin Zeng <qiaozi.zwb@alibaba-inc.com>
1 parent f4d01b4 commit 4968568

13 files changed

+931
-77
lines changed

cpp/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,11 +191,13 @@ macro(build_gar)
191191
if(APPLE)
192192
target_link_libraries(gar PRIVATE -Wl,-force_load gar_arrow_static
193193
"${GAR_PARQUET_STATIC_LIB}"
194+
"${GAR_DATASET_STATIC_LIB}"
194195
"${GAR_ACERO_STATIC_LIB}"
195196
"${GAR_ARROW_BUNDLED_DEPS_STATIC_LIB}")
196197
else()
197198
target_link_libraries(gar PRIVATE -Wl,--exclude-libs,ALL -Wl,--whole-archive gar_arrow_static
198199
"${GAR_PARQUET_STATIC_LIB}"
200+
"${GAR_DATASET_STATIC_LIB}"
199201
"${GAR_ARROW_ACERO_STATIC_LIB}"
200202
"${GAR_ARROW_BUNDLED_DEPS_STATIC_LIB}" -Wl,--no-whole-archive)
201203
endif()

cpp/cmake/apache-arrow.cmake

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ function(build_arrow)
4444
set(GAR_PARQUET_STATIC_LIB_FILENAME
4545
"${CMAKE_STATIC_LIBRARY_PREFIX}parquet${CMAKE_STATIC_LIBRARY_SUFFIX}")
4646
set(GAR_PARQUET_STATIC_LIB "${GAR_ARROW_STATIC_LIBRARY_DIR}/${GAR_PARQUET_STATIC_LIB_FILENAME}" CACHE INTERNAL "parquet lib")
47+
set(GAR_DATASET_STATIC_LIB_FILENAME
48+
"${CMAKE_STATIC_LIBRARY_PREFIX}arrow_dataset${CMAKE_STATIC_LIBRARY_SUFFIX}")
49+
set(GAR_DATASET_STATIC_LIB "${GAR_ARROW_STATIC_LIBRARY_DIR}/${GAR_DATASET_STATIC_LIB_FILENAME}" CACHE INTERNAL "arrow dataset lib")
4750
set(GAR_ARROW_BUNDLED_DEPS_STATIC_LIB_FILENAME
4851
"${CMAKE_STATIC_LIBRARY_PREFIX}arrow_bundled_dependencies${CMAKE_STATIC_LIBRARY_SUFFIX}")
4952
set(GAR_ARROW_BUNDLED_DEPS_STATIC_LIB
@@ -83,7 +86,7 @@ function(build_arrow)
8386
"-DARROW_S3=ON")
8487

8588
set(GAR_ARROW_INCLUDE_DIR "${GAR_ARROW_PREFIX}/include" CACHE INTERNAL "arrow include directory")
86-
set(GAR_ARROW_BUILD_BYPRODUCTS "${GAR_ARROW_STATIC_LIB}" "${GAR_PARQUET_STATIC_LIB}")
89+
set(GAR_ARROW_BUILD_BYPRODUCTS "${GAR_ARROW_STATIC_LIB}" "${GAR_PARQUET_STATIC_LIB}" "${GAR_DATASET_STATIC_LIB}")
8790

8891
find_package(Threads)
8992
find_package(Arrow QUIET)
@@ -104,16 +107,21 @@ function(build_arrow)
104107

105108
set(GAR_ARROW_LIBRARY_TARGET gar_arrow_static)
106109
set(GAR_PARQUET_LIBRARY_TARGET gar_parquet_static)
110+
set(GAR_DATASET_LIBRARY_TARGET gar_dataset_static)
107111

108112
file(MAKE_DIRECTORY "${GAR_ARROW_INCLUDE_DIR}")
109113
add_library(${GAR_ARROW_LIBRARY_TARGET} STATIC IMPORTED)
110114
add_library(${GAR_PARQUET_LIBRARY_TARGET} STATIC IMPORTED)
115+
add_library(${GAR_DATASET_LIBRARY_TARGET} STATIC IMPORTED)
111116
set_target_properties(${GAR_ARROW_LIBRARY_TARGET}
112117
PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${GAR_ARROW_INCLUDE_DIR}
113118
IMPORTED_LOCATION ${GAR_ARROW_STATIC_LIB})
114119
set_target_properties(${GAR_PARQUET_LIBRARY_TARGET}
115120
PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${GAR_ARROW_INCLUDE_DIR}
116121
IMPORTED_LOCATION ${GAR_PARQUET_STATIC_LIB})
122+
set_target_properties(${GAR_DATASET_LIBRARY_TARGET}
123+
PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${GAR_ARROW_INCLUDE_DIR}
124+
IMPORTED_LOCATION ${GAR_DATASET_STATIC_LIB})
117125
if (ARROW_VERSION_TO_BUILD GREATER_EQUAL "12.0.0")
118126
set(GAR_ARROW_ACERO_STATIC_LIB_FILENAME
119127
"${CMAKE_STATIC_LIBRARY_PREFIX}arrow_acero${CMAKE_STATIC_LIBRARY_SUFFIX}")

cpp/include/gar/graph_info.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ limitations under the License.
1919
#include <map>
2020
#include <memory>
2121
#include <string>
22+
#include <unordered_set>
2223
#include <vector>
2324

2425
#include "utils/adj_list_type.h"
@@ -43,6 +44,11 @@ struct Property {
4344
std::string name; // property name
4445
DataType type; // property data type
4546
bool is_primary; // primary key tag
47+
48+
Property() {}
49+
explicit Property(const std::string& name) : name(name), is_primary(false) {}  // name-only property: type is default-constructed, not a primary key
50+
Property(const std::string& name, const DataType& type, bool is_primary)
51+
: name(name), type(type), is_primary(is_primary) {}
4652
};
4753

4854
static bool operator==(const Property& lhs, const Property& rhs) {
@@ -87,6 +93,7 @@ class PropertyGroup {
8793
std::vector<std::string> names;
8894
for (auto& property : properties_) {
8995
names.push_back(property.name);
96+
property_names_.insert(property.name);
9097
}
9198
prefix_ = util::ConcatStringWithDelimiter(names, REGULAR_SEPERATOR) + "/";
9299
}
@@ -121,6 +128,10 @@ class PropertyGroup {
121128
return properties_;
122129
}
123130

131+
inline bool ContainProperty(const std::string& property_name) const {
132+
return property_names_.find(property_name) != property_names_.end();
133+
}
134+
124135
/** Get the file type of property group chunk file.
125136
*
126137
* @return The file type of group.
@@ -146,6 +157,7 @@ class PropertyGroup {
146157

147158
private:
148159
std::vector<Property> properties_;
160+
std::unordered_set<std::string> property_names_;
149161
FileType file_type_;
150162
std::string prefix_;
151163
};

cpp/include/gar/reader/arrow_chunk_reader.h

Lines changed: 62 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ limitations under the License.
2424
#include "gar/graph_info.h"
2525
#include "gar/utils/data_type.h"
2626
#include "gar/utils/filesystem.h"
27-
#include "gar/utils/reader_utils.h"
2827
#include "gar/utils/result.h"
2928
#include "gar/utils/status.h"
3029
#include "gar/utils/utils.h"
@@ -52,18 +51,22 @@ class VertexPropertyArrowChunkReader {
5251
VertexPropertyArrowChunkReader(const VertexInfo& vertex_info,
5352
const PropertyGroup& property_group,
5453
const std::string& prefix,
55-
IdType chunk_index = 0)
54+
IdType chunk_index = 0,
55+
const utils::FilterOptions& options = {})
5656
: vertex_info_(vertex_info),
5757
property_group_(property_group),
5858
chunk_index_(chunk_index),
5959
seek_id_(chunk_index * vertex_info.GetChunkSize()),
60-
chunk_table_(nullptr) {
60+
chunk_table_(nullptr),
61+
filter_options_(options) {
6162
GAR_ASSIGN_OR_RAISE_ERROR(fs_, FileSystemFromUriOrPath(prefix, &prefix_));
6263
GAR_ASSIGN_OR_RAISE_ERROR(auto pg_path_prefix,
6364
vertex_info.GetPathPrefix(property_group));
6465
std::string base_dir = prefix_ + pg_path_prefix;
6566
GAR_ASSIGN_OR_RAISE_ERROR(chunk_num_,
6667
utils::GetVertexChunkNum(prefix_, vertex_info));
68+
GAR_ASSIGN_OR_RAISE_ERROR(vertex_num_,
69+
utils::GetVertexNum(prefix_, vertex_info_));
6770
}
6871

6972
/**
@@ -126,14 +129,32 @@ class VertexPropertyArrowChunkReader {
126129
*/
127130
IdType GetChunkNum() const noexcept { return chunk_num_; }
128131

132+
/**
133+
* @brief Apply the row filter to the table. Calling Filter() with no
134+
* argument clears the filter.
135+
*
136+
* @param filter Predicate expression to filter rows.
137+
*/
138+
void Filter(utils::Filter filter = nullptr);
139+
140+
/**
141+
* @brief Apply the projection to the table to be read. Calling Select()
142+
* with no argument clears the projection.
143+
*
144+
* @param column_names The name of columns to be selected.
145+
*/
146+
void Select(utils::ColumnNames column_names = std::nullopt);
147+
129148
private:
130149
VertexInfo vertex_info_;
131150
PropertyGroup property_group_;
132151
std::string prefix_;
133152
IdType chunk_index_;
134153
IdType seek_id_;
135154
IdType chunk_num_;
155+
IdType vertex_num_;
136156
std::shared_ptr<arrow::Table> chunk_table_;
157+
utils::FilterOptions filter_options_;
137158
std::shared_ptr<FileSystem> fs_;
138159
};
139160

@@ -227,7 +248,8 @@ class AdjListArrowChunkReader {
227248
}
228249

229250
/**
230-
* @brief Return the current chunk of chunk position indicator as arrow::Table
251+
* @brief Return the current chunk of chunk position indicator as
252+
* arrow::Table
231253
*/
232254
Result<std::shared_ptr<arrow::Table>> GetChunk() noexcept;
233255

@@ -420,7 +442,8 @@ class AdjListPropertyArrowChunkReader {
420442
* @brief Initialize the AdjListPropertyArrowChunkReader.
421443
*
422444
* @param edge_info The edge info that describes the edge type.
423-
* @param property_group The property group that describes the property group.
445+
* @param property_group The property group that describes the property
446+
* group.
424447
* @param adj_list_type The adj list type for the edges.
425448
* @param prefix The absolute prefix.
426449
* @param vertex_chunk_index The vertex chunk index, default is 0.
@@ -429,15 +452,17 @@ class AdjListPropertyArrowChunkReader {
429452
const PropertyGroup& property_group,
430453
AdjListType adj_list_type,
431454
const std::string prefix,
432-
IdType vertex_chunk_index = 0)
455+
IdType vertex_chunk_index = 0,
456+
const utils::FilterOptions& options = {})
433457
: edge_info_(edge_info),
434458
property_group_(property_group),
435459
adj_list_type_(adj_list_type),
436460
prefix_(prefix),
437461
vertex_chunk_index_(vertex_chunk_index),
438462
chunk_index_(0),
439463
seek_offset_(0),
440-
chunk_table_(nullptr) {
464+
chunk_table_(nullptr),
465+
filter_options_(options) {
441466
GAR_ASSIGN_OR_RAISE_ERROR(fs_, FileSystemFromUriOrPath(prefix, &prefix_));
442467
GAR_ASSIGN_OR_RAISE_ERROR(
443468
auto pg_path_prefix,
@@ -463,6 +488,7 @@ class AdjListPropertyArrowChunkReader {
463488
chunk_index_(other.chunk_index_),
464489
seek_offset_(other.seek_offset_),
465490
chunk_table_(nullptr),
491+
filter_options_(other.filter_options_),
466492
vertex_chunk_num_(other.vertex_chunk_num_),
467493
chunk_num_(other.chunk_num_),
468494
base_dir_(other.base_dir_),
@@ -506,7 +532,8 @@ class AdjListPropertyArrowChunkReader {
506532
}
507533

508534
/**
509-
* @brief Return the current chunk of chunk position indicator as arrow::Table
535+
* @brief Return the current chunk of chunk position indicator as
536+
* arrow::Table
510537
*/
511538
Result<std::shared_ptr<arrow::Table>> GetChunk() noexcept;
512539

@@ -564,6 +591,22 @@ class AdjListPropertyArrowChunkReader {
564591
return Status::OK();
565592
}
566593

594+
/**
595+
* @brief Apply the row filter to the table. Calling Filter() with no
596+
* argument clears the filter.
597+
*
598+
* @param filter Predicate expression to filter rows.
599+
*/
600+
void Filter(utils::Filter filter = nullptr);
601+
602+
/**
603+
* @brief Apply the projection to the table to be read. Calling Select()
604+
* with no argument clears the projection.
605+
*
606+
* @param column_names The name of columns to be selected.
607+
*/
608+
void Select(utils::ColumnNames column_names = std::nullopt);
609+
567610
private:
568611
EdgeInfo edge_info_;
569612
PropertyGroup property_group_;
@@ -572,6 +615,7 @@ class AdjListPropertyArrowChunkReader {
572615
IdType vertex_chunk_index_, chunk_index_;
573616
IdType seek_offset_;
574617
std::shared_ptr<arrow::Table> chunk_table_;
618+
utils::FilterOptions filter_options_;
575619
IdType vertex_chunk_num_, chunk_num_;
576620
std::string base_dir_;
577621
std::shared_ptr<FileSystem> fs_;
@@ -587,15 +631,16 @@ class AdjListPropertyArrowChunkReader {
587631
static inline Result<VertexPropertyArrowChunkReader>
588632
ConstructVertexPropertyArrowChunkReader(
589633
const GraphInfo& graph_info, const std::string& label,
590-
const PropertyGroup& property_group) noexcept {
634+
const PropertyGroup& property_group,
635+
const utils::FilterOptions& options = {}) noexcept {
591636
VertexInfo vertex_info;
592637
GAR_ASSIGN_OR_RAISE(vertex_info, graph_info.GetVertexInfo(label));
593638
if (!vertex_info.ContainPropertyGroup(property_group)) {
594639
return Status::KeyError("No property group ", property_group, " in vertex ",
595640
label, ".");
596641
}
597642
return VertexPropertyArrowChunkReader(vertex_info, property_group,
598-
graph_info.GetPrefix());
643+
graph_info.GetPrefix(), 0, options);
599644
}
600645

601646
/**
@@ -663,12 +708,11 @@ ConstructAdjListOffsetArrowChunkReader(const GraphInfo& graph_info,
663708
* @param adj_list_type The adj list type for the edges.
664709
*/
665710
static inline Result<AdjListPropertyArrowChunkReader>
666-
ConstructAdjListPropertyArrowChunkReader(const GraphInfo& graph_info,
667-
const std::string& src_label,
668-
const std::string& edge_label,
669-
const std::string& dst_label,
670-
const PropertyGroup& property_group,
671-
AdjListType adj_list_type) noexcept {
711+
ConstructAdjListPropertyArrowChunkReader(
712+
const GraphInfo& graph_info, const std::string& src_label,
713+
const std::string& edge_label, const std::string& dst_label,
714+
const PropertyGroup& property_group, AdjListType adj_list_type,
715+
const utils::FilterOptions& options = {}) noexcept {
672716
EdgeInfo edge_info;
673717
GAR_ASSIGN_OR_RAISE(edge_info,
674718
graph_info.GetEdgeInfo(src_label, edge_label, dst_label));
@@ -683,7 +727,8 @@ ConstructAdjListPropertyArrowChunkReader(const GraphInfo& graph_info,
683727
AdjListTypeToString(adj_list_type), ".");
684728
}
685729
return AdjListPropertyArrowChunkReader(edge_info, property_group,
686-
adj_list_type, graph_info.GetPrefix());
730+
adj_list_type, graph_info.GetPrefix(),
731+
0, options);
687732
}
688733

689734
} // namespace GAR_NAMESPACE_INTERNAL

0 commit comments

Comments
 (0)