 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
-use std::collections::HashSet;
 use std::sync::Arc;
 
 use common_arrow::arrow::datatypes::Field;
 use common_arrow::arrow::io::parquet::read::column_iter_to_arrays;
 use common_arrow::arrow::io::parquet::read::ArrayIter;
 use common_arrow::arrow::io::parquet::read::RowGroupDeserializer;
-use common_arrow::arrow::io::parquet::write::to_parquet_schema;
 use common_arrow::parquet::metadata::ColumnDescriptor;
-use common_arrow::parquet::metadata::SchemaDescriptor;
 use common_arrow::parquet::read::BasicDecompressor;
 use common_arrow::parquet::read::PageMetaData;
 use common_arrow::parquet::read::PageReader;
 use common_catalog::plan::PartInfoPtr;
-use common_catalog::plan::Projection;
 use common_datablocks::DataBlock;
-use common_datavalues::DataSchemaRef;
 use common_exception::ErrorCode;
 use common_exception::Result;
 use common_storage::ColumnLeaf;
-use common_storage::ColumnLeaves;
-use opendal::Object;
-use opendal::Operator;
 
 use crate::ParquetColumnMeta;
 use crate::ParquetPartInfo;
-
-#[derive(Clone)]
-pub struct ParquetReader {
-    operator: Operator,
-    projection: Projection,
-    projected_schema: DataSchemaRef,
-    column_leaves: ColumnLeaves,
-    parquet_schema_descriptor: SchemaDescriptor,
-}
+use crate::ParquetReader;
 
 impl ParquetReader {
-    pub fn create(
-        operator: Operator,
-        schema: DataSchemaRef,
-        projection: Projection,
-    ) -> Result<Arc<ParquetReader>> {
-        let projected_schema = match projection {
-            Projection::Columns(ref indices) => DataSchemaRef::new(schema.project(indices)),
-            Projection::InnerColumns(ref path_indices) => {
-                DataSchemaRef::new(schema.inner_project(path_indices))
-            }
-        };
-
-        let arrow_schema = schema.to_arrow();
-        let parquet_schema_descriptor = to_parquet_schema(&arrow_schema)?;
-        let column_leaves = ColumnLeaves::new_from_schema(&arrow_schema);
-
-        Ok(Arc::new(ParquetReader {
-            operator,
-            projection,
-            projected_schema,
-            parquet_schema_descriptor,
-            column_leaves,
-        }))
-    }
-
-    pub fn schema(&self) -> DataSchemaRef {
-        self.projected_schema.clone()
-    }
-
     fn to_array_iter(
         metas: Vec<&ParquetColumnMeta>,
         chunks: Vec<Vec<u8>>,
@@ -168,73 +123,6 @@ impl ParquetReader {
         self.try_next_block(&mut deserializer)
     }
 
-    pub async fn read_columns_data(&self, part: PartInfoPtr) -> Result<Vec<(usize, Vec<u8>)>> {
-        let part = ParquetPartInfo::from_part(&part)?;
-        let columns = self.projection.project_column_leaves(&self.column_leaves)?;
-        let indices = Self::build_projection_indices(&columns);
-        let mut join_handlers = Vec::with_capacity(indices.len());
-
-        for index in indices {
-            let column_meta = &part.columns_meta[&index];
-            join_handlers.push(Self::read_column(
-                self.operator.object(&part.location),
-                index,
-                column_meta.offset,
-                column_meta.length,
-            ));
-        }
-
-        futures::future::try_join_all(join_handlers).await
-    }
-
-    pub fn support_blocking_api(&self) -> bool {
-        self.operator.metadata().can_blocking()
-    }
-
-    pub fn sync_read_columns_data(&self, part: PartInfoPtr) -> Result<Vec<(usize, Vec<u8>)>> {
-        let part = ParquetPartInfo::from_part(&part)?;
-
-        let columns = self.projection.project_column_leaves(&self.column_leaves)?;
-        let indices = Self::build_projection_indices(&columns);
-        let mut results = Vec::with_capacity(indices.len());
-
-        for index in indices {
-            let column_meta = &part.columns_meta[&index];
-
-            let op = self.operator.clone();
-
-            let location = part.location.clone();
-            let offset = column_meta.offset;
-            let length = column_meta.length;
-
-            let result = Self::sync_read_column(op.object(&location), index, offset, length);
-            results.push(result?);
-        }
-
-        Ok(results)
-    }
-
-    pub async fn read_column(
-        o: Object,
-        index: usize,
-        offset: u64,
-        length: u64,
-    ) -> Result<(usize, Vec<u8>)> {
-        let chunk = o.range_read(offset..offset + length).await?;
-
-        Ok((index, chunk))
-    }
-
-    pub fn sync_read_column(
-        o: Object,
-        index: usize,
-        offset: u64,
-        length: u64,
-    ) -> Result<(usize, Vec<u8>)> {
-        let chunk = o.blocking_range_read(offset..offset + length)?;
-        Ok((index, chunk))
-    }
-
     fn try_next_block(&self, deserializer: &mut RowGroupDeserializer) -> Result<DataBlock> {
         match deserializer.next() {
             None => Err(ErrorCode::Internal(
@@ -245,17 +133,6 @@ impl ParquetReader {
         }
     }
 
-    // Build non duplicate leaf_ids to avoid repeated read column from parquet
-    fn build_projection_indices(columns: &Vec<&ColumnLeaf>) -> HashSet<usize> {
-        let mut indices = HashSet::with_capacity(columns.len());
-        for column in columns {
-            for index in &column.leaf_ids {
-                indices.insert(*index);
-            }
-        }
-        indices
-    }
-
     // Build a map to record the count number of each leaf_id
     fn build_projection_count_map(columns: &Vec<&ColumnLeaf>) -> HashMap<usize, usize> {
         let mut cnt_map = HashMap::with_capacity(columns.len());