@@ -92,7 +92,7 @@ fn choose_compression_scheme(
92
92
table_schema : & TableSchema ,
93
93
stat : & StatisticsOfColumns ,
94
94
) -> Result < WriterPropertiesBuilder > {
95
- for ( ( parquet_field, table_field) , col ) in parquet_fields
95
+ for ( ( parquet_field, table_field) , _col ) in parquet_fields
96
96
. iter ( )
97
97
. zip ( table_schema. fields . iter ( ) )
98
98
. zip ( block. columns ( ) )
@@ -104,19 +104,21 @@ fn choose_compression_scheme(
104
104
type_length : _,
105
105
scale : _,
106
106
precision : _,
107
- } => {
108
- let distinct_of_values = stat
109
- . get ( & table_field. column_id )
110
- . and_then ( |stat| stat. distinct_of_values ) ;
111
- let num_rows = block. num_rows ( ) ;
112
- if can_apply_dict_encoding ( physical_type, distinct_of_values, num_rows, col) ? {
113
- let col_path = ColumnPath :: new ( vec ! [ table_field. name( ) . clone( ) ] ) ;
114
- props = props. set_column_dictionary_enabled ( col_path, true ) ;
115
- } else if can_apply_delta_binary_pack ( physical_type, col, num_rows) ? {
116
- let col_path = ColumnPath :: new ( vec ! [ table_field. name( ) . clone( ) ] ) ;
117
- props = props. set_column_encoding ( col_path, Encoding :: DELTA_BINARY_PACKED ) ;
107
+ } => match physical_type {
108
+ PhysicalType :: BYTE_ARRAY | PhysicalType :: FIXED_LEN_BYTE_ARRAY => {
109
+ let ndv = stat
110
+ . get ( & table_field. column_id )
111
+ . and_then ( |stat| stat. distinct_of_values ) ;
112
+ let num_rows = block. num_rows ( ) ;
113
+ if let Some ( ndv) = ndv {
114
+ if num_rows as f64 / ndv as f64 > 10.0 {
115
+ let col_path = ColumnPath :: new ( vec ! [ table_field. name( ) . clone( ) ] ) ;
116
+ props = props. set_column_dictionary_enabled ( col_path, true ) ;
117
+ }
118
+ }
118
119
}
119
- }
120
+ _ => { }
121
+ } ,
120
122
Type :: GroupType {
121
123
basic_info : _,
122
124
fields : _,
@@ -126,26 +128,7 @@ fn choose_compression_scheme(
126
128
Ok ( props)
127
129
}
128
130
129
- fn can_apply_dict_encoding (
130
- physical_type : & PhysicalType ,
131
- distinct_of_values : Option < u64 > ,
132
- num_rows : usize ,
133
- col : & BlockEntry ,
134
- ) -> Result < bool > {
135
- const LOW_CARDINALITY_THRESHOLD : f64 = 10.0 ;
136
- const AVG_BYTES_PER_VALUE : f64 = 10.0 ;
137
- if !matches ! ( physical_type, PhysicalType :: BYTE_ARRAY ) {
138
- return Ok ( false ) ;
139
- }
140
- let is_low_cardinality = distinct_of_values
141
- . is_some_and ( |ndv| num_rows as f64 / ndv as f64 > LOW_CARDINALITY_THRESHOLD ) ;
142
- let column = col. value . convert_to_full_column ( & col. data_type , num_rows) ;
143
- let memory_size = column. memory_size ( ) ;
144
- let total_bytes = memory_size - num_rows * 8 ;
145
- let avg_bytes_per_value = total_bytes as f64 / num_rows as f64 ;
146
- Ok ( is_low_cardinality && avg_bytes_per_value < AVG_BYTES_PER_VALUE )
147
- }
148
-
131
+ #[ allow( dead_code) ]
149
132
fn can_apply_delta_binary_pack (
150
133
physical_type : & PhysicalType ,
151
134
col : & BlockEntry ,
0 commit comments