Skip to content

Commit c8ab979

Browse files
authored
fix(query): fix precision of timestamp's stats in parquet-rs (#13284)
* fix(query): fix precision in timestamp of parquet
* fix(query): fix precision in timestamp of parquet
* fix(query): fix precision in timestamp of parquet
* fix(query): fix precision in timestamp of parquet
1 parent b9898c6 commit c8ab979

File tree

8 files changed

+79
-6
lines changed

8 files changed

+79
-6
lines changed

src/query/storages/parquet/src/parquet_rs/statistics/column.rs

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,17 @@ pub fn convert_column_statistics(s: &Statistics, typ: &TableDataType) -> ColumnS
7979
TableDataType::Number(NumberDataType::Int64) => {
8080
(Scalar::from(max), Scalar::from(min))
8181
}
82-
TableDataType::Timestamp => (Scalar::Timestamp(max), Scalar::Timestamp(min)),
82+
TableDataType::Timestamp => {
83+
let multi = match max.checked_ilog10().unwrap_or_default() + 1 {
84+
0..=10 => 1_000_000,
85+
11..=13 => 1_000,
86+
_ => 1,
87+
};
88+
(
89+
Scalar::Timestamp(max * multi),
90+
Scalar::Timestamp(min * multi),
91+
)
92+
}
8393
TableDataType::Decimal(DecimalDataType::Decimal128(size)) => (
8494
Scalar::Decimal(DecimalScalar::Decimal128(i128::from(max), *size)),
8595
Scalar::Decimal(DecimalScalar::Decimal128(i128::from(min), *size)),
@@ -91,10 +101,18 @@ pub fn convert_column_statistics(s: &Statistics, typ: &TableDataType) -> ColumnS
91101
_ => (Scalar::Null, Scalar::Null),
92102
}
93103
}
94-
Statistics::Int96(s) => (
95-
Scalar::Timestamp(s.max().to_i64()),
96-
Scalar::Timestamp(s.min().to_i64()),
97-
),
104+
Statistics::Int96(s) => {
105+
let (max, min) = (s.max().to_i64(), s.min().to_i64());
106+
let multi = match max.checked_ilog10().unwrap_or_default() + 1 {
107+
0..=10 => 1_000_000,
108+
11..=13 => 1_000,
109+
_ => 1,
110+
};
111+
(
112+
Scalar::Timestamp(max * multi),
113+
Scalar::Timestamp(min * multi),
114+
)
115+
}
98116
Statistics::Float(s) => (Scalar::from(*s.max()), Scalar::from(*s.min())),
99117
Statistics::Double(s) => (Scalar::from(*s.max()), Scalar::from(*s.min())),
100118
Statistics::ByteArray(s) => (

tests/data/parquet/timestamp/gen.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import pandas as pd
2+
import pyarrow as pa
3+
import pyarrow.parquet as pq
4+
from datetime import datetime
5+
6+
names = ['s', "ms", "us", 'ns'] # small, large, small, large
7+
for i in range(len(names)):
8+
num_row = 300
9+
timestamps = [
10+
'2023-10-13T10:00:00',
11+
'2023-10-14T11:00:00',
12+
'2023-10-15T12:00:00',
13+
'2023-10-16T12:00:00',
14+
] * num_row
15+
datetime_objects = [datetime.fromisoformat(timestamp) for timestamp in timestamps]
16+
timestamp_array = pa.array(datetime_objects, type = pa.timestamp(f"{names[i]}"))
17+
18+
table = pa.Table.from_arrays([timestamp_array], ['col_timestamp'])
19+
20+
pq.write_table(
21+
table,
22+
f"timestamp_{names[i]}.parquet",
23+
row_group_size= num_row/2,
24+
)
Binary file not shown.
Binary file not shown.
3.05 KB
Binary file not shown.
Binary file not shown.
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
query TI
2+
select col_timestamp, count() from @data/parquet/timestamp/timestamp_s.parquet where col_timestamp between '2023-10-13' and '2023-10-18' group by col_timestamp order by col_timestamp
3+
----
4+
2023-10-13 10:00:00.000000 300
5+
2023-10-14 11:00:00.000000 300
6+
2023-10-15 12:00:00.000000 300
7+
2023-10-16 12:00:00.000000 300
8+
9+
query TI
10+
select col_timestamp, count() from @data/parquet/timestamp/timestamp_ms.parquet where col_timestamp between '2023-10-13' and '2023-10-18' group by col_timestamp order by col_timestamp
11+
----
12+
2023-10-13 10:00:00.000000 300
13+
2023-10-14 11:00:00.000000 300
14+
2023-10-15 12:00:00.000000 300
15+
2023-10-16 12:00:00.000000 300
16+
17+
query TI
18+
select col_timestamp, count() from @data/parquet/timestamp/timestamp_us.parquet where col_timestamp between '2023-10-13' and '2023-10-18' group by col_timestamp order by col_timestamp
19+
----
20+
2023-10-13 10:00:00.000000 300
21+
2023-10-14 11:00:00.000000 300
22+
2023-10-15 12:00:00.000000 300
23+
2023-10-16 12:00:00.000000 300
24+
25+
query TI
26+
select col_timestamp, count() from @data/parquet/timestamp/timestamp_ns.parquet where col_timestamp between '2023-10-13' and '2023-10-18' group by col_timestamp order by col_timestamp
27+
----
28+
2023-10-13 10:00:00.000000 300
29+
2023-10-14 11:00:00.000000 300
30+
2023-10-15 12:00:00.000000 300
31+
2023-10-16 12:00:00.000000 300

tests/suites/1_stateful/08_select_stage/08_00_parquet/08_00_00_basic.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,4 @@ echo "set use_parquet2 = ${USE_PARQUET2} ; select * from @s3 (FILE_FORMAT => 'PA
3737

3838
rm -rf ${DATADIR_PATH}
3939

40-
done
40+
done

0 commit comments

Comments
 (0)