Skip to content

Commit 3b729e5

Browse files
authored
fix(optimizer): histogram calculations for string (Bytes) data types (#17873)
* chore(optimizer): add more error log for get_upper_bound
* fix the string types for the histogram calc
* fix t_string in analyze.test
* fix fuse_statistic after enable_analyze_histogram
1 parent 7b51ed1 commit 3b729e5

File tree

3 files changed

+90
-3
lines changed

3 files changed

+90
-3
lines changed

src/common/storage/src/statistics.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,16 @@ impl Datum {
156156
matches!(self, Datum::Int(_) | Datum::UInt(_) | Datum::Float(_))
157157
}
158158

159+
/// Returns a static, human-readable name for this datum's variant.
///
/// The returned label is a fixed string chosen per variant; note that
/// `Datum::Bytes` is reported as "String".
pub fn type_name(&self) -> &'static str {
160+
match self {
161+
Datum::Bool(_) => "Boolean",
162+
Datum::Int(_) => "Integer",
163+
Datum::UInt(_) => "Unsigned Integer",
164+
Datum::Float(_) => "Float",
165+
Datum::Bytes(_) => "String",
166+
}
167+
}
168+
159169
pub fn compare(&self, other: &Self) -> Result<std::cmp::Ordering> {
160170
match (self, other) {
161171
(Datum::Bool(l), Datum::Bool(r)) => Ok(l.cmp(r)),

src/query/sql/src/planner/optimizer/ir/stats/histogram.rs

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -243,10 +243,42 @@ impl SampleSet for UniformSampleSet {
243243
Ok(Datum::Float(upper_bound))
244244
}
245245

246+
// Handle Bytes type for histogram calculation by converting to strings first
247+
(Datum::Bytes(min_bytes), Datum::Bytes(max_bytes)) => {
248+
// Convert bytes to strings for comparison
249+
let min_str = String::from_utf8_lossy(min_bytes);
250+
let max_str = String::from_utf8_lossy(max_bytes);
251+
252+
// For boundary cases, return the exact values
253+
if min_str == max_str {
254+
return Ok(Datum::Bytes(min_bytes.clone()));
255+
}
256+
257+
if bucket_index == 0 {
258+
return Ok(Datum::Bytes(min_bytes.clone()));
259+
} else if bucket_index >= num_buckets {
260+
return Ok(Datum::Bytes(max_bytes.clone()));
261+
}
262+
263+
// For intermediate buckets, use a simple approach based on string comparison
264+
// Just divide the range into equal parts based on bucket_index
265+
266+
// If bucket_index is in the first half, return min
267+
// If bucket_index is in the second half, return max
268+
// This preserves the string ordering semantics
269+
let mid_bucket = num_buckets / 2;
270+
271+
if bucket_index <= mid_bucket {
272+
Ok(Datum::Bytes(min_bytes.clone()))
273+
} else {
274+
Ok(Datum::Bytes(max_bytes.clone()))
275+
}
276+
}
277+
246278
_ => Err(format!(
247-
"Unsupported datum type: {:?}, {:?}",
248-
self.min, self.max
249-
)),
279+
"Unsupported datum type for histogram calculation: {} (type: {}), {} (type: {}). Only numeric types are supported.",
280+
self.min, self.min.type_name(), self.max, self.max.type_name()
281+
))
250282
}
251283
}
252284
}

tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,5 +154,50 @@ analyze table t1;
154154
statement ok
155155
DROP TABLE t
156156

157+
# Test case for string histogram functionality
158+
statement ok
159+
create or replace table t_string(id int, str_val varchar);
160+
161+
statement ok
162+
insert into t_string values
163+
(1, '1.0'),
164+
(2, '2.0'),
165+
(3, '3.0'),
166+
(4, '4.0'),
167+
(5, '5.0'),
168+
(6, '6.0'),
169+
(7, '7.0'),
170+
(8, '8.0'),
171+
(9, '9.0'),
172+
(10, '10.0');
173+
174+
statement ok
175+
set enable_analyze_histogram=1;
176+
177+
statement ok
178+
analyze table t_string;
179+
180+
# Verify that histogram was created for string column
181+
query IIT
182+
select * from fuse_statistic('db_09_0020', 't_string') order by column_name asc;
183+
----
184+
id 10 [bucket id: 0, min: "1", max: "1", ndv: 1.0, count: 1.0], [bucket id: 1, min: "2", max: "2", ndv: 1.0, count: 1.0], [bucket id: 2, min: "3", max: "3", ndv: 1.0, count: 1.0], [bucket id: 3, min: "4", max: "4", ndv: 1.0, count: 1.0], [bucket id: 4, min: "5", max: "5", ndv: 1.0, count: 1.0], [bucket id: 5, min: "6", max: "6", ndv: 1.0, count: 1.0], [bucket id: 6, min: "7", max: "7", ndv: 1.0, count: 1.0], [bucket id: 7, min: "8", max: "8", ndv: 1.0, count: 1.0], [bucket id: 8, min: "9", max: "9", ndv: 1.0, count: 1.0], [bucket id: 9, min: "10", max: "10", ndv: 1.0, count: 1.0]
185+
str_val 10 [bucket id: 0, min: "1.0", max: "1.0", ndv: 1.0, count: 1.0], [bucket id: 1, min: "10.0", max: "10.0", ndv: 1.0, count: 1.0], [bucket id: 2, min: "2.0", max: "2.0", ndv: 1.0, count: 1.0], [bucket id: 3, min: "3.0", max: "3.0", ndv: 1.0, count: 1.0], [bucket id: 4, min: "4.0", max: "4.0", ndv: 1.0, count: 1.0], [bucket id: 5, min: "5.0", max: "5.0", ndv: 1.0, count: 1.0], [bucket id: 6, min: "6.0", max: "6.0", ndv: 1.0, count: 1.0], [bucket id: 7, min: "7.0", max: "7.0", ndv: 1.0, count: 1.0], [bucket id: 8, min: "8.0", max: "8.0", ndv: 1.0, count: 1.0], [bucket id: 9, min: "9.0", max: "9.0", ndv: 1.0, count: 1.0]
186+
187+
# Test string comparison with histogram
188+
query I
189+
select count(*) from t_string where str_val > '5.0';
190+
----
191+
4
192+
193+
# Test string range query with histogram
194+
query I
195+
select count(*) from t_string where str_val between '3.0' and '7.0';
196+
----
197+
5
198+
199+
statement ok
200+
DROP TABLE t_string
201+
157202
statement ok
158203
DROP DATABASE db_09_0020

0 commit comments

Comments
(0)