fix(optimizer): histogram calculations for string (Bytes) data types (#17873)

BohuTANG · web-flow · commit 3b729e5ad522 · 2025-05-06T12:30:58.000+08:00
* chore(optimizer): add more error log for get_upper_bound

* fix the string types for the histogram calc

* fix t_string in analyze.test

* fix fuse_statistic after enable_analyze_histogram
diff --git a/src/common/storage/src/statistics.rs b/src/common/storage/src/statistics.rs
@@ -156,6 +156,19 @@ impl Datum {
         matches!(self, Datum::Int(_) | Datum::UInt(_) | Datum::Float(_))
     }
 
+    /// Returns a fixed, human-readable name for this datum's variant.
+    ///
+    /// Intended for diagnostics such as error messages; `Bytes` is reported as `"String"`.
+    pub fn type_name(&self) -> &'static str {
+        match self {
+            Self::Bool(..) => "Boolean",
+            Self::Int(..) => "Integer",
+            Self::UInt(..) => "Unsigned Integer",
+            Self::Float(..) => "Float",
+            Self::Bytes(..) => "String",
+        }
+    }
+
     pub fn compare(&self, other: &Self) -> Result<std::cmp::Ordering> {
         match (self, other) {
             (Datum::Bool(l), Datum::Bool(r)) => Ok(l.cmp(r)),
diff --git a/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs b/src/query/sql/src/planner/optimizer/ir/stats/histogram.rs
@@ -243,10 +243,42 @@ impl SampleSet for UniformSampleSet {
                 Ok(Datum::Float(upper_bound))
             }
 
+            // String (Bytes) columns: two byte strings cannot be interpolated
+            // arithmetically, so every bucket's upper bound is snapped to one of the two
+            // endpoints of the sample set.
+            (Datum::Bytes(min_bytes), Datum::Bytes(max_bytes)) => {
+                // Compare the raw bytes. Going through `String::from_utf8_lossy` would
+                // allocate for invalid input and could make two different byte strings
+                // look equal, because every invalid sequence is replaced by U+FFFD.
+                if min_bytes == max_bytes {
+                    return Ok(Datum::Bytes(min_bytes.clone()));
+                }
+
+                // Boundary buckets return the exact endpoints.
+                if bucket_index == 0 {
+                    return Ok(Datum::Bytes(min_bytes.clone()));
+                }
+                if bucket_index >= num_buckets {
+                    return Ok(Datum::Bytes(max_bytes.clone()));
+                }
+
+                // Intermediate buckets: indexes up to and including the middle bucket
+                // report `min`, the rest report `max`. This is a coarse two-step function,
+                // not an even split of the range, but the result never switches back from
+                // `max` to `min` as `bucket_index` grows.
+                let mid_bucket = num_buckets / 2;
+                let bound = if bucket_index <= mid_bucket {
+                    min_bytes
+                } else {
+                    max_bytes
+                };
+                Ok(Datum::Bytes(bound.clone()))
+            }
+
             _ => Err(format!(
-                "Unsupported datum type: {:?}, {:?}",
-                self.min, self.max
-            )),
+                "Unsupported datum type for histogram calculation: {} (type: {}), {} (type: {}). Only numeric and string types are supported.",
+                self.min, self.min.type_name(), self.max, self.max.type_name()
+            ))
         }
     }
 }
diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0020_analyze.test
@@ -154,5 +154,50 @@ analyze table t1;
 statement ok
 DROP TABLE t
 
+# Test case for string histogram functionality
+statement ok
+create or replace table t_string(id int, str_val varchar);
+
+statement ok
+insert into t_string values
+    (1, '1.0'),
+    (2, '2.0'),
+    (3, '3.0'),
+    (4, '4.0'),
+    (5, '5.0'),
+    (6, '6.0'),
+    (7, '7.0'),
+    (8, '8.0'),
+    (9, '9.0'),
+    (10, '10.0');
+
+statement ok
+set enable_analyze_histogram=1;
+
+statement ok
+analyze table t_string;
+
+# Verify that histogram was created for string column
+query IIT
+select * from fuse_statistic('db_09_0020', 't_string') order by column_name asc;
+----
+id 10 [bucket id: 0, min: "1", max: "1", ndv: 1.0, count: 1.0], [bucket id: 1, min: "2", max: "2", ndv: 1.0, count: 1.0], [bucket id: 2, min: "3", max: "3", ndv: 1.0, count: 1.0], [bucket id: 3, min: "4", max: "4", ndv: 1.0, count: 1.0], [bucket id: 4, min: "5", max: "5", ndv: 1.0, count: 1.0], [bucket id: 5, min: "6", max: "6", ndv: 1.0, count: 1.0], [bucket id: 6, min: "7", max: "7", ndv: 1.0, count: 1.0], [bucket id: 7, min: "8", max: "8", ndv: 1.0, count: 1.0], [bucket id: 8, min: "9", max: "9", ndv: 1.0, count: 1.0], [bucket id: 9, min: "10", max: "10", ndv: 1.0, count: 1.0]
+str_val 10 [bucket id: 0, min: "1.0", max: "1.0", ndv: 1.0, count: 1.0], [bucket id: 1, min: "10.0", max: "10.0", ndv: 1.0, count: 1.0], [bucket id: 2, min: "2.0", max: "2.0", ndv: 1.0, count: 1.0], [bucket id: 3, min: "3.0", max: "3.0", ndv: 1.0, count: 1.0], [bucket id: 4, min: "4.0", max: "4.0", ndv: 1.0, count: 1.0], [bucket id: 5, min: "5.0", max: "5.0", ndv: 1.0, count: 1.0], [bucket id: 6, min: "6.0", max: "6.0", ndv: 1.0, count: 1.0], [bucket id: 7, min: "7.0", max: "7.0", ndv: 1.0, count: 1.0], [bucket id: 8, min: "8.0", max: "8.0", ndv: 1.0, count: 1.0], [bucket id: 9, min: "9.0", max: "9.0", ndv: 1.0, count: 1.0]
+
+# Test string comparison with histogram
+query I
+select count(*) from t_string where str_val > '5.0';
+----
+4
+
+# Test string range query with histogram
+query I
+select count(*) from t_string where str_val between '3.0' and '7.0';
+----
+5
+
+statement ok
+DROP TABLE t_string
+
 statement ok
 DROP DATABASE db_09_0020