@@ -21,24 +21,33 @@ use databend_common_expression::types::BinaryType;
21
21
use databend_common_expression:: types:: DataType ;
22
22
use databend_common_expression:: types:: GenericType ;
23
23
use databend_common_expression:: types:: NullableType ;
24
+ use databend_common_expression:: types:: NumberDataType ;
24
25
use databend_common_expression:: types:: NumberType ;
25
26
use databend_common_expression:: types:: ReturnType ;
27
+ use databend_common_expression:: types:: StringType ;
26
28
use databend_common_expression:: types:: ValueType ;
29
+ use databend_common_expression:: types:: ALL_NUMERICS_TYPES ;
30
+ use databend_common_expression:: vectorize_with_builder_1_arg;
27
31
use databend_common_expression:: vectorize_with_builder_2_arg;
32
+ use databend_common_expression:: with_number_mapped_type;
28
33
use databend_common_expression:: Column ;
29
34
use databend_common_expression:: FixedLengthEncoding ;
30
35
use databend_common_expression:: Function ;
31
36
use databend_common_expression:: FunctionDomain ;
32
37
use databend_common_expression:: FunctionEval ;
33
38
use databend_common_expression:: FunctionFactory ;
39
+ use databend_common_expression:: FunctionProperty ;
34
40
use databend_common_expression:: FunctionRegistry ;
35
41
use databend_common_expression:: FunctionSignature ;
36
42
use databend_common_expression:: ScalarRef ;
37
43
use databend_common_expression:: Value ;
44
+ use rand:: rngs:: SmallRng ;
45
+ use rand:: Rng ;
46
+ use rand:: SeedableRng ;
38
47
39
48
/// Registers Hilbert curve related functions with the function registry.
40
49
pub fn register ( registry : & mut FunctionRegistry ) {
41
- // Register the hilbert_range_index function that calculates Hilbert indices for multi-dimensional data
50
+ // Register the hilbert_range_index function that calculates Hilbert indices for multidimensional data
42
51
let factory = FunctionFactory :: Closure ( Box :: new ( |_, args_type : & [ DataType ] | {
43
52
let args_num = args_type. len ( ) ;
44
53
// The function supports 2, 3, 4, or 5 dimensions (each dimension requires 2 arguments)
@@ -97,7 +106,7 @@ pub fn register(registry: &mut FunctionRegistry) {
97
106
points. push ( key) ;
98
107
}
99
108
100
- // Convert the multi-dimensional point to a Hilbert index
109
+ // Convert the multidimensional point to a Hilbert index
101
110
// This maps the n-dimensional point to a 1-dimensional value
102
111
let points = points
103
112
. iter ( )
@@ -153,6 +162,88 @@ pub fn register(registry: &mut FunctionRegistry) {
153
162
builder. push ( id) ;
154
163
} ) ,
155
164
) ;
165
+
166
+ // We append random bytes from an entropy-seeded PRNG (one u8, or `level` bytes in the two-argument form) at the end of the binary key.
167
+ // This introduces noise to break tie cases in clustering keys that are not uniformly distributed.
168
+ // Although this may slightly affect the accuracy of range_bound estimation,
169
+ // it ensures that Hilbert index + scatter will no longer suffer from data skew.
170
+ // Moreover, since the noise is added at the tail, the original order of the keys is preserved.
171
+ registry. properties . insert (
172
+ "add_noise" . to_string ( ) ,
173
+ FunctionProperty :: default ( ) . non_deterministic ( ) ,
174
+ ) ;
175
+
176
+ registry. register_passthrough_nullable_1_arg :: < StringType , BinaryType , _ , _ > (
177
+ "add_noise" ,
178
+ |_, _| FunctionDomain :: Full ,
179
+ vectorize_with_builder_1_arg :: < StringType , BinaryType > ( |val, builder, _| {
180
+ let mut bytes = val. as_bytes ( ) . to_vec ( ) ;
181
+ let mut rng = SmallRng :: from_entropy ( ) ;
182
+ bytes. push ( rng. gen :: < u8 > ( ) ) ;
183
+ builder. put_slice ( & bytes) ;
184
+ builder. commit_row ( ) ;
185
+ } ) ,
186
+ ) ;
187
+
188
+ for ty in ALL_NUMERICS_TYPES {
189
+ with_number_mapped_type ! ( |NUM_TYPE | match ty {
190
+ NumberDataType :: NUM_TYPE => {
191
+ registry
192
+ . register_passthrough_nullable_1_arg:: <NumberType <NUM_TYPE >, BinaryType , _, _>(
193
+ "add_noise" ,
194
+ |_, _| FunctionDomain :: Full ,
195
+ vectorize_with_builder_1_arg:: <NumberType <NUM_TYPE >, BinaryType >(
196
+ |val, builder, _| {
197
+ let mut encoded = val. encode( ) . to_vec( ) ;
198
+ let mut rng = SmallRng :: from_entropy( ) ;
199
+ encoded. push( rng. gen :: <u8 >( ) ) ;
200
+ builder. put_slice( & encoded) ;
201
+ builder. commit_row( ) ;
202
+ } ,
203
+ ) ,
204
+ ) ;
205
+ }
206
+ } )
207
+ }
208
+
209
+ registry. register_passthrough_nullable_2_arg :: < StringType , NumberType < u64 > , BinaryType , _ , _ > (
210
+ "add_noise" ,
211
+ |_, _, _| FunctionDomain :: Full ,
212
+ vectorize_with_builder_2_arg :: < StringType , NumberType < u64 > , BinaryType > (
213
+ |val, level, builder, _| {
214
+ let mut bytes = val. as_bytes ( ) . to_vec ( ) ;
215
+ let mut rng = SmallRng :: from_entropy ( ) ;
216
+ for _ in 0 ..level {
217
+ bytes. push ( rng. gen :: < u8 > ( ) ) ;
218
+ }
219
+ builder. put_slice ( & bytes) ;
220
+ builder. commit_row ( ) ;
221
+ } ,
222
+ ) ,
223
+ ) ;
224
+
225
+ for ty in ALL_NUMERICS_TYPES {
226
+ with_number_mapped_type ! ( |NUM_TYPE | match ty {
227
+ NumberDataType :: NUM_TYPE => {
228
+ registry
229
+ . register_passthrough_nullable_2_arg:: <NumberType <NUM_TYPE >, NumberType <u64 >, BinaryType , _, _>(
230
+ "add_noise" ,
231
+ |_, _, _| FunctionDomain :: Full ,
232
+ vectorize_with_builder_2_arg:: <NumberType <NUM_TYPE >, NumberType <u64 >, BinaryType >(
233
+ |val, level, builder, _| {
234
+ let mut encoded = val. encode( ) . to_vec( ) ;
235
+ let mut rng = SmallRng :: from_entropy( ) ;
236
+ for _ in 0 ..level {
237
+ encoded. push( rng. gen :: <u8 >( ) ) ;
238
+ }
239
+ builder. put_slice( & encoded) ;
240
+ builder. commit_row( ) ;
241
+ } ,
242
+ ) ,
243
+ ) ;
244
+ }
245
+ } )
246
+ }
156
247
}
157
248
158
249
/// Calculates the partition ID for a value based on range boundaries.
0 commit comments