@@ -30,6 +30,7 @@ use common_expression::types::ValueType;
30
30
use common_expression:: Column ;
31
31
use common_expression:: ColumnBuilder ;
32
32
use common_expression:: Scalar ;
33
+ use common_hashtable:: HashSet as CommonHashSet ;
33
34
use common_hashtable:: HashSetWithStackMemory ;
34
35
use common_hashtable:: HashTableEntity ;
35
36
use common_hashtable:: HashTableKeyable ;
@@ -64,9 +65,13 @@ pub struct AggregateDistinctNumberState<T: Number + HashTableKeyable> {
64
65
inserted : bool ,
65
66
}
66
67
68
/// Row capacity requested for every `StringColumnBuilder` holder created by
/// `AggregateDistinctStringState`.
const HOLDER_CAPACITY: usize = 256;
/// Minimum byte capacity requested for every holder: eight bytes per expected row.
const HOLDER_BYTES_CAPACITY: usize = HOLDER_CAPACITY * 8;
70
+
67
71
pub struct AggregateDistinctStringState {
68
- set : HashSet < KeysRef , RandomState > ,
69
- holder : StringColumnBuilder ,
72
+ set : CommonHashSet < KeysRef > ,
73
+ inserted : bool ,
74
+ holders : Vec < StringColumnBuilder > ,
70
75
}
71
76
72
77
/// Unit struct used as the type parameter of the `DistinctStateFunc`
/// implementation for `AggregateDistinctState`.
pub struct DataGroupValue ;
@@ -148,26 +153,61 @@ impl DistinctStateFunc<DataGroupValue> for AggregateDistinctState {
148
153
}
149
154
}
150
155
156
+ impl AggregateDistinctStringState {
157
+ #[ inline]
158
+ fn insert_and_materialize ( & mut self , key : & KeysRef ) {
159
+ let entity = self . set . insert_key ( key, & mut self . inserted ) ;
160
+ if self . inserted {
161
+ let data = unsafe { key. as_slice ( ) } ;
162
+
163
+ let holder = self . holders . last_mut ( ) . unwrap ( ) ;
164
+ // TODO(sundy): may cause memory fragmentation, refactor this using arena
165
+ if holder. may_resize ( data. len ( ) ) {
166
+ let mut holder = StringColumnBuilder :: with_capacity (
167
+ HOLDER_CAPACITY ,
168
+ HOLDER_BYTES_CAPACITY . max ( data. len ( ) ) ,
169
+ ) ;
170
+ holder. put_slice ( data) ;
171
+ holder. commit_row ( ) ;
172
+ let value = unsafe { holder. index_unchecked ( holder. len ( ) - 1 ) } ;
173
+ entity. set_key ( KeysRef :: create ( value. as_ptr ( ) as usize , value. len ( ) ) ) ;
174
+ self . holders . push ( holder) ;
175
+ } else {
176
+ holder. put_slice ( data) ;
177
+ holder. commit_row ( ) ;
178
+ let value = unsafe { holder. index_unchecked ( holder. len ( ) - 1 ) } ;
179
+ entity. set_key ( KeysRef :: create ( value. as_ptr ( ) as usize , value. len ( ) ) ) ;
180
+ }
181
+ }
182
+ }
183
+ }
184
+
151
185
impl DistinctStateFunc < KeysRef > for AggregateDistinctStringState {
152
186
fn new ( ) -> Self {
153
187
AggregateDistinctStringState {
154
- set : HashSet :: new ( ) ,
155
- holder : StringColumnBuilder :: with_capacity ( 0 , 0 ) ,
188
+ set : CommonHashSet :: create ( ) ,
189
+ inserted : false ,
190
+ holders : vec ! [ StringColumnBuilder :: with_capacity(
191
+ HOLDER_CAPACITY ,
192
+ HOLDER_BYTES_CAPACITY ,
193
+ ) ] ,
156
194
}
157
195
}
158
196
159
197
fn serialize ( & self , writer : & mut BytesMut ) -> Result < ( ) > {
160
- serialize_into_buf ( writer, & self . holder )
198
+ serialize_into_buf ( writer, & self . holders )
161
199
}
162
200
163
201
fn deserialize ( & mut self , reader : & mut & [ u8 ] ) -> Result < ( ) > {
164
- self . holder = deserialize_from_slice ( reader) ?;
165
- self . set = HashSet :: with_capacity ( self . holder . len ( ) ) ;
166
-
167
- for index in 0 ..self . holder . len ( ) {
168
- let data = unsafe { self . holder . index_unchecked ( index) } ;
169
- let key = KeysRef :: create ( data. as_ptr ( ) as usize , data. len ( ) ) ;
170
- self . set . insert ( key) ;
202
+ self . holders = deserialize_from_slice ( reader) ?;
203
+ self . set = CommonHashSet :: with_capacity ( self . holders . iter ( ) . map ( |h| h. len ( ) ) . sum ( ) ) ;
204
+
205
+ for holder in self . holders . iter ( ) {
206
+ for index in 0 ..holder. len ( ) {
207
+ let data = unsafe { holder. index_unchecked ( index) } ;
208
+ let key = KeysRef :: create ( data. as_ptr ( ) as usize , data. len ( ) ) ;
209
+ self . set . insert_key ( & key, & mut self . inserted ) ;
210
+ }
171
211
}
172
212
Ok ( ( ) )
173
213
}
@@ -183,16 +223,8 @@ impl DistinctStateFunc<KeysRef> for AggregateDistinctStringState {
183
223
/// Adds the string at `row` of the first column to the distinct set.
///
/// # Panics
/// Panics if `columns` is empty or if `columns[0]` cannot be downcast to a
/// string column.
fn add(&mut self, columns: &[Column], row: usize) -> Result<()> {
    let column = StringType::try_downcast_column(&columns[0]).unwrap();
    // SAFETY: NOTE(review) assumes the caller guarantees `row` is a valid
    // index into the column - confirm at call sites.
    let data = unsafe { column.index_unchecked(row) };
    // Temporary key over the input column's bytes; the helper copies the
    // bytes into a holder if the value is new.
    let key = KeysRef::create(data.as_ptr() as usize, data.len());
    self.insert_and_materialize(&key);
    Ok(())
}
198
230
@@ -204,47 +236,59 @@ impl DistinctStateFunc<KeysRef> for AggregateDistinctStringState {
204
236
) -> Result < ( ) > {
205
237
let column = StringType :: try_downcast_column ( & columns[ 0 ] ) . unwrap ( ) ;
206
238
207
- for row in 0 ..input_rows {
208
- match validity {
209
- Some ( v ) => {
239
+ match validity {
240
+ Some ( v ) => {
241
+ for row in 0 ..input_rows {
210
242
if v. get_bit ( row) {
211
243
let data = unsafe { column. index_unchecked ( row) } ;
212
- let mut key = KeysRef :: create ( data. as_ptr ( ) as usize , data. len ( ) ) ;
213
- if !self . set . contains ( & key) {
214
- self . holder . put_slice ( data) ;
215
- self . holder . commit_row ( ) ;
216
-
217
- let data =
218
- unsafe { self . holder . index_unchecked ( self . holder . len ( ) - 1 ) } ;
219
- key = KeysRef :: create ( data. as_ptr ( ) as usize , data. len ( ) ) ;
220
- self . set . insert ( key) ;
221
- }
244
+ let key = KeysRef :: create ( data. as_ptr ( ) as usize , data. len ( ) ) ;
245
+ self . insert_and_materialize ( & key) ;
222
246
}
223
247
}
224
- None => {
248
+ }
249
+ None => {
250
+ for row in 0 ..input_rows {
225
251
let data = unsafe { column. index_unchecked ( row) } ;
226
- let mut key = KeysRef :: create ( data. as_ptr ( ) as usize , data. len ( ) ) ;
227
- if !self . set . contains ( & key) {
228
- self . holder . put_slice ( data) ;
229
- self . holder . commit_row ( ) ;
230
-
231
- let data = unsafe { self . holder . index_unchecked ( self . holder . len ( ) - 1 ) } ;
232
- key = KeysRef :: create ( data. as_ptr ( ) as usize , data. len ( ) ) ;
233
- self . set . insert ( key) ;
234
- }
252
+ let key = KeysRef :: create ( data. as_ptr ( ) as usize , data. len ( ) ) ;
253
+ self . insert_and_materialize ( & key) ;
235
254
}
236
255
}
237
256
}
238
257
Ok ( ( ) )
239
258
}
240
259
241
260
/// Merges every key of `rhs` into this state.
///
/// Each key is passed through `insert_and_materialize`, so values new to
/// `self` are copied into `self`'s own holders rather than continuing to
/// reference memory owned by `rhs`.
fn merge(&mut self, rhs: &Self) -> Result<()> {
    for entity in rhs.set.iter() {
        self.insert_and_materialize(entity.get_key());
    }
    Ok(())
}
245
266
246
267
fn build_columns ( & mut self , _types : & [ DataType ] ) -> Result < Vec < Column > > {
247
- let c = std:: mem:: replace ( & mut self . holder , StringColumnBuilder :: with_capacity ( 0 , 0 ) ) ;
268
+ if self . holders . len ( ) == 1 {
269
+ let c = std:: mem:: replace (
270
+ & mut self . holders [ 0 ] ,
271
+ StringColumnBuilder :: with_capacity ( 0 , 0 ) ,
272
+ ) ;
273
+ return Ok ( vec ! [ Column :: String ( c. build( ) ) ] ) ;
274
+ }
275
+
276
+ let mut values = Vec :: with_capacity ( self . holders . iter ( ) . map ( |h| h. data . len ( ) ) . sum ( ) ) ;
277
+ let mut offsets = Vec :: with_capacity ( self . holders . iter ( ) . map ( |h| h. len ( ) ) . sum ( ) ) ;
278
+
279
+ let mut last_offset = 0 ;
280
+ offsets. push ( 0 ) ;
281
+ for holder in self . holders . iter_mut ( ) {
282
+ for offset in holder. offsets . iter ( ) {
283
+ last_offset += * offset;
284
+ offsets. push ( last_offset) ;
285
+ }
286
+ values. append ( & mut holder. data ) ;
287
+ }
288
+ let c = StringColumnBuilder {
289
+ data : values,
290
+ offsets,
291
+ } ;
248
292
Ok ( vec ! [ Column :: String ( c. build( ) ) ] )
249
293
}
250
294
}
0 commit comments