Skip to content

Commit 5b04ca7

Browse files
metalmatze and alamb authored
arrow-cast: Support cast to Dictionary<_, FixedSizeBinary> and add FixedSizeBinaryDictionaryBuilder (#6666)
* arrow-cast: Support FixedSizeBinary packing * Add fixed_size_binary_dictionary_builder.rs * Improve tests / fmt * clippy * Add a documentation example * Add test for error, improve message * fix link in doc --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 9e91029 commit 5b04ca7

File tree

5 files changed

+456
-0
lines changed

5 files changed

+456
-0
lines changed

arrow-array/src/builder/fixed_size_binary_builder.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ impl FixedSizeBinaryBuilder {
9393
self.null_buffer_builder.append_null();
9494
}
9595

96+
/// Returns the current values buffer as a slice
///
/// The returned slice borrows the bytes currently held by the builder's
/// values buffer; nothing is copied and the builder is not finished or reset.
97+
pub fn values_slice(&self) -> &[u8] {
98+
// Borrow the backing bytes of the values buffer builder directly.
self.values_builder.as_slice()
99+
}
100+
96101
/// Builds the [`FixedSizeBinaryArray`] and reset this builder.
97102
pub fn finish(&mut self) -> FixedSizeBinaryArray {
98103
let array_length = self.len();
arrow-array/src/builder/fixed_size_binary_dictionary_builder.rs

Lines changed: 354 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,354 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use crate::builder::{ArrayBuilder, FixedSizeBinaryBuilder, PrimitiveBuilder};
19+
use crate::types::ArrowDictionaryKeyType;
20+
use crate::{Array, ArrayRef, DictionaryArray};
21+
use arrow_buffer::ArrowNativeType;
22+
use arrow_schema::DataType::FixedSizeBinary;
23+
use arrow_schema::{ArrowError, DataType};
24+
use hashbrown::HashTable;
25+
use std::any::Any;
26+
use std::sync::Arc;
27+
28+
/// Builder for [`DictionaryArray`] of [`FixedSizeBinaryArray`]
29+
///
30+
/// The output array has a dictionary of unique, fixed-size binary values. The
31+
/// builder handles deduplication.
32+
///
33+
/// # Example
34+
/// ```
35+
/// # use arrow_array::builder::{FixedSizeBinaryDictionaryBuilder};
36+
/// # use arrow_array::array::{Array, FixedSizeBinaryArray};
37+
/// # use arrow_array::DictionaryArray;
38+
/// # use arrow_array::types::Int8Type;
39+
/// // Build 3 byte FixedBinaryArrays
40+
/// let byte_width = 3;
41+
/// let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
42+
/// builder.append("abc").unwrap();
43+
/// builder.append_null();
44+
/// builder.append(b"def").unwrap();
45+
/// builder.append(b"def").unwrap(); // duplicate value
46+
/// // Result is a Dictionary Array
47+
/// let array = builder.finish();
48+
/// let dict_array = array.as_any().downcast_ref::<DictionaryArray<Int8Type>>().unwrap();
49+
/// // The array represents "abc", null, "def", "def"
50+
/// assert_eq!(array.keys().len(), 4);
51+
/// // but there are only 2 unique values
52+
/// assert_eq!(array.values().len(), 2);
53+
/// let values = dict_array.values().as_any().downcast_ref::<FixedSizeBinaryArray>().unwrap();
54+
/// assert_eq!(values.value(0), "abc".as_bytes());
55+
/// assert_eq!(values.value(1), "def".as_bytes());
56+
/// ```
57+
///
58+
/// [`FixedSizeBinaryArray`]: crate::FixedSizeBinaryArray
59+
#[derive(Debug)]
60+
pub struct FixedSizeBinaryDictionaryBuilder<K>
61+
where
62+
K: ArrowDictionaryKeyType,
63+
{
64+
state: ahash::RandomState,
65+
dedup: HashTable<usize>,
66+
67+
keys_builder: PrimitiveBuilder<K>,
68+
values_builder: FixedSizeBinaryBuilder,
69+
byte_width: i32,
70+
}
71+
72+
impl<K> FixedSizeBinaryDictionaryBuilder<K>
73+
where
74+
K: ArrowDictionaryKeyType,
75+
{
76+
/// Creates a new `FixedSizeBinaryDictionaryBuilder`
77+
pub fn new(byte_width: i32) -> Self {
78+
let keys_builder = PrimitiveBuilder::new();
79+
let values_builder = FixedSizeBinaryBuilder::new(byte_width);
80+
Self {
81+
state: Default::default(),
82+
dedup: HashTable::with_capacity(keys_builder.capacity()),
83+
keys_builder,
84+
values_builder,
85+
byte_width,
86+
}
87+
}
88+
89+
/// Creates a new `FixedSizeBinaryDictionaryBuilder` with the provided capacities
90+
///
91+
/// `keys_capacity`: the number of keys, i.e. length of array to build
92+
/// `value_capacity`: the number of distinct dictionary values, i.e. size of dictionary
93+
/// `byte_width`: the byte width for individual values in the values array
94+
pub fn with_capacity(keys_capacity: usize, value_capacity: usize, byte_width: i32) -> Self {
95+
Self {
96+
state: Default::default(),
97+
dedup: Default::default(),
98+
keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
99+
values_builder: FixedSizeBinaryBuilder::with_capacity(value_capacity, byte_width),
100+
byte_width,
101+
}
102+
}
103+
}
104+
105+
impl<K> ArrayBuilder for FixedSizeBinaryDictionaryBuilder<K>
106+
where
107+
K: ArrowDictionaryKeyType,
108+
{
109+
/// Returns the builder as an non-mutable `Any` reference.
110+
fn as_any(&self) -> &dyn Any {
111+
self
112+
}
113+
114+
/// Returns the builder as an mutable `Any` reference.
115+
fn as_any_mut(&mut self) -> &mut dyn Any {
116+
self
117+
}
118+
119+
/// Returns the boxed builder as a box of `Any`.
120+
fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
121+
self
122+
}
123+
124+
/// Returns the number of array slots in the builder
125+
fn len(&self) -> usize {
126+
self.keys_builder.len()
127+
}
128+
129+
/// Builds the array and reset this builder.
130+
fn finish(&mut self) -> ArrayRef {
131+
Arc::new(self.finish())
132+
}
133+
134+
/// Builds the array without resetting the builder.
135+
fn finish_cloned(&self) -> ArrayRef {
136+
Arc::new(self.finish_cloned())
137+
}
138+
}
139+
140+
impl<K> FixedSizeBinaryDictionaryBuilder<K>
141+
where
142+
K: ArrowDictionaryKeyType,
143+
{
144+
fn get_or_insert_key(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
145+
let value_bytes: &[u8] = value.as_ref();
146+
147+
let state = &self.state;
148+
let storage = &mut self.values_builder;
149+
let hash = state.hash_one(value_bytes);
150+
151+
let idx = *self
152+
.dedup
153+
.entry(
154+
hash,
155+
|idx| value_bytes == get_bytes(storage, self.byte_width, *idx),
156+
|idx| state.hash_one(get_bytes(storage, self.byte_width, *idx)),
157+
)
158+
.or_insert_with(|| {
159+
let idx = storage.len();
160+
let _ = storage.append_value(value);
161+
idx
162+
})
163+
.get();
164+
165+
let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
166+
167+
Ok(key)
168+
}
169+
170+
/// Append a value to the array. Return an existing index
171+
/// if already present in the values array or a new index if the
172+
/// value is appended to the values array.
173+
///
174+
/// Returns an error if the new index would overflow the key type.
175+
pub fn append(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
176+
if self.byte_width != value.as_ref().len() as i32 {
177+
Err(ArrowError::InvalidArgumentError(format!(
178+
"Invalid input length passed to FixedSizeBinaryBuilder. Expected {} got {}",
179+
self.byte_width,
180+
value.as_ref().len()
181+
)))
182+
} else {
183+
let key = self.get_or_insert_key(value)?;
184+
self.keys_builder.append_value(key);
185+
Ok(key)
186+
}
187+
}
188+
189+
/// Appends a null slot into the builder
190+
#[inline]
191+
pub fn append_null(&mut self) {
192+
self.keys_builder.append_null()
193+
}
194+
195+
/// Infallibly append a value to this builder
196+
///
197+
/// # Panics
198+
///
199+
/// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
200+
pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
201+
self.append(value).expect("dictionary key overflow");
202+
}
203+
204+
/// Builds the `DictionaryArray` and reset this builder.
205+
pub fn finish(&mut self) -> DictionaryArray<K> {
206+
self.dedup.clear();
207+
let values = self.values_builder.finish();
208+
let keys = self.keys_builder.finish();
209+
210+
let data_type = DataType::Dictionary(
211+
Box::new(K::DATA_TYPE),
212+
Box::new(FixedSizeBinary(self.byte_width)),
213+
);
214+
215+
let builder = keys
216+
.into_data()
217+
.into_builder()
218+
.data_type(data_type)
219+
.child_data(vec![values.into_data()]);
220+
221+
DictionaryArray::from(unsafe { builder.build_unchecked() })
222+
}
223+
224+
/// Builds the `DictionaryArray` without resetting the builder.
225+
pub fn finish_cloned(&self) -> DictionaryArray<K> {
226+
let values = self.values_builder.finish_cloned();
227+
let keys = self.keys_builder.finish_cloned();
228+
229+
let data_type = DataType::Dictionary(
230+
Box::new(K::DATA_TYPE),
231+
Box::new(FixedSizeBinary(self.byte_width)),
232+
);
233+
234+
let builder = keys
235+
.into_data()
236+
.into_builder()
237+
.data_type(data_type)
238+
.child_data(vec![values.into_data()]);
239+
240+
DictionaryArray::from(unsafe { builder.build_unchecked() })
241+
}
242+
}
243+
244+
/// Returns the bytes of the value stored at index `idx` of `values`.
///
/// Values are read from the builder's values slice at `byte_width`-sized
/// offsets. Panics if the computed range is out of bounds for that slice.
fn get_bytes(values: &FixedSizeBinaryBuilder, byte_width: i32, idx: usize) -> &[u8] {
    let width = byte_width.as_usize();
    let offset = idx * width;
    &values.values_slice()[offset..offset + width]
}
250+
251+
#[cfg(test)]
mod tests {
    use super::*;

    use crate::types::Int8Type;
    use crate::{FixedSizeBinaryArray, Int8Array};

    /// Dictionary values are polymorphic and so require a downcast.
    fn dictionary_values(array: &DictionaryArray<Int8Type>) -> &FixedSizeBinaryArray {
        array
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap()
    }

    #[test]
    fn test_fixed_size_dictionary_builder() {
        let (first, second) = ("abc", "def");

        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
        assert_eq!(builder.append(first).unwrap(), 0);
        builder.append_null();
        assert_eq!(builder.append(second).unwrap(), 1);
        assert_eq!(builder.append(second).unwrap(), 1);
        assert_eq!(builder.append(first).unwrap(), 0);
        let array = builder.finish();

        let expected_keys = Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]);
        assert_eq!(array.keys(), &expected_keys);

        let dictionary = dictionary_values(&array);
        assert_eq!(dictionary.value(0), first.as_bytes());
        assert_eq!(dictionary.value(1), second.as_bytes());
    }

    #[test]
    fn test_fixed_size_dictionary_builder_wrong_size() {
        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

        let too_long = builder.append(b"too long").unwrap_err();
        assert_eq!(too_long.to_string(), "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 8");

        let empty = builder.append("").unwrap_err();
        assert_eq!(empty.to_string(), "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 0");
    }

    #[test]
    fn test_fixed_size_dictionary_builder_finish_cloned() {
        let values = ["abc", "def", "ghi"];

        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

        builder.append(values[0]).unwrap();
        builder.append_null();
        builder.append(values[1]).unwrap();
        builder.append(values[1]).unwrap();
        builder.append(values[0]).unwrap();

        // A cloned snapshot must not reset the builder.
        let snapshot = builder.finish_cloned();

        let snapshot_keys = Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]);
        assert_eq!(snapshot.keys(), &snapshot_keys);

        let snapshot_values = dictionary_values(&snapshot);
        assert_eq!(snapshot_values.value(0), values[0].as_bytes());
        assert_eq!(snapshot_values.value(1), values[1].as_bytes());

        // Keep appending to the same builder after the snapshot was taken.
        builder.append(values[0]).unwrap();
        builder.append(values[2]).unwrap();
        builder.append(values[1]).unwrap();

        let finished = builder.finish();

        let finished_keys = Int8Array::from(vec![
            Some(0),
            None,
            Some(1),
            Some(1),
            Some(0),
            Some(0),
            Some(2),
            Some(1),
        ]);
        assert_eq!(finished.keys(), &finished_keys);

        let finished_values = dictionary_values(&finished);
        for (idx, expected) in values.iter().enumerate() {
            assert_eq!(finished_values.value(idx), expected.as_bytes());
        }
    }
}

arrow-array/src/builder/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,8 @@ mod fixed_size_binary_builder;
243243
pub use fixed_size_binary_builder::*;
244244
mod fixed_size_list_builder;
245245
pub use fixed_size_list_builder::*;
246+
mod fixed_size_binary_dictionary_builder;
247+
pub use fixed_size_binary_dictionary_builder::*;
246248
mod generic_bytes_builder;
247249
pub use generic_bytes_builder::*;
248250
mod generic_list_builder;

0 commit comments

Comments
 (0)