@@ -193,6 +193,13 @@ fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
193
193
"The datatype \" {data_type:?}\" expects 3 buffers, but requested {i}. Please verify that the C data interface is correctly implemented."
194
194
) ) )
195
195
}
196
+ // Variable-sized views: have 3 or more buffers.
197
+ // Buffer 1 are the u128 views
198
+ // Buffers 2...N-1 are u8 byte buffers
199
+ ( DataType :: Utf8View , 1 ) | ( DataType :: BinaryView , 1 ) => u128:: BITS as _ ,
200
+ ( DataType :: Utf8View , _) | ( DataType :: BinaryView , _) => {
201
+ u8:: BITS as _
202
+ }
196
203
// type ids. UnionArray doesn't have null bitmap so buffer index begins with 0.
197
204
( DataType :: Union ( _, _) , 0 ) => i8:: BITS as _ ,
198
205
// Only DenseUnion has 2nd buffer
@@ -300,7 +307,7 @@ impl<'a> ImportedArrowArray<'a> {
300
307
} ;
301
308
302
309
let data_layout = layout ( & self . data_type ) ;
303
- let buffers = self . buffers ( data_layout. can_contain_null_mask ) ?;
310
+ let buffers = self . buffers ( data_layout. can_contain_null_mask , data_layout . variadic ) ?;
304
311
305
312
let null_bit_buffer = if data_layout. can_contain_null_mask {
306
313
self . null_bit_buffer ( )
@@ -373,13 +380,30 @@ impl<'a> ImportedArrowArray<'a> {
373
380
374
381
/// returns all buffers, as organized by Rust (i.e. null buffer is skipped if it's present
375
382
/// in the spec of the type)
376
- fn buffers ( & self , can_contain_null_mask : bool ) -> Result < Vec < Buffer > > {
383
+ fn buffers ( & self , can_contain_null_mask : bool , variadic : bool ) -> Result < Vec < Buffer > > {
377
384
// + 1: skip null buffer
378
385
let buffer_begin = can_contain_null_mask as usize ;
379
- ( buffer_begin..self . array . num_buffers ( ) )
380
- . map ( |index| {
381
- let len = self . buffer_len ( index, & self . data_type ) ?;
386
+ let buffer_end = self . array . num_buffers ( ) - usize:: from ( variadic) ;
387
+
388
+ let variadic_buffer_lens = if variadic {
389
+ // Each views array has 1 (optional) null buffer, 1 views buffer, 1 lengths buffer.
390
+ // Rest are variadic.
391
+ let num_variadic_buffers =
392
+ self . array . num_buffers ( ) - ( 2 + usize:: from ( can_contain_null_mask) ) ;
393
+ if num_variadic_buffers == 0 {
394
+ & [ ]
395
+ } else {
396
+ let lengths = self . array . buffer ( self . array . num_buffers ( ) - 1 ) ;
397
+ // SAFETY: is lengths is non-null, then it must be valid for up to num_variadic_buffers.
398
+ unsafe { std:: slice:: from_raw_parts ( lengths. cast :: < i64 > ( ) , num_variadic_buffers) }
399
+ }
400
+ } else {
401
+ & [ ]
402
+ } ;
382
403
404
+ ( buffer_begin..buffer_end)
405
+ . map ( |index| {
406
+ let len = self . buffer_len ( index, variadic_buffer_lens, & self . data_type ) ?;
383
407
match unsafe { create_buffer ( self . owner . clone ( ) , self . array , index, len) } {
384
408
Some ( buf) => Ok ( buf) ,
385
409
None if len == 0 => {
@@ -399,7 +423,12 @@ impl<'a> ImportedArrowArray<'a> {
399
423
/// Rust implementation uses fixed-sized buffers, which require knowledge of their `len`.
400
424
/// for variable-sized buffers, such as the second buffer of a stringArray, we need
401
425
/// to fetch offset buffer's len to build the second buffer.
402
- fn buffer_len ( & self , i : usize , dt : & DataType ) -> Result < usize > {
426
+ fn buffer_len (
427
+ & self ,
428
+ i : usize ,
429
+ variadic_buffer_lengths : & [ i64 ] ,
430
+ dt : & DataType ,
431
+ ) -> Result < usize > {
403
432
// Special handling for dictionary type as we only care about the key type in the case.
404
433
let data_type = match dt {
405
434
DataType :: Dictionary ( key_data_type, _) => key_data_type. as_ref ( ) ,
@@ -430,7 +459,7 @@ impl<'a> ImportedArrowArray<'a> {
430
459
}
431
460
432
461
// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
433
- let len = self . buffer_len ( 1 , dt) ?;
462
+ let len = self . buffer_len ( 1 , variadic_buffer_lengths , dt) ?;
434
463
// first buffer is the null buffer => add(1)
435
464
// we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets.
436
465
#[ allow( clippy:: cast_ptr_alignment) ]
@@ -444,14 +473,24 @@ impl<'a> ImportedArrowArray<'a> {
444
473
}
445
474
446
475
// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
447
- let len = self . buffer_len ( 1 , dt) ?;
476
+ let len = self . buffer_len ( 1 , variadic_buffer_lengths , dt) ?;
448
477
// first buffer is the null buffer => add(1)
449
478
// we assume that pointer is aligned for `i64`, as Large uses `i64` offsets.
450
479
#[ allow( clippy:: cast_ptr_alignment) ]
451
480
let offset_buffer = self . array . buffer ( 1 ) as * const i64 ;
452
481
// get last offset
453
482
( unsafe { * offset_buffer. add ( len / size_of :: < i64 > ( ) - 1 ) } ) as usize
454
483
}
484
+ // View types: these have variadic buffers.
485
+ // Buffer 1 is the views buffer, which stores 1 u128 per length of the array.
486
+ // Buffers 2..N-1 are the buffers holding the byte data. Their lengths are variable.
487
+ // Buffer N is of length (N - 2) and stores i64 containing the lengths of buffers 2..N-1
488
+ ( DataType :: Utf8View , 1 ) | ( DataType :: BinaryView , 1 ) => {
489
+ std:: mem:: size_of :: < u128 > ( ) * length
490
+ }
491
+ ( DataType :: Utf8View , i) | ( DataType :: BinaryView , i) => {
492
+ variadic_buffer_lengths[ i - 2 ] as usize
493
+ }
455
494
// buffer len of primitive types
456
495
_ => {
457
496
let bits = bit_width ( data_type, i) ?;
@@ -1229,18 +1268,18 @@ mod tests_from_ffi {
1229
1268
use arrow_data:: ArrayData ;
1230
1269
use arrow_schema:: { DataType , Field } ;
1231
1270
1232
- use crate :: types:: Int32Type ;
1271
+ use super :: { ImportedArrowArray , Result } ;
1272
+ use crate :: builder:: GenericByteViewBuilder ;
1273
+ use crate :: types:: { BinaryViewType , ByteViewType , Int32Type , StringViewType } ;
1233
1274
use crate :: {
1234
1275
array:: {
1235
1276
Array , BooleanArray , DictionaryArray , FixedSizeBinaryArray , FixedSizeListArray ,
1236
1277
Int32Array , Int64Array , StringArray , StructArray , UInt32Array , UInt64Array ,
1237
1278
} ,
1238
1279
ffi:: { from_ffi, FFI_ArrowArray , FFI_ArrowSchema } ,
1239
- make_array, ArrayRef , ListArray ,
1280
+ make_array, ArrayRef , GenericByteViewArray , ListArray ,
1240
1281
} ;
1241
1282
1242
- use super :: { ImportedArrowArray , Result } ;
1243
-
1244
1283
fn test_round_trip ( expected : & ArrayData ) -> Result < ( ) > {
1245
1284
// here we export the array
1246
1285
let array = FFI_ArrowArray :: new ( expected) ;
@@ -1453,8 +1492,8 @@ mod tests_from_ffi {
1453
1492
owner : & array,
1454
1493
} ;
1455
1494
1456
- let offset_buf_len = imported_array. buffer_len ( 1 , & imported_array. data_type ) ?;
1457
- let data_buf_len = imported_array. buffer_len ( 2 , & imported_array. data_type ) ?;
1495
+ let offset_buf_len = imported_array. buffer_len ( 1 , & [ ] , & imported_array. data_type ) ?;
1496
+ let data_buf_len = imported_array. buffer_len ( 2 , & [ ] , & imported_array. data_type ) ?;
1458
1497
1459
1498
assert_eq ! ( offset_buf_len, 4 ) ;
1460
1499
assert_eq ! ( data_buf_len, 0 ) ;
@@ -1472,6 +1511,18 @@ mod tests_from_ffi {
1472
1511
StringArray :: from ( array)
1473
1512
}
1474
1513
1514
+ fn roundtrip_byte_view_array < T : ByteViewType > (
1515
+ array : GenericByteViewArray < T > ,
1516
+ ) -> GenericByteViewArray < T > {
1517
+ let data = array. into_data ( ) ;
1518
+
1519
+ let array = FFI_ArrowArray :: new ( & data) ;
1520
+ let schema = FFI_ArrowSchema :: try_from ( data. data_type ( ) ) . unwrap ( ) ;
1521
+
1522
+ let array = unsafe { from_ffi ( array, & schema) } . unwrap ( ) ;
1523
+ GenericByteViewArray :: < T > :: from ( array)
1524
+ }
1525
+
1475
1526
fn extend_array ( array : & dyn Array ) -> ArrayRef {
1476
1527
let len = array. len ( ) ;
1477
1528
let data = array. to_data ( ) ;
@@ -1551,4 +1602,93 @@ mod tests_from_ffi {
1551
1602
& imported
1552
1603
) ;
1553
1604
}
1605
+
1606
+ /// Helper trait to allow us to use easily strings as either BinaryViewType::Native or
1607
+ /// StringViewType::Native scalars.
1608
+ trait NativeFromStr {
1609
+ fn from_str ( value : & str ) -> & Self ;
1610
+ }
1611
+
1612
+ impl NativeFromStr for str {
1613
+ fn from_str ( value : & str ) -> & Self {
1614
+ value
1615
+ }
1616
+ }
1617
+
1618
+ impl NativeFromStr for [ u8 ] {
1619
+ fn from_str ( value : & str ) -> & Self {
1620
+ value. as_bytes ( )
1621
+ }
1622
+ }
1623
+
1624
+ #[ test]
1625
+ fn test_round_trip_byte_view ( ) {
1626
+ fn test_case < T > ( )
1627
+ where
1628
+ T : ByteViewType ,
1629
+ T :: Native : NativeFromStr ,
1630
+ {
1631
+ macro_rules! run_test_case {
1632
+ ( $array: expr) => { {
1633
+ // round-trip through C Data Interface
1634
+ let len = $array. len( ) ;
1635
+ let imported = roundtrip_byte_view_array( $array) ;
1636
+ assert_eq!( imported. len( ) , len) ;
1637
+
1638
+ let copied = extend_array( & imported) ;
1639
+ assert_eq!(
1640
+ copied
1641
+ . as_any( )
1642
+ . downcast_ref:: <GenericByteViewArray <T >>( )
1643
+ . unwrap( ) ,
1644
+ & imported
1645
+ ) ;
1646
+ } } ;
1647
+ }
1648
+
1649
+ // Empty test case.
1650
+ let empty = GenericByteViewBuilder :: < T > :: new ( ) . finish ( ) ;
1651
+ run_test_case ! ( empty) ;
1652
+
1653
+ // All inlined strings test case.
1654
+ let mut all_inlined = GenericByteViewBuilder :: < T > :: new ( ) ;
1655
+ all_inlined. append_value ( T :: Native :: from_str ( "inlined1" ) ) ;
1656
+ all_inlined. append_value ( T :: Native :: from_str ( "inlined2" ) ) ;
1657
+ all_inlined. append_value ( T :: Native :: from_str ( "inlined3" ) ) ;
1658
+ let all_inlined = all_inlined. finish ( ) ;
1659
+ assert_eq ! ( all_inlined. data_buffers( ) . len( ) , 0 ) ;
1660
+ run_test_case ! ( all_inlined) ;
1661
+
1662
+ // some inlined + non-inlined, 1 variadic buffer.
1663
+ let mixed_one_variadic = {
1664
+ let mut builder = GenericByteViewBuilder :: < T > :: new ( ) ;
1665
+ builder. append_value ( T :: Native :: from_str ( "inlined" ) ) ;
1666
+ let block_id =
1667
+ builder. append_block ( Buffer :: from ( "non-inlined-string-buffer" . as_bytes ( ) ) ) ;
1668
+ builder. try_append_view ( block_id, 0 , 25 ) . unwrap ( ) ;
1669
+ builder. finish ( )
1670
+ } ;
1671
+ assert_eq ! ( mixed_one_variadic. data_buffers( ) . len( ) , 1 ) ;
1672
+ run_test_case ! ( mixed_one_variadic) ;
1673
+
1674
+ // inlined + non-inlined, 2 variadic buffers.
1675
+ let mixed_two_variadic = {
1676
+ let mut builder = GenericByteViewBuilder :: < T > :: new ( ) ;
1677
+ builder. append_value ( T :: Native :: from_str ( "inlined" ) ) ;
1678
+ let block_id =
1679
+ builder. append_block ( Buffer :: from ( "non-inlined-string-buffer" . as_bytes ( ) ) ) ;
1680
+ builder. try_append_view ( block_id, 0 , 25 ) . unwrap ( ) ;
1681
+
1682
+ let block_id = builder
1683
+ . append_block ( Buffer :: from ( "another-non-inlined-string-buffer" . as_bytes ( ) ) ) ;
1684
+ builder. try_append_view ( block_id, 0 , 33 ) . unwrap ( ) ;
1685
+ builder. finish ( )
1686
+ } ;
1687
+ assert_eq ! ( mixed_two_variadic. data_buffers( ) . len( ) , 2 ) ;
1688
+ run_test_case ! ( mixed_two_variadic) ;
1689
+ }
1690
+
1691
+ test_case :: < StringViewType > ( ) ;
1692
+ test_case :: < BinaryViewType > ( ) ;
1693
+ }
1554
1694
}
0 commit comments