Skip to content

Commit 60ec869

Browse files
authored
Support StringViewArray interop with python: fix lingering C Data Interface issues for *ViewArray (#6368)
* fix lingering C Data Interface issues for *ViewArray Fixes #6366 * report views length in elements -> bytes * use pyarrow 17 * use only good versions * fix support for View arrays in C FFI, add test * update comment in github action * more ffi test cases * more byte_view tests for into_pyarrow
1 parent e838e62 commit 60ec869

File tree

6 files changed

+247
-26
lines changed

6 files changed

+247
-26
lines changed

.github/workflows/integration.yml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ on:
4848
- arrow/**
4949

5050
jobs:
51-
5251
integration:
5352
name: Archery test With other arrows
5453
runs-on: ubuntu-latest
@@ -118,9 +117,9 @@ jobs:
118117
runs-on: ubuntu-latest
119118
strategy:
120119
matrix:
121-
rust: [ stable ]
122-
# PyArrow 13 was the last version prior to introduction to Arrow PyCapsules
123-
pyarrow: [ "13", "14" ]
120+
rust: [stable]
121+
# PyArrow 15 was the first version to introduce StringView/BinaryView support
122+
pyarrow: ["15", "16", "17"]
124123
steps:
125124
- uses: actions/checkout@v4
126125
with:

arrow-array/src/ffi.rs

Lines changed: 154 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,13 @@ fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
193193
"The datatype \"{data_type:?}\" expects 3 buffers, but requested {i}. Please verify that the C data interface is correctly implemented."
194194
)))
195195
}
196+
// Variable-sized views: have 3 or more buffers.
197+
// Buffer 1 are the u128 views
198+
// Buffers 2...N-1 are u8 byte buffers
199+
(DataType::Utf8View, 1) | (DataType::BinaryView,1) => u128::BITS as _,
200+
(DataType::Utf8View, _) | (DataType::BinaryView, _) => {
201+
u8::BITS as _
202+
}
196203
// type ids. UnionArray doesn't have null bitmap so buffer index begins with 0.
197204
(DataType::Union(_, _), 0) => i8::BITS as _,
198205
// Only DenseUnion has 2nd buffer
@@ -300,7 +307,7 @@ impl<'a> ImportedArrowArray<'a> {
300307
};
301308

302309
let data_layout = layout(&self.data_type);
303-
let buffers = self.buffers(data_layout.can_contain_null_mask)?;
310+
let buffers = self.buffers(data_layout.can_contain_null_mask, data_layout.variadic)?;
304311

305312
let null_bit_buffer = if data_layout.can_contain_null_mask {
306313
self.null_bit_buffer()
@@ -373,13 +380,30 @@ impl<'a> ImportedArrowArray<'a> {
373380

374381
/// returns all buffers, as organized by Rust (i.e. null buffer is skipped if it's present
375382
/// in the spec of the type)
376-
fn buffers(&self, can_contain_null_mask: bool) -> Result<Vec<Buffer>> {
383+
fn buffers(&self, can_contain_null_mask: bool, variadic: bool) -> Result<Vec<Buffer>> {
377384
// + 1: skip null buffer
378385
let buffer_begin = can_contain_null_mask as usize;
379-
(buffer_begin..self.array.num_buffers())
380-
.map(|index| {
381-
let len = self.buffer_len(index, &self.data_type)?;
386+
let buffer_end = self.array.num_buffers() - usize::from(variadic);
387+
388+
let variadic_buffer_lens = if variadic {
389+
// Each views array has 1 (optional) null buffer, 1 views buffer, 1 lengths buffer.
390+
// Rest are variadic.
391+
let num_variadic_buffers =
392+
self.array.num_buffers() - (2 + usize::from(can_contain_null_mask));
393+
if num_variadic_buffers == 0 {
394+
&[]
395+
} else {
396+
let lengths = self.array.buffer(self.array.num_buffers() - 1);
397+
// SAFETY: is lengths is non-null, then it must be valid for up to num_variadic_buffers.
398+
unsafe { std::slice::from_raw_parts(lengths.cast::<i64>(), num_variadic_buffers) }
399+
}
400+
} else {
401+
&[]
402+
};
382403

404+
(buffer_begin..buffer_end)
405+
.map(|index| {
406+
let len = self.buffer_len(index, variadic_buffer_lens, &self.data_type)?;
383407
match unsafe { create_buffer(self.owner.clone(), self.array, index, len) } {
384408
Some(buf) => Ok(buf),
385409
None if len == 0 => {
@@ -399,7 +423,12 @@ impl<'a> ImportedArrowArray<'a> {
399423
/// Rust implementation uses fixed-sized buffers, which require knowledge of their `len`.
400424
/// for variable-sized buffers, such as the second buffer of a stringArray, we need
401425
/// to fetch offset buffer's len to build the second buffer.
402-
fn buffer_len(&self, i: usize, dt: &DataType) -> Result<usize> {
426+
fn buffer_len(
427+
&self,
428+
i: usize,
429+
variadic_buffer_lengths: &[i64],
430+
dt: &DataType,
431+
) -> Result<usize> {
403432
// Special handling for dictionary type as we only care about the key type in the case.
404433
let data_type = match dt {
405434
DataType::Dictionary(key_data_type, _) => key_data_type.as_ref(),
@@ -430,7 +459,7 @@ impl<'a> ImportedArrowArray<'a> {
430459
}
431460

432461
// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
433-
let len = self.buffer_len(1, dt)?;
462+
let len = self.buffer_len(1, variadic_buffer_lengths, dt)?;
434463
// first buffer is the null buffer => add(1)
435464
// we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets.
436465
#[allow(clippy::cast_ptr_alignment)]
@@ -444,14 +473,24 @@ impl<'a> ImportedArrowArray<'a> {
444473
}
445474

446475
// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
447-
let len = self.buffer_len(1, dt)?;
476+
let len = self.buffer_len(1, variadic_buffer_lengths, dt)?;
448477
// first buffer is the null buffer => add(1)
449478
// we assume that pointer is aligned for `i64`, as Large uses `i64` offsets.
450479
#[allow(clippy::cast_ptr_alignment)]
451480
let offset_buffer = self.array.buffer(1) as *const i64;
452481
// get last offset
453482
(unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize
454483
}
484+
// View types: these have variadic buffers.
485+
// Buffer 1 is the views buffer, which stores 1 u128 per length of the array.
486+
// Buffers 2..N-1 are the buffers holding the byte data. Their lengths are variable.
487+
// Buffer N is of length (N - 2) and stores i64 containing the lengths of buffers 2..N-1
488+
(DataType::Utf8View, 1) | (DataType::BinaryView, 1) => {
489+
std::mem::size_of::<u128>() * length
490+
}
491+
(DataType::Utf8View, i) | (DataType::BinaryView, i) => {
492+
variadic_buffer_lengths[i - 2] as usize
493+
}
455494
// buffer len of primitive types
456495
_ => {
457496
let bits = bit_width(data_type, i)?;
@@ -1229,18 +1268,18 @@ mod tests_from_ffi {
12291268
use arrow_data::ArrayData;
12301269
use arrow_schema::{DataType, Field};
12311270

1232-
use crate::types::Int32Type;
1271+
use super::{ImportedArrowArray, Result};
1272+
use crate::builder::GenericByteViewBuilder;
1273+
use crate::types::{BinaryViewType, ByteViewType, Int32Type, StringViewType};
12331274
use crate::{
12341275
array::{
12351276
Array, BooleanArray, DictionaryArray, FixedSizeBinaryArray, FixedSizeListArray,
12361277
Int32Array, Int64Array, StringArray, StructArray, UInt32Array, UInt64Array,
12371278
},
12381279
ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema},
1239-
make_array, ArrayRef, ListArray,
1280+
make_array, ArrayRef, GenericByteViewArray, ListArray,
12401281
};
12411282

1242-
use super::{ImportedArrowArray, Result};
1243-
12441283
fn test_round_trip(expected: &ArrayData) -> Result<()> {
12451284
// here we export the array
12461285
let array = FFI_ArrowArray::new(expected);
@@ -1453,8 +1492,8 @@ mod tests_from_ffi {
14531492
owner: &array,
14541493
};
14551494

1456-
let offset_buf_len = imported_array.buffer_len(1, &imported_array.data_type)?;
1457-
let data_buf_len = imported_array.buffer_len(2, &imported_array.data_type)?;
1495+
let offset_buf_len = imported_array.buffer_len(1, &[], &imported_array.data_type)?;
1496+
let data_buf_len = imported_array.buffer_len(2, &[], &imported_array.data_type)?;
14581497

14591498
assert_eq!(offset_buf_len, 4);
14601499
assert_eq!(data_buf_len, 0);
@@ -1472,6 +1511,18 @@ mod tests_from_ffi {
14721511
StringArray::from(array)
14731512
}
14741513

1514+
fn roundtrip_byte_view_array<T: ByteViewType>(
1515+
array: GenericByteViewArray<T>,
1516+
) -> GenericByteViewArray<T> {
1517+
let data = array.into_data();
1518+
1519+
let array = FFI_ArrowArray::new(&data);
1520+
let schema = FFI_ArrowSchema::try_from(data.data_type()).unwrap();
1521+
1522+
let array = unsafe { from_ffi(array, &schema) }.unwrap();
1523+
GenericByteViewArray::<T>::from(array)
1524+
}
1525+
14751526
fn extend_array(array: &dyn Array) -> ArrayRef {
14761527
let len = array.len();
14771528
let data = array.to_data();
@@ -1551,4 +1602,93 @@ mod tests_from_ffi {
15511602
&imported
15521603
);
15531604
}
1605+
1606+
/// Helper trait to allow us to use easily strings as either BinaryViewType::Native or
1607+
/// StringViewType::Native scalars.
1608+
trait NativeFromStr {
1609+
fn from_str(value: &str) -> &Self;
1610+
}
1611+
1612+
impl NativeFromStr for str {
1613+
fn from_str(value: &str) -> &Self {
1614+
value
1615+
}
1616+
}
1617+
1618+
impl NativeFromStr for [u8] {
1619+
fn from_str(value: &str) -> &Self {
1620+
value.as_bytes()
1621+
}
1622+
}
1623+
1624+
#[test]
1625+
fn test_round_trip_byte_view() {
1626+
fn test_case<T>()
1627+
where
1628+
T: ByteViewType,
1629+
T::Native: NativeFromStr,
1630+
{
1631+
macro_rules! run_test_case {
1632+
($array:expr) => {{
1633+
// round-trip through C Data Interface
1634+
let len = $array.len();
1635+
let imported = roundtrip_byte_view_array($array);
1636+
assert_eq!(imported.len(), len);
1637+
1638+
let copied = extend_array(&imported);
1639+
assert_eq!(
1640+
copied
1641+
.as_any()
1642+
.downcast_ref::<GenericByteViewArray<T>>()
1643+
.unwrap(),
1644+
&imported
1645+
);
1646+
}};
1647+
}
1648+
1649+
// Empty test case.
1650+
let empty = GenericByteViewBuilder::<T>::new().finish();
1651+
run_test_case!(empty);
1652+
1653+
// All inlined strings test case.
1654+
let mut all_inlined = GenericByteViewBuilder::<T>::new();
1655+
all_inlined.append_value(T::Native::from_str("inlined1"));
1656+
all_inlined.append_value(T::Native::from_str("inlined2"));
1657+
all_inlined.append_value(T::Native::from_str("inlined3"));
1658+
let all_inlined = all_inlined.finish();
1659+
assert_eq!(all_inlined.data_buffers().len(), 0);
1660+
run_test_case!(all_inlined);
1661+
1662+
// some inlined + non-inlined, 1 variadic buffer.
1663+
let mixed_one_variadic = {
1664+
let mut builder = GenericByteViewBuilder::<T>::new();
1665+
builder.append_value(T::Native::from_str("inlined"));
1666+
let block_id =
1667+
builder.append_block(Buffer::from("non-inlined-string-buffer".as_bytes()));
1668+
builder.try_append_view(block_id, 0, 25).unwrap();
1669+
builder.finish()
1670+
};
1671+
assert_eq!(mixed_one_variadic.data_buffers().len(), 1);
1672+
run_test_case!(mixed_one_variadic);
1673+
1674+
// inlined + non-inlined, 2 variadic buffers.
1675+
let mixed_two_variadic = {
1676+
let mut builder = GenericByteViewBuilder::<T>::new();
1677+
builder.append_value(T::Native::from_str("inlined"));
1678+
let block_id =
1679+
builder.append_block(Buffer::from("non-inlined-string-buffer".as_bytes()));
1680+
builder.try_append_view(block_id, 0, 25).unwrap();
1681+
1682+
let block_id = builder
1683+
.append_block(Buffer::from("another-non-inlined-string-buffer".as_bytes()));
1684+
builder.try_append_view(block_id, 0, 33).unwrap();
1685+
builder.finish()
1686+
};
1687+
assert_eq!(mixed_two_variadic.data_buffers().len(), 2);
1688+
run_test_case!(mixed_two_variadic);
1689+
}
1690+
1691+
test_case::<StringViewType>();
1692+
test_case::<BinaryViewType>();
1693+
}
15541694
}

arrow-buffer/src/buffer/immutable.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,9 @@ impl Buffer {
203203
pub fn advance(&mut self, offset: usize) {
204204
assert!(
205205
offset <= self.length,
206-
"the offset of the new Buffer cannot exceed the existing length"
206+
"the offset of the new Buffer cannot exceed the existing length: offset={} length={}",
207+
offset,
208+
self.length
207209
);
208210
self.length -= offset;
209211
// Safety:
@@ -221,7 +223,8 @@ impl Buffer {
221223
pub fn slice_with_length(&self, offset: usize, length: usize) -> Self {
222224
assert!(
223225
offset.saturating_add(length) <= self.length,
224-
"the offset of the new Buffer cannot exceed the existing length"
226+
"the offset of the new Buffer cannot exceed the existing length: slice offset={offset} length={length} selflen={}",
227+
self.length
225228
);
226229
// Safety:
227230
// offset + length <= self.length

arrow-data/src/ffi.rs

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
use crate::bit_mask::set_bits;
2121
use crate::{layout, ArrayData};
2222
use arrow_buffer::buffer::NullBuffer;
23-
use arrow_buffer::{Buffer, MutableBuffer};
23+
use arrow_buffer::{Buffer, MutableBuffer, ScalarBuffer};
2424
use arrow_schema::DataType;
2525
use std::ffi::c_void;
2626

@@ -121,7 +121,7 @@ impl FFI_ArrowArray {
121121
pub fn new(data: &ArrayData) -> Self {
122122
let data_layout = layout(data.data_type());
123123

124-
let buffers = if data_layout.can_contain_null_mask {
124+
let mut buffers = if data_layout.can_contain_null_mask {
125125
// * insert the null buffer at the start
126126
// * make all others `Option<Buffer>`.
127127
std::iter::once(align_nulls(data.offset(), data.nulls()))
@@ -132,7 +132,7 @@ impl FFI_ArrowArray {
132132
};
133133

134134
// `n_buffers` is the number of buffers by the spec.
135-
let n_buffers = {
135+
let mut n_buffers = {
136136
data_layout.buffers.len() + {
137137
// If the layout has a null buffer by Arrow spec.
138138
// Note that even the array doesn't have a null buffer because it has
@@ -141,10 +141,22 @@ impl FFI_ArrowArray {
141141
}
142142
} as i64;
143143

144+
if data_layout.variadic {
145+
// Save the lengths of all variadic buffers into a new buffer.
146+
// The first buffer is `views`, and the rest are variadic.
147+
let mut data_buffers_lengths = Vec::new();
148+
for buffer in data.buffers().iter().skip(1) {
149+
data_buffers_lengths.push(buffer.len() as i64);
150+
n_buffers += 1;
151+
}
152+
153+
buffers.push(Some(ScalarBuffer::from(data_buffers_lengths).into_inner()));
154+
n_buffers += 1;
155+
}
156+
144157
let buffers_ptr = buffers
145158
.iter()
146159
.flat_map(|maybe_buffer| match maybe_buffer {
147-
// note that `raw_data` takes into account the buffer's offset
148160
Some(b) => Some(b.as_ptr() as *const c_void),
149161
// This is for null buffer. We only put a null pointer for
150162
// null buffer if by spec it can contain null mask.

arrow/src/pyarrow.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,7 @@ impl FromPyArrow for RecordBatch {
354354
validate_pycapsule(array_capsule, "arrow_array")?;
355355

356356
let schema_ptr = unsafe { schema_capsule.reference::<FFI_ArrowSchema>() };
357-
let ffi_array = unsafe { FFI_ArrowArray::from_raw(array_capsule.pointer() as _) };
357+
let ffi_array = unsafe { FFI_ArrowArray::from_raw(array_capsule.pointer().cast()) };
358358
let array_data = unsafe { ffi::from_ffi(ffi_array, schema_ptr) }.map_err(to_py_err)?;
359359
if !matches!(array_data.data_type(), DataType::Struct(_)) {
360360
return Err(PyTypeError::new_err(

0 commit comments

Comments
 (0)