Skip to content

Commit a5afda2

Browse files
scovichalamb
andauthored
[Variant] VariantMetadata is allowed to contain the empty string (#7956)
# Which issue does this PR close? - Follow-up to #7901 # Rationale for this change - #7934 Introduced a minor regression, in (accidentally?) forbidding the empty string as a dictionary key. Fix the bug and simplify the code a bit further while we're at it. # What changes are included in this PR? Revert the unsorted dictionary check back to what it had been (it just uses `Iterator::is_sorted_by` now, instead of `primitive.slice::is_sorted_by`). Remove the redundant offset monotonicity check from the ordered dictionary path, relying on the fact that string slice extraction will anyway fail if the offsets are not monotonic. Improve the error message now that it does double duty. # Are these changes tested? New unit tests for dictionaries containing the empty string. As a side effect, we now have at least a little coverage for sorted dictionaries -- somehow, I couldn't find any existing unit test that creates a sorted dictionary?? # Are there any user-facing changes? No --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent a984ca7 commit a5afda2

File tree

3 files changed

+65
-20
lines changed

3 files changed

+65
-20
lines changed

parquet-variant-compute/src/variant_array.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ impl VariantArray {
9595
};
9696
// Ensure the StructArray has a metadata field of BinaryView
9797

98-
let Some(metadata_field) = VariantArray::find_metadata_field(&inner) else {
98+
let Some(metadata_field) = VariantArray::find_metadata_field(inner) else {
9999
return Err(ArrowError::InvalidArgumentError(
100100
"Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
101101
));
@@ -106,7 +106,7 @@ impl VariantArray {
106106
metadata_field.data_type()
107107
)));
108108
}
109-
let Some(value_field) = VariantArray::find_value_field(&inner) else {
109+
let Some(value_field) = VariantArray::find_value_field(inner) else {
110110
return Err(ArrowError::InvalidArgumentError(
111111
"Invalid VariantArray: StructArray must contain a 'value' field".to_string(),
112112
));

parquet-variant/src/variant/metadata.rs

Lines changed: 50 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -240,28 +240,23 @@ impl<'m> VariantMetadata<'m> {
240240
let value_buffer =
241241
string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
242242

243-
let mut offsets_iter = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
244-
let mut current_offset = offsets_iter.next().unwrap_or(0);
243+
let mut offsets = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
245244

246245
if self.header.is_sorted {
247246
// Validate the dictionary values are unique and lexicographically sorted
248247
//
249248
// Since we use the offsets to access dictionary values, this also validates
250249
// offsets are in-bounds and monotonically increasing
250+
let mut current_offset = offsets.next().unwrap_or(0);
251251
let mut prev_value: Option<&str> = None;
252-
253-
for next_offset in offsets_iter {
254-
if next_offset <= current_offset {
255-
return Err(ArrowError::InvalidArgumentError(
256-
"offsets not monotonically increasing".to_string(),
257-
));
258-
}
259-
252+
for next_offset in offsets {
260253
let current_value =
261254
value_buffer
262255
.get(current_offset..next_offset)
263256
.ok_or_else(|| {
264-
ArrowError::InvalidArgumentError("offset out of bounds".to_string())
257+
ArrowError::InvalidArgumentError(format!(
258+
"range {current_offset}..{next_offset} is invalid or out of bounds"
259+
))
265260
})?;
266261

267262
if let Some(prev_val) = prev_value {
@@ -281,13 +276,10 @@ impl<'m> VariantMetadata<'m> {
281276
// Since shallow validation ensures the first and last offsets are in bounds,
282277
// we can also verify all offsets are in-bounds by checking if
283278
// offsets are monotonically increasing
284-
for next_offset in offsets_iter {
285-
if next_offset <= current_offset {
286-
return Err(ArrowError::InvalidArgumentError(
287-
"offsets not monotonically increasing".to_string(),
288-
));
289-
}
290-
current_offset = next_offset;
279+
if !offsets.is_sorted_by(|a, b| a < b) {
280+
return Err(ArrowError::InvalidArgumentError(
281+
"offsets not monotonically increasing".to_string(),
282+
));
291283
}
292284
}
293285

@@ -531,4 +523,44 @@ mod tests {
531523
"unexpected error: {err:?}"
532524
);
533525
}
526+
527+
#[test]
528+
fn empty_string_is_valid() {
529+
let bytes = &[
530+
0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
531+
1,
532+
0x00,
533+
0x00,
534+
];
535+
let metadata = VariantMetadata::try_new(bytes).unwrap();
536+
assert_eq!(&metadata[0], "");
537+
538+
let bytes = &[
539+
0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
540+
2,
541+
0x00,
542+
0x00,
543+
0x02,
544+
b'h',
545+
b'i',
546+
];
547+
let metadata = VariantMetadata::try_new(bytes).unwrap();
548+
assert_eq!(&metadata[0], "");
549+
assert_eq!(&metadata[1], "hi");
550+
551+
let bytes = &[
552+
0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
553+
2,
554+
0x00,
555+
0x02,
556+
0x02, // empty string is allowed, but must be first in a sorted dict
557+
b'h',
558+
b'i',
559+
];
560+
let err = VariantMetadata::try_new(bytes).unwrap_err();
561+
assert!(
562+
matches!(err, ArrowError::InvalidArgumentError(_)),
563+
"unexpected error: {err:?}"
564+
);
565+
}
534566
}

parquet-variant/src/variant/object.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,19 @@ mod tests {
553553
assert_eq!(variant_obj.field(2).unwrap().as_string(), Some("hello"));
554554
}
555555

556+
#[test]
557+
fn test_variant_object_empty_fields() {
558+
let mut builder = VariantBuilder::new();
559+
builder.new_object().with_field("", 42).finish().unwrap();
560+
let (metadata, value) = builder.finish();
561+
562+
// Resulting object is valid and has a single empty field
563+
let variant = Variant::try_new(&metadata, &value).unwrap();
564+
let variant_obj = variant.as_object().unwrap();
565+
assert_eq!(variant_obj.len(), 1);
566+
assert_eq!(variant_obj.get(""), Some(Variant::from(42)));
567+
}
568+
556569
#[test]
557570
fn test_variant_object_empty() {
558571
// Create metadata with no fields

0 commit comments

Comments
 (0)