Skip to content
Draft
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions parquet-variant-compute/src/variant_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1010,7 +1010,101 @@ mod test {
let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(42)]));
assert_eq!(&result, &expected);
}
/// Reads list element 0 out of every row of the shredded list fixture
/// (["comedy", "drama"], ["horror", null] and ["comedy", "drama", "romance"])
/// through variant_get, without a target type, and checks the result as a VariantArray.
#[test]
fn test_shredded_list_field_access() {
    let input = shredded_list_variant_array();

    // No `as_type` is requested, so the result is downcast to a VariantArray below.
    let first_element = GetOptions::new_with_path(VariantPath::from(0));
    let extracted = variant_get(&input, first_element).unwrap();
    let variants = extracted
        .as_any()
        .downcast_ref::<VariantArray>()
        .unwrap();
    assert_eq!(variants.len(), 3);

    // Expected element 0 of rows 0, 1 and 2, in row order.
    for (row, genre) in ["comedy", "horror", "comedy"].into_iter().enumerate() {
        assert_eq!(variants.value(row), Variant::String(genre));
    }
}
/// Test extracting a shredded list element with type conversion.
///
/// The fixture has three rows whose element 0 is "comedy", "horror" and "comedy"
/// respectively, so the typed result must contain exactly those three values,
/// one per input row.
#[test]
fn test_shredded_list_as_string() {
    let array = shredded_list_variant_array();

    // Extract the 0 index values as StringArray (type conversion)
    let field = Field::new("typed_value", DataType::Utf8, false);
    let options = GetOptions::new_with_path(VariantPath::from(0))
        .with_as_type(Some(FieldRef::from(field)));
    let result = variant_get(&array, options).unwrap();

    // Should get a StringArray with one entry per row: the first element of each list.
    let expected: ArrayRef = Arc::new(StringArray::from(vec![
        Some("comedy"),
        Some("horror"),
        Some("comedy"),
    ]));
    assert_eq!(&result, &expected);
}
/// Helper function to create a shredded variant array representing lists.
///
/// The rows this fixture is intended to represent:
/// Row 0: ["comedy", "drama"] ([0] is shredded, [1] is shredded - perfectly shredded)
/// Row 1: ["horror", null] ([0] is shredded, [1] is binary null - partially shredded)
/// Row 2: ["comedy", "drama", "romance"] (perfectly shredded)
///
/// Physical layout as currently built (one struct with three fields):
/// - metadata: one variant metadata buffer per row (3 entries)
/// - value: a single binary entry holding the bytes of `Variant::Null`; presumably the
///   fallback for the element that is not shredded (the null in row 1)
/// - typed_value: a flat `StringArray` holding the six string elements of all rows
///
/// NOTE(review): the three child arrays have different lengths (3, 1 and 6) and nothing
/// records where each row's list starts or ends. The variant shredding spec appears to
/// require the `typed_value` of a shredded array to be a list of non-nullable `element`
/// groups with nullable `value` / `typed_value` children -- TODO confirm and rebuild.
fn shredded_list_variant_array() -> ArrayRef {
// Create the base metadata for lists

// Builds a variant list out of `vector` and returns the pair of byte buffers produced
// by `VariantBuilder::finish` (the callers below bind the first one as the metadata).
// Could add this as an api for VariantList, like VariantList::from()
fn build_list_metadata(vector: Vec<Variant>) -> (Vec<u8>, Vec<u8>) {
let mut builder = parquet_variant::VariantBuilder::new();
let mut list = builder.new_list();
for value in vector {
list.append_value(value);
}
list.finish();
builder.finish()
}
// Only the first buffer of each pair is kept; the second one is discarded.
let (metadata1, _) =
build_list_metadata(vec![Variant::String("comedy"), Variant::String("drama")]);

let (metadata2, _) = build_list_metadata(vec![Variant::String("horror"), Variant::Null]);

let (metadata3, _) = build_list_metadata(vec![
Variant::String("comedy"),
Variant::String("drama"),
Variant::String("romance"),
]);

// Create metadata array (one entry per row)
let metadata_array =
BinaryViewArray::from_iter_values(vec![metadata1, metadata2, metadata3]);

// Create the untyped value array
// NOTE(review): this has a single entry, not one per row -- confirm intended length.
let value_array = BinaryViewArray::from(vec![Variant::Null.as_u8_slice()]);
// Maybe I should try with an actual primitive array
// All string elements of the three rows, flattened in row order (2 + 1 + 3 = 6 entries).
let typed_value_array = StringArray::from(vec![
Some("comedy"),
Some("drama"),
Some("horror"),
Some("comedy"),
Some("drama"),
Some("romance"),
]);
// Build the main VariantArray
let main_struct = crate::variant_array::StructArrayBuilder::new()
.with_field("metadata", Arc::new(metadata_array))
.with_field("value", Arc::new(value_array))
.with_field("typed_value", Arc::new(typed_value_array))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check the variant shredding spec for arrays -- the typed_value for a shredded variant array is a non-nullable group called element, with child fields typed_value and value for shredded and unshredded list elements, respectively.

And then we'll need to build an appropriate GenericListArray out of this string array you built, which gives the offsets for each sub-list.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for this too. I was under the wrong impression that the metadata encoding stores the offsets for the actual values. After reading your #8359 and rereading the Variant Encoding spec, I see that the value offsets are stored within the value encoding itself.

So the outermost typed_value should be a GenericListArray of element entries, i.e. VariantObjects with value and typed_value fields?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, exactly! And element is non-nullable (**), while the two children are nullable.

(**) As always, in arrow, it can still have null entries, but only if its parent is already NULL for the same row (so nobody can ever observe a non-null element)

.build();

// Wrap the assembled struct as a VariantArray; panics if `try_new` returns an error.
Arc::new(VariantArray::try_new(Arc::new(main_struct)).expect("should create variant array"))
}
/// Helper function to create a shredded variant array representing objects
///
/// This creates an array that represents:
Expand Down
Loading