From 9c25cc4275cc916655b4aa2c649389073497383d Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 15 Sep 2025 13:49:09 -0400 Subject: [PATCH 1/7] Add test shredded variant list array --- parquet-variant-compute/src/variant_get.rs | 53 ++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 7774d136701f..1a5343164a35 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -985,7 +985,60 @@ mod test { let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(42)])); assert_eq!(&result, &expected); } + /// Helper function to create a shredded variant array representing lists + /// + /// This creates an array that represents: + /// Row 0: ["comedy", "drama"] ([0] is shredded, [1] is shredded - perfectly shredded) + /// Row 1: ["horror", null] ([0] is shredded, [1] is binary null - partially shredded) + /// Row 2: ["comedy", "drama", "romance"] (perfectly shredded) + /// + /// The physical layout follows the shredding spec where: + /// - metadata: contains list metadata + /// - typed_value: StructArray with 0 index value + /// - value: contains fallback for + fn shredded_list_variant_array() -> ArrayRef { + // Create the base metadata for lists + + // Could add this as an api for VariantList, like VariantList::from() + fn build_list_metadata(vector: Vec) -> (Vec, Vec) { + let mut builder = parquet_variant::VariantBuilder::new(); + let mut list = builder.new_list(); + for value in vector { + list.append_value(value); + } + list.finish(); + builder.finish() + } + let (metadata1, _) = + build_list_metadata(vec![Variant::String("comedy"), Variant::String("drama")]); + + let (metadata2, _) = build_list_metadata(vec![Variant::String("horror"), Variant::Null]); + + let (metadata3, _) = build_list_metadata(vec![ + Variant::String("comedy"), + Variant::String("drama"), + Variant::String("romance"), + ]); + + // Create metadata array + let metadata_array = + BinaryViewArray::from_iter_values(vec![metadata1, metadata2, metadata3]); + + // Create the untyped value array + let value_array = BinaryViewArray::from(vec![Variant::Null.as_u8_slice()]); + // Maybe I should try with an actual primitive array + let typed_value_array = StringArray::from(vec![ + "comedy", "drama", "horror", "comedy", "drama", "romance", + ]); + // Build the main VariantArray + let main_struct = crate::variant_array::StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata_array)) + .with_field("value", Arc::new(value_array)) + .with_field("typed_value", Arc::new(typed_value_array)) + .build(); + Arc::new(VariantArray::try_new(Arc::new(main_struct)).expect("should create variant array")) + } /// Helper function to create a shredded variant array representing objects /// /// This creates an array that represents: From ed961a46234a83724b534fa18a587e441b94b4b6 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 15 Sep 2025 23:12:11 -0400 Subject: [PATCH 2/7] Add basic tests --- parquet-variant-compute/src/variant_get.rs | 43 +++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 1a5343164a35..5e1e62d64693 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -985,6 +985,42 @@ mod test { let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(42)])); assert_eq!(&result, &expected); } + /// This test manually constructs a shredded variant array representing lists + /// like ["comedy", "drama"], ["horror", null] and ["comedy", "drama", "romance"] + /// as VariantArray using variant_get. + #[test] + fn test_shredded_list_field_access() { + let array = shredded_list_variant_array(); + + // Test: Extract the 0 index field as VariantArray first + let options = GetOptions::new_with_path(VariantPath::from(0)); + let result = variant_get(&array, options).unwrap(); + + let result_variant: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result_variant.len(), 3); + + // Row 0: expect 0 index = "comedy" + assert_eq!(result_variant.value(0), Variant::String("comedy")); + // Row 1: expect 0 index = "horror" + assert_eq!(result_variant.value(1), Variant::String("horror")); + // Row 2: expect 0 index = "comedy" + assert_eq!(result_variant.value(2), Variant::String("comedy")); + } + /// Test extracting shredded list field with type conversion + #[test] + fn test_shredded_list_as_string() { + let array = shredded_list_variant_array(); + + // Test: Extract the 0 index values as StringArray (type conversion) + let field = Field::new("typed_value", DataType::Utf8, false); + let options = GetOptions::new_with_path(VariantPath::from(0)) + .with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&array, options).unwrap(); + + // Should get StringArray + let expected: ArrayRef = Arc::new(StringArray::from(vec![Some("comedy"), Some("drama")])); + assert_eq!(&result, &expected); + } /// Helper function to create a shredded variant array representing lists /// /// This creates an array that represents: @@ -1028,7 +1064,12 @@ mod test { let value_array = BinaryViewArray::from(vec![Variant::Null.as_u8_slice()]); // Maybe I should try with an actual primitive array let typed_value_array = StringArray::from(vec![ - "comedy", "drama", "horror", "comedy", "drama", "romance", + Some("comedy"), + Some("drama"), + Some("horror"), + Some("comedy"), + Some("drama"), + Some("romance"), ]); // Build the main VariantArray let main_struct = crate::variant_array::StructArrayBuilder::new() From d53c8312e36b5124073b81988e02650bcf9ee994 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 17 Sep 2025 16:36:43 -0400 Subject: [PATCH 3/7] Redo test shredded array --- parquet-variant-compute/src/variant_get.rs | 142 ++++++++++++++------- 1 file changed, 93 insertions(+), 49 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 031db6b4e522..ad97e195b8b7 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -298,14 +298,15 @@ mod test { use std::sync::Arc; use arrow::array::{ - Array, ArrayRef, BinaryViewArray, Float16Array, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, StringArray, StructArray, UInt16Array, UInt32Array, + make_builder, Array, ArrayRef, BinaryBuilder, BinaryViewArray, Float16Array, Float32Array, + Float64Array, GenericListBuilder, Int16Array, Int32Array, Int64Array, Int8Array, + StringArray, StringBuilder, StructArray, StructBuilder, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; use arrow_schema::{DataType, Field, FieldRef, Fields}; - use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES}; + use parquet_variant::{Variant, VariantBuilder, VariantPath, EMPTY_VARIANT_METADATA_BYTES}; use crate::json_to_variant; use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; @@ -1090,7 +1091,7 @@ mod test { assert_eq!(&result, &expected); } /// This test manually constructs a shredded variant array representing lists - /// like ["comedy", "drama"], ["horror", null] and ["comedy", "drama", "romance"] + /// like ["comedy", "drama"] and ["horror", 123] /// as VariantArray using variant_get. #[test] fn test_shredded_list_field_access() { @@ -1102,13 +1103,11 @@ mod test { let result_variant: &VariantArray = result.as_any().downcast_ref().unwrap(); assert_eq!(result_variant.len(), 3); - + // Row 0: expect 0 index = "comedy" - assert_eq!(result_variant.value(0), Variant::String("comedy")); + assert_eq!(result_variant.value(0), Variant::from("comedy")); // Row 1: expect 0 index = "horror" - assert_eq!(result_variant.value(1), Variant::String("horror")); - // Row 2: expect 0 index = "comedy" - assert_eq!(result_variant.value(2), Variant::String("comedy")); + assert_eq!(result_variant.value(1), Variant::from("horror")); } /// Test extracting shredded list field with type conversion #[test] @@ -1122,64 +1121,109 @@ mod test { let result = variant_get(&array, options).unwrap(); // Should get StringArray - let expected: ArrayRef = Arc::new(StringArray::from(vec![Some("comedy"), Some("drama")])); + let expected: ArrayRef = + Arc::new(StringArray::from(vec![Some("comedy"), None, Some("drama")])); assert_eq!(&result, &expected); } /// Helper function to create a shredded variant array representing lists /// /// This creates an array that represents: /// Row 0: ["comedy", "drama"] ([0] is shredded, [1] is shredded - perfectly shredded) - /// Row 1: ["horror", null] ([0] is shredded, [1] is binary null - partially shredded) - /// Row 2: ["comedy", "drama", "romance"] (perfectly shredded) + /// Row 1: ["horror", 123] ([0] is shredded, [1] is int - partially shredded) /// /// The physical layout follows the shredding spec where: /// - metadata: contains list metadata /// - typed_value: StructArray with 0 index value /// - value: contains fallback for fn shredded_list_variant_array() -> ArrayRef { - // Create the base metadata for lists - - // Could add this as an api for VariantList, like VariantList::from() - fn build_list_metadata(vector: Vec) -> (Vec, Vec) { - let mut builder = parquet_variant::VariantBuilder::new(); - let mut list = builder.new_list(); - for value in vector { - list.append_value(value); - } - list.finish(); - builder.finish() - } - let (metadata1, _) = - build_list_metadata(vec![Variant::String("comedy"), Variant::String("drama")]); + // Create metadata array + let metadata_array = + BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 2)); - let (metadata2, _) = build_list_metadata(vec![Variant::String("horror"), Variant::Null]); + // Building the typed_value ListArray - let (metadata3, _) = build_list_metadata(vec![ - Variant::String("comedy"), - Variant::String("drama"), - Variant::String("romance"), + // Need a StructBuilder to create a ListBuilder + let fields = Fields::from(vec![ + Field::new("value", DataType::Binary, true), + Field::new("typed_value", DataType::Utf8, true), ]); + let field_builders = vec![ + make_builder(&DataType::Binary, 4), + make_builder(&DataType::Utf8, 4), + ]; + let struct_builder = StructBuilder::new(fields, field_builders); + + let mut builder = GenericListBuilder::::new(struct_builder); + + // Row 0 index 0 + builder + .values() + .field_builder::(0) + .unwrap() + .append_null(); + builder + .values() + .field_builder::(1) + .unwrap() + .append_value("comedy"); + builder.values().append(true); + + // Row 0 index 1 + builder + .values() + .field_builder::(0) + .unwrap() + .append_null(); + builder + .values() + .field_builder::(1) + .unwrap() + .append_value("drama"); + builder.values().append(true); + + // Next row + builder.append(true); + + // Row 1 index 0 + builder + .values() + .field_builder::(0) + .unwrap() + .append_null(); + builder + .values() + .field_builder::(1) + .unwrap() + .append_value("horror"); + builder.values().append(true); + + // Row 1 index 1 + let mut variant_builder = VariantBuilder::new(); + variant_builder.append_value(123i32); // <------ couldn't find the right way to do it, used this as placeholder for binary + let (_, value) = variant_builder.finish(); + + builder + .values() + .field_builder::(0) + .unwrap() + .append_value(value); + builder + .values() + .field_builder::(1) + .unwrap() + .append_null(); + builder.values().append(true); + + // Next row + builder.append(true); + + let typed_value_array = builder.finish(); - // Create metadata array - let metadata_array = - BinaryViewArray::from_iter_values(vec![metadata1, metadata2, metadata3]); - - // Create the untyped value array - let value_array = BinaryViewArray::from(vec![Variant::Null.as_u8_slice()]); - // Maybe I should try with an actual primitive array - let typed_value_array = StringArray::from(vec![ - Some("comedy"), - Some("drama"), - Some("horror"), - Some("comedy"), - Some("drama"), - Some("romance"), - ]); // Build the main VariantArray let main_struct = crate::variant_array::StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata_array)) - .with_field("value", Arc::new(value_array)) - .with_field("typed_value", Arc::new(typed_value_array)) + .with_field("metadata", Arc::new(metadata_array), false) + // .with_field("value", Arc::new(value_array), true) + .with_field("typed_value", Arc::new(typed_value_array), true) .build(); Arc::new(VariantArray::try_new(Arc::new(main_struct)).expect("should create variant array")) From 69de7d742302a39294be5123fa2f49c6d26c36e1 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 19 Sep 2025 14:54:54 -0400 Subject: [PATCH 4/7] Rebuild the shredded list array --- parquet-variant-compute/src/variant_get.rs | 123 +++++++-------------- 1 file changed, 40 insertions(+), 83 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index fccf01b39fb9..d129b030c219 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -302,19 +302,19 @@ impl<'a> GetOptions<'a> { mod test { use std::sync::Arc; + use crate::{json_to_variant, VariantValueArrayBuilder}; use arrow::array::{ - make_builder, Array, ArrayRef, BinaryBuilder, BinaryViewArray, Float16Array, Float32Array, - Float64Array, GenericListBuilder, Int16Array, Int32Array, Int64Array, Int8Array, - StringArray, StringBuilder, StructArray, StructBuilder, UInt16Array, UInt32Array, - UInt64Array, UInt8Array, + Array, ArrayRef, BinaryViewArray, Float16Array, Float32Array, + Float64Array, GenericListArray, Int16Array, Int32Array, Int64Array, + Int8Array, StringArray, StructArray, UInt16Array, + UInt32Array, UInt64Array, UInt8Array, }; - use arrow::buffer::NullBuffer; + use arrow::buffer::{NullBuffer, OffsetBuffer}; use arrow::compute::CastOptions; use arrow::datatypes::DataType::{Int16, Int32, Int64, UInt16, UInt32, UInt64, UInt8}; use arrow_schema::{DataType, Field, FieldRef, Fields}; - use parquet_variant::{Variant, VariantBuilder, VariantPath, EMPTY_VARIANT_METADATA_BYTES}; + use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES}; - use crate::json_to_variant; use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; use crate::VariantArray; @@ -1314,82 +1314,39 @@ mod test { // Building the typed_value ListArray - // Need a StructBuilder to create a ListBuilder - let fields = Fields::from(vec![ - Field::new("value", DataType::Binary, true), - Field::new("typed_value", DataType::Utf8, true), - ]); - let field_builders = vec![ - make_builder(&DataType::Binary, 4), - make_builder(&DataType::Utf8, 4), - ]; - let struct_builder = StructBuilder::new(fields, field_builders); - - let mut builder = GenericListBuilder::::new(struct_builder); - - // Row 0 index 0 - builder - .values() - .field_builder::(0) - .unwrap() - .append_null(); - builder - .values() - .field_builder::(1) - .unwrap() - .append_value("comedy"); - builder.values().append(true); - - // Row 0 index 1 - builder - .values() - .field_builder::(0) - .unwrap() - .append_null(); - builder - .values() - .field_builder::(1) - .unwrap() - .append_value("drama"); - builder.values().append(true); - - // Next row - builder.append(true); - - // Row 1 index 0 - builder - .values() - .field_builder::(0) - .unwrap() - .append_null(); - builder - .values() - .field_builder::(1) - .unwrap() - .append_value("horror"); - builder.values().append(true); - - // Row 1 index 1 - let mut variant_builder = VariantBuilder::new(); - variant_builder.append_value(123i32); // <------ couldn't find the right way to do it, used this as placeholder for binary - let (_, value) = variant_builder.finish(); - - builder - .values() - .field_builder::(0) - .unwrap() - .append_value(value); - builder - .values() - .field_builder::(1) - .unwrap() - .append_null(); - builder.values().append(true); - - // Next row - builder.append(true); - - let typed_value_array = builder.finish(); + let mut variant_value_builder = VariantValueArrayBuilder::new(8); + variant_value_builder.append_null(); + variant_value_builder.append_null(); + variant_value_builder.append_null(); + variant_value_builder.append_value(Variant::from(123i32)); + + let struct_array = StructArrayBuilder::new() + .with_field( + "value", + Arc::new(variant_value_builder.build().unwrap()), + true, + ) + .with_field( + "typed_value", + Arc::new(StringArray::from(vec![ + Some("comedy"), + Some("drama"), + Some("horror"), + None, + ])), + true, + ) + .build(); + + let typed_value_array = GenericListArray::::new( + Arc::new(Field::new_list_field( + struct_array.data_type().clone(), + true, + )), + OffsetBuffer::from_lengths([2,2]), + Arc::new(struct_array), + None, + ); // Build the main VariantArray let main_struct = crate::variant_array::StructArrayBuilder::new() From cc6d7877fcf7009ce70f77d073c9ba328ed59614 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 23 Sep 2025 17:54:48 -0400 Subject: [PATCH 5/7] Use select::take to build the output array --- parquet-variant-compute/src/variant_get.rs | 70 ++++++++++++++++++---- 1 file changed, 58 insertions(+), 12 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index d129b030c219..7e5894d80a61 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -15,8 +15,11 @@ // specific language governing permissions and limitations // under the License. use arrow::{ - array::{self, Array, ArrayRef, BinaryViewArray, StructArray}, - compute::CastOptions, + array::{ + self, Array, ArrayRef, BinaryViewArray, GenericListArray, StructArray, + UInt32Array, + }, + compute::{take, CastOptions}, datatypes::Field, error::Result, }; @@ -102,12 +105,56 @@ pub(crate) fn follow_shredded_path_element<'a>( Ok(ShreddedPathStep::Success(field.shredding_state())) } - VariantPathElement::Index { .. } => { + VariantPathElement::Index { index } => { // TODO: Support array indexing. Among other things, it will require slicing not // only the array we have here, but also the corresponding metadata and null masks. - Err(ArrowError::NotYetImplemented( - "Pathing into shredded variant array index".into(), - )) + let Some(list_array) = typed_value.as_any().downcast_ref::>()// <- shouldn't be just i64 + else { + // Downcast failure - if strict cast options are enabled, this should be an error + if !cast_options.safe { + return Err(ArrowError::CastError(format!( + "Cannot access index '{}' on non-list type: {}", + index, + typed_value.data_type() + ))); + } + // With safe cast options, return NULL (missing_path_step) + return Ok(missing_path_step()); + }; + + let offsets = list_array.offsets(); + let list_len = list_array.len(); // number of lists + let values = list_array.values(); // This is a StructArray + + let Some(struct_array) = values.as_any().downcast_ref::() else { + return Ok(missing_path_step()); + }; + + let Some(field_array) = struct_array.column_by_name("typed_value") else { + return Ok(missing_path_step()); + }; + + // Build the list of indices to take + let mut take_indices = Vec::with_capacity(list_len); + for i in 0..list_len { + let start = offsets[i] as usize; + let end = offsets[i + 1] as usize; + let len = end - start; + + if *index < len { + take_indices.push(Some((start + index) as u32)); + } else { + take_indices.push(None); + } + } + + let index_array = UInt32Array::from(take_indices); + + // Use Arrow compute kernel to gather elements + let taken = take(field_array, &index_array, None)?; + + let state = ShreddingState::try_new(None, Some(Arc::new(taken)))?; + Ok(ShreddedPathStep::Success(&state)) } } } @@ -304,10 +351,9 @@ mod test { use crate::{json_to_variant, VariantValueArrayBuilder}; use arrow::array::{ - Array, ArrayRef, BinaryViewArray, Float16Array, Float32Array, - Float64Array, GenericListArray, Int16Array, Int32Array, Int64Array, - Int8Array, StringArray, StructArray, UInt16Array, - UInt32Array, UInt64Array, UInt8Array, + Array, ArrayRef, BinaryViewArray, Float16Array, Float32Array, Float64Array, + GenericListArray, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray, + UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow::buffer::{NullBuffer, OffsetBuffer}; use arrow::compute::CastOptions; @@ -1343,7 +1389,7 @@ mod test { struct_array.data_type().clone(), true, )), - OffsetBuffer::from_lengths([2,2]), + OffsetBuffer::from_lengths([2, 2]), Arc::new(struct_array), None, ); @@ -1411,7 +1457,7 @@ mod test { // Wrap the x field struct in a ShreddedVariantFieldArray let x_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(x_field_struct)) .expect("should create ShreddedVariantFieldArray"); - + // Create the main typed_value as a struct containing the "x" field let typed_value_fields = Fields::from(vec![Field::new( "x", From c0d20653716ee67b2e57f056328079d9524ea772 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 25 Sep 2025 17:09:30 -0400 Subject: [PATCH 6/7] Pass one test --- parquet-variant-compute/src/variant_get.rs | 68 ++++++++++++---------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 003ea69b9066..2fd5dc09fb25 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -15,18 +15,15 @@ // specific language governing permissions and limitations // under the License. use arrow::{ - array::{ - self, Array, ArrayRef, BinaryViewArray, GenericListArray, StructArray, - UInt32Array, - }, + array::{self, Array, ArrayRef, BinaryViewArray, GenericListArray, StructArray, UInt32Array}, compute::{take, CastOptions}, datatypes::Field, error::Result, }; use arrow_schema::{ArrowError, DataType, FieldRef}; -use parquet_variant::{VariantPath, VariantPathElement}; +use parquet_variant::{VariantPath, VariantPathElement, EMPTY_VARIANT_METADATA_BYTES}; -use crate::variant_array::ShreddingState; +use crate::variant_array::{ShreddingState, StructArrayBuilder}; use crate::variant_to_arrow::make_variant_to_arrow_row_builder; use crate::VariantArray; @@ -106,7 +103,7 @@ pub(crate) fn follow_shredded_path_element( VariantPathElement::Index { index } => { // TODO: Support array indexing. Among other things, it will require slicing not // only the array we have here, but also the corresponding metadata and null masks. - let Some(list_array) = typed_value.as_any().downcast_ref::>()// <- shouldn't be just i64 + let Some(list_array) = typed_value.as_any().downcast_ref::>() else { // Downcast failure - if strict cast options are enabled, this should be an error if !cast_options.safe { @@ -121,20 +118,15 @@ pub(crate) fn follow_shredded_path_element( }; let offsets = list_array.offsets(); - let list_len = list_array.len(); // number of lists let values = list_array.values(); // This is a StructArray let Some(struct_array) = values.as_any().downcast_ref::() else { return Ok(missing_path_step()); }; - let Some(field_array) = struct_array.column_by_name("typed_value") else { - return Ok(missing_path_step()); - }; - // Build the list of indices to take - let mut take_indices = Vec::with_capacity(list_len); - for i in 0..list_len { + let mut take_indices = Vec::with_capacity(list_array.len()); + for i in 0..list_array.len() { let start = offsets[i] as usize; let end = offsets[i + 1] as usize; let len = end - start; @@ -149,10 +141,28 @@ pub(crate) fn follow_shredded_path_element( let index_array = UInt32Array::from(take_indices); // Use Arrow compute kernel to gather elements - let taken = take(field_array, &index_array, None)?; - - let state = ShreddingState::try_new(None, Some(Arc::new(taken)))?; - Ok(ShreddedPathStep::Success(&state)) + let taken = take(struct_array, &index_array, None)?; + + let typed_array = taken + .as_any() + .downcast_ref::() + .unwrap() + .column_by_name("typed_value") + .unwrap() + .clone(); + + let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n( + EMPTY_VARIANT_METADATA_BYTES, + taken.len(), + )); + + let struct_array = &StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata_array), false) + .with_field("typed_value", typed_array, true) + .build(); + + // let state = ShreddingState::new(None, Some(Arc::new(taken))); + Ok(ShreddedPathStep::Success(struct_array.into())) } } } @@ -346,8 +356,9 @@ mod test { use crate::{json_to_variant, VariantValueArrayBuilder}; use arrow::array::{ Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, FixedSizeBinaryArray, - Float16Array, GenericListArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, - StringArray, StructArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + Float16Array, Float32Array, Float64Array, GenericListArray, Int16Array, Int32Array, + Int64Array, Int8Array, StringArray, StructArray, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, }; use arrow::buffer::{NullBuffer, OffsetBuffer}; use arrow::compute::CastOptions; @@ -1356,15 +1367,13 @@ mod test { /// like ["comedy", "drama"] and ["horror", 123] /// as VariantArray using variant_get. #[test] - fn test_shredded_list_field_access() { + fn test_shredded_list_index_access() { let array = shredded_list_variant_array(); - // Test: Extract the 0 index field as VariantArray first let options = GetOptions::new_with_path(VariantPath::from(0)); let result = variant_get(&array, options).unwrap(); - - let result_variant: &VariantArray = result.as_any().downcast_ref().unwrap(); - assert_eq!(result_variant.len(), 3); + let result_variant = VariantArray::try_new(&result).unwrap(); + assert_eq!(result_variant.len(), 2); // Row 0: expect 0 index = "comedy" assert_eq!(result_variant.value(0), Variant::from("comedy")); @@ -1375,16 +1384,13 @@ mod test { #[test] fn test_shredded_list_as_string() { let array = shredded_list_variant_array(); - // Test: Extract the 0 index values as StringArray (type conversion) let field = Field::new("typed_value", DataType::Utf8, false); let options = GetOptions::new_with_path(VariantPath::from(0)) .with_as_type(Some(FieldRef::from(field))); let result = variant_get(&array, options).unwrap(); - // Should get StringArray - let expected: ArrayRef = - Arc::new(StringArray::from(vec![Some("comedy"), None, Some("drama")])); + let expected: ArrayRef = Arc::new(StringArray::from(vec![Some("comedy"), Some("horror")])); assert_eq!(&result, &expected); } /// Helper function to create a shredded variant array representing lists @@ -1445,7 +1451,7 @@ mod test { .with_field("typed_value", Arc::new(typed_value_array), true) .build(); - Arc::new(VariantArray::try_new(Arc::new(main_struct)).expect("should create variant array")) + Arc::new(main_struct) } /// Helper function to create a shredded variant array representing objects /// @@ -1501,7 +1507,7 @@ mod test { // Wrap the x field struct in a ShreddedVariantFieldArray let x_field_shredded = ShreddedVariantFieldArray::try_new(&x_field_struct) .expect("should create ShreddedVariantFieldArray"); - + // Create the main typed_value as a struct containing the "x" field let typed_value_fields = Fields::from(vec![Field::new( "x", From 40b631181fba421d6a3e038abc16d8bb7a7d94ca Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 25 Sep 2025 19:11:07 -0400 Subject: [PATCH 7/7] Get typed values directly --- parquet-variant-compute/src/variant_get.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 136a2d515054..d664bb4e4707 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -124,6 +124,10 @@ pub(crate) fn follow_shredded_path_element( return Ok(missing_path_step()); }; + let Some(typed_array) = struct_array.column_by_name("typed_value") else { + return Ok(missing_path_step()) + }; + // Build the list of indices to take let mut take_indices = Vec::with_capacity(list_array.len()); for i in 0..list_array.len() { @@ -141,15 +145,7 @@ pub(crate) fn follow_shredded_path_element( let index_array = UInt32Array::from(take_indices); // Use Arrow compute kernel to gather elements - let taken = take(struct_array, &index_array, None)?; - - let typed_array = taken - .as_any() - .downcast_ref::() - .unwrap() - .column_by_name("typed_value") - .unwrap() - .clone(); + let taken = take(typed_array, &index_array, None)?; let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n( EMPTY_VARIANT_METADATA_BYTES, @@ -158,10 +154,9 @@ pub(crate) fn follow_shredded_path_element( let struct_array = &StructArrayBuilder::new() .with_field("metadata", Arc::new(metadata_array), false) - .with_field("typed_value", typed_array, true) + .with_field("typed_value", taken, true) .build(); - // let state = ShreddingState::new(None, Some(Arc::new(taken))); Ok(ShreddedPathStep::Success(struct_array.into())) } }