-
Notifications
You must be signed in to change notification settings - Fork 1k
[Variant] Allow VariantArray::value
to work with owned value bytes
#8430
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -28,7 +28,9 @@ use arrow::datatypes::{ | |||||
use arrow_schema::extension::ExtensionType; | ||||||
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields}; | ||||||
use parquet_variant::Uuid; | ||||||
use parquet_variant::Variant; | ||||||
use parquet_variant::{Variant, VariantList, VariantMetadata, VariantObject}; | ||||||
|
||||||
use std::borrow::Cow; | ||||||
use std::sync::Arc; | ||||||
|
||||||
/// Arrow Variant [`ExtensionType`]. | ||||||
|
@@ -72,6 +74,131 @@ impl ExtensionType for VariantType { | |||||
} | ||||||
} | ||||||
|
||||||
/// A [`Cow`]-like representation of a [`Variant`] value returned by [`VariantArray::value`], which | ||||||
/// may use owned or borrowed value bytes depending on how the underlying variant was shredded. We | ||||||
/// cannot "just" use [`Cow`] because of the special lifetime management that [`Variant`] requires. | ||||||
pub enum VariantArrayValue<'m, 'v> { | ||||||
Borrowed(Variant<'m, 'v>), | ||||||
Owned { | ||||||
metadata: VariantMetadata<'m>, | ||||||
value_bytes: Vec<u8>, | ||||||
}, | ||||||
} | ||||||
|
||||||
impl<'m, 'v> VariantArrayValue<'m, 'v> { | ||||||
/// Creates a new instance that borrows from a normal [`Variant`] value. | ||||||
pub fn borrowed(value: Variant<'m, 'v>) -> Self { | ||||||
Self::Borrowed(value) | ||||||
} | ||||||
|
||||||
/// Creates a new instance that wraps owned bytes that can be converted to a [`Variant`] value. | ||||||
pub fn owned(metadata_bytes: &'m [u8], value_bytes: Vec<u8>) -> Self { | ||||||
Self::Owned { | ||||||
metadata: VariantMetadata::new(metadata_bytes), | ||||||
value_bytes, | ||||||
} | ||||||
} | ||||||
|
||||||
/// Consumes this variant value, passing the result to a `visitor` function. | ||||||
/// | ||||||
/// The visitor idiom is helpful because a variant value based on owned bytes cannot outlive | ||||||
/// self. | ||||||
pub fn consume<R>(self, visitor: impl FnOnce(Variant<'_, '_>) -> R) -> R { | ||||||
match self { | ||||||
VariantArrayValue::Borrowed(v) => visitor(v), | ||||||
VariantArrayValue::Owned { | ||||||
metadata, | ||||||
value_bytes, | ||||||
} => visitor(Variant::new_with_metadata(metadata, &value_bytes)), | ||||||
} | ||||||
} | ||||||
|
||||||
// internal helper for when we don't want to pay the extra clone | ||||||
fn as_variant_cow(&self) -> Cow<'_, Variant<'m, '_>> { | ||||||
match self { | ||||||
VariantArrayValue::Borrowed(v) => Cow::Borrowed(v), | ||||||
VariantArrayValue::Owned { | ||||||
metadata, | ||||||
value_bytes, | ||||||
} => Cow::Owned(Variant::new_with_metadata(metadata.clone(), value_bytes)), | ||||||
} | ||||||
} | ||||||
|
||||||
/// Returns a [`Variant`] instance for this value. | ||||||
pub fn as_variant(&self) -> Variant<'m, '_> { | ||||||
self.as_variant_cow().into_owned() | ||||||
} | ||||||
|
||||||
/// Returns the variant metadata that backs this value. | ||||||
pub fn metadata(&self) -> &VariantMetadata<'m> { | ||||||
match self { | ||||||
VariantArrayValue::Borrowed(v) => v.metadata(), | ||||||
VariantArrayValue::Owned { metadata, .. } => metadata, | ||||||
} | ||||||
} | ||||||
|
||||||
/// Extracts the underlying [`VariantObject`], if this is a variant object. | ||||||
/// | ||||||
/// See also [`Variant::as_object`]. | ||||||
pub fn as_object(&self) -> Option<VariantObject<'m, '_>> { | ||||||
self.as_variant_cow().as_object().cloned() | ||||||
} | ||||||
|
||||||
/// Extracts the underlying [`VariantList`], if this is a variant array. | ||||||
/// | ||||||
/// See also [`Variant::as_list`]. | ||||||
pub fn as_list(&self) -> Option<VariantList<'m, '_>> { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These two as_xxx methods are not as efficient as their There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we add a note in the docs clarifying that this is test-only? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean, if we think it's important we could mark them |
||||||
self.as_variant_cow().as_list().cloned() | ||||||
} | ||||||
|
||||||
/// Extracts the value of the named variant object field, if this is a variant object. | ||||||
/// | ||||||
/// See also [`Variant::get_object_field`]. | ||||||
pub fn get_object_field<'s>(&'s self, field_name: &str) -> Option<Variant<'m, 's>> { | ||||||
self.as_variant_cow().get_object_field(field_name) | ||||||
} | ||||||
|
||||||
/// Extracts the value of the variant array element at `index`, if this is a variant object. | ||||||
/// | ||||||
/// See also [`Variant::get_list_element`]. | ||||||
pub fn get_list_element(&self, index: usize) -> Option<Variant<'m, '_>> { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||||||
self.as_variant_cow().get_list_element(index) | ||||||
} | ||||||
} | ||||||
|
||||||
impl<'m, 'v> From<Variant<'m, 'v>> for VariantArrayValue<'m, 'v> { | ||||||
fn from(value: Variant<'m, 'v>) -> Self { | ||||||
Self::borrowed(value) | ||||||
} | ||||||
} | ||||||
|
||||||
// By providing PartialEq for all three combinations, we avoid changing a lot of unit test code that | ||||||
// relies on comparisons. | ||||||
impl PartialEq for VariantArrayValue<'_, '_> { | ||||||
fn eq(&self, other: &VariantArrayValue<'_, '_>) -> bool { | ||||||
self.as_variant_cow().as_ref() == other.as_variant_cow().as_ref() | ||||||
} | ||||||
} | ||||||
|
||||||
impl PartialEq<Variant<'_, '_>> for VariantArrayValue<'_, '_> { | ||||||
fn eq(&self, other: &Variant<'_, '_>) -> bool { | ||||||
self.as_variant_cow().as_ref() == other | ||||||
} | ||||||
} | ||||||
|
||||||
impl PartialEq<VariantArrayValue<'_, '_>> for Variant<'_, '_> { | ||||||
fn eq(&self, other: &VariantArrayValue<'_, '_>) -> bool { | ||||||
self == other.as_variant_cow().as_ref() | ||||||
} | ||||||
} | ||||||
|
||||||
// Make it transparent -- looks just like the underlying value it proxies | ||||||
impl std::fmt::Debug for VariantArrayValue<'_, '_> { | ||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||||||
self.as_variant_cow().as_ref().fmt(f) | ||||||
} | ||||||
} | ||||||
|
||||||
/// An array of Parquet [`Variant`] values | ||||||
/// | ||||||
/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying | ||||||
|
@@ -352,39 +479,24 @@ impl VariantArray { | |||||
/// | ||||||
/// Note: Does not do deep validation of the [`Variant`], so it is up to the | ||||||
/// caller to ensure that the metadata and value were constructed correctly. | ||||||
pub fn value(&self, index: usize) -> Variant<'_, '_> { | ||||||
match &self.shredding_state { | ||||||
ShreddingState::Unshredded { value, .. } => { | ||||||
// Unshredded case | ||||||
Variant::new(self.metadata.value(index), value.value(index)) | ||||||
pub fn value(&self, index: usize) -> VariantArrayValue<'_, '_> { | ||||||
scovich marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
let value = match &self.shredding_state { | ||||||
// Always prefer to use the typed_value, if present | ||||||
ShreddingState::Typed { typed_value, .. } | ||||||
| ShreddingState::PartiallyShredded { typed_value, .. } | ||||||
if typed_value.is_valid(index) => | ||||||
{ | ||||||
return typed_value_to_variant(typed_value, index); | ||||||
|
return typed_value_to_variant(typed_value, index); | |
typed_value_to_variant(typed_value, index) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the return
is needed due to differing return type...
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These were some weird uses of |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This method can be implemented directly, without resorting to the cost of calling
self.as_variant_cow()