Skip to content

Commit 3126dad

Browse files
[Variant] Remove superflous validate call and rename methods (#7871)
# Rationale for this change I was investigating #7869, when I found we were performing deep validation in areas where we only want shallow validation For example: `try_get_impl` is aimed to perform shallow validation https://github.com/apache/arrow-rs/blob/13d79b35884bf1fb2b761dc8e70b39bb24ae6c6b/parquet-variant/src/variant/list.rs#L272-L280 However, `Variant::try_new_with_metadata` _will_ perform deep validation, which is undesired. https://github.com/apache/arrow-rs/blob/13d79b35884bf1fb2b761dc8e70b39bb24ae6c6b/parquet-variant/src/variant.rs#L322-L327 Also fallible versions like `try_get` and `iter_try` will call (1) `validate` through `try_get_impl` -> `Variant::try_new_with_metadata` and then (2) manually call `validate` again. This is also a bit undesired, but it doesn't hurt us perf-wise, since we set a flag to make sure the full validation is run only once. https://github.com/apache/arrow-rs/blob/13d79b35884bf1fb2b761dc8e70b39bb24ae6c6b/parquet-variant/src/variant/list.rs#L241-L249 I personally found the `_impl` convention a bit hard to reason about. From what I understand, `_impl` functions should only perform shallow validation. Here are my proposed name changes: - `iter_try` -> `try_iter` to follow other `try_..` methods - `_impl` -> `with_shallow_validation` to make it clear to the reader that this function does basic validation - `validate` -> `with_deep_validation`, the builder method will perform linear validation - `is_validated` -> `is_fully_validated`, both shallow and deep validation has been done
1 parent 213d3be commit 3126dad

File tree

6 files changed

+93
-78
lines changed

6 files changed

+93
-78
lines changed

parquet-variant/benches/variant_builder.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -441,7 +441,7 @@ fn bench_validation_validated_vs_unvalidated(c: &mut Criterion) {
441441

442442
b.iter(|| {
443443
for variant in &unvalidated {
444-
let validated = variant.clone().validate().unwrap();
444+
let validated = variant.clone().with_full_validation().unwrap();
445445
hint::black_box(validated);
446446
}
447447
})

parquet-variant/src/variant.rs

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
use std::ops::Deref;
2-
31
// Licensed to the Apache Software Foundation (ASF) under one
42
// or more contributor license agreements. See the NOTICE file
53
// distributed with this work for additional information
@@ -16,6 +14,7 @@ use std::ops::Deref;
1614
// KIND, either express or implied. See the License for the
1715
// specific language governing permissions and limitations
1816
// under the License.
17+
1918
pub use self::decimal::{VariantDecimal16, VariantDecimal4, VariantDecimal8};
2019
pub use self::list::VariantList;
2120
pub use self::metadata::VariantMetadata;
@@ -24,6 +23,7 @@ use crate::decoder::{
2423
self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType,
2524
};
2625
use crate::utils::{first_byte_from_slice, slice_from_slice};
26+
use std::ops::Deref;
2727

2828
use arrow_schema::ArrowError;
2929
use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc};
@@ -184,15 +184,15 @@ impl Deref for ShortString<'_> {
184184
/// Every instance of variant is either _valid_ or _invalid_. depending on whether the
185185
/// underlying bytes are a valid encoding of a variant value (see below).
186186
///
187-
/// Instances produced by [`Self::try_new`], [`Self::try_new_with_metadata`], or [`Self::validate`]
187+
/// Instances produced by [`Self::try_new`], [`Self::try_new_with_metadata`], or [`Self::with_full_validation`]
188188
/// are fully _validated_. They always contain _valid_ data, and infallible accesses such as
189189
/// iteration and indexing are panic-free. The validation cost is `O(m + v)` where `m` and
190190
/// `v` are the number of bytes in the metadata and value buffers, respectively.
191191
///
192192
/// Instances produced by [`Self::new`] and [`Self::new_with_metadata`] are _unvalidated_ and so
193193
/// they may contain either _valid_ or _invalid_ data. Infallible accesses to variant objects and
194194
/// arrays, such as iteration and indexing will panic if the underlying bytes are _invalid_, and
195-
/// fallible alternatives are provided as panic-free alternatives. [`Self::validate`] can also be
195+
/// fallible alternatives are provided as panic-free alternatives. [`Self::with_full_validation`] can also be
196196
/// used to _validate_ an _unvalidated_ instance, if desired.
197197
///
198198
/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
@@ -297,8 +297,10 @@ impl<'m, 'v> Variant<'m, 'v> {
297297
///
298298
/// [unvalidated]: Self#Validation
299299
pub fn new(metadata: &'m [u8], value: &'v [u8]) -> Self {
300-
let metadata = VariantMetadata::try_new_impl(metadata).expect("Invalid variant metadata");
301-
Self::try_new_with_metadata_impl(metadata, value).expect("Invalid variant data")
300+
let metadata = VariantMetadata::try_new_with_shallow_validation(metadata)
301+
.expect("Invalid variant metadata");
302+
Self::try_new_with_metadata_and_shallow_validation(metadata, value)
303+
.expect("Invalid variant data")
302304
}
303305

304306
/// Create a new variant with existing metadata.
@@ -323,18 +325,19 @@ impl<'m, 'v> Variant<'m, 'v> {
323325
metadata: VariantMetadata<'m>,
324326
value: &'v [u8],
325327
) -> Result<Self, ArrowError> {
326-
Self::try_new_with_metadata_impl(metadata, value)?.validate()
328+
Self::try_new_with_metadata_and_shallow_validation(metadata, value)?.with_full_validation()
327329
}
328330

329331
/// Similar to [`Self::try_new_with_metadata`], but [unvalidated].
330332
///
331333
/// [unvalidated]: Self#Validation
332334
pub fn new_with_metadata(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
333-
Self::try_new_with_metadata_impl(metadata, value).expect("Invalid variant")
335+
Self::try_new_with_metadata_and_shallow_validation(metadata, value)
336+
.expect("Invalid variant")
334337
}
335338

336339
// The actual constructor, which only performs shallow (constant-time) validation.
337-
fn try_new_with_metadata_impl(
340+
fn try_new_with_metadata_and_shallow_validation(
338341
metadata: VariantMetadata<'m>,
339342
value: &'v [u8],
340343
) -> Result<Self, ArrowError> {
@@ -382,21 +385,23 @@ impl<'m, 'v> Variant<'m, 'v> {
382385
VariantBasicType::ShortString => {
383386
Variant::ShortString(decoder::decode_short_string(value_metadata, value_data)?)
384387
}
385-
VariantBasicType::Object => {
386-
Variant::Object(VariantObject::try_new_impl(metadata, value)?)
387-
}
388-
VariantBasicType::Array => Variant::List(VariantList::try_new_impl(metadata, value)?),
388+
VariantBasicType::Object => Variant::Object(
389+
VariantObject::try_new_with_shallow_validation(metadata, value)?,
390+
),
391+
VariantBasicType::Array => Variant::List(VariantList::try_new_with_shallow_validation(
392+
metadata, value,
393+
)?),
389394
};
390395
Ok(new_self)
391396
}
392397

393398
/// True if this variant instance has already been [validated].
394399
///
395400
/// [validated]: Self#Validation
396-
pub fn is_validated(&self) -> bool {
401+
pub fn is_fully_validated(&self) -> bool {
397402
match self {
398-
Variant::List(list) => list.is_validated(),
399-
Variant::Object(obj) => obj.is_validated(),
403+
Variant::List(list) => list.is_fully_validated(),
404+
Variant::Object(obj) => obj.is_fully_validated(),
400405
_ => true,
401406
}
402407
}
@@ -407,16 +412,16 @@ impl<'m, 'v> Variant<'m, 'v> {
407412
/// Variant leaf values are always valid by construction, but [objects] and [arrays] can be
408413
/// constructed in unvalidated (and potentially invalid) state.
409414
///
410-
/// If [`Self::is_validated`] is true, validation is a no-op. Otherwise, the cost is `O(m + v)`
415+
/// If [`Self::is_fully_validated`] is true, validation is a no-op. Otherwise, the cost is `O(m + v)`
411416
/// where `m` and `v` are the sizes of metadata and value buffers, respectively.
412417
///
413418
/// [objects]: VariantObject#Validation
414419
/// [arrays]: VariantList#Validation
415-
pub fn validate(self) -> Result<Self, ArrowError> {
420+
pub fn with_full_validation(self) -> Result<Self, ArrowError> {
416421
use Variant::*;
417422
match self {
418-
List(list) => list.validate().map(List),
419-
Object(obj) => obj.validate().map(Object),
423+
List(list) => list.with_full_validation().map(List),
424+
Object(obj) => obj.with_full_validation().map(Object),
420425
_ => Ok(self),
421426
}
422427
}

parquet-variant/src/variant/list.rs

Lines changed: 33 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -82,14 +82,14 @@ impl VariantListHeader {
8282
/// Every instance of variant list is either _valid_ or _invalid_. depending on whether the
8383
/// underlying bytes are a valid encoding of a variant array (see below).
8484
///
85-
/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully _validated_. They always
85+
/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
8686
/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
8787
/// validation cost is linear in the number of underlying bytes.
8888
///
8989
/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
9090
/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
9191
/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
92-
/// provided as panic-free alternatives. [`Self::validate`] can also be used to _validate_ an
92+
/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
9393
/// _unvalidated_ instance, if desired.
9494
///
9595
/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
@@ -136,18 +136,18 @@ impl<'m, 'v> VariantList<'m, 'v> {
136136
/// This constructor verifies that `value` points to a valid variant array value. In particular,
137137
/// that all offsets are in-bounds and point to valid (recursively validated) objects.
138138
pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result<Self, ArrowError> {
139-
Self::try_new_impl(metadata, value)?.validate()
139+
Self::try_new_with_shallow_validation(metadata, value)?.with_full_validation()
140140
}
141141

142142
pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
143-
Self::try_new_impl(metadata, value).expect("Invalid variant list value")
143+
Self::try_new_with_shallow_validation(metadata, value).expect("Invalid variant list value")
144144
}
145145

146146
/// Attempts to interpet `metadata` and `value` as a variant array, performing only basic
147147
/// (constant-cost) [validation].
148148
///
149149
/// [validation]: Self#Validation
150-
pub(crate) fn try_new_impl(
150+
pub(crate) fn try_new_with_shallow_validation(
151151
metadata: VariantMetadata<'m>,
152152
value: &'v [u8],
153153
) -> Result<Self, ArrowError> {
@@ -196,18 +196,18 @@ impl<'m, 'v> VariantList<'m, 'v> {
196196
/// True if this instance is fully [validated] for panic-free infallible accesses.
197197
///
198198
/// [validated]: Self#Validation
199-
pub fn is_validated(&self) -> bool {
199+
pub fn is_fully_validated(&self) -> bool {
200200
self.validated
201201
}
202202

203203
/// Performs a full [validation] of this variant array and returns the result.
204204
///
205205
/// [validation]: Self#Validation
206-
pub fn validate(mut self) -> Result<Self, ArrowError> {
206+
pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
207207
if !self.validated {
208208
// Validate the metadata dictionary first, if not already validated, because we pass it
209209
// by value to all the children (who would otherwise re-validate it repeatedly).
210-
self.metadata = self.metadata.validate()?;
210+
self.metadata = self.metadata.with_full_validation()?;
211211

212212
// Iterate over all string keys in this dictionary in order to prove that the offset
213213
// array is valid, all offsets are in bounds, and all string bytes are valid utf-8.
@@ -232,52 +232,55 @@ impl<'m, 'v> VariantList<'m, 'v> {
232232
/// [invalid]: Self#Validation
233233
pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> {
234234
(index < self.num_elements).then(|| {
235-
self.try_get_impl(index)
236-
.and_then(Variant::validate)
235+
self.try_get_with_shallow_validation(index)
237236
.expect("Invalid variant array element")
238237
})
239238
}
240239

241240
/// Fallible version of `get`. Returns element by index, capturing validation errors
242241
pub fn try_get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
243-
self.try_get_impl(index)?.validate()
242+
self.try_get_with_shallow_validation(index)?
243+
.with_full_validation()
244244
}
245245

246-
/// Fallible iteration over the elements of this list.
247-
pub fn iter_try(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ {
248-
self.iter_try_impl().map(|result| result?.validate())
249-
}
250-
251-
// Fallible iteration that only performs basic (constant-time) validation.
252-
fn iter_try_impl(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ {
253-
(0..self.len()).map(move |i| self.try_get_impl(i))
246+
// Fallible version of `get`, performing only basic (constant-time) validation.
247+
fn try_get_with_shallow_validation(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
248+
// Fetch the value bytes between the two offsets for this index, from the value array region
249+
// of the byte buffer
250+
let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?;
251+
let value_bytes =
252+
slice_from_slice_at_offset(self.value, self.first_value_byte, byte_range)?;
253+
Variant::try_new_with_metadata_and_shallow_validation(self.metadata, value_bytes)
254254
}
255255

256256
/// Iterates over the values of this list. When working with [unvalidated] input, consider
257257
/// [`Self::iter_try`] to avoid panics due to invalid data.
258258
///
259259
/// [unvalidated]: Self#Validation
260260
pub fn iter(&self) -> impl Iterator<Item = Variant<'m, 'v>> + '_ {
261-
self.iter_try_impl()
261+
self.iter_try_with_shallow_validation()
262262
.map(|result| result.expect("Invalid variant list entry"))
263263
}
264264

265+
/// Fallible iteration over the elements of this list.
266+
pub fn iter_try(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ {
267+
self.iter_try_with_shallow_validation()
268+
.map(|result| result?.with_full_validation())
269+
}
270+
271+
// Fallible iteration that only performs basic (constant-time) validation.
272+
fn iter_try_with_shallow_validation(
273+
&self,
274+
) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ {
275+
(0..self.len()).map(move |i| self.try_get_with_shallow_validation(i))
276+
}
277+
265278
// Attempts to retrieve the ith offset from the offset array region of the byte buffer.
266279
fn get_offset(&self, index: usize) -> Result<usize, ArrowError> {
267280
let byte_range = self.header.first_offset_byte()..self.first_value_byte;
268281
let offset_bytes = slice_from_slice(self.value, byte_range)?;
269282
self.header.offset_size.unpack_usize(offset_bytes, index)
270283
}
271-
272-
// Fallible version of `get`, performing only basic (constant-time) validation.
273-
fn try_get_impl(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
274-
// Fetch the value bytes between the two offsets for this index, from the value array region
275-
// of the byte buffer
276-
let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?;
277-
let value_bytes =
278-
slice_from_slice_at_offset(self.value, self.first_value_byte, byte_range)?;
279-
Variant::try_new_with_metadata(self.metadata, value_bytes)
280-
}
281284
}
282285

283286
#[cfg(test)]

parquet-variant/src/variant/metadata.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -93,14 +93,14 @@ impl VariantMetadataHeader {
9393
/// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the
9494
/// underlying bytes are a valid encoding of variant metadata (see below).
9595
///
96-
/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully _validated_. They always
96+
/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
9797
/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
9898
/// validation cost is linear in the number of underlying bytes.
9999
///
100100
/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
101101
/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
102102
/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
103-
/// provided as panic-free alternatives. [`Self::validate`] can also be used to _validate_ an
103+
/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
104104
/// _unvalidated_ instance, if desired.
105105
///
106106
/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
@@ -143,7 +143,7 @@ impl<'m> VariantMetadata<'m> {
143143
///
144144
/// [validation]: Self#Validation
145145
pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
146-
Self::try_new_impl(bytes)?.validate()
146+
Self::try_new_with_shallow_validation(bytes)?.with_full_validation()
147147
}
148148

149149
/// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary
@@ -157,11 +157,11 @@ impl<'m> VariantMetadata<'m> {
157157
///
158158
/// [validate]: Self#Validation
159159
pub fn new(bytes: &'m [u8]) -> Self {
160-
Self::try_new_impl(bytes).expect("Invalid variant metadata")
160+
Self::try_new_with_shallow_validation(bytes).expect("Invalid variant metadata")
161161
}
162162

163163
// The actual constructor, which performs only basic (constant-const) validation.
164-
pub(crate) fn try_new_impl(bytes: &'m [u8]) -> Result<Self, ArrowError> {
164+
pub(crate) fn try_new_with_shallow_validation(bytes: &'m [u8]) -> Result<Self, ArrowError> {
165165
let header_byte = first_byte_from_slice(bytes)?;
166166
let header = VariantMetadataHeader::try_new(header_byte)?;
167167

@@ -219,14 +219,14 @@ impl<'m> VariantMetadata<'m> {
219219
/// True if this instance is fully [validated] for panic-free infallible accesses.
220220
///
221221
/// [validated]: Self#Validation
222-
pub fn is_validated(&self) -> bool {
222+
pub fn is_fully_validated(&self) -> bool {
223223
self.validated
224224
}
225225

226226
/// Performs a full [validation] of this metadata dictionary and returns the result.
227227
///
228228
/// [validation]: Self#Validation
229-
pub fn validate(mut self) -> Result<Self, ArrowError> {
229+
pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
230230
if !self.validated {
231231
// Iterate over all string keys in this dictionary in order to prove that the offset
232232
// array is valid, all offsets are in bounds, and all string bytes are valid utf-8.

0 commit comments

Comments
 (0)