Skip to content

Commit b4bc6dd

Browse files
authored
feat: Add deletion vector related fields in spec types (#1276)
1 parent 7a5ad1f commit b4bc6dd

File tree

9 files changed

+335
-110
lines changed

9 files changed

+335
-110
lines changed

crates/iceberg/src/expr/visitors/expression_evaluator.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,10 @@ mod tests {
347347
equality_ids: vec![],
348348
sort_order_id: None,
349349
partition_spec_id: 0,
350+
first_row_id: None,
351+
referenced_data_file: None,
352+
content_offset: None,
353+
content_size_in_bytes: None,
350354
}
351355
}
352356

@@ -371,6 +375,10 @@ mod tests {
371375
equality_ids: vec![],
372376
sort_order_id: None,
373377
partition_spec_id: 0,
378+
first_row_id: None,
379+
referenced_data_file: None,
380+
content_offset: None,
381+
content_size_in_bytes: None,
374382
}
375383
}
376384

crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1997,6 +1997,10 @@ mod test {
19971997
equality_ids: vec![],
19981998
sort_order_id: None,
19991999
partition_spec_id: 0,
2000+
first_row_id: None,
2001+
referenced_data_file: None,
2002+
content_offset: None,
2003+
content_size_in_bytes: None,
20002004
}
20012005
}
20022006

@@ -2019,6 +2023,10 @@ mod test {
20192023
equality_ids: vec![],
20202024
sort_order_id: None,
20212025
partition_spec_id: 0,
2026+
first_row_id: None,
2027+
referenced_data_file: None,
2028+
content_offset: None,
2029+
content_size_in_bytes: None,
20222030
}
20232031
}
20242032

@@ -2077,6 +2085,10 @@ mod test {
20772085
equality_ids: vec![],
20782086
sort_order_id: None,
20792087
partition_spec_id: 0,
2088+
first_row_id: None,
2089+
referenced_data_file: None,
2090+
content_offset: None,
2091+
content_size_in_bytes: None,
20802092
}
20812093
}
20822094
fn get_test_file_2() -> DataFile {
@@ -2104,6 +2116,10 @@ mod test {
21042116
equality_ids: vec![],
21052117
sort_order_id: None,
21062118
partition_spec_id: 0,
2119+
first_row_id: None,
2120+
referenced_data_file: None,
2121+
content_offset: None,
2122+
content_size_in_bytes: None,
21072123
}
21082124
}
21092125

@@ -2132,6 +2148,10 @@ mod test {
21322148
equality_ids: vec![],
21332149
sort_order_id: None,
21342150
partition_spec_id: 0,
2151+
first_row_id: None,
2152+
referenced_data_file: None,
2153+
content_offset: None,
2154+
content_size_in_bytes: None,
21352155
}
21362156
}
21372157

@@ -2160,6 +2180,10 @@ mod test {
21602180
equality_ids: vec![],
21612181
sort_order_id: None,
21622182
partition_spec_id: 0,
2183+
first_row_id: None,
2184+
referenced_data_file: None,
2185+
content_offset: None,
2186+
content_size_in_bytes: None,
21632187
}
21642188
}
21652189
}

crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,10 @@ mod test {
582582
equality_ids: vec![],
583583
sort_order_id: None,
584584
partition_spec_id: 0,
585+
first_row_id: None,
586+
referenced_data_file: None,
587+
content_offset: None,
588+
content_size_in_bytes: None,
585589
}
586590
}
587591

@@ -604,6 +608,10 @@ mod test {
604608
equality_ids: vec![],
605609
sort_order_id: None,
606610
partition_spec_id: 0,
611+
first_row_id: None,
612+
referenced_data_file: None,
613+
content_offset: None,
614+
content_size_in_bytes: None,
607615
}
608616
}
609617

@@ -626,6 +634,10 @@ mod test {
626634
equality_ids: vec![],
627635
sort_order_id: None,
628636
partition_spec_id: 0,
637+
first_row_id: None,
638+
referenced_data_file: None,
639+
content_offset: None,
640+
content_size_in_bytes: None,
629641
}
630642
}
631643

@@ -649,6 +661,10 @@ mod test {
649661
equality_ids: vec![],
650662
sort_order_id: None,
651663
partition_spec_id: 0,
664+
first_row_id: None,
665+
referenced_data_file: None,
666+
content_offset: None,
667+
content_size_in_bytes: None,
652668
}
653669
}
654670

crates/iceberg/src/spec/manifest/_serde.rs

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,10 @@ pub(super) struct DataFileSerde {
119119
#[serde(default)]
120120
equality_ids: Option<Vec<i32>>,
121121
sort_order_id: Option<i32>,
122+
first_row_id: Option<i64>,
123+
referenced_data_file: Option<String>,
124+
content_offset: Option<i64>,
125+
content_size_in_bytes: Option<i64>,
122126
}
123127

124128
impl DataFileSerde {
@@ -149,6 +153,10 @@ impl DataFileSerde {
149153
split_offsets: Some(value.split_offsets),
150154
equality_ids: Some(value.equality_ids),
151155
sort_order_id: value.sort_order_id,
156+
first_row_id: value.first_row_id,
157+
referenced_data_file: value.referenced_data_file,
158+
content_offset: value.content_offset,
159+
content_size_in_bytes: value.content_size_in_bytes,
152160
})
153161
}
154162

@@ -215,6 +223,10 @@ impl DataFileSerde {
215223
equality_ids: self.equality_ids.unwrap_or_default(),
216224
sort_order_id: self.sort_order_id,
217225
partition_spec_id,
226+
first_row_id: self.first_row_id,
227+
referenced_data_file: self.referenced_data_file,
228+
content_offset: self.content_offset,
229+
content_size_in_bytes: self.content_size_in_bytes,
218230
})
219231
}
220232
}
@@ -359,7 +371,11 @@ mod tests {
359371
split_offsets: vec![4],
360372
equality_ids: vec![],
361373
sort_order_id: Some(0),
362-
partition_spec_id: 0
374+
partition_spec_id: 0,
375+
first_row_id: None,
376+
referenced_data_file: None,
377+
content_offset: None,
378+
content_size_in_bytes: None,
363379
}];
364380

365381
let mut buffer = Vec::new();

crates/iceberg/src/spec/manifest/data_file.rs

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ pub struct DataFile {
4343
pub(crate) file_path: String,
4444
/// field id: 101
4545
///
46-
/// String file format name, avro, orc or parquet
46+
/// String file format name, `avro`, `orc`, `parquet`, or `puffin`
4747
pub(crate) file_format: DataFileFormat,
4848
/// field id: 102
4949
///
@@ -52,7 +52,7 @@ pub struct DataFile {
5252
pub(crate) partition: Struct,
5353
/// field id: 103
5454
///
55-
/// Number of records in this file
55+
/// Number of records in this file, or the cardinality of a deletion vector
5656
pub(crate) record_count: u64,
5757
/// field id: 104
5858
///
@@ -148,9 +148,35 @@ pub struct DataFile {
148148
/// delete files.
149149
#[builder(default, setter(strip_option))]
150150
pub(crate) sort_order_id: Option<i32>,
151+
/// field id: 142
152+
///
153+
/// The _row_id for the first row in the data file.
154+
/// For more details, refer to https://github.com/apache/iceberg/blob/main/format/spec.md#first-row-id-inheritance
155+
#[builder(default)]
156+
pub(crate) first_row_id: Option<i64>,
151157
/// This field is not included in spec. It is just store in memory representation used
152158
/// in process.
153159
pub(crate) partition_spec_id: i32,
160+
/// field id: 143
161+
///
162+
/// Fully qualified location (URI with FS scheme) of a data file that all deletes reference.
163+
/// Position delete metadata can use `referenced_data_file` when all deletes tracked by the
164+
/// entry are in a single data file. Setting the referenced file is required for deletion vectors.
165+
#[builder(default)]
166+
pub(crate) referenced_data_file: Option<String>,
167+
/// field: 144
168+
///
169+
/// The offset in the file where the content starts.
170+
/// The `content_offset` and `content_size_in_bytes` fields are used to reference a specific blob
171+
/// for direct access to a deletion vector. For deletion vectors, these values are required and must
172+
/// exactly match the `offset` and `length` stored in the Puffin footer for the deletion vector blob.
173+
#[builder(default)]
174+
pub(crate) content_offset: Option<i64>,
175+
/// field: 145
176+
///
177+
/// The length of a referenced content stored in the file; required if `content_offset` is present
178+
#[builder(default)]
179+
pub(crate) content_size_in_bytes: Option<i64>,
154180
}
155181

156182
impl DataFile {
@@ -226,6 +252,10 @@ impl DataFile {
226252
pub fn equality_ids(&self) -> &[i32] {
227253
&self.equality_ids
228254
}
255+
/// Get the first row id in the data file.
256+
pub fn first_row_id(&self) -> Option<i64> {
257+
self.first_row_id
258+
}
229259
/// Get the sort order id of the data file.
230260
/// Only data files and equality delete files should be
231261
/// written with a non-null order id. Position deletes are required to be
@@ -235,6 +265,21 @@ impl DataFile {
235265
pub fn sort_order_id(&self) -> Option<i32> {
236266
self.sort_order_id
237267
}
268+
/// Get the fully qualified referenced location for the corresponding data file.
269+
/// Positional delete files could have the field set, and deletion vectors must the field set.
270+
pub fn referenced_data_file(&self) -> Option<String> {
271+
self.referenced_data_file.clone()
272+
}
273+
/// Get the offset in the file where the blob content starts.
274+
/// Only meaningful for puffin blobs, and required for deletion vectors.
275+
pub fn content_offset(&self) -> Option<i64> {
276+
self.content_offset
277+
}
278+
/// Get the length of a puffin blob.
279+
/// Only meaningful for puffin blobs, and required for deletion vectors.
280+
pub fn content_size_in_bytes(&self) -> Option<i64> {
281+
self.content_size_in_bytes
282+
}
238283
}
239284

240285
/// Convert data files to avro bytes and write to writer.
@@ -323,6 +368,8 @@ pub enum DataFileFormat {
323368
Orc,
324369
/// Parquet file format: <https://parquet.apache.org/>
325370
Parquet,
371+
/// Puffin file format: <https://iceberg.apache.org/puffin-spec/>
372+
Puffin,
326373
}
327374

328375
impl FromStr for DataFileFormat {
@@ -333,6 +380,7 @@ impl FromStr for DataFileFormat {
333380
"avro" => Ok(Self::Avro),
334381
"orc" => Ok(Self::Orc),
335382
"parquet" => Ok(Self::Parquet),
383+
"puffin" => Ok(Self::Puffin),
336384
_ => Err(Error::new(
337385
ErrorKind::DataInvalid,
338386
format!("Unsupported data file format: {}", s),
@@ -347,6 +395,7 @@ impl std::fmt::Display for DataFileFormat {
347395
DataFileFormat::Avro => write!(f, "avro"),
348396
DataFileFormat::Orc => write!(f, "orc"),
349397
DataFileFormat::Parquet => write!(f, "parquet"),
398+
DataFileFormat::Puffin => write!(f, "puffin"),
350399
}
351400
}
352401
}

crates/iceberg/src/spec/manifest/entry.rs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,46 @@ static SORT_ORDER_ID: Lazy<NestedFieldRef> = {
469469
})
470470
};
471471

472+
static FIRST_ROW_ID: Lazy<NestedFieldRef> = {
473+
Lazy::new(|| {
474+
Arc::new(NestedField::optional(
475+
142,
476+
"first_row_id",
477+
Type::Primitive(PrimitiveType::Long),
478+
))
479+
})
480+
};
481+
482+
static REFERENCE_DATA_FILE: Lazy<NestedFieldRef> = {
483+
Lazy::new(|| {
484+
Arc::new(NestedField::optional(
485+
143,
486+
"referenced_data_file",
487+
Type::Primitive(PrimitiveType::String),
488+
))
489+
})
490+
};
491+
492+
static CONTENT_OFFSET: Lazy<NestedFieldRef> = {
493+
Lazy::new(|| {
494+
Arc::new(NestedField::optional(
495+
144,
496+
"content_offset",
497+
Type::Primitive(PrimitiveType::Long),
498+
))
499+
})
500+
};
501+
502+
static CONTENT_SIZE_IN_BYTES: Lazy<NestedFieldRef> = {
503+
Lazy::new(|| {
504+
Arc::new(NestedField::optional(
505+
145,
506+
"content_size_in_bytes",
507+
Type::Primitive(PrimitiveType::Long),
508+
))
509+
})
510+
};
511+
472512
fn data_file_fields_v2(partition_type: &StructType) -> Vec<NestedFieldRef> {
473513
vec![
474514
CONTENT.clone(),
@@ -491,6 +531,10 @@ fn data_file_fields_v2(partition_type: &StructType) -> Vec<NestedFieldRef> {
491531
SPLIT_OFFSETS.clone(),
492532
EQUALITY_IDS.clone(),
493533
SORT_ORDER_ID.clone(),
534+
FIRST_ROW_ID.clone(),
535+
REFERENCE_DATA_FILE.clone(),
536+
CONTENT_OFFSET.clone(),
537+
CONTENT_SIZE_IN_BYTES.clone(),
494538
]
495539
}
496540

0 commit comments

Comments
 (0)