From fc1dd47c4ad2af8d473c49de6b391b6b59015856 Mon Sep 17 00:00:00 2001 From: Christian Thiel Date: Wed, 21 May 2025 20:53:54 +0200 Subject: [PATCH 1/3] feat: Declarative TableMetadata Builder --- crates/iceberg/src/spec/table_metadata.rs | 112 +++++++++++++++++++++- 1 file changed, 109 insertions(+), 3 deletions(-) diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index fcd510920..92b379187 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -101,7 +101,10 @@ pub const RESERVED_PROPERTIES: [&str; 9] = [ /// Reference to [`TableMetadata`]. pub type TableMetadataRef = Arc; -#[derive(Debug, PartialEq, Deserialize, Eq, Clone)] +#[derive(Debug, PartialEq, Deserialize, Eq, Clone, typed_builder::TypedBuilder)] +#[builder(builder_method(name=declarative_builder))] +#[builder(builder_type(name=TableMetadataDeclarativeBuilder, doc="Build a new [`TableMetadata`] in a declarative way. For imperative operations (e.g. `add_snapshot`) and creating new TableMetadata, use [`TableMetadataBuilder`] instead."))] +#[builder(build_method(into = UnnormalizedTableMetadata))] #[serde(try_from = "TableMetadataEnum")] /// Fields for the version 2 of the table metadata. /// @@ -121,12 +124,15 @@ pub struct TableMetadata { /// An integer; the highest assigned column ID for the table. pub(crate) last_column_id: i32, /// A list of schemas, stored as objects with schema-id. + #[builder(setter(transform = |schemas: Vec| schemas.into_iter().map(|s| (s.schema_id(), s.into())).collect()))] pub(crate) schemas: HashMap, /// ID of the table’s current schema. pub(crate) current_schema_id: i32, /// A list of partition specs, stored as full partition spec objects. + #[builder(setter(transform = |specs: Vec| specs.into_iter().map(|s| (s.spec_id(), s.into())).collect()))] pub(crate) partition_specs: HashMap, /// ID of the “current” spec that writers should use by default. + #[builder(setter(into))] pub(crate) default_spec: PartitionSpecRef, /// Partition type of the default partition spec. pub(crate) default_partition_type: StructType, @@ -134,7 +140,7 @@ pub struct TableMetadata { pub(crate) last_partition_id: i32, ///A string to string map of table properties. This is used to control settings that /// affect reading and writing and is not intended to be used for arbitrary metadata. - /// For example, commit.retry.num-retries is used to control the number of commit retries. + /// For example, commit.retry.num-retries is used to control the number of commit retries pub(crate) properties: HashMap, /// long ID of the current table snapshot; must be the same as the current /// ID of the main branch in refs. @@ -143,6 +149,9 @@ pub struct TableMetadata { /// data files exist in the file system. A data file must not be deleted /// from the file system until the last snapshot in which it was listed is /// garbage collected. + #[builder(setter(transform = |snapshots: Vec| { + snapshots.into_iter().map(|s| (s.snapshot_id(), s.into())).collect() + }))] pub(crate) snapshots: HashMap, /// A list (optional) of timestamp and snapshot ID pairs that encodes changes /// to the current snapshot for the table. Each time the current-snapshot-id @@ -161,6 +170,9 @@ pub struct TableMetadata { pub(crate) metadata_log: Vec, /// A list of sort orders, stored as full sort order objects. + #[builder(setter(transform = |sort_orders: Vec| { + sort_orders.into_iter().map(|s| (s.order_id, s.into())).collect() + }))] pub(crate) sort_orders: HashMap, /// Default sort order id of the table. Note that this could be used by /// writers, but is not used when reading because reads use the specs @@ -172,10 +184,18 @@ pub struct TableMetadata { /// even if the refs map is null. pub(crate) refs: HashMap, /// Mapping of snapshot ids to statistics files. + #[builder(default, setter(transform = |stats: Vec| { + stats.into_iter().map(|s| (s.snapshot_id, s)).collect() + }))] pub(crate) statistics: HashMap, /// Mapping of snapshot ids to partition statistics files. + #[builder(default)] + #[builder(setter(transform = |stats: Vec| { + stats.into_iter().map(|s| (s.snapshot_id, s)).collect() + }))] pub(crate) partition_statistics: HashMap, /// Encryption Keys + #[builder(default)] pub(crate) encryption_keys: HashMap, } @@ -654,6 +674,33 @@ impl TableMetadata { } } +/// Unnormalized table metadata, used as an intermediate type +/// to build table metadata in a declarative way. +#[derive(Debug, PartialEq, Deserialize, Eq, Clone)] +pub struct UnnormalizedTableMetadata(TableMetadata); + +impl UnnormalizedTableMetadata { + /// Try to normalize the table metadata. + pub fn try_normalize(self) -> Result { + let mut metadata = self.0; + metadata.try_normalize()?; + Ok(metadata) + } +} + +impl UnnormalizedTableMetadata { + /// Build a new [`UnnormalizedTableMetadata`] using the [`TableMetadataDeclarativeBuilder`]. + pub fn builder() -> TableMetadataDeclarativeBuilder { + TableMetadata::declarative_builder() + } +} + +impl From for UnnormalizedTableMetadata { + fn from(value: TableMetadata) -> Self { + UnnormalizedTableMetadata(value) + } +} + pub(super) mod _serde { use std::borrow::BorrowMut; /// This is a helper module that defines types to help with serialization/deserialization. @@ -1331,7 +1378,8 @@ mod tests { use crate::spec::{ BlobMetadata, NestedField, NullOrder, Operation, PartitionSpec, PartitionStatisticsFile, PrimitiveType, Schema, Snapshot, SnapshotReference, SnapshotRetention, SortDirection, - SortField, SortOrder, StatisticsFile, Summary, Transform, Type, UnboundPartitionField, + SortField, SortOrder, StatisticsFile, StructType, Summary, Transform, Type, + UnboundPartitionField, }; fn check_table_metadata_serde(json: &str, expected_type: TableMetadata) { @@ -3018,4 +3066,62 @@ mod tests { )]) ); } + + #[test] + fn test_build_declarative_table_metadata() { + let table_metadata = TableMetadata::declarative_builder() + .format_version(FormatVersion::V2) + .table_uuid(Uuid::nil()) + .location("s3://db/table".to_string()) + .last_sequence_number(1) + .last_updated_ms(1515100955770) + .last_column_id(0) + .schemas(vec![Arc::new(Schema::builder().build().unwrap())]) + .current_schema_id(0) + .partition_specs(vec![Arc::new(PartitionSpec::unpartition_spec())]) + .sort_orders(vec![Arc::new(SortOrder::unsorted_order())]) + .default_spec(PartitionSpec::unpartition_spec()) + .default_sort_order_id(0) + .last_partition_id(0) + .default_partition_type(StructType::new(vec![])) + .properties(HashMap::new()) + .snapshots(vec![]) + .current_snapshot_id(None) + .snapshot_log(vec![]) + .metadata_log(vec![]) + .refs(HashMap::new()) + .build() + .try_normalize() + .unwrap(); + assert_eq!(table_metadata.format_version, FormatVersion::V2); + assert_eq!(table_metadata.table_uuid, Uuid::nil()); + assert_eq!(table_metadata.location, "s3://db/table"); + assert_eq!(table_metadata.last_sequence_number, 1); + assert_eq!(table_metadata.last_updated_ms, 1515100955770); + assert_eq!(table_metadata.last_column_id, 0); + assert_eq!(table_metadata.schemas.len(), 1); + assert_eq!( + *table_metadata.schemas.values().next().unwrap(), + Arc::new(Schema::builder().build().unwrap()) + ); + assert_eq!(table_metadata.current_schema_id, 0); + assert_eq!(table_metadata.partition_specs.len(), 1); + assert_eq!( + table_metadata.partition_specs.get(&0), + Some(&Arc::new(PartitionSpec::unpartition_spec())) + ); + assert_eq!(table_metadata.sort_orders.len(), 1); + assert_eq!(table_metadata.default_sort_order_id, 0); + assert_eq!(table_metadata.last_partition_id, 0); + assert_eq!(table_metadata.properties.len(), 0); + assert_eq!(table_metadata.snapshots.len(), 0); + assert_eq!(table_metadata.current_snapshot_id, None); + assert_eq!(table_metadata.snapshot_log.len(), 0); + assert_eq!(table_metadata.metadata_log.len(), 0); + assert_eq!(table_metadata.refs.len(), 0); + assert_eq!(table_metadata.encryption_keys.len(), 0); + assert_eq!(table_metadata.statistics.len(), 0); + assert_eq!(table_metadata.partition_statistics.len(), 0); + assert_eq!(table_metadata.encryption_keys.len(), 0); + } } From f7c3bab81fbec2d1106ca2b65efd2a99b0accf87 Mon Sep 17 00:00:00 2001 From: Christian Thiel Date: Wed, 21 May 2025 21:09:54 +0200 Subject: [PATCH 2/3] clippy --- crates/iceberg/src/spec/table_metadata.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 92b379187..0bf9d3297 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -124,12 +124,12 @@ pub struct TableMetadata { /// An integer; the highest assigned column ID for the table. pub(crate) last_column_id: i32, /// A list of schemas, stored as objects with schema-id. - #[builder(setter(transform = |schemas: Vec| schemas.into_iter().map(|s| (s.schema_id(), s.into())).collect()))] + #[builder(setter(transform = |schemas: Vec| schemas.into_iter().map(|s| (s.schema_id(), s)).collect()))] pub(crate) schemas: HashMap, /// ID of the table’s current schema. pub(crate) current_schema_id: i32, /// A list of partition specs, stored as full partition spec objects. - #[builder(setter(transform = |specs: Vec| specs.into_iter().map(|s| (s.spec_id(), s.into())).collect()))] + #[builder(setter(transform = |specs: Vec| specs.into_iter().map(|s| (s.spec_id(), s)).collect()))] pub(crate) partition_specs: HashMap, /// ID of the “current” spec that writers should use by default. #[builder(setter(into))] @@ -149,9 +149,7 @@ pub struct TableMetadata { /// data files exist in the file system. A data file must not be deleted /// from the file system until the last snapshot in which it was listed is /// garbage collected. - #[builder(setter(transform = |snapshots: Vec| { - snapshots.into_iter().map(|s| (s.snapshot_id(), s.into())).collect() - }))] + #[builder(setter(transform = |snapshots: Vec| snapshots.into_iter().map(|s| (s.snapshot_id(), s)).collect()))] pub(crate) snapshots: HashMap, /// A list (optional) of timestamp and snapshot ID pairs that encodes changes /// to the current snapshot for the table. Each time the current-snapshot-id @@ -170,9 +168,7 @@ pub struct TableMetadata { pub(crate) metadata_log: Vec, /// A list of sort orders, stored as full sort order objects. - #[builder(setter(transform = |sort_orders: Vec| { - sort_orders.into_iter().map(|s| (s.order_id, s.into())).collect() - }))] + #[builder(setter(transform = |sort_orders: Vec| sort_orders.into_iter().map(|s| (s.order_id, s)).collect()))] pub(crate) sort_orders: HashMap, /// Default sort order id of the table. Note that this could be used by /// writers, but is not used when reading because reads use the specs From 0fa8aa195e8805aec50ac83c9eeaefeff1145c40 Mon Sep 17 00:00:00 2001 From: Christian Thiel Date: Wed, 11 Jun 2025 22:20:23 +0200 Subject: [PATCH 3/3] address comments --- crates/iceberg/src/spec/table_metadata.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 0bf9d3297..4337a5d09 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -28,6 +28,7 @@ use _serde::TableMetadataEnum; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use serde_repr::{Deserialize_repr, Serialize_repr}; +use typed_builder::TypedBuilder; use uuid::Uuid; use super::snapshot::SnapshotReference; @@ -101,7 +102,7 @@ pub const RESERVED_PROPERTIES: [&str; 9] = [ /// Reference to [`TableMetadata`]. pub type TableMetadataRef = Arc; -#[derive(Debug, PartialEq, Deserialize, Eq, Clone, typed_builder::TypedBuilder)] +#[derive(Debug, PartialEq, Deserialize, Eq, Clone, TypedBuilder)] #[builder(builder_method(name=declarative_builder))] #[builder(builder_type(name=TableMetadataDeclarativeBuilder, doc="Build a new [`TableMetadata`] in a declarative way. For imperative operations (e.g. `add_snapshot`) and creating new TableMetadata, use [`TableMetadataBuilder`] instead."))] #[builder(build_method(into = UnnormalizedTableMetadata))] @@ -140,16 +141,18 @@ pub struct TableMetadata { pub(crate) last_partition_id: i32, ///A string to string map of table properties. This is used to control settings that /// affect reading and writing and is not intended to be used for arbitrary metadata. - /// For example, commit.retry.num-retries is used to control the number of commit retries + /// For example, commit.retry.num-retries is used to control the number of commit retries. + #[builder(default)] pub(crate) properties: HashMap, /// long ID of the current table snapshot; must be the same as the current /// ID of the main branch in refs. + #[builder(default)] pub(crate) current_snapshot_id: Option, ///A list of valid snapshots. Valid snapshots are snapshots for which all /// data files exist in the file system. A data file must not be deleted /// from the file system until the last snapshot in which it was listed is /// garbage collected. - #[builder(setter(transform = |snapshots: Vec| snapshots.into_iter().map(|s| (s.snapshot_id(), s)).collect()))] + #[builder(default, setter(transform = |snapshots: Vec| snapshots.into_iter().map(|s| (s.snapshot_id(), s)).collect()))] pub(crate) snapshots: HashMap, /// A list (optional) of timestamp and snapshot ID pairs that encodes changes /// to the current snapshot for the table. Each time the current-snapshot-id @@ -157,6 +160,7 @@ pub struct TableMetadata { /// and the new current-snapshot-id. When snapshots are expired from /// the list of valid snapshots, all entries before a snapshot that has /// expired should be removed. + #[builder(default)] pub(crate) snapshot_log: Vec, /// A list (optional) of timestamp and metadata file location pairs @@ -165,6 +169,7 @@ pub struct TableMetadata { /// previous metadata file location should be added to the list. /// Tables can be configured to remove oldest metadata log entries and /// keep a fixed-size log of the most recent entries after a commit. + #[builder(default)] pub(crate) metadata_log: Vec, /// A list of sort orders, stored as full sort order objects. @@ -185,8 +190,7 @@ pub struct TableMetadata { }))] pub(crate) statistics: HashMap, /// Mapping of snapshot ids to partition statistics files. - #[builder(default)] - #[builder(setter(transform = |stats: Vec| { + #[builder(default, setter(transform = |stats: Vec| { stats.into_iter().map(|s| (s.snapshot_id, s)).collect() }))] pub(crate) partition_statistics: HashMap, @@ -670,6 +674,8 @@ impl TableMetadata { } } +impl TableMetadataDeclarativeBuilder {} + /// Unnormalized table metadata, used as an intermediate type /// to build table metadata in a declarative way. #[derive(Debug, PartialEq, Deserialize, Eq, Clone)]