-
Notifications
You must be signed in to change notification settings - Fork 281
feat: Declarative TableMetadata Builder #1362
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,7 @@ use _serde::TableMetadataEnum; | |
use chrono::{DateTime, Utc}; | ||
use serde::{Deserialize, Serialize}; | ||
use serde_repr::{Deserialize_repr, Serialize_repr}; | ||
use typed_builder::TypedBuilder; | ||
use uuid::Uuid; | ||
|
||
use super::snapshot::SnapshotReference; | ||
|
@@ -101,7 +102,10 @@ pub const RESERVED_PROPERTIES: [&str; 9] = [ | |
/// Reference to [`TableMetadata`]. | ||
pub type TableMetadataRef = Arc<TableMetadata>; | ||
|
||
#[derive(Debug, PartialEq, Deserialize, Eq, Clone)] | ||
#[derive(Debug, PartialEq, Deserialize, Eq, Clone, TypedBuilder)] | ||
#[builder(builder_method(name=declarative_builder))] | ||
#[builder(builder_type(name=TableMetadataDeclarativeBuilder, doc="Build a new [`TableMetadata`] in a declarative way. For imperative operations (e.g. `add_snapshot`) and creating new TableMetadata, use [`TableMetadataBuilder`] instead."))] | ||
#[builder(build_method(into = UnnormalizedTableMetadata))] | ||
#[serde(try_from = "TableMetadataEnum")] | ||
/// Fields for the version 2 of the table metadata. | ||
/// | ||
|
@@ -121,12 +125,15 @@ pub struct TableMetadata { | |
/// An integer; the highest assigned column ID for the table. | ||
pub(crate) last_column_id: i32, | ||
/// A list of schemas, stored as objects with schema-id. | ||
#[builder(setter(transform = |schemas: Vec<SchemaRef>| schemas.into_iter().map(|s| (s.schema_id(), s)).collect()))] | ||
pub(crate) schemas: HashMap<i32, SchemaRef>, | ||
/// ID of the table’s current schema. | ||
pub(crate) current_schema_id: i32, | ||
/// A list of partition specs, stored as full partition spec objects. | ||
#[builder(setter(transform = |specs: Vec<PartitionSpecRef>| specs.into_iter().map(|s| (s.spec_id(), s)).collect()))] | ||
pub(crate) partition_specs: HashMap<i32, PartitionSpecRef>, | ||
/// ID of the “current” spec that writers should use by default. | ||
#[builder(setter(into))] | ||
pub(crate) default_spec: PartitionSpecRef, | ||
/// Partition type of the default partition spec. | ||
pub(crate) default_partition_type: StructType, | ||
|
@@ -135,21 +142,25 @@ pub struct TableMetadata { | |
///A string to string map of table properties. This is used to control settings that | ||
/// affect reading and writing and is not intended to be used for arbitrary metadata. | ||
/// For example, commit.retry.num-retries is used to control the number of commit retries. | ||
#[builder(default)] | ||
pub(crate) properties: HashMap<String, String>, | ||
/// long ID of the current table snapshot; must be the same as the current | ||
/// ID of the main branch in refs. | ||
#[builder(default)] | ||
pub(crate) current_snapshot_id: Option<i64>, | ||
///A list of valid snapshots. Valid snapshots are snapshots for which all | ||
/// data files exist in the file system. A data file must not be deleted | ||
/// from the file system until the last snapshot in which it was listed is | ||
/// garbage collected. | ||
#[builder(default, setter(transform = |snapshots: Vec<SnapshotRef>| snapshots.into_iter().map(|s| (s.snapshot_id(), s)).collect()))] | ||
pub(crate) snapshots: HashMap<i64, SnapshotRef>, | ||
/// A list (optional) of timestamp and snapshot ID pairs that encodes changes | ||
/// to the current snapshot for the table. Each time the current-snapshot-id | ||
/// is changed, a new entry should be added with the last-updated-ms | ||
/// and the new current-snapshot-id. When snapshots are expired from | ||
/// the list of valid snapshots, all entries before a snapshot that has | ||
/// expired should be removed. | ||
#[builder(default)] | ||
pub(crate) snapshot_log: Vec<SnapshotLog>, | ||
|
||
/// A list (optional) of timestamp and metadata file location pairs | ||
|
@@ -158,9 +169,11 @@ pub struct TableMetadata { | |
/// previous metadata file location should be added to the list. | ||
/// Tables can be configured to remove oldest metadata log entries and | ||
/// keep a fixed-size log of the most recent entries after a commit. | ||
#[builder(default)] | ||
pub(crate) metadata_log: Vec<MetadataLog>, | ||
|
||
/// A list of sort orders, stored as full sort order objects. | ||
#[builder(setter(transform = |sort_orders: Vec<SortOrderRef>| sort_orders.into_iter().map(|s| (s.order_id, s)).collect()))] | ||
pub(crate) sort_orders: HashMap<i64, SortOrderRef>, | ||
/// Default sort order id of the table. Note that this could be used by | ||
/// writers, but is not used when reading because reads use the specs | ||
|
@@ -172,10 +185,17 @@ pub struct TableMetadata { | |
/// even if the refs map is null. | ||
pub(crate) refs: HashMap<String, SnapshotReference>, | ||
/// Mapping of snapshot ids to statistics files. | ||
#[builder(default, setter(transform = |stats: Vec<StatisticsFile>| { | ||
stats.into_iter().map(|s| (s.snapshot_id, s)).collect() | ||
Comment on lines
+188
to
+189
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why also default? How is this different from There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Its not - I missed the default for |
||
}))] | ||
pub(crate) statistics: HashMap<i64, StatisticsFile>, | ||
/// Mapping of snapshot ids to partition statistics files. | ||
#[builder(default, setter(transform = |stats: Vec<PartitionStatisticsFile>| { | ||
stats.into_iter().map(|s| (s.snapshot_id, s)).collect() | ||
}))] | ||
pub(crate) partition_statistics: HashMap<i64, PartitionStatisticsFile>, | ||
/// Encryption Keys | ||
#[builder(default)] | ||
pub(crate) encryption_keys: HashMap<String, String>, | ||
} | ||
|
||
|
@@ -654,6 +674,35 @@ impl TableMetadata { | |
} | ||
} | ||
|
||
impl TableMetadataDeclarativeBuilder {} | ||
|
||
/// Unnormalized table metadata, used as an intermediate type | ||
/// to build table metadata in a declarative way. | ||
#[derive(Debug, PartialEq, Deserialize, Eq, Clone)] | ||
pub struct UnnormalizedTableMetadata(TableMetadata); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't get why we need this. Why not build TableMeadata from a declarative builder directly?🤔 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mainly due to my point 1) in this comment: #1362 (comment) I don't think there should be any constructor that allows to create unvalidated / potentially corrupt |
||
|
||
impl UnnormalizedTableMetadata { | ||
/// Try to normalize the table metadata. | ||
pub fn try_normalize(self) -> Result<TableMetadata> { | ||
let mut metadata = self.0; | ||
metadata.try_normalize()?; | ||
Ok(metadata) | ||
} | ||
} | ||
|
||
impl UnnormalizedTableMetadata { | ||
/// Build a new [`UnnormalizedTableMetadata`] using the [`TableMetadataDeclarativeBuilder`]. | ||
pub fn builder() -> TableMetadataDeclarativeBuilder { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. unused? remove? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a library and the constructor can be used outside of the crate. I like to attach some constructor to a type if available. |
||
TableMetadata::declarative_builder() | ||
} | ||
} | ||
|
||
impl From<TableMetadata> for UnnormalizedTableMetadata { | ||
fn from(value: TableMetadata) -> Self { | ||
UnnormalizedTableMetadata(value) | ||
} | ||
} | ||
|
||
pub(super) mod _serde { | ||
use std::borrow::BorrowMut; | ||
/// This is a helper module that defines types to help with serialization/deserialization. | ||
|
@@ -1331,7 +1380,8 @@ mod tests { | |
use crate::spec::{ | ||
BlobMetadata, NestedField, NullOrder, Operation, PartitionSpec, PartitionStatisticsFile, | ||
PrimitiveType, Schema, Snapshot, SnapshotReference, SnapshotRetention, SortDirection, | ||
SortField, SortOrder, StatisticsFile, Summary, Transform, Type, UnboundPartitionField, | ||
SortField, SortOrder, StatisticsFile, StructType, Summary, Transform, Type, | ||
UnboundPartitionField, | ||
}; | ||
|
||
fn check_table_metadata_serde(json: &str, expected_type: TableMetadata) { | ||
|
@@ -3018,4 +3068,62 @@ mod tests { | |
)]) | ||
); | ||
} | ||
|
||
#[test] | ||
fn test_build_declarative_table_metadata() { | ||
let table_metadata = TableMetadata::declarative_builder() | ||
.format_version(FormatVersion::V2) | ||
.table_uuid(Uuid::nil()) | ||
.location("s3://db/table".to_string()) | ||
.last_sequence_number(1) | ||
.last_updated_ms(1515100955770) | ||
.last_column_id(0) | ||
.schemas(vec![Arc::new(Schema::builder().build().unwrap())]) | ||
.current_schema_id(0) | ||
.partition_specs(vec![Arc::new(PartitionSpec::unpartition_spec())]) | ||
.sort_orders(vec![Arc::new(SortOrder::unsorted_order())]) | ||
.default_spec(PartitionSpec::unpartition_spec()) | ||
.default_sort_order_id(0) | ||
.last_partition_id(0) | ||
.default_partition_type(StructType::new(vec![])) | ||
.properties(HashMap::new()) | ||
.snapshots(vec![]) | ||
.current_snapshot_id(None) | ||
.snapshot_log(vec![]) | ||
.metadata_log(vec![]) | ||
.refs(HashMap::new()) | ||
.build() | ||
.try_normalize() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the test for the declarative builder would be better to avoid unnecessary calls. Do we need OTOH it's somewhat weird that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would prefer this design as well, however I can't find an easy way to implement a custom build method without implementing it on the very complex generic type that is derived by the macro (see end of this message) and is hard to keep in sync.
I see the following options:
If there are any other ideas here or if I missed something, let me know! Here is the type: Note that default and non-default fields are treated differently: #[allow(dead_code, non_camel_case_types, missing_docs)]
#[automatically_derived]
impl<
__properties: ::typed_builder::Optional<HashMap<String, String>>,
__current_snapshot_id: ::typed_builder::Optional<Option<i64>>,
__snapshots: ::typed_builder::Optional<HashMap<i64, SnapshotRef>>,
__snapshot_log: ::typed_builder::Optional<Vec<SnapshotLog>>,
__metadata_log: ::typed_builder::Optional<Vec<MetadataLog>>,
__statistics: ::typed_builder::Optional<HashMap<i64, StatisticsFile>>,
__partition_statistics: ::typed_builder::Optional<HashMap<i64, PartitionStatisticsFile>>,
__encryption_keys: ::typed_builder::Optional<HashMap<String, String>>,
>
TableMetadataDeclarativeBuilder<(
(FormatVersion,),
(Uuid,),
(String,),
(i64,),
(i64,),
(i32,),
(HashMap<i32, SchemaRef>,),
(i32,),
(HashMap<i32, PartitionSpecRef>,),
(PartitionSpecRef,),
(StructType,),
(i32,),
__properties,
__current_snapshot_id,
__snapshots,
__snapshot_log,
__metadata_log,
(HashMap<i64, SortOrderRef>,),
(i64,),
(HashMap<String, SnapshotReference>,),
__statistics,
__partition_statistics,
__encryption_keys,
)> There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Looks there is draft design for this problem: idanarye/rust-typed-builder#95 . Another option maybe we can try to make the progress of feature. 😄 |
||
.unwrap(); | ||
assert_eq!(table_metadata.format_version, FormatVersion::V2); | ||
assert_eq!(table_metadata.table_uuid, Uuid::nil()); | ||
assert_eq!(table_metadata.location, "s3://db/table"); | ||
assert_eq!(table_metadata.last_sequence_number, 1); | ||
assert_eq!(table_metadata.last_updated_ms, 1515100955770); | ||
assert_eq!(table_metadata.last_column_id, 0); | ||
assert_eq!(table_metadata.schemas.len(), 1); | ||
assert_eq!( | ||
*table_metadata.schemas.values().next().unwrap(), | ||
Arc::new(Schema::builder().build().unwrap()) | ||
); | ||
assert_eq!(table_metadata.current_schema_id, 0); | ||
assert_eq!(table_metadata.partition_specs.len(), 1); | ||
assert_eq!( | ||
table_metadata.partition_specs.get(&0), | ||
Some(&Arc::new(PartitionSpec::unpartition_spec())) | ||
); | ||
assert_eq!(table_metadata.sort_orders.len(), 1); | ||
assert_eq!(table_metadata.default_sort_order_id, 0); | ||
assert_eq!(table_metadata.last_partition_id, 0); | ||
assert_eq!(table_metadata.properties.len(), 0); | ||
assert_eq!(table_metadata.snapshots.len(), 0); | ||
assert_eq!(table_metadata.current_snapshot_id, None); | ||
assert_eq!(table_metadata.snapshot_log.len(), 0); | ||
assert_eq!(table_metadata.metadata_log.len(), 0); | ||
assert_eq!(table_metadata.refs.len(), 0); | ||
assert_eq!(table_metadata.encryption_keys.len(), 0); | ||
assert_eq!(table_metadata.statistics.len(), 0); | ||
assert_eq!(table_metadata.partition_statistics.len(), 0); | ||
assert_eq!(table_metadata.encryption_keys.len(), 0); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not convinced "declarative" vs "imperative" is the defining distinction.
The
TableMetadataBuilder
is not "a builder pattern for TableMetadata". It's a facade to perform changes on the TableMetadata to be invoked by higher layer. For that reason, it includes implicit changes that are supposed to be carried on when a metadata change happens, such as updating last_updated_ms when a new snapshot is addediceberg-rust/crates/iceberg/src/spec/table_metadata_builder.rs
Line 394 in 3ea6f47
The use-case described in the PR description is "the" standard builder pattern. To me, this is the defining difference.
I suggest changing the doc string here and align the type names.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I absolutely agree with this from a rust perspective. From an iceberg perspective other languages use the world
Builder
for this functionality unfortunately. I am uncertain if renaming theTableMetadataBuilder
to a clearer name is worth the breaking change today.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree with @findepi about the naming. To me this is more like a constructor, but since I'm not a native English speaker, I don't have a good suggestion to the name. But it would be lovely if we have a better name.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To be honest, I feel the
TypedBuilder
is a little hard to maintain in such a large struct, do you mind to create a standalone struct for this?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't quite understand this suggestion - isn't the purpose of this PR to create
TableMetadata
? How would implementing it on another struct help?Regarding names, @liurenjie1024 would you be OK with the breaking change that Piotrs proposal implies? Renaming the existing
TableMetadataBuilder
to something else, then naming the new builderTableMetadataBuilder
(which does something completely different and is used by far less people?)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think @liurenjie1024 and @findepi you mean different things.
@liurenjie1024 you just would like to rename
TableMetadataDeclarativeBuilder
toTableMetadataConstructor
- correct? I am onboard with that.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. Feel free to suggest anything more intuitive.
I mean not using
TypedBuilder
to generate code automatically, rather than manually maintaining the builder/construct data structure. This pr contains many advanced usage ofTypeBuilder
, and as a reviewer I find it's sth how difficult to catch up all the usages. I also worry about the maintainceness of using too much auto generated code.