From 3a7c9c80724182c03e1b4cc5e1d411dfd434e107 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 7 Jun 2025 07:30:42 -0400 Subject: [PATCH] Unify Metadata Handing: use `FieldMetadata` in `Expr::Alias` and `ExprSchemable` --- datafusion/core/tests/dataframe/mod.rs | 3 +- .../user_defined_scalar_functions.rs | 3 +- datafusion/expr/src/expr.rs | 57 +++++++++++++------ datafusion/expr/src/expr_schema.rs | 31 +++++----- datafusion/physical-expr/src/planner.rs | 1 - datafusion/proto/src/logical_plan/to_proto.rs | 5 +- docs/source/library-user-guide/upgrading.md | 29 ++++++++++ 7 files changed, 92 insertions(+), 37 deletions(-) diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index f198907cf5a6..7716c549bb0a 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -70,7 +70,7 @@ use datafusion_common::{ use datafusion_common_runtime::SpawnedTask; use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnv; -use datafusion_expr::expr::{GroupingSet, Sort, WindowFunction}; +use datafusion_expr::expr::{FieldMetadata, GroupingSet, Sort, WindowFunction}; use datafusion_expr::var_provider::{VarProvider, VarType}; use datafusion_expr::{ cast, col, create_udf, exists, in_subquery, lit, out_ref_col, placeholder, @@ -5674,6 +5674,7 @@ async fn test_alias() -> Result<()> { async fn test_alias_with_metadata() -> Result<()> { let mut metadata = HashMap::new(); metadata.insert(String::from("k"), String::from("v")); + let metadata = FieldMetadata::from(metadata); let df = create_test_table("test") .await? .select(vec![col("a").alias_with_metadata("b", Some(metadata))])? diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs index d7dd65deab5f..a3968f9efd31 100644 --- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs @@ -1533,10 +1533,11 @@ async fn test_metadata_based_udf_with_literal() -> Result<()> { [("modify_values".to_string(), "double_output".to_string())] .into_iter() .collect(); + let input_metadata = FieldMetadata::from(input_metadata); let df = ctx.sql("select 0;").await?.select(vec![ lit(5u64).alias_with_metadata("lit_with_doubling", Some(input_metadata.clone())), lit(5u64).alias("lit_no_doubling"), - lit_with_metadata(5u64, Some(FieldMetadata::from(input_metadata))) + lit_with_metadata(5u64, Some(input_metadata)) .alias("lit_with_double_no_alias_metadata"), ])?; diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index f41f09fc0afa..97f83305dcbe 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -517,6 +517,9 @@ impl FieldMetadata { /// Adds metadata from `other` into `self`, overwriting any existing keys. pub fn extend(&mut self, other: Self) { + if other.is_empty() { + return; + } let other = Arc::unwrap_or_clone(other.into_inner()); Arc::make_mut(&mut self.inner).extend(other); } @@ -531,18 +534,21 @@ impl FieldMetadata { self.inner.len() } + /// Convert this `FieldMetadata` into a `HashMap` + pub fn to_hashmap(&self) -> std::collections::HashMap { + self.inner + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect() + } + /// Updates the metadata on the Field with this metadata, if it is not empty. pub fn add_to_field(&self, field: Field) -> Field { if self.inner.is_empty() { return field; } - field.with_metadata( - self.inner - .iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(), - ) + field.with_metadata(self.to_hashmap()) } } @@ -575,6 +581,24 @@ impl From<&std::collections::HashMap> for FieldMetadata { } } +/// From hashbrown map +impl From> for FieldMetadata { + fn from(map: HashMap) -> Self { + let inner = map.into_iter().collect(); + Self::new(inner) + } +} + +impl From<&HashMap> for FieldMetadata { + fn from(map: &HashMap) -> Self { + let inner = map + .into_iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + Self::new(inner) + } +} + /// UNNEST expression. #[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] pub struct Unnest { @@ -601,7 +625,7 @@ pub struct Alias { pub expr: Box, pub relation: Option, pub name: String, - pub metadata: Option>, + pub metadata: Option, } impl Hash for Alias { @@ -641,10 +665,7 @@ impl Alias { } } - pub fn with_metadata( - mut self, - metadata: Option>, - ) -> Self { + pub fn with_metadata(mut self, metadata: Option) -> Self { self.metadata = metadata; self } @@ -1591,15 +1612,17 @@ impl Expr { /// # Example /// ``` /// # use datafusion_expr::col; - /// use std::collections::HashMap; + /// # use std::collections::HashMap; + /// # use datafusion_expr::expr::FieldMetadata; /// let metadata = HashMap::from([("key".to_string(), "value".to_string())]); + /// let metadata = FieldMetadata::from(metadata); /// let expr = col("foo").alias_with_metadata("bar", Some(metadata)); /// ``` /// pub fn alias_with_metadata( self, name: impl Into, - metadata: Option>, + metadata: Option, ) -> Expr { Expr::Alias(Alias::new(self, None::<&str>, name.into()).with_metadata(metadata)) } @@ -1621,8 +1644,10 @@ impl Expr { /// # Example /// ``` /// # use datafusion_expr::col; - /// use std::collections::HashMap; + /// # use std::collections::HashMap; + /// # use datafusion_expr::expr::FieldMetadata; /// let metadata = HashMap::from([("key".to_string(), "value".to_string())]); + /// let metadata = FieldMetadata::from(metadata); /// let expr = col("foo").alias_qualified_with_metadata(Some("tbl"), "bar", Some(metadata)); /// ``` /// @@ -1630,7 +1655,7 @@ impl Expr { self, relation: Option>, name: impl Into, - metadata: Option>, + metadata: Option, ) -> Expr { Expr::Alias(Alias::new(self, relation, name.into()).with_metadata(metadata)) } @@ -3819,7 +3844,7 @@ mod test { // If this test fails when you change `Expr`, please try // `Box`ing the fields to make `Expr` smaller // See https://github.com/apache/datafusion/issues/16199 for details - assert_eq!(size_of::(), 144); + assert_eq!(size_of::(), 128); assert_eq!(size_of::(), 64); assert_eq!(size_of::(), 24); // 3 ptrs assert_eq!(size_of::>(), 24); diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 5ff487303308..8ca479bb6f9b 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -17,8 +17,8 @@ use super::{Between, Expr, Like}; use crate::expr::{ - AggregateFunction, AggregateFunctionParams, Alias, BinaryExpr, Cast, InList, - InSubquery, Placeholder, ScalarFunction, TryCast, Unnest, WindowFunction, + AggregateFunction, AggregateFunctionParams, Alias, BinaryExpr, Cast, FieldMetadata, + InList, InSubquery, Placeholder, ScalarFunction, TryCast, Unnest, WindowFunction, WindowFunctionParams, }; use crate::type_coercion::functions::{ @@ -34,7 +34,6 @@ use datafusion_common::{ }; use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer; use datafusion_functions_window_common::field::WindowUDFFieldArgs; -use std::collections::HashMap; use std::sync::Arc; /// Trait to allow expr to typable with respect to a schema @@ -46,7 +45,7 @@ pub trait ExprSchemable { fn nullable(&self, input_schema: &dyn ExprSchema) -> Result; /// Given a schema, return the expr's optional metadata - fn metadata(&self, schema: &dyn ExprSchema) -> Result>; + fn metadata(&self, schema: &dyn ExprSchema) -> Result; /// Convert to a field with respect to a schema fn to_field( @@ -346,9 +345,9 @@ impl ExprSchemable for Expr { } } - fn metadata(&self, schema: &dyn ExprSchema) -> Result> { + fn metadata(&self, schema: &dyn ExprSchema) -> Result { self.to_field(schema) - .map(|(_, field)| field.metadata().clone()) + .map(|(_, field)| FieldMetadata::from(field.metadata())) } /// Returns the datatype and nullability of the expression based on [ExprSchema]. @@ -405,12 +404,10 @@ impl ExprSchemable for Expr { let mut combined_metadata = expr.metadata(schema)?; if let Some(metadata) = metadata { - if !metadata.is_empty() { - combined_metadata.extend(metadata.clone()); - } + combined_metadata.extend(metadata.clone()); } - Ok(Arc::new(field.with_metadata(combined_metadata))) + Ok(Arc::new(combined_metadata.add_to_field(field))) } Expr::Negative(expr) => expr.to_field(schema).map(|(_, f)| f), Expr::Column(c) => schema.field_from_column(c).map(|f| Arc::new(f.clone())), @@ -736,7 +733,7 @@ mod tests { use super::*; use crate::{col, lit}; - use datafusion_common::{internal_err, DFSchema, ScalarValue}; + use datafusion_common::{internal_err, DFSchema, HashMap, ScalarValue}; macro_rules! test_is_expr_nullable { ($EXPR_TYPE:ident) => {{ @@ -842,6 +839,7 @@ mod tests { fn test_expr_metadata() { let mut meta = HashMap::new(); meta.insert("bar".to_string(), "buzz".to_string()); + let meta = FieldMetadata::from(meta); let expr = col("foo"); let schema = MockExprSchema::new() .with_data_type(DataType::Int32) @@ -860,14 +858,13 @@ mod tests { ); let schema = DFSchema::from_unqualified_fields( - vec![Field::new("foo", DataType::Int32, true).with_metadata(meta.clone())] - .into(), - HashMap::new(), + vec![meta.add_to_field(Field::new("foo", DataType::Int32, true))].into(), + std::collections::HashMap::new(), ) .unwrap(); // verify to_field method populates metadata - assert_eq!(&meta, expr.to_field(&schema).unwrap().1.metadata()); + assert_eq!(meta, expr.metadata(&schema).unwrap()); } #[derive(Debug)] @@ -899,8 +896,8 @@ mod tests { self } - fn with_metadata(mut self, metadata: HashMap) -> Self { - self.field = self.field.with_metadata(metadata); + fn with_metadata(mut self, metadata: FieldMetadata) -> Self { + self.field = metadata.add_to_field(self.field); self } } diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index 885901328988..fbc19b1202ee 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -115,7 +115,6 @@ pub fn create_physical_expr( match e { Expr::Alias(Alias { expr, metadata, .. }) => { if let Expr::Literal(v, prior_metadata) = expr.as_ref() { - let metadata = metadata.as_ref().map(|m| FieldMetadata::from(m.clone())); let new_metadata = FieldMetadata::merge_options( prior_metadata.as_ref(), metadata.as_ref(), diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index f9315604039c..078a7480b05b 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -211,7 +211,10 @@ pub fn serialize_expr( .map(|r| vec![r.into()]) .unwrap_or(vec![]), alias: name.to_owned(), - metadata: metadata.to_owned().unwrap_or(HashMap::new()), + metadata: metadata + .as_ref() + .map(|m| m.to_hashmap()) + .unwrap_or(HashMap::new()), }); protobuf::LogicalExprNode { expr_type: Some(ExprType::Alias(alias)), diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 67ae41837485..d4e3b75c67ab 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -19,6 +19,35 @@ # Upgrade Guides +## DataFusion `49.0.0` + +### Metadata is now represented by `FieldMetadata` + +Metadata from the Arrow `Field` is now stored using the `FieldMetadata` +structure. In prior versions it was stored as both a `HashMap` +and a `BTreeMap`. `FieldMetadata` is a easier to work with and +is more efficient. + +To create `FieldMetadata` from a `Field`: + +```rust +# /* comment to avoid running + let metadata = FieldMetadata::from(&field); +# */ +``` + +To add metadata to a `Field`, use the `add_to_field` method: + +```rust +# /* comment to avoid running +let updated_field = metadata.add_to_field(field); +# */ +``` + +See [#16317] for details. + +[#16317]: https://github.com/apache/datafusion/pull/16317 + ## DataFusion `48.0.0` ### `Expr::WindowFunction` is now `Box`ed