
chore: migrate to DF 49.0.0 #2040

Draft: wants to merge 5 commits into main
240 changes: 103 additions & 137 deletions native/Cargo.lock

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions native/Cargo.toml
@@ -34,12 +34,12 @@ edition = "2021"
rust-version = "1.85"

[workspace.dependencies]
arrow = { version = "55.1.0", features = ["prettyprint", "ffi", "chrono-tz"] }
arrow = { version = "55.2.0", features = ["prettyprint", "ffi", "chrono-tz"] }
async-trait = { version = "0.1" }
bytes = { version = "1.10.0" }
parquet = { version = "55.1.0", default-features = false, features = ["experimental"] }
datafusion = { version = "48.0.0", default-features = false, features = ["unicode_expressions", "crypto_expressions", "nested_expressions", "parquet"] }
datafusion-spark = { version = "48.0.0" }
parquet = { version = "55.2.0", default-features = false, features = ["experimental"] }
datafusion = { git = "https://github.com/apache/datafusion.git", branch = "branch-49", default-features = false, features = ["unicode_expressions", "crypto_expressions", "nested_expressions", "parquet"] }
datafusion-spark = { git = "https://github.com/apache/datafusion.git", branch = "branch-49" }
datafusion-comet-spark-expr = { path = "spark-expr" }
datafusion-comet-proto = { path = "proto" }
chrono = { version = "0.4", default-features = false, features = ["clock"] }
@@ -49,7 +49,7 @@ num = "0.4"
rand = "0.9"
regex = "1.9.6"
thiserror = "2"
object_store = { version = "0.12.0", features = ["gcp", "azure", "aws", "http"] }
object_store = { version = "0.12.3", features = ["gcp", "azure", "aws", "http"] }
url = "2.2"
aws-config = "1.6.3"
aws-credential-types = "1.2.3"
2 changes: 1 addition & 1 deletion native/core/Cargo.toml
@@ -84,7 +84,7 @@ jni = { version = "0.21", features = ["invocation"] }
lazy_static = "1.4"
assertables = "9"
hex = "0.4.3"
-datafusion-functions-nested = { version = "48.0.0" }
+datafusion-functions-nested = { git = "https://github.com/apache/datafusion.git", branch = "branch-49" }

[features]
default = []
3 changes: 2 additions & 1 deletion native/core/benches/shuffle_writer.rs
@@ -89,7 +89,8 @@ fn criterion_benchmark(c: &mut Criterion) {
CometPartitioning::RangePartitioning(
LexOrdering::new(vec![PhysicalSortExpr::new_default(
col("c0", batch.schema().as_ref()).unwrap(),
-)]),
+)])
+.unwrap(),
16,
100,
),
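A pattern that recurs throughout this PR: in DataFusion 49, `LexOrdering::new` appears to return `Option<LexOrdering>` (`None` for an empty expression list) rather than a `LexOrdering` directly, hence the new `.unwrap()` calls here and in planner.rs. A minimal sketch of the new call shape, assuming a DataFusion 49 dependency; the schema and column name are illustrative only:

```rust
use std::sync::Arc;

use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::error::Result;
use datafusion::physical_expr::{expressions::col, LexOrdering, PhysicalSortExpr};

fn main() -> Result<()> {
    // Illustrative single-column schema, mirroring the benchmark's sort on "c0".
    let schema = Arc::new(Schema::new(vec![Field::new("c0", DataType::Int32, false)]));
    let sort_expr = PhysicalSortExpr::new_default(col("c0", schema.as_ref())?);

    // DF 49: `new` yields Option<LexOrdering>; None would mean "no ordering".
    let ordering = LexOrdering::new(vec![sort_expr]).expect("non-empty sort keys");

    // LexOrdering derefs to a slice of sort expressions, which is what the
    // `as_deref().unwrap_or(&[])` fallback in planner.rs relies on.
    let exprs: &[PhysicalSortExpr] = &ordering;
    assert_eq!(exprs.len(), 1);
    Ok(())
}
```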
37 changes: 15 additions & 22 deletions native/core/src/execution/operators/filter.rs
@@ -211,22 +211,16 @@ impl FilterExec {
if let Some(binary) = conjunction.as_any().downcast_ref::<BinaryExpr>() {
if binary.op() == &Operator::Eq {
// Filter evaluates to single value for all partitions
-if input_eqs.is_expr_constant(binary.left()) {
-let (expr, across_parts) = (
-binary.right(),
-input_eqs.get_expr_constant_value(binary.right()),
-);
-res_constants.push(
-ConstExpr::new(Arc::clone(expr)).with_across_partitions(across_parts),
-);
-} else if input_eqs.is_expr_constant(binary.right()) {
-let (expr, across_parts) = (
-binary.left(),
-input_eqs.get_expr_constant_value(binary.left()),
-);
-res_constants.push(
-ConstExpr::new(Arc::clone(expr)).with_across_partitions(across_parts),
-);
+if input_eqs.is_expr_constant(binary.left()).is_some() {
+let across = input_eqs
+.is_expr_constant(binary.right())
+.unwrap_or_default();
+res_constants.push(ConstExpr::new(Arc::clone(binary.right()), across));
+} else if input_eqs.is_expr_constant(binary.right()).is_some() {
+let across = input_eqs
+.is_expr_constant(binary.left())
+.unwrap_or_default();
+res_constants.push(ConstExpr::new(Arc::clone(binary.left()), across));
}
}
}
@@ -246,7 +240,7 @@ impl FilterExec {
let mut eq_properties = input.equivalence_properties().clone();
let (equal_pairs, _) = collect_columns_from_predicate(predicate);
for (lhs, rhs) in equal_pairs {
-eq_properties.add_equal_conditions(lhs, rhs)?
+eq_properties.add_equal_conditions(Arc::clone(lhs), Arc::clone(rhs))?
}
// Add the columns that have only one viable value (singleton) after
// filtering to constants.
@@ -258,14 +252,13 @@
.min_value
.get_value();
let expr = Arc::new(column) as _;
-ConstExpr::new(expr)
-.with_across_partitions(AcrossPartitions::Uniform(value.cloned()))
+ConstExpr::new(expr, AcrossPartitions::Uniform(value.cloned()))
});
// This is for statistics
-eq_properties = eq_properties.with_constants(constants);
+eq_properties.add_constants(constants)?;
// This is for logical constant (for example: a = '1', then a could be marked as a constant)
-// to do: how to deal with multiple situation to represent = (for example c1 between 0 and 0)
-eq_properties = eq_properties.with_constants(Self::extend_constants(input, predicate));
+// to do: how to deal with a multiple situation to represent = (for example, c1 between 0 and 0)
+eq_properties.add_constants(Self::extend_constants(input, predicate))?;

let mut output_partitioning = input.output_partitioning().clone();
// If contains projection, update the PlanProperties.
22 changes: 16 additions & 6 deletions native/core/src/execution/planner.rs
@@ -72,7 +72,7 @@ use crate::parquet::parquet_support::prepare_object_store_with_configs;
use datafusion::common::scalar::ScalarStructBuilder;
use datafusion::common::{
tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter},
-JoinType as DFJoinType, ScalarValue,
+JoinType as DFJoinType, NullEquality, ScalarValue,
};
use datafusion::datasource::listing::PartitionedFile;
use datafusion::logical_expr::type_coercion::other::get_coerce_type_for_case_expression;
@@ -593,6 +593,14 @@ impl PhysicalPlanner {
true,
false,
))),
+// DataFusion 49 hardcodes return type for MD5 built in function as UTF8View
+// which is not yet supported in Comet
+// Converting forcibly to UTF8. To be removed after UTF8View supported
+"md5" => Ok(Arc::new(Cast::new(
+func?,
+DataType::Utf8,
+SparkCastOptions::new_without_timezone(EvalMode::Try, true),
+))),
_ => func,
}
}
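The new `md5` arm above simply wraps the planned function in Comet's own `Cast` expression. A sketch of that wrapping in isolation, assuming `Cast`, `EvalMode`, and `SparkCastOptions` are exported from the `datafusion-comet-spark-expr` crate root; the `force_utf8` helper name is hypothetical:

```rust
use std::sync::Arc;

use arrow::datatypes::DataType;
use datafusion::physical_expr::PhysicalExpr;
use datafusion_comet_spark_expr::{Cast, EvalMode, SparkCastOptions};

/// Hypothetical helper: coerce an expression whose DataFusion 49 return type
/// is Utf8View (e.g. `md5`) back to Utf8 until Comet supports Utf8View.
fn force_utf8(expr: Arc<dyn PhysicalExpr>) -> Arc<dyn PhysicalExpr> {
    Arc::new(Cast::new(
        expr,
        DataType::Utf8,
        SparkCastOptions::new_without_timezone(EvalMode::Try, true),
    ))
}
```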
@@ -1146,7 +1154,7 @@ impl PhysicalPlanner {
let child_copied = Self::wrap_in_copy_exec(Arc::clone(&child.native_plan));

let sort = Arc::new(
-SortExec::new(LexOrdering::new(exprs?), Arc::clone(&child_copied))
+SortExec::new(LexOrdering::new(exprs?).unwrap(), Arc::clone(&child_copied))
.with_fetch(fetch),
);

@@ -1422,7 +1430,7 @@
sort_options,
// null doesn't equal to null in Spark join key. If the join key is
// `EqualNullSafe`, Spark will rewrite it during planning.
-false,
+NullEquality::NullEqualsNothing,
)?);

if join.filter.is_some() {
@@ -1490,7 +1498,7 @@
PartitionMode::Partitioned,
// null doesn't equal to null in Spark join key. If the join key is
// `EqualNullSafe`, Spark will rewrite it during planning.
-false,
+NullEquality::NullEqualsNothing,
)?);

// If the hash join is build right, we need to swap the left and right
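The two join call sites above make the same substitution: the former `null_equals_null: bool` argument is DataFusion 49's `NullEquality` enum. A tiny sketch of the mapping, assuming only the `datafusion::common::NullEquality` import added at the top of this file:

```rust
use datafusion::common::NullEquality;

/// Spark join keys never treat NULL as equal to NULL (an EqualNullSafe key is
/// rewritten by Spark during planning), so Comet maps the old `false` flag to
/// NullEqualsNothing; the other variant, NullEqualsNull, corresponds to `true`.
fn spark_join_null_equality() -> NullEquality {
    NullEquality::NullEqualsNothing
}
```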
@@ -2186,13 +2194,15 @@ impl PhysicalPlanner {
};

let window_frame = WindowFrame::new_bounds(units, lower_bound, upper_bound);
+let lex_orderings = LexOrdering::new(sort_exprs.to_vec());
+let sort_phy_exprs = lex_orderings.as_deref().unwrap_or(&[]);

datafusion::physical_plan::windows::create_window_expr(
&window_func,
window_func_name,
&window_args,
partition_by,
-&LexOrdering::new(sort_exprs.to_vec()),
+sort_phy_exprs,
window_frame.into(),
input_schema.as_ref(),
false, // TODO: Ignore nulls
@@ -2273,7 +2283,7 @@ impl PhysicalPlanner {
.iter()
.map(|expr| self.create_sort_expr(expr, Arc::clone(&input_schema)))
.collect();
-let lex_ordering = LexOrdering::from(exprs?);
+let lex_ordering = LexOrdering::new(exprs?).unwrap();
Ok(CometPartitioning::RangePartitioning(
lex_ordering,
range_partition.num_partitions as usize,
2 changes: 1 addition & 1 deletion native/core/src/execution/shuffle/comet_partitioning.rs
@@ -24,7 +24,7 @@ pub enum CometPartitioning {
/// Allocate rows based on a hash of one of more expressions and the specified number of
/// partitions
Hash(Vec<Arc<dyn PhysicalExpr>>, usize),
-/// Allocate rows based on lexical order of one of more expressions and the specified number of
+/// Allocate rows based on the lexical order of one of more expressions and the specified number of
/// partitions
RangePartitioning(LexOrdering, usize, usize),
}
2 changes: 1 addition & 1 deletion native/core/src/execution/shuffle/range_partitioner.rs
@@ -247,7 +247,7 @@ mod test {

let (rows, row_converter) = RangePartitioner::generate_bounds(
input_batch.columns().to_vec().as_ref(),
-&lex_ordering,
+&lex_ordering.unwrap(),
10,
input_batch.num_rows(),
1000,
7 changes: 4 additions & 3 deletions native/core/src/execution/shuffle/shuffle_writer.rs
@@ -934,7 +934,7 @@ impl SinglePartitionShufflePartitioner {
Ok(Some(concatenated))
}
Err(e) => Err(DataFusionError::ArrowError(
-e,
+Box::from(e),
Some(DataFusionError::get_back_trace()),
)),
}
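This hunk and the sibling one below make the same mechanical change: `DataFusionError::ArrowError` boxes its payload in DataFusion 49. A minimal sketch of the wrapper, using only calls already visible in this diff:

```rust
use datafusion::arrow::error::ArrowError;
use datafusion::error::DataFusionError;

// DF 49 stores the Arrow error behind a Box; `Box::from` performs the conversion.
fn to_df_error(e: ArrowError) -> DataFusionError {
    DataFusionError::ArrowError(Box::from(e), Some(DataFusionError::get_back_trace()))
}
```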
@@ -1122,7 +1122,7 @@ impl Iterator for PartitionedBatchIterator<'_> {
Some(Ok(batch))
}
Err(e) => Some(Err(DataFusionError::ArrowError(
-e,
+Box::from(e),
Some(DataFusionError::get_back_trace()),
))),
}
@@ -1409,7 +1409,8 @@ mod test {
CometPartitioning::RangePartitioning(
LexOrdering::new(vec![PhysicalSortExpr::new_default(
col("a", batch.schema().as_ref()).unwrap(),
-)]),
+)])
+.unwrap(),
num_partitions,
100,
),
4 changes: 2 additions & 2 deletions native/hdfs/src/object_store/hdfs.rs
@@ -32,7 +32,7 @@ use hdfs::walkdir::HdfsWalkDir;
use object_store::{
path::{self, Path},
Error, GetOptions, GetRange, GetResult, GetResultPayload, ListResult, MultipartUpload,
-ObjectMeta, ObjectStore, PutMultipartOpts, PutOptions, PutPayload, PutResult, Result,
+ObjectMeta, ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, Result,
};

/// scheme for HDFS File System
@@ -139,7 +139,7 @@ impl ObjectStore for HadoopFileSystem {
async fn put_multipart_opts(
&self,
_location: &Path,
-_opts: PutMultipartOpts,
+_opts: PutMultipartOptions,
) -> object_store::Result<Box<dyn MultipartUpload>> {
unimplemented!()
}
1 change: 1 addition & 0 deletions native/spark-expr/src/conversion_funcs/cast.rs
@@ -960,6 +960,7 @@ fn cast_array(
{
spark_cast_nonintegral_numeric_to_integral(&array, eval_mode, from_type, to_type)
}
+(Utf8View, Utf8) => Ok(cast_with_options(&array, to_type, &CAST_OPTIONS)?),
(Struct(_), Utf8) => Ok(casts_struct_to_string(array.as_struct(), cast_options)?),
(Struct(_), Struct(_)) => Ok(cast_struct_to_struct(
array.as_struct(),
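The added `(Utf8View, Utf8)` arm delegates to Arrow's cast kernel. A standalone sketch of that kernel call, with `CastOptions::default()` standing in for Comet's `CAST_OPTIONS` constant (an assumption; the real options may configure safety and formatting differently):

```rust
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, StringArray, StringViewArray};
use arrow::compute::{cast_with_options, CastOptions};
use arrow::datatypes::DataType;
use arrow::error::ArrowError;

fn main() -> Result<(), ArrowError> {
    // A Utf8View input, e.g. what DataFusion 49's string functions now produce.
    let view: ArrayRef = Arc::new(
        vec![Some("a"), None, Some("bc")]
            .into_iter()
            .collect::<StringViewArray>(),
    );

    // Same call shape as the new match arm above.
    let utf8 = cast_with_options(&view, &DataType::Utf8, &CastOptions::default())?;

    let utf8 = utf8.as_any().downcast_ref::<StringArray>().expect("Utf8 array");
    assert_eq!(utf8.value(2), "bc");
    Ok(())
}
```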