Commit 7dfb926

qiyuandong-db authored and yhuang-db committed
[SPARK-52312][SQL] Ignore V2WriteCommand when caching DataFrame
### What changes were proposed in this pull request?

We found an issue where `V2WriteCommand` plans were not properly excluded from `DataFrame` caching, which can cause unintended side effects. For example, when `cache()` is called on a `DataFrame` created from an `INSERT` SQL statement, the `INSERT` command is re-executed during the caching process because the underlying plan is not being ignored. This PR fixes the issue by:

- making `V2WriteCommand` extend the `IgnoreCachedData` trait
- updating the caching logic to skip plans that extend `IgnoreCachedData`, preventing inapplicable plans from being cached

### Why are the changes needed?

This is a bug: calling `cache()` on a `DataFrame` should not re-execute the command that created it.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

New tests were added.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes apache#51032 from qiyuandong-db/SPARK-52312-ignore-v2writecommand-caching.

Authored-by: Qiyuan Dong <qiyuan.dong@databricks.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
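The fix follows a marker-trait pattern: the cache manager checks whether a plan carries the `IgnoreCachedData` trait before caching it. The following is a minimal, self-contained Scala sketch of that pattern; the names (`Plan`, `IgnoreCachedData`, `CacheManager`, `InsertPlan`, `SelectPlan`) mirror Spark's but are simplified stand-ins, not Spark's actual classes.

```scala
// Base type for logical plans in this sketch.
trait Plan
// Marker trait: plans with this trait must never be cached, because
// "caching" them would mean re-executing a side-effecting command.
trait IgnoreCachedData extends Plan

// A read-only plan, safe to cache.
case class SelectPlan(expr: String) extends Plan
// A write command, marked as inapplicable for caching.
case class InsertPlan(table: String) extends IgnoreCachedData

class CacheManager {
  private var cached = List.empty[Plan]

  def cacheQuery(plan: Plan): Unit = plan match {
    case _: IgnoreCachedData =>
      // Skip: analogous to the guard added in this PR.
      println(s"Asked to cache a plan that is inapplicable for caching: $plan")
    case p if cached.contains(p) =>
      println("Asked to cache already cached data.")
    case p =>
      cached ::= p
  }

  def isCached(plan: Plan): Boolean = cached.contains(plan)
}
```

With this guard, an `InsertPlan` passed to `cacheQuery` is logged and skipped, while a `SelectPlan` is cached normally; the real change works the same way via `unnormalizedPlan.isInstanceOf[IgnoreCachedData]`.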
1 parent 8d73424 · commit 7dfb926

File tree

3 files changed: +21 −1 lines changed


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala

Lines changed: 5 additions & 1 deletion
```diff
@@ -56,7 +56,11 @@ trait KeepAnalyzedQuery extends Command {

 /**
  * Base trait for DataSourceV2 write commands
  */
-trait V2WriteCommand extends UnaryCommand with KeepAnalyzedQuery with CTEInChildren {
+trait V2WriteCommand
+  extends UnaryCommand
+  with KeepAnalyzedQuery
+  with CTEInChildren
+  with IgnoreCachedData {
   def table: NamedRelation
   def query: LogicalPlan
   def isByName: Boolean
```

sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala

Lines changed: 5 additions & 0 deletions
```diff
@@ -125,6 +125,11 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper {
       storageLevel: StorageLevel): Unit = {
     if (storageLevel == StorageLevel.NONE) {
       // Do nothing for StorageLevel.NONE since it will not actually cache any data.
+    } else if (unnormalizedPlan.isInstanceOf[IgnoreCachedData]) {
+      logWarning(
+        log"Asked to cache a plan that is inapplicable for caching: " +
+        log"${MDC(LOGICAL_PLAN, unnormalizedPlan)}"
+      )
     } else if (lookupCachedDataInternal(normalizedPlan).nonEmpty) {
       logWarning("Asked to cache already cached data.")
     } else {
```

sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala

Lines changed: 11 additions & 0 deletions
```diff
@@ -875,4 +875,15 @@ class DataFrameWriterV2Suite extends QueryTest with SharedSparkSession with Befo
     }
   }
 }
+
+  test("SPARK-52312: caching dataframe created from INSERT shouldn't re-execute the command") {
+    spark.sql("CREATE TABLE testcat.table_name (c1 int, c2 string) USING foo")
+
+    val insertDF = spark.sql("INSERT INTO testcat.table_name VALUES (1, 'a'), (2, 'b')")
+    checkAnswer(spark.table("testcat.table_name"), Seq(Row(1, "a"), Row(2, "b")))
+
+    // Caching the DataFrame created from INSERT should not re-execute the command
+    insertDF.cache()
+    checkAnswer(spark.table("testcat.table_name"), Seq(Row(1, "a"), Row(2, "b")))
+  }
 }
```
