
Commit 8941194

comments
1 parent 4405843 commit 8941194

8 files changed: 14 additions & 19 deletions

common/utils/src/main/resources/error/error-conditions.json

Lines changed: 2 additions & 2 deletions
@@ -1957,7 +1957,7 @@
   },
   "INCOMPATIBLE_BATCH_VIEW_READ" : {
     "message" : [
-      "View <datasetIdentifier> is not a batch view and must be referenced using read. This check can be disabled by setting Spark conf pipelines.incompatibleViewCheck.enabled = false."
+      "View <datasetIdentifier> is a batch view and must be referenced using SparkSession#read. This check can be disabled by setting Spark conf pipelines.incompatibleViewCheck.enabled = false."
     ],
     "sqlState" : "42000"
   },
@@ -2039,7 +2039,7 @@
   },
   "INCOMPATIBLE_STREAMING_VIEW_READ" : {
     "message" : [
-      "View <datasetIdentifier> is a streaming view and must be referenced using readStream. This check can be disabled by setting Spark conf pipelines.incompatibleViewCheck.enabled = false."
+      "View <datasetIdentifier> is a streaming view and must be referenced using SparkSession#readStream. This check can be disabled by setting Spark conf pipelines.incompatibleViewCheck.enabled = false."
     ],
     "sqlState" : "42000"
   },
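For reference, a minimal sketch of the two read paths the updated messages now name explicitly (the view names below are illustrative, not from this commit):

    // Batch view: reference it through SparkSession#read
    val batchDf = spark.read.table("some_batch_view")

    // Streaming view: reference it through SparkSession#readStream
    val streamingDf = spark.readStream.table("some_streaming_view")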

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DataflowGraph.scala

Lines changed: 1 addition & 9 deletions
@@ -30,7 +30,7 @@ import org.apache.spark.sql.types.StructType
  * It manages the relationships between logical flows, tables, and views, providing
  * operations for graph traversal, validation, and transformation.
  */
-class DataflowGraph(val flows: Seq[Flow], val tables: Seq[Table], val views: Seq[View])
+case class DataflowGraph(flows: Seq[Flow], tables: Seq[Table], views: Seq[View])
   extends GraphOperations
   with GraphValidations {

@@ -130,14 +130,6 @@ class DataflowGraph(val flows: Seq[Flow], val tables: Seq[Table], val views: Seq
     }.toMap
   }

-  /** Returns a copy of this [[DataflowGraph]] with optionally replaced components. */
-  def copy(
-      flows: Seq[Flow] = flows,
-      tables: Seq[Table] = tables,
-      views: Seq[View] = views): DataflowGraph = {
-    new DataflowGraph(flows, tables, views)
-  }
-
   /**
    * Used to reanalyze the flow's DF for a given table. This is done by finding all upstream
    * flows (until a table is reached) for the specified source and reanalyzing all upstream
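Because DataflowGraph is now a case class, the compiler synthesizes copy (along with equals and hashCode), which is what makes the hand-written copy above safe to delete. A minimal sketch of the resulting usage, with simplified stand-ins for the element types:

    // Simplified stand-ins for the real Flow/Table/View types.
    case class Flow(name: String)
    case class Table(name: String)
    case class View(name: String)

    case class DataflowGraph(flows: Seq[Flow], tables: Seq[Table], views: Seq[View])

    val graph = DataflowGraph(Seq(Flow("f")), Seq(Table("t")), Seq(View("v")))
    // The synthesized copy replaces the removed hand-written one.
    val withExtraTable = graph.copy(tables = graph.tables :+ Table("t2"))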

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DataflowGraphTransformer.scala

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ class DataflowGraphTransformer(graph: DataflowGraph) extends AutoCloseable {
     flows.groupBy(_.destinationIdentifier)
   }

-  def transformTables(transformer: Table => Table): DataflowGraphTransformer = {
+  def transformTables(transformer: Table => Table): DataflowGraphTransformer = synchronized {
     tables = tables.map(transformer)
     tableMap = computeTableMap()
     this
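transformTables rewrites both tables and the derived tableMap, so wrapping the body in synchronized keeps the two fields consistent when callers hit the same transformer instance from multiple threads. A minimal sketch of the pattern, with simplified types:

    // Sketch only: both mutable fields are updated under the instance lock.
    class TransformerSketch(initial: Seq[String]) {
      private var tables: Seq[String] = initial
      private var tableMap: Map[String, Seq[String]] = computeTableMap()

      private def computeTableMap(): Map[String, Seq[String]] =
        tables.groupBy(_.toLowerCase)

      def transformTables(f: String => String): this.type = synchronized {
        tables = tables.map(f)
        tableMap = computeTableMap() // other threads never observe a stale map
        this
      }
    }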

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ case class FlowFunctionResult(
     requestedInputs: Set[TableIdentifier],
     batchInputs: Set[ResolvedInput],
     streamingInputs: Set[ResolvedInput],
-    usedExternalInputs: Set[String],
+    usedExternalInputs: Set[TableIdentifier],
     dataFrame: Try[DataFrame],
     sqlConf: Map[String, String],
     analysisWarnings: Seq[AnalysisWarning] = Nil) {

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowAnalysis.scala

Lines changed: 2 additions & 2 deletions
@@ -280,7 +280,7 @@ object FlowAnalysis {
       name: String): DataFrame = {

     val spark = context.spark
-    context.externalInputs += name
+    context.externalInputs += inputIdentifier.identifier
     spark.read.table(inputIdentifier.identifier.quotedString)
   }

@@ -298,7 +298,7 @@ object FlowAnalysis {
       streamReader: DataStreamReader,
       name: String): DataFrame = {

-    context.externalInputs += name
+    context.externalInputs += inputIdentifier.identifier
     streamReader.table(inputIdentifier.identifier.quotedString)
   }
 }

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowAnalysisContext.scala

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ private[pipelines] case class FlowAnalysisContext(
     shouldLowerCaseNames: Boolean = false,
     analysisWarnings: mutable.Buffer[AnalysisWarning] = new ListBuffer[AnalysisWarning],
     spark: SparkSession,
-    externalInputs: mutable.HashSet[String] = mutable.HashSet.empty
+    externalInputs: mutable.HashSet[TableIdentifier] = mutable.HashSet.empty
 ) {

   /** Map from [[Input]] name to the actual [[Input]] */
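Tracking external inputs as resolved TableIdentifiers rather than raw name strings keeps the qualification explicit instead of depending on how the caller spelled the name. A hedged sketch with illustrative names:

    import scala.collection.mutable
    import org.apache.spark.sql.catalyst.TableIdentifier

    val externalInputs = mutable.HashSet.empty[TableIdentifier]
    externalInputs += TableIdentifier("events", Some("bronze"))
    externalInputs += TableIdentifier("events", Some("silver"))

    // Two distinct entries; a Set[String] of bare names would have kept only one.
    externalInputs.foreach(id => println(id.quotedString))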

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/elements.scala

Lines changed: 4 additions & 1 deletion
@@ -74,7 +74,10 @@ trait Input extends GraphElement {
   def load(readOptions: InputReadOptions): DataFrame
 }

-/** Represents a node in a [[DataflowGraph]] that can be written to by a [[Flow]]. */
+/**
+ * Represents a node in a [[DataflowGraph]] that can be written to by a [[Flow]].
+ * Must be backed by a file source.
+ */
 sealed trait Output {

   /**

sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/ConnectInvalidPipelineSuite.scala

Lines changed: 2 additions & 2 deletions
@@ -374,7 +374,7 @@ class ConnectInvalidPipelineSuite extends PipelineTest {
         .getMessage
         .contains(
           s"View ${fullyQualifiedIdentifier("a", isView = true).quotedString}" +
-            s" is not a batch view and must be referenced using read."
+            s" is a batch view and must be referenced using SparkSession#read."
         )
     )
   }
@@ -392,7 +392,7 @@ class ConnectInvalidPipelineSuite extends PipelineTest {
         .getMessage
         .contains(
           s"View ${fullyQualifiedIdentifier("a", isView = true).quotedString} " +
-            s"is a streaming view and must be referenced using readStream"
+            s"is a streaming view and must be referenced using SparkSession#readStream"
         )
     )
   }
