
Commit bbd48e4

Merge branch 'graph-resolution' of github.com:aakash-db/spark into graph-resolution
2 parents 16a7ab2 + 759fd5a

File tree

4 files changed: +24, -83 lines changed

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DataflowGraph.scala

Lines changed: 10 additions & 12 deletions
@@ -34,11 +34,11 @@ case class DataflowGraph(flows: Seq[Flow], tables: Seq[Table], views: Seq[View])
   extends GraphOperations
   with GraphValidations {

-  /** Returns a [[Output]] given its identifier */
+  /** Map of [[Output]]s by their identifiers */
   lazy val output: Map[TableIdentifier, Output] = mapUnique(tables, "output")(_.identifier)

   /**
-   * Returns [[Flow]]s in this graph that need to get planned and potentially executed when
+   * [[Flow]]s in this graph that need to get planned and potentially executed when
    * executing the graph. Flows that write to logical views are excluded.
    */
   lazy val materializedFlows: Seq[ResolvedFlow] = {
@@ -47,14 +47,14 @@ case class DataflowGraph(flows: Seq[Flow], tables: Seq[Table], views: Seq[View])
     )
   }

-  /** Returns the identifiers of [[materializedFlows]]. */
+  /** The identifiers of [[materializedFlows]]. */
   val materializedFlowIdentifiers: Set[TableIdentifier] = materializedFlows.map(_.identifier).toSet

-  /** Returns a [[Table]] given its identifier */
+  /** Map of [[Table]]s by their identifiers */
   lazy val table: Map[TableIdentifier, Table] =
     mapUnique(tables, "table")(_.identifier)

-  /** Returns a [[Flow]] given its identifier */
+  /** Map of [[Flow]]s by their identifier */
   lazy val flow: Map[TableIdentifier, Flow] = {
     // Better error message than using mapUnique.
     val flowsByIdentifier = flows.groupBy(_.identifier)
@@ -89,20 +89,20 @@ case class DataflowGraph(flows: Seq[Flow], tables: Seq[Table], views: Seq[View])
     flowsByIdentifier.view.mapValues(_.head).toMap
   }

-  /** Returns a [[View]] given its identifier */
+  /** Map of [[View]]s by their identifiers */
   lazy val view: Map[TableIdentifier, View] = mapUnique(views, "view")(_.identifier)

-  /** Returns the [[PersistedView]]s of the graph */
+  /** The [[PersistedView]]s of the graph */
   lazy val persistedViews: Seq[PersistedView] = views.collect {
     case v: PersistedView => v
   }

-  /** Returns all the [[Input]]s in the current DataflowGraph. */
+  /** All the [[Input]]s in the current DataflowGraph. */
   lazy val inputIdentifiers: Set[TableIdentifier] = {
     (flows ++ tables).map(_.identifier).toSet
   }

-  /** Returns the [[Flow]]s that write to a given destination. */
+  /** The [[Flow]]s that write to a given destination. */
   lazy val flowsTo: Map[TableIdentifier, Seq[Flow]] = flows.groupBy(_.destinationIdentifier)

   lazy val resolvedFlows: Seq[ResolvedFlow] = {
@@ -155,7 +155,7 @@ case class DataflowGraph(flows: Seq[Flow], tables: Seq[Table], views: Seq[View])
   }

   /**
-   * Returns a map of the inferred schema of each table, computed by merging the analyzed schemas
+   * A map of the inferred schema of each table, computed by merging the analyzed schemas
    * of all flows writing to that table.
    */
   lazy val inferredSchema: Map[TableIdentifier, StructType] = {
@@ -191,8 +191,6 @@ case class DataflowGraph(flows: Seq[Flow], tables: Seq[Table], views: Seq[View])
     validatePersistedViewSources()
     validateEveryDatasetHasFlow()
     validateTablesAreResettable()
-    validateAppendOnceFlows()
-    // Ensures that all flows are resolved and have a valid schema.
     inferredSchema
   }.failed

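The doc-comment edits in this file align the wording with what these members actually are (lazily evaluated maps and values, not lookup methods), and the final hunk drops validateAppendOnceFlows() from the validation chain; forcing inferredSchema remains the step that ensures every flow is resolved with a valid schema. For context, a minimal sketch of a mapUnique-style helper consistent with the call sites above; the signature and error message here are assumptions, and the real helper lives elsewhere in the pipelines module:

def mapUnique[K, V](elements: Seq[V], elementType: String)(keyFn: V => K): Map[K, V] = {
  // Group by the extracted key and reject duplicates with a descriptive error.
  val grouped = elements.groupBy(keyFn)
  grouped.find { case (_, vs) => vs.size > 1 }.foreach {
    case (key, _) =>
      throw new IllegalArgumentException(s"Found multiple ${elementType}s with identifier $key")
  }
  grouped.view.mapValues(_.head).toMap
}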
sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala

Lines changed: 0 additions & 6 deletions
@@ -192,11 +192,5 @@ class AppendOnceFlow(
   val funcResult: FlowFunctionResult
 ) extends ResolvedFlow {

-  /**
-   * Whether the flow was declared as once or not in UnresolvedFlow. If false, then it means the
-   * flow is created from batch query.
-   */
-  val definedAsOnce: Boolean = flow.once
-
   override val once = true
 }

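This is the companion cleanup to the validation change below: the only consumer of definedAsOnce was the validateAppendOnceFlows() check deleted in GraphValidations.scala, which left the flag as dead code. A self-contained before/after sketch (illustrative types, not the real pipeline classes):

trait FlowSketch { def once: Boolean }

// Before: definedAsOnce recorded whether the flow was declared as once or was
// created from a batch query; only the removed validation ever read it.
final class AppendOnceFlowBefore(declaredOnce: Boolean) extends FlowSketch {
  val definedAsOnce: Boolean = declaredOnce
  override val once = true
}

// After: append-once flows are unconditionally once; the origin of the
// declaration is no longer tracked.
final class AppendOnceFlowAfter extends FlowSketch {
  override val once = true
}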
sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/GraphValidations.scala

Lines changed: 14 additions & 23 deletions
@@ -86,12 +86,12 @@ trait GraphValidations extends Logging {
   * Validate that all tables are resettable. This is a best-effort check that will only catch
   * upstream tables that are resettable but have a non-resettable downstream dependency.
   */
-  protected def validateTablesAreResettable(): Seq[GraphValidationWarning] = {
+  protected def validateTablesAreResettable(): Unit = {
     validateTablesAreResettable(tables)
   }

   /** Validate that all specified tables are resettable. */
-  protected def validateTablesAreResettable(tables: Seq[Table]): Seq[GraphValidationWarning] = {
+  protected def validateTablesAreResettable(tables: Seq[Table]): Unit = {
     val tableLookup = mapUnique(tables, "table")(_.identifier)
     val nonResettableTables =
       tables.filter(t => !PipelinesTableProperties.resetAllowed.fromMap(t.properties))
@@ -120,28 +120,19 @@
       .reverse
       .map {
         case (nameForEvent, tables) =>
-          InvalidResettableDependencyException(nameForEvent, tables)
-      }
-  }
-
-  /**
-   * Validate if we have any append only flows writing into a streaming table but was created
-   * from a batch query.
-   */
-  protected def validateAppendOnceFlows(): Seq[GraphValidationWarning] = {
-    flows
-      .filter {
-        case af: AppendOnceFlow => !af.definedAsOnce
-        case _ => false
-      }
-      .groupBy(_.destinationIdentifier)
-      .flatMap {
-        case (destination, flows) =>
-          table
-            .get(destination)
-            .map(t => AppendOnceFlowCreatedFromBatchQueryException(t, flows.map(_.identifier)))
+          throw new AnalysisException(
+            "INVALID_RESETTABLE_DEPENDENCY",
+            Map(
+              "downstreamTable" -> nameForEvent,
+              "upstreamResettableTables" -> tables
+                .map(_.displayName)
+                .sorted
+                .map(t => s"'$t'")
+                .mkString(", "),
+              "resetAllowedKey" -> PipelinesTableProperties.resetAllowed.key
+            )
+          )
       }
-      .toSeq
   }

   protected def validateUserSpecifiedSchemas(): Unit = {

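Two things change here beyond the Unit return types: the INVALID_RESETTABLE_DEPENDENCY error is now thrown inline rather than wrapped in a GraphValidationWarning, and validation fails fast on the first offending destination instead of collecting a Seq of warnings for all of them. A hedged sketch of what a caller observes; graph and the validate() entry point are assumptions, not part of this diff:

import org.apache.spark.sql.AnalysisException

try {
  graph.validate() // assumed entry point that runs validateTablesAreResettable()
} catch {
  case e: AnalysisException if e.errorClass.contains("INVALID_RESETTABLE_DEPENDENCY") =>
    // Message parameters are populated exactly as in the hunk above.
    val downstream = e.messageParameters("downstreamTable")
    val upstream = e.messageParameters("upstreamResettableTables")
    println(s"Reset blocked: '$downstream' depends on resettable upstream tables $upstream")
}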
sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/PipelinesErrors.scala

Lines changed: 0 additions & 42 deletions
@@ -18,7 +18,6 @@
 package org.apache.spark.sql.pipelines.graph

 import org.apache.spark.SparkException
-import org.apache.spark.internal.Logging
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.TableIdentifier

@@ -79,13 +78,6 @@ case class UnresolvedPipelineException(
        |failures that precede this log.""".stripMargin
 )

-/** A validation error that can either be thrown as an exception or logged as a warning. */
-trait GraphValidationWarning extends Logging {
-
-  /** The exception to throw when this validation fails. */
-  protected def exception: AnalysisException
-}
-
 /**
  * Raised when there's a circular dependency in the current pipeline. That is, a downstream
  * table is referenced while creating a upstream table.
@@ -99,37 +91,3 @@ case class CircularDependencyException(
     s"Circular dependencies are not supported in a pipeline. Please remove the dependency " +
     s"between '${upstreamDataset.unquotedString}' and '${downstreamTable.unquotedString}'."
 )
-
-/**
- * Raised when some tables in the current pipeline are not resettable due to some non-resettable
- * downstream dependencies.
- */
-case class InvalidResettableDependencyException(originName: String, tables: Seq[Table])
-  extends GraphValidationWarning {
-  override def exception: AnalysisException = new AnalysisException(
-    "INVALID_RESETTABLE_DEPENDENCY",
-    Map(
-      "downstreamTable" -> originName,
-      "upstreamResettableTables" -> tables
-        .map(_.displayName)
-        .sorted
-        .map(t => s"'$t'")
-        .mkString(", "),
-      "resetAllowedKey" -> PipelinesTableProperties.resetAllowed.key
-    )
-  )
-}
-
-/**
- * Warn if the append once flows was declared from batch query if there was a run before.
- * Throw an exception if not.
- * @param table the streaming destination that contains Append Once flows declared with batch query.
- * @param flows the append once flows that are declared with batch query.
- */
-case class AppendOnceFlowCreatedFromBatchQueryException(table: Table, flows: Seq[TableIdentifier])
-  extends GraphValidationWarning {
-  override def exception: AnalysisException = new AnalysisException(
-    "APPEND_ONCE_FROM_BATCH_QUERY",
-    Map("table" -> table.displayName)
-  )
-}

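With both case classes deleted, the GraphValidationWarning trait has no remaining implementors, and the now-unused Logging import goes with it. The APPEND_ONCE_FROM_BATCH_QUERY error condition is retired outright, while INVALID_RESETTABLE_DEPENDENCY survives as the inline throw in GraphValidations.scala. The trait encoded a warn-or-throw decision point that fail-fast validation no longer needs; roughly the pattern being retired (illustrative types, not Spark's):

trait ValidationWarningSketch {
  // The error to surface when the validation fails.
  protected def exception: RuntimeException
  // Callers could report the problem softly...
  def logWarning(log: String => Unit): Unit = log(exception.getMessage)
  // ...or escalate it; the new code always escalates, inline at the call site.
  def raise(): Nothing = throw exception
}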