Commit 41a0600

aakash-db authored and sryza committed
fix
1 parent 71c69af commit 41a0600

12 files changed: +143 -236 lines changed

common/utils/src/main/resources/error/error-conditions.json

Lines changed: 45 additions & 0 deletions
@@ -82,6 +82,14 @@
     ],
     "sqlState" : "XX000"
   },
+  "APPEND_ONCE_FROM_BATCH_QUERY" : {
+    "message" : [
+      "Creating a streaming table from a batch query prevents incremental loading of new data from source. Offending table: '<table>'.",
+      "Please use the stream() operator. Example usage:",
+      "CREATE STREAMING TABLE <target table name> ... AS SELECT ... FROM stream(<source table name>) ..."
+    ],
+    "sqlState" : "42000"
+  },
   "ARITHMETIC_OVERFLOW" : {
     "message" : [
       "<message>.<alternative> If necessary set <config> to \"false\" to bypass this error."
@@ -3137,6 +3145,12 @@
     },
     "sqlState" : "KD002"
   },
+  "INVALID_NAME_IN_USE_COMMAND" : {
+    "message" : [
+      "Invalid name '<name>' in <command> command. Reason: <reason>"
+    ],
+    "sqlState" : "42000"
+  },
   "INVALID_NON_DETERMINISTIC_EXPRESSIONS" : {
     "message" : [
       "The operator expects a deterministic expression, but the actual expression is <sqlExprs>."
@@ -3402,6 +3416,12 @@
     ],
     "sqlState" : "22023"
   },
+  "INVALID_RESETTABLE_DEPENDENCY" : {
+    "message" : [
+      "Tables <upstreamResettableTables> are resettable but have a non-resettable downstream dependency '<downstreamTable>'. `reset` will fail as Spark Streaming does not support deleted source data. You can either remove the <resetAllowedKey>=false property from '<downstreamTable>' or add it to its upstream dependencies."
+    ],
+    "sqlState" : "42000"
+  },
   "INVALID_RESET_COMMAND_FORMAT" : {
     "message" : [
       "Expected format is 'RESET' or 'RESET key'. If you want to include special characters in key, please use quotes, e.g., RESET `key`."
@@ -4587,6 +4607,12 @@
     ],
     "sqlState" : "42K03"
   },
+  "PERSISTED_VIEW_READS_FROM_TEMPORARY_VIEW" : {
+    "message" : [
+      "Persisted view <persistedViewName> cannot reference temporary view <temporaryViewName> that will not be available outside the pipeline scope. Either make the persisted view temporary or persist the temporary view."
+    ],
+    "sqlState" : "42K0F"
+  },
   "PIPE_OPERATOR_AGGREGATE_EXPRESSION_CONTAINS_NO_AGGREGATE_FUNCTION" : {
     "message" : [
       "Non-grouping expression <expr> is provided as an argument to the |> AGGREGATE pipe operator but does not contain any aggregate function; please update it to include an aggregate function and then retry the query again."
@@ -5443,6 +5469,19 @@
     ],
     "sqlState" : "42KD9"
   },
+  "UNABLE_TO_INFER_PIPELINE_TABLE_SCHEMA" : {
+    "message" : [
+      "Failed to infer the schema for table <tableName> from its upstream flows.",
+      "Please modify the flows that write to this table to make their schemas compatible.",
+      "",
+      "Inferred schema so far:",
+      "<inferredDataSchema>",
+      "",
+      "Incompatible schema:",
+      "<incompatibleDataSchema>"
+    ],
+    "sqlState" : "42KD9"
+  },
   "UNBOUND_SQL_PARAMETER" : {
     "message" : [
       "Found the unbound parameter: <name>. Please, fix `args` and provide a mapping of the parameter to either a SQL literal or collection constructor functions such as `map()`, `array()`, `struct()`."
@@ -5608,6 +5647,12 @@
     ],
     "sqlState" : "42883"
   },
+  "UNRESOLVED_TABLE_PATH" : {
+    "message" : [
+      "Storage path for table <identifier> cannot be resolved."
+    ],
+    "sqlState" : "22KD1"
+  },
   "UNRESOLVED_USING_COLUMN_FOR_JOIN" : {
     "message" : [
       "USING column <colName> cannot be resolved on the <side> side of the join. The <side>-side columns: [<suggestion>]."

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DataflowGraph.scala

Lines changed: 5 additions & 2 deletions
@@ -192,12 +192,15 @@ case class DataflowGraph(flows: Seq[Flow], tables: Seq[Table], views: Seq[View])
     validateEveryDatasetHasFlow()
     validateTablesAreResettable()
     validateAppendOnceFlows()
+    // Ensures that all flows are resolved and have a valid schema.
     inferredSchema
   }.failed
 
-  /** Enforce every dataset has at least once input flow. For example its possible to define
+  /**
+   * Enforce every dataset has at least one input flow. For example its possible to define
    * streaming tables without a query; such tables should still have at least one flow
-   * writing to it. */
+   * writing to it.
+   */
   def validateEveryDatasetHasFlow(): Unit = {
     (tables.map(_.identifier) ++ views.map(_.identifier)).foreach { identifier =>
       if (!flows.exists(_.destinationIdentifier == identifier)) {
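Note: the reworked scaladoc states the invariant that validateEveryDatasetHasFlow enforces. A self-contained sketch of that check, with simplified stand-in types and a plain exception in place of the pipeline's real error:

    // Stand-in types; the real check runs over the graph's tables, views and flows.
    case class Id(name: String)
    case class SimpleFlow(identifier: Id, destinationIdentifier: Id)

    def validateEveryDatasetHasFlow(datasets: Seq[Id], flows: Seq[SimpleFlow]): Unit = {
      datasets.foreach { identifier =>
        if (!flows.exists(_.destinationIdentifier == identifier)) {
          // The pipeline raises a dedicated error here; IllegalStateException is a stand-in.
          throw new IllegalStateException(
            s"Dataset '${identifier.name}' is defined but has no flow writing to it.")
        }
      }
    }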

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DataflowGraphTransformer.scala

Lines changed: 8 additions & 4 deletions
@@ -42,7 +42,7 @@ import org.apache.spark.util.ThreadUtils
  * Assumptions:
  * 1. Each output will have at-least 1 flow to it.
  * 2. Each flow may or may not have a destination table. If a flow does not have a destination
- *    table, the destination is a view.
+ *    table, the destination is a temporary view.
  *
  * The way graph is structured is that flows, tables and sinks all are graph elements or nodes.
  * While we expose transformation functions for each of these entities, we also expose a way to
@@ -66,8 +66,7 @@ class DataflowGraphTransformer(graph: DataflowGraph) extends AutoCloseable {
   // Failed flows are flows that are failed to resolve or its inputs are not available or its
   // destination failed to resolve.
   private var failedFlows: Seq[ResolutionCompletedFlow] = Seq.empty
-  // We define a dataset is failed to resolve if:
-  // 1. It is a destination of a flow that is unresolved.
+  // We define a dataset is failed to resolve if it is a destination of a flow that is unresolved.
   private var failedTables: Seq[Table] = Seq.empty
 
   private val parallelism = 10
@@ -341,7 +340,7 @@
 object DataflowGraphTransformer {
 
   /**
-   * Exception thrown when a node in the graph fails to be transformed because at least one of its
+   * Exception thrown when transforming a node in the graph fails because at least one of its
    * dependencies weren't yet transformed.
    *
    * @param datasetIdentifier The identifier for an untransformed dependency table identifier in the
@@ -353,6 +352,11 @@
     extends Exception
     with NoStackTrace
 
+  /**
+   * Exception thrown when transforming a node in the graph fails with a non-retryable error.
+   *
+   * @param failedNode The failed node that could not be transformed.
+   */
   case class TransformNodeFailedException(failedNode: ResolutionFailedFlow)
     extends Exception
     with NoStackTrace
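Note: the new scaladoc distinguishes a retryable "dependency not yet transformed" failure from a non-retryable one. A self-contained sketch of how a caller might react to the two cases; only TransformNodeFailedException is named in this hunk, so the exception types and retry policy below are illustrative stand-ins:

    // Stand-ins mirroring the two documented failure modes.
    case class RetryableDependencyNotReady(dataset: String) extends Exception
    case class NodeFailed(node: String) extends Exception

    def transformWithRetry(transformOnce: () => Unit, maxAttempts: Int = 3): Unit = {
      var attempts = 0
      var done = false
      while (!done) {
        attempts += 1
        try {
          transformOnce()
          done = true
        } catch {
          case _: RetryableDependencyNotReady if attempts < maxAttempts =>
            // A dependency has not been transformed yet; try again on a later pass.
          case e: NodeFailed =>
            // Non-retryable: surface the failed node to the caller immediately.
            throw e
        }
      }
    }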

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala

Lines changed: 5 additions & 9 deletions
@@ -29,9 +29,7 @@ import org.apache.spark.sql.types.StructType
 /**
  * Contains the catalog and database context information for query execution.
  */
-case class QueryContext(
-    currentCatalog: Option[String],
-    currentDatabase: Option[String])
+case class QueryContext(currentCatalog: Option[String], currentDatabase: Option[String])
 
 /**
  * A [[Flow]] is a node of data transformation in a dataflow graph. It describes the movement
@@ -45,9 +43,7 @@ trait Flow extends GraphElement with Logging {
   val identifier: TableIdentifier
 
   /**
-   * The dataset that this Flow represents a write to. Since the DataflowGraph doesn't have a first-
-   * class concept of views, writing to a destination that isn't a Table or a Sink represents a
-   * view.
+   * The dataset that this Flow represents a write to.
    */
   val destinationIdentifier: TableIdentifier
 
@@ -65,7 +61,7 @@ trait Flow extends GraphElement with Logging {
   def sqlConf: Map[String, String]
 }
 
-/** A wrapper for a resolved internal input that includes the identifier used in SubqueryAlias */
+/** A wrapper for a resolved internal input that includes the alias provided by the user. */
 case class ResolvedInput(input: Input, aliasIdentifier: AliasIdentifier)
 
 /** A wrapper for the lambda function that defines a [[Flow]]. */
@@ -90,12 +86,12 @@ trait FlowFunction extends Logging {
 }
 
 /**
- * Holds the [[DataFrame]] returned by a [[FlowFunction]] along with the inputs used to
+ * Holds the DataFrame returned by a [[FlowFunction]] along with the inputs used to
  * construct it.
  * @param batchInputs the complete inputs read by the flow
  * @param streamingInputs the incremental inputs read by the flow
  * @param usedExternalInputs the identifiers of the external inputs read by the flow
- * @param dataFrame the [[DataFrame]] expression executed by the flow if the flow can be resolved
+ * @param dataFrame the DataFrame expression executed by the flow if the flow can be resolved
  */
 case class FlowFunctionResult(
     requestedInputs: Set[TableIdentifier],
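Note: QueryContext is now a single-line case class. Constructing one is straightforward (the values below are illustrative):

    // QueryContext simply carries the catalog/database that were current when the query was defined.
    val context = QueryContext(currentCatalog = Some("spark_catalog"), currentDatabase = Some("default"))

    // Either side can be absent, e.g. when no database has been selected yet.
    val catalogOnly = QueryContext(currentCatalog = Some("spark_catalog"), currentDatabase = None)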

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowAnalysis.scala

Lines changed: 13 additions & 4 deletions
@@ -28,7 +28,18 @@ import org.apache.spark.sql.pipelines.{AnalysisWarning, Language}
 import org.apache.spark.sql.pipelines.graph.GraphIdentifierManager.{ExternalDatasetIdentifier, InternalDatasetIdentifier}
 import org.apache.spark.sql.pipelines.util.{BatchReadOptions, InputReadOptions, StreamingReadOptions}
 
+
 object FlowAnalysis {
+  /**
+   * Creates a [[FlowFunction]] that attempts to analyze the provided LogicalPlan
+   * using the existing resolved inputs.
+   * - If all upstream inputs have been resolved, then analysis succeeds and the
+   *   function returns a [[FlowFunctionResult]] containing the dataframe.
+   * - If any upstream inputs are unresolved, then the function throws an exception.
+   *
+   * @param plan The user-supplied LogicalPlan defining a flow.
+   * @return A FlowFunction that attempts to analyze the provided LogicalPlan.
+   */
   def createFlowFunctionFromLogicalPlan(plan: LogicalPlan): FlowFunction = {
     new FlowFunction {
       override def call(
@@ -105,17 +116,15 @@
             context,
             name = IdentifierHelper.toQuotedString(u.multipartIdentifier),
             spark.readStream,
-            streamingReadOptions = StreamingReadOptions(
-              apiLanguage = Language.Sql()
-            )
+            streamingReadOptions = StreamingReadOptions()
           ).queryExecution.analyzed
 
         // Batch read on another dataset in the pipeline
         case u: UnresolvedRelation =>
           readBatchInput(
             context,
             name = IdentifierHelper.toQuotedString(u.multipartIdentifier),
-            batchReadOptions = BatchReadOptions(apiLanguage = Language.Sql())
+            batchReadOptions = BatchReadOptions()
           ).queryExecution.analyzed
       }
       Dataset.ofRows(spark, resolvedPlan)
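Note: the new scaladoc spells out the contract of createFlowFunctionFromLogicalPlan: analysis only succeeds once every upstream input is resolved. A hedged usage sketch (the SparkSession setup and the table name raw_events are illustrative, not from the commit):

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
    import org.apache.spark.sql.pipelines.graph.FlowAnalysis

    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    val plan: LogicalPlan =
      spark.sessionState.sqlParser.parsePlan("SELECT id, value FROM raw_events")

    // Nothing is analyzed yet; analysis happens when call() is invoked with the
    // currently resolved inputs, and succeeds only once `raw_events` is resolved.
    val flowFunction = FlowAnalysis.createFlowFunctionFromLogicalPlan(plan)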

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/GraphErrors.scala

Lines changed: 30 additions & 5 deletions
@@ -26,12 +26,15 @@ import org.apache.spark.sql.types.StructType
 /** Collection of errors that can be thrown during graph resolution / analysis. */
 object GraphErrors {
 
+  /**
+   * Throws when a dataset is marked as internal but is not defined in the graph.
+   *
+   * @param datasetName the name of the dataset that is not defined
+   */
   def pipelineLocalDatasetNotDefinedError(datasetName: String): SparkException = {
-    // TODO: this should be an internal error, as we never expect this to happen
-    new SparkException(
-      errorClass = "PIPELINE_LOCAL_DATASET_NOT_DEFINED",
-      messageParameters = Map("datasetName" -> datasetName),
-      cause = null
+    SparkException.internalError(
+      s"Failed to read dataset '$datasetName'. This dataset was expected to be " +
+        s"defined and created by the pipeline."
     )
   }
 
@@ -54,6 +57,11 @@
     )
   }
 
+  /**
+   * Throws when a table path is unresolved, i.e. the table identifier does not exist in the catalog.
+   *
+   * @param identifier the unresolved table identifier
+   */
   def unresolvedTablePath(identifier: TableIdentifier): SparkException = {
     new SparkException(
       errorClass = "UNRESOLVED_TABLE_PATH",
@@ -62,6 +70,11 @@
     )
   }
 
+  /**
+   * Throws an error if the user-specified schema and the inferred schema are not compatible.
+   *
+   * @param tableIdentifier the identifier of the table that was not found
+   */
   def incompatibleUserSpecifiedAndInferredSchemasError(
       tableIdentifier: TableIdentifier,
       datasetType: DatasetType,
@@ -92,6 +105,12 @@
     )
   }
 
+  /**
+   * Throws if the latest inferred schema for a pipeline table is not compatible with
+   * the table's existing schema.
+   *
+   * @param tableIdentifier the identifier of the table that was not found
+   */
   def unableToInferSchemaError(
       tableIdentifier: TableIdentifier,
       inferredSchema: StructType,
@@ -109,6 +128,12 @@
     )
   }
 
+  /**
+   * Throws an error when a persisted view is trying to read from a temporary view.
+   *
+   * @param persistedViewIdentifier the identifier of the persisted view
+   * @param temporaryViewIdentifier the identifier of the temporary view
+   */
   def persistedViewReadsFromTemporaryView(
       persistedViewIdentifier: TableIdentifier,
       temporaryViewIdentifier: TableIdentifier): AnalysisException = {
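Note: pipelineLocalDatasetNotDefinedError now reports an internal error instead of the removed PIPELINE_LOCAL_DATASET_NOT_DEFINED class, reflecting that it signals a pipeline bug rather than a user mistake. A small sketch of the observable difference (the dataset name is illustrative):

    import org.apache.spark.SparkException

    // SparkException.internalError wraps the message in the INTERNAL_ERROR error class.
    val e: SparkException = SparkException.internalError(
      "Failed to read dataset 'events_cleaned'. This dataset was expected to be " +
        "defined and created by the pipeline.")
    assert(e.getErrorClass == "INTERNAL_ERROR")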

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/GraphValidations.scala

Lines changed: 1 addition & 4 deletions
@@ -30,10 +30,7 @@ trait GraphValidations extends Logging {
   this: DataflowGraph =>
 
   /**
-   * Validate multi query table correctness. Exposed for Python unit testing, which currently cannot
-   * run anything which invokes the flow function as there's no persistent Python to run it.
-   *
-   * @return the multi-query tables by destination
+   * Validate multi query table correctness.
    */
   protected[pipelines] def validateMultiQueryTables(): Map[TableIdentifier, Seq[Flow]] = {
     val multiQueryTables = flowsTo.filter(_._2.size > 1)
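Note: validateMultiQueryTables keys off flowsTo, keeping only destinations with more than one flow writing to them. A tiny self-contained illustration with simplified stand-in types:

    // Destinations mapped to the flows that write to them (stand-in types).
    val flowsTo: Map[String, Seq[String]] = Map(
      "events_cleaned" -> Seq("flow_a"),
      "events_merged" -> Seq("flow_b", "flow_c"))

    val multiQueryTables = flowsTo.filter(_._2.size > 1)
    // Map("events_merged" -> Seq("flow_b", "flow_c"))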

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/PipelinesErrors.scala

Lines changed: 9 additions & 4 deletions
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.pipelines.graph
 
+import org.apache.spark.SparkException
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.TableIdentifier
@@ -38,8 +39,12 @@ case class UnresolvedDatasetException(identifier: TableIdentifier)
  * @param name The name of the table
  * @param cause The cause of the failure
  */
-case class LoadTableException(name: String, override val cause: Option[Throwable])
-    extends AnalysisException(s"Failed to load table '$name'", cause = cause)
+case class LoadTableException(name: String, cause: Option[Throwable])
+    extends SparkException(
+      errorClass = "INTERNAL_ERROR",
+      messageParameters = Map("message" -> s"Failed to load table '$name'"),
+      cause = cause.orNull
+    )
 
 /**
  * Exception raised when a pipeline has one or more flows that cannot be resolved
@@ -70,8 +75,8 @@ case class UnresolvedPipelineException(
          .sorted
          .mkString(", ")}
        |
-       |To view the exceptions that were raised while resolving these flows, look for FlowProgress
-       |logs with status FAILED that precede this log.""".stripMargin
+       |To view the exceptions that were raised while resolving these flows, look for flow
+       |failures that precede this log.""".stripMargin
   )
 
 /** A validation error that can either be thrown as an exception or logged as a warning. */
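Note: LoadTableException now extends SparkException under the INTERNAL_ERROR class instead of AnalysisException. A hedged sketch of constructing it, wrapping the underlying failure as the cause (the table name and cause are made up for illustration):

    val underlying = new RuntimeException("connection to metastore lost")
    val e = LoadTableException("prod.events.raw_events", Some(underlying))

    // The wrapped failure is preserved as the cause; the message is routed
    // through the INTERNAL_ERROR error class.
    assert(e.getCause == underlying)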

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/QueryOrigin.scala

Lines changed: 0 additions & 3 deletions
@@ -29,8 +29,6 @@ import org.apache.spark.sql.pipelines.logging.SourceCodeLocation
  *
  * @param language The language used by the user to define the query.
  * @param fileName The file name of the user code that defines the query.
- * @param cellNumber The cell number of the user code that defines the query.
- *                   Cell numbers are 1-indexed.
  * @param sqlText The SQL text of the query.
  * @param line The line number of the query in the user code.
  *             Line numbers are 1-indexed.
@@ -41,7 +39,6 @@ import org.apache.spark.sql.pipelines.logging.SourceCodeLocation
 case class QueryOrigin(
     language: Option[Language] = None,
     fileName: Option[String] = None,
-    cellNumber: Option[Int] = None,
     sqlText: Option[String] = None,
     line: Option[Int] = None,
     startPosition: Option[Int] = None,
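Note: with cellNumber gone, QueryOrigin is built from the remaining optional fields. An illustrative construction, assuming the parameters not shown in this hunk keep None defaults (values are made up):

    val origin = QueryOrigin(
      language = Some(Language.Sql()),
      fileName = Some("pipeline.sql"),
      sqlText = Some("CREATE STREAMING TABLE events AS SELECT * FROM stream(raw_events)"),
      line = Some(12),
      startPosition = Some(0))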

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/util/InputReadInfo.scala

Lines changed: 2 additions & 7 deletions
@@ -24,29 +24,24 @@ import org.apache.spark.sql.pipelines.util.StreamingReadOptions.EmptyUserOptions
 /**
  * Generic options for a read of an input.
  */
-sealed trait InputReadOptions {
-  // The language of the public API that called this function.
-  def apiLanguage: Language
-}
+sealed trait InputReadOptions
 
 /**
  * Options for a batch read of an input.
  *
  * @param apiLanguage The language of the public API that called this function.
  */
-final case class BatchReadOptions(apiLanguage: Language) extends InputReadOptions
+final case class BatchReadOptions() extends InputReadOptions
 
 /**
  * Options for a streaming read of an input.
 *
- * @param apiLanguage The language of the public API that called this function.
  * @param userOptions Holds the user defined read options.
 * @param droppedUserOptions Holds the options that were specified by the user but
 *                           not actually used. This is a bug but we are preserving this behavior
 *                           for now to avoid making a backwards incompatible change.
 */
 final case class StreamingReadOptions(
-    apiLanguage: Language,
     userOptions: CaseInsensitiveMap[String] = EmptyUserOptions,
     droppedUserOptions: CaseInsensitiveMap[String] = EmptyUserOptions
 ) extends InputReadOptions
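Note: with apiLanguage removed, both option classes can be constructed with defaults; StreamingReadOptions still carries the user's reader options. A short sketch (the option key is illustrative):

    import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap

    val batchOptions = BatchReadOptions()
    val streamingOptions = StreamingReadOptions(
      userOptions = CaseInsensitiveMap(Map("maxFilesPerTrigger" -> "100")))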
