Skip to content

Commit 71b451a

Browse files
minimal fetch phase intro
Signed-off-by: varun-edachali-dbx <varun.edachali@databricks.com>
1 parent 33821f4 commit 71b451a

File tree

4 files changed

+186
-15
lines changed

4 files changed

+186
-15
lines changed

examples/experimental/tests/test_sea_sync_query.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,9 @@ def test_sea_sync_query_without_cloud_fetch():
122122
cursor.execute("SELECT 1 as test_value")
123123
logger.info("Query executed successfully with cloud fetch disabled")
124124

125+
rows = cursor.fetchall()
126+
logger.info(f"Rows: {rows}")
127+
125128
# Close resources
126129
cursor.close()
127130
connection.close()

src/databricks/sql/backend/thrift_backend.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,11 @@
4242
)
4343

4444
from databricks.sql.utils import (
45-
ResultSetQueueFactory,
45+
ThriftResultSetQueueFactory,
4646
_bound,
4747
RequestErrorInfo,
4848
NoRetryReason,
49-
ResultSetQueueFactory,
49+
ThriftResultSetQueueFactory,
5050
convert_arrow_based_set_to_arrow_table,
5151
convert_decimals_in_arrow_table,
5252
convert_column_based_set_to_arrow_table,
@@ -784,7 +784,7 @@ def _results_message_to_execute_response(self, resp, operation_state):
784784
assert direct_results.resultSet.results.startRowOffset == 0
785785
assert direct_results.resultSetMetadata
786786

787-
arrow_queue_opt = ResultSetQueueFactory.build_queue(
787+
arrow_queue_opt = ThriftResultSetQueueFactory.build_queue(
788788
row_set_type=t_result_set_metadata_resp.resultFormat,
789789
t_row_set=direct_results.resultSet.results,
790790
arrow_schema_bytes=schema_bytes,
@@ -857,7 +857,7 @@ def get_execution_result(
857857
else:
858858
schema_bytes = None
859859

860-
queue = ResultSetQueueFactory.build_queue(
860+
queue = ThriftResultSetQueueFactory.build_queue(
861861
row_set_type=resp.resultSetMetadata.resultFormat,
862862
t_row_set=resp.results,
863863
arrow_schema_bytes=schema_bytes,
@@ -1225,7 +1225,7 @@ def fetch_results(
12251225
)
12261226
)
12271227

1228-
queue = ResultSetQueueFactory.build_queue(
1228+
queue = ThriftResultSetQueueFactory.build_queue(
12291229
row_set_type=resp.resultSetMetadata.resultFormat,
12301230
t_row_set=resp.results,
12311231
arrow_schema_bytes=arrow_schema_bytes,

src/databricks/sql/result_set.py

Lines changed: 111 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pandas
77

88
from databricks.sql.backend.sea.backend import SeaDatabricksClient
9+
from databricks.sql.backend.sea.models.base import ResultData, ResultManifest
910

1011
try:
1112
import pyarrow
@@ -19,7 +20,7 @@
1920
from databricks.sql.thrift_api.TCLIService import ttypes
2021
from databricks.sql.types import Row
2122
from databricks.sql.exc import Error, RequestError, CursorAlreadyClosedError
22-
from databricks.sql.utils import ColumnTable, ColumnQueue
23+
from databricks.sql.utils import ColumnTable, ColumnQueue, JsonQueue, SeaResultSetQueueFactory
2324
from databricks.sql.backend.types import CommandId, CommandState, ExecuteResponse
2425

2526
logger = logging.getLogger(__name__)
@@ -441,6 +442,14 @@ def __init__(
441442
sea_response: Direct SEA response (legacy style)
442443
"""
443444

445+
queue = SeaResultSetQueueFactory.build_queue(
446+
sea_result_data=execute_response.results_data,
447+
manifest=execute_response.results_manifest,
448+
statement_id=execute_response.command_id.to_sea_statement_id(),
449+
description=execute_response.description,
450+
schema_bytes=execute_response.arrow_schema_bytes,
451+
)
452+
444453
super().__init__(
445454
connection=connection,
446455
backend=sea_client,
@@ -450,42 +459,135 @@ def __init__(
450459
status=execute_response.status,
451460
has_been_closed_server_side=execute_response.has_been_closed_server_side,
452461
has_more_rows=execute_response.has_more_rows,
453-
results_queue=execute_response.results_queue,
462+
results_queue=queue,
454463
description=execute_response.description,
455464
is_staging_operation=execute_response.is_staging_operation,
456465
)
466+
467+
def _convert_to_row_objects(self, rows):
468+
"""
469+
Convert raw data rows to Row objects with named columns based on description.
470+
471+
Args:
472+
rows: List of raw data rows
473+
474+
Returns:
475+
List of Row objects with named columns
476+
"""
477+
if not self.description or not rows:
478+
return rows
479+
480+
column_names = [col[0] for col in self.description]
481+
ResultRow = Row(*column_names)
482+
return [ResultRow(*row) for row in rows]
457483

458484
def _fill_results_buffer(self):
459485
"""Fill the results buffer from the backend."""
460-
raise NotImplementedError("fetchone is not implemented for SEA backend")
486+
return None
487+
488+
def _convert_rows_to_arrow_table(self, rows):
489+
"""Convert rows to Arrow table."""
490+
if not self.description:
491+
return pyarrow.Table.from_pylist([])
492+
493+
# Create dict of column data
494+
column_data = {}
495+
column_names = [col[0] for col in self.description]
496+
497+
for i, name in enumerate(column_names):
498+
column_data[name] = [row[i] for row in rows]
499+
500+
return pyarrow.Table.from_pydict(column_data)
501+
502+
def _create_empty_arrow_table(self):
503+
"""Create an empty Arrow table with the correct schema."""
504+
if not self.description:
505+
return pyarrow.Table.from_pylist([])
506+
507+
column_names = [col[0] for col in self.description]
508+
return pyarrow.Table.from_pydict({name: [] for name in column_names})
461509

462510
def fetchone(self) -> Optional[Row]:
463511
"""
464512
Fetch the next row of a query result set, returning a single sequence,
465513
or None when no more data is available.
466514
"""
467-
468-
raise NotImplementedError("fetchone is not implemented for SEA backend")
515+
if isinstance(self.results, JsonQueue):
516+
rows = self.results.next_n_rows(1)
517+
if not rows:
518+
return None
519+
520+
# Convert to Row object
521+
converted_rows = self._convert_to_row_objects(rows)
522+
return converted_rows[0] if converted_rows else None
523+
else:
524+
raise NotImplementedError("Unsupported queue type")
469525

470526
def fetchmany(self, size: Optional[int] = None) -> List[Row]:
471527
"""
472528
Fetch the next set of rows of a query result, returning a list of rows.
473529
474530
An empty sequence is returned when no more rows are available.
475531
"""
532+
if size is None:
533+
size = self.arraysize
534+
535+
if size < 0:
536+
raise ValueError(f"size argument for fetchmany is {size} but must be >= 0")
537+
538+
# Note: We check for the specific queue type to maintain consistency with ThriftResultSet
539+
if isinstance(self.results, JsonQueue):
540+
rows = self.results.next_n_rows(size)
541+
self._next_row_index += len(rows)
476542

477-
raise NotImplementedError("fetchmany is not implemented for SEA backend")
543+
# Convert to Row objects
544+
return self._convert_to_row_objects(rows)
545+
else:
546+
raise NotImplementedError("Unsupported queue type")
478547

479548
def fetchall(self) -> List[Row]:
480549
"""
481550
Fetch all (remaining) rows of a query result, returning them as a list of rows.
482551
"""
483-
raise NotImplementedError("fetchall is not implemented for SEA backend")
552+
# Note: We check for the specific queue type to maintain consistency with ThriftResultSet
553+
if isinstance(self.results, JsonQueue):
554+
rows = self.results.remaining_rows()
555+
self._next_row_index += len(rows)
556+
557+
# Convert to Row objects
558+
return self._convert_to_row_objects(rows)
559+
else:
560+
raise NotImplementedError("Unsupported queue type")
484561

485562
def fetchmany_arrow(self, size: int) -> Any:
486563
"""Fetch the next set of rows as an Arrow table."""
487-
raise NotImplementedError("fetchmany_arrow is not implemented for SEA backend")
564+
if not pyarrow:
565+
raise ImportError("PyArrow is required for Arrow support")
566+
567+
if isinstance(self.results, JsonQueue):
568+
rows = self.fetchmany(size)
569+
if not rows:
570+
# Return empty Arrow table with schema
571+
return self._create_empty_arrow_table()
572+
573+
# Convert rows to Arrow table
574+
return self._convert_rows_to_arrow_table(rows)
575+
else:
576+
raise NotImplementedError("Unsupported queue type")
488577

489578
def fetchall_arrow(self) -> Any:
490579
"""Fetch all remaining rows as an Arrow table."""
491-
raise NotImplementedError("fetchall_arrow is not implemented for SEA backend")
580+
if not pyarrow:
581+
raise ImportError("PyArrow is required for Arrow support")
582+
583+
if isinstance(self.results, JsonQueue):
584+
rows = self.fetchall()
585+
if not rows:
586+
# Return empty Arrow table with schema
587+
return self._create_empty_arrow_table()
588+
589+
# Convert rows to Arrow table
590+
return self._convert_rows_to_arrow_table(rows)
591+
else:
592+
raise NotImplementedError("Unsupported queue type")
593+

src/databricks/sql/utils.py

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313

1414
import lz4.frame
1515

16+
from databricks.sql.backend.sea.backend import SeaDatabricksClient
17+
from databricks.sql.backend.sea.models.base import ResultData, ResultManifest
18+
1619
try:
1720
import pyarrow
1821
except ImportError:
@@ -48,7 +51,7 @@ def remaining_rows(self):
4851
pass
4952

5053

51-
class ResultSetQueueFactory(ABC):
54+
class ThriftResultSetQueueFactory(ABC):
5255
@staticmethod
5356
def build_queue(
5457
row_set_type: TSparkRowSetType,
@@ -106,6 +109,69 @@ def build_queue(
106109
else:
107110
raise AssertionError("Row set type is not valid")
108111

112+
class SeaResultSetQueueFactory(ABC):
113+
@staticmethod
114+
def build_queue(
115+
sea_result_data: ResultData,
116+
manifest: Optional[ResultManifest],
117+
statement_id: str,
118+
description: Optional[List[Tuple[Any, ...]]] = None,
119+
schema_bytes: Optional[bytes] = None,
120+
max_download_threads: Optional[int] = None,
121+
ssl_options: Optional[SSLOptions] = None,
122+
sea_client: Optional["SeaDatabricksClient"] = None,
123+
lz4_compressed: bool = False,
124+
) -> ResultSetQueue:
125+
"""
126+
Factory method to build a result set queue for SEA backend.
127+
128+
Args:
129+
sea_result_data (ResultData): Result data from SEA response
130+
manifest (Optional[ResultManifest]): Manifest from SEA response, if available
131+
statement_id (str): Statement ID for the query
132+
description (Optional[List[Tuple[Any, ...]]]): Column descriptions
133+
schema_bytes (Optional[bytes]): Arrow schema bytes, if available
134+
max_download_threads (int): Maximum number of download threads
135+
ssl_options (SSLOptions): SSL options for downloads
136+
sea_client (SeaDatabricksClient): SEA client for fetching additional links
137+
lz4_compressed (bool): Whether the data is LZ4 compressed
138+
139+
Returns:
140+
ResultSetQueue: The appropriate queue for the result data
141+
"""
142+
143+
if sea_result_data.data is not None:
144+
# INLINE disposition with JSON_ARRAY format
145+
return JsonQueue(sea_result_data.data)
146+
elif sea_result_data.external_links is not None:
147+
# EXTERNAL_LINKS disposition
148+
raise NotImplementedError("EXTERNAL_LINKS disposition is not implemented for SEA backend")
149+
else:
150+
# Empty result set
151+
return JsonQueue([])
152+
153+
154+
class JsonQueue(ResultSetQueue):
155+
"""Queue implementation for JSON_ARRAY format data."""
156+
157+
def __init__(self, data_array):
158+
"""Initialize with JSON array data."""
159+
self.data_array = data_array
160+
self.cur_row_index = 0
161+
self.n_valid_rows = len(data_array)
162+
163+
def next_n_rows(self, num_rows):
164+
"""Get the next n rows from the data array."""
165+
length = min(num_rows, self.n_valid_rows - self.cur_row_index)
166+
slice = self.data_array[self.cur_row_index : self.cur_row_index + length]
167+
self.cur_row_index += length
168+
return slice
169+
170+
def remaining_rows(self):
171+
"""Get all remaining rows from the data array."""
172+
slice = self.data_array[self.cur_row_index :]
173+
self.cur_row_index += len(slice)
174+
return slice
109175

110176
class ColumnTable:
111177
def __init__(self, column_table, column_names):

0 commit comments

Comments
 (0)