Add fetchmany_arrow and fetchall_arrow; remove SEA _get_schema_bytes helper

varun-edachali-dbx · varun-edachali-dbx · commit a0705bc455dd · 2025-06-17T03:23:59.000Z
Signed-off-by: varun-edachali-dbx <varun.edachali@databricks.com>
diff --git a/src/databricks/sql/backend/sea/backend.py b/src/databricks/sql/backend/sea/backend.py
@@ -302,74 +302,6 @@ def get_allowed_session_configurations() -> List[str]:
         """
         return list(ALLOWED_SESSION_CONF_TO_DEFAULT_VALUES_MAP.keys())
 
-    def _get_schema_bytes(self, sea_response) -> Optional[bytes]:
-        """
-        Extract schema bytes from the SEA response.
-
-        For ARROW format, we need to get the schema bytes from the first chunk.
-        If the first chunk is not available, we need to get it from the server.
-
-        Args:
-            sea_response: The response from the SEA API
-
-        Returns:
-            bytes: The schema bytes or None if not available
-        """
-        import requests
-        import lz4.frame
-
-        # Check if we have the first chunk in the response
-        result_data = sea_response.get("result", {})
-        external_links = result_data.get("external_links", [])
-
-        if not external_links:
-            return None
-
-        # Find the first chunk (chunk_index = 0)
-        first_chunk = None
-        for link in external_links:
-            if link.get("chunk_index") == 0:
-                first_chunk = link
-                break
-
-        if not first_chunk:
-            # Try to fetch the first chunk from the server
-            statement_id = sea_response.get("statement_id")
-            if not statement_id:
-                return None
-
-            chunks_response = self.get_chunk_links(statement_id, 0)
-            if not chunks_response.external_links:
-                return None
-
-            first_chunk = chunks_response.external_links[0].__dict__
-
-        # Download the first chunk to get the schema bytes
-        external_link = first_chunk.get("external_link")
-        http_headers = first_chunk.get("http_headers", {})
-
-        if not external_link:
-            return None
-
-        # Use requests to download the first chunk
-        http_response = requests.get(
-            external_link,
-            headers=http_headers,
-            verify=self.ssl_options.tls_verify,
-        )
-
-        if http_response.status_code != 200:
-            raise Error(f"Failed to download schema bytes: {http_response.text}")
-
-        # Extract schema bytes from the Arrow file
-        # The schema is at the beginning of the file
-        data = http_response.content
-        if sea_response.get("manifest", {}).get("result_compression") == "LZ4_FRAME":
-            data = lz4.frame.decompress(data)
-
-        # Return the schema bytes
-        return data
-
     def _results_message_to_execute_response(self, sea_response, command_id):
         """
         Convert a SEA response to an ExecuteResponse and extract result data.
@@ -412,13 +344,6 @@ def _results_message_to_execute_response(self, sea_response, command_id):
                 )
             description = columns if columns else None
 
-        # Extract schema bytes for Arrow format
-        schema_bytes = None
-        format = manifest_data.get("format")
-        if format == "ARROW_STREAM":
-            # For ARROW format, we need to get the schema bytes
-            schema_bytes = self._get_schema_bytes(sea_response)
-
         # Check for compression
         lz4_compressed = manifest_data.get("result_compression") == "LZ4_FRAME"
 
@@ -473,7 +398,7 @@ def _results_message_to_execute_response(self, sea_response, command_id):
             has_been_closed_server_side=False,
             lz4_compressed=lz4_compressed,
             is_staging_operation=False,
-            arrow_schema_bytes=schema_bytes,
+            arrow_schema_bytes=None,
             result_format=manifest_data.get("format"),
         )
 
diff --git a/src/databricks/sql/result_set.py b/src/databricks/sql/result_set.py
@@ -154,6 +154,16 @@ def fetchall(self) -> List[Row]:
         """Fetch all remaining rows of a query result."""
         pass
 
+    @abstractmethod
+    def fetchmany_arrow(self, size: int) -> "pyarrow.Table":
+        """Fetch the next batch of rows (``size`` requested) as an Arrow table."""
+        pass
+
+    @abstractmethod
+    def fetchall_arrow(self) -> "pyarrow.Table":
+        """Fetch every row not yet consumed as a single Arrow table."""
+        pass
+
     def close(self) -> None:
         """
         Close the result set.
@@ -537,6 +547,37 @@ def fetchall_json(self):
 
         return results
 
+    def fetchmany_arrow(self, size: int) -> "pyarrow.Table":
+        """
+        Fetch the next set of rows as an Arrow table and advance the row index.
+
+        Args:
+            size: Number of rows to fetch; must be >= 0
+
+        Returns:
+            PyArrow Table produced by ``self.results.next_n_rows(size)``
+
+        Raises:
+            ImportError: If PyArrow is not installed (no check in this body; TODO confirm)
+            ValueError: If size is negative
+        """
+        if size < 0:
+            raise ValueError(f"size argument for fetchmany is {size} but must be >= 0")
+
+        results = self.results.next_n_rows(size)
+        self._next_row_index += results.num_rows  # advance by the rows in the returned table
+
+        return results
+
+    def fetchall_arrow(self) -> "pyarrow.Table":
+        """
+        Fetch all remaining rows as an Arrow table and advance the row index.
+        """
+        results = self.results.remaining_rows()  # whatever the underlying queue still holds
+        self._next_row_index += results.num_rows  # advance by the rows in the returned table
+
+        return results
+
     def fetchone(self) -> Optional[Row]:
         """
         Fetch the next row of a query result set, returning a single sequence,