remove reliance on schema_bytes in SEA

varun-edachali-dbx · varun-edachali-dbx · commit 015fb7616fcd · 2025-06-16T06:30:50.000Z
Signed-off-by: varun-edachali-dbx <varun.edachali@databricks.com>
diff --git a/src/databricks/sql/backend/sea/backend.py b/src/databricks/sql/backend/sea/backend.py
@@ -331,74 +331,6 @@ def get_chunk_link(self, statement_id: str, chunk_index: int) -> "ExternalLink":
 
         return link
 
-    def _get_schema_bytes(self, sea_response) -> Optional[bytes]:
-        """
-        Extract schema bytes from the SEA response.
-
-        For ARROW format, we need to get the schema bytes from the first chunk.
-        If the first chunk is not available, we need to get it from the server.
-
-        Args:
-            sea_response: The response from the SEA API
-
-        Returns:
-            bytes: The schema bytes or None if not available
-        """
-        import requests
-        import lz4.frame
-
-        # Check if we have the first chunk in the response
-        result_data = sea_response.get("result", {})
-        external_links = result_data.get("external_links", [])
-
-        if not external_links:
-            return None
-
-        # Find the first chunk (chunk_index = 0)
-        first_chunk = None
-        for link in external_links:
-            if link.get("chunk_index") == 0:
-                first_chunk = link
-                break
-
-        if not first_chunk:
-            # Try to fetch the first chunk from the server
-            statement_id = sea_response.get("statement_id")
-            if not statement_id:
-                return None
-
-            chunks_response = self.get_chunk_links(statement_id, 0)
-            if not chunks_response.external_links:
-                return None
-
-            first_chunk = chunks_response.external_links[0].__dict__
-
-        # Download the first chunk to get the schema bytes
-        external_link = first_chunk.get("external_link")
-        http_headers = first_chunk.get("http_headers", {})
-
-        if not external_link:
-            return None
-
-        # Use requests to download the first chunk
-        http_response = requests.get(
-            external_link,
-            headers=http_headers,
-            verify=self.ssl_options.tls_verify,
-        )
-
-        if http_response.status_code != 200:
-            raise Error(f"Failed to download schema bytes: {http_response.text}")
-
-        # Extract schema bytes from the Arrow file
-        # The schema is at the beginning of the file
-        data = http_response.content
-        if sea_response.get("manifest", {}).get("result_compression") == "LZ4_FRAME":
-            data = lz4.frame.decompress(data)
-
-        # Return the schema bytes
-        return data
-
     def _results_message_to_execute_response(self, sea_response, command_id):
         """
         Convert a SEA response to an ExecuteResponse and extract result data.
@@ -441,13 +373,6 @@ def _results_message_to_execute_response(self, sea_response, command_id):
                 )
             description = columns if columns else None
 
-        # Extract schema bytes for Arrow format
-        schema_bytes = None
-        format = manifest_data.get("format")
-        if format == "ARROW_STREAM":
-            # For ARROW format, we need to get the schema bytes
-            schema_bytes = self._get_schema_bytes(sea_response)
-
         # Check for compression
         lz4_compressed = manifest_data.get("result_compression") == "LZ4_FRAME"
 
@@ -502,7 +427,7 @@ def _results_message_to_execute_response(self, sea_response, command_id):
             has_been_closed_server_side=False,
             lz4_compressed=lz4_compressed,
             is_staging_operation=False,
-            arrow_schema_bytes=schema_bytes,
+            arrow_schema_bytes=None,
             result_format=manifest_data.get("format"),
         )
 
diff --git a/src/databricks/sql/cloud_fetch_queue.py b/src/databricks/sql/cloud_fetch_queue.py
@@ -285,7 +285,6 @@ class SeaCloudFetchQueue(CloudFetchQueue):
     def __init__(
         self,
         initial_links: List["ExternalLink"],
-        schema_bytes: bytes,
         max_download_threads: int,
         ssl_options: SSLOptions,
         sea_client: "SeaDatabricksClient",
@@ -309,7 +308,7 @@ def __init__(
             description: Column descriptions
         """
         super().__init__(
-            schema_bytes=schema_bytes,
+            schema_bytes=b"",
             max_download_threads=max_download_threads,
             ssl_options=ssl_options,
             lz4_compressed=lz4_compressed,
@@ -344,10 +343,6 @@ def __init__(
 
     def _convert_to_thrift_link(self, link: "ExternalLink") -> TSparkArrowResultLink:
         """Convert SEA external links to Thrift format for compatibility with existing download manager."""
-        logger.debug(
-            "SeaCloudFetchQueue: Converting link to Thrift format".format(link)
-        )
-
         # Parse the ISO format expiration time
         expiry_time = int(dateutil.parser.parse(link.expiration).timestamp())
         return TSparkArrowResultLink(
@@ -470,9 +465,9 @@ def _create_next_table(self) -> Union["pyarrow.Table", None]:
         arrow_table = self._create_table_at_offset(self.start_row_index)
         if arrow_table:
             self.start_row_index += arrow_table.num_rows
-        logger.debug(
-            "ThriftCloudFetchQueue: Found downloaded file, row count: {}, new start offset: {}".format(
-                arrow_table.num_rows, self.start_row_index
+            logger.debug(
+                "ThriftCloudFetchQueue: Found downloaded file, row count: {}, new start offset: {}".format(
+                    arrow_table.num_rows, self.start_row_index
+                )
             )
-        )
         return arrow_table
diff --git a/src/databricks/sql/result_set.py b/src/databricks/sql/result_set.py
@@ -497,9 +497,6 @@ def __init__(
                 manifest,
                 str(self.statement_id),
                 description=desc,
-                schema_bytes=execute_response.arrow_schema_bytes
-                if execute_response.arrow_schema_bytes
-                else None,
                 max_download_threads=sea_client.max_download_threads,
                 ssl_options=sea_client.ssl_options,
                 sea_client=sea_client,
diff --git a/src/databricks/sql/utils.py b/src/databricks/sql/utils.py
@@ -132,7 +132,6 @@ def build_queue(
         manifest: Optional[ResultManifest],
         statement_id: str,
         description: Optional[List[Tuple[Any, ...]]] = None,
-        schema_bytes: Optional[bytes] = None,
         max_download_threads: Optional[int] = None,
         ssl_options: Optional[SSLOptions] = None,
         sea_client: Optional["SeaDatabricksClient"] = None,
@@ -146,7 +145,6 @@ def build_queue(
             manifest (ResultManifest): Manifest from SEA response
             statement_id (str): Statement ID for the query
             description (List[List[Any]]): Column descriptions
-            schema_bytes (bytes): Arrow schema bytes
             max_download_threads (int): Maximum number of download threads
             ssl_options (SSLOptions): SSL options for downloads
             sea_client (SeaDatabricksClient): SEA client for fetching additional links
@@ -160,10 +158,6 @@ def build_queue(
             return JsonQueue(sea_result_data.data)
         elif sea_result_data.external_links is not None:
             # EXTERNAL_LINKS disposition
-            if not schema_bytes:
-                raise ValueError(
-                    "Schema bytes are required for EXTERNAL_LINKS disposition"
-                )
             if not max_download_threads:
                 raise ValueError(
                     "Max download threads is required for EXTERNAL_LINKS disposition"
@@ -181,7 +175,6 @@ def build_queue(
 
             return SeaCloudFetchQueue(
                 initial_links=sea_result_data.external_links,
-                schema_bytes=schema_bytes,
                 max_download_threads=max_download_threads,
                 ssl_options=ssl_options,
                 sea_client=sea_client,