Commit 4cb15fd

improved models and filters from cloudfetch-sea branch
Signed-off-by: varun-edachali-dbx <varun.edachali@databricks.com>
1 parent 0216d7a commit 4cb15fd

4 files changed (+187 -49 lines changed)

src/databricks/sql/backend/filters.py

Lines changed: 39 additions & 22 deletions
@@ -9,14 +9,20 @@
     List,
     Optional,
     Any,
+    Dict,
     Callable,
+    TypeVar,
+    Generic,
+    cast,
     TYPE_CHECKING,
 )

-if TYPE_CHECKING:
-    from databricks.sql.result_set import ResultSet
+from databricks.sql.utils import JsonQueue, SeaResultSetQueueFactory
+from databricks.sql.backend.types import ExecuteResponse, CommandId
+from databricks.sql.backend.sea.models.base import ResultData

-from databricks.sql.result_set import SeaResultSet
+if TYPE_CHECKING:
+    from databricks.sql.result_set import ResultSet, SeaResultSet

 logger = logging.getLogger(__name__)

@@ -43,26 +49,35 @@ def _filter_sea_result_set(
         Returns:
             A filtered SEA result set
         """
-        # Create a filtered version of the result set
-        filtered_response = result_set._response.copy()
-
-        # If there's a result with rows, filter them
-        if (
-            "result" in filtered_response
-            and "data_array" in filtered_response["result"]
-        ):
-            rows = filtered_response["result"]["data_array"]
-            filtered_rows = [row for row in rows if filter_func(row)]
-            filtered_response["result"]["data_array"] = filtered_rows
-
-            # Update row count if present
-            if "row_count" in filtered_response["result"]:
-                filtered_response["result"]["row_count"] = len(filtered_rows)
-
-        # Create a new result set with the filtered data
+        # Get all remaining rows
+        original_index = result_set.results.cur_row_index
+        result_set.results.cur_row_index = 0  # Reset to beginning
+        all_rows = result_set.results.remaining_rows()
+
+        # Filter rows
+        filtered_rows = [row for row in all_rows if filter_func(row)]
+
+        # Import SeaResultSet here to avoid circular imports
+        from databricks.sql.result_set import SeaResultSet
+
+        # Reuse the command_id from the original result set
+        command_id = result_set.command_id
+
+        # Create an ExecuteResponse with the filtered data
+        execute_response = ExecuteResponse(
+            command_id=command_id,
+            status=result_set.status,
+            description=result_set.description,
+            has_more_rows=result_set._has_more_rows,
+            results_queue=JsonQueue(filtered_rows),
+            has_been_closed_server_side=result_set.has_been_closed_server_side,
+            lz4_compressed=False,
+            is_staging_operation=False,
+        )
+
         return SeaResultSet(
             connection=result_set.connection,
-            sea_response=filtered_response,
+            execute_response=execute_response,
             sea_client=result_set.backend,
             buffer_size_bytes=result_set.buffer_size_bytes,
             arraysize=result_set.arraysize,
@@ -92,6 +107,8 @@ def filter_by_column_values(
             allowed_values = [v.upper() for v in allowed_values]

         # Determine the type of result set and apply appropriate filtering
+        from databricks.sql.result_set import SeaResultSet
+
         if isinstance(result_set, SeaResultSet):
             return ResultSetFilter._filter_sea_result_set(
                 result_set,
@@ -137,7 +154,7 @@ def filter_tables_by_type(
             table_types if table_types and len(table_types) > 0 else DEFAULT_TABLE_TYPES
         )

-        # Table type is typically in the 6th column (index 5)
+        # Table type is the 6th column (index 5)
         return ResultSetFilter.filter_by_column_values(
             result_set, 5, valid_types, case_sensitive=False
         )
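The rewritten _filter_sea_result_set no longer mutates the raw response dict: it drains the JsonQueue, filters the materialized rows, and wraps the survivors in a fresh ExecuteResponse. The sketch below isolates that row-filtering core on plain JSON_ARRAY rows; the helper name and the DEFAULT_TABLE_TYPES values are illustrative, not the library's API.

# Standalone sketch of the filtering logic above; names and defaults are assumptions.
from typing import Any, List

DEFAULT_TABLE_TYPES = ["TABLE", "VIEW", "SYSTEM TABLE"]  # assumed default set

def filter_rows_by_column(
    rows: List[List[Any]],
    column_index: int,
    allowed_values: List[str],
    case_sensitive: bool = False,
) -> List[List[Any]]:
    """Keep rows whose value at column_index is one of allowed_values."""
    if not case_sensitive:
        allowed_values = [v.upper() for v in allowed_values]

    def matches(row: List[Any]) -> bool:
        value = row[column_index]
        if not case_sensitive and isinstance(value, str):
            value = value.upper()
        return value in allowed_values

    return [row for row in rows if matches(row)]

rows = [
    ["cat", "sch", "t1", None, None, "TABLE"],
    ["cat", "sch", "v1", None, None, "view"],
    ["cat", "sch", "i1", None, None, "INDEX"],
]
# Table type sits at index 5, matching filter_tables_by_type above.
print(filter_rows_by_column(rows, 5, DEFAULT_TABLE_TYPES))
# keeps t1 (TABLE) and v1 ("view" matches case-insensitively)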

src/databricks/sql/backend/sea/models/base.py

Lines changed: 11 additions & 2 deletions
@@ -34,6 +34,12 @@ class ExternalLink:
     external_link: str
     expiration: str
     chunk_index: int
+    byte_count: int = 0
+    row_count: int = 0
+    row_offset: int = 0
+    next_chunk_index: Optional[int] = None
+    next_chunk_internal_link: Optional[str] = None
+    http_headers: Optional[Dict[str, str]] = None


 @dataclass
@@ -61,8 +67,11 @@ class ColumnInfo:
 class ResultManifest:
     """Manifest information for a result set."""

-    schema: List[ColumnInfo]
+    format: str
+    schema: Dict[str, Any]  # Will contain column information
     total_row_count: int
     total_byte_count: int
+    total_chunk_count: int
     truncated: bool = False
-    chunk_count: Optional[int] = None
+    chunks: Optional[List[Dict[str, Any]]] = None
+    result_compression: Optional[str] = None
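A minimal sketch of the extended ExternalLink, assuming this commit's checkout is importable; all values are invented. The new next_chunk_index / next_chunk_internal_link fields appear intended to let a fetcher follow the chunk chain link by link, with the count fields defaulting to 0 for payloads that omit them.

from databricks.sql.backend.sea.models.base import ExternalLink

link = ExternalLink(
    external_link="https://storage.example.com/chunk-0?sig=...",  # illustrative URL
    expiration="2025-01-01T00:00:00Z",
    chunk_index=0,
    byte_count=1024,
    row_count=100,
    row_offset=0,
    next_chunk_index=1,  # presumably None on the final chunk
    http_headers={"Authorization": "Bearer ..."},  # optional per-link headers
)

# next_chunk_internal_link was left at its None default above.
assert link.next_chunk_internal_link is None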

src/databricks/sql/backend/sea/models/requests.py

Lines changed: 4 additions & 12 deletions
@@ -21,18 +21,16 @@ class StatementParameter:
 class ExecuteStatementRequest:
     """Request to execute a SQL statement."""

-    warehouse_id: str
-    statement: str
     session_id: str
+    statement: str
+    warehouse_id: str
     disposition: str = "EXTERNAL_LINKS"
     format: str = "JSON_ARRAY"
+    result_compression: Optional[str] = None
+    parameters: Optional[List[StatementParameter]] = None
     wait_timeout: str = "10s"
     on_wait_timeout: str = "CONTINUE"
     row_limit: Optional[int] = None
-    parameters: Optional[List[StatementParameter]] = None
-    catalog: Optional[str] = None
-    schema: Optional[str] = None
-    result_compression: Optional[str] = None

     def to_dict(self) -> Dict[str, Any]:
         """Convert the request to a dictionary for JSON serialization."""
@@ -49,12 +47,6 @@ def to_dict(self) -> Dict[str, Any]:
         if self.row_limit is not None and self.row_limit > 0:
             result["row_limit"] = self.row_limit

-        if self.catalog:
-            result["catalog"] = self.catalog
-
-        if self.schema:
-            result["schema"] = self.schema
-
         if self.result_compression:
             result["result_compression"] = self.result_compression

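A quick sketch of the reordered request dataclass, assuming this commit's checkout; the IDs and compression codec are placeholders. Unset optionals are skipped by to_dict, and the dropped catalog/schema fields can no longer leak into the payload.

from databricks.sql.backend.sea.models.requests import ExecuteStatementRequest

request = ExecuteStatementRequest(
    session_id="session-01ef",        # placeholder session handle
    statement="SELECT 1",
    warehouse_id="warehouse-abc123",  # placeholder warehouse ID
    result_compression="LZ4_FRAME",   # placeholder codec name
)

payload = request.to_dict()
assert payload["result_compression"] == "LZ4_FRAME"
assert "catalog" not in payload and "schema" not in payload  # removed fields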

src/databricks/sql/backend/sea/models/responses.py

Lines changed: 133 additions & 13 deletions
@@ -13,6 +13,8 @@
     ResultManifest,
     ResultData,
     ServiceError,
+    ExternalLink,
+    ColumnInfo,
 )


@@ -37,20 +39,62 @@ def from_dict(cls, data: Dict[str, Any]) -> "ExecuteStatementResponse":
                 error_code=error_data.get("error_code"),
             )

-        state = CommandState.from_sea_state(status_data.get("state", ""))
-        if state is None:
-            raise ValueError(f"Invalid state: {status_data.get('state', '')}")
         status = StatementStatus(
-            state=state,
+            state=CommandState.from_sea_state(status_data.get("state", "")),
             error=error,
             sql_state=status_data.get("sql_state"),
         )

+        # Parse manifest
+        manifest = None
+        if "manifest" in data:
+            manifest_data = data["manifest"]
+            manifest = ResultManifest(
+                format=manifest_data.get("format", ""),
+                schema=manifest_data.get("schema", {}),
+                total_row_count=manifest_data.get("total_row_count", 0),
+                total_byte_count=manifest_data.get("total_byte_count", 0),
+                total_chunk_count=manifest_data.get("total_chunk_count", 0),
+                truncated=manifest_data.get("truncated", False),
+                chunks=manifest_data.get("chunks"),
+                result_compression=manifest_data.get("result_compression"),
+            )
+
+        # Parse result data
+        result = None
+        if "result" in data:
+            result_data = data["result"]
+            external_links = None
+
+            if "external_links" in result_data:
+                external_links = []
+                for link_data in result_data["external_links"]:
+                    external_links.append(
+                        ExternalLink(
+                            external_link=link_data.get("external_link", ""),
+                            expiration=link_data.get("expiration", ""),
+                            chunk_index=link_data.get("chunk_index", 0),
+                            byte_count=link_data.get("byte_count", 0),
+                            row_count=link_data.get("row_count", 0),
+                            row_offset=link_data.get("row_offset", 0),
+                            next_chunk_index=link_data.get("next_chunk_index"),
+                            next_chunk_internal_link=link_data.get(
+                                "next_chunk_internal_link"
+                            ),
+                            http_headers=link_data.get("http_headers"),
+                        )
+                    )
+
+            result = ResultData(
+                data=result_data.get("data_array"),
+                external_links=external_links,
+            )
+
         return cls(
             statement_id=data.get("statement_id", ""),
             status=status,
-            manifest=data.get("manifest"),  # We'll parse this more fully if needed
-            result=data.get("result"),  # We'll parse this more fully if needed
+            manifest=manifest,
+            result=result,
         )


@@ -75,21 +119,62 @@ def from_dict(cls, data: Dict[str, Any]) -> "GetStatementResponse":
                 error_code=error_data.get("error_code"),
             )

-        state = CommandState.from_sea_state(status_data.get("state", ""))
-        if state is None:
-            raise ValueError(f"Invalid state: {status_data.get('state', '')}")
-
         status = StatementStatus(
-            state=state,
+            state=CommandState.from_sea_state(status_data.get("state", "")),
             error=error,
             sql_state=status_data.get("sql_state"),
         )

+        # Parse manifest
+        manifest = None
+        if "manifest" in data:
+            manifest_data = data["manifest"]
+            manifest = ResultManifest(
+                format=manifest_data.get("format", ""),
+                schema=manifest_data.get("schema", {}),
+                total_row_count=manifest_data.get("total_row_count", 0),
+                total_byte_count=manifest_data.get("total_byte_count", 0),
+                total_chunk_count=manifest_data.get("total_chunk_count", 0),
+                truncated=manifest_data.get("truncated", False),
+                chunks=manifest_data.get("chunks"),
+                result_compression=manifest_data.get("result_compression"),
+            )
+
+        # Parse result data
+        result = None
+        if "result" in data:
+            result_data = data["result"]
+            external_links = None
+
+            if "external_links" in result_data:
+                external_links = []
+                for link_data in result_data["external_links"]:
+                    external_links.append(
+                        ExternalLink(
+                            external_link=link_data.get("external_link", ""),
+                            expiration=link_data.get("expiration", ""),
+                            chunk_index=link_data.get("chunk_index", 0),
+                            byte_count=link_data.get("byte_count", 0),
+                            row_count=link_data.get("row_count", 0),
+                            row_offset=link_data.get("row_offset", 0),
+                            next_chunk_index=link_data.get("next_chunk_index"),
+                            next_chunk_internal_link=link_data.get(
+                                "next_chunk_internal_link"
+                            ),
+                            http_headers=link_data.get("http_headers"),
+                        )
+                    )
+
+            result = ResultData(
+                data=result_data.get("data_array"),
+                external_links=external_links,
+            )
+
         return cls(
             statement_id=data.get("statement_id", ""),
             status=status,
-            manifest=data.get("manifest"),  # We'll parse this more fully if needed
-            result=data.get("result"),  # We'll parse this more fully if needed
+            manifest=manifest,
+            result=result,
         )


@@ -103,3 +188,38 @@ class CreateSessionResponse:
     def from_dict(cls, data: Dict[str, Any]) -> "CreateSessionResponse":
         """Create a CreateSessionResponse from a dictionary."""
         return cls(session_id=data.get("session_id", ""))
+
+
+@dataclass
+class GetChunksResponse:
+    """Response from getting chunks for a statement."""
+
+    statement_id: str
+    external_links: List[ExternalLink]
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "GetChunksResponse":
+        """Create a GetChunksResponse from a dictionary."""
+        external_links = []
+        if "external_links" in data:
+            for link_data in data["external_links"]:
+                external_links.append(
+                    ExternalLink(
+                        external_link=link_data.get("external_link", ""),
+                        expiration=link_data.get("expiration", ""),
+                        chunk_index=link_data.get("chunk_index", 0),
+                        byte_count=link_data.get("byte_count", 0),
+                        row_count=link_data.get("row_count", 0),
+                        row_offset=link_data.get("row_offset", 0),
+                        next_chunk_index=link_data.get("next_chunk_index"),
+                        next_chunk_internal_link=link_data.get(
+                            "next_chunk_internal_link"
+                        ),
+                        http_headers=link_data.get("http_headers"),
+                    )
+                )
+
+        return cls(
+            statement_id=data.get("statement_id", ""),
+            external_links=external_links,
+        )