Skip to content

Commit 75c5a62

Browse files
Fixed multi-chunk result fetching in the SEA backend
Signed-off-by: varun-edachali-dbx <varun.edachali@databricks.com>
1 parent d7ab57f commit 75c5a62

File tree

3 files changed

+413
-24
lines changed

3 files changed

+413
-24
lines changed

examples/experimental/sea_connector_test.py

Lines changed: 259 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -320,8 +320,8 @@ def test_sea_result_set_arrow_external_links():
320320
# Execute a query that returns a large result set (will use EXTERNAL_LINKS disposition)
321321
# Use a larger result set to ensure multiple chunks
322322
# Using a CROSS JOIN to generate a larger result set
323-
logger.info("Executing query: SELECT a.id as id1, b.id as id2 FROM range(1, 1000) a CROSS JOIN range(1, 1000) b LIMIT 100000")
324-
cursor.execute("SELECT a.id as id1, b.id as id2 FROM range(1, 1000) a CROSS JOIN range(1, 1000) b LIMIT 100000")
323+
logger.info("Executing query: SELECT a.id as id1, b.id as id2, CONCAT(CAST(a.id AS STRING), '-', CAST(b.id AS STRING)) as concat_str FROM range(1, 1000) a CROSS JOIN range(1, 1000) b LIMIT 500000")
324+
cursor.execute("SELECT a.id as id1, b.id as id2, CONCAT(CAST(a.id AS STRING), '-', CAST(b.id AS STRING)) as concat_str FROM range(1, 1000) a CROSS JOIN range(1, 1000) b LIMIT 500000")
325325

326326
# Test the manifest to verify we're getting multiple chunks
327327
# We can't easily access the manifest in the SeaResultSet, so we'll just continue with the test
@@ -387,6 +387,259 @@ def test_sea_result_set_arrow_external_links():
387387
logger.info("SEA result set test with ARROW format and EXTERNAL_LINKS disposition completed successfully")
388388

389389

390+
def test_sea_result_set_with_multiple_chunks():
    """
    Test the SEA result set implementation with multiple chunks.

    Connects to a Databricks SQL endpoint using the SEA backend, executes a
    query large enough to be returned in multiple EXTERNAL_LINKS chunks, and
    verifies that sequential fetches (fetchone/fetchmany/fetchall) and a bulk
    fetchall_arrow both return every row exactly once (no gaps, no duplicates).

    Exits the process with status 1 when required environment variables are
    missing or when the test fails.
    """
    server_hostname = os.environ.get("DATABRICKS_SERVER_HOSTNAME")
    http_path = os.environ.get("DATABRICKS_HTTP_PATH")
    access_token = os.environ.get("DATABRICKS_TOKEN")
    catalog = os.environ.get("DATABRICKS_CATALOG", "samples")
    schema = os.environ.get("DATABRICKS_SCHEMA", "default")

    if not all([server_hostname, http_path, access_token]):
        logger.error("Missing required environment variables.")
        logger.error(
            "Please set DATABRICKS_SERVER_HOSTNAME, DATABRICKS_HTTP_PATH, and DATABRICKS_TOKEN."
        )
        sys.exit(1)

    try:
        # Create connection with SEA backend
        logger.info("Creating connection with SEA backend...")
        connection = Connection(
            server_hostname=server_hostname,
            http_path=http_path,
            access_token=access_token,
            catalog=catalog,
            schema=schema,
            use_sea=True,
            use_cloud_fetch=True,  # Enable cloud fetch to trigger EXTERNAL_LINKS + ARROW
            user_agent_entry="SEA-Test-Client",
            # Use a smaller arraysize to potentially force multiple chunks
            arraysize=1000,
        )

        logger.info(
            f"Successfully opened SEA session with ID: {connection.get_session_id_hex()}"
        )

        # Create cursor
        cursor = connection.cursor()

        # Execute the query that we know returns multiple chunks from interactive-sea testing
        logger.info("Executing query that returns multiple chunks...")
        query = """
        WITH large_dataset AS (
            SELECT
                id,
                id * 2 as double_id,
                id * 3 as triple_id,
                concat('value_', repeat(cast(id as string), 100)) as large_string_value,
                array_repeat(id, 50) as large_array_value,
                rand() as random_val,
                current_timestamp() as current_time
            FROM range(1, 100000) AS t(id)
        )
        SELECT * FROM large_dataset
        """
        cursor.execute(query)

        # BUGFIX: initialize manifest defaults up front so the verification
        # code below cannot raise NameError when the backend is not a
        # SeaDatabricksClient or no statement ID is available.
        manifest = {}

        # Attempt to access the manifest to check for multiple chunks
        from databricks.sql.backend.sea_backend import SeaDatabricksClient

        if isinstance(connection.session.backend, SeaDatabricksClient):
            # Get the statement ID from the cursor's active result set
            statement_id = cursor.active_result_set.statement_id
            if statement_id:
                # Make a direct request to get the statement status
                response_data = connection.session.backend.http_client._make_request(
                    method="GET",
                    path=f"/api/2.0/sql/statements/{statement_id}",
                )

                # Check if we have multiple chunks
                manifest = response_data.get("manifest", {})
                total_chunk_count = manifest.get("total_chunk_count", 0)
                truncated = manifest.get("truncated", False)

                logger.info(f"Total chunk count: {total_chunk_count}")
                logger.info(f"Result truncated: {truncated}")

                # Log chunk information
                chunks = manifest.get("chunks", [])
                for i, chunk in enumerate(chunks):
                    logger.info(
                        f"Chunk {i}: index={chunk.get('chunk_index')}, "
                        f"rows={chunk.get('row_count')}, bytes={chunk.get('byte_count')}"
                    )

                # Log the next_chunk_index from the first external link
                result_data = response_data.get("result", {})
                external_links = result_data.get("external_links", [])
                if external_links:
                    first_link = external_links[0]
                    logger.info(f"First link next_chunk_index: {first_link.get('next_chunk_index')}")
                    logger.info(f"First link next_chunk_internal_link: {first_link.get('next_chunk_internal_link')}")

        # Test fetchone
        logger.info("Testing fetchone...")
        row = cursor.fetchone()
        logger.info(f"First row: {row}")

        # Test fetchmany with a size that spans multiple chunks
        fetch_size = 30000  # This should span at least 2 chunks based on our test
        logger.info(f"Testing fetchmany({fetch_size})...")
        rows = cursor.fetchmany(fetch_size)
        logger.info(f"Fetched {len(rows)} rows with fetchmany")
        first_batch_count = len(rows)

        # Test another fetchmany to get more chunks
        logger.info(f"Testing another fetchmany({fetch_size})...")
        more_rows = cursor.fetchmany(fetch_size)
        logger.info(f"Fetched {len(more_rows)} more rows with fetchmany")
        second_batch_count = len(more_rows)

        # Test fetchall for remaining rows
        logger.info("Testing fetchall...")
        remaining_rows = cursor.fetchall()
        logger.info(f"Fetched {len(remaining_rows)} remaining rows with fetchall")
        remaining_count = len(remaining_rows)

        # Verify results using row IDs instead of row counts
        # Calculate the sum of rows from the manifest chunks (0 when no manifest)
        manifest_rows_sum = sum(chunk.get('row_count', 0) for chunk in manifest.get('chunks', []))
        logger.info(f"Expected rows from manifest chunks: {manifest_rows_sum}")

        # Collect all row IDs to check for duplicates and completeness
        all_row_ids = set()
        # Total rows fetched across all calls, counting duplicates; compared
        # against the number of unique IDs below to detect duplicated rows.
        total_fetched = (
            (1 if row is not None else 0)
            + first_batch_count
            + second_batch_count
            + remaining_count
        )

        # Add the first row's ID
        if row is not None and hasattr(row, 'id'):
            all_row_ids.add(row.id)
            first_id = row.id
            logger.info(f"First row ID: {first_id}")

        # Add IDs from first batch
        if rows and hasattr(rows[0], 'id'):
            batch_ids = [r.id for r in rows if hasattr(r, 'id')]
            all_row_ids.update(batch_ids)
            logger.info(f"First batch: {len(rows)} rows, ID range {min(batch_ids)} to {max(batch_ids)}")

        # Add IDs from second batch
        if more_rows and hasattr(more_rows[0], 'id'):
            batch_ids = [r.id for r in more_rows if hasattr(r, 'id')]
            all_row_ids.update(batch_ids)
            logger.info(f"Second batch: {len(more_rows)} rows, ID range {min(batch_ids)} to {max(batch_ids)}")

        # Add IDs from remaining rows
        if remaining_rows and hasattr(remaining_rows[0], 'id'):
            batch_ids = [r.id for r in remaining_rows if hasattr(r, 'id')]
            all_row_ids.update(batch_ids)
            logger.info(f"Remaining batch: {len(remaining_rows)} rows, ID range {min(batch_ids)} to {max(batch_ids)}")

        # Check for completeness and duplicates
        if all_row_ids:
            min_id = min(all_row_ids)
            max_id = max(all_row_ids)
            expected_count = max_id - min_id + 1
            actual_count = len(all_row_ids)

            logger.info(f"Row ID range: {min_id} to {max_id}")
            logger.info(f"Expected unique IDs in range: {expected_count}")
            logger.info(f"Actual unique IDs collected: {actual_count}")

            if expected_count == actual_count:
                logger.info("✅ All rows fetched correctly with no gaps")
            else:
                logger.warning("⚠️ Gap detected in row IDs")

            # BUGFIX: the original compared actual_count == len(all_row_ids),
            # which is always true. Compare the total number of rows fetched
            # (including duplicates) against the unique-ID count instead.
            if total_fetched == actual_count:
                logger.info("✅ No duplicate row IDs detected")
            else:
                logger.warning("⚠️ Duplicate row IDs detected")

            # Check if we got all expected rows
            if max_id == manifest_rows_sum:
                logger.info("✅ Last row ID matches expected row count from manifest")

        # Let's try one more time with a fresh cursor to fetch all rows at once
        logger.info("\nTesting fetchall_arrow with a fresh cursor...")
        new_cursor = connection.cursor()
        new_cursor.execute(query)

        try:
            # Fetch all rows as Arrow
            arrow_table = new_cursor.fetchall_arrow()
            logger.info(f"Arrow table num rows: {arrow_table.num_rows}")
            logger.info(f"Arrow table columns: {arrow_table.column_names}")

            # Get the ID column if it exists
            if 'id' in arrow_table.column_names:
                id_column = arrow_table.column('id').to_pylist()
                logger.info(f"First 5 rows of id column: {id_column[:5]}")
                logger.info(f"Last 5 rows of id column: {id_column[-5:]}")

                # Check for completeness and duplicates in Arrow results
                arrow_id_set = set(id_column)
                arrow_min_id = min(id_column)
                arrow_max_id = max(id_column)
                arrow_expected_count = arrow_max_id - arrow_min_id + 1
                arrow_actual_count = len(arrow_id_set)

                logger.info(f"Arrow result row ID range: {arrow_min_id} to {arrow_max_id}")
                logger.info(f"Arrow result expected unique IDs: {arrow_expected_count}")
                logger.info(f"Arrow result actual unique IDs: {arrow_actual_count}")

                if arrow_expected_count == arrow_actual_count:
                    logger.info("✅ Arrow results: All rows fetched correctly with no gaps")
                else:
                    logger.warning("⚠️ Arrow results: Gap detected in row IDs")

                # BUGFIX: the original compared len(arrow_id_set) to itself.
                # Compare the total row count against the unique-ID count.
                if len(id_column) == arrow_actual_count:
                    logger.info("✅ Arrow results: No duplicate row IDs detected")
                else:
                    logger.warning("⚠️ Arrow results: Duplicate row IDs detected")

                # Compare with manifest row count
                if arrow_max_id == manifest_rows_sum:
                    logger.info("✅ Arrow results: Last row ID matches expected row count from manifest")

                # Compare with sequential fetch results
                if arrow_id_set == all_row_ids:
                    logger.info("✅ Arrow and sequential fetch results contain exactly the same row IDs")
                else:
                    logger.warning("⚠️ Arrow and sequential fetch results contain different row IDs")
                    only_in_arrow = arrow_id_set - all_row_ids
                    only_in_sequential = all_row_ids - arrow_id_set
                    if only_in_arrow:
                        logger.warning(f"IDs only in Arrow results: {len(only_in_arrow)} rows")
                    if only_in_sequential:
                        logger.warning(f"IDs only in sequential fetch: {len(only_in_sequential)} rows")

            # Check if we got all rows
            logger.info(f"Expected rows from manifest chunks: {manifest_rows_sum}")
            logger.info(f"Actual rows in arrow table: {arrow_table.num_rows}")
        except Exception as e:
            logger.error(f"Error fetching all rows as Arrow: {e}")
        finally:
            # BUGFIX: close the fresh cursor even if fetchall_arrow raises.
            new_cursor.close()

        # Close cursor and connection
        cursor.close()
        connection.close()
        logger.info("Successfully closed SEA session")

    except Exception as e:
        logger.error(f"Error during SEA result set test: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())
        sys.exit(1)

    logger.info("SEA result set test with multiple chunks completed successfully")
390643
if __name__ == "__main__":
391644
# Test session management
392645
# test_sea_session()
@@ -395,4 +648,7 @@ def test_sea_result_set_arrow_external_links():
395648
# test_sea_result_set_json_array_inline()
396649

397650
# Test result set implementation with ARROW format and EXTERNAL_LINKS disposition
398-
test_sea_result_set_arrow_external_links()
651+
# test_sea_result_set_arrow_external_links()
652+
653+
# Test result set implementation with multiple chunks
654+
test_sea_result_set_with_multiple_chunks()

src/databricks/sql/cloudfetch/download_manager.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ def __init__(
2424
ssl_options: SSLOptions,
2525
):
2626
self._pending_links: List[TSparkArrowResultLink] = []
27+
# Add a cache to store downloaded files by row offset
28+
self._downloaded_files_cache = {}
29+
2730
for link in links:
2831
if link.rowCount <= 0:
2932
continue
@@ -56,24 +59,60 @@ def get_next_downloaded_file(
5659
Args:
5760
next_row_offset (int): The offset of the starting row of the next file we want data from.
5861
"""
62+
logger.info(f"ResultFileDownloadManager: get_next_downloaded_file for row offset {next_row_offset}")
63+
64+
# Check if we have this file in the cache
65+
if next_row_offset in self._downloaded_files_cache:
66+
logger.info(f"ResultFileDownloadManager: Found file in cache for row offset {next_row_offset}")
67+
return self._downloaded_files_cache[next_row_offset]
5968

6069
# Make sure the download queue is always full
6170
self._schedule_downloads()
6271

6372
# No more files to download from this batch of links
6473
if len(self._download_tasks) == 0:
74+
logger.info("ResultFileDownloadManager: No more download tasks")
6575
self._shutdown_manager()
6676
return None
6777

78+
# Log all pending download tasks
79+
logger.info(f"ResultFileDownloadManager: {len(self._download_tasks)} download tasks pending")
80+
81+
# Find the task that matches the requested row offset
82+
matching_task_index = None
83+
for i, task in enumerate(self._download_tasks):
84+
if task.done():
85+
try:
86+
file = task.result(timeout=0) # Don't block
87+
logger.info(f"Task {i}: start_row_offset={file.start_row_offset}, row_count={file.row_count}")
88+
if file.start_row_offset == next_row_offset:
89+
matching_task_index = i
90+
break
91+
except Exception as e:
92+
logger.error(f"Error getting task result: {e}")
93+
94+
# If we found a matching task, use it
95+
if matching_task_index is not None:
96+
logger.info(f"ResultFileDownloadManager: Found matching task at index {matching_task_index}")
97+
task = self._download_tasks.pop(matching_task_index)
98+
file = task.result()
99+
# Cache the file for future use
100+
self._downloaded_files_cache[file.start_row_offset] = file
101+
return file
102+
103+
# Otherwise, just use the first task
68104
task = self._download_tasks.pop(0)
69105
# Future's `result()` method will wait for the call to complete, and return
70106
# the value returned by the call. If the call throws an exception - `result()`
71107
# will throw the same exception
72108
file = task.result()
109+
# Cache the file for future use
110+
self._downloaded_files_cache[file.start_row_offset] = file
111+
73112
if (next_row_offset < file.start_row_offset) or (
74113
next_row_offset > file.start_row_offset + file.row_count
75114
):
76-
logger.debug(
115+
logger.warning(
77116
"ResultFileDownloadManager: file does not contain row {}, start {}, row count {}".format(
78117
next_row_offset, file.start_row_offset, file.row_count
79118
)

0 commit comments

Comments
 (0)