@@ -320,37 +320,26 @@ def __init__(
320
320
self ._statement_id = statement_id
321
321
self ._total_chunk_count = total_chunk_count
322
322
323
- # Track the current chunk we're processing
324
- self ._current_chunk_index : Optional [int ] = None
325
- self ._current_chunk_link : Optional ["ExternalLink" ] = None
326
-
327
323
logger .debug (
328
324
"SeaCloudFetchQueue: Initialize CloudFetch loader for statement {}, total chunks: {}" .format (
329
325
statement_id , total_chunk_count
330
326
)
331
327
)
332
328
333
- if initial_links :
334
- initial_links = []
335
- # logger.debug("SeaCloudFetchQueue: Initial links provided:")
336
- # for link in initial_links:
337
- # logger.debug(
338
- # "- chunk: {}, row offset: {}, row count: {}, next chunk: {}".format(
339
- # link.chunk_index,
340
- # link.row_offset,
341
- # link.row_count,
342
- # link.next_chunk_index,
343
- # )
344
- # )
345
-
346
- # Initialize download manager with initial links
329
+ initial_link = next ((l for l in initial_links if l .chunk_index == 0 ), None )
330
+ if not initial_link :
331
+ raise ValueError ("No initial link found for chunk index 0" )
332
+
347
333
self .download_manager = ResultFileDownloadManager (
348
- links = self . _convert_to_thrift_links ( initial_links ) ,
334
+ links = [] ,
349
335
max_download_threads = max_download_threads ,
350
- lz4_compressed = self . lz4_compressed ,
351
- ssl_options = self . _ssl_options ,
336
+ lz4_compressed = lz4_compressed ,
337
+ ssl_options = ssl_options ,
352
338
)
353
339
340
+ # Track the current chunk we're processing
341
+ self ._current_chunk_link : Optional ["ExternalLink" ] = initial_link
342
+
354
343
# Initialize table and position
355
344
self .table = self ._create_next_table ()
356
345
if self .table :
@@ -360,129 +349,60 @@ def __init__(
360
349
)
361
350
)
362
351
363
- def _convert_to_thrift_links (
364
- self , links : List ["ExternalLink" ]
365
- ) -> List [TSparkArrowResultLink ]:
352
+ def _convert_to_thrift_link (self , link : "ExternalLink" ) -> TSparkArrowResultLink :
366
353
"""Convert SEA external links to Thrift format for compatibility with existing download manager."""
367
- if not links :
368
- logger .debug ("SeaCloudFetchQueue: No links to convert to Thrift format" )
369
- return []
370
-
371
- logger .debug (
372
- "SeaCloudFetchQueue: Converting {} links to Thrift format" .format (
373
- len (links )
374
- )
375
- )
376
- thrift_links = []
377
- for link in links :
378
- # Parse the ISO format expiration time
379
- expiry_time = int (dateutil .parser .parse (link .expiration ).timestamp ())
380
-
381
- thrift_link = TSparkArrowResultLink (
382
- fileLink = link .external_link ,
383
- expiryTime = expiry_time ,
384
- rowCount = link .row_count ,
385
- bytesNum = link .byte_count ,
386
- startRowOffset = link .row_offset ,
387
- httpHeaders = link .http_headers or {},
388
- )
389
- thrift_links .append (thrift_link )
390
- return thrift_links
354
+ if not link :
355
+ logger .debug ("SeaCloudFetchQueue: No link to convert to Thrift format" )
356
+ return None
391
357
392
- def _fetch_chunk_link (self , chunk_index : int ) -> Optional ["ExternalLink" ]:
393
- """Fetch link for the specified chunk index."""
394
358
logger .debug (
395
- "SeaCloudFetchQueue: Fetching chunk {} using SEA client " .format (chunk_index )
359
+ "SeaCloudFetchQueue: Converting link to Thrift format " .format (link )
396
360
)
397
361
398
- # Use the SEA client to fetch the chunk links
399
- link = self . _sea_client . get_chunk_link ( self . _statement_id , chunk_index )
362
+ # Parse the ISO format expiration time
363
+ expiry_time = int ( dateutil . parser . parse ( link . expiration ). timestamp () )
400
364
401
- logger . debug (
402
- "SeaCloudFetchQueue: Link details for chunk {}: row_offset={}, row_count={}, next_chunk_index={}" . format (
403
- link . chunk_index ,
404
- link .row_offset ,
405
- link .row_count ,
406
- link .next_chunk_index ,
407
- )
365
+ return TSparkArrowResultLink (
366
+ fileLink = link . external_link ,
367
+ expiryTime = expiry_time ,
368
+ rowCount = link .row_count ,
369
+ bytesNum = link .byte_count ,
370
+ startRowOffset = link .row_offset ,
371
+ httpHeaders = link . http_headers or {},
408
372
)
409
373
410
- if self .download_manager :
411
- self .download_manager .add_links (self ._convert_to_thrift_links ([link ]))
412
-
413
- return link
414
-
415
374
def _create_next_table (self ) -> Union ["pyarrow.Table" , None ]:
416
375
"""Create next table by retrieving the logical next downloaded file."""
417
- # if we're still processing the current table, just return it
418
- if self .table is not None and self .table_row_index < self .table .num_rows :
419
- logger .info (
420
- "SeaCloudFetchQueue: Still processing current table, rows left: {}" .format (
421
- self .table .num_rows - self .table_row_index
422
- )
423
- )
424
- return self .table
376
+ logger .debug (
377
+ f"SeaCloudFetchQueue: Creating next table, current chunk link: { self ._current_chunk_link } "
378
+ )
425
379
426
- # if we've reached the end of the response, return None
427
- if (
428
- self ._current_chunk_link
429
- and self ._current_chunk_link .next_chunk_index is None
430
- ):
431
- logger .info (
432
- "SeaCloudFetchQueue: Reached end of chunks (no next chunk index)"
433
- )
380
+ if not self ._current_chunk_link :
381
+ logger .debug ("SeaCloudFetchQueue: No current chunk link, returning None" )
434
382
return None
435
383
436
- # Determine the next chunk index
437
- next_chunk_index = (
438
- 0
439
- if self ._current_chunk_link is None
440
- else self ._current_chunk_link .next_chunk_index
441
- )
442
- if next_chunk_index is None :
443
- logger .info (
444
- "SeaCloudFetchQueue: Reached end of chunks (next_chunk_index is None)"
384
+ if self .download_manager :
385
+ self .download_manager .add_link (
386
+ self ._convert_to_thrift_link (self ._current_chunk_link )
445
387
)
446
- return None
447
388
448
- logger .info (
449
- "SeaCloudFetchQueue: Trying to get downloaded file for chunk {}" .format (
450
- next_chunk_index
451
- )
452
- )
389
+ row_offset = self ._current_chunk_link .row_offset
390
+ arrow_table = self ._create_table_at_offset (row_offset )
453
391
454
- # Update current chunk to the next one
455
- self ._current_chunk_index = next_chunk_index
392
+ next_chunk_index = self . _current_chunk_link . next_chunk_index
393
+ self ._current_chunk_link = None
456
394
try :
457
- self ._current_chunk_link = self ._fetch_chunk_link (next_chunk_index )
395
+ self ._current_chunk_link = self ._sea_client .get_chunk_link (
396
+ self ._statement_id , next_chunk_index
397
+ )
458
398
except Exception as e :
459
399
logger .error (
460
400
"SeaCloudFetchQueue: Error fetching link for chunk {}: {}" .format (
461
- self . _current_chunk_index , e
401
+ next_chunk_index , e
462
402
)
463
403
)
464
- return None
465
- if not self ._current_chunk_link :
466
- logger .error (
467
- "SeaCloudFetchQueue: No link found for chunk {}" .format (
468
- self ._current_chunk_index
469
- )
470
- )
471
- return None
472
404
473
- # Get the data for the current chunk
474
- row_offset = self ._current_chunk_link .row_offset
475
-
476
- logger .info (
477
- "SeaCloudFetchQueue: Current chunk details - index: {}, row_offset: {}, row_count: {}, next_chunk_index: {}" .format (
478
- self ._current_chunk_link .chunk_index ,
479
- self ._current_chunk_link .row_offset ,
480
- self ._current_chunk_link .row_count ,
481
- self ._current_chunk_link .next_chunk_index ,
482
- )
483
- )
484
-
485
- return self ._create_table_at_offset (row_offset )
405
+ return arrow_table
486
406
487
407
488
408
class ThriftCloudFetchQueue (CloudFetchQueue ):
0 commit comments