@@ -457,16 +457,14 @@ def on_handshake_complete(fut: Future):
457
457
# Remove from futures dict
458
458
if engine_id in self ._handshake_futures :
459
459
del self ._handshake_futures [engine_id ]
460
- # The scheduler will retry them on the next cycle and
461
- # they'll be processed normally since the remote agent
462
- # is now registered.
463
460
if engine_id in self ._pending_requests :
464
461
completed_reqs = self ._pending_requests [engine_id ]
465
462
del self ._pending_requests [engine_id ]
463
+ for req_id , meta in completed_reqs :
464
+ self ._ready_requests .put ((req_id , meta ))
466
465
logger .debug (
467
466
"Handshake completed for engine %s. "
468
- "Cleared %d requests from pending - " \
469
- "scheduler to retry" ,
467
+ "Moved %d requests to ready queue for processing" ,
470
468
engine_id , len (completed_reqs ))
471
469
except Exception as e :
472
470
logger .warning ("Handshake failed for engine %s: %s" , engine_id ,
@@ -507,6 +505,10 @@ def _nixl_handshake(self, host: str, port: int):
507
505
logger .error ("Failed to fetch metadata from %s: %s" , url , e )
508
506
raise
509
507
508
+ if res is None :
509
+ logger .warning ("Remote server returned None metadata, skipping handshake" )
510
+ raise RuntimeError ("Remote server returned None metadata" )
511
+
510
512
remote_tp_size = len (res .keys ())
511
513
# Default case is that the remote TP size is 1, so we can
512
514
# directly access the metadata.
@@ -525,6 +527,7 @@ def _nixl_handshake(self, host: str, port: int):
525
527
if metadata_bytes is not None :
526
528
# Reconstruct NixlAgentMetadata from JSON response
527
529
# agent_metadata is base64-encoded binary data, not msgpack
530
+ tp_data .pop ("agent_metadata" , None )
528
531
metadata = NixlAgentMetadata (
529
532
agent_metadata = base64 .b64decode (metadata_bytes ), ** tp_data )
530
533
@@ -547,6 +550,7 @@ def _nixl_handshake(self, host: str, port: int):
547
550
logger .warning (
548
551
"Received None metadata from %s:%s, skipping NIXL handshake" ,
549
552
host , port )
553
+ raise RuntimeError ("Remote server does not support NIXL" )
550
554
551
555
logger .debug ("NIXL handshake method completed for %s:%s" , host , port )
552
556
@@ -834,12 +838,6 @@ def get_finished(self) -> KVTransferFinishedResult:
834
838
finished_recving = done_recving ,
835
839
pending_handshake = pending_handshake )
836
840
837
- if not local_result .is_empty ():
838
- logger .debug (
839
- "Rank %s, get_finished: %s requests done sending, "
840
- "%s requests done recving, %s pending handshake" , self .tp_rank ,
841
- len (done_sending ), len (done_recving ), len (pending_handshake ))
842
-
843
841
if self .world_size == 1 :
844
842
return local_result
845
843
@@ -939,26 +937,31 @@ def _pop_done_transfers(
939
937
return done_req_ids
940
938
941
939
def _process_ready_requests (self ):
942
- """Process requests that are ready after handshake completion.
943
-
944
- Note: With scheduler-based retry logic, this method is simplified
945
- as automatic retries are handled by the scheduler.
946
- """
947
- # Clear any remaining items in the ready queue to prevent memory leaks
940
+ """Process requests that are ready after handshake completion."""
941
+ processed_count = 0
948
942
while True :
949
943
try :
950
- self ._ready_requests .get_nowait ()
944
+ req_id , meta = self ._ready_requests .get_nowait ()
945
+ logger .debug ("Processing ready request %s for engine %s" ,
946
+ req_id , meta .remote_engine_id )
947
+ self ._read_blocks (
948
+ request_id = req_id ,
949
+ dst_engine_id = meta .remote_engine_id ,
950
+ local_block_ids = meta .local_block_ids ,
951
+ remote_block_ids = meta .remote_block_ids ,
952
+ )
953
+ processed_count += 1
951
954
except queue .Empty :
952
955
break
956
+
957
+ if processed_count > 0 :
958
+ logger .debug ("Processed %d ready requests" , processed_count )
953
959
954
960
def start_load_kv (self , metadata : NixlConnectorMetadata ):
955
961
"""
956
962
Start loading by triggering non-blocking nixl_xfer.
957
963
We check for these trnxs to complete in each step().
958
964
"""
959
- logger .debug ("start_load_kv called with %d requests" ,
960
- len (metadata .requests ))
961
-
962
965
for req_id , meta in metadata .requests .items ():
963
966
logger .debug (
964
967
"start_load_kv for request %s from remote engine %s. "
0 commit comments