HabanaAI · xinyu-intel · Jun 23, 2025
@@ -908,6 +908,10 @@ def get_num_unfinished_requests(self) -> int:
     def has_unfinished_requests(self,
                                 virtual_engine: Optional[int] = None) -> bool:
         """Returns True if there are unfinished requests."""
+        # Skip DP sync if PD disaggregation is enabled (ignores virtual_engine)
+        if self.vllm_config.kv_transfer_config is not None:
+            return any(scheduler.has_unfinished_seqs()
+                       for scheduler in self.scheduler)
         if virtual_engine is not None:
             schedulers = [self.scheduler[virtual_engine]]
         else:
@@ -1330,7 +1334,8 @@ def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
                 "Pipeline parallelism is only supported through AsyncLLMEngine "
                 "as performance will be severely degraded otherwise.")
 
-        if self.should_execute_dummy_batch:
+        if self.vllm_config.kv_transfer_config is None\
+            and self.should_execute_dummy_batch:
             self.should_execute_dummy_batch = False
             outputs = self.model_executor.execute_model(
                 execute_model_req=ExecuteModelRequest(
@@ -1453,6 +1458,11 @@ def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
         else:
             # Nothing scheduled => If there is pending async postprocessor,
             # then finish it here.
+            if self.vllm_config.kv_transfer_config is not None\
+                and self.need_to_sync_across_dp:
+                self.model_executor.execute_model(
+                    execute_model_req=ExecuteModelRequest(
+                        seq_group_metadata_list=[], is_dummy_batch=True))
             if len(ctx.output_queue) > 0:
                 self._process_model_outputs(ctx=ctx)
             # No outputs in this case

@@ -37,6 +37,7 @@
 logger = init_logger(__name__)
 
 POLLING_TIMEOUT_MS = 10000
+POLLING_TIMEOUT_MS_PD_DP = 1000
 HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), )
 
 
@@ -207,12 +208,25 @@ def run_engine_loop(self):
         while True:
             if not self.engine.has_unfinished_requests():
                 # Poll until there is work to do.
-                while self.input_socket.poll(timeout=POLLING_TIMEOUT_MS) == 0:
-                    # When there's no work, check on engine health and send
-                    # health status back to client
-                    self._health_check()
-                    self.engine.do_log_stats()
-                    logger.debug("Waiting for new requests in engine loop.")
+                if self.engine.vllm_config.kv_transfer_config is not None\
+                    and self.engine.need_to_sync_across_dp:
+                    if self.input_socket.poll(
+                            timeout=POLLING_TIMEOUT_MS_PD_DP) == 0:
+                        # When there's no work, check on engine health and send
+                        # health status back to client
+                        self._health_check()
+                        self.engine.do_log_stats()
+                        logger.debug(
+                            "Waiting for new requests in engine loop.")
+                else:
+                    while self.input_socket.poll(
+                            timeout=POLLING_TIMEOUT_MS) == 0:
+                        # When there's no work, check on engine health and send
+                        # health status back to client
+                        self._health_check()
+                        self.engine.do_log_stats()
+                        logger.debug(
+                            "Waiting for new requests in engine loop.")
 
             # Handle any input from the client.
             self.handle_new_input()

@@ -1105,7 +1105,8 @@ def _add_dummy_seq(self,
         real_batch_size = len(seq_group_metadata_list)
         batch_size_padded = self.bucketing_ctx.get_padded_batch_size(
             real_batch_size, is_prompt)
-        if self.dp_awared_padding:
+        if self.dp_awared_padding and (self.vllm_config.kv_transfer_config
+                                       is None or not is_prompt):
             if self.is_driver_worker:
                 batch_size_padded = align_dp_groups(
                     batch_size_padded, torch.distributed.ReduceOp.MAX)
@@ -1495,7 +1496,8 @@ def _prepare_prompt(
             self.bucketing_ctx.get_padded_prompt_seq_len(target_query_len),
             self.block_size)
 
-        if self.dp_awared_padding:
+        if self.dp_awared_padding and\
+            self.vllm_config.kv_transfer_config is None:
             if self.is_driver_worker:
                 max_prompt_len = align_dp_groups(
                     max_prompt_len, torch.distributed.ReduceOp.MAX)
@@ -2448,6 +2450,11 @@ def create_dummy_seq_group_metadata(self,
                                      lora_request=lora_request)
 
     def profile_run(self) -> None:
+        # Skip profile run on decode (KV-consumer) instances
+        if self.vllm_config.kv_transfer_config is not None and\
+            self.vllm_config.kv_transfer_config.is_kv_consumer:
+            return
+
         num_layers = self.model_config.get_num_layers(self.parallel_config)
         kv_caches = [None] * num_layers
         bind_kv_cache(