vllm-project · yannicks1 · Jul 7, 2025 · Jul 7, 2025 · Jul 8, 2025 · Jul 8, 2025
@@ -29,7 +29,7 @@
     ids=lambda val: f"TP({val})",
 )
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("max_num_seqs", [4],
+@pytest.mark.parametrize("max_num_seqs", [1, 4],
                          ids=lambda val: f"max_num_seqs({val})")
 def test_output(
     model: str,

@@ -158,8 +158,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         # set env vars for torch_sendnn to consume
         os.environ["VLLM_DT_MAX_CONTEXT_LEN"] = str(
             vllm_config.model_config.max_model_len)
+        # clamp to >= 2: torch symbolic shapes need a decode batch size >= 2
         os.environ["VLLM_DT_MAX_BATCH_SIZE"] = str(
-            vllm_config.scheduler_config.max_num_seqs)
+            max(vllm_config.scheduler_config.max_num_seqs, 2))
 
     @classmethod
     def use_all_gather(cls) -> bool:

@@ -682,11 +682,6 @@ def __init__(
         super().__init__(vllm_config=vllm_config,
                          is_driver_worker=is_driver_worker)
 
-        # TODO: remove this limitation once we update the warm-up logic to
-        # support batch_size=1
-        assert vllm_config.scheduler_config.max_num_seqs >= 2, "Currently, " \
-            "continuous batching needs config to set batch_size >= 2"
-
         self.block_size = SpyrePlatform.get_block_size()
 
         # TODO: move to a KV cache manager

@@ -28,6 +28,7 @@
 import vllm_spyre.perf_metrics as perf_metrics
 from vllm_spyre.model_executor.model_loader import spyre_setup
 from vllm_spyre.platform import SpyrePlatform
+from vllm_spyre.v1.worker.spyre_input_batch import InputBatch
 from vllm_spyre.v1.worker.spyre_model_runner import (
     ContinuousBatchingSpyreModelRunner, StaticBatchingSpyreModelRunner)
 
@@ -321,6 +322,18 @@ def _warmup_spyre_dynamic_size(self, special_token_ids):
         prompt_len = 42
         num_decode_tokens = 2
 
+        # Fix for batch size 1: set input batch to fit 2 requests for warmup
+        if model_runner.vllm_config.scheduler_config.max_num_seqs == 1:
+            model_runner.input_batch = InputBatch(
+                max_num_reqs=2,
+                max_model_len=model_runner.vllm_config.model_config.
+                max_model_len,
+                device=model_runner.device,
+                pin_memory=model_runner.pin_memory,
+                vocab_size=model_runner.vllm_config.model_config.
+                get_vocab_size(),
+            )
+
         # Sample from the valid token ids
         warmup_tokens_tensor = valid_token_ids_tensor[torch.randint(
             0, len(valid_token_ids_tensor), (batch_size + 1, prompt_len))]
@@ -368,6 +381,19 @@ def _warmup_spyre_dynamic_size(self, special_token_ids):
         self.execute_model(scheduler_output)
         self._cleanup_model_runner(request=[add_dummy_request])
 
+        # Fix for batch size 1: reset input batch to fit max_num_seqs requests
+        if model_runner.vllm_config.scheduler_config.max_num_seqs == 1:
+            model_runner.input_batch = InputBatch(
+                max_num_reqs=model_runner.vllm_config.scheduler_config.
+                max_num_seqs,
+                max_model_len=model_runner.vllm_config.model_config.
+                max_model_len,
+                device=model_runner.device,
+                pin_memory=model_runner.pin_memory,
+                vocab_size=model_runner.vllm_config.model_config.
+                get_vocab_size(),
+            )
+
         model_runner.finish_warmup()
 
         warmup_end_t = time.time()