
Commit 279815b

[Serve.llm] Add router replicas and batch size to llm config (#52655)
Signed-off-by: Gene Su <e870252314@gmail.com>
1 parent f23510c

6 files changed, +159 -6 lines changed

python/ray/llm/_internal/serve/configs/server_models.py

Lines changed: 14 additions & 0 deletions
@@ -44,6 +44,7 @@
     DEFAULT_MULTIPLEX_DOWNLOAD_TRIES,
     MAX_NUM_STOPPING_SEQUENCES,
     ENABLE_WORKER_PROCESS_SETUP_HOOK,
+    MODEL_RESPONSE_BATCH_TIMEOUT_MS,
 )
 from ray.llm._internal.serve.configs.prompt_formats import (
     Prompt,
@@ -223,6 +224,19 @@ class LLMConfig(BaseModelExtended):
         """,
     )
 
+    experimental_configs: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Experimental configurations for Ray Serve LLM. This is a "
+        "dictionary of key-value pairs. Currently supported keys are:\n"
+        "- `stream_batching_interval_ms`: Ray Serve LLM batches streaming "
+        "requests together. This config decides how long to wait for the "
+        "batch before processing the requests. Defaults to "
+        f"{MODEL_RESPONSE_BATCH_TIMEOUT_MS}.\n"
+        "- `num_router_replicas`: The number of replicas for the router. Ray "
+        "Serve will take the max value across all models. Defaults to 2 "
+        "router replicas per model replica.\n",
+    )
+
     _supports_vision: bool = PrivateAttr(False)
     _model_architecture: str = PrivateAttr("")
     _prompt_format: HuggingFacePromptFormat = PrivateAttr(
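
For reference, here is a minimal sketch of a config that populates the new field. The values are illustrative; the imports and the placeholder model id mirror the test files in this commit.

# Illustrative only: shows the new `experimental_configs` field in use.
from ray.llm._internal.serve.configs.server_models import (
    LLMConfig,
    ModelLoadingConfig,
)

llm_config = LLMConfig(
    model_loading_config=ModelLoadingConfig(
        model_id="llm_model_id",  # placeholder model id, as in the tests
    ),
    experimental_configs={
        # Wait up to 50 ms (example value) before flushing a batch of
        # streaming responses.
        "stream_batching_interval_ms": 50,
        # Request exactly 4 router replicas (example value); the router takes
        # the max of this key across all served models.
        "num_router_replicas": 4,
    },
)

assert llm_config.experimental_configs["num_router_replicas"] == 4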

python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py

Lines changed: 13 additions & 3 deletions
@@ -519,15 +519,25 @@ async def prepare_request(
         vllm_request = VLLMGenerationRequest(**request_params)
         return vllm_request
 
+    def _get_batch_interval_ms(self, stream: bool = True) -> int:
+        """Calculate the batching interval for responses."""
+        stream_batching_interval_ms = self.llm_config.experimental_configs.get(
+            "stream_batching_interval_ms"
+        )
+        if stream_batching_interval_ms is None:
+            stream_batching_interval_ms = MODEL_RESPONSE_BATCH_TIMEOUT_MS
+        return stream_batching_interval_ms if stream else None
+
     async def generate(
         self,
         request: GenerationRequest,
     ) -> AsyncGenerator[LLMRawResponse, None]:
-        batch_interval_ms = MODEL_RESPONSE_BATCH_TIMEOUT_MS if request.stream else None
-
+        # TODO (genesu): Response batching logic should be common to all
+        # engines and belong at the LLMServer level instead of the engine
+        # level here. Refactor the batching logic up.
         response_stream = LLMRawResponsesBatcher(
             self._generate(request),
-            interval_ms=batch_interval_ms,
+            interval_ms=self._get_batch_interval_ms(request.stream),
         )
         async for response in response_stream.stream():
             yield response
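
Read in isolation, `_get_batch_interval_ms` applies one rule: an explicitly configured `stream_batching_interval_ms` (including 0) wins, a missing key falls back to the constant default, and non-streaming requests get no interval at all. A self-contained sketch of that rule follows; the 100 ms default is an assumed placeholder for `MODEL_RESPONSE_BATCH_TIMEOUT_MS`, not the real constant.

from typing import Optional

# Assumed placeholder for MODEL_RESPONSE_BATCH_TIMEOUT_MS.
ASSUMED_DEFAULT_BATCH_TIMEOUT_MS = 100


def resolve_batch_interval_ms(
    experimental_configs: dict, stream: bool
) -> Optional[int]:
    # An explicit value (even 0) takes precedence; only a missing key
    # falls back to the default.
    interval = experimental_configs.get("stream_batching_interval_ms")
    if interval is None:
        interval = ASSUMED_DEFAULT_BATCH_TIMEOUT_MS
    # Non-streaming requests are not batched on an interval, signalled by None.
    return interval if stream else None


assert resolve_batch_interval_ms({}, stream=True) == 100
assert resolve_batch_interval_ms({"stream_batching_interval_ms": 0}, stream=True) == 0
assert resolve_batch_interval_ms({}, stream=False) is None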

python/ray/llm/_internal/serve/deployments/routers/router.py

Lines changed: 13 additions & 3 deletions
@@ -422,6 +422,7 @@ def as_deployment(
         min_replicas = RAYLLM_ROUTER_MIN_REPLICAS
         initial_replicas = RAYLLM_ROUTER_INITIAL_REPLICAS
         max_replicas = RAYLLM_ROUTER_MAX_REPLICAS
+        num_router_replicas = 0
 
         # Note (genesu): Based on our internal benchmark, we are currently bottleneck
         # by the router replicas during high concurrency situation. We are setting the
@@ -431,6 +432,11 @@ def as_deployment(
         model_initial_replicas = 0
         model_max_replicas = 0
         for llm_config in llm_configs:
+            num_router_replicas = max(
+                num_router_replicas,
+                llm_config.experimental_configs.get("num_router_replicas", 0),
+            )
+
             if "autoscaling_config" in llm_config.deployment_config:
                 autoscaling_config = llm_config.deployment_config[
                     "autoscaling_config"
@@ -448,11 +454,15 @@ def as_deployment(
                     or autoscaling_config.min_replicas
                 )
                 model_max_replicas += autoscaling_config.max_replicas
-        min_replicas = int(model_min_replicas * ROUTER_TO_MODEL_REPLICA_RATIO)
-        initial_replicas = int(
+        min_replicas = num_router_replicas or int(
+            model_min_replicas * ROUTER_TO_MODEL_REPLICA_RATIO
+        )
+        initial_replicas = num_router_replicas or int(
             model_initial_replicas * ROUTER_TO_MODEL_REPLICA_RATIO
         )
-        max_replicas = int(model_max_replicas * ROUTER_TO_MODEL_REPLICA_RATIO)
+        max_replicas = num_router_replicas or int(
+            model_max_replicas * ROUTER_TO_MODEL_REPLICA_RATIO
+        )
 
         ingress_cls = serve.ingress(fastapi_router_app)(cls)
         deployment_decorator = serve.deployment(
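
The net effect of the router change: the largest `num_router_replicas` requested across all model configs, when non-zero, overrides the ratio-derived min/initial/max replica counts; otherwise the counts remain the summed model replicas scaled by `ROUTER_TO_MODEL_REPLICA_RATIO`. Below is a simplified, self-contained sketch of that selection; the ratio of 2 is an assumed placeholder, and min/initial/max are collapsed into a single computation for brevity.

# Assumed placeholder for ROUTER_TO_MODEL_REPLICA_RATIO.
ASSUMED_ROUTER_TO_MODEL_REPLICA_RATIO = 2


def pick_router_replicas(experimental_configs_per_model, summed_model_replicas):
    # Take the max of any explicitly requested router replica count.
    requested = max(
        (cfg.get("num_router_replicas", 0) for cfg in experimental_configs_per_model),
        default=0,
    )
    # A non-zero explicit value wins; otherwise fall back to the ratio.
    return requested or int(
        summed_model_replicas * ASSUMED_ROUTER_TO_MODEL_REPLICA_RATIO
    )


# Mirrors the new router test below: requests of 3 and 5 yield 5 replicas.
assert pick_router_replicas([{"num_router_replicas": 3}, {"num_router_replicas": 5}], 1) == 5
# No explicit value: ratio-based count (1 model replica * 2).
assert pick_router_replicas([{}], 1) == 2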

python/ray/llm/tests/serve/cpu/configs/test_models.py

Lines changed: 26 additions & 0 deletions
@@ -248,6 +248,32 @@ def test_engine_config_cached(self):
         new_engine_config = llm_config.get_engine_config()
         assert new_engine_config is old_engine_config
 
+    def test_experimental_configs(self):
+        """Test that `experimental_configs` can be used."""
+        # Test that a valid dictionary can be used.
+        experimental_configs = {
+            "experimental_feature1": "value1",
+            "experimental_feature2": "value2",
+        }
+        llm_config = LLMConfig(
+            model_loading_config=ModelLoadingConfig(
+                model_id="llm_model_id",
+            ),
+            experimental_configs=experimental_configs,
+        )
+        assert llm_config.experimental_configs == experimental_configs
+
+        # Test that an invalid dictionary raises a validation error.
+        with pytest.raises(
+            pydantic.ValidationError,
+        ):
+            LLMConfig(
+                model_loading_config=ModelLoadingConfig(
+                    model_id="llm_model_id",
+                ),
+                experimental_configs={123: "value1"},
+            )
+
 
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", __file__]))

python/ray/llm/tests/serve/cpu/deployments/llm/vllm/test_vllm_engine.py

Lines changed: 37 additions & 0 deletions
@@ -18,6 +18,7 @@
 from ray.llm._internal.serve.configs.server_models import (
     LLMConfig,
     LLMRawResponse,
+    ModelLoadingConfig,
 )
 from ray.llm._internal.serve.configs.constants import MODEL_RESPONSE_BATCH_TIMEOUT_MS
 
@@ -195,6 +196,42 @@ def test_parse_sampling_params_json_mode(
         assert guided_json == sampling_params.response_format.json_schema
         assert getattr(parsed_params, "response_format", None) is None
 
+    def test_get_batch_interval_ms(self):
+        """Test that the batch interval is set correctly in the config."""
+
+        # Test with no stream_batching_interval_ms.
+        llm_config = LLMConfig(
+            model_loading_config=ModelLoadingConfig(
+                model_id="llm_model_id",
+            ),
+        )
+        vllm_engine = VLLMEngine(llm_config)
+        assert vllm_engine._get_batch_interval_ms() == MODEL_RESPONSE_BATCH_TIMEOUT_MS
+
+        # Test with a non-zero stream_batching_interval_ms.
+        llm_config = LLMConfig(
+            model_loading_config=ModelLoadingConfig(
+                model_id="llm_model_id",
+            ),
+            experimental_configs={
+                "stream_batching_interval_ms": 13,
+            },
+        )
+        vllm_engine = VLLMEngine(llm_config)
+        assert vllm_engine._get_batch_interval_ms() == 13
+
+        # Test with zero stream_batching_interval_ms.
+        llm_config = LLMConfig(
+            model_loading_config=ModelLoadingConfig(
+                model_id="llm_model_id",
+            ),
+            experimental_configs={
+                "stream_batching_interval_ms": 0,
+            },
+        )
+        vllm_engine = VLLMEngine(llm_config)
+        assert vllm_engine._get_batch_interval_ms() == 0
+
 
 TEXT_VALUE = "foo"
 FINAL_TEXT_VALUE = "bar"
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+import pytest
+import sys
+
+from ray.llm._internal.serve.configs.server_models import (
+    LLMConfig,
+    ModelLoadingConfig,
+)
+from ray.llm._internal.serve.deployments.routers.router import (
+    LLMRouter,
+)
+
+
+def test_router_with_num_router_replicas_config():
+    """Test the router with num_router_replicas config."""
+    # Test with no num_router_replicas config.
+    llm_configs = [
+        LLMConfig(
+            model_loading_config=ModelLoadingConfig(
+                model_id="llm_model_id",
+            ),
+        )
+    ]
+    llm_router_deployment = LLMRouter.as_deployment(llm_configs=llm_configs)
+    autoscaling_config = llm_router_deployment._deployment_config.autoscaling_config
+    assert autoscaling_config.min_replicas == 2
+    assert autoscaling_config.initial_replicas == 2
+    assert autoscaling_config.max_replicas == 2
+
+    # Test with num_router_replicas config on multiple llm configs.
+    llm_configs = [
+        LLMConfig(
+            model_loading_config=ModelLoadingConfig(
+                model_id="llm_model_id",
+            ),
+            experimental_configs={
+                "num_router_replicas": 3,
+            },
+        ),
+        LLMConfig(
+            model_loading_config=ModelLoadingConfig(
+                model_id="llm_model_id",
+            ),
+            experimental_configs={
+                "num_router_replicas": 5,
+            },
+        ),
+    ]
+    llm_router_deployment = LLMRouter.as_deployment(llm_configs=llm_configs)
+    autoscaling_config = llm_router_deployment._deployment_config.autoscaling_config
+    assert autoscaling_config.min_replicas == 5
+    assert autoscaling_config.initial_replicas == 5
+    assert autoscaling_config.max_replicas == 5
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main(["-v", __file__]))
