
Commit c1633da

added stream_n to v1/async_llm, created streaming_params (#1)

* added stream_n to v1/async_llm, created streaming_params
* Updated OpenAI compatible API to work with StreamingParams

Signed-off-by: Rohin Garg <rohin@character.ai>
1 parent 2a05d6e commit c1633da

File tree: 11 files changed, +171 −9 lines

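The diff excerpt below covers only some of the 11 changed files; the new vllm/streaming_params.py module itself is not shown. Based on how it is used in the other files (StreamingParams(), StreamingParams(stream_n=3), StreamingParams(stream_n=None), and reads of streaming_params.stream_n), a minimal sketch of what it might contain follows. The default value of 1 and the normalization of None to 1 are assumptions, not confirmed by this commit page.

# Hypothetical sketch of vllm/streaming_params.py (not shown in this excerpt).
# Assumes stream_n defaults to 1 and that None is normalized to 1 so that
# comparisons like `buffer_token_count >= streaming_params.stream_n` are safe.
from dataclasses import dataclass
from typing import Optional


@dataclass
class StreamingParams:
    """Parameters controlling how many tokens are buffered per streamed chunk."""
    stream_n: Optional[int] = 1

    def __post_init__(self) -> None:
        if self.stream_n is None:
            self.stream_n = 1
        if self.stream_n < 1:
            raise ValueError("stream_n must be >= 1.")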

requirements/test.txt

Lines changed: 20 additions & 1 deletion
@@ -27,6 +27,10 @@ argcomplete==3.5.1
     # via datamodel-code-generator
 arrow==1.3.0
     # via isoduration
+async-timeout==5.0.1
+    # via
+    #   aiohttp
+    #   redis
 attrs==24.2.0
     # via
     #   aiohttp
@@ -126,6 +130,11 @@ encodec==0.1.1
     # via vocos
 evaluate==0.4.3
     # via lm-eval
+exceptiongroup==1.3.0
+    # via
+    #   anyio
+    #   hypothesis
+    #   pytest
 fastparquet==2024.11.0
     # via genai-perf
 fastrlock==0.8.2
@@ -683,8 +692,13 @@ tokenizers==0.21.1
     # via
     #   -r requirements/test.in
     #   transformers
+toml==0.10.2
+    # via datamodel-code-generator
 tomli==2.2.1
-    # via schemathesis
+    # via
+    #   black
+    #   pytest
+    #   schemathesis
 tomli-w==1.2.0
     # via schemathesis
 torch==2.7.0+cu128
@@ -756,12 +770,17 @@ types-python-dateutil==2.9.0.20241206
     # via arrow
 typing-extensions==4.12.2
     # via
+    #   anyio
+    #   black
+    #   exceptiongroup
     #   huggingface-hub
     #   librosa
     #   mistral-common
+    #   multidict
     #   pqdm
     #   pydantic
     #   pydantic-core
+    #   rich
     #   torch
     #   typer
 tzdata==2024.2

tests/v1/engine/test_async_llm.py

Lines changed: 11 additions & 2 deletions
@@ -14,6 +14,7 @@
 from vllm.inputs import PromptType
 from vllm.platforms import current_platform
 from vllm.sampling_params import RequestOutputKind
+from vllm.streaming_params import StreamingParams
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.metrics.loggers import LoggingStatLogger

@@ -62,9 +63,13 @@ async def generate(engine: AsyncLLM,
                                      seed=33,
                                      n=n,
                                      prompt_logprobs=prompt_logprobs)
+
+    streaming_params = StreamingParams(stream_n=3)
+
     async for out in engine.generate(request_id=request_id,
                                      prompt=prompt,
-                                     sampling_params=sampling_params):
+                                     sampling_params=sampling_params,
+                                     streaming_params=streaming_params):

         num_tokens = sum(len(output.token_ids) for output in out.outputs)
         if output_kind == RequestOutputKind.DELTA:
@@ -209,11 +214,15 @@ async def test_finished_flag(monkeypatch: pytest.MonkeyPatch, n: int,
                                      temperature=1.0,
                                      seed=33,
                                      n=n)
+
+    streaming_params = StreamingParams(stream_n=3)
+
     outputs = [
         out
         async for out in engine.generate(request_id="request-33",
                                          prompt=prompt,
-                                         sampling_params=sampling_params)
+                                         sampling_params=sampling_params,
+                                         streaming_params=streaming_params)
     ]

     # Assert only the last output has the finished flag set

vllm/engine/async_llm_engine.py

Lines changed: 17 additions & 1 deletion
@@ -33,6 +33,7 @@
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import ExecuteModelRequest
+from vllm.streaming_params import StreamingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Device, deprecate_kwargs, weak_bind
@@ -972,6 +973,7 @@ async def generate(
         self,
         prompt: PromptType,
         sampling_params: SamplingParams,
+        streaming_params: StreamingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
@@ -1045,6 +1047,8 @@ async def generate(
         >>> ...
         """
         try:
+            buffer: Optional[RequestOutput] = None  # buffer of output tokens
+            buffer_token_count = 0
             async for output in await self.add_request(
                 request_id,
                 prompt,
@@ -1054,7 +1058,19 @@ async def generate(
                 prompt_adapter_request=prompt_adapter_request,
                 priority=priority,
             ):
-                yield LLMEngine.validate_output(output, RequestOutput)
+                output = LLMEngine.validate_output(output, RequestOutput)
+                if buffer is None:
+                    buffer = output
+                else:
+                    buffer.add(output, aggregate=True)
+
+                buffer_token_count += sum(
+                    len(o.token_ids) for o in output.outputs)
+                if buffer_token_count >= streaming_params.stream_n:
+                    yield buffer
+                    buffer = None
+                    buffer_token_count = 0
+
         except asyncio.CancelledError:
             await self.abort(request_id)
             raise
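With the buffering above, callers of the updated AsyncLLMEngine.generate receive an aggregated RequestOutput only once at least stream_n new tokens have accumulated. A rough usage sketch follows; the engine arguments, model name, prompt, and request id are illustrative placeholders, not part of this commit.

# Rough usage sketch of the modified generate() signature; model name and
# engine setup are placeholders chosen for illustration only.
import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.streaming_params import StreamingParams


async def main() -> None:
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))
    sampling_params = SamplingParams(max_tokens=64)
    # Emit a chunk only after at least 4 new tokens have been buffered.
    streaming_params = StreamingParams(stream_n=4)

    async for output in engine.generate(prompt="Hello, my name is",
                                        sampling_params=sampling_params,
                                        streaming_params=streaming_params,
                                        request_id="demo-0"):
        print(len(output.outputs[0].token_ids), output.outputs[0].text)


asyncio.run(main())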

vllm/engine/multiprocessing/client.py

Lines changed: 28 additions & 3 deletions
@@ -46,6 +46,7 @@
 from vllm.outputs import PoolingRequestOutput, RequestOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
+from vllm.streaming_params import StreamingParams
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.utils import Device, deprecate_kwargs

@@ -445,6 +446,7 @@ def generate(
         self,
         prompt: PromptType,
         sampling_params: SamplingParams,
+        streaming_params: StreamingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
@@ -460,6 +462,7 @@ def generate(
         *,
         inputs: PromptType,
         sampling_params: SamplingParams,
+        streaming_params: StreamingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
@@ -476,6 +479,7 @@ def generate(
         self,
         prompt: Optional[PromptType] = None,
         sampling_params: Optional[SamplingParams] = None,
+        streaming_params: Optional[StreamingParams] = None,
         request_id: Optional[str] = None,
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
@@ -509,8 +513,9 @@ def generate(
                 and request_id is not None)

         return self._process_request(prompt, sampling_params, request_id,
-                                     lora_request, trace_headers,
-                                     prompt_adapter_request, priority)
+                                     streaming_params, lora_request,
+                                     trace_headers, prompt_adapter_request,
+                                     priority)

     @overload
     def encode(
@@ -590,6 +595,7 @@ async def _process_request(
         prompt: PromptType,
         params: Union[SamplingParams, PoolingParams],
         request_id: str,
+        streaming_params: Optional[StreamingParams] = None,
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
@@ -660,14 +666,33 @@ async def _process_request(
         # queue after pulling them from the zmq socket.
         finished = False
         try:
+            buffer = None  # buffer of output tokens
+            buffer_token_count = 0
             while not finished:
                 request_output = await queue.get()

                 if isinstance(request_output, BaseException):
                     raise request_output

                 finished = request_output.finished
-                yield request_output
+                if buffer is None:
+                    buffer = request_output
+                else:
+                    buffer.add(request_output, aggregate=True)
+
+                if isinstance(request_output, RequestOutput):
+                    buffer_token_count += sum(
+                        len(o.token_ids) for o in request_output.outputs)
+                else:
+                    buffer_token_count += 1
+                if streaming_params is None or \
+                        buffer_token_count >= streaming_params.stream_n or \
+                        finished:
+
+                    yield buffer
+                    buffer = None
+                    buffer_token_count = 0
+
         finally:
             # Request was canceled by the client.
             if not finished and not self.errored:
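The client-side buffering above flushes on three conditions: no streaming_params was supplied, the buffered token count reached stream_n, or the request finished. The standalone sketch below replays that flush decision with plain integers standing in for per-step token counts; flush_points is a hypothetical helper written for illustration of the control flow only, not code from this commit.

# Illustration of the flush conditions used in the client buffering above;
# per-step token counts are plain ints here, not real RequestOutputs.
from typing import Iterator, List, Optional


def flush_points(step_token_counts: List[int],
                 stream_n: Optional[int]) -> Iterator[List[int]]:
    """Yield the groups of per-step token counts that would be emitted together."""
    buffer: List[int] = []
    buffer_token_count = 0
    for i, count in enumerate(step_token_counts):
        finished = (i == len(step_token_counts) - 1)
        buffer.append(count)
        buffer_token_count += count
        if stream_n is None or buffer_token_count >= stream_n or finished:
            yield buffer
            buffer = []
            buffer_token_count = 0


# With stream_n=3 and one token per engine step, chunks of 3 tokens are emitted,
# plus a final (possibly smaller) chunk when the request finishes.
print(list(flush_points([1] * 7, stream_n=3)))     # [[1, 1, 1], [1, 1, 1], [1]]
print(list(flush_points([1] * 7, stream_n=None)))  # every step flushed immediately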

vllm/engine/protocol.py

Lines changed: 3 additions & 1 deletion
@@ -17,6 +17,7 @@
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import BeamSearchParams, SamplingParams
+from vllm.streaming_params import StreamingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import Device, collect_from_async_generator, random_uuid

@@ -51,6 +52,7 @@ def generate(
         self,
         prompt: PromptType,
         sampling_params: SamplingParams,
+        streaming_params: StreamingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
@@ -126,7 +128,7 @@ async def beam_search(
             task = asyncio.create_task(
                 collect_from_async_generator(
                     self.generate(individual_prompt, beam_search_params,
-                                  request_id_item)))
+                                  StreamingParams(), request_id_item)))
             tasks.append(task)

         output = await asyncio.gather(*tasks)

vllm/entrypoints/openai/protocol.py

Lines changed: 21 additions & 0 deletions
@@ -20,6 +20,7 @@
 from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
                                   RequestOutputKind, SamplingParams)
 from vllm.sequence import Logprob
+from vllm.streaming_params import StreamingParams
 from vllm.utils import random_uuid, resolve_obj_by_qualname

 logger = init_logger(__name__)
@@ -151,6 +152,7 @@ class ResponseFormat(OpenAIBaseModel):
 class StreamOptions(OpenAIBaseModel):
     include_usage: Optional[bool] = True
     continuous_usage_stats: Optional[bool] = False
+    stream_n: Optional[int] = 1


 class FunctionDefinition(OpenAIBaseModel):
@@ -540,6 +542,13 @@ def to_sampling_params(
             guided_decoding=guided_decoding,
             logit_bias=self.logit_bias)

+    def to_streaming_params(self, ) -> StreamingParams:
+        stream_n = None
+        if self.stream_options is not None and \
+                self.stream_options.stream_n is not None:
+            stream_n = self.stream_options.stream_n
+        return StreamingParams(stream_n=stream_n)
+
     def _get_guided_json_from_tool(
             self) -> Optional[Union[str, dict, BaseModel]]:
         # user has chosen to not use any tool
@@ -973,6 +982,13 @@ def to_sampling_params(
             logit_bias=self.logit_bias,
             allowed_token_ids=self.allowed_token_ids)

+    def to_streaming_params(self, ) -> StreamingParams:
+        stream_n = None
+        if self.stream_options is not None and \
+                self.stream_options.stream_n is not None:
+            stream_n = self.stream_options.stream_n
+        return StreamingParams(stream_n=stream_n)
+
     @model_validator(mode="before")
     @classmethod
     def check_guided_decoding_count(cls, data):
@@ -1725,6 +1741,11 @@ def to_sampling_params(
             if self.stream \
             else RequestOutputKind.FINAL_ONLY)

+    def to_streaming_params(
+        self,
+    ) -> StreamingParams:  # stream_options not defined in transcription request
+        return StreamingParams(stream_n=None)
+
     @model_validator(mode="before")
     @classmethod
     def validate_stream_options(cls, data):
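With stream_n exposed on StreamOptions, an OpenAI-compatible client can ask the server to batch streamed chunks. A hedged example against a locally running vLLM server follows; the base URL, model name, and payload values are placeholders, not taken from this commit.

# Hypothetical request to a locally running vLLM OpenAI-compatible server;
# the base URL and model name are placeholders.
import requests

response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "facebook/opt-125m",
        "messages": [{"role": "user", "content": "Write a haiku about rain."}],
        "stream": True,
        # New in this commit: ask the server to emit a streamed chunk only
        # after roughly every 4 generated tokens instead of after every token.
        "stream_options": {"include_usage": False, "stream_n": 4},
    },
    stream=True,
)

for line in response.iter_lines():
    if line:
        print(line.decode("utf-8"))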

vllm/entrypoints/openai/serving_chat.py

Lines changed: 3 additions & 0 deletions
@@ -221,6 +221,8 @@ async def create_chat_completion(
                     self.model_config.logits_processor_pattern,
                     self.default_sampling_params)

+                streaming_params = request.to_streaming_params()
+
                 self._log_inputs(request_id,
                                  request_prompts[i],
                                  params=sampling_params,
@@ -240,6 +242,7 @@ async def create_chat_completion(
                 generator = self.engine_client.generate(
                     engine_prompt,
                     sampling_params,
+                    streaming_params,
                     request_id,
                     lora_request=lora_request,
                     trace_headers=trace_headers,

vllm/entrypoints/openai/serving_completion.py

Lines changed: 2 additions & 0 deletions
@@ -142,6 +142,7 @@ async def create_completion(
                     self.default_sampling_params)

                 request_id_item = f"{request_id}-{i}"
+                streaming_params = request.to_streaming_params()

                 self._log_inputs(request_id_item,
                                  request_prompts[i],
@@ -162,6 +163,7 @@ async def create_completion(
                 generator = self.engine_client.generate(
                     engine_prompt,
                     sampling_params,
+                    streaming_params,
                     request_id_item,
                     lora_request=lora_request,
                     prompt_adapter_request=prompt_adapter_request,

vllm/entrypoints/openai/serving_transcription.py

Lines changed: 2 additions & 0 deletions
@@ -282,6 +282,7 @@ async def create_transcription(
             default_max_tokens = self.model_config.max_model_len
             sampling_params = request.to_sampling_params(
                 default_max_tokens, self.default_sampling_params)
+            streaming_params = request.to_streaming_params()

             self._log_inputs(
                 request_id,
@@ -293,6 +294,7 @@ async def create_transcription(
             result_generator = self.engine_client.generate(
                 prompt,
                 sampling_params,
+                streaming_params,
                 request_id,
             )
         except ValueError as e:
