
Commit 99c5593

Merge branch 'main' of github.com:character-tech/vllm into cached_tokens_completions

Signed-off-by: Amog Kamsetty <amogkamsetty@gmail.com>
2 parents: 0efe79e + 6c7451c

3 files changed, 62 insertions(+), 1 deletion(-)

vllm/entrypoints/openai/protocol.py

Lines changed: 48 additions & 0 deletions
@@ -875,6 +875,13 @@ class CompletionRequest(OpenAIBaseModel):
         description="KVTransfer parameters used for disaggregated serving.")

     # --8<-- [end:completion-extra-params]
+    accumulate: Optional[bool] = Field(
+        default=None,
+        description=(
+            "Special kind of echo where the response carries the "
+            "accumulated text so far instead of the delta."),
+    )
+    # doc: end-completion-extra-params

     # Default sampling parameters for completion requests
     _DEFAULT_SAMPLING_PARAMS: dict = {
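Since `accumulate` is an extension to the standard completions payload rather than an OpenAI field, a client would pass it through the `extra_body` hook of the official `openai` Python client. A minimal sketch, assuming a vLLM OpenAI-compatible server at localhost:8000 and a placeholder model name:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

stream = client.completions.create(
    model="my-model",              # placeholder model name
    prompt="Hello, my name is",
    max_tokens=16,
    stream=True,
    # vLLM-specific extension params travel in extra_body
    extra_body={"accumulate": True},
)

for chunk in stream:
    # with accumulate=True each chunk carries the full text so far,
    # so the last chunk already holds the complete completion
    print(chunk.choices[0].text)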
@@ -1323,6 +1330,47 @@ class PoolingResponse(OpenAIBaseModel):
     usage: UsageInfo


+class ClassificationRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    input: Union[list[str], str]
+    truncate_prompt_tokens: Optional[int] = None
+    user: Optional[str] = None
+
+    # --8<-- [start:classification-pooling-params]
+    additional_data: Optional[Any] = None
+    # --8<-- [end:classification-pooling-params]
+
+    # --8<-- [start:classification-extra-params]
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+
+    # --8<-- [end:classification-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(additional_data=self.additional_data)
+
+
+class ClassificationData(OpenAIBaseModel):
+    index: int
+    label: Optional[str]
+    probs: list[float]
+    num_classes: int
+
+
+class ClassificationResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"classify-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[ClassificationData]
+    usage: UsageInfo
+
+
 class ScoreResponseData(OpenAIBaseModel):
     index: int
     object: str = "score"
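For orientation, a small sketch (not part of the commit) of how the new classification models compose. The model name, labels, and token counts are illustrative, and `UsageInfo` is assumed to accept the standard `prompt_tokens`/`total_tokens` fields:

request = ClassificationRequest(
    model="my-classifier",                   # placeholder name
    input=["great movie", "terrible movie"],
)
pooling_params = request.to_pooling_params()

response = ClassificationResponse(
    model="my-classifier",
    data=[
        ClassificationData(index=0, label="positive",
                           probs=[0.9, 0.1], num_classes=2),
        ClassificationData(index=1, label="negative",
                           probs=[0.2, 0.8], num_classes=2),
    ],
    usage=UsageInfo(prompt_tokens=8, total_tokens=8),
)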

vllm/entrypoints/openai/serving_completion.py

Lines changed: 13 additions & 0 deletions
@@ -301,6 +301,9 @@ async def completion_stream_generator(
         has_echoed = [False] * num_choices * num_prompts
         num_prompt_tokens = [0] * num_prompts
         num_cached_tokens = [0] * num_prompts
+        accumulated_text = [""] * num_choices * num_prompts
+        accumulated_tokens = [[] for _ in range(num_choices * num_prompts)]
+        accumulated_logprobs = [[] for _ in range(num_choices * num_prompts)]

         stream_options = request.stream_options
         if stream_options:
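One subtlety in these initializers: `[""] * n` is fine because strings are immutable, but spelling the token buffers as `[[] * num_choices * num_prompts]` would be a bug, since multiplying an empty list still yields an empty list and the outer literal ends up with a single slot. The per-slot comprehension used above avoids that. A quick self-contained check:

num_choices, num_prompts = 2, 3

buggy = [[] * num_choices * num_prompts]  # [] * 6 is still [], so this is [[]]
fixed = [[] for _ in range(num_choices * num_prompts)]

assert buggy == [[]]    # one element only: buggy[1] raises IndexError
assert len(fixed) == 6  # one independent list per (prompt, choice) slot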
@@ -352,6 +355,16 @@ async def completion_stream_generator(
                         *(output.logprobs or []),
                     ]
                     has_echoed[i] = True
+                elif request.accumulate:
+                    i = output.index + prompt_idx * num_choices
+                    # return the accumulated response
+                    accumulated_text[i] += output.text
+                    accumulated_tokens[i].extend(output.token_ids)
+                    accumulated_logprobs[i].extend(output.logprobs or [])
+
+                    delta_text = accumulated_text[i]
+                    delta_token_ids = accumulated_tokens[i]
+                    out_logprobs = accumulated_logprobs[i]
                 else:
                     # return just the delta
                     delta_text = output.text
vllm/model_executor/models/transformers.py

Lines changed: 1 addition & 1 deletion
@@ -306,7 +306,7 @@ def create_attention_instances(self) -> dict[int, Attention]:
                 self.config.global_attention_layers, list):
             global_attention_layers = self.config.global_attention_layers
         else:
-            global_attention_layers = None
+            global_attention_layers = []

         for i in range(start, end):
             sliding_window = None
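The switch from None to an empty list reads like a sentinel fix: presumably the per-layer loop tests membership against `global_attention_layers`, and an empty list makes that test uniformly False instead of raising (the membership check is my assumption; it is not shown in the hunk):

# Membership against [] is always False; against None it raises.
global_attention_layers = []
assert (3 in global_attention_layers) is False

try:
    3 in None  # type: ignore[operator]
except TypeError as exc:
    print(exc)  # argument of type 'NoneType' is not iterable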
