Commit 9d53c55

Author: Your Name (committed)
Commit message: fixes
1 parent a5d9dd2 · commit 9d53c55

File tree: 2 files changed (+32 / -23 lines)

vllm/beam/beam.py

Lines changed: 28 additions & 21 deletions
@@ -1,37 +1,44 @@
 from collections.abc import AsyncGenerator
+from typing import Union
+
 from vllm.beam.debug import BeamDebugInfo
 from vllm.beam.penalty import PenaltyComputer
 import torch
 from vllm.beam.ranking import RankingComputer
+from vllm.entrypoints.openai.protocol import CompletionResponse, ErrorResponse
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)


 class BeamScorer:
     def __init__(self, classi_idx):
         self.penalty_computer = PenaltyComputer(classi_idx)
         self.ranking_computer = RankingComputer(classi_idx)

-    async def collapse_beams(self, responses: list[AsyncGenerator], chunk_num = 0, max_chunks = 4):
-        debug_info = [BeamDebugInfo() for _ in responses]
-
-        scores = torch.zeros(len(responses), dtype=torch.float)
-
-        heads = [response.choices[0].additional_heads[0] for response in responses]
-        heads_tensor = torch.tensor(heads, dtype=torch.float)
-        if len(heads_tensor) > 0:
-            penalties = self.penalty_computer.compute(heads_tensor, debug_info)
-            scores -= penalties
-
-        ranking_scores = self.ranking_computer.compute(
+    async def pick_best_beam(self, responses: list[
+        Union[AsyncGenerator[str, None], CompletionResponse, ErrorResponse]]) -> Union[
+        AsyncGenerator[str, None], CompletionResponse, ErrorResponse]:
+        debug_info = [BeamDebugInfo() for _ in responses]
+
+        scores = torch.zeros(len(responses), dtype=torch.float)
+
+        heads = [response.choices[0].additional_heads[0] for response in responses]
+        heads_tensor = torch.tensor(heads, dtype=torch.float)
+        if len(heads_tensor) > 0:
+            penalties = self.penalty_computer.compute(heads_tensor, debug_info)
+            scores -= penalties
+
+        ranking_scores = self.ranking_computer.compute(
             heads_tensor, debug_info
-        )
-        scores *= ranking_scores
+        )
+        scores += ranking_scores

-        for i in range(len(responses)):
-            debug_info[i].final_score = scores[i]
-            debug_info[i].content = responses[i].choices[0].text
+        for i in range(len(responses)):
+            debug_info[i].final_score = scores[i]
+            debug_info[i].content = responses[i].choices[0].text

-        print('debug_info', debug_info)
+        logger.debug('debug_info: %s', debug_info)

-        best_idx = torch.argmax(scores).item()
-        return responses[best_idx]
-
+        best_idx = torch.argmax(scores).item()
+        return responses[best_idx]
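Aside from renaming collapse_beams to pick_best_beam (with explicit Union typing over streaming and non-streaming responses) and swapping the stray print for logger.debug, the substantive change in this file is the score combination: ranking scores are now added to the penalty-adjusted scores rather than multiplied in, and the beam with the highest combined score is returned. A minimal standalone sketch of that arithmetic, using made-up tensors in place of the real PenaltyComputer/RankingComputer outputs:

import torch

# Hypothetical per-beam values standing in for compute() results on heads_tensor.
penalties = torch.tensor([0.2, 1.5, 0.1])       # stand-in for PenaltyComputer.compute(...)
ranking_scores = torch.tensor([0.9, 2.0, 0.7])  # stand-in for RankingComputer.compute(...)

scores = torch.zeros(3, dtype=torch.float)
scores -= penalties       # heavily penalized beams lose score
scores += ranking_scores  # additive combination (this commit; previously scores *= ranking_scores)

best_idx = torch.argmax(scores).item()
print(best_idx)  # 0, since -0.2 + 0.9 = 0.7 beats 0.5 and 0.6

With the old multiplicative form, scores held only the negated penalties at that point, so a larger ranking score made a penalized beam look even worse; the additive form keeps the two signals independent.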

vllm/entrypoints/openai/serving_completion.py

Lines changed: 4 additions & 2 deletions
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import asyncio
+import math
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
@@ -13,7 +14,7 @@
 from typing_extensions import assert_never

 from vllm.beam.beam import BeamScorer
-from vllm.beam.filtering import BeamValidator
+from vllm.beam.filtering import _CHUNK_SIZE, BeamValidator
 from vllm.beam.metrics import report_metrics
 from vllm.beam.penalty import MEOW_CLASSI_IDX, PenaltyComputer
 from vllm.config import ModelConfig
@@ -105,13 +106,14 @@ async def _process_prefix(request: CompletionRequest):
         async def _should_stop(final):
             return final.choices[0].finish_reason == "stop" or final.choices[0].is_filtered

+        max_chunks = math.ceil(request.max_tokens / _CHUNK_SIZE)
         async def _chunk_generator():
             num_chunks = 0
             should_stop = False
             output = None

             # TODO(@tanuj): calc created tokens
-            while num_chunks < 4 and not should_stop:
+            while num_chunks < max_chunks and not should_stop:
                 num_chunks += 1
                 beams = await self.beam_validator.get_n_valid_beams(create_completion=self.create_completion, request=request, raw_request=raw_request)
                 final = await self.beam_scorer.collapse_beams(beams, num_chunks)
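On the serving side, the hard-coded four-chunk cap is replaced by a per-request limit, max_chunks = ceil(request.max_tokens / _CHUNK_SIZE), so the chunk loop runs just long enough to cover the requested token budget. A worked example, assuming a hypothetical _CHUNK_SIZE of 16 (the real constant is imported from vllm.beam.filtering and is not shown in this diff):

import math

_CHUNK_SIZE = 16   # assumed value, for illustration only
max_tokens = 100   # stand-in for request.max_tokens

max_chunks = math.ceil(max_tokens / _CHUNK_SIZE)
print(max_chunks)  # ceil(6.25) = 7, so the while loop runs at most 7 iterations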
