@@ -44,13 +44,10 @@
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import merge_async_iterators
 
-from numba.np.old_arraymath import numpy_unwrap
 
 logger = init_logger(__name__)
 
 
-_CHUNK_SIZE = 16
-
 class OpenAIServingCompletion(OpenAIServing):
 
     def __init__(
@@ -104,27 +101,25 @@ async def _process_prefix(request: CompletionRequest):
         res = await _process_prefix(request)
         input_str_len = len(request.prompt)
 
+        async def _should_stop(final):
+            return final.choices[0].finish_reason == "stop" or final.choices[0].is_filtered
+
         async def _chunk_generator():
             num_chunks = 0
-            eom = False
+            should_stop = False
 
-            while num_chunks < 4 and not eom:
+            while num_chunks < 4 and not should_stop:
                 num_chunks += 1
                 beams = await self.beam_validator.get_n_valid_beams(create_completion=self.create_completion, request=request, raw_request=raw_request)
                 final = await self.beam_scorer.collapse_beams(beams, num_chunks)
                 request.prompt = final.choices[0].text
-                eom = final.choices[0].finish_reason == "stop"
+                should_stop = await _should_stop(final)
                 final.choices[0].text = final.choices[0].text[input_str_len:]
                 yield f"data: {final.model_dump_json()}\n\n"
 
-            if eom:
+            if should_stop:
                 return
 
-            # Final chunk with trimmed text
-            if final:
-                final.choices[0].text = final.choices[0].text[input_str_len:]
-                yield f"data: {final.model_dump_json()}\n\n"
-
             yield "data: [DONE]\n\n"
 
         return _chunk_generator()
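
For reference, a minimal, self-contained sketch of the loop this commit refactors: each chunk's winning beam text is fed back in as the next prompt, and the new `_should_stop` predicate folds the content-filter signal (`is_filtered`) in with the existing `finish_reason == "stop"` check. The `Choice`/`Completion` dataclasses and the `expand`/`collapse` callables are hypothetical stand-ins for vLLM's response objects and the `beam_validator.get_n_valid_beams` / `beam_scorer.collapse_beams` calls in the diff; the SSE framing (`data: ...`) is omitted.

```python
import asyncio
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class Choice:
    # Stand-in carrying only the fields the diff reads.
    text: str
    finish_reason: Optional[str] = None
    is_filtered: bool = False


@dataclass
class Completion:
    choices: list = field(default_factory=list)


async def _should_stop(final) -> bool:
    # Same predicate the commit introduces: stop on a natural finish,
    # or when the collapsed beam was content-filtered.
    return final.choices[0].finish_reason == "stop" or final.choices[0].is_filtered


async def chunk_generator(prompt, expand, collapse, max_chunks=4):
    # `expand` stands in for beam_validator.get_n_valid_beams and
    # `collapse` for beam_scorer.collapse_beams (both hypothetical here).
    input_str_len = len(prompt)  # measured once, against the original prompt
    num_chunks = 0
    should_stop = False
    while num_chunks < max_chunks and not should_stop:
        num_chunks += 1
        beams = await expand(prompt)
        final = await collapse(beams, num_chunks)
        prompt = final.choices[0].text                # feed the winner back in
        should_stop = await _should_stop(final)
        yield final.choices[0].text[input_str_len:]   # emit only generated text


async def main():
    async def expand(prompt):         # toy expansion: one beam, one new token
        return [prompt + " tok"]

    async def collapse(beams, step):  # toy collapse: first beam wins,
        return Completion([Choice(    # natural "stop" on the third chunk
            text=beams[0],
            finish_reason="stop" if step == 3 else None)])

    async for delta in chunk_generator("Hello", expand, collapse):
        print(repr(delta))


asyncio.run(main())
```

Note that `input_str_len` is measured once against the original prompt, so each yielded chunk is the full text generated so far, matching the trimming in the diff.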