@@ -38,13 +38,10 @@
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import merge_async_iterators
 
-from numba.np.old_arraymath import numpy_unwrap
 
 logger = init_logger(__name__)
 
 
-_CHUNK_SIZE = 16
-
 class OpenAIServingCompletion(OpenAIServing):
 
     def __init__(
@@ -96,27 +93,25 @@ async def _process_prefix(request: CompletionRequest):
         res = await _process_prefix(request)
         input_str_len = len(request.prompt)
 
+        async def _should_stop(final):
+            return final.choices[0].finish_reason == "stop" or final.choices[0].is_filtered
+
         async def _chunk_generator():
             num_chunks = 0
-            eom = False
+            should_stop = False
 
-            while num_chunks < 4 and not eom:
+            while num_chunks < 4 and not should_stop:
                 num_chunks += 1
                 beams = await self.beam_validator.get_n_valid_beams(create_completion=self.create_completion, request=request, raw_request=raw_request)
                 final = await self.beam_scorer.collapse_beams(beams, num_chunks)
                 request.prompt = final.choices[0].text
-                eom = final.choices[0].finish_reason == "stop"
+                should_stop = await _should_stop(final)
                 final.choices[0].text = final.choices[0].text[input_str_len:]
                 yield f"data: {final.model_dump_json()}\n\n"
 
-                if eom:
+                if should_stop:
                     return
 
-            # Final chunk with trimmed text
-            if final:
-                final.choices[0].text = final.choices[0].text[input_str_len:]
-                yield f"data: {final.model_dump_json()}\n\n"
-
             yield "data: [DONE]\n\n"
 
         return _chunk_generator()
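
For reference, a minimal self-contained sketch of the pattern this change converges on: a stop predicate that checks both the `"stop"` finish reason and the new `is_filtered` flag, driving a bounded chunk loop that emits SSE `data:` frames. `FakeChoice` and `chunk_stream` are hypothetical stand-ins for the real completion-response choices and `_chunk_generator`; this is illustration only, not vLLM's API.

```python
import asyncio
from dataclasses import dataclass
from typing import Optional

# Hypothetical stand-in for a completion response choice; the real loop
# operates on the objects returned by beam_scorer.collapse_beams(...).
@dataclass
class FakeChoice:
    text: str
    finish_reason: Optional[str] = None
    is_filtered: bool = False

async def _should_stop(choice: FakeChoice) -> bool:
    # Stop on a natural end-of-sequence or when the beam was filtered.
    return choice.finish_reason == "stop" or choice.is_filtered

async def chunk_stream(chunks, max_chunks: int = 4):
    # Bounded loop mirroring _chunk_generator: emit one SSE frame per
    # chunk until the stop predicate fires or the budget runs out.
    num_chunks = 0
    should_stop = False
    while num_chunks < max_chunks and not should_stop:
        choice = chunks[num_chunks]
        num_chunks += 1
        should_stop = await _should_stop(choice)
        yield f"data: {choice.text}\n\n"
        if should_stop:
            # As in the diff, an early stop returns before [DONE].
            return
    yield "data: [DONE]\n\n"

async def main():
    chunks = [FakeChoice("one"), FakeChoice("two", finish_reason="stop")]
    async for frame in chunk_stream(chunks):
        print(frame, end="")

asyncio.run(main())
```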