@@ -212,7 +212,8 @@ async def chat_completion_stream_generator(
                     sampled_token_ranks=raw_top_logprobs[2],
                 )
                 logprobs_res = self.build_logprobs_response(
-                    logprobs=top_logprobs,
+                    request_logprobs=request.logprobs,
+                    response_logprobs=top_logprobs,
                     request_top_logprobs=request.top_logprobs,
                 )
 
@@ -229,7 +230,9 @@ async def chat_completion_stream_generator(
                 if res["finished"]:
                     num_choices -= 1
                     work_process_metrics.e2e_request_latency.observe(time.time() - res["metrics"]["request_start_time"])
-                    if request.max_tokens is None or previous_num_tokens != request.max_tokens:
+                    has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None
+                    max_tokens = request.max_completion_tokens or request.max_tokens
+                    if has_no_token_limit or previous_num_tokens != max_tokens:
                         choice.finish_reason = "stop"
                         if self.engine_client.reasoning_parser == "ernie_x1" and \
                                 output.get("finish_reason", "") == "tool_calls":
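Note on the hunk above: max_completion_tokens is the OpenAI-preferred replacement for the deprecated max_tokens, and the `or` fallback gives it precedence when both are set. A minimal sketch of the resulting finish-reason logic, assuming the else branch (outside this hunk) reports "length"; resolve_finish_reason is a hypothetical helper, not code from this PR:

from typing import Optional

def resolve_finish_reason(
    previous_num_tokens: int,
    max_tokens: Optional[int],
    max_completion_tokens: Optional[int],
) -> str:
    # Hypothetical distillation of the diff: prefer max_completion_tokens,
    # and never report "length" when no limit was set at all.
    has_no_token_limit = max_tokens is None and max_completion_tokens is None
    effective_limit = max_completion_tokens or max_tokens
    if has_no_token_limit or previous_num_tokens != effective_limit:
        return "stop"
    return "length"

assert resolve_finish_reason(128, None, None) == "stop"  # no limit: always "stop"
assert resolve_finish_reason(64, 256, 64) == "length"    # max_completion_tokens wins

One caveat mirrored from the diff itself: the `or` fallback treats max_completion_tokens=0 as unset.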
@@ -337,7 +340,8 @@ async def chat_completion_full_generator(
                     sampled_token_ranks=raw_top_logprobs[2],
                 )
                 logprobs_res = self.build_logprobs_response(
-                    logprobs=top_logprobs,
+                    request_logprobs=request.logprobs,
+                    response_logprobs=top_logprobs,
                     request_top_logprobs=request.top_logprobs,
                 )
                 if logprobs_res and logprobs_res.content is not None:
@@ -369,7 +373,9 @@ async def chat_completion_full_generator(
             logprobs=logprobs_full_res,
             finish_reason=None
         )
-        if request.max_tokens is None or previous_num_tokens != request.max_tokens:
+        has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None
+        max_tokens = request.max_completion_tokens or request.max_tokens
+        if has_no_token_limit or previous_num_tokens != max_tokens:
             choice.finish_reason = "stop"
             if self.engine_client.reasoning_parser == "ernie_x1" and \
                     output.get("finish_reason", "") == "tool_calls":
@@ -400,7 +406,8 @@ async def chat_completion_full_generator(
 
     def build_logprobs_response(
         self,
-        logprobs: Optional[LogprobsLists],
+        request_logprobs: bool,
+        response_logprobs: Optional[LogprobsLists],
         request_top_logprobs: int,
     ) -> Optional[LogProbs]:
         """
@@ -410,17 +417,23 @@ def build_logprobs_response(
 
         # Parameter validation
         if (
-            logprobs is None
+            response_logprobs is None
+            or not request_logprobs
             or request_top_logprobs is None
-            or request_top_logprobs <= 0
-            or len(logprobs.logprob_token_ids) == 0
+            or request_top_logprobs < 0
         ):
             return None
 
         try:
             # The top-k candidates for the current token
-            topk_token_ids = logprobs.logprob_token_ids[0][:request_top_logprobs + 1]
-            topk_logprobs = logprobs.logprobs[0][:request_top_logprobs + 1]
+            topk_token_ids = []
+            topk_logprobs = []
+
+            if response_logprobs.logprob_token_ids and len(response_logprobs.logprob_token_ids) > 0:
+                topk_token_ids = response_logprobs.logprob_token_ids[0][:request_top_logprobs + 1]
+
+            if response_logprobs.logprobs and len(response_logprobs.logprobs) > 0:
+                topk_logprobs = response_logprobs.logprobs[0][:request_top_logprobs + 1]
 
             # Construct the candidate token structure (LogProbEntry) of topk
             top_logprob_entries: List[LogProbEntry] = []
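Two behavioral changes in the hunk above are easy to miss: the guard now accepts request_top_logprobs == 0 (`< 0` instead of `<= 0`), and an empty LogprobsLists from the engine yields empty top-k lists where the old unconditional `[0]` indexing raised IndexError. A self-contained sketch of the new slicing path, with a namedtuple standing in for the real LogprobsLists:

from collections import namedtuple
from typing import List

# Stand-in for the engine's LogprobsLists; field names follow the diff.
LogprobsLists = namedtuple(
    "LogprobsLists", ["logprob_token_ids", "logprobs", "sampled_token_ranks"])

def extract_topk(response_logprobs, request_top_logprobs: int):
    # Mirror of the defensive slicing in the rewritten try-block.
    topk_token_ids: List[int] = []
    topk_logprobs: List[float] = []
    if response_logprobs.logprob_token_ids and len(response_logprobs.logprob_token_ids) > 0:
        topk_token_ids = response_logprobs.logprob_token_ids[0][:request_top_logprobs + 1]
    if response_logprobs.logprobs and len(response_logprobs.logprobs) > 0:
        topk_logprobs = response_logprobs.logprobs[0][:request_top_logprobs + 1]
    return topk_token_ids, topk_logprobs

# An empty response now yields empty lists instead of an IndexError.
assert extract_topk(LogprobsLists([], [], []), 5) == ([], [])
# top_logprobs=0 keeps only the sampled token (the slice is length 1).
assert extract_topk(LogprobsLists([[7, 3]], [[-0.1, -2.3]], [0]), 0) == ([7], [-0.1])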