Skip to content

Commit 93b9d9f

Browse files
[Bugfix]: Fix messy code when using logprobs (#19209)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
1 parent af107d5 commit 93b9d9f

File tree

3 files changed

+25
-3
lines changed

3 files changed

+25
-3
lines changed

tests/test_utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,12 @@
1414
import pytest
1515
import torch
1616
import zmq
17+
from transformers import AutoTokenizer
1718
from vllm_test_utils.monitor import monitor
1819

1920
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
21+
from vllm.transformers_utils.detokenizer_utils import (
22+
convert_ids_list_to_tokens)
2023
from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache,
2124
MemorySnapshot, PlaceholderModule, StoreBoolean,
2225
bind_kv_cache, common_broadcastable_dtype,
@@ -918,3 +921,14 @@ def test_split_host_port():
918921
def test_join_host_port():
919922
assert join_host_port("127.0.0.1", 5555) == "127.0.0.1:5555"
920923
assert join_host_port("::1", 5555) == "[::1]:5555"
924+
925+
926+
def test_convert_ids_list_to_tokens():
    """convert_ids_list_to_tokens yields readable text, not raw token strings.

    Byte-level BPE tokenizers represent a leading space with the marker
    character "Ġ" in their raw token strings; decoding each id individually
    should produce the human-readable form instead.
    """
    tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
    ids = tok.encode("Hello, world!")
    # For this tokenizer the encoding is [9707, 11, 1879, 0].
    raw_tokens = tok.convert_ids_to_tokens(ids)
    # Raw token strings keep the byte-level space marker "Ġ"...
    assert raw_tokens == [
        'Hello', ',', 'Ġworld', '!'
    ]
    # ...whereas convert_ids_list_to_tokens decodes to plain text.
    assert convert_ids_list_to_tokens(tok, ids) == ['Hello', ',', ' world', '!']

tests/v1/engine/test_output_processor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def _ref_convert_id_to_token(
3535
Returns:
3636
String representation of input token id
3737
"""
38-
return tokenizer.convert_ids_to_tokens(token_id) or ""
38+
return tokenizer.decode([token_id]) or ""
3939

4040

4141
@pytest.mark.parametrize(

vllm/transformers_utils/detokenizer_utils.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def convert_prompt_ids_to_tokens(
7878
def convert_ids_list_to_tokens(
    tokenizer: AnyTokenizer,
    token_ids: list[int],
    skip_special_tokens: bool = False,
) -> list[str]:
    """Detokenize the input ids individually.

    Each id is decoded on its own (rather than as one sequence) so the
    result maps 1:1 onto ``token_ids``.

    Args:
        tokenizer: tokenizer used for decoding.
        token_ids: ids to turn into strings, one string per id.
        skip_special_tokens: forwarded to ``tokenizer.decode``.

    Returns:
        Python list of token string representations
    """
    decoded = [
        tokenizer.decode([token_id], skip_special_tokens=skip_special_tokens)
        for token_id in token_ids
    ]
    # decode may yield None for some ids; normalize those to "".
    return [text if text is not None else "" for text in decoded]
95103

96104

0 commit comments

Comments
 (0)