Skip to content

Commit 4234262

Browse files
authored
[Tokenizer] Prioritize HuggingFace/SentencePiece over ByteLevelBPE (#2559)
This PR updates the tokenizer load logic so that we prioritize the use of the HuggingFace and SentencePiece tokenizers over the ByteLevelBPE tokenizer. This fixes the issue where the token `<im_start>` in the Qwen model was tokenized into multiple tokens because the ByteLevelBPE tokenizer was chosen when available.
1 parent 931587b commit 4234262

File tree

1 file changed

+18
-14
lines changed

1 file changed

+18
-14
lines changed

cpp/tokenizers/tokenizers.cc

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -103,30 +103,18 @@ Tokenizer Tokenizer::FromPath(const String& _path, std::optional<TokenizerInfo>
103103
sentencepiece = path / "tokenizer.model";
104104
huggingface = path / "tokenizer.json";
105105
rwkvworld = path / "tokenizer_model";
106-
// Check ByteLevelBPE
107-
{
108-
std::filesystem::path merges_path = path / "merges.txt";
109-
std::filesystem::path vocab_path = path / "vocab.json";
110-
std::filesystem::path added_tokens_path = path / "added_tokens.json";
111-
if (std::filesystem::exists(merges_path) && std::filesystem::exists(vocab_path) &&
112-
std::filesystem::exists(added_tokens_path)) {
113-
std::string vocab = LoadBytesFromFile(vocab_path.string());
114-
std::string merges = LoadBytesFromFile(merges_path.string());
115-
std::string added_tokens = LoadBytesFromFile(added_tokens_path.string());
116-
return Tokenizer(tokenizers::Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens),
117-
info_value);
118-
}
119-
}
120106
} else {
121107
sentencepiece = path.parent_path() / "tokenizer.model";
122108
huggingface = path.parent_path() / "tokenizer.json";
123109
rwkvworld = path.parent_path() / "tokenizer_model";
124110
}
125111
if (std::filesystem::exists(huggingface)) {
112+
// Check HuggingFace
126113
return Tokenizer(tokenizers::Tokenizer::FromBlobJSON(LoadBytesFromFile(huggingface.string())),
127114
info_value);
128115
}
129116
if (std::filesystem::exists(sentencepiece)) {
117+
// Check SentencePiece
130118
LOG(WARNING)
131119
<< "Using `tokenizer.model` since we cannot locate `tokenizer.json`.\n"
132120
<< "It is recommended to use `tokenizer.json` to ensure all token mappings are included, "
@@ -137,7 +125,23 @@ Tokenizer Tokenizer::FromPath(const String& _path, std::optional<TokenizerInfo>
137125
tokenizers::Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(sentencepiece.string())),
138126
info_value);
139127
}
128+
{
129+
// Check ByteLevelBPE
130+
std::filesystem::path merges_path = path / "merges.txt";
131+
std::filesystem::path vocab_path = path / "vocab.json";
132+
std::filesystem::path added_tokens_path = path / "added_tokens.json";
133+
if (std::filesystem::exists(merges_path) && std::filesystem::exists(vocab_path) &&
134+
std::filesystem::exists(added_tokens_path)) {
135+
LOG(INFO) << "come here";
136+
std::string vocab = LoadBytesFromFile(vocab_path.string());
137+
std::string merges = LoadBytesFromFile(merges_path.string());
138+
std::string added_tokens = LoadBytesFromFile(added_tokens_path.string());
139+
return Tokenizer(tokenizers::Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens),
140+
info_value);
141+
}
142+
}
140143
if (std::filesystem::exists(rwkvworld)) {
144+
// Check RWKV
141145
return Tokenizer(tokenizers::Tokenizer::FromBlobRWKVWorld(rwkvworld.string()), info_value);
142146
}
143147
LOG(FATAL) << "Cannot find any tokenizer under: " << _path;

0 commit comments

Comments
 (0)