@@ -103,30 +103,18 @@ Tokenizer Tokenizer::FromPath(const String& _path, std::optional<TokenizerInfo>
     sentencepiece = path / "tokenizer.model";
     huggingface = path / "tokenizer.json";
     rwkvworld = path / "tokenizer_model";
-    // Check ByteLevelBPE
-    {
-      std::filesystem::path merges_path = path / "merges.txt";
-      std::filesystem::path vocab_path = path / "vocab.json";
-      std::filesystem::path added_tokens_path = path / "added_tokens.json";
-      if (std::filesystem::exists(merges_path) && std::filesystem::exists(vocab_path) &&
-          std::filesystem::exists(added_tokens_path)) {
-        std::string vocab = LoadBytesFromFile(vocab_path.string());
-        std::string merges = LoadBytesFromFile(merges_path.string());
-        std::string added_tokens = LoadBytesFromFile(added_tokens_path.string());
-        return Tokenizer(tokenizers::Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens),
-                         info_value);
-      }
-    }
   } else {
     sentencepiece = path.parent_path() / "tokenizer.model";
     huggingface = path.parent_path() / "tokenizer.json";
     rwkvworld = path.parent_path() / "tokenizer_model";
   }
   if (std::filesystem::exists(huggingface)) {
+    // Check HuggingFace
     return Tokenizer(tokenizers::Tokenizer::FromBlobJSON(LoadBytesFromFile(huggingface.string())),
                      info_value);
   }
   if (std::filesystem::exists(sentencepiece)) {
+    // Check SentencePiece
     LOG(WARNING)
         << "Using `tokenizer.model` since we cannot locate `tokenizer.json`.\n"
         << "It is recommended to use `tokenizer.json` to ensure all token mappings are included, "
@@ -137,7 +125,22 @@ Tokenizer Tokenizer::FromPath(const String& _path, std::optional<TokenizerInfo>
         tokenizers::Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(sentencepiece.string())),
         info_value);
   }
+  {
+    // Check ByteLevelBPE
+    std::filesystem::path merges_path = path / "merges.txt";
+    std::filesystem::path vocab_path = path / "vocab.json";
+    std::filesystem::path added_tokens_path = path / "added_tokens.json";
+    if (std::filesystem::exists(merges_path) && std::filesystem::exists(vocab_path) &&
+        std::filesystem::exists(added_tokens_path)) {
+      std::string vocab = LoadBytesFromFile(vocab_path.string());
+      std::string merges = LoadBytesFromFile(merges_path.string());
+      std::string added_tokens = LoadBytesFromFile(added_tokens_path.string());
+      return Tokenizer(tokenizers::Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens),
+                       info_value);
+    }
+  }
   if (std::filesystem::exists(rwkvworld)) {
+    // Check RWKV
     return Tokenizer(tokenizers::Tokenizer::FromBlobRWKVWorld(rwkvworld.string()), info_value);
   }
   LOG(FATAL) << "Cannot find any tokenizer under: " << _path;
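
Net effect: the ByteLevelBPE probe now runs after the HuggingFace and SentencePiece probes, so the lookup order becomes tokenizer.json, then tokenizer.model, then the vocab.json/merges.txt/added_tokens.json triple, and finally the RWKV tokenizer_model. A minimal caller sketch follows; the model directory path is hypothetical, and it assumes FromPath's second argument may be std::nullopt and that the Tokenizer reference exposes Encode as elsewhere in this codebase:

// Minimal sketch of a caller (hypothetical model directory path).
// FromPath() probes for tokenizer.json first, then tokenizer.model,
// then the vocab.json/merges.txt/added_tokens.json triple, then
// tokenizer_model, and aborts via LOG(FATAL) if none is found.
Tokenizer tokenizer = Tokenizer::FromPath("dist/my-model", std::nullopt);
std::vector<int32_t> token_ids = tokenizer->Encode("Hello, world!");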