@@ -113,44 +113,6 @@ static Result<TokenMap> _load_token_map(const std::string& path) {
113
113
// ------------------------------Util end------------------------------------
114
114
// -------------------------private method start-------------------------------
115
115
116
- template <typename T>
117
- std::pair<std::optional<std::string>, re2::StringPiece>
118
- Tiktoken::_split_with_allowed_special_token (
119
- re2::StringPiece& input,
120
- const T& allowed_special) const {
121
- if (!special_token_regex_) {
122
- return std::make_pair (std::nullopt, input);
123
- }
124
-
125
- #if __cplusplus >= 202002L
126
- auto start = input.begin ();
127
- #else
128
- const char * start = input.data ();
129
- #endif
130
- std::string special;
131
- while (true ) {
132
- if (!re2::RE2::FindAndConsume (&input, *special_token_regex_, &special)) {
133
- // No special token.
134
- break ;
135
- }
136
-
137
- if (allowed_special.tryGetInteger (special)) {
138
- // Found an allowed special token, split the text with it.
139
- #if __cplusplus >= 202002L
140
- return std::make_pair (
141
- special,
142
- re2::StringPiece (start, input.begin () - start - special.size ()));
143
- #else
144
- return std::make_pair (
145
- special,
146
- re2::StringPiece (start, (input.data () - start) - special.size ()));
147
- #endif
148
- } // else try to find the next special token
149
- }
150
-
151
- return std::make_pair (std::nullopt, input);
152
- }
153
-
154
116
Error Tiktoken::_encode (
155
117
re2::StringPiece& input,
156
118
std::vector<uint64_t >& ret,
@@ -179,43 +141,6 @@ void Tiktoken::_decode(re2::StringPiece input, std::string& ret) const {
179
141
#endif
180
142
}
181
143
182
- template <typename T>
183
- Result<std::pair<std::vector<uint64_t >, uint64_t >>
184
- Tiktoken::_encode_with_special_token (
185
- const std::string& text,
186
- const T& allowed_special) const {
187
- std::vector<uint64_t > tokens;
188
- uint64_t last_piece_token_len = 0 ;
189
- re2::StringPiece input (text);
190
- while (true ) {
191
- auto [special, sub_input] =
192
- _split_with_allowed_special_token (input, allowed_special);
193
-
194
- TK_CHECK_OK_OR_RETURN_ERROR (
195
- _encode (sub_input, tokens, last_piece_token_len));
196
-
197
- if (special) {
198
- const auto result = special_token_map_->tryGetInteger (*special);
199
- if (!result) {
200
- // Should never go here, since special pattern includes all special
201
- // chars.
202
- TK_LOG (Error, " unknown special token: %s" , special->c_str ());
203
- return Error::EncodeFailure;
204
- }
205
-
206
- tokens.push_back (*result);
207
- last_piece_token_len = 0 ;
208
- } else {
209
- break ;
210
- }
211
- }
212
-
213
- // last_piece_token_len is how many tokens came from the last regex split.
214
- // This is used for determining unstable tokens, since you can't merge
215
- // across (stable) regex splits
216
- return std::make_pair (tokens, last_piece_token_len);
217
- }
218
-
219
144
// -------------------------private method end-------------------------------
220
145
// -------------------------public method start-------------------------------
221
146
0 commit comments