Skip to content

Commit 6a9a7e8

Browse files
committed
Update llama-vocab.cpp
1 parent 7e1e74f commit 6a9a7e8

File tree

1 file changed

+1
-14
lines changed

1 file changed

+1
-14
lines changed

src/llama-vocab.cpp

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -428,20 +428,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
428428
case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
429429
// Same as GPT-4o tokenizer except for Han characters [\\p{Han}]+
430430
regex_exprs = {
431-
// 1. Han characters
432-
"[\\p{Han}]+",
433-
// 2. Extract Latin/Greek/Cyrillic (or any non-Han) "words" from text that may also contain CJK.
434-
"[^\\r\\n\\p{L}\\p{N}]?(?:(?:(?=[\\p{L}])(?!\\p{Han})[^a-z])|\\p{M})*(?:(?:(?=[\\p{L}])(?!\\p{Han})[^A-Z])|\\p{M})+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?(?:(?:(?=[\\p{L}])(?!\\p{Han})[^a-z])|\\p{M})+(?:(?:(?=[\\p{L}])(?!\\p{Han})[^A-Z])|\\p{M})*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
435-
// 3. Numbers (1-3 digits)
436-
"\\p{N}{1,3}",
437-
// 4. Punctuation and symbols
438-
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*",
439-
// 5. Newlines
440-
"\\s*[\\r\\n]+",
441-
// 6. Whitespace at the end of a line
442-
"\\s+(?!\\S)",
443-
// 7. General whitespace
444-
"\\s+",
431+
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
445432
};
446433
break;
447434
default:

0 commit comments

Comments
 (0)