Skip to content

Commit 1b645e6

Browse files
committed
Update llama-vocab.cpp
1 parent 97b117c commit 1b645e6

File tree

1 file changed

+10
-12
lines changed

1 file changed

+10
-12
lines changed

src/llama-vocab.cpp

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -428,23 +428,21 @@ struct llm_tokenizer_bpe : llm_tokenizer {
428428
case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
429429
// Same as GPT-4o tokenizer except for Han characters [\\p{Han}]+
430430
regex_exprs = {
431-
// 1. Add the high-priority Han character rule. Backslashes must be escaped.
431+
// 1. Han characters
432432
"[\\p{Han}]+",
433-
434-
// 2 & 3. Use the adapted word patterns from GPT4O/Tekken, which emulate the uppercase/lowercase logic in a C++-compatible way.
435-
// We also adapt the case-insensitive contraction to be C++ compatible.
436-
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
437-
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
438-
439-
// 4. Add the number rule.
433+
// 2. Words ending in lowercase (non-Han) with contractions
434+
"[^\\r\\n\\p{L}\\p{N}]?(?:(?![\\p{Han}])[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}])*(?:(?![\\p{Han}])[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}])+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
435+
// 3. Words starting with uppercase (non-Han) with contractions
436+
"[^\\r\\n\\p{L}\\p{N}]?(?:(?![\\p{Han}])[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}])+(?:(?![\\p{Han}])[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}])*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
437+
// 4. Numbers (1-3 digits)
440438
"\\p{N}{1,3}",
441-
442-
// 5. Use the Kimi K2 symbol rule precisely (no trailing '/').
439+
// 5. Punctuation and symbols
443440
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*",
444-
445-
// 6, 7, 8. Add the identical whitespace rules.
441+
// 6. Newlines
446442
"\\s*[\\r\\n]+",
443+
// 7. Whitespace not followed by a non-whitespace character
447444
"\\s+(?!\\S)",
445+
// 8. General whitespace
448446
"\\s+",
449447
};
450448
break;

0 commit comments

Comments
 (0)