@@ -428,23 +428,21 @@ struct llm_tokenizer_bpe : llm_tokenizer {
428
428
case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
429
429
// Same as GPT-4o tokenizer except for Han characters [\\p{Han}]+
430
430
regex_exprs = {
431
- // 1. Add the high-priority Han character rule. Backslashes must be escaped.
431
+ // 1. Han characters
432
432
" [\\ p{Han}]+" ,
433
-
434
- // 2 & 3. Use the adapted word patterns from GPT4O/Tekken, which emulate the uppercase/lowercase logic in a C++-compatible way.
435
- // We also adapt the case-insensitive contraction to be C++ compatible.
436
- " [^\\ r\\ n\\ p{L}\\ p{N}]?((?=[\\ p{L}])([^a-z]))*((?=[\\ p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?" ,
437
- " [^\\ r\\ n\\ p{L}\\ p{N}]?((?=[\\ p{L}])([^a-z]))+((?=[\\ p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?" ,
438
-
439
- // 4. Add the number rule.
433
+ // 2. Words ending in lowercase (non-Han) with contractions
434
+ " [^\\ r\\ n\\ p{L}\\ p{N}]?(?:(?![\\ p{Han}])[\\ p{Lu}\\ p{Lt}\\ p{Lm}\\ p{Lo}\\ p{M}])*(?:(?![\\ p{Han}])[\\ p{Ll}\\ p{Lm}\\ p{Lo}\\ p{M}])+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?" ,
435
+ // 3. Words starting with uppercase (non-Han) with contractions
436
+ " [^\\ r\\ n\\ p{L}\\ p{N}]?(?:(?![\\ p{Han}])[\\ p{Lu}\\ p{Lt}\\ p{Lm}\\ p{Lo}\\ p{M}])+(?:(?![\\ p{Han}])[\\ p{Ll}\\ p{Lm}\\ p{Lo}\\ p{M}])*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?" ,
437
+ // 4. Numbers (1-3 digits)
440
438
" \\ p{N}{1,3}" ,
441
-
442
- // 5. Use the Kimi K2 symbol rule precisely (no trailing '/').
439
+ // 5. Punctuation and symbols
443
440
" ?[^\\ s\\ p{L}\\ p{N}]+[\\ r\\ n]*" ,
444
-
445
- // 6, 7, 8. Add the identical whitespace rules.
441
+ // 6. Newlines
446
442
" \\ s*[\\ r\\ n]+" ,
443
+ // 7. Whitespace not followed by a non-whitespace character (e.g. at the end of a line)
447
444
" \\ s+(?!\\ S)" ,
445
+ // 8. General whitespace
448
446
" \\ s+" ,
449
447
};
450
448
break ;
0 commit comments