Skip to content

Commit 1a6e517

Browse files
committed
Update llama-vocab.cpp
1 parent 6a9a7e8 commit 1a6e517

File tree

1 file changed

+14
-1
lines changed

1 file changed

+14
-1
lines changed

src/llama-vocab.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,20 @@ struct llm_tokenizer_bpe : llm_tokenizer {
428428
case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
429429
// Same as GPT-4o tokenizer except for Han characters [\\p{Han}]+
430430
regex_exprs = {
431-
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
431+
// 1. Han characters (literal range U+4E00-U+9FFF, the CJK Unified Ideographs block)
432+
"[一-鿿]+",
433+
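// 2. Words: an optional leading non-letter/non-digit character, then a run of non-Han letters/marks containing an ASCII lowercase letter (first alternative) or an ASCII uppercase letter (second alternative), with an optional English contraction suffix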
434+
"[^\\r\\n\\p{L}\\p{N}]?(?=(?:(?![一-鿿])[\\p{L}\\p{M}])*?[a-z])(?:(?![一-鿿])[\\p{L}\\p{M}])+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?(?=(?:(?![一-鿿])[\\p{L}\\p{M}])*?[A-Z])(?:(?![一-鿿])[\\p{L}\\p{M}])+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
435+
// 3. Numbers (1-3 digits)
436+
"\\p{N}{1,3}",
437+
// 4. Punctuation and symbols
438+
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*",
439+
// 5. Newlines
440+
"\\s*[\\r\\n]+",
441+
// 6. Whitespace at the end of a line
442+
"\\s+(?!\\S)",
443+
// 7. General whitespace
444+
"\\s+",
432445
};
433446
break;
434447
default:

0 commit comments

Comments
 (0)