Skip to content

Commit 7e1e74f

Browse files
committed
Update llama-vocab.cpp
1 parent 1687be3 commit 7e1e74f

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

src/llama-vocab.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -430,8 +430,8 @@ struct llm_tokenizer_bpe : llm_tokenizer {
430 430
regex_exprs = {
431 431
// 1. Han characters
432 432
"[\\p{Han}]+",
433-
// 2. Words ending in lowercase (non-Han) with contractions
434-
"[^\\r\\n\\p{L}\\p{N}]?(?:(?!\\p{Han})(?:\\p{Lu}|\\p{Lt}|\\p{Lm}|\\p{Lo}|\\p{M})*(?!\\p{Han})(?:\\p{Ll}|\\p{Lm}|\\p{Lo}|\\p{M})+|(?!\\p{Han})(?:\\p{Lu}|\\p{Lt}|\\p{Lm}|\\p{Lo}|\\p{M})+(?!\\p{Han})(?:\\p{Ll}|\\p{Lm}|\\p{Lo}|\\p{M})*)(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
433+
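// 2. Extract Latin/Greek/Cyrillic (or any non-Han) "words" from text that may also contain CJK: an optional leading non-letter/non-digit character, then a run of non-Han letters and combining marks, then an optional English contraction suffix ('s, 't, 're, 've, 'm, 'll, 'd). The upper/lower split is written as [^a-z] / [^A-Z], so it distinguishes case for ASCII letters only.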
434+
"[^\\r\\n\\p{L}\\p{N}]?(?:(?:(?=[\\p{L}])(?!\\p{Han})[^a-z])|\\p{M})*(?:(?:(?=[\\p{L}])(?!\\p{Han})[^A-Z])|\\p{M})+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?(?:(?:(?=[\\p{L}])(?!\\p{Han})[^a-z])|\\p{M})+(?:(?:(?=[\\p{L}])(?!\\p{Han})[^A-Z])|\\p{M})*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
435 435
// 3. Numbers (1-3 digits)
436 436
"\\p{N}{1,3}",
437 437
// 4. Punctuation and symbols

0 commit comments

Comments
 (0)