Skip to content

Commit 9cf3285

Browse files
committed
Update llama-vocab.cpp
1 parent 1b645e6 commit 9cf3285

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

src/llama-vocab.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -431,9 +431,9 @@ struct llm_tokenizer_bpe : llm_tokenizer {
431 431
// 1. Han characters
432 432
"[\\p{Han}]+",
433 433
// 2. Words ending in lowercase (non-Han) with contractions
434-
"[^\\r\\n\\p{L}\\p{N}]?(?:(?![\\p{Han}])[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}])*(?:(?![\\p{Han}])[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}])+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
434+
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
435 435
// 3. Words starting with uppercase (non-Han) with contractions
436-
"[^\\r\\n\\p{L}\\p{N}]?(?:(?![\\p{Han}])[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}])+(?:(?![\\p{Han}])[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}])*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
436+
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
437 437
// 4. Numbers (1-3 digits)
438 438
"\\p{N}{1,3}",
439 439
// 5. Punctuation and symbols

0 commit comments

Comments
 (0)