Skip to content

Commit a4180af

Browse files
committed
Update llama-vocab.cpp
1 parent 1a6e517 commit a4180af

File tree

1 file changed

+1
-1
lines changed

1 file changed

+1
-1
lines changed

src/llama-vocab.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -431,7 +431,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
431 431
// 1. Han characters
432 432
"[一-鿿]+",
433 433
// 2.
434-
"[^\\r\\n\\p{L}\\p{N}]?(?=(?:(?![一-鿿])[\\p{L}\\p{M}])*?[a-z])(?:(?![一-鿿])[\\p{L}\\p{M}])+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?(?=(?:(?![一-鿿])[\\p{L}\\p{M}])*?[A-Z])(?:(?![一-鿿])[\\p{L}\\p{M}])+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
434+
"[^\\r\\nA-Za-z0-9]?(?=(?:(?![一-鿿])[A-Za-z])*?[a-z])(?:(?![一-鿿])[A-Za-z])+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\nA-Za-z0-9]?(?=(?:(?![一-鿿])[A-Za-z])*?[A-Z])(?:(?![一-鿿])[A-Za-z])+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
435 435
// 3. Numbers (1-3 digits)
436 436
"\\p{N}{1,3}",
437 437
// 4. Punctuation and symbols

0 commit comments

Comments
 (0)