Skip to content

Commit 94597d0

Browse files
committed
Update llama-vocab.cpp
1 parent a4180af commit 94597d0

File tree

1 file changed

+1
-1
lines changed

1 file changed

+1
-1
lines changed

src/llama-vocab.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
430 430   regex_exprs = {
431 431   // 1. Han characters
432 432   "[一-鿿]+",
433     - // 2.
    433 + // 2. Replace unicode with replacements from https://github.com/ggml-org/llama.cpp/blob/923e3ea2e3c96a0b4c208f53bc3bc90cdcdf13c0/src/unicode.cpp#L675
434 434   "[^\\r\\nA-Za-z0-9]?(?=(?:(?![一-鿿])[A-Za-z])*?[a-z])(?:(?![一-鿿])[A-Za-z])+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\nA-Za-z0-9]?(?=(?:(?![一-鿿])[A-Za-z])*?[A-Z])(?:(?![一-鿿])[A-Za-z])+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
435 435   // 3. Numbers (1-3 digits)
436 436   "\\p{N}{1,3}",

0 commit comments

Comments
 (0)