Skip to content

Commit 1687be3

Browse files
committed
Update llama-vocab.cpp
1 parent 9cf3285 commit 1687be3

File tree

1 file changed

+6
-8
lines changed

1 file changed

+6
-8
lines changed

src/llama-vocab.cpp

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -431,18 +431,16 @@ struct llm_tokenizer_bpe : llm_tokenizer {
431431
// 1. Han characters
432432
"[\\p{Han}]+",
433433
// 2. Words (non-Han), either ending in lowercase or starting with uppercase, with optional contractions
434-
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
435-
// 3. Words starting with uppercase (non-Han) with contractions
436-
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
437-
// 4. Numbers (1-3 digits)
434+
"[^\\r\\n\\p{L}\\p{N}]?(?:(?!\\p{Han})(?:\\p{Lu}|\\p{Lt}|\\p{Lm}|\\p{Lo}|\\p{M})*(?!\\p{Han})(?:\\p{Ll}|\\p{Lm}|\\p{Lo}|\\p{M})+|(?!\\p{Han})(?:\\p{Lu}|\\p{Lt}|\\p{Lm}|\\p{Lo}|\\p{M})+(?!\\p{Han})(?:\\p{Ll}|\\p{Lm}|\\p{Lo}|\\p{M})*)(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
435+
// 3. Numbers (1-3 digits)
438436
"\\p{N}{1,3}",
439-
// 5. Punctuation and symbols
437+
// 4. Punctuation and symbols
440438
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*",
441-
// 6. Newlines
439+
// 5. Newlines
442440
"\\s*[\\r\\n]+",
443-
// 7.Whitespace at the end of a line
441+
// 6. Whitespace at the end of a line
444442
"\\s+(?!\\S)",
445-
// 8. General whitespace
443+
// 7. General whitespace
446444
"\\s+",
447445
};
448446
break;

0 commit comments

Comments
 (0)