Skip to content

Commit 9b1fae8

Browse files
committed
Try gabriellarson's regex handling
ggml-org#14654
1 parent 94597d0 commit 9b1fae8

File tree

3 files changed

+209
-17
lines changed

3 files changed

+209
-17
lines changed

src/llama-vocab.cpp

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -426,22 +426,10 @@ struct llm_tokenizer_bpe : llm_tokenizer {
426426
};
427427
break;
428428
case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
429-
// Same as GPT-4o tokenizer except for Han characters [\\p{Han}]+
430429
regex_exprs = {
431-
// 1. Han characters
432-
"[一-鿿]+",
433-
// 2. Replace unicode with replacements from https://github.com/ggml-org/llama.cpp/blob/923e3ea2e3c96a0b4c208f53bc3bc90cdcdf13c0/src/unicode.cpp#L675
434-
"[^\\r\\nA-Za-z0-9]?(?=(?:(?![一-鿿])[A-Za-z])*?[a-z])(?:(?![一-鿿])[A-Za-z])+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\nA-Za-z0-9]?(?=(?:(?![一-鿿])[A-Za-z])*?[A-Z])(?:(?![一-鿿])[A-Za-z])+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
435-
// 3. Numbers (1-3 digits)
436-
"\\p{N}{1,3}",
437-
// 4. Punctuation and symbols
438-
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*",
439-
// 5. Newlines
440-
"\\s*[\\r\\n]+",
441-
// 6.Whitespace at the end of a line
442-
"\\s+(?!\\S)",
443-
// 7. General whitespace
444-
"\\s+",
430+
// K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
431+
// The custom handler implements all K2 patterns with proper Han character exclusion
432+
"\\p{Han}+",
445433
};
446434
break;
447435
default:

src/unicode.cpp

Lines changed: 203 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,173 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
557557
return bpe_offsets;
558558
}
559559

560+
// K2 system regex patterns (from tokenization_kimi.py):
561+
// [\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
562+
static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string & text, const std::vector<size_t> & offsets) {
563+
std::vector<size_t> bpe_offsets;
564+
bpe_offsets.reserve(offsets.size());
565+
566+
const auto cpts = unicode_cpts_from_utf8(text);
567+
568+
size_t start = 0;
569+
for (auto offset : offsets) {
570+
const size_t offset_ini = start;
571+
const size_t offset_end = start + offset;
572+
assert(offset_end <= cpts.size());
573+
start = offset_end;
574+
575+
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
576+
auto _get_cpt = [&] (const size_t pos) -> uint32_t {
577+
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
578+
};
579+
580+
auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
581+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
582+
};
583+
584+
size_t _prev_end = offset_ini;
585+
auto _add_token = [&] (const size_t end) -> size_t {
586+
assert(_prev_end <= end && end <= offset_end);
587+
size_t len = end - _prev_end;
588+
if (len > 0) {
589+
bpe_offsets.push_back(len);
590+
}
591+
_prev_end = end;
592+
return len;
593+
};
594+
595+
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
596+
const uint32_t cpt = _get_cpt(pos);
597+
const auto flags = _get_flags(pos);
598+
599+
// Pattern 1: [\p{Han}]+ (Chinese characters)
600+
if (unicode_cpt_is_han(cpt)) {
601+
while (unicode_cpt_is_han(_get_cpt(pos))) {
602+
pos++;
603+
}
604+
_add_token(pos);
605+
continue;
606+
}
607+
608+
// Pattern 2 & 3: Letter words excluding Han characters with optional contractions
609+
// [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?:'s|'t|'re|'ve|'m|'ll|'d)?
610+
// [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?:'s|'t|'re|'ve|'m|'ll|'d)?
611+
if (flags.is_letter && !unicode_cpt_is_han(cpt)) {
612+
// Handle optional leading non-letter/non-number character
613+
bool has_leading_char = false;
614+
if (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number)) {
615+
has_leading_char = true;
616+
pos++;
617+
}
618+
619+
// Match letter sequence (excluding Han characters)
620+
bool has_letters = false;
621+
while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) {
622+
has_letters = true;
623+
pos++;
624+
}
625+
626+
// Only proceed if we found letters (after potentially skipping leading char)
627+
if (has_letters || (!has_leading_char && _get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos)))) {
628+
if (!has_letters) pos++; // consume the first letter if we didn't already
629+
630+
// Continue consuming letters
631+
while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) {
632+
pos++;
633+
}
634+
635+
// Check for optional contractions (?:'s|'t|'re|'ve|'m|'ll|'d)
636+
if (_get_cpt(pos) == '\'' && pos + 1 < offset_end) {
637+
uint32_t cpt_next = unicode_tolower(_get_cpt(pos + 1));
638+
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
639+
pos += 2;
640+
} else if (pos + 2 < offset_end) {
641+
uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos + 2));
642+
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
643+
(cpt_next == 'v' && cpt_next_next == 'e') ||
644+
(cpt_next == 'l' && cpt_next_next == 'l')) {
645+
pos += 3;
646+
}
647+
}
648+
}
649+
650+
_add_token(pos);
651+
continue;
652+
} else if (has_leading_char) {
653+
// We consumed a leading char but found no letters, backtrack
654+
pos--;
655+
}
656+
}
657+
658+
// Pattern 4: \p{N}{1,3} (numbers 1-3 digits)
659+
if (flags.is_number) {
660+
size_t ini = pos;
661+
while (_get_flags(pos).is_number) {
662+
if (++pos - ini >= 3) {
663+
_add_token(pos);
664+
ini = pos;
665+
}
666+
}
667+
_add_token(pos);
668+
continue;
669+
}
670+
671+
// Pattern 5: ?[^\s\p{L}\p{N}]+[\r\n]* (optional space + non-word chars + optional newlines)
672+
auto flags2 = (cpt == ' ' ? _get_flags(pos + 1) : flags);
673+
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) {
674+
pos += (cpt == ' ');
675+
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) {
676+
flags2 = _get_flags(++pos);
677+
}
678+
// Match optional [\r\n]*
679+
uint32_t cpt2 = _get_cpt(pos);
680+
while (cpt2 == '\r' || cpt2 == '\n') {
681+
cpt2 = _get_cpt(++pos);
682+
}
683+
_add_token(pos);
684+
continue;
685+
}
686+
687+
// Count whitespace characters
688+
size_t num_whitespaces = 0;
689+
size_t last_end_r_or_n = 0;
690+
while (_get_flags(pos + num_whitespaces).is_whitespace) {
691+
uint32_t cpt2 = _get_cpt(pos + num_whitespaces);
692+
if (cpt2 == '\r' || cpt2 == '\n') {
693+
last_end_r_or_n = pos + num_whitespaces + 1;
694+
}
695+
num_whitespaces++;
696+
}
697+
698+
// Pattern 6: \s*[\r\n]+ (whitespace with newlines)
699+
if (last_end_r_or_n > 0) {
700+
pos = last_end_r_or_n;
701+
_add_token(pos);
702+
continue;
703+
}
704+
705+
// Pattern 7: \s+(?!\S) (trailing whitespace)
706+
if (num_whitespaces > 1 && _get_cpt(pos + num_whitespaces) != OUT_OF_RANGE) {
707+
pos += num_whitespaces - 1;
708+
_add_token(pos);
709+
continue;
710+
}
711+
712+
// Pattern 8: \s+ (general whitespace)
713+
if (num_whitespaces > 0) {
714+
pos += num_whitespaces;
715+
_add_token(pos);
716+
continue;
717+
}
718+
719+
// No matches - consume single character
720+
_add_token(++pos);
721+
}
722+
}
723+
724+
return bpe_offsets;
725+
}
726+
560727
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
561728
std::vector<size_t> bpe_offsets;
562729

@@ -567,6 +734,9 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
567734
regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
568735

569736
bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
737+
} else if (regex_expr == "\\p{Han}+") {
738+
// K2's first pattern - handle all K2 patterns together
739+
bpe_offsets = unicode_regex_split_custom_kimi_k2(text, offsets);
570740
}
571741

572742
return bpe_offsets;
@@ -672,6 +842,38 @@ uint32_t unicode_tolower(uint32_t cpt) {
672842
return cpt; // Return the original code point if no lowercase mapping is found
673843
}
674844

845+
// Returns true when `cpt` lies in one of the code point ranges treated as Han (\p{Han}).
//
// The table covers the CJK ideograph blocks (Unified Ideographs, Extensions A-I, Compatibility
// Ideographs and their Supplement) plus the non-ideograph characters that carry the Han script
// property: CJK/Kangxi radicals, the ideographic iteration marks, ideographic zero and the
// Hangzhou numerals. Ideograph ranges are given as whole blocks, so a few unassigned code points
// at block ends are accepted as well.
bool unicode_cpt_is_han(uint32_t cpt) {
    struct cpt_range {
        uint32_t first;
        uint32_t last;
    };

    // inclusive ranges, sorted by first code point
    static constexpr cpt_range k_han_ranges[] = {
        { 0x2E80,  0x2E99  }, // CJK Radicals Supplement
        { 0x2E9B,  0x2EF3  }, // CJK Radicals Supplement (continued)
        { 0x2F00,  0x2FD5  }, // Kangxi Radicals
        { 0x3005,  0x3005  }, // Ideographic Iteration Mark
        { 0x3007,  0x3007  }, // Ideographic Number Zero
        { 0x3021,  0x3029  }, // Hangzhou Numerals One..Nine
        { 0x3038,  0x303B  }, // Hangzhou Numerals Ten..Thirty, Vertical Ideographic Iteration Mark
        { 0x3400,  0x4DBF  }, // CJK Extension A
        { 0x4E00,  0x9FFF  }, // CJK Unified Ideographs (most common)
        { 0xF900,  0xFAFF  }, // CJK Compatibility Ideographs
        { 0x20000, 0x2A6DF }, // CJK Extension B
        { 0x2A700, 0x2B73F }, // CJK Extension C
        { 0x2B740, 0x2B81F }, // CJK Extension D
        { 0x2B820, 0x2CEAF }, // CJK Extension E
        { 0x2CEB0, 0x2EBEF }, // CJK Extension F
        { 0x2EBF0, 0x2EE5F }, // CJK Extension I
        { 0x2F800, 0x2FA1F }, // CJK Compatibility Ideographs Supplement
        { 0x30000, 0x3134F }, // CJK Extension G
        { 0x31350, 0x323AF }, // CJK Extension H
    };

    for (const auto & range : k_han_ranges) {
        if (cpt < range.first) {
            // table is sorted: no later range can contain cpt
            return false;
        }
        if (cpt <= range.last) {
            return true;
        }
    }

    return false;
}
876+
675877
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
676878
// unicode categories
677879
static const std::map<std::string, int> k_ucat_enum = {
@@ -851,4 +1053,4 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
8511053
}
8521054

8531055
return unicode_byte_encoding_process(bpe_words);
854-
}
1056+
}

src/unicode.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,6 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8);
6363

6464
uint32_t unicode_tolower(uint32_t cpt);
6565

66-
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
66+
bool unicode_cpt_is_han(uint32_t cpt);
67+
68+
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);

0 commit comments

Comments
 (0)