@@ -557,6 +557,173 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
557
557
return bpe_offsets;
558
558
}
559
559
560
+ // K2 system regex patterns (from tokenization_kimi.py):
561
+ // [\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
562
+ static std::vector<size_t > unicode_regex_split_custom_kimi_k2 (const std::string & text, const std::vector<size_t > & offsets) {
563
+ std::vector<size_t > bpe_offsets;
564
+ bpe_offsets.reserve (offsets.size ());
565
+
566
+ const auto cpts = unicode_cpts_from_utf8 (text);
567
+
568
+ size_t start = 0 ;
569
+ for (auto offset : offsets) {
570
+ const size_t offset_ini = start;
571
+ const size_t offset_end = start + offset;
572
+ assert (offset_end <= cpts.size ());
573
+ start = offset_end;
574
+
575
+ static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF ;
576
+ auto _get_cpt = [&] (const size_t pos) -> uint32_t {
577
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
578
+ };
579
+
580
+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
581
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt (cpts[pos]) : unicode_cpt_flags{};
582
+ };
583
+
584
+ size_t _prev_end = offset_ini;
585
+ auto _add_token = [&] (const size_t end) -> size_t {
586
+ assert (_prev_end <= end && end <= offset_end);
587
+ size_t len = end - _prev_end;
588
+ if (len > 0 ) {
589
+ bpe_offsets.push_back (len);
590
+ }
591
+ _prev_end = end;
592
+ return len;
593
+ };
594
+
595
+ for (size_t pos = offset_ini; pos < offset_end; /* pos++*/ ) {
596
+ const uint32_t cpt = _get_cpt (pos);
597
+ const auto flags = _get_flags (pos);
598
+
599
+ // Pattern 1: [\p{Han}]+ (Chinese characters)
600
+ if (unicode_cpt_is_han (cpt)) {
601
+ while (unicode_cpt_is_han (_get_cpt (pos))) {
602
+ pos++;
603
+ }
604
+ _add_token (pos);
605
+ continue ;
606
+ }
607
+
608
+ // Pattern 2 & 3: Letter words excluding Han characters with optional contractions
609
+ // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?:'s|'t|'re|'ve|'m|'ll|'d)?
610
+ // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?:'s|'t|'re|'ve|'m|'ll|'d)?
611
+ if (flags.is_letter && !unicode_cpt_is_han (cpt)) {
612
+ // Handle optional leading non-letter/non-number character
613
+ bool has_leading_char = false ;
614
+ if (!(cpt == ' \r ' || cpt == ' \n ' || flags.is_letter || flags.is_number )) {
615
+ has_leading_char = true ;
616
+ pos++;
617
+ }
618
+
619
+ // Match letter sequence (excluding Han characters)
620
+ bool has_letters = false ;
621
+ while (_get_flags (pos).is_letter && !unicode_cpt_is_han (_get_cpt (pos))) {
622
+ has_letters = true ;
623
+ pos++;
624
+ }
625
+
626
+ // Only proceed if we found letters (after potentially skipping leading char)
627
+ if (has_letters || (!has_leading_char && _get_flags (pos).is_letter && !unicode_cpt_is_han (_get_cpt (pos)))) {
628
+ if (!has_letters) pos++; // consume the first letter if we didn't already
629
+
630
+ // Continue consuming letters
631
+ while (_get_flags (pos).is_letter && !unicode_cpt_is_han (_get_cpt (pos))) {
632
+ pos++;
633
+ }
634
+
635
+ // Check for optional contractions (?:'s|'t|'re|'ve|'m|'ll|'d)
636
+ if (_get_cpt (pos) == ' \' ' && pos + 1 < offset_end) {
637
+ uint32_t cpt_next = unicode_tolower (_get_cpt (pos + 1 ));
638
+ if (cpt_next == ' s' || cpt_next == ' t' || cpt_next == ' m' || cpt_next == ' d' ) {
639
+ pos += 2 ;
640
+ } else if (pos + 2 < offset_end) {
641
+ uint32_t cpt_next_next = unicode_tolower (_get_cpt (pos + 2 ));
642
+ if ((cpt_next == ' r' && cpt_next_next == ' e' ) ||
643
+ (cpt_next == ' v' && cpt_next_next == ' e' ) ||
644
+ (cpt_next == ' l' && cpt_next_next == ' l' )) {
645
+ pos += 3 ;
646
+ }
647
+ }
648
+ }
649
+
650
+ _add_token (pos);
651
+ continue ;
652
+ } else if (has_leading_char) {
653
+ // We consumed a leading char but found no letters, backtrack
654
+ pos--;
655
+ }
656
+ }
657
+
658
+ // Pattern 4: \p{N}{1,3} (numbers 1-3 digits)
659
+ if (flags.is_number ) {
660
+ size_t ini = pos;
661
+ while (_get_flags (pos).is_number ) {
662
+ if (++pos - ini >= 3 ) {
663
+ _add_token (pos);
664
+ ini = pos;
665
+ }
666
+ }
667
+ _add_token (pos);
668
+ continue ;
669
+ }
670
+
671
+ // Pattern 5: ?[^\s\p{L}\p{N}]+[\r\n]* (optional space + non-word chars + optional newlines)
672
+ auto flags2 = (cpt == ' ' ? _get_flags (pos + 1 ) : flags);
673
+ if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number ) && flags2.as_uint ()) {
674
+ pos += (cpt == ' ' );
675
+ while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number ) && flags2.as_uint ()) {
676
+ flags2 = _get_flags (++pos);
677
+ }
678
+ // Match optional [\r\n]*
679
+ uint32_t cpt2 = _get_cpt (pos);
680
+ while (cpt2 == ' \r ' || cpt2 == ' \n ' ) {
681
+ cpt2 = _get_cpt (++pos);
682
+ }
683
+ _add_token (pos);
684
+ continue ;
685
+ }
686
+
687
+ // Count whitespace characters
688
+ size_t num_whitespaces = 0 ;
689
+ size_t last_end_r_or_n = 0 ;
690
+ while (_get_flags (pos + num_whitespaces).is_whitespace ) {
691
+ uint32_t cpt2 = _get_cpt (pos + num_whitespaces);
692
+ if (cpt2 == ' \r ' || cpt2 == ' \n ' ) {
693
+ last_end_r_or_n = pos + num_whitespaces + 1 ;
694
+ }
695
+ num_whitespaces++;
696
+ }
697
+
698
+ // Pattern 6: \s*[\r\n]+ (whitespace with newlines)
699
+ if (last_end_r_or_n > 0 ) {
700
+ pos = last_end_r_or_n;
701
+ _add_token (pos);
702
+ continue ;
703
+ }
704
+
705
+ // Pattern 7: \s+(?!\S) (trailing whitespace)
706
+ if (num_whitespaces > 1 && _get_cpt (pos + num_whitespaces) != OUT_OF_RANGE) {
707
+ pos += num_whitespaces - 1 ;
708
+ _add_token (pos);
709
+ continue ;
710
+ }
711
+
712
+ // Pattern 8: \s+ (general whitespace)
713
+ if (num_whitespaces > 0 ) {
714
+ pos += num_whitespaces;
715
+ _add_token (pos);
716
+ continue ;
717
+ }
718
+
719
+ // No matches - consume single character
720
+ _add_token (++pos);
721
+ }
722
+ }
723
+
724
+ return bpe_offsets;
725
+ }
726
+
560
727
static std::vector<size_t > unicode_regex_split_custom (const std::string & text, const std::string & regex_expr, const std::vector<size_t > & offsets) {
561
728
std::vector<size_t > bpe_offsets;
562
729
@@ -567,6 +734,9 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
567
734
regex_expr == " (?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\ r\\ n\\ p{L}\\ p{N}]?\\ p{L}+|\\ p{N}{1,3}| ?[^\\ s\\ p{L}\\ p{N}]+[\\ r\\ n]*|\\ s*[\\ r\\ n]+|\\ s+(?!\\ S)|\\ s+" ) {
568
735
569
736
bpe_offsets = unicode_regex_split_custom_llama3 (text, offsets);
737
+ } else if (regex_expr == " \\ p{Han}+" ) {
738
+ // K2's first pattern - handle all K2 patterns together
739
+ bpe_offsets = unicode_regex_split_custom_kimi_k2 (text, offsets);
570
740
}
571
741
572
742
return bpe_offsets;
@@ -672,6 +842,38 @@ uint32_t unicode_tolower(uint32_t cpt) {
672
842
return cpt; // Return the original code point if no lowercase mapping is found
673
843
}
674
844
845
// Returns true when `cpt` falls in one of the code point ranges treated as Han
// (the \p{Han} class used by the Kimi K2 pre-tokenizer patterns).
//
// The test is range based: whole blocks are accepted, including any code points
// inside them that are currently unassigned.
//
// NOTE(review): the CJK radical blocks (U+2E80..U+2EF3, U+2F00..U+2FD5) also carry
// Script=Han in the Unicode data but are not listed here - confirm whether the
// reference tokenizer needs them.
bool unicode_cpt_is_han(uint32_t cpt) {
    struct cpt_range {
        uint32_t first;
        uint32_t last;
    };

    // most common block first so typical text exits early
    static constexpr cpt_range k_han_ranges[] = {
        { 0x4E00,  0x9FFF  }, // CJK Unified Ideographs
        { 0x3400,  0x4DBF  }, // CJK Extension A
        { 0x3005,  0x3005  }, // ideographic iteration mark
        { 0x3007,  0x3007  }, // ideographic number zero
        { 0x3021,  0x3029  }, // Hangzhou numerals
        { 0x3038,  0x303B  }, // Hangzhou numerals ten/twenty/thirty, vertical iteration mark
        { 0xF900,  0xFAFF  }, // CJK Compatibility Ideographs
        { 0x20000, 0x2A6DF }, // CJK Extension B
        { 0x2A700, 0x2B73F }, // CJK Extension C
        { 0x2B740, 0x2B81F }, // CJK Extension D
        { 0x2B820, 0x2CEAF }, // CJK Extension E
        { 0x2CEB0, 0x2EBEF }, // CJK Extension F
        { 0x2EBF0, 0x2EE5F }, // CJK Extension I
        { 0x2F800, 0x2FA1F }, // CJK Compatibility Ideographs Supplement
        { 0x30000, 0x3134F }, // CJK Extension G
        { 0x31350, 0x323AF }, // CJK Extension H
    };

    for (const auto & range : k_han_ranges) {
        if (cpt >= range.first && cpt <= range.last) {
            return true;
        }
    }

    return false;
}

876
+
675
877
std::vector<std::string> unicode_regex_split (const std::string & text, const std::vector<std::string> & regex_exprs) {
676
878
// unicode categories
677
879
static const std::map<std::string, int > k_ucat_enum = {
@@ -851,4 +1053,4 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
851
1053
}
852
1054
853
1055
return unicode_byte_encoding_process (bpe_words);
854
- }
1056
+ }
0 commit comments