@@ -3703,7 +3703,7 @@ static void llm_load_vocab(
3703
3703
3704
3704
for (int i = 0; i < n_merges; i++) {
3705
3705
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
3706
- GGML_ASSERT(codepoints_from_utf8 (word).size() > 0);
3706
+ GGML_ASSERT(unicode_cpts_from_utf8 (word).size() > 0);
3707
3707
3708
3708
std::string first;
3709
3709
std::string second;
@@ -3748,7 +3748,7 @@ static void llm_load_vocab(
3748
3748
3749
3749
for (uint32_t i = 0; i < n_vocab; i++) {
3750
3750
std::string word = gguf_get_arr_str(ctx, token_idx, i);
3751
- GGML_ASSERT(codepoints_from_utf8 (word).size() > 0);
3751
+ GGML_ASSERT(unicode_cpts_from_utf8 (word).size() > 0);
3752
3752
3753
3753
vocab.token_to_id[word] = i;
3754
3754
@@ -9340,7 +9340,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
9340
9340
}
9341
9341
case LLAMA_VOCAB_TYPE_BPE: {
9342
9342
GGML_ASSERT(false);
9343
- return unicode_to_bytes_bpe (token_data.text);
9343
+ return unicode_utf8_to_byte (token_data.text);
9344
9344
}
9345
9345
case LLAMA_VOCAB_TYPE_WPM: {
9346
9346
GGML_ASSERT(false);
@@ -9365,7 +9365,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
9365
9365
}
9366
9366
case LLAMA_VOCAB_TYPE_WPM:
9367
9367
case LLAMA_VOCAB_TYPE_BPE: {
9368
- return vocab.token_to_id.at(bytes_to_unicode_bpe (ch));
9368
+ return vocab.token_to_id.at(unicode_byte_to_utf8 (ch));
9369
9369
}
9370
9370
default:
9371
9371
GGML_ASSERT(false);
@@ -9705,9 +9705,9 @@ struct llm_tokenizer_bpe {
9705
9705
bpe_words.reserve(text.size());
9706
9706
bpe_encoded_words.reserve(text.size());
9707
9707
9708
- auto cps = codepoints_from_utf8 (text);
9709
- for (size_t i = 0; i < cps .size(); ++i)
9710
- text_utf.emplace_back(codepoint_to_utf8(cps [i]));
9708
+ const auto cpts = unicode_cpts_from_utf8 (text);
9709
+ for (size_t i = 0; i < cpts .size(); ++i)
9710
+ text_utf.emplace_back(unicode_cpt_to_utf8(cpts [i]));
9711
9711
9712
9712
for (int i = 0; i < (int)text_utf.size(); i++) {
9713
9713
const std::string & utf_char = text_utf[i];
@@ -9757,40 +9757,40 @@ struct llm_tokenizer_bpe {
9757
9757
}
9758
9758
9759
9759
if (!split_condition && !collecting) {
9760
- if (codepoint_type (utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type (utf_char_next) == CODEPOINT_TYPE_LETTER)) {
9760
+ if (unicode_cpt_type (utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type (utf_char_next) == CODEPOINT_TYPE_LETTER)) {
9761
9761
collecting_letter = true;
9762
9762
collecting = true;
9763
9763
}
9764
- else if (codepoint_type (utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type (utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
9764
+ else if (unicode_cpt_type (utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type (utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
9765
9765
collecting_numeric = true;
9766
9766
collecting = true;
9767
9767
}
9768
9768
else if (
9769
- ((codepoint_type (utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type (utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type (utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
9770
- (!token.size() && utf_char == " " && codepoint_type (utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type (utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type (utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
9769
+ ((unicode_cpt_type (utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type (utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type (utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
9770
+ (!token.size() && utf_char == " " && unicode_cpt_type (utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type (utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type (utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
9771
9771
) {
9772
9772
collecting_special = true;
9773
9773
collecting = true;
9774
9774
}
9775
- else if (codepoint_type (utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type (utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
9775
+ else if (unicode_cpt_type (utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type (utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
9776
9776
collecting_whitespace_lookahead = true;
9777
9777
collecting = true;
9778
9778
}
9779
- else if (codepoint_type (utf_char) == CODEPOINT_TYPE_WHITESPACE) {
9779
+ else if (unicode_cpt_type (utf_char) == CODEPOINT_TYPE_WHITESPACE) {
9780
9780
split_condition = true;
9781
9781
}
9782
9782
}
9783
9783
else if (!split_condition && collecting) {
9784
- if (collecting_letter && codepoint_type (utf_char) != CODEPOINT_TYPE_LETTER) {
9784
+ if (collecting_letter && unicode_cpt_type (utf_char) != CODEPOINT_TYPE_LETTER) {
9785
9785
split_condition = true;
9786
9786
}
9787
- else if (collecting_numeric && codepoint_type (utf_char) != CODEPOINT_TYPE_DIGIT) {
9787
+ else if (collecting_numeric && unicode_cpt_type (utf_char) != CODEPOINT_TYPE_DIGIT) {
9788
9788
split_condition = true;
9789
9789
}
9790
- else if (collecting_special && (codepoint_type (utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type (utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type (utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
9790
+ else if (collecting_special && (unicode_cpt_type (utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type (utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type (utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
9791
9791
split_condition = true;
9792
9792
}
9793
- else if (collecting_whitespace_lookahead && (codepoint_type (utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type (utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
9793
+ else if (collecting_whitespace_lookahead && (unicode_cpt_type (utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type (utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
9794
9794
split_condition = true;
9795
9795
}
9796
9796
}
@@ -9819,7 +9819,7 @@ struct llm_tokenizer_bpe {
9819
9819
for (std::string & word : bpe_words) {
9820
9820
std::string encoded_token = "";
9821
9821
for (char & c : word) {
9822
- encoded_token += bytes_to_unicode_bpe (c);
9822
+ encoded_token += unicode_byte_to_utf8 (c);
9823
9823
}
9824
9824
bpe_encoded_words.emplace_back(encoded_token);
9825
9825
}
@@ -9893,33 +9893,21 @@ struct llm_tokenizer_wpm {
9893
9893
}
9894
9894
9895
9895
std::vector<std::string> preprocess(const std::string & text) {
9896
- // normalalization form D
9897
- std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
9898
- std::vector<uint32_t> nfd_codepoints;
9899
- for (uint32_t code : codepoints) {
9900
- auto it = nfd_map.equal_range(code);
9901
- if (it.first != it.second) {
9902
- for (auto jt = it.first; jt != it.second; jt++) {
9903
- nfd_codepoints.push_back(jt->second);
9904
- }
9905
- } else {
9906
- nfd_codepoints.push_back(code);
9907
- }
9908
- }
9896
+ std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
9909
9897
9910
9898
// strip accents, strip control, uniformize whitespace,
9911
9899
// to lowercase, pad chinese characters, pad punctuation
9912
9900
std::string new_str = "";
9913
- for (uint32_t code : nfd_codepoints ) {
9914
- int type = codepoint_type (code);
9901
+ for (uint32_t code : cpts_nfd ) {
9902
+ int type = unicode_cpt_type (code);
9915
9903
if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
9916
9904
continue;
9917
9905
}
9918
9906
code = to_lower(code);
9919
9907
if (type == CODEPOINT_TYPE_WHITESPACE) {
9920
9908
code = ' ';
9921
9909
}
9922
- std::string s = codepoint_to_utf8 (code);
9910
+ std::string s = unicode_cpt_to_utf8 (code);
9923
9911
if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
9924
9912
new_str += " ";
9925
9913
new_str += s;
@@ -9939,8 +9927,7 @@ struct llm_tokenizer_wpm {
9939
9927
if (r > l) words.push_back(new_str.substr(l, (r - l)));
9940
9928
l = r + 1;
9941
9929
r = l;
9942
- }
9943
- else {
9930
+ } else {
9944
9931
r += 1;
9945
9932
}
9946
9933
}
@@ -9964,17 +9951,17 @@ struct llm_tokenizer_wpm {
9964
9951
return code < 256 && ispunct(code);
9965
9952
}
9966
9953
9967
- bool is_chinese_char(uint32_t codepoint ) {
9968
- if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
9969
- (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
9970
- (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
9971
- (codepoint >= 0x2A700 && codepoint <= 0x2B73F) ||
9972
- (codepoint >= 0x2B740 && codepoint <= 0x2B81F) ||
9973
- (codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
9974
- (codepoint >= 0xF900 && codepoint <= 0xFAFF) ||
9975
- (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
9976
- (codepoint >= 0x3000 && codepoint <= 0x303F) ||
9977
- (codepoint >= 0xFF00 && codepoint <= 0xFFEF)) {
9954
+ bool is_chinese_char(uint32_t cpt ) {
9955
+ if ((cpt >= 0x4E00 && cpt <= 0x9FFF) ||
9956
+ (cpt >= 0x3400 && cpt <= 0x4DBF) ||
9957
+ (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
9958
+ (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
9959
+ (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
9960
+ (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
9961
+ (cpt >= 0xF900 && cpt <= 0xFAFF) ||
9962
+ (cpt >= 0x2F800 && cpt <= 0x2FA1F) ||
9963
+ (cpt >= 0x3000 && cpt <= 0x303F) ||
9964
+ (cpt >= 0xFF00 && cpt <= 0xFFEF)) {
9978
9965
return true; // NOLINT
9979
9966
}
9980
9967
return false;
@@ -13953,9 +13940,9 @@ int32_t llama_tokenize(
13953
13940
13954
13941
static std::string llama_decode_text(const std::string & text) {
13955
13942
std::string decoded_text;
13956
- auto unicode_sequences = codepoints_from_utf8 (text);
13957
- for (auto& unicode_sequence : unicode_sequences) {
13958
- decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8 (unicode_sequence));
13943
+ auto unicode_sequences = unicode_cpts_from_utf8 (text);
13944
+ for (auto & unicode_sequence : unicode_sequences) {
13945
+ decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8 (unicode_sequence));
13959
13946
}
13960
13947
13961
13948
return decoded_text;
0 commit comments