@@ -4165,14 +4165,24 @@ static size_t llama_seqrep_find_match(const llama_token * last_tokens_p, const s
 // Bit 1 set indicates token is a word boundary. NL, " blah", "," - word boundary. "blah", "blah:" - not a word boundary.
 // Bit 2 set indicates token ends on a word boundary. NL, "blah:", "blah " - ends on word boundary. " blah", "blah" - doesn't end on word boundary.
 // Errata: Special cases apostrophe and only has partial support for unicode punctuation as word boundaries.
-static uint8_t llama_seqrep_check_word(struct llama_context * ctx, const llama_token token) {
+static uint8_t llama_seqrep_check_word(struct llama_context * ctx, const llama_token token, std::vector<char> & buf) {
     if (token == llama_token_bos(ctx) || token == llama_token_eos(ctx) || token == llama_token_nl(ctx)) {
         // BOS, EOS, NL are always a boundary.
         return 3;
     }

-    const char * token_str = llama_token_get_text(ctx, token);
-    auto decoded = decode_utf8(token_str, llama_partial_utf8{ 0, 0 });
+    if (buf.size() < 128) {
+        buf.resize(128);
+    }
+    int n_tokens = llama_token_to_piece(ctx, token, buf.data(), buf.size() - 1);
+    if (n_tokens < 0) {
+        buf.resize(size_t(-n_tokens) + 128);
+        const int check = llama_token_to_piece(ctx, token, buf.data(), buf.size() - 1);
+        GGML_ASSERT(check == -n_tokens);
+        n_tokens = check;
+    }
+    buf[n_tokens] = 0;
+    auto decoded = decode_utf8(buf.data(), llama_partial_utf8{ 0, 0 });
     const std::vector<uint32_t> & token_cps = decoded.first;
     const size_t token_cps_len = token_cps.size();
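
The substantive change in this hunk is replacing llama_token_get_text() with llama_token_to_piece(), which detokenizes into a caller-supplied buffer and follows llama.cpp's convention of returning the negative of the required length when the buffer is too small. A minimal standalone sketch of that convention, assuming the llama_token_to_piece(ctx, token, buf, length) signature used here; the helper name token_to_piece_str is hypothetical and not part of the patch:

    #include <cassert>
    #include <string>
    #include <vector>
    #include "llama.h" // llama_context, llama_token, llama_token_to_piece

    static std::string token_to_piece_str(struct llama_context * ctx, llama_token token) {
        std::vector<char> buf(8, 0); // deliberately tiny so the retry path is exercised
        int n = llama_token_to_piece(ctx, token, buf.data(), int(buf.size()));
        if (n < 0) {
            // A negative result is the size the call needed; grow and retry once.
            buf.resize(size_t(-n));
            n = llama_token_to_piece(ctx, token, buf.data(), int(buf.size()));
        }
        assert(n >= 0); // the second call must fit
        return std::string(buf.data(), size_t(n));
    }

In the patch itself the scratch vector is allocated once per sampling call and threaded through every llama_seqrep_check_word() invocation, so the resize cost is paid at most a few times rather than once per candidate.
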
@@ -4259,8 +4269,9 @@ void llama_sample_seqrep_penalty(struct llama_context * ctx, llama_token_data_ar
         }
     }

+    std::vector<char> buf(128, 0);
     const bool ends_on_word = params->mid_word_scale == 1.0f
-        || (llama_seqrep_check_word(ctx, last_tokens_p[last_tokens_size - 1]) & 2) != 0;
+        || (llama_seqrep_check_word(ctx, last_tokens_p[last_tokens_size - 1], buf) & 2) != 0;

     for (size_t i = 0; i < candidates->size; ++i) {
         auto pt_iter = penalize_tokens.find(candidates->data[i].id);
@@ -4270,7 +4281,7 @@ void llama_sample_seqrep_penalty(struct llama_context * ctx, llama_token_data_ar

         const size_t count = pt_iter->second;
         const bool pt_starts_word = params->mid_word_scale == 1.0f ||
-            (llama_seqrep_check_word(ctx, candidates->data[i].id) & 1) != 0;
+            (llama_seqrep_check_word(ctx, candidates->data[i].id, buf) & 1) != 0;
         float penalty_scale = ends_on_word || pt_starts_word ? 1.0f : params->mid_word_scale;
         float logit = candidates->data[i].logit;
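
Taken together, the flag bits documented above drive the penalty scaling: the full penalty applies when the previous token ends on a word boundary (bit 2) or the candidate token starts on one (bit 1), and mid_word_scale damps it otherwise. A hedged sketch of just that interpretation, with the helper name seqrep_word_scale being illustrative rather than part of the patch:

    #include <cstdint>

    // Bit 1 (0x1): token begins at a word boundary; bit 2 (0x2): token ends at one.
    // BOS, EOS and NL return 3, i.e. both bits set.
    static float seqrep_word_scale(uint8_t last_flags, uint8_t cand_flags, float mid_word_scale) {
        const bool ends_on_word   = (last_flags & 2) != 0; // previous token ended a word
        const bool pt_starts_word = (cand_flags & 1) != 0; // candidate starts a new word
        // Repeats that cross a word boundary get the full penalty; mid-word repeats are damped.
        return (ends_on_word || pt_starts_word) ? 1.0f : mid_word_scale;
    }

Setting mid_word_scale to 1.0f short-circuits both checks in the patch, so the llama_seqrep_check_word() calls are skipped entirely when no mid-word damping is requested.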