Better seqrep word boundary handling.

KerfuffleV2 · KerfuffleV2 · commit 86646c4ab291 · 2023-08-14T10:03:39.000-06:00
diff --git a/llama.cpp b/llama.cpp
@@ -2691,36 +2691,33 @@ static size_t llama_seqrep_find_match(const llama_token * last_tokens_p, const s
     return matches;
 }
 
+// Internal helper macro for sequence matching, used to determine if a CP is a word boundary.
+// 0x2000 through 0x206f is the Unicode General Punctuation block. The CP is considered a word
+// boundary if it falls in that range but is _not_ RIGHT SINGLE QUOTATION MARK (0x2019), or if
+// it's in the low ASCII range (< 127), not alphanumeric and also not a single quote (39).
+#define LLAMA_SEQREP_IS_WBOUND(cp) ( (cp < 127 && cp != 39 && !std::isalnum((int(cp)))) || (cp != 0x2019 && cp >= 0x2000 && cp <= 0x206f) )
+
 // Internal helper function for sequence matching.
 // Bit 1 set indicates token is a word boundary. NL, " blah", "," - word boundary. "blah", "blah:" - not a word boundary.
 // Bit 2 set indicates token ends on a word boundary. NL, "blah:", "blah " - ends on word boundary. " blah", "blah" - doesn't end on word boundary.
-// Errata: UTF8 safe but only considers ASCII characters. ASCII single quote is treated as a non-boundary which isn't always correct.
+// Errata: Apostrophes (ASCII 0x27 and U+2019) are special-cased as non-boundaries, and only the U+2000 through U+206F range of Unicode punctuation is treated as a word boundary.
 static uint8_t llama_seqrep_check_word(struct llama_context * ctx, const llama_token token) {
     if (token == llama_token_bos() || token == llama_token_eos() || token == llama_token_nl()) {
         // BOS, EOS, NL are always a boundary.
         return 3;
     }
     const char * token_str = llama_token_to_str(ctx, token);
     assert(token_str != NULL);
-    if (token_str[0] == '\0') {
-        // 0-length token string, can't be a boundary.
-        return 0;
-    }
 
-    const char start_char = token_str[0];
-    char end_char;
-    for (const char *curr_char = token_str; ; curr_char++) {
-        // Guaranteed to iterate at least once since we already checked if the string was 0-length.
-        if (*(curr_char + 1) == '\0') {
-            end_char = *curr_char;
-            break;
-        }
+    const std::vector<uint32_t> token_cps = decode_utf8(token_str);
+    const size_t token_cps_len = token_cps.size();
+    if (token_cps_len < 2) {
+        // Token has no codepoints, so it can't be a boundary. The check is < 2 because decode_utf8 terminates its output with a 0 entry.
+        return 0;
     }
-    return uint8_t(
-        (start_char != '\'' && !isalnum((int)start_char) ? 1 : 0) +
-        (end_char != '\'' && !isalnum((int)end_char) ? 2 : 0)
-    );
 
+    const uint32_t start_cp = token_cps[0], end_cp = token_cps[token_cps_len - 2];
+    return uint8_t(LLAMA_SEQREP_IS_WBOUND(start_cp)) + uint8_t(LLAMA_SEQREP_IS_WBOUND(end_cp)) * 2;
 }
 
 void llama_sample_seqrep_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, size_t min_length, size_t tolerance, float flat_penalty, float length_penalty, float mid_word_scale) {