Improve UTF-8 overflow/overlong handling

khwilliamson · khwilliamson · commit 5718b263e61f · 2024-10-19T07:40:18.000-06:00
Perl's extended UTF-8 is capable of representing code points up to 2**72
(2**65 on EBCDIC).  These won't fit in a 64 bit word, and hence
overflow.  (And much more so on 32 bit machines.)  A start byte of \xFF
is required for code points starting with 2**36; \xFE for those starting
with 2**31, and so on.  But it turns out that a sequence beginning with
\xFE can express all code points 0..2**36-1, and \xFF sequences can
express everything 0..2**72-1.

When a sequence represents a code point that can be expressed by a
shorter sequence, it is called an overlong, and using those is expressly
forbidden by the Unicode standard due to spoofing attacks that have
occurred.  So, a \xFE start byte should only be used for code points
in the range 2**31..2**36-1; and \xFF only for 2**36..2**72-1.  But
Perl needs to handle the possibility where the input doesn't match the
expectations of what it should be.

We have tried to determine all the malformations that apply to a given
sequence and return them to the caller when requested.  The interplay
between overflow and overlong is somewhat tricky, and the new tests that
are to be added in the next commit showed that we haven't been doing it
completely right.

Prior to this commit, the checks for both overlong and overflow had
three states: yes, no, and maybe.  The last meaning that the sequence
being examined was shorter than a full character, and that some possible
completions of it would result in yes, and some would result in no.

This commit retains the tripartite state of examining a sequence for
being overlong, but adds a fourth state for overflow, namely that the
input overflows unless the sequence is overlong, and there aren't enough
bytes to determine the latter absolutely for sure.  But overlongs are
rare, so the chances of it being that are tiny, so this state means that
it almost certainly overflows.

Prior to this commit, I had tried to cope with some of this by an extra
parameter to the find-if-overflow function, but this fourth state
removes the need for that.  The caller learns which state the input is
in, and then chooses how to handle it, without needing the parameter.

The tests in utf8decode.t also had to be changed, as this new code picks
up some overflows on 32-bit machines that were previously not caught.
diff --git a/embed.fnc b/embed.fnc
@@ -5941,8 +5941,7 @@ RS	|UV	|check_locale_boundary_crossing 			\
 				|NN STRLEN *lenp
 RTi	|int	|does_utf8_overflow					\
 				|NN const U8 * const s			\
-				|NN const U8 *e 			\
-				|const bool consider_overlongs
+				|NN const U8 *e
 RTi	|int	|isFF_overlong	|NN const U8 * const s			\
 				|const STRLEN len
 Ri	|bool	|is_utf8_common |NN const U8 * const p			\
diff --git a/proto.h b/proto.h
diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t
@@ -189,9 +189,9 @@ __DATA__
 3.4	Concatenation of incomplete sequences
 3.4.1 N15 -	30	c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf	-	unexpected non-continuation byte 0xe0, immediately after start byte 0xc0
 3.5	Impossible bytes (but not with Perl's extended UTF-8)
-3.5.1 n -	1	fe	-	1 byte available, need 7
-3.5.2 n -	1	ff	-	1 byte available, need 13
-3.5.3 N7 -	4	fe:fe:ff:ff	-	byte 0xfe
+3.5.1 N2,1 -	1	fe	-	1 byte available, need 7
+3.5.2 N2,1 -	1	ff	-	1 byte available, need 13
+3.5.3 N11,7 -	4	fe:fe:ff:ff	-	byte 0xfe
 4	Overlong sequences
 4.1	Examples of an overlong ASCII character
 4.1.1 n -	2	c0:af	-	overlong
diff --git a/utf8.c b/utf8.c
@@ -597,31 +597,31 @@ S_isFF_overlong(const U8 * const s, const STRLEN len)
 #endif
 
 PERL_STATIC_INLINE int
-S_does_utf8_overflow(const U8 * const s,
-                     const U8 * e,
-                     const bool consider_overlongs)
+S_does_utf8_overflow(const U8 * const s, const U8 * e)
 {
     PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
 
     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
      * 'e' - 1 would overflow an IV on this platform; that is if it represents
-     * a code point larger than the highest representable code point.  It
-     * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
-     * enough information to tell.  This last return value can happen if the
-     * sequence is incomplete, missing some trailing bytes that would form a
-     * complete character.  If there are enough bytes to make a definitive
-     * decision, this function does so.
-     *
-     * If 'consider_overlongs' is TRUE, the function checks for the possibility
-     * that the sequence is an overlong that doesn't overflow.  Otherwise, it
-     * assumes the sequence is not an overlong.  This can give different
-     * results only on ASCII 32-bit platforms.
-     *
-     * (For ASCII platforms, we could use memcmp() because we don't have to
-     * convert each byte to I8, but it's very rare input indeed that would
-     * approach overflow, so the loop below will likely only get executed once.)
-     *
-     */
+     * a code point larger than the highest representable code point.  The
+     * possible returns are: */
+#define NO_OVERFLOW                 0   /* Definitely doesn't overflow */
+
+/* There aren't enough examinable bytes available to be sure.  This can happen
+ * if the sequence is incomplete, missing some trailing bytes that would form a
+ * complete character. */
+#define COULD_OVERFLOW              1
+
+/* This overflows if not also overlong, and like COULD_OVERFLOW, there aren't
+ * enough available bytes to be sure, but since overlongs are very rarely
+ * encountered, for most purposes consider it to overflow */
+#define ALMOST_CERTAINLY_OVERFLOWS  2
+
+#define OVERFLOWS                   3   /* Definitely overflows */
+
+    /* Note that the values are ordered so that you can use '>=' in checking
+     * the return value. */
+
     const STRLEN len = e - s;
     const U8 *x;
     const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF;
@@ -634,13 +634,13 @@ S_does_utf8_overflow(const U8 * const s,
          * bytes larger than those omitted bytes, and therefore 'x' can't
          * overflow */
         if (*y == '\0') {
-            return 0;
+            return NO_OVERFLOW;
         }
 
         /* If this byte is less than the corresponding highest non-overflowing
          * UTF-8, the sequence doesn't overflow */
         if (NATIVE_UTF8_TO_I8(*x) < *y) {
-            return 0;
+            return NO_OVERFLOW;
         }
 
         if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
@@ -651,30 +651,20 @@ S_does_utf8_overflow(const U8 * const s,
     /* Got to the end, and all bytes are the same.  If the input is a whole
      * character, it doesn't overflow.  And if it is a partial character,
      * there's not enough information to tell */
-    return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? 0 : -1;
+    return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? NO_OVERFLOW
+                                                       : COULD_OVERFLOW;
 
   overflows_if_not_overlong: ;
 
-    /* Here, a well-formed sequence overflows.  If we are assuming
-     * well-formedness, return that it overflows. */
-    if (! consider_overlongs) {
-        return 1;
-    }
-
-    /* Here, it could be the overlong malformation, and might not actually
-     * overflow if you were to calculate it out.
-     *
-     * See if it actually is overlong */
+    /* Here, the sequence overflows if not overlong.  Check for that */
     int is_overlong = is_utf8_overlong(s, len);
-
-    /* If it isn't overlong, is well-formed, so overflows */
-    if (is_overlong == 0) {
-        return 1;
+    if (LIKELY(is_overlong == 0)) {
+        return OVERFLOWS;
     }
 
     /* Not long enough to determine */
     if (is_overlong < 0) {
-        return -1;
+        return ALMOST_CERTAINLY_OVERFLOWS;
     }
 
     /* Here, it appears to overflow, but it is also overlong.  That overlong
@@ -705,7 +695,7 @@ S_does_utf8_overflow(const U8 * const s,
      * UTF_CONTINUATION_BYTE_INFO_BITS each.  If that number of bits doesn't
      * exceed the word size, it can't overflow. */
 
-    return 0;
+    return NO_OVERFLOW;
 
 #else
 
@@ -717,7 +707,7 @@ S_does_utf8_overflow(const U8 * const s,
      *
      * That means only the FF start byte can have an overflowing overlong. */
     if (*s < 0xFF) {
-        return 0;
+        return NO_OVERFLOW;
     }
 
     /* The sequence \xff\x80\x80\x80\x80\x80\x80\x82 is an overlong that
@@ -726,12 +716,14 @@ S_does_utf8_overflow(const U8 * const s,
 #  define OVERFLOWS_MIN_STRING  "\xff\x80\x80\x80\x80\x80\x80\x82"
 
     if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS_MIN_STRING)) {
-        return -1;  /* Not enough info to be sure */
+        return ALMOST_CERTAINLY_OVERFLOWS;  /* Not enough info to be sure */
     }
 
 #  define strnGE(s1,s2,l) (strncmp(s1,s2,l) >= 0)
 
-    return (strnGE((const char *) s, OVERFLOWS_MIN_STRING, STRLENs(OVERFLOWS_MIN_STRING)));
+    return (strnGE((const char *) s, OVERFLOWS_MIN_STRING, STRLENs(OVERFLOWS_MIN_STRING)))
+    ? OVERFLOWS
+    : NO_OVERFLOW;
 
 #endif
 
@@ -897,9 +889,7 @@ Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
         s++;
     }
 
-    if (0 < does_utf8_overflow(s0, e,
-                               FALSE /* Don't consider_overlongs */
-    )) {
+    if (does_utf8_overflow(s0, e) == OVERFLOWS) {
         return 0;
     }
 
@@ -1569,10 +1559,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
 
     /* Check for overflow.  The algorithm requires us to not look past the end
      * of the current character, even if partial, so the upper limit is 's' */
-    if (UNLIKELY(0 < does_utf8_overflow(s0, s,
-                                         1 /* Do consider overlongs */
-                                        )))
-    {
+    if (UNLIKELY(does_utf8_overflow(s0, s) >= ALMOST_CERTAINLY_OVERFLOWS)) {
         possible_problems |= UTF8_GOT_OVERFLOW;
         uv = UNICODE_REPLACEMENT;
     }
@@ -4126,9 +4113,7 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
         if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
             if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
                 if (   ckWARN_d(WARN_NON_UNICODE)
-                    || UNLIKELY(0 < does_utf8_overflow(s, s + len,
-                                               0 /* Don't consider overlongs */
-                                               )))
+                    || UNLIKELY(does_utf8_overflow(s, s + len) >= ALMOST_CERTAINLY_OVERFLOWS))
                 {
                     /* A side effect of this function will be to warn */
                     (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);