utf8n_to_uvchr_msgs: More shortcut of UTF-8 invariants

khwilliamson · khwilliamson · commit c11db1049124 · 2024-10-02T09:33:47.000-06:00
This avoids the DFA table altogether for UTF-8 invariants (ASCII-range
characters) by adding an explicit test for them and returning early.
For these characters, this removes the table lookup, the calculation of
the return length value, and a potential jump.  But the extra
conditional is wasted for non-ASCII-range input.  I consider this
trade-off to be a wash, but it enables future simplifications.
diff --git a/inline.h b/inline.h
@@ -3024,14 +3024,19 @@ Perl_utf8n_to_uvchr_msgs(const U8 *s,
                                                         flags, errors, msgs);
 #endif
 
-        type = PL_strict_utf8_dfa_tab[*s];
+        /* UTF-8 invariants are returned unchanged.  The code below is quite
+         * capable of handling this, but this shortcuts this very common case
+         * */
+        if (UTF8_IS_INVARIANT(*s)) {
+            if (retlen) {
+                *retlen = 1;
+            }
 
-    /* The table is structured so that 'type' is 0 iff the input byte is
-     * represented identically regardless of the UTF-8ness of the string */
-        if (type == 0) {   /* UTF-8 invariants are returned unchanged */
-            uv = *s;
+            return *s;
         }
-    else {
+
+        type = PL_strict_utf8_dfa_tab[*s];
+
         UV state = PL_strict_utf8_dfa_tab[256 + type];
         uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
 
@@ -3049,8 +3054,6 @@ Perl_utf8n_to_uvchr_msgs(const U8 *s,
     /* Here is potentially problematic.  Use the full mechanism */
     return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
                                        errors, msgs);
-    }
-
   success:
             if (retlen) {
                 *retlen = s - s0 + 1;