Merge pull request #1051 from bettio/optimized-utf8

bettio · bettio · commit 431cc4ad22f7 · 2025-02-15T20:12:49.000+01:00
Optimized UTF-8

These changes are made under both the "Apache 2.0" and the "GNU Lesser General
Public License 2.1 or later" license terms (dual license).

SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
diff --git a/LICENSES/MIT.txt b/LICENSES/MIT.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) [year] [fullname]
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/doc/src/apidocs/libatomvm/functions.rst b/doc/src/apidocs/libatomvm/functions.rst
@@ -38,7 +38,6 @@ Functions
 .. doxygenfunction:: bitstring_utf16_size
 .. doxygenfunction:: bitstring_utf32_decode
 .. doxygenfunction:: bitstring_utf32_encode
-.. doxygenfunction:: bitstring_utf8_decode
 .. doxygenfunction:: bitstring_utf8_encode
 .. doxygenfunction:: bitstring_utf8_size
 .. doxygenfunction:: context_avail_free_memory
diff --git a/src/libAtomVM/bif.c b/src/libAtomVM/bif.c
@@ -1638,19 +1638,10 @@ term binary_to_atom(Context *ctx, term a_binary, term encoding, bool create_new,
 
     AtomString atom;
     if (LIKELY(!encode_latin1_to_utf8)) {
-        size_t i = 0;
-        while (i < atom_string_len) {
-            uint32_t codepoint;
-            size_t codepoint_size;
-            if (UNLIKELY(bitstring_utf8_decode(
-                    (uint8_t *) atom_string + i, atom_string_len, &codepoint, &codepoint_size))
-                != UnicodeTransformDecodeSuccess) {
-                *error_reason = BADARG_ATOM;
-                return term_invalid_term();
-            }
-            i += codepoint_size;
+        if (UNLIKELY(!unicode_is_valid_utf8_buf((const uint8_t *) atom_string, atom_string_len))) {
+            *error_reason = BADARG_ATOM;
+            return term_invalid_term();
         }
-
         atom = malloc(atom_string_len + 1);
         if (IS_NULL_PTR(atom)) {
             *error_reason = OUT_OF_MEMORY_ATOM;
diff --git a/src/libAtomVM/bitstring.c b/src/libAtomVM/bitstring.c
@@ -141,69 +141,6 @@ bool bitstring_utf8_encode(uint32_t c, uint8_t *buf, size_t *out_size)
     return true;
 }
 
-enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
-{
-    if (len == 0) {
-        return UnicodeTransformDecodeFail;
-    } else if (len >= 4 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80) && ((buf[3] & 0xC0) == 0x80)) {
-        uint32_t v = 0;
-        v |= (buf[0] & 0x07) << 18;
-        v |= (buf[1] & 0x3F) << 12;
-        v |= (buf[2] & 0x3F) << 6;
-        v |= (buf[3] & 0x3F);
-        // overlong encoding or invalid codepoint
-        if (v <= 0x10000 || v > 0x10FFFF) {
-            return UnicodeTransformDecodeFail;
-        }
-        *c = v;
-        *out_size = 4;
-        return UnicodeTransformDecodeSuccess;
-    } else if (len >= 3 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
-        uint32_t v = 0;
-        v |= (buf[0] & 0x0F) << 12;
-        v |= (buf[1] & 0x3F) << 6;
-        v |= (buf[2] & 0x3F);
-        // overlong encoding or surrogate
-        if (v < 0x800 || (v >= 0xD800 && v <= 0xDFFF)) {
-            return UnicodeTransformDecodeFail;
-        }
-        *c = v;
-        *out_size = 3;
-        return UnicodeTransformDecodeSuccess;
-    } else if (len >= 2 && (buf[0] & 0xE0) == 0xC0 && ((buf[1] & 0xC0) == 0x80)) {
-        uint32_t v = 0;
-        v |= (buf[0] & 0x1F) << 6;
-        v |= (buf[1] & 0x3F);
-        // overlong encoding
-        if (v < 0x80) {
-            return UnicodeTransformDecodeFail;
-        }
-        *c = v;
-        *out_size = 2;
-        return UnicodeTransformDecodeSuccess;
-    } else if ((*buf & 0x80) == 0) {
-        uint32_t v = 0;
-        v |= (buf[0] & 0x7F);
-        *c = v;
-        *out_size = 1;
-        return UnicodeTransformDecodeSuccess;
-    } else if (len == 3 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
-        return UnicodeTransformDecodeIncomplete;
-    } else if (len == 2 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80)) {
-        return UnicodeTransformDecodeIncomplete;
-    } else if (len == 1 && (buf[0] & 0xF8) == 0xF0) {
-        return UnicodeTransformDecodeIncomplete;
-    } else if (len == 2 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80)) {
-        return UnicodeTransformDecodeIncomplete;
-    } else if (len == 1 && (buf[0] & 0xF0) == 0xE0) {
-        return UnicodeTransformDecodeIncomplete;
-    } else if (len == 1 && (buf[0] & 0xE0) == 0xC0) {
-        return UnicodeTransformDecodeIncomplete;
-    }
-
-    return UnicodeTransformDecodeFail;
-}
-
 // UTF-16 encoding, when U in U+010000 to U+10FFFF:
 //
 //  U' = yyyyyyyyyyxxxxxxxxxx  // U - 0x10000
diff --git a/src/libAtomVM/bitstring.h b/src/libAtomVM/bitstring.h
@@ -23,6 +23,7 @@
 #define _BITSTRING_H_
 
 #include "term.h"
+#include "unicode.h"
 
 #include <stdbool.h>
 #include <stdint.h>
@@ -99,13 +100,6 @@ enum BitstringFlags
 #endif
 };
 
-enum UnicodeTransformDecodeResult
-{
-    UnicodeTransformDecodeSuccess,
-    UnicodeTransformDecodeFail,
-    UnicodeTransformDecodeIncomplete
-};
-
 union maybe_unsigned_int8
 {
     uint8_t u;
@@ -320,20 +314,6 @@ static inline bool bitstring_insert_integer(term dst_bin, size_t offset, avm_int
  */
 bool bitstring_utf8_encode(uint32_t c, uint8_t *buf, size_t *out_size);
 
-/**
- * @brief Decode a character from UTF-8.
- *
- * @param buf the buffer from which to decode the string
- * @param len the length (in bytes) of the bytes in buf
- * @param c int value to decode to or NULL to only compute the size.
- * @param out_size the size in bytes, on output (if not NULL)
- * @return \c UnicodeTransformDecodeSuccess if decoding was successful,
- * \c UnicodeTransformDecodeFail if character starting at buf is not a valid
- * unicode character or \c UnicodeTransformDecodeIncomplete if character
- * starting at buf is a valid but incomplete transformation
- */
-enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);
-
 /**
  * @brief Encode a character to UTF-16.
  *
@@ -441,7 +421,7 @@ static inline bool bitstring_match_utf8(term src_bin, size_t offset, uint32_t *c
 {
     size_t byte_offset = offset >> 3; // divide by 8
     const uint8_t *src = (const uint8_t *) term_binary_data(src_bin) + byte_offset;
-    return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
+    return unicode_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
 }
 
 /**
diff --git a/src/libAtomVM/externalterm.c b/src/libAtomVM/externalterm.c
@@ -636,17 +636,8 @@ static term parse_external_terms(const uint8_t *external_term_buf, size_t *eterm
             uint8_t atom_len = *(external_term_buf + 1);
             const uint8_t *atom_chars = external_term_buf + 2;
 
-            size_t remaining_length = atom_len;
-            const uint8_t *curr_buf = atom_chars;
-            while (remaining_length) {
-                uint32_t out_c;
-                size_t codepoint_size;
-                enum UnicodeTransformDecodeResult result = bitstring_utf8_decode(curr_buf, remaining_length, &out_c, &codepoint_size);
-                if (UNLIKELY(result != UnicodeTransformDecodeSuccess)) {
-                    return term_invalid_term();
-                }
-                remaining_length -= codepoint_size;
-                curr_buf += codepoint_size;
+            if (UNLIKELY(!unicode_is_valid_utf8_buf((const uint8_t *) atom_chars, atom_len))) {
+                return term_invalid_term();
             }
 
             // AtomString first byte is the atom length
diff --git a/src/libAtomVM/interop.c b/src/libAtomVM/interop.c
@@ -385,7 +385,7 @@ static enum UnicodeConversionResult interop_binary_conversion(term t, uint8_t *o
     while (input_index < len) {
         size_t char_size;
         uint32_t c;
-        enum UnicodeTransformDecodeResult decode_result = bitstring_utf8_decode(input + input_index, len - input_index, &c, &char_size);
+        enum UnicodeTransformDecodeResult decode_result = unicode_utf8_decode(input + input_index, len - input_index, &c, &char_size);
         if (UNLIKELY(decode_result != UnicodeTransformDecodeSuccess)) {
             *rest_crsr = input_index;
             *output_len = result;
diff --git a/src/libAtomVM/nifs.c b/src/libAtomVM/nifs.c
@@ -2081,7 +2081,7 @@ static term nif_erlang_atom_to_binary(Context *ctx, int argc, term argv[])
         for (size_t i = 0; i < encoded_len; i++) {
             size_t codepoint_size;
             uint32_t codepoint;
-            if (UNLIKELY(bitstring_utf8_decode(
+            if (UNLIKELY(unicode_utf8_decode(
                              &utf8_tmp_buf[in_pos], 2, &codepoint, &codepoint_size)
                         != UnicodeTransformDecodeSuccess
                     || (codepoint > 255))) {
@@ -2122,7 +2122,7 @@ static term make_list_from_utf8_buf(const uint8_t *buf, size_t buf_len, Context
         for (size_t i = 0; i < u8len; i++) {
             size_t codepoint_size;
             enum UnicodeTransformDecodeResult result
-                = bitstring_utf8_decode(u_in, buf_len, &codepoints[i], &codepoint_size);
+                = unicode_utf8_decode(u_in, buf_len, &codepoints[i], &codepoint_size);
             if (UNLIKELY((result != UnicodeTransformDecodeSuccess)
                     || !unicode_is_valid_codepoint(codepoints[i]))) {
                 AVM_ABORT();
diff --git a/src/libAtomVM/unicode.c b/src/libAtomVM/unicode.c
@@ -15,14 +15,94 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *
- * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
+ * SPDX-License-Identifier: (Apache-2.0 OR LGPL-2.1-or-later) AND MIT
  */
 
 #include <stdbool.h>
 #include <stddef.h>
 
+#include "utils.h"
+
 #include "unicode.h"
 
+// clang-format off
+
+// Following utf8d table and decode function are covered by MIT license
+// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+
+static const uint8_t utf8d[] = {
+  // The first part of the table maps bytes to character classes in order
+  // to reduce the size of the transition table and create bitmasks.
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+  // The second part is a transition table that maps a combination
+  // of a state of the automaton and a character class to a state.
+   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+  12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) // feeds one byte to the DFA and returns the new *state
+{
+  uint32_t type = utf8d[byte]; // character class of the byte, from the first 256 table entries
+
+  *codep = (*state != UTF8_ACCEPT) ?
+    (byte & 0x3fu) | (*codep << 6) : // inside a sequence: shift in the low 6 bits of the byte
+    (0xff >> type) & (byte); // at UTF8_ACCEPT: restart *codep from the byte, masked by its class
+
+  *state = utf8d[256 + *state + type]; // the transition table begins at index 256
+  return *state;
+}
+
+// clang-format on
+
+enum UnicodeTransformDecodeResult unicode_utf8_decode(
+    const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
+{
+    uint32_t codepoint = 0;
+    uint32_t state = 0; // 0 is UTF8_ACCEPT: decoding starts at a character boundary
+    size_t i = 0;
+    while (i < len) {
+        state = decode(&state, &codepoint, buf[i]);
+        i++; // i now counts the bytes consumed so far
+
+        if (state == UTF8_ACCEPT) { // exactly one complete character has been read
+            *c = codepoint; // c and out_size are dereferenced unconditionally on this path
+            *out_size = i;
+            return UnicodeTransformDecodeSuccess;
+        } else if (UNLIKELY(state == UTF8_REJECT)) {
+            return UnicodeTransformDecodeFail; // *c and *out_size are left untouched
+        }
+    }
+
+    return UnicodeTransformDecodeIncomplete; // input ended mid-sequence; also the result when len == 0
+}
+
+bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len) // true iff the DFA is in UTF8_ACCEPT after all len bytes
+{
+    uint32_t codepoint = 0; // required by decode(); the decoded value itself is not used here
+    uint32_t state = 0; // 0 is UTF8_ACCEPT
+
+    for (size_t i = 0; i < len; i++) { // no early exit: every transition out of UTF8_REJECT stays in UTF8_REJECT
+        state = decode(&state, &codepoint, buf[i]);
+    }
+
+    return state == UTF8_ACCEPT; // true for len == 0; false when the buffer ends inside a sequence
+}
+
 size_t unicode_buf_utf8_len(const uint8_t *buf, size_t buf_len)
 {
     size_t count = 0;
diff --git a/src/libAtomVM/unicode.h b/src/libAtomVM/unicode.h
@@ -29,6 +29,13 @@
 extern "C" {
 #endif
 
+enum UnicodeTransformDecodeResult // outcome of decoding a single character, see unicode_utf8_decode
+{
+    UnicodeTransformDecodeSuccess, // a complete, valid character was decoded
+    UnicodeTransformDecodeFail, // the bytes at the start of the buffer are not a valid character
+    UnicodeTransformDecodeIncomplete // the buffer ended before the character was complete
+};
+
 size_t unicode_buf_utf8_len(const uint8_t *buf, size_t buf_len);
 bool unicode_buf_is_ascii(const uint8_t *buf, size_t buf_len);
 size_t unicode_latin1_buf_size_as_utf8(const uint8_t *buf, size_t len);
@@ -40,6 +47,23 @@ static inline bool unicode_is_valid_codepoint(uint32_t codepoint)
     return (codepoint < 0x110000) && !((codepoint > 0xD800) && (codepoint < 0xDFFF));
 }
 
+/**
+ * @brief Decode a character from UTF-8.
+ *
+ * @param buf the buffer from which to decode the string
+ * @param len the length (in bytes) of the bytes in buf
+ * @param c the decoded codepoint, set only on success (must not be NULL)
+ * @param out_size bytes consumed, set only on success (must not be NULL)
+ * @return \c UnicodeTransformDecodeSuccess if decoding was successful,
+ * \c UnicodeTransformDecodeFail if character starting at buf is not a valid
+ * unicode character or \c UnicodeTransformDecodeIncomplete if character
+ * starting at buf is a valid but incomplete transformation
+ */
+enum UnicodeTransformDecodeResult unicode_utf8_decode(
+    const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);
+
+bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len);
+
 #ifdef __cplusplus
 }
 #endif