Add support for encoding of large atoms

pguyot · pguyot · commit 2f3c4e9056a1 · 2025-03-18T07:37:09.000+01:00
Signed-off-by: Paul Guyot <pguyot@kallisys.net>
diff --git a/src/libAtomVM/externalterm.c b/src/libAtomVM/externalterm.c
@@ -45,6 +45,7 @@
 #define SMALL_BIG_EXT 110
 #define EXPORT_EXT 113
 #define MAP_EXT 116
+#define ATOM_UTF8_EXT 118
 #define SMALL_ATOM_UTF8_EXT 119
 #define INVALID_TERM_SIZE -1
 
@@ -275,16 +276,25 @@ static int serialize_term(uint8_t *buf, term t, GlobalContext *glb)
         return NEW_FLOAT_EXT_SIZE;
 
     } else if (term_is_atom(t)) {
-        int atom_index = term_to_atom_index(t);
+        AtomIndex atom_index = term_to_atom_index(t);
         size_t atom_len;
-        atom_ref_t atom_ref = atom_table_get_atom_ptr_and_len(glb->atom_table, atom_index, &atom_len);
-        if (!IS_NULL_PTR(buf)) {
-            buf[0] = SMALL_ATOM_UTF8_EXT;
-            buf[1] = atom_len;
-            atom_table_write_bytes(glb->atom_table, atom_ref, atom_len, buf + 2);
+        const uint8_t *atom_data = atom_table_get_atom_string(glb->atom_table, atom_index, &atom_len);
+        if (UNLIKELY(atom_len >= 256)) {
+            if (!IS_NULL_PTR(buf)) {
+                buf[0] = ATOM_UTF8_EXT;
+                buf[1] = atom_len >> 8;
+                buf[2] = atom_len & 0xFF;
+                memcpy(buf + 3, atom_data, atom_len);
+            }
+            return ATOM_EXT_BASE_SIZE + atom_len;
+        } else {
+            if (!IS_NULL_PTR(buf)) {
+                buf[0] = SMALL_ATOM_UTF8_EXT;
+                buf[1] = atom_len;
+                memcpy(buf + 2, atom_data, atom_len);
+            }
+            return SMALL_ATOM_EXT_BASE_SIZE + atom_len;
         }
-        return 2 + atom_len;
-
     } else if (term_is_tuple(t)) {
         size_t arity = term_get_tuple_arity(t);
         size_t k;
@@ -459,34 +469,32 @@ static term parse_external_terms(const uint8_t *external_term_buf, size_t *eterm
             const uint8_t *atom_chars = (const uint8_t *) (external_term_buf + 3);
             int global_atom_id;
             if (LIKELY(unicode_buf_is_ascii(atom_chars, atom_len))) {
-                // there is a trick here: we are reusing LSB of len field as atom length
                 global_atom_id = globalcontext_insert_atom_maybe_copy(
-                    glb, (AtomString) (external_term_buf + 2), copy);
+                    glb, atom_chars, atom_len, copy);
             } else {
                 // need to re-encode latin1 to UTF-8
                 size_t required_buf_size = unicode_latin1_buf_size_as_utf8(atom_chars, atom_len);
                 if (UNLIKELY(required_buf_size > 255)) {
                     return term_invalid_term();
                 }
-                uint8_t *atom_buf = malloc(1 + required_buf_size);
-                atom_buf[0] = required_buf_size;
-                uint8_t *curr_codepoint = &atom_buf[1];
+                uint8_t *atom_buf = malloc(required_buf_size);
+                uint8_t *curr_codepoint = atom_buf;
                 for (int i = 0; i < atom_len; i++) {
                     size_t codepoint_size;
                     // latin1 encoding is always successful
                     bitstring_utf8_encode(atom_chars[i], curr_codepoint, &codepoint_size);
                     curr_codepoint += codepoint_size;
                 }
                 global_atom_id
-                    = globalcontext_insert_atom_maybe_copy(glb, (AtomString) atom_buf, true);
+                    = globalcontext_insert_atom_maybe_copy(glb, atom_buf, required_buf_size, true);
                 free(atom_buf);
             }
 
             if (UNLIKELY(global_atom_id) < 0) {
                 return term_invalid_term();
             }
 
-            *eterm_size = 3 + atom_len;
+            *eterm_size = ATOM_EXT_BASE_SIZE + atom_len;
             return term_from_atom_index(global_atom_id);
         }
 
@@ -632,6 +640,24 @@ static term parse_external_terms(const uint8_t *external_term_buf, size_t *eterm
             return map;
         }
 
+        case ATOM_UTF8_EXT: { // atom whose length is stored in a 16-bit field
+            uint16_t atom_len = READ_16_UNALIGNED(external_term_buf + 1); // length field follows the tag byte
+            const uint8_t *atom_chars = external_term_buf + 3; // atom bytes start after tag + length
+
+            if (UNLIKELY(!unicode_is_valid_utf8_buf((const uint8_t *) atom_chars, atom_len))) {
+                return term_invalid_term();
+            }
+
+            // Bytes and length are passed separately; no length-prefixed AtomString is built here
+            int global_atom_id = globalcontext_insert_atom_maybe_copy(glb, atom_chars, atom_len, copy);
+            if (UNLIKELY(global_atom_id < 0)) {
+                return term_invalid_term();
+            }
+
+            *eterm_size = ATOM_EXT_BASE_SIZE + atom_len;
+            return term_from_atom_index(global_atom_id);
+        }
+
         case SMALL_ATOM_UTF8_EXT: {
             uint8_t atom_len = *(external_term_buf + 1);
             const uint8_t *atom_chars = external_term_buf + 2;
@@ -641,12 +667,12 @@ static term parse_external_terms(const uint8_t *external_term_buf, size_t *eterm
             }
 
             // AtomString first byte is the atom length
-            int global_atom_id = globalcontext_insert_atom_maybe_copy(glb, (AtomString) (external_term_buf + 1), copy);
+            int global_atom_id = globalcontext_insert_atom_maybe_copy(glb, atom_chars, atom_len, copy);
             if (UNLIKELY(global_atom_id < 0)) {
                 return term_invalid_term();
             }
 
-            *eterm_size = 2 + atom_len;
+            *eterm_size = SMALL_ATOM_EXT_BASE_SIZE + atom_len;
             return term_from_atom_index(global_atom_id);
         }
 
@@ -709,6 +735,7 @@ static int calculate_heap_usage(const uint8_t *external_term_buf, size_t remaini
             return term_boxed_integer_size(value);
         }
 
+        case ATOM_UTF8_EXT:
         case ATOM_EXT: {
             if (UNLIKELY(remaining < ATOM_EXT_BASE_SIZE)) {
                 return INVALID_TERM_SIZE;
diff --git a/tests/erlang_tests/test_binary_to_term.erl b/tests/erlang_tests/test_binary_to_term.erl
@@ -172,6 +172,7 @@ start() ->
     ok = test_mutate_encodings(),
     ok = test_atom_decoding(),
     ok = test_atom_decoding_checks(),
+    ok = test_atom_encoding(),
     0.
 
 test_reverse(T, Interop) ->
@@ -423,6 +424,13 @@ test_atom_decoding_checks() ->
     ok = expect_badarg(make_binterm_fun(invalid_utf8_seq_3)),
     ok.
 
+% Every fixture below must make compare_pair_encoding/1 return true (badmatch otherwise).
+test_atom_encoding() ->
+    Fixtures = [latin1_as_utf8_1, latin1_as_utf8_2, latin1_as_utf8_3, long_with_two_bytes_length],
+    % Fixtures are checked in list order; the first one that fails aborts the test.
+    [true = compare_pair_encoding(Id) || Id <- Fixtures],
+    ok.
+
 make_binterm_fun(Id) ->
     fun() ->
         Bin = ?MODULE:get_binary(Id),
@@ -434,6 +442,18 @@ compare_pair(Id) ->
     B = ?MODULE:get_binary(Id),
     A == erlang:binary_to_term(B).
 
+% Round-trips atom Id through term_to_binary/binary_to_term and checks its reference encoding.
+compare_pair_encoding(Id) ->
+    A = ?MODULE:get_atom(Id),
+    B = ?MODULE:get_binary(Id),
+    % Assert the round trip; a mismatch fails with badmatch instead of being ignored.
+    A = erlang:binary_to_term(erlang:term_to_binary(A)),
+    OTPVersion = get_otp_version(),
+    % The byte-exact comparison only applies on AtomVM and OTP >= 26.
+    ExactEncoding = OTPVersion =:= atomvm orelse OTPVersion >= 26,
+    % Callers match on `true`, so skipping the comparison must also yield `true`.
+    not ExactEncoding orelse B =:= erlang:term_to_binary(A).
+
 % We don't have access to erlang module in tests.
 apply(F, []) ->
     F();