Skip to content

Commit 2f3c4e9

Browse files
committed
Add support for encoding of large atoms
Signed-off-by: Paul Guyot <pguyot@kallisys.net>
1 parent 529ddf2 commit 2f3c4e9

File tree

2 files changed

+64
-17
lines changed

2 files changed

+64
-17
lines changed

src/libAtomVM/externalterm.c

+44-17
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#define SMALL_BIG_EXT 110
4646
#define EXPORT_EXT 113
4747
#define MAP_EXT 116
48+
#define ATOM_UTF8_EXT 118
4849
#define SMALL_ATOM_UTF8_EXT 119
4950
#define INVALID_TERM_SIZE -1
5051

@@ -275,16 +276,25 @@ static int serialize_term(uint8_t *buf, term t, GlobalContext *glb)
275276
return NEW_FLOAT_EXT_SIZE;
276277

277278
} else if (term_is_atom(t)) {
278-
int atom_index = term_to_atom_index(t);
279+
AtomIndex atom_index = term_to_atom_index(t);
279280
size_t atom_len;
280-
atom_ref_t atom_ref = atom_table_get_atom_ptr_and_len(glb->atom_table, atom_index, &atom_len);
281-
if (!IS_NULL_PTR(buf)) {
282-
buf[0] = SMALL_ATOM_UTF8_EXT;
283-
buf[1] = atom_len;
284-
atom_table_write_bytes(glb->atom_table, atom_ref, atom_len, buf + 2);
281+
const uint8_t *atom_data = atom_table_get_atom_string(glb->atom_table, atom_index, &atom_len);
282+
if (UNLIKELY(atom_len >= 256)) {
283+
if (!IS_NULL_PTR(buf)) {
284+
buf[0] = ATOM_UTF8_EXT;
285+
buf[1] = atom_len >> 8;
286+
buf[2] = atom_len & 0xFF;
287+
memcpy(buf + 3, atom_data, atom_len);
288+
}
289+
return ATOM_EXT_BASE_SIZE + atom_len;
290+
} else {
291+
if (!IS_NULL_PTR(buf)) {
292+
buf[0] = SMALL_ATOM_UTF8_EXT;
293+
buf[1] = atom_len;
294+
memcpy(buf + 2, atom_data, atom_len);
295+
}
296+
return SMALL_ATOM_EXT_BASE_SIZE + atom_len;
285297
}
286-
return 2 + atom_len;
287-
288298
} else if (term_is_tuple(t)) {
289299
size_t arity = term_get_tuple_arity(t);
290300
size_t k;
@@ -459,34 +469,32 @@ static term parse_external_terms(const uint8_t *external_term_buf, size_t *eterm
459469
const uint8_t *atom_chars = (const uint8_t *) (external_term_buf + 3);
460470
int global_atom_id;
461471
if (LIKELY(unicode_buf_is_ascii(atom_chars, atom_len))) {
462-
// there is a trick here: we are reusing LSB of len field as atom length
463472
global_atom_id = globalcontext_insert_atom_maybe_copy(
464-
glb, (AtomString) (external_term_buf + 2), copy);
473+
glb, atom_chars, atom_len, copy);
465474
} else {
466475
// need to re-encode latin1 to UTF-8
467476
size_t required_buf_size = unicode_latin1_buf_size_as_utf8(atom_chars, atom_len);
468477
if (UNLIKELY(required_buf_size > 255)) {
469478
return term_invalid_term();
470479
}
471-
uint8_t *atom_buf = malloc(1 + required_buf_size);
472-
atom_buf[0] = required_buf_size;
473-
uint8_t *curr_codepoint = &atom_buf[1];
480+
uint8_t *atom_buf = malloc(required_buf_size);
481+
uint8_t *curr_codepoint = atom_buf;
474482
for (int i = 0; i < atom_len; i++) {
475483
size_t codepoint_size;
476484
// latin1 encoding is always successful
477485
bitstring_utf8_encode(atom_chars[i], curr_codepoint, &codepoint_size);
478486
curr_codepoint += codepoint_size;
479487
}
480488
global_atom_id
481-
= globalcontext_insert_atom_maybe_copy(glb, (AtomString) atom_buf, true);
489+
= globalcontext_insert_atom_maybe_copy(glb, atom_buf, required_buf_size, true);
482490
free(atom_buf);
483491
}
484492

485493
if (UNLIKELY(global_atom_id) < 0) {
486494
return term_invalid_term();
487495
}
488496

489-
*eterm_size = 3 + atom_len;
497+
*eterm_size = ATOM_EXT_BASE_SIZE + atom_len;
490498
return term_from_atom_index(global_atom_id);
491499
}
492500

@@ -632,6 +640,24 @@ static term parse_external_terms(const uint8_t *external_term_buf, size_t *eterm
632640
return map;
633641
}
634642

643+
case ATOM_UTF8_EXT: {
644+
uint16_t atom_len = READ_16_UNALIGNED(external_term_buf + 1);
645+
const uint8_t *atom_chars = external_term_buf + 3;
646+
647+
if (UNLIKELY(!unicode_is_valid_utf8_buf((const uint8_t *) atom_chars, atom_len))) {
648+
return term_invalid_term();
649+
}
650+
651+
// atom bytes and their length are passed separately (no length-prefixed AtomString)
652+
int global_atom_id = globalcontext_insert_atom_maybe_copy(glb, atom_chars, atom_len, copy);
653+
if (UNLIKELY(global_atom_id < 0)) {
654+
return term_invalid_term();
655+
}
656+
657+
*eterm_size = ATOM_EXT_BASE_SIZE + atom_len;
658+
return term_from_atom_index(global_atom_id);
659+
}
660+
635661
case SMALL_ATOM_UTF8_EXT: {
636662
uint8_t atom_len = *(external_term_buf + 1);
637663
const uint8_t *atom_chars = external_term_buf + 2;
@@ -641,12 +667,12 @@ static term parse_external_terms(const uint8_t *external_term_buf, size_t *eterm
641667
}
642668

643669
// atom bytes and their length are passed separately (no length-prefixed AtomString)
644-
int global_atom_id = globalcontext_insert_atom_maybe_copy(glb, (AtomString) (external_term_buf + 1), copy);
670+
int global_atom_id = globalcontext_insert_atom_maybe_copy(glb, atom_chars, atom_len, copy);
645671
if (UNLIKELY(global_atom_id < 0)) {
646672
return term_invalid_term();
647673
}
648674

649-
*eterm_size = 2 + atom_len;
675+
*eterm_size = SMALL_ATOM_EXT_BASE_SIZE + atom_len;
650676
return term_from_atom_index(global_atom_id);
651677
}
652678

@@ -709,6 +735,7 @@ static int calculate_heap_usage(const uint8_t *external_term_buf, size_t remaini
709735
return term_boxed_integer_size(value);
710736
}
711737

738+
case ATOM_UTF8_EXT:
712739
case ATOM_EXT: {
713740
if (UNLIKELY(remaining < ATOM_EXT_BASE_SIZE)) {
714741
return INVALID_TERM_SIZE;

tests/erlang_tests/test_binary_to_term.erl

+20
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@ start() ->
172172
ok = test_mutate_encodings(),
173173
ok = test_atom_decoding(),
174174
ok = test_atom_decoding_checks(),
175+
ok = test_atom_encoding(),
175176
0.
176177

177178
test_reverse(T, Interop) ->
@@ -423,6 +424,13 @@ test_atom_decoding_checks() ->
423424
ok = expect_badarg(make_binterm_fun(invalid_utf8_seq_3)),
424425
ok.
425426

427+
% Runs the encoding check for every atom fixture, in order.
% Each check must return `true`; the first one that does not aborts the test
% with a badmatch. Returns `ok` once all fixtures have passed.
test_atom_encoding() ->
    FixtureIds = [
        latin1_as_utf8_1,
        latin1_as_utf8_2,
        latin1_as_utf8_3,
        long_with_two_bytes_length
    ],
    % A comprehension is used (rather than a library call) so that the test
    % only depends on functions defined in this module.
    [true = compare_pair_encoding(FixtureId) || FixtureId <- FixtureIds],
    ok.
433+
426434
make_binterm_fun(Id) ->
427435
fun() ->
428436
Bin = ?MODULE:get_binary(Id),
@@ -434,6 +442,18 @@ compare_pair(Id) ->
434442
B = ?MODULE:get_binary(Id),
435443
A == erlang:binary_to_term(B).
436444

445+
% Checks the encoding of the atom fixture identified by `Id`.
%
% Two properties are verified:
%   1. the atom survives a term_to_binary/binary_to_term round trip;
%   2. on AtomVM and on OTP 26 or later, the encoded bytes are identical to
%      the reference binary returned by get_binary/1.
%
% Returns `true` when every applicable check holds, `false` otherwise, so that
% callers can assert with `true = compare_pair_encoding(Id)`.
compare_pair_encoding(Id) ->
    A = ?MODULE:get_atom(Id),
    B = ?MODULE:get_binary(Id),
    Encoded = erlang:term_to_binary(A),
    % Keep the round-trip result: it used to be computed and then discarded,
    % which meant a broken round trip could never fail the test.
    RoundTripOk = A == erlang:binary_to_term(Encoded),
    OTPVersion = get_otp_version(),
    BytesOk =
        if
            OTPVersion =:= atomvm orelse OTPVersion >= 26 ->
                B == Encoded;
            true ->
                % Byte-level comparison is skipped on older OTP releases
                % (presumably because they emit a different atom encoding --
                % TODO confirm). Return `true`, not `ok`, so that callers
                % matching on `true` do not crash with a badmatch.
                true
        end,
    RoundTripOk andalso BytesOk.
456+
437457
% We don't have access to erlang module in tests.
438458
apply(F, []) ->
439459
F();

0 commit comments

Comments
 (0)