45
45
#define SMALL_BIG_EXT 110
46
46
#define EXPORT_EXT 113
47
47
#define MAP_EXT 116
48
+ #define ATOM_UTF8_EXT 118
48
49
#define SMALL_ATOM_UTF8_EXT 119
49
50
#define INVALID_TERM_SIZE -1
50
51
@@ -275,16 +276,25 @@ static int serialize_term(uint8_t *buf, term t, GlobalContext *glb)
275
276
return NEW_FLOAT_EXT_SIZE ;
276
277
277
278
} else if (term_is_atom (t )) {
278
- int atom_index = term_to_atom_index (t );
279
+ AtomIndex atom_index = term_to_atom_index (t );
279
280
size_t atom_len ;
280
- atom_ref_t atom_ref = atom_table_get_atom_ptr_and_len (glb -> atom_table , atom_index , & atom_len );
281
- if (!IS_NULL_PTR (buf )) {
282
- buf [0 ] = SMALL_ATOM_UTF8_EXT ;
283
- buf [1 ] = atom_len ;
284
- atom_table_write_bytes (glb -> atom_table , atom_ref , atom_len , buf + 2 );
281
+ const uint8_t * atom_data = atom_table_get_atom_string (glb -> atom_table , atom_index , & atom_len );
282
+ if (UNLIKELY (atom_len >= 256 )) {
283
+ if (!IS_NULL_PTR (buf )) {
284
+ buf [0 ] = ATOM_UTF8_EXT ;
285
+ buf [1 ] = atom_len >> 8 ;
286
+ buf [2 ] = atom_len & 0xFF ;
287
+ memcpy (buf + 3 , atom_data , atom_len );
288
+ }
289
+ return ATOM_EXT_BASE_SIZE + atom_len ;
290
+ } else {
291
+ if (!IS_NULL_PTR (buf )) {
292
+ buf [0 ] = SMALL_ATOM_UTF8_EXT ;
293
+ buf [1 ] = atom_len ;
294
+ memcpy (buf + 2 , atom_data , atom_len );
295
+ }
296
+ return SMALL_ATOM_EXT_BASE_SIZE + atom_len ;
285
297
}
286
- return 2 + atom_len ;
287
-
288
298
} else if (term_is_tuple (t )) {
289
299
size_t arity = term_get_tuple_arity (t );
290
300
size_t k ;
@@ -459,34 +469,32 @@ static term parse_external_terms(const uint8_t *external_term_buf, size_t *eterm
459
469
const uint8_t * atom_chars = (const uint8_t * ) (external_term_buf + 3 );
460
470
int global_atom_id ;
461
471
if (LIKELY (unicode_buf_is_ascii (atom_chars , atom_len ))) {
462
- // there is a trick here: we are reusing LSB of len field as atom length
463
472
global_atom_id = globalcontext_insert_atom_maybe_copy (
464
- glb , ( AtomString ) ( external_term_buf + 2 ) , copy );
473
+ glb , atom_chars , atom_len , copy );
465
474
} else {
466
475
// need to re-encode latin1 to UTF-8
467
476
size_t required_buf_size = unicode_latin1_buf_size_as_utf8 (atom_chars , atom_len );
468
477
if (UNLIKELY (required_buf_size > 255 )) {
469
478
return term_invalid_term ();
470
479
}
471
- uint8_t * atom_buf = malloc (1 + required_buf_size );
472
- atom_buf [0 ] = required_buf_size ;
473
- uint8_t * curr_codepoint = & atom_buf [1 ];
480
+ uint8_t * atom_buf = malloc (required_buf_size );
481
+ uint8_t * curr_codepoint = atom_buf ;
474
482
for (int i = 0 ; i < atom_len ; i ++ ) {
475
483
size_t codepoint_size ;
476
484
// latin1 encoding is always successful
477
485
bitstring_utf8_encode (atom_chars [i ], curr_codepoint , & codepoint_size );
478
486
curr_codepoint += codepoint_size ;
479
487
}
480
488
global_atom_id
481
- = globalcontext_insert_atom_maybe_copy (glb , ( AtomString ) atom_buf , true);
489
+ = globalcontext_insert_atom_maybe_copy (glb , atom_buf , required_buf_size , true);
482
490
free (atom_buf );
483
491
}
484
492
485
493
if (UNLIKELY (global_atom_id ) < 0 ) {
486
494
return term_invalid_term ();
487
495
}
488
496
489
- * eterm_size = 3 + atom_len ;
497
+ * eterm_size = ATOM_EXT_BASE_SIZE + atom_len ;
490
498
return term_from_atom_index (global_atom_id );
491
499
}
492
500
@@ -632,6 +640,24 @@ static term parse_external_terms(const uint8_t *external_term_buf, size_t *eterm
632
640
return map ;
633
641
}
634
642
643
+ case ATOM_UTF8_EXT : {
644
+ uint16_t atom_len = READ_16_UNALIGNED (external_term_buf + 1 );
645
+ const uint8_t * atom_chars = external_term_buf + 3 ;
646
+
647
+ if (UNLIKELY (!unicode_is_valid_utf8_buf ((const uint8_t * ) atom_chars , atom_len ))) {
648
+ return term_invalid_term ();
649
+ }
650
+
651
+ // pass the atom bytes and their length explicitly (not a length-prefixed AtomString)
652
+ int global_atom_id = globalcontext_insert_atom_maybe_copy (glb , atom_chars , atom_len , copy );
653
+ if (UNLIKELY (global_atom_id < 0 )) {
654
+ return term_invalid_term ();
655
+ }
656
+
657
+ * eterm_size = ATOM_EXT_BASE_SIZE + atom_len ;
658
+ return term_from_atom_index (global_atom_id );
659
+ }
660
+
635
661
case SMALL_ATOM_UTF8_EXT : {
636
662
uint8_t atom_len = * (external_term_buf + 1 );
637
663
const uint8_t * atom_chars = external_term_buf + 2 ;
@@ -641,12 +667,12 @@ static term parse_external_terms(const uint8_t *external_term_buf, size_t *eterm
641
667
}
642
668
643
669
// pass the atom bytes and their length explicitly (not a length-prefixed AtomString)
644
- int global_atom_id = globalcontext_insert_atom_maybe_copy (glb , ( AtomString ) ( external_term_buf + 1 ) , copy );
670
+ int global_atom_id = globalcontext_insert_atom_maybe_copy (glb , atom_chars , atom_len , copy );
645
671
if (UNLIKELY (global_atom_id < 0 )) {
646
672
return term_invalid_term ();
647
673
}
648
674
649
- * eterm_size = 2 + atom_len ;
675
+ * eterm_size = SMALL_ATOM_EXT_BASE_SIZE + atom_len ;
650
676
return term_from_atom_index (global_atom_id );
651
677
}
652
678
@@ -709,6 +735,7 @@ static int calculate_heap_usage(const uint8_t *external_term_buf, size_t remaini
709
735
return term_boxed_integer_size (value );
710
736
}
711
737
738
+ case ATOM_UTF8_EXT :
712
739
case ATOM_EXT : {
713
740
if (UNLIKELY (remaining < ATOM_EXT_BASE_SIZE )) {
714
741
return INVALID_TERM_SIZE ;
0 commit comments