Skip to content

Commit 1813ad5

Browse files
committed
Remove duplicated code for UTF-8 validation
The code for validating UTF-8 strings was duplicated in several places, and each copy even performed a full decode of every codepoint. Use the new `unicode_is_valid_utf8_buf` function instead, which is also based on highly optimized code. Signed-off-by: Davide Bettio <davide@uninstall.it>
1 parent a9ab729 commit 1813ad5

File tree

4 files changed

+21
-25
lines changed

4 files changed

+21
-25
lines changed

src/libAtomVM/bif.c

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1638,19 +1638,10 @@ term binary_to_atom(Context *ctx, term a_binary, term encoding, bool create_new,
16381638

16391639
AtomString atom;
16401640
if (LIKELY(!encode_latin1_to_utf8)) {
1641-
size_t i = 0;
1642-
while (i < atom_string_len) {
1643-
uint32_t codepoint;
1644-
size_t codepoint_size;
1645-
if (UNLIKELY(bitstring_utf8_decode(
1646-
(uint8_t *) atom_string + i, atom_string_len, &codepoint, &codepoint_size))
1647-
!= UnicodeTransformDecodeSuccess) {
1648-
*error_reason = BADARG_ATOM;
1649-
return term_invalid_term();
1650-
}
1651-
i += codepoint_size;
1641+
if (UNLIKELY(!unicode_is_valid_utf8_buf((const uint8_t *) atom_string, atom_string_len))) {
1642+
*error_reason = BADARG_ATOM;
1643+
return term_invalid_term();
16521644
}
1653-
16541645
atom = malloc(atom_string_len + 1);
16551646
if (IS_NULL_PTR(atom)) {
16561647
*error_reason = OUT_OF_MEMORY_ATOM;

src/libAtomVM/externalterm.c

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -636,17 +636,8 @@ static term parse_external_terms(const uint8_t *external_term_buf, size_t *eterm
636636
uint8_t atom_len = *(external_term_buf + 1);
637637
const uint8_t *atom_chars = external_term_buf + 2;
638638

639-
size_t remaining_length = atom_len;
640-
const uint8_t *curr_buf = atom_chars;
641-
while (remaining_length) {
642-
uint32_t out_c;
643-
size_t codepoint_size;
644-
enum UnicodeTransformDecodeResult result = bitstring_utf8_decode(curr_buf, remaining_length, &out_c, &codepoint_size);
645-
if (UNLIKELY(result != UnicodeTransformDecodeSuccess)) {
646-
return term_invalid_term();
647-
}
648-
remaining_length -= codepoint_size;
649-
curr_buf += codepoint_size;
639+
if (UNLIKELY(!unicode_is_valid_utf8_buf((const uint8_t *) atom_chars, atom_len))) {
640+
return term_invalid_term();
650641
}
651642

652643
// AtomString first byte is the atom length

src/libAtomVM/unicode.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,8 @@ static const uint8_t utf8d[] = {
5353
12,36,12,12,12,12,12,12,12,12,12,12,
5454
};
5555

56-
uint32_t inline
57-
decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
56+
static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte)
57+
{
5858
uint32_t type = utf8d[byte];
5959

6060
*codep = (*state != UTF8_ACCEPT) ?
@@ -67,6 +67,18 @@ decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
6767

6868
// clang-format on
6969

70+
bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len)
71+
{
72+
uint32_t codepoint = 0;
73+
uint32_t state = 0;
74+
75+
for (size_t i = 0; i < len; i++) {
76+
state = decode(&state, &codepoint, buf[i]);
77+
}
78+
79+
return state == UTF8_ACCEPT;
80+
}
81+
7082
size_t unicode_buf_utf8_len(const uint8_t *buf, size_t buf_len)
7183
{
7284
size_t count = 0;

src/libAtomVM/unicode.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ static inline bool unicode_is_valid_codepoint(uint32_t codepoint)
4040
return (codepoint < 0x110000) && !((codepoint > 0xD800) && (codepoint < 0xDFFF));
4141
}
4242

43+
bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len);
44+
4345
#ifdef __cplusplus
4446
}
4547
#endif

0 commit comments

Comments
 (0)