Skip to content

Commit 431cc4a

Browse files
committed
Merge pull request #1051 from bettio/optimized-utf8
Optimized UTF-8. These changes are made under both the "Apache 2.0" and the "GNU Lesser General Public License 2.1 or later" license terms (dual license). SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
2 parents 40d7668 + 35923c4 commit 431cc4a

File tree

10 files changed

+136
-113
lines changed

10 files changed

+136
-113
lines changed

LICENSES/MIT.txt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) [year] [fullname]
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

doc/src/apidocs/libatomvm/functions.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ Functions
3838
.. doxygenfunction:: bitstring_utf16_size
3939
.. doxygenfunction:: bitstring_utf32_decode
4040
.. doxygenfunction:: bitstring_utf32_encode
41-
.. doxygenfunction:: bitstring_utf8_decode
4241
.. doxygenfunction:: bitstring_utf8_encode
4342
.. doxygenfunction:: bitstring_utf8_size
4443
.. doxygenfunction:: context_avail_free_memory

src/libAtomVM/bif.c

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1638,19 +1638,10 @@ term binary_to_atom(Context *ctx, term a_binary, term encoding, bool create_new,
16381638

16391639
AtomString atom;
16401640
if (LIKELY(!encode_latin1_to_utf8)) {
1641-
size_t i = 0;
1642-
while (i < atom_string_len) {
1643-
uint32_t codepoint;
1644-
size_t codepoint_size;
1645-
if (UNLIKELY(bitstring_utf8_decode(
1646-
(uint8_t *) atom_string + i, atom_string_len, &codepoint, &codepoint_size))
1647-
!= UnicodeTransformDecodeSuccess) {
1648-
*error_reason = BADARG_ATOM;
1649-
return term_invalid_term();
1650-
}
1651-
i += codepoint_size;
1641+
if (UNLIKELY(!unicode_is_valid_utf8_buf((const uint8_t *) atom_string, atom_string_len))) {
1642+
*error_reason = BADARG_ATOM;
1643+
return term_invalid_term();
16521644
}
1653-
16541645
atom = malloc(atom_string_len + 1);
16551646
if (IS_NULL_PTR(atom)) {
16561647
*error_reason = OUT_OF_MEMORY_ATOM;

src/libAtomVM/bitstring.c

Lines changed: 0 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -141,69 +141,6 @@ bool bitstring_utf8_encode(uint32_t c, uint8_t *buf, size_t *out_size)
141141
return true;
142142
}
143143

144-
enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
145-
{
146-
if (len == 0) {
147-
return UnicodeTransformDecodeFail;
148-
} else if (len >= 4 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80) && ((buf[3] & 0xC0) == 0x80)) {
149-
uint32_t v = 0;
150-
v |= (buf[0] & 0x07) << 18;
151-
v |= (buf[1] & 0x3F) << 12;
152-
v |= (buf[2] & 0x3F) << 6;
153-
v |= (buf[3] & 0x3F);
154-
// overlong encoding or invalid codepoint
155-
if (v <= 0x10000 || v > 0x10FFFF) {
156-
return UnicodeTransformDecodeFail;
157-
}
158-
*c = v;
159-
*out_size = 4;
160-
return UnicodeTransformDecodeSuccess;
161-
} else if (len >= 3 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
162-
uint32_t v = 0;
163-
v |= (buf[0] & 0x0F) << 12;
164-
v |= (buf[1] & 0x3F) << 6;
165-
v |= (buf[2] & 0x3F);
166-
// overlong encoding or surrogate
167-
if (v < 0x800 || (v >= 0xD800 && v <= 0xDFFF)) {
168-
return UnicodeTransformDecodeFail;
169-
}
170-
*c = v;
171-
*out_size = 3;
172-
return UnicodeTransformDecodeSuccess;
173-
} else if (len >= 2 && (buf[0] & 0xE0) == 0xC0 && ((buf[1] & 0xC0) == 0x80)) {
174-
uint32_t v = 0;
175-
v |= (buf[0] & 0x1F) << 6;
176-
v |= (buf[1] & 0x3F);
177-
// overlong encoding
178-
if (v < 0x80) {
179-
return UnicodeTransformDecodeFail;
180-
}
181-
*c = v;
182-
*out_size = 2;
183-
return UnicodeTransformDecodeSuccess;
184-
} else if ((*buf & 0x80) == 0) {
185-
uint32_t v = 0;
186-
v |= (buf[0] & 0x7F);
187-
*c = v;
188-
*out_size = 1;
189-
return UnicodeTransformDecodeSuccess;
190-
} else if (len == 3 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
191-
return UnicodeTransformDecodeIncomplete;
192-
} else if (len == 2 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80)) {
193-
return UnicodeTransformDecodeIncomplete;
194-
} else if (len == 1 && (buf[0] & 0xF8) == 0xF0) {
195-
return UnicodeTransformDecodeIncomplete;
196-
} else if (len == 2 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80)) {
197-
return UnicodeTransformDecodeIncomplete;
198-
} else if (len == 1 && (buf[0] & 0xF0) == 0xE0) {
199-
return UnicodeTransformDecodeIncomplete;
200-
} else if (len == 1 && (buf[0] & 0xE0) == 0xC0) {
201-
return UnicodeTransformDecodeIncomplete;
202-
}
203-
204-
return UnicodeTransformDecodeFail;
205-
}
206-
207144
// UTF-16 encoding, when U in U+010000 to U+10FFFF:
208145
//
209146
// U' = yyyyyyyyyyxxxxxxxxxx // U - 0x10000

src/libAtomVM/bitstring.h

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#define _BITSTRING_H_
2424

2525
#include "term.h"
26+
#include "unicode.h"
2627

2728
#include <stdbool.h>
2829
#include <stdint.h>
@@ -99,13 +100,6 @@ enum BitstringFlags
99100
#endif
100101
};
101102

102-
enum UnicodeTransformDecodeResult
103-
{
104-
UnicodeTransformDecodeSuccess,
105-
UnicodeTransformDecodeFail,
106-
UnicodeTransformDecodeIncomplete
107-
};
108-
109103
union maybe_unsigned_int8
110104
{
111105
uint8_t u;
@@ -320,20 +314,6 @@ static inline bool bitstring_insert_integer(term dst_bin, size_t offset, avm_int
320314
*/
321315
bool bitstring_utf8_encode(uint32_t c, uint8_t *buf, size_t *out_size);
322316

323-
/**
324-
* @brief Decode a character from UTF-8.
325-
*
326-
* @param buf the buffer from which to decode the string
327-
* @param len the length (in bytes) of the bytes in buf
328-
* @param c int value to decode to or NULL to only compute the size.
329-
* @param out_size the size in bytes, on output (if not NULL)
330-
* @return \c UnicodeTransformDecodeSuccess if decoding was successful,
331-
* \c UnicodeTransformDecodeFail if character starting at buf is not a valid
332-
* unicode character or \c UnicodeTransformDecodeIncomplete if character
333-
* starting at buf is a valid but incomplete transformation
334-
*/
335-
enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);
336-
337317
/**
338318
* @brief Encode a character to UTF-16.
339319
*
@@ -441,7 +421,7 @@ static inline bool bitstring_match_utf8(term src_bin, size_t offset, uint32_t *c
441421
{
442422
size_t byte_offset = offset >> 3; // divide by 8
443423
const uint8_t *src = (const uint8_t *) term_binary_data(src_bin) + byte_offset;
444-
return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
424+
return unicode_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
445425
}
446426

447427
/**

src/libAtomVM/externalterm.c

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -636,17 +636,8 @@ static term parse_external_terms(const uint8_t *external_term_buf, size_t *eterm
636636
uint8_t atom_len = *(external_term_buf + 1);
637637
const uint8_t *atom_chars = external_term_buf + 2;
638638

639-
size_t remaining_length = atom_len;
640-
const uint8_t *curr_buf = atom_chars;
641-
while (remaining_length) {
642-
uint32_t out_c;
643-
size_t codepoint_size;
644-
enum UnicodeTransformDecodeResult result = bitstring_utf8_decode(curr_buf, remaining_length, &out_c, &codepoint_size);
645-
if (UNLIKELY(result != UnicodeTransformDecodeSuccess)) {
646-
return term_invalid_term();
647-
}
648-
remaining_length -= codepoint_size;
649-
curr_buf += codepoint_size;
639+
if (UNLIKELY(!unicode_is_valid_utf8_buf((const uint8_t *) atom_chars, atom_len))) {
640+
return term_invalid_term();
650641
}
651642

652643
// AtomString first byte is the atom length

src/libAtomVM/interop.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,7 @@ static enum UnicodeConversionResult interop_binary_conversion(term t, uint8_t *o
385385
while (input_index < len) {
386386
size_t char_size;
387387
uint32_t c;
388-
enum UnicodeTransformDecodeResult decode_result = bitstring_utf8_decode(input + input_index, len - input_index, &c, &char_size);
388+
enum UnicodeTransformDecodeResult decode_result = unicode_utf8_decode(input + input_index, len - input_index, &c, &char_size);
389389
if (UNLIKELY(decode_result != UnicodeTransformDecodeSuccess)) {
390390
*rest_crsr = input_index;
391391
*output_len = result;

src/libAtomVM/nifs.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2081,7 +2081,7 @@ static term nif_erlang_atom_to_binary(Context *ctx, int argc, term argv[])
20812081
for (size_t i = 0; i < encoded_len; i++) {
20822082
size_t codepoint_size;
20832083
uint32_t codepoint;
2084-
if (UNLIKELY(bitstring_utf8_decode(
2084+
if (UNLIKELY(unicode_utf8_decode(
20852085
&utf8_tmp_buf[in_pos], 2, &codepoint, &codepoint_size)
20862086
!= UnicodeTransformDecodeSuccess
20872087
|| (codepoint > 255))) {
@@ -2122,7 +2122,7 @@ static term make_list_from_utf8_buf(const uint8_t *buf, size_t buf_len, Context
21222122
for (size_t i = 0; i < u8len; i++) {
21232123
size_t codepoint_size;
21242124
enum UnicodeTransformDecodeResult result
2125-
= bitstring_utf8_decode(u_in, buf_len, &codepoints[i], &codepoint_size);
2125+
= unicode_utf8_decode(u_in, buf_len, &codepoints[i], &codepoint_size);
21262126
if (UNLIKELY((result != UnicodeTransformDecodeSuccess)
21272127
|| !unicode_is_valid_codepoint(codepoints[i]))) {
21282128
AVM_ABORT();

src/libAtomVM/unicode.c

Lines changed: 81 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,94 @@
1515
* See the License for the specific language governing permissions and
1616
* limitations under the License.
1717
*
18-
* SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
18+
* SPDX-License-Identifier: (Apache-2.0 OR LGPL-2.1-or-later) AND MIT
1919
*/
2020

2121
#include <stdbool.h>
2222
#include <stddef.h>
2323

24+
#include "utils.h"
25+
2426
#include "unicode.h"
2527

28+
// clang-format off
29+
30+
// The following utf8d table and decode function are covered by the MIT license
31+
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
32+
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
33+
34+
#define UTF8_ACCEPT 0
35+
#define UTF8_REJECT 12
36+
37+
static const uint8_t utf8d[] = {
38+
// The first part of the table maps bytes to character classes that
39+
// to reduce the size of the transition table and create bitmasks.
40+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
41+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
42+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
43+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
44+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
45+
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
46+
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
47+
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
48+
49+
// The second part is a transition table that maps a combination
50+
// of a state of the automaton and a character class to a state.
51+
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
52+
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
53+
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
54+
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
55+
12,36,12,12,12,12,12,12,12,12,12,12,
56+
};
57+
58+
static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte)
59+
{
60+
uint32_t type = utf8d[byte];
61+
62+
*codep = (*state != UTF8_ACCEPT) ?
63+
(byte & 0x3fu) | (*codep << 6) :
64+
(0xff >> type) & (byte);
65+
66+
*state = utf8d[256 + *state + type];
67+
return *state;
68+
}
69+
70+
// clang-format on
71+
72+
enum UnicodeTransformDecodeResult unicode_utf8_decode(
73+
const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
74+
{
75+
uint32_t codepoint = 0;
76+
uint32_t state = 0;
77+
size_t i = 0;
78+
while (i < len) {
79+
state = decode(&state, &codepoint, buf[i]);
80+
i++;
81+
82+
if (state == UTF8_ACCEPT) {
83+
*c = codepoint;
84+
*out_size = i;
85+
return UnicodeTransformDecodeSuccess;
86+
} else if (UNLIKELY(state == UTF8_REJECT)) {
87+
return UnicodeTransformDecodeFail;
88+
}
89+
}
90+
91+
return UnicodeTransformDecodeIncomplete;
92+
}
93+
94+
bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len)
95+
{
96+
uint32_t codepoint = 0;
97+
uint32_t state = 0;
98+
99+
for (size_t i = 0; i < len; i++) {
100+
state = decode(&state, &codepoint, buf[i]);
101+
}
102+
103+
return state == UTF8_ACCEPT;
104+
}
105+
26106
size_t unicode_buf_utf8_len(const uint8_t *buf, size_t buf_len)
27107
{
28108
size_t count = 0;

src/libAtomVM/unicode.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@
2929
extern "C" {
3030
#endif
3131

32+
enum UnicodeTransformDecodeResult
33+
{
34+
UnicodeTransformDecodeSuccess,
35+
UnicodeTransformDecodeFail,
36+
UnicodeTransformDecodeIncomplete
37+
};
38+
3239
size_t unicode_buf_utf8_len(const uint8_t *buf, size_t buf_len);
3340
bool unicode_buf_is_ascii(const uint8_t *buf, size_t buf_len);
3441
size_t unicode_latin1_buf_size_as_utf8(const uint8_t *buf, size_t len);
@@ -40,6 +47,23 @@ static inline bool unicode_is_valid_codepoint(uint32_t codepoint)
4047
return (codepoint < 0x110000) && !((codepoint > 0xD800) && (codepoint < 0xDFFF));
4148
}
4249

50+
/**
51+
* @brief Decode a character from UTF-8.
52+
*
53+
* @param buf the buffer from which to decode the string
54+
* @param len the length (in bytes) of the bytes in buf
55+
* @param c the decoded code point, on output (must not be NULL; written only on success)
56+
* @param out_size the size in bytes of the decoded character, on output (must not be NULL; written only on success)
57+
* @return \c UnicodeTransformDecodeSuccess if decoding was successful,
58+
* \c UnicodeTransformDecodeFail if character starting at buf is not a valid
59+
* unicode character or \c UnicodeTransformDecodeIncomplete if character
60+
* starting at buf is a valid but incomplete transformation
61+
*/
62+
enum UnicodeTransformDecodeResult unicode_utf8_decode(
63+
const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);
64+
65+
bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len);
66+
4367
#ifdef __cplusplus
4468
}
4569
#endif

0 commit comments

Comments
 (0)