Skip to content

Commit 7eeab10

Browse files
committed
Allow atoms longer than 255 bytes
Update the atom_table API so it uses pointers to characters with a length, and further reduce usage of AtomString, which represents shorter atoms. Signed-off-by: Paul Guyot <pguyot@kallisys.net>
1 parent 25832ca commit 7eeab10

File tree

23 files changed

+547
-564
lines changed

23 files changed

+547
-564
lines changed

src/libAtomVM/atom_table.c

Lines changed: 72 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -42,15 +42,17 @@
4242

4343
#define DEFAULT_SIZE 8
4444
#define CAPACITY_INCREASE 8
45+
#define MAX_ATOM_LEN ((1 << 12) - 1)
4546

4647
#define ATOM_TABLE_THRESHOLD(capacity) (capacity + (capacity >> 2))
4748
#define ATOM_TABLE_NEW_CAPACITY(new_count) (new_count + CAPACITY_INCREASE)
4849

4950
struct HNode
5051
{
5152
struct HNode *next;
52-
AtomString key;
53-
atom_index_t index;
53+
const uint8_t *key;
54+
uint32_t index : 20;
55+
uint32_t len : 10;
5456
};
5557

5658
struct HNodeGroup
@@ -148,7 +150,7 @@ static struct HNodeGroup *new_node_group(struct AtomTable *table, int len)
148150

149151
static unsigned long sdbm_hash(const unsigned char *str, int len)
150152
{
151-
unsigned long hash = 0;
153+
unsigned long hash = len;
152154
int c;
153155

154156
for (int i = 0; i < len; i++) {
@@ -160,11 +162,11 @@ static unsigned long sdbm_hash(const unsigned char *str, int len)
160162
}
161163

162164
static inline struct HNode *get_node_from_bucket(
163-
const struct AtomTable *hash_table, unsigned long bucket_index, AtomString string)
165+
const struct AtomTable *hash_table, unsigned long bucket_index, const uint8_t *string, size_t string_len)
164166
{
165167
struct HNode *node = hash_table->buckets[bucket_index];
166168
while (node) {
167-
if (atom_are_equals(string, node->key)) {
169+
if (node->len == string_len && memcmp(node->key, string, string_len) == 0) {
168170
return node;
169171
}
170172

@@ -175,17 +177,17 @@ static inline struct HNode *get_node_from_bucket(
175177
}
176178

177179
static inline struct HNode *get_node_with_hash(
178-
const struct AtomTable *hash_table, AtomString string, unsigned long hash)
180+
const struct AtomTable *hash_table, const uint8_t *string, size_t string_len, unsigned long hash)
179181
{
180182
unsigned long bucket_index = hash % hash_table->capacity;
181-
return get_node_from_bucket(hash_table, bucket_index, string);
183+
return get_node_from_bucket(hash_table, bucket_index, string, string_len);
182184
}
183185

184-
static inline struct HNode *get_node(const struct AtomTable *hash_table, AtomString string)
186+
static inline struct HNode *get_node(const struct AtomTable *hash_table, const uint8_t *string, size_t string_len)
185187
{
186-
unsigned long hash = sdbm_hash(string, atom_string_len(string));
188+
unsigned long hash = sdbm_hash(string, string_len);
187189

188-
return get_node_with_hash(hash_table, string, hash);
190+
return get_node_with_hash(hash_table, string, string_len, hash);
189191
}
190192

191193
// TODO: this function needs to use an efficient structure such as a skip list
@@ -208,33 +210,47 @@ static struct HNode *get_node_using_index(struct AtomTable *table, atom_index_t
208210
return NULL;
209211
}
210212

211-
AtomString atom_table_get_atom_string(struct AtomTable *table, atom_index_t index)
213+
const uint8_t *atom_table_get_atom_string(struct AtomTable *table, atom_index_t index, size_t *out_size)
212214
{
215+
const uint8_t *result;
213216
SMP_RDLOCK(table);
214217

215218
struct HNode *node = get_node_using_index(table, index);
216219
if (IS_NULL_PTR(node)) {
217220
SMP_UNLOCK(table);
218221
return NULL;
219222
}
220-
221-
AtomString found_key = node->key;
223+
result = node->key;
224+
*out_size = node->len;
222225

223226
SMP_UNLOCK(table);
224-
return found_key;
227+
return result;
225228
}
226229

227-
int atom_table_cmp_using_atom_index(struct AtomTable *table, int t_atom_index, int other_atom_index)
230+
bool atom_table_is_equal_to_atom_string(struct AtomTable *table, atom_index_t t_atom_index, AtomString string)
228231
{
229-
AtomString t_atom_string = atom_table_get_atom_string(table, t_atom_index);
232+
size_t t_atom_len;
233+
const uint8_t *t_atom_data = atom_table_get_atom_string(table, t_atom_index, &t_atom_len);
234+
if (IS_NULL_PTR(t_atom_data)) {
235+
return false;
236+
}
230237

231-
int t_atom_len = atom_string_len(t_atom_string);
232-
const char *t_atom_data = (const char *) atom_string_data(t_atom_string);
238+
return (t_atom_len == atom_string_len(string)) && (memcmp(t_atom_data, atom_string_data(string), t_atom_len) == 0);
239+
}
233240

234-
AtomString other_atom_string = atom_table_get_atom_string(table, other_atom_index);
241+
int atom_table_cmp_using_atom_index(struct AtomTable *table, atom_index_t t_atom_index, atom_index_t other_atom_index)
242+
{
243+
size_t t_atom_len;
244+
const uint8_t *t_atom_data = atom_table_get_atom_string(table, t_atom_index, &t_atom_len);
245+
if (IS_NULL_PTR(t_atom_data)) {
246+
return -1;
247+
}
235248

236-
int other_atom_len = atom_string_len(other_atom_string);
237-
const char *other_atom_data = (const char *) atom_string_data(other_atom_string);
249+
size_t other_atom_len;
250+
const uint8_t *other_atom_data = atom_table_get_atom_string(table, other_atom_index, &other_atom_len);
251+
if (IS_NULL_PTR(other_atom_data)) {
252+
return 1;
253+
}
238254

239255
int cmp_size = (t_atom_len > other_atom_len) ? other_atom_len : t_atom_len;
240256

@@ -269,9 +285,11 @@ atom_ref_t atom_table_get_atom_ptr_and_len(struct AtomTable *table, atom_index_t
269285

270286
char *atom_table_atom_to_new_cstring(struct AtomTable *table, atom_index_t atom_index)
271287
{
272-
AtomString atom_string = atom_table_get_atom_string(table, atom_index);
273-
size_t atom_len = atom_string_len(atom_string);
274-
const uint8_t *atom_data = atom_string_data(atom_string);
288+
size_t atom_len;
289+
const uint8_t *atom_data = atom_table_get_atom_string(table, atom_index, &atom_len);
290+
if (IS_NULL_PTR(atom_data)) {
291+
return NULL;
292+
}
275293

276294
char *result = malloc(atom_len + 1);
277295
if (IS_NULL_PTR(result)) {
@@ -280,53 +298,13 @@ char *atom_table_atom_to_new_cstring(struct AtomTable *table, atom_index_t atom_
280298

281299
memcpy(result, atom_data, atom_len);
282300
result[atom_len] = 0;
283-
284301
return result;
285302
}
286303

287-
bool atom_table_is_atom_ref_ascii(struct AtomTable *table, atom_ref_t atom)
304+
static inline void init_node(struct HNode *node, const uint8_t *atom_data, size_t atom_len, long index)
288305
{
289-
SMP_RDLOCK(table);
290-
291-
struct HNode *node = (struct HNode *) atom;
292-
const uint8_t *data = atom_string_data(node->key);
293-
size_t len = atom_string_len(node->key);
294-
295-
bool result = unicode_buf_is_ascii(data, len);
296-
297-
SMP_UNLOCK(table);
298-
return result;
299-
}
300-
301-
void atom_table_write_bytes(struct AtomTable *table, atom_ref_t atom, size_t buf_len, void *outbuf)
302-
{
303-
SMP_RDLOCK(table);
304-
305-
struct HNode *node = (struct HNode *) atom;
306-
size_t len = atom_string_len(node->key);
307-
if (len > buf_len) {
308-
len = buf_len;
309-
}
310-
311-
memcpy(outbuf, atom_string_data(node->key), len);
312-
313-
SMP_UNLOCK(table);
314-
}
315-
316-
void atom_table_write_cstring(
317-
struct AtomTable *table, atom_ref_t atom, size_t buf_len, char *outbuf)
318-
{
319-
SMP_RDLOCK(table);
320-
321-
struct HNode *node = (struct HNode *) atom;
322-
atom_string_to_c(node->key, outbuf, buf_len);
323-
324-
SMP_UNLOCK(table);
325-
}
326-
327-
static inline void init_node(struct HNode *node, AtomString atom, long index)
328-
{
329-
node->key = atom;
306+
node->key = atom_data;
307+
node->len = atom_len;
330308
node->index = index;
331309
}
332310

@@ -339,14 +317,14 @@ static inline void insert_node_into_bucket(
339317
}
340318

341319
static inline atom_index_t insert_node(struct AtomTable *table, struct HNodeGroup *node_group,
342-
unsigned long bucket_index, AtomString string)
320+
unsigned long bucket_index, const uint8_t *atom_data, size_t atom_len)
343321
{
344322
atom_index_t new_index = table->count;
345323
table->count++;
346324

347325
struct HNode *node = &node_group->nodes[new_index - node_group->first_index];
348326
table->last_node_group_avail--;
349-
init_node(node, string, new_index);
327+
init_node(node, atom_data, atom_len, new_index);
350328
insert_node_into_bucket(table, bucket_index, node);
351329

352330
return new_index;
@@ -376,9 +354,7 @@ static bool do_rehash(struct AtomTable *table, int new_capacity)
376354

377355
for (int i = 0; i < group_count; i++) {
378356
struct HNode *node = &group->nodes[i];
379-
AtomString key = node->key;
380-
381-
unsigned long hash = sdbm_hash(key, atom_string_len(key));
357+
unsigned long hash = sdbm_hash(node->key, node->len);
382358
unsigned long bucket_index = hash % table->capacity;
383359

384360
insert_node_into_bucket(table, bucket_index, node);
@@ -402,13 +378,13 @@ static inline bool maybe_rehash(struct AtomTable *table, int new_entries)
402378
return do_rehash(table, new_capacity);
403379
}
404380

405-
enum AtomTableEnsureAtomResult atom_table_ensure_atom(struct AtomTable *table, AtomString string, enum AtomTableCopyOpt opts, atom_index_t *result)
381+
enum AtomTableEnsureAtomResult atom_table_ensure_atom(struct AtomTable *table, const uint8_t *atom_data, size_t atom_len, enum AtomTableCopyOpt opts, atom_index_t *result)
406382
{
407-
unsigned long hash = sdbm_hash(string, atom_string_len(string));
383+
unsigned long hash = sdbm_hash(atom_data, atom_len);
408384
SMP_WRLOCK(table);
409385
unsigned long bucket_index = hash % table->capacity;
410386

411-
struct HNode *node = get_node_from_bucket(table, bucket_index, string);
387+
struct HNode *node = get_node_from_bucket(table, bucket_index, atom_data, atom_len);
412388
if (node) {
413389
SMP_UNLOCK(table);
414390
*result = node->index;
@@ -428,29 +404,27 @@ enum AtomTableEnsureAtomResult atom_table_ensure_atom(struct AtomTable *table, A
428404
}
429405
}
430406

431-
AtomString maybe_copied = string;
432407
if (opts & AtomTableCopyAtom) {
433-
uint8_t len = *((uint8_t *) string);
434-
uint8_t *buf = malloc(1 + len);
408+
uint8_t *buf = malloc(atom_len);
435409
if (IS_NULL_PTR(buf)) {
436410
SMP_UNLOCK(table);
437411
return AtomTableEnsureAtomAllocFail;
438412
}
439-
memcpy(buf, string, 1 + len);
440-
maybe_copied = buf;
413+
memcpy(buf, atom_data, atom_len);
414+
atom_data = buf;
441415
}
442416

443417
if (maybe_rehash(table, 1)) {
444418
bucket_index = hash % table->capacity;
445419
}
446420

447-
*result = insert_node(table, node_group, bucket_index, maybe_copied);
421+
*result = insert_node(table, node_group, bucket_index, atom_data, atom_len);
448422

449423
SMP_UNLOCK(table);
450424
return AtomTableEnsureAtomOk;
451425
}
452426

453-
static inline int read_encoded_len(const uint8_t **len_bytes)
427+
static inline ssize_t read_encoded_len(const uint8_t **len_bytes)
454428
{
455429
uint8_t byte0 = (*len_bytes)[0];
456430

@@ -471,7 +445,7 @@ static inline int read_encoded_len(const uint8_t **len_bytes)
471445
// -1 is not a valid atom index as we're limited to 2^20
472446
#define ATOM_TABLE_NOT_FOUND_MARKER ((atom_index_t) -1)
473447

474-
enum AtomTableEnsureAtomResult atom_table_ensure_atoms(struct AtomTable *table, const void *atoms, int count,
448+
enum AtomTableEnsureAtomResult atom_table_ensure_atoms(struct AtomTable *table, const void *atoms, size_t count,
475449
atom_index_t *translate_table, enum EnsureAtomsOpt opt)
476450
{
477451
bool is_long_format = (opt & EnsureLongEncoding) != 0;
@@ -482,35 +456,22 @@ enum AtomTableEnsureAtomResult atom_table_ensure_atoms(struct AtomTable *table,
482456

483457
const uint8_t *current_atom = atoms;
484458

485-
for (int i = 0; i < count; i++) {
459+
for (size_t i = 0; i < count; i++) {
486460
struct HNode *node;
461+
ssize_t atom_len;
487462
if (is_long_format) {
488-
int atom_len = read_encoded_len(&current_atom);
489-
if (UNLIKELY(atom_len < 0)) {
463+
atom_len = read_encoded_len(&current_atom);
464+
if (UNLIKELY(atom_len < 0 || atom_len > MAX_ATOM_LEN)) {
490465
fprintf(stderr, "Found invalid atom len.");
491466
SMP_UNLOCK(table);
492467
return AtomTableEnsureAtomInvalidLen;
493-
} else if (UNLIKELY(atom_len > 255)) {
494-
fprintf(stderr,
495-
"Unsupported atom length %i bytes.\n"
496-
"Unlike OTP >= 28, AtomVM supports a maximum of 255 bytes"
497-
"regardeless the number of codepoints.\n"
498-
"If you are seeing this error please open an issue on GitHub:\n"
499-
"https://github.com/atomvm/AtomVM/issues\n",
500-
atom_len);
501-
SMP_UNLOCK(table);
502-
return AtomTableEnsureAtomInvalidLen;
503468
}
504-
char tmp_old_fmt[256];
505-
tmp_old_fmt[0] = atom_len;
506-
memcpy(tmp_old_fmt + 1, current_atom, atom_len);
507-
node = get_node(table, tmp_old_fmt);
508-
current_atom += atom_len;
509469
} else {
510-
node = get_node(table, current_atom);
511-
uint8_t atom_len = current_atom[0];
512-
current_atom += 1 + atom_len;
470+
atom_len = current_atom[0];
471+
current_atom++;
513472
}
473+
node = get_node(table, current_atom, atom_len);
474+
current_atom += atom_len;
514475

515476
if (node) {
516477
translate_table[i] = node->index;
@@ -525,18 +486,14 @@ enum AtomTableEnsureAtomResult atom_table_ensure_atoms(struct AtomTable *table,
525486
current_atom = atoms;
526487
int remaining_atoms = new_atoms_count;
527488
struct HNodeGroup *node_group = table->last_node_group;
528-
for (int i = 0; i < count; i++) {
529-
530-
const uint8_t *to_be_copied = NULL;
531-
const uint8_t *next_atom = current_atom;
532-
uint8_t atom_len;
489+
for (size_t i = 0; i < count; i++) {
490+
size_t atom_len;
533491
if (is_long_format) {
534-
atom_len = read_encoded_len(&next_atom);
535-
to_be_copied = next_atom;
536-
next_atom += atom_len;
492+
// Size was checked above
493+
atom_len = (size_t) read_encoded_len(&current_atom);
537494
} else {
538495
atom_len = current_atom[0];
539-
next_atom += 1 + atom_len;
496+
current_atom++;
540497
}
541498

542499
if (translate_table[i] == ATOM_TABLE_NOT_FOUND_MARKER) {
@@ -548,28 +505,16 @@ enum AtomTableEnsureAtomResult atom_table_ensure_atoms(struct AtomTable *table,
548505
}
549506
}
550507

551-
if (is_long_format) {
552-
uint8_t *atom_copy = malloc(atom_len + 1);
553-
if (IS_NULL_PTR(atom_copy)) {
554-
// we are not going to remove atoms that have already been added up to this one
555-
SMP_UNLOCK(table);
556-
return AtomTableEnsureAtomAllocFail;
557-
}
558-
atom_copy[0] = atom_len;
559-
memcpy(atom_copy + 1, to_be_copied, atom_len);
560-
current_atom = atom_copy;
561-
}
562-
563508
unsigned long hash = sdbm_hash(current_atom, atom_len);
564509
unsigned long bucket_index = hash % table->capacity;
565510

566-
translate_table[i] = insert_node(table, node_group, bucket_index, current_atom);
511+
translate_table[i] = insert_node(table, node_group, bucket_index, current_atom, atom_len);
567512
remaining_atoms--;
568513
if (remaining_atoms == 0) {
569514
break;
570515
}
571516
}
572-
current_atom = next_atom;
517+
current_atom += atom_len;
573518
}
574519

575520
SMP_UNLOCK(table);

0 commit comments

Comments
 (0)