Skip to content

Commit 2f44f18

Browse files
committed
Allow atoms longer than 255 bytes
Update the atom_table API so that it takes a pointer to the atom's characters together with a length, and further reduce usage of AtomString, which can only represent shorter atoms. Signed-off-by: Paul Guyot <pguyot@kallisys.net>
1 parent b395a39 commit 2f44f18

23 files changed

+545
-586
lines changed

src/libAtomVM/atom_table.c

+67-147
Original file line numberDiff line numberDiff line change
@@ -42,15 +42,17 @@
4242

4343
#define DEFAULT_SIZE 8
4444
#define CAPACITY_INCREASE 8
45+
// Longest atom accepted, in bytes. This must fit in the 10-bit `len` bit-field
// of struct HNode: a larger limit would let a length pass validation and then be
// silently truncated when it is stored in the node.
#define MAX_ATOM_LEN ((1 << 10) - 1)
4546

4647
#define ATOM_TABLE_THRESHOLD(capacity) (capacity + (capacity >> 2))
4748
#define ATOM_TABLE_NEW_CAPACITY(new_count) (new_count + CAPACITY_INCREASE)
4849

4950
struct HNode
5051
{
5152
struct HNode *next;
52-
AtomString key;
53-
atom_index_t index;
53+
const uint8_t *key;
54+
uint32_t index : 20;
55+
uint32_t len : 10;
5456
};
5557

5658
struct HNodeGroup
@@ -148,7 +150,7 @@ static struct HNodeGroup *new_node_group(struct AtomTable *table, int len)
148150

149151
static unsigned long sdbm_hash(const unsigned char *str, int len)
150152
{
151-
unsigned long hash = 0;
153+
unsigned long hash = len;
152154
int c;
153155

154156
for (int i = 0; i < len; i++) {
@@ -160,11 +162,11 @@ static unsigned long sdbm_hash(const unsigned char *str, int len)
160162
}
161163

162164
static inline struct HNode *get_node_from_bucket(
163-
const struct AtomTable *hash_table, unsigned long bucket_index, AtomString string)
165+
const struct AtomTable *hash_table, unsigned long bucket_index, const uint8_t *string, size_t string_len)
164166
{
165167
struct HNode *node = hash_table->buckets[bucket_index];
166168
while (node) {
167-
if (atom_are_equals(string, node->key)) {
169+
if (node->len == string_len && memcmp(node->key, string, string_len) == 0) {
168170
return node;
169171
}
170172

@@ -175,17 +177,17 @@ static inline struct HNode *get_node_from_bucket(
175177
}
176178

177179
static inline struct HNode *get_node_with_hash(
178-
const struct AtomTable *hash_table, AtomString string, unsigned long hash)
180+
const struct AtomTable *hash_table, const uint8_t *string, size_t string_len, unsigned long hash)
179181
{
180182
unsigned long bucket_index = hash % hash_table->capacity;
181-
return get_node_from_bucket(hash_table, bucket_index, string);
183+
return get_node_from_bucket(hash_table, bucket_index, string, string_len);
182184
}
183185

184-
static inline struct HNode *get_node(const struct AtomTable *hash_table, AtomString string)
186+
static inline struct HNode *get_node(const struct AtomTable *hash_table, const uint8_t *string, size_t string_len)
185187
{
186-
unsigned long hash = sdbm_hash(string, atom_string_len(string));
188+
unsigned long hash = sdbm_hash(string, string_len);
187189

188-
return get_node_with_hash(hash_table, string, hash);
190+
return get_node_with_hash(hash_table, string, string_len, hash);
189191
}
190192

191193
// TODO: this function needs to use an efficient structure such as a skip list
@@ -208,33 +210,47 @@ static struct HNode *get_node_using_index(struct AtomTable *table, atom_index_t
208210
return NULL;
209211
}
210212

211-
AtomString atom_table_get_atom_string(struct AtomTable *table, atom_index_t index)
213+
const uint8_t *atom_table_get_atom_string(struct AtomTable *table, atom_index_t index, size_t *out_size)
212214
{
215+
const uint8_t *result;
213216
SMP_RDLOCK(table);
214217

215218
struct HNode *node = get_node_using_index(table, index);
216219
if (IS_NULL_PTR(node)) {
217220
SMP_UNLOCK(table);
218221
return NULL;
219222
}
220-
221-
AtomString found_key = node->key;
223+
result = node->key;
224+
*out_size = node->len;
222225

223226
SMP_UNLOCK(table);
224-
return found_key;
227+
return result;
225228
}
226229

227-
int atom_table_cmp_using_atom_index(struct AtomTable *table, int t_atom_index, int other_atom_index)
230+
bool atom_table_is_equal_to_atom_string(struct AtomTable *table, atom_index_t t_atom_index, AtomString string)
228231
{
229-
AtomString t_atom_string = atom_table_get_atom_string(table, t_atom_index);
232+
size_t t_atom_len;
233+
const uint8_t *t_atom_data = atom_table_get_atom_string(table, t_atom_index, &t_atom_len);
234+
if (IS_NULL_PTR(t_atom_data)) {
235+
return false;
236+
}
230237

231-
int t_atom_len = atom_string_len(t_atom_string);
232-
const char *t_atom_data = (const char *) atom_string_data(t_atom_string);
238+
return (t_atom_len == atom_string_len(string)) && (memcmp(t_atom_data, atom_string_data(string), t_atom_len) == 0);
239+
}
233240

234-
AtomString other_atom_string = atom_table_get_atom_string(table, other_atom_index);
241+
int atom_table_cmp_using_atom_index(struct AtomTable *table, atom_index_t t_atom_index, atom_index_t other_atom_index)
242+
{
243+
size_t t_atom_len;
244+
const uint8_t *t_atom_data = atom_table_get_atom_string(table, t_atom_index, &t_atom_len);
245+
if (IS_NULL_PTR(t_atom_data)) {
246+
return -1;
247+
}
235248

236-
int other_atom_len = atom_string_len(other_atom_string);
237-
const char *other_atom_data = (const char *) atom_string_data(other_atom_string);
249+
size_t other_atom_len;
250+
const uint8_t *other_atom_data = atom_table_get_atom_string(table, other_atom_index, &other_atom_len);
251+
if (IS_NULL_PTR(other_atom_data)) {
252+
return 1;
253+
}
238254

239255
int cmp_size = (t_atom_len > other_atom_len) ? other_atom_len : t_atom_len;
240256

@@ -267,73 +283,10 @@ atom_ref_t atom_table_get_atom_ptr_and_len(struct AtomTable *table, atom_index_t
267283
return node;
268284
}
269285

270-
char *atom_table_atom_to_new_cstring(struct AtomTable *table, atom_index_t atom_index, const char *suffix)
271-
{
272-
AtomString atom_string = atom_table_get_atom_string(table, atom_index);
273-
size_t atom_len = atom_string_len(atom_string);
274-
const uint8_t *atom_data = atom_string_data(atom_string);
275-
size_t suffix_len = 0;
276-
if (suffix) {
277-
suffix_len = strlen(suffix);
278-
}
279-
280-
char *result = malloc(atom_len + suffix_len + 1);
281-
if (IS_NULL_PTR(result)) {
282-
return NULL;
283-
}
284-
285-
memcpy(result, atom_data, atom_len);
286-
if (suffix) {
287-
memcpy(result + atom_len, suffix, suffix_len);
288-
}
289-
result[atom_len + suffix_len] = 0;
290-
291-
return result;
292-
}
293-
294-
bool atom_table_is_atom_ref_ascii(struct AtomTable *table, atom_ref_t atom)
286+
static inline void init_node(struct HNode *node, const uint8_t *atom_data, size_t atom_len, long index)
295287
{
296-
SMP_RDLOCK(table);
297-
298-
struct HNode *node = (struct HNode *) atom;
299-
const uint8_t *data = atom_string_data(node->key);
300-
size_t len = atom_string_len(node->key);
301-
302-
bool result = unicode_buf_is_ascii(data, len);
303-
304-
SMP_UNLOCK(table);
305-
return result;
306-
}
307-
308-
void atom_table_write_bytes(struct AtomTable *table, atom_ref_t atom, size_t buf_len, void *outbuf)
309-
{
310-
SMP_RDLOCK(table);
311-
312-
struct HNode *node = (struct HNode *) atom;
313-
size_t len = atom_string_len(node->key);
314-
if (len > buf_len) {
315-
len = buf_len;
316-
}
317-
318-
memcpy(outbuf, atom_string_data(node->key), len);
319-
320-
SMP_UNLOCK(table);
321-
}
322-
323-
void atom_table_write_cstring(
324-
struct AtomTable *table, atom_ref_t atom, size_t buf_len, char *outbuf)
325-
{
326-
SMP_RDLOCK(table);
327-
328-
struct HNode *node = (struct HNode *) atom;
329-
atom_string_to_c(node->key, outbuf, buf_len);
330-
331-
SMP_UNLOCK(table);
332-
}
333-
334-
static inline void init_node(struct HNode *node, AtomString atom, long index)
335-
{
336-
node->key = atom;
288+
node->key = atom_data;
289+
node->len = atom_len;
337290
node->index = index;
338291
}
339292

@@ -346,14 +299,14 @@ static inline void insert_node_into_bucket(
346299
}
347300

348301
static inline atom_index_t insert_node(struct AtomTable *table, struct HNodeGroup *node_group,
349-
unsigned long bucket_index, AtomString string)
302+
unsigned long bucket_index, const uint8_t *atom_data, size_t atom_len)
350303
{
351304
atom_index_t new_index = table->count;
352305
table->count++;
353306

354307
struct HNode *node = &node_group->nodes[new_index - node_group->first_index];
355308
table->last_node_group_avail--;
356-
init_node(node, string, new_index);
309+
init_node(node, atom_data, atom_len, new_index);
357310
insert_node_into_bucket(table, bucket_index, node);
358311

359312
return new_index;
@@ -383,9 +336,7 @@ static bool do_rehash(struct AtomTable *table, int new_capacity)
383336

384337
for (int i = 0; i < group_count; i++) {
385338
struct HNode *node = &group->nodes[i];
386-
AtomString key = node->key;
387-
388-
unsigned long hash = sdbm_hash(key, atom_string_len(key));
339+
unsigned long hash = sdbm_hash(node->key, node->len);
389340
unsigned long bucket_index = hash % table->capacity;
390341

391342
insert_node_into_bucket(table, bucket_index, node);
@@ -409,13 +360,13 @@ static inline bool maybe_rehash(struct AtomTable *table, int new_entries)
409360
return do_rehash(table, new_capacity);
410361
}
411362

412-
enum AtomTableEnsureAtomResult atom_table_ensure_atom(struct AtomTable *table, AtomString string, enum AtomTableCopyOpt opts, atom_index_t *result)
363+
enum AtomTableEnsureAtomResult atom_table_ensure_atom(struct AtomTable *table, const uint8_t *atom_data, size_t atom_len, enum AtomTableCopyOpt opts, atom_index_t *result)
413364
{
414-
unsigned long hash = sdbm_hash(string, atom_string_len(string));
365+
unsigned long hash = sdbm_hash(atom_data, atom_len);
415366
SMP_WRLOCK(table);
416367
unsigned long bucket_index = hash % table->capacity;
417368

418-
struct HNode *node = get_node_from_bucket(table, bucket_index, string);
369+
struct HNode *node = get_node_from_bucket(table, bucket_index, atom_data, atom_len);
419370
if (node) {
420371
SMP_UNLOCK(table);
421372
*result = node->index;
@@ -435,29 +386,27 @@ enum AtomTableEnsureAtomResult atom_table_ensure_atom(struct AtomTable *table, A
435386
}
436387
}
437388

438-
AtomString maybe_copied = string;
439389
if (opts & AtomTableCopyAtom) {
440-
uint8_t len = *((uint8_t *) string);
441-
uint8_t *buf = malloc(1 + len);
390+
uint8_t *buf = malloc(atom_len);
442391
if (IS_NULL_PTR(buf)) {
443392
SMP_UNLOCK(table);
444393
return AtomTableEnsureAtomAllocFail;
445394
}
446-
memcpy(buf, string, 1 + len);
447-
maybe_copied = buf;
395+
memcpy(buf, atom_data, atom_len);
396+
atom_data = buf;
448397
}
449398

450399
if (maybe_rehash(table, 1)) {
451400
bucket_index = hash % table->capacity;
452401
}
453402

454-
*result = insert_node(table, node_group, bucket_index, maybe_copied);
403+
*result = insert_node(table, node_group, bucket_index, atom_data, atom_len);
455404

456405
SMP_UNLOCK(table);
457406
return AtomTableEnsureAtomOk;
458407
}
459408

460-
static inline int read_encoded_len(const uint8_t **len_bytes)
409+
static inline ssize_t read_encoded_len(const uint8_t **len_bytes)
461410
{
462411
uint8_t byte0 = (*len_bytes)[0];
463412

@@ -478,7 +427,7 @@ static inline int read_encoded_len(const uint8_t **len_bytes)
478427
// -1 is not a valid atom index as we're limited to 2^20
479428
#define ATOM_TABLE_NOT_FOUND_MARKER ((atom_index_t) -1)
480429

481-
enum AtomTableEnsureAtomResult atom_table_ensure_atoms(struct AtomTable *table, const void *atoms, int count,
430+
enum AtomTableEnsureAtomResult atom_table_ensure_atoms(struct AtomTable *table, const void *atoms, size_t count,
482431
atom_index_t *translate_table, enum EnsureAtomsOpt opt)
483432
{
484433
bool is_long_format = (opt & EnsureLongEncoding) != 0;
@@ -489,35 +438,22 @@ enum AtomTableEnsureAtomResult atom_table_ensure_atoms(struct AtomTable *table,
489438

490439
const uint8_t *current_atom = atoms;
491440

492-
for (int i = 0; i < count; i++) {
441+
for (size_t i = 0; i < count; i++) {
493442
struct HNode *node;
443+
ssize_t atom_len;
494444
if (is_long_format) {
495-
int atom_len = read_encoded_len(&current_atom);
496-
if (UNLIKELY(atom_len < 0)) {
445+
atom_len = read_encoded_len(&current_atom);
446+
if (UNLIKELY(atom_len < 0 || atom_len > MAX_ATOM_LEN)) {
497447
fprintf(stderr, "Found invalid atom len.");
498448
SMP_UNLOCK(table);
499449
return AtomTableEnsureAtomInvalidLen;
500-
} else if (UNLIKELY(atom_len > 255)) {
501-
fprintf(stderr,
502-
"Unsupported atom length %i bytes.\n"
503-
"Unlike OTP >= 28, AtomVM supports a maximum of 255 bytes"
504-
"regardeless the number of codepoints.\n"
505-
"If you are seeing this error please open an issue on GitHub:\n"
506-
"https://github.com/atomvm/AtomVM/issues\n",
507-
atom_len);
508-
SMP_UNLOCK(table);
509-
return AtomTableEnsureAtomInvalidLen;
510450
}
511-
char tmp_old_fmt[256];
512-
tmp_old_fmt[0] = atom_len;
513-
memcpy(tmp_old_fmt + 1, current_atom, atom_len);
514-
node = get_node(table, tmp_old_fmt);
515-
current_atom += atom_len;
516451
} else {
517-
node = get_node(table, current_atom);
518-
uint8_t atom_len = current_atom[0];
519-
current_atom += 1 + atom_len;
452+
atom_len = current_atom[0];
453+
current_atom++;
520454
}
455+
node = get_node(table, current_atom, atom_len);
456+
current_atom += atom_len;
521457

522458
if (node) {
523459
translate_table[i] = node->index;
@@ -532,18 +468,14 @@ enum AtomTableEnsureAtomResult atom_table_ensure_atoms(struct AtomTable *table,
532468
current_atom = atoms;
533469
int remaining_atoms = new_atoms_count;
534470
struct HNodeGroup *node_group = table->last_node_group;
535-
for (int i = 0; i < count; i++) {
536-
537-
const uint8_t *to_be_copied = NULL;
538-
const uint8_t *next_atom = current_atom;
539-
uint8_t atom_len;
471+
for (size_t i = 0; i < count; i++) {
472+
size_t atom_len;
540473
if (is_long_format) {
541-
atom_len = read_encoded_len(&next_atom);
542-
to_be_copied = next_atom;
543-
next_atom += atom_len;
474+
// Size was checked above
475+
atom_len = (size_t) read_encoded_len(&current_atom);
544476
} else {
545477
atom_len = current_atom[0];
546-
next_atom += 1 + atom_len;
478+
current_atom++;
547479
}
548480

549481
if (translate_table[i] == ATOM_TABLE_NOT_FOUND_MARKER) {
@@ -555,28 +487,16 @@ enum AtomTableEnsureAtomResult atom_table_ensure_atoms(struct AtomTable *table,
555487
}
556488
}
557489

558-
if (is_long_format) {
559-
uint8_t *atom_copy = malloc(atom_len + 1);
560-
if (IS_NULL_PTR(atom_copy)) {
561-
// we are not going to remove atoms that have already been added up to this one
562-
SMP_UNLOCK(table);
563-
return AtomTableEnsureAtomAllocFail;
564-
}
565-
atom_copy[0] = atom_len;
566-
memcpy(atom_copy + 1, to_be_copied, atom_len);
567-
current_atom = atom_copy;
568-
}
569-
570490
unsigned long hash = sdbm_hash(current_atom, atom_len);
571491
unsigned long bucket_index = hash % table->capacity;
572492

573-
translate_table[i] = insert_node(table, node_group, bucket_index, current_atom);
493+
translate_table[i] = insert_node(table, node_group, bucket_index, current_atom, atom_len);
574494
remaining_atoms--;
575495
if (remaining_atoms == 0) {
576496
break;
577497
}
578498
}
579-
current_atom = next_atom;
499+
current_atom += atom_len;
580500
}
581501

582502
SMP_UNLOCK(table);

0 commit comments

Comments
 (0)