Skip to content

Commit 1d9f7de

Browse files
committed
fix list objects ordering to follow AWS S3
Signed-off-by: Utkarsh Srivastava <srivastavautkarsh8097@gmail.com> add more tests for asserting the correctness of utf8 sorting Signed-off-by: Utkarsh Srivastava <srivastavautkarsh8097@gmail.com> fix order when markers are involved Signed-off-by: Utkarsh Srivastava <srivastavautkarsh8097@gmail.com> add caching for name buffers - lifetime limited to per invocation of list_objects Signed-off-by: Utkarsh Srivastava <srivastavautkarsh8097@gmail.com>
1 parent 0d9d1f7 commit 1d9f7de

File tree

4 files changed

+253
-49
lines changed

4 files changed

+253
-49
lines changed

src/sdk/namespace_fs.js

Lines changed: 99 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ const stream_utils = require('../util/stream_utils');
1818
const buffer_utils = require('../util/buffer_utils');
1919
const size_utils = require('../util/size_utils');
2020
const native_fs_utils = require('../util/native_fs_utils');
21+
const js_utils = require('../util/js_utils');
2122
const ChunkFS = require('../util/chunk_fs');
2223
const LRUCache = require('../util/lru_cache');
2324
const nb_native = require('../util/nb_native');
@@ -91,14 +92,48 @@ const XATTR_METADATA_IGNORE_LIST = [
9192
];
9293

9394
/**
94-
* @param {fs.Dirent} a
95-
* @param {fs.Dirent} b
96-
* @returns {1|-1|0}
95+
* get_simple_cache returns a simple function
96+
* which can be used to access the cached data
97+
*
98+
* If the cached data doesn't exist then the
99+
* given loader function is called with the
100+
* requested key and the result is cached.
101+
*
102+
* There is no cache invalidation.
103+
* @template {string | number | symbol} T
104+
* @template U
105+
*
106+
* @param {(key: T) => U} loader
107+
* @param {Partial<Record<T, U>>} [cache={}]
108+
* @returns {(key: T) => U}
97109
*/
98-
function sort_entries_by_name(a, b) {
99-
if (a.name < b.name) return -1;
100-
if (a.name > b.name) return 1;
101-
return 0;
110+
function get_simple_cache(loader, cache = {}) {
111+
return key => {
112+
const cached = cache[key];
113+
if (cached) return cached;
114+
115+
const computed = loader(key);
116+
cache[key] = computed;
117+
return computed;
118+
};
119+
}
120+
121+
/**
122+
* sort_entries_by_name_with_cache generates a sorter which
123+
* sorts by UTF8 and caches the intermediate buffers generated
124+
* to avoid recomputation
125+
* @param {Record<string, Buffer>} [cache={}]
126+
* @returns {(a: fs.Dirent, b: fs.Dirent) => -1|0|1}
127+
*/
128+
function sort_entries_by_name_with_cache(cache = {}) {
129+
const get_cached_name = get_simple_cache(name => Buffer.from(name, 'utf8'), cache);
130+
131+
return function(a, b) {
132+
const a_name = get_cached_name(a.name);
133+
const b_name = get_cached_name(b.name);
134+
135+
return Buffer.compare(a_name, b_name);
136+
};
102137
}
103138

104139
function _get_version_id_by_stat({ino, mtimeNsBigint}) {
@@ -145,27 +180,33 @@ function _get_filename(file_name) {
145180
}
146181
return file_name;
147182
}
183+
148184
/**
149-
* @param {fs.Dirent} first_entry
150-
* @param {fs.Dirent} second_entry
151-
* @returns {Number}
185+
* sort_entries_by_name_and_time_with_cache generates a sorter which sorts
186+
* items but UTF8 encoding of the name and in case of conflict uses mtime
187+
* to resolve it.
188+
* @param {Record<string, Buffer>} [cache={}]
189+
* @returns {(first_entry: fs.Dirent, second_entry: fs.Dirent) => -1|0|1}
152190
*/
153-
function sort_entries_by_name_and_time(first_entry, second_entry) {
154-
const first_entry_name = _get_filename(first_entry.name);
155-
const second_entry_name = _get_filename(second_entry.name);
156-
if (first_entry_name === second_entry_name) {
157-
const first_entry_mtime = _get_mtime_from_filename(first_entry.name);
158-
const second_entry_mtime = _get_mtime_from_filename(second_entry.name);
159-
// To sort the versions in the latest first order,
160-
// below logic is followed
161-
if (second_entry_mtime < first_entry_mtime) return -1;
162-
if (second_entry_mtime > first_entry_mtime) return 1;
163-
return 0;
164-
} else {
165-
if (first_entry_name < second_entry_name) return -1;
166-
if (first_entry_name > second_entry_name) return 1;
167-
return 0;
168-
}
191+
function sort_entries_by_name_and_time_with_cache(cache = {}) {
192+
const _get_filename_with_cache = get_simple_cache(name => Buffer.from(_get_filename(name), 'utf8'), cache);
193+
return function(first_entry, second_entry) {
194+
const first_entry_name = _get_filename_with_cache(first_entry.name);
195+
const second_entry_name = _get_filename_with_cache(second_entry.name);
196+
197+
const compare_result = Buffer.compare(first_entry_name, second_entry_name);
198+
if (compare_result === 0) {
199+
const first_entry_mtime = _get_mtime_from_filename(first_entry.name);
200+
const second_entry_mtime = _get_mtime_from_filename(second_entry.name);
201+
// To sort the versions in the latest first order,
202+
// below logic is followed
203+
if (second_entry_mtime < first_entry_mtime) return -1;
204+
if (second_entry_mtime > first_entry_mtime) return 1;
205+
return 0;
206+
}
207+
208+
return compare_result;
209+
};
169210
}
170211

171212
// This is helper function for list object version
@@ -250,14 +291,6 @@ function is_sparse_file(stat) {
250291
return (stat.blocks * 512 < stat.size);
251292
}
252293

253-
/**
254-
* @param {fs.Dirent} e
255-
* @returns {string}
256-
*/
257-
function get_entry_name(e) {
258-
return e.name;
259-
}
260-
261294
/**
262295
* @param {string} name
263296
* @returns {fs.Dirent}
@@ -307,14 +340,14 @@ function to_fs_xattr(xattr) {
307340
const dir_cache = new LRUCache({
308341
name: 'nsfs-dir-cache',
309342
make_key: ({ dir_path }) => dir_path,
310-
load: async ({ dir_path, fs_context }) => {
343+
load: async ({ dir_path, fs_context, dirent_name_cache = {} }) => {
311344
const time = Date.now();
312345
const stat = await nb_native().fs.stat(fs_context, dir_path);
313346
let sorted_entries;
314347
let usage = config.NSFS_DIR_CACHE_MIN_DIR_SIZE;
315348
if (stat.size <= config.NSFS_DIR_CACHE_MAX_DIR_SIZE) {
316349
sorted_entries = await nb_native().fs.readdir(fs_context, dir_path);
317-
sorted_entries.sort(sort_entries_by_name);
350+
sorted_entries.sort(sort_entries_by_name_with_cache(dirent_name_cache));
318351
for (const ent of sorted_entries) {
319352
usage += ent.name.length + 4;
320353
}
@@ -342,7 +375,7 @@ const dir_cache = new LRUCache({
342375
const versions_dir_cache = new LRUCache({
343376
name: 'nsfs-versions-dir-cache',
344377
make_key: ({ dir_path }) => dir_path,
345-
load: async ({ dir_path, fs_context }) => {
378+
load: async ({ dir_path, fs_context, dirent_name_cache = {} }) => {
346379
const time = Date.now();
347380
const stat = await nb_native().fs.stat(fs_context, dir_path);
348381
const version_path = dir_path + "/" + HIDDEN_VERSIONS_PATH;
@@ -376,7 +409,7 @@ const versions_dir_cache = new LRUCache({
376409
old_versions_after_rename
377410
} = await _rename_null_version(old_versions, fs_context, version_path);
378411
const entries = latest_versions.concat(old_versions_after_rename);
379-
sorted_entries = entries.sort(sort_entries_by_name_and_time);
412+
sorted_entries = entries.sort(sort_entries_by_name_and_time_with_cache(dirent_name_cache));
380413
// rename back version to include 'null' suffix.
381414
if (renamed_null_versions_set.size > 0) {
382415
for (const ent of sorted_entries) {
@@ -388,7 +421,7 @@ const versions_dir_cache = new LRUCache({
388421
}
389422
}
390423
} else {
391-
sorted_entries = latest_versions.sort(sort_entries_by_name);
424+
sorted_entries = latest_versions.sort(sort_entries_by_name_with_cache(dirent_name_cache));
392425
}
393426
/*eslint no-unused-expressions: ["error", { "allowTernary": true }]*/
394427
for (const ent of sorted_entries) {
@@ -644,6 +677,10 @@ class NamespaceFS {
644677
/** @type {Result[]} */
645678
const results = [];
646679

680+
const _dirent_name_cache = {};
681+
/** @type {(key: string) => Buffer} */
682+
const dirent_name_cache = get_simple_cache(key => Buffer.from(key, 'utf8'), _dirent_name_cache);
683+
647684
/**
648685
* @param {string} dir_key
649686
* @returns {Promise<void>}
@@ -652,6 +689,7 @@ class NamespaceFS {
652689
if (this._is_hidden_version_path(dir_key)) {
653690
return;
654691
}
692+
655693
// /** @type {fs.Dir} */
656694
let dir_handle;
657695
/** @type {ReaddirCacheItem} */
@@ -685,9 +723,11 @@ class NamespaceFS {
685723
// Since versions are arranged next to latest object in the latest first order,
686724
// no need to find the sorted last index. Push the ".versions/#VERSION_OBJECT" as
687725
// they are in order
688-
if (results.length && r.key < results[results.length - 1].key &&
726+
727+
const r_key_buf = dirent_name_cache(r.key);
728+
if (results.length && Buffer.compare(r_key_buf, dirent_name_cache(results[results.length - 1].key)) === -1 &&
689729
!this._is_hidden_version_path(r.key)) {
690-
pos = _.sortedLastIndexBy(results, r, a => a.key);
730+
pos = js_utils.sortedLastIndexBy(results, curr => Buffer.compare(dirent_name_cache(curr.key), r_key_buf) === -1);
691731
} else {
692732
pos = results.length;
693733
}
@@ -717,7 +757,7 @@ class NamespaceFS {
717757
const process_entry = async ent => {
718758
// dbg.log0('process_entry', dir_key, ent.name);
719759
if ((!ent.name.startsWith(prefix_ent) ||
720-
ent.name < marker_curr ||
760+
Buffer.compare(dirent_name_cache(ent.name), dirent_name_cache(marker_curr)) === -1 ||
721761
ent.name === this.get_bucket_tmpdir_name() ||
722762
ent.name === config.NSFS_FOLDER_OBJECT_NAME) ||
723763
this._is_hidden_version_path(ent.name)) {
@@ -745,9 +785,17 @@ class NamespaceFS {
745785
if (!(await this.check_access(fs_context, dir_path))) return;
746786
try {
747787
if (list_versions) {
748-
cached_dir = await versions_dir_cache.get_with_cache({ dir_path, fs_context });
788+
cached_dir = await versions_dir_cache.get_with_cache({
789+
dir_path,
790+
fs_context,
791+
dirent_name_cache: _dirent_name_cache,
792+
});
749793
} else {
750-
cached_dir = await dir_cache.get_with_cache({ dir_path, fs_context });
794+
cached_dir = await dir_cache.get_with_cache({
795+
dir_path,
796+
fs_context,
797+
dirent_name_cache: _dirent_name_cache,
798+
});
751799
}
752800
} catch (err) {
753801
if (['ENOENT', 'ENOTDIR'].includes(err.code)) {
@@ -783,11 +831,13 @@ class NamespaceFS {
783831
{name: start_marker}
784832
) + 1;
785833
} else {
786-
marker_index = _.sortedLastIndexBy(
787-
sorted_entries,
788-
make_named_dirent(marker_curr),
789-
get_entry_name
790-
);
834+
const marker_curr_buf = dirent_name_cache(make_named_dirent(marker_curr).name);
835+
836+
marker_index = js_utils.sortedLastIndexBy(sorted_entries, curr => {
837+
const curr_cache_buf = dirent_name_cache(curr.name);
838+
const compare_res = Buffer.compare(curr_cache_buf, marker_curr_buf);
839+
return compare_res === -1 || compare_res === 0;
840+
});
791841
}
792842

793843
// handling a scenario in which key_marker points to an object inside a directory
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
/* Copyright (C) 2024 NooBaa */
2+
'use strict';
3+
4+
const { sortedLastIndexBy } = require("../../../util/js_utils");
5+
6+
describe('test js_utils.js', () => {
7+
describe('sortedLastIndexBy', () => {
8+
it('should correctly find position for number array', () => {
9+
const test_table = [{
10+
array: [1, 3, 4, 5],
11+
target: 0,
12+
expected_position: 0,
13+
}, {
14+
array: [1, 3, 4, 5],
15+
target: 2,
16+
expected_position: 1,
17+
}, {
18+
array: [1, 3, 4, 5],
19+
target: 6,
20+
expected_position: 4,
21+
}];
22+
23+
for (const entry of test_table) {
24+
expect(sortedLastIndexBy(entry.array, curr => curr < entry.target)).toBe(entry.expected_position);
25+
}
26+
});
27+
28+
it('should correctly find position for string array', () => {
29+
const test_table = [{
30+
array: ["a", "b", "c", "d"],
31+
target: "A",
32+
expected_position: 0,
33+
}, {
34+
array: ["a", "b", "d", "e"],
35+
target: "c",
36+
expected_position: 2,
37+
}, {
38+
array: ["a", "b", "c", "d"],
39+
target: "z",
40+
expected_position: 4,
41+
}];
42+
43+
for (const entry of test_table) {
44+
expect(sortedLastIndexBy(entry.array, curr => curr < entry.target)).toBe(entry.expected_position);
45+
}
46+
});
47+
48+
it('should correctly find position for utf8 string (buffer) array', () => {
49+
// Editors might not render characters in this array properly but that's not a mistake,
50+
// it's intentional to use such characters - sourced from:
51+
// https://forum.moonwalkinc.com/t/determining-s3-listing-order/116
52+
const array = ["file_ꦏ_1.txt", "file__2.txt", "file__3.txt", "file_𐎣_4.txt"];
53+
const array_of_bufs = array.map(item => Buffer.from(item, 'utf8'));
54+
55+
const test_table = [{
56+
array: array_of_bufs,
57+
target: array_of_bufs[0],
58+
expected_position: 0,
59+
}, {
60+
array: array_of_bufs,
61+
target: array_of_bufs[2],
62+
expected_position: 2,
63+
}, {
64+
array: array_of_bufs,
65+
target: array_of_bufs[3],
66+
expected_position: 3,
67+
}, {
68+
array: array_of_bufs,
69+
target: Buffer.from("file_𐎣_40.txt", 'utf8'),
70+
expected_position: 4,
71+
}];
72+
73+
for (const entry of test_table) {
74+
expect(sortedLastIndexBy(entry.array, curr => curr.compare(entry.target) === -1)).toBe(entry.expected_position);
75+
}
76+
});
77+
});
78+
});
79+

src/test/unit_tests/test_ns_list_objects.js

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,55 @@ function test_ns_list_objects(ns, object_sdk, bucket) {
186186

187187
});
188188

189+
mocha.describe('utf8 specific basic tests', function() {
190+
// source of keys: https://forum.moonwalkinc.com/t/determining-s3-listing-order/116
191+
const strange_utf8_keys = ["file_ꦏ_1.txt", "file__2.txt", "file__3.txt", "file_𐎣_4.txt"];
192+
193+
mocha.before(async function() {
194+
await create_keys([
195+
...strange_utf8_keys,
196+
]);
197+
});
198+
mocha.after(async function() {
199+
await delete_keys([
200+
...strange_utf8_keys,
201+
]);
202+
});
203+
204+
mocha.it('verify byte-by-byte order sort for utf8 encoded keys', async function() {
205+
const r = await ns.list_objects({ bucket }, object_sdk);
206+
assert.deepStrictEqual(r.is_truncated, false);
207+
assert.deepStrictEqual(r.common_prefixes, []);
208+
assert.deepStrictEqual(r.objects.map(it => it.key), strange_utf8_keys);
209+
});
210+
211+
mocha.it('verify byte-by-byte order sort for utf8 encoded keys with markers', async function() {
212+
const test_table = [
213+
{
214+
limit: 1,
215+
},
216+
{
217+
limit: 2,
218+
},
219+
{
220+
limit: 3,
221+
},
222+
{
223+
limit: 4,
224+
},
225+
{
226+
limit: 5,
227+
}
228+
];
229+
for (const test of test_table) {
230+
const r = await truncated_listing({ bucket, limit: test.limit });
231+
assert.deepStrictEqual(r.is_truncated, false);
232+
assert.deepStrictEqual(r.common_prefixes, []);
233+
assert.deepStrictEqual(r.objects.map(it => it.key), strange_utf8_keys);
234+
}
235+
});
236+
});
237+
189238
mocha.describe('list objects - dirs', function() {
190239

191240
this.timeout(10 * 60 * 1000); // eslint-disable-line no-invalid-this

0 commit comments

Comments
 (0)