Skip to content

Commit 041ec57

Browse files
authored
s3 listing strategy has been fixed (#9499)
1 parent 32809cc commit 041ec57

File tree

4 files changed

+103
-11
lines changed

4 files changed

+103
-11
lines changed

ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@ IOutputStream& operator<<(IOutputStream& stream, const TS3ListingOptions& option
2727

2828
namespace {
2929

30+
// Derives the "base" part of an S3 path: whatever TStringBuf::RBefore('/')
// produces for it, or an empty string when that call left the input
// unchanged.
// NOTE(review): RBefore('/') presumably returns the text preceding the last
// '/' and the whole input when no '/' is present — confirm against TStringBuf.
TString ParseBasePath(const TString& path) {
    const TStringBuf candidate = TStringBuf{path}.RBefore('/');

    // The candidate being identical to the input (and not ending with '/')
    // is treated as "no base path".
    if (candidate == path && !candidate.EndsWith('/')) {
        return TString{};
    }
    return TString{candidate};
}
34+
3035
using namespace NThreading;
3136
using namespace NS3Lister;
3237

@@ -497,15 +502,10 @@ class TBFSDirectoryResolverIterator : public IS3Lister {
497502
return NextDirectoryListeningChunk;
498503
}
499504

500-
static TString ParseBasePath(const TString& path) {
501-
TString basePath = TString{TStringBuf{path}.RBefore('/')};
502-
return basePath == path && !basePath.EndsWith('/') ? TString{} : basePath;
503-
}
504-
505505
void PerformEarlyStop(TListEntries& result, const TString& sourcePrefix) {
506506
result.Directories.push_back({.Path = ParseBasePath(sourcePrefix)});
507507
for (auto& directoryPrefix : DirectoryPrefixQueue) {
508-
result.Directories.push_back({.Path = directoryPrefix});
508+
result.Directories.push_back({.Path = ParseBasePath(directoryPrefix)});
509509
}
510510
DirectoryPrefixQueue.clear();
511511
}
@@ -524,10 +524,10 @@ class TBFSDirectoryResolverIterator : public IS3Lister {
524524
}
525525
} else {
526526
for (auto& directoryPrefix : listingResult.Directories) {
527-
result.Directories.push_back({.Path = directoryPrefix.Path});
527+
result.Directories.push_back({.Path = ParseBasePath(directoryPrefix.Path)});
528528
}
529529
for (auto& directoryPrefix : DirectoryPrefixQueue) {
530-
result.Directories.push_back({.Path = directoryPrefix});
530+
result.Directories.push_back({.Path = ParseBasePath(directoryPrefix)});
531531
}
532532
DirectoryPrefixQueue.clear();
533533
}
@@ -775,10 +775,10 @@ class TConcurrentBFSDirectoryResolverIterator : public IS3Lister {
775775
// TODO: add verification
776776
auto result = TListEntries{.Objects = Objects, .ListedObjectSize = ListedObjectSize};
777777
for (auto& directoryPrefix : DirectoryPrefixQueue) {
778-
result.Directories.push_back({.Path = directoryPrefix});
778+
result.Directories.push_back({.Path = ParseBasePath(directoryPrefix)});
779779
}
780780
for (auto& directoryPrefix: InProgressPaths) {
781-
result.Directories.push_back({.Path = directoryPrefix});
781+
result.Directories.push_back({.Path = ParseBasePath(directoryPrefix)});
782782
}
783783
for (auto& directoryEntry : Directories) {
784784
result.Directories.push_back(directoryEntry);

ydb/tests/fq/s3/conftest.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,10 @@
88

99
from ydb.tests.tools.fq_runner.fq_client import FederatedQueryClient
1010
from ydb.tests.tools.fq_runner.custom_hooks import * # noqa: F401,F403 Adding custom hooks for YQv2 support
11-
from ydb.tests.tools.fq_runner.kikimr_utils import AddInflightExtension
11+
from ydb.tests.tools.fq_runner.kikimr_utils import AddAllowConcurrentListingsExtension
1212
from ydb.tests.tools.fq_runner.kikimr_utils import AddDataInflightExtension
1313
from ydb.tests.tools.fq_runner.kikimr_utils import AddFormatSizeLimitExtension
14+
from ydb.tests.tools.fq_runner.kikimr_utils import AddInflightExtension
1415
from ydb.tests.tools.fq_runner.kikimr_utils import DefaultConfigExtension
1516
from ydb.tests.tools.fq_runner.kikimr_utils import YQv2Extension
1617
from ydb.tests.tools.fq_runner.kikimr_utils import ComputeExtension
@@ -89,6 +90,7 @@ def get_kikimr_extensions(s3: S3, yq_version: str, kikimr_settings, mvp_external
8990
return [
9091
AddFormatSizeLimitExtension(),
9192
AddInflightExtension(),
93+
AddAllowConcurrentListingsExtension(),
9294
AddDataInflightExtension(),
9395
DefaultConfigExtension(s3.s3_url),
9496
YQv2Extension(yq_version, kikimr_settings.get("is_replace_if_exists", False)),

ydb/tests/fq/s3/test_s3_1.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -557,3 +557,81 @@ def test_top_level_listing(self, kikimr, s3, client, runtime_listing, unique_pre
557557
assert result_set.rows[5].items[1].int32_value == 15
558558
assert result_set.rows[5].items[2].int32_value == 33
559559
assert sum(kikimr.control_plane.get_metering(1)) == 10
560+
561+
@yq_all
@pytest.mark.parametrize("client", [{"folder_id": "my_folder"}], indirect=True)
@pytest.mark.parametrize("runtime_listing", ["false", "true"])
@pytest.mark.parametrize("kikimr_params", [{"allow_concurrent_listings": True}], indirect=True)
def test_top_level_listing_2(self, kikimr, s3, client, runtime_listing, unique_prefix):
    """Query the wildcard path ``/2024-08-*`` with ``allow_concurrent_listings`` enabled.

    The same three-row CSV is uploaded under five keys (three at the bucket
    top level, two under the ``/a/`` and ``/b/`` prefixes). The query is
    expected to complete with exactly six rows, i.e. two copies of the CSV
    body, and a metering sum of 10.
    """
    s3_connection = dict(endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key")

    # Start from an empty, publicly readable bucket.
    bucket = boto3.resource("s3", **s3_connection).Bucket("fbucket")
    bucket.create(ACL='public-read')
    bucket.objects.all().delete()

    s3_client = boto3.client("s3", **s3_connection)

    fruits = '''Fruit,Price,Weight
Banana,3,100
Apple,2,22
Pear,15,33'''
    object_keys = (
        '2024-08-09.csv',
        '2024-09-08.csv',
        '2024-08-08.csv',
        '/a/2024-08-08.csv',
        '/b/test.csv',
    )
    for object_key in object_keys:
        s3_client.put_object(Body=fruits, Bucket='fbucket', Key=object_key, ContentType='text/plain')

    kikimr.control_plane.wait_bootstrap(1)
    storage_connection_name = unique_prefix + "test_top_level_listing_2"
    client.create_storage_connection(storage_connection_name, "fbucket")

    sql = f'''
        pragma s3.UseRuntimeListing="{runtime_listing}";

        SELECT *
        FROM `{storage_connection_name}`.`/2024-08-*`
        WITH (format=csv_with_names, SCHEMA (
            Fruit String NOT NULL,
            Price Int NOT NULL,
            Weight Int NOT NULL
        )
        );
        '''

    query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id
    client.wait_query_status(query_id, fq.QueryMeta.COMPLETED)

    result_set = client.get_result_data(query_id).result.result_set
    logging.debug(str(result_set))

    # Schema: three columns, in declaration order.
    expected_columns = (
        ("Fruit", ydb.Type.STRING),
        ("Price", ydb.Type.INT32),
        ("Weight", ydb.Type.INT32),
    )
    assert len(result_set.columns) == 3
    for column, (expected_name, expected_type) in zip(result_set.columns, expected_columns):
        assert column.name == expected_name
        assert column.type.type_id == expected_type

    # Data: the CSV body is expected twice, back to back.
    expected_rows = [
        (b"Banana", 3, 100),
        (b"Apple", 2, 22),
        (b"Pear", 15, 33),
    ] * 2
    assert len(result_set.rows) == 6
    for row, (fruit, price, weight) in zip(result_set.rows, expected_rows):
        assert row.items[0].bytes_value == fruit
        assert row.items[1].int32_value == price
        assert row.items[2].int32_value == weight

    assert sum(kikimr.control_plane.get_metering(1)) == 10

ydb/tests/tools/fq_runner/kikimr_utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,18 @@ def apply_to_kikimr(self, request, kikimr):
5050
del request.param["inflight"]
5151

5252

53+
class AddAllowConcurrentListingsExtension(ExtensionPoint):
    """Copies the ``allow_concurrent_listings`` request parameter into the S3 gateway config."""

    def is_applicable(self, request):
        """Return True when ``request.param`` is a dict that contains ``allow_concurrent_listings``."""
        params = getattr(request, 'param', None)
        return isinstance(params, dict) and "allow_concurrent_listings" in params

    def apply_to_kikimr(self, request, kikimr):
        """Store the flag on ``kikimr`` and in its compute-plane S3 gateway config, then remove it from ``request.param``."""
        enabled = request.param["allow_concurrent_listings"]
        kikimr.allow_concurrent_listings = enabled

        s3_gateway_config = kikimr.compute_plane.fq_config['gateways']['s3']
        s3_gateway_config['allow_concurrent_listings'] = kikimr.allow_concurrent_listings

        # The key is deleted last, after both assignments above have succeeded.
        del request.param["allow_concurrent_listings"]
63+
64+
5365
class AddDataInflightExtension(ExtensionPoint):
5466
def is_applicable(self, request):
5567
return (hasattr(request, 'param')

0 commit comments

Comments
 (0)