From 93ad20f67f3ebab786866050bd06359a5dc604e8 Mon Sep 17 00:00:00 2001 From: Grigoriy Pisarenko Date: Mon, 19 Aug 2024 15:40:21 +0000 Subject: [PATCH 1/8] Added wildcards validations --- ydb/core/external_sources/object_storage.cpp | 27 +++++++--- ydb/core/external_sources/object_storage.h | 2 +- .../external_sources/object_storage_ut.cpp | 27 ++++++++++ .../request_validators.h | 2 +- ydb/core/kqp/gateway/utils/scheme_helpers.cpp | 1 + .../s3/kqp_federated_query_ut.cpp | 54 +++++++++++++++++++ .../s3/kqp_federated_scheme_ut.cpp | 22 ++++++++ .../s3/object_listers/yql_s3_list.cpp | 1 + .../s3/object_listers/yql_s3_path.cpp | 31 +++++++++-- .../providers/s3/object_listers/yql_s3_path.h | 1 + .../s3/object_listers/yql_s3_path_ut.cpp | 22 ++++++++ .../s3/provider/yql_s3_io_discovery.cpp | 9 ++++ 12 files changed, 187 insertions(+), 12 deletions(-) diff --git a/ydb/core/external_sources/object_storage.cpp b/ydb/core/external_sources/object_storage.cpp index fc42c5479f58..c30cf44dbdc3 100644 --- a/ydb/core/external_sources/object_storage.cpp +++ b/ydb/core/external_sources/object_storage.cpp @@ -46,6 +46,7 @@ struct TObjectStorageExternalSource : public IExternalSource { virtual TString Pack(const NKikimrExternalSources::TSchema& schema, const NKikimrExternalSources::TGeneral& general) const override { + TString location; NKikimrExternalSources::TObjectStorage objectStorage; for (const auto& [key, value]: general.attributes()) { auto lowerKey = to_lower(key); @@ -62,12 +63,14 @@ struct TObjectStorageExternalSource : public IExternalSource { } } else if (IsIn({"file_pattern"sv, "data.interval.unit"sv, "data.datetime.format_name"sv, "data.datetime.format"sv, "data.timestamp.format_name"sv, "data.timestamp.format"sv, "csv_delimiter"sv}, lowerKey)) { objectStorage.mutable_format_setting()->insert({lowerKey, value}); + } else if (lowerKey == "location") { + location = value; } else { ythrow TExternalSourceException() << "Unknown attribute " << key; } } - if (auto issues = Validate(schema, objectStorage, PathsLimit)) { + if (auto issues = Validate(schema, objectStorage, PathsLimit, location)) { ythrow TExternalSourceException() << issues.ToString(); } @@ -136,9 +139,15 @@ struct TObjectStorageExternalSource : public IExternalSource { } template - static NYql::TIssues Validate(const TScheme& schema, const TObjectStorage& objectStorage, size_t pathsLimit) { + static NYql::TIssues Validate(const TScheme& schema, const TObjectStorage& objectStorage, size_t pathsLimit, const TString& location) { NYql::TIssues issues; - issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting())); + if (TString errorString; !NYql::NS3::ValidateWildcards(location, errorString)) { + issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains invalid wildcard: " << errorString)); + } + if (objectStorage.partitioned_by_size() && NYql::NS3::HasWildcards(location)) { + issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains wildcards")); + } + issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting(), location)); issues.AddIssues(ValidateRawFormat(objectStorage.format(), schema, objectStorage.partitioned_by())); if (objectStorage.projection_size() || objectStorage.partitioned_by_size()) { try { @@ -160,11 +169,17 @@ struct TObjectStorageExternalSource : public IExternalSource { return issues; } - static NYql::TIssues ValidateFormatSetting(const TString& format, const google::protobuf::Map& formatSetting) { + static NYql::TIssues ValidateFormatSetting(const TString& format, const google::protobuf::Map& formatSetting, const TString& location) { NYql::TIssues issues; issues.AddIssues(ValidateDateFormatSetting(formatSetting)); for (const auto& [key, value]: formatSetting) { if (key == "file_pattern"sv) { + if (TString errorString; !NYql::NS3::ValidateWildcards(value, errorString)) { + issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "File pattern '" << value << "' contains invalid wildcard: " << errorString)); + } + if (value && !location.EndsWith("/")) { + issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, "Path pattern cannot be used with file_pattern")); + } continue; } @@ -631,8 +646,8 @@ IExternalSource::TPtr CreateObjectStorageExternalSource(const std::vector(hostnamePatterns, actorSystem, pathsLimit, std::move(credentialsFactory), enableInfer, allowLocalFiles); } -NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit) { - return TObjectStorageExternalSource::Validate(schema, objectStorage, pathsLimit); +NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit, const TString& location) { + return TObjectStorageExternalSource::Validate(schema, objectStorage, pathsLimit, location); } NYql::TIssues ValidateDateFormatSetting(const google::protobuf::Map& formatSetting, bool matchAllSettings) { diff --git a/ydb/core/external_sources/object_storage.h b/ydb/core/external_sources/object_storage.h index d156b503734c..74de7a69eb87 100644 --- a/ydb/core/external_sources/object_storage.h +++ b/ydb/core/external_sources/object_storage.h @@ -16,7 +16,7 @@ IExternalSource::TPtr CreateObjectStorageExternalSource(const std::vector& formatSetting, bool matchAllSettings = false); diff --git a/ydb/core/external_sources/object_storage_ut.cpp b/ydb/core/external_sources/object_storage_ut.cpp index 26134476eff9..9c82d661b30a 100644 --- a/ydb/core/external_sources/object_storage_ut.cpp +++ b/ydb/core/external_sources/object_storage_ut.cpp @@ -1,6 +1,7 @@ #include "object_storage.h" #include +#include #include #include @@ -29,6 +30,32 @@ Y_UNIT_TEST_SUITE(ObjectStorageTest) { general.mutable_attributes()->insert({"projection.h", "b"}); UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Partition by must always be specified"); } + + Y_UNIT_TEST(WildcardsValidation) { + auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false, false); + NKikimrExternalSources::TSchema schema; + + { // location + NKikimrExternalSources::TGeneral general; + general.mutable_attributes()->insert({"location", "}"}); + UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '}' contains invalid wildcard:"); + } + + { // file pattern + NKikimrExternalSources::TGeneral general; + general.mutable_attributes()->insert({"file_pattern", "}"}); + UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "File pattern '}' contains invalid wildcard:"); + general.mutable_attributes()->insert({"location", "/test_file"}); + UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Path pattern cannot be used with file_pattern"); + } + + { // partitioned by + NKikimrExternalSources::TGeneral general; + general.mutable_attributes()->insert({"location", "*"}); + general.mutable_attributes()->insert({"partitioned_by", "[year]"}); + UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '*' contains wildcards"); + } + } } } // NKikimr diff --git a/ydb/core/fq/libs/control_plane_storage/request_validators.h b/ydb/core/fq/libs/control_plane_storage/request_validators.h index c17ac4d41a16..557a2c81a400 100644 --- a/ydb/core/fq/libs/control_plane_storage/request_validators.h +++ b/ydb/core/fq/libs/control_plane_storage/request_validators.h @@ -121,7 +121,7 @@ NYql::TIssues ValidateBinding(const T& ev, size_t maxSize, const TSetGetDriver()); UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Completed, readyOp.Status().GetIssues().ToString()); } + + Y_UNIT_TEST(TestWildcardValidation) { + const TString bucket = "test_bucket1"; + + CreateBucket(bucket); + + auto kikimr = NTestUtils::MakeKikimrRunner(); + + auto tc = kikimr->GetTableClient(); + auto session = tc.CreateSession().GetValueSync().GetSession(); + const TString query = fmt::format(R"( + CREATE EXTERNAL DATA SOURCE `/Root/external_data_source` WITH ( + SOURCE_TYPE="ObjectStorage", + LOCATION="{location}", + AUTH_METHOD="NONE" + );)", + "location"_a = GetBucketLocation(bucket) + ); + auto result = session.ExecuteSchemeQuery(query).GetValueSync(); + UNIT_ASSERT_C(result.GetStatus() == NYdb::EStatus::SUCCESS, result.GetIssues().ToString()); + + auto db = kikimr->GetQueryClient(); + + { // path validation + const TString sql = R"( + SELECT * FROM `/Root/external_data_source`.`/}` WITH ( + SCHEMA = (data String), + FORMAT = "csv_with_names" + ))"; + + auto scriptExecutionOperation = db.ExecuteScript(sql).ExtractValueSync(); + UNIT_ASSERT_VALUES_EQUAL_C(scriptExecutionOperation.Status().GetStatus(), EStatus::SUCCESS, scriptExecutionOperation.Status().GetIssues().ToString()); + + NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver()); + UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Failed, readyOp.Status().GetIssues().ToString()); + UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "Path '/}' contains invalid wildcard:"); + } + + { // file pattern validation + const TString sql = R"( + SELECT * FROM `/Root/external_data_source`.`/` WITH ( + SCHEMA = (data String), + FORMAT = "csv_with_names", + FILE_PATTERN = "}" + ))"; + + auto scriptExecutionOperation = db.ExecuteScript(sql).ExtractValueSync(); + UNIT_ASSERT_VALUES_EQUAL_C(scriptExecutionOperation.Status().GetStatus(), EStatus::SUCCESS, scriptExecutionOperation.Status().GetIssues().ToString()); + + NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver()); + UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Failed, readyOp.Status().GetIssues().ToString()); + UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "File pattern '}' contains invalid wildcard:"); + } + } } } // namespace NKikimr::NKqp diff --git a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp index 39e53fd71eef..a4751d3741a0 100644 --- a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp +++ b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp @@ -215,6 +215,28 @@ Y_UNIT_TEST_SUITE(KqpFederatedSchemeTest) { }; TestInvalidDropForExternalTableWithAuth(queryClientExecutor, "generic_query"); } + + Y_UNIT_TEST(ExternalTableDdlLocationValidation) { + auto kikimr = NTestUtils::MakeKikimrRunner(); + auto db = kikimr->GetTableClient(); + auto session = db.CreateSession().GetValueSync().GetSession(); + auto query = TStringBuilder() << R"( + CREATE EXTERNAL DATA SOURCE `/Root/ExternalDataSource` WITH ( + SOURCE_TYPE="ObjectStorage", + LOCATION="my-bucket", + AUTH_METHOD="NONE" + ); + CREATE EXTERNAL TABLE `/Root/ExternalTable` ( + Key Uint64, + Value String + ) WITH ( + DATA_SOURCE="/Root/ExternalDataSource", + LOCATION="}" + );)"; + auto result = session.ExecuteSchemeQuery(query).GetValueSync(); + UNIT_ASSERT_VALUES_EQUAL(result.GetStatus(), EStatus::SCHEME_ERROR); + UNIT_ASSERT_STRING_CONTAINS(result.GetIssues().ToString(), "Location '}' contains invalid wildcard:"); + } } } // namespace NKikimr::NKqp diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp index be5fd6134c2e..f62c12ea1d4c 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp @@ -47,6 +47,7 @@ std::pair MakeFilterRegexp(const TString& regex, } else { re = std::make_shared(re2::StringPiece(regex), RE2::Options()); } + Y_ENSURE(re->ok()); const size_t numGroups = re->NumberOfCapturingGroups(); YQL_CLOG(DEBUG, ProviderS3) diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp b/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp index e746aeeddec3..2912c09174cd 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp @@ -47,22 +47,23 @@ TString RegexFromWildcards(const std::string_view& pattern) { const auto& escaped = EscapeRegex(pattern); TStringBuilder result; bool slash = false; - bool group = false; + size_t groupCount = 0; for (const char& c : escaped) { switch (c) { case '{': result << "(?:"; - group = true; + groupCount++; slash = false; break; case '}': + Y_ENSURE(groupCount, "Unexpected group end"); result << ')'; - group = false; + groupCount--; slash = false; break; case ',': - if (group) + if (groupCount) result << '|'; else result << "\\,"; @@ -89,7 +90,29 @@ TString RegexFromWildcards(const std::string_view& pattern) { break; } } + Y_ENSURE(!groupCount, "Found " << groupCount << " unterminated groups"); + Y_ENSURE(!slash, "Expected symbol after slash"); return result; } +bool ValidateWildcards(const std::string_view& pattern, TString& errorString) { + size_t groupCount = 0; + for (size_t i = 0; i < pattern.size(); ++i) { + if (pattern[i] == '{') { + groupCount++; + } else if (pattern[i] == '}') { + if (!groupCount) { + errorString = TStringBuilder() << "found unexpected group end at position " << i; + return false; + } + groupCount--; + } + } + if (groupCount) { + errorString = TStringBuilder() << "found " << groupCount << " unterminated groups"; + return false; + } + return true; +} + } diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_path.h b/ydb/library/yql/providers/s3/object_listers/yql_s3_path.h index c39f476f8893..1e57e6867093 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_path.h +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_path.h @@ -19,5 +19,6 @@ TString EscapeRegex(const TString& str); TString EscapeRegex(const std::string_view& str); TString RegexFromWildcards(const std::string_view& pattern); +bool ValidateWildcards(const std::string_view& pattern, TString& errorString); } diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp b/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp index 1b452a268364..f078b099c7d0 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp @@ -28,6 +28,28 @@ Y_UNIT_TEST_SUITE(TPathTests) { UNIT_ASSERT_VALUES_EQUAL(NormalizePath("/a/b/c/"), "a/b/c/"); UNIT_ASSERT_VALUES_EQUAL(NormalizePath("///a/b/c///"), "a/b/c/"); } + + void TestRegexFromWildcardsSuccess(const TString& wildcards, const TString& expectedRegex) { + TString errorString; + UNIT_ASSERT_C(ValidateWildcards(wildcards, errorString), errorString); + UNIT_ASSERT_VALUES_EQUAL(RegexFromWildcards(wildcards), expectedRegex); + } + + void TestRegexFromWildcardsFail(const TString& wildcards, const TString& expectedException, const TString& expectedError) { + TString errorString; + UNIT_ASSERT(!ValidateWildcards(wildcards, errorString)); + UNIT_ASSERT_STRING_CONTAINS(errorString, expectedError); + UNIT_ASSERT_EXCEPTION_CONTAINS(RegexFromWildcards(wildcards), yexception, expectedException); + } + + Y_UNIT_TEST(TestRegexFromWildcards) { + TestRegexFromWildcardsSuccess("first,test\\_{alt1,alt2}_text", "first\\,test\\\\_(?:alt1|alt2)_text"); + TestRegexFromWildcardsSuccess("hello.*world?str", "hello\\..*world.str"); + TestRegexFromWildcardsSuccess("many_{{alt1,alt2},{alt3,{alt4}},alt5,{}}_alts", "many_(?:(?:alt1|alt2)|(?:alt3|(?:alt4))|alt5|(?:))_alts"); + + TestRegexFromWildcardsFail("hello{}}world", "Unexpected group end", "found unexpected group end at position 7"); + TestRegexFromWildcardsFail("hello{{{}world", "Found 2 unterminated groups", "found 2 unterminated groups"); + } } } diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp index 2c1ee3313622..f35c812f2149 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp @@ -732,6 +732,10 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase { if (!FindFilePattern(settings, ctx, filePattern)) { return false; } + if (TString errorString; !NS3::ValidateWildcards(filePattern, errorString)) { + ctx.AddError(TIssue(ctx.GetPosition(read.Pos()), TStringBuilder() << "File pattern '" << filePattern << "' contains invalid wildcard: " << errorString)); + return false; + } const TString effectiveFilePattern = filePattern ? filePattern : "*"; TVector paths; @@ -763,6 +767,11 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase { } for (const auto& path : paths) { + if (TString errorString; !NS3::ValidateWildcards(path, errorString)) { + ctx.AddError(TIssue(ctx.GetPosition(read.Pos()), TStringBuilder() << "Path '" << path << "' contains invalid wildcard: " << errorString)); + return false; + } + // each path in CONCAT() can generate multiple list requests for explicit partitioning TVector reqs; From 98cb9ca707a0a33f6b46a8f2889d0edfbaa1f99f Mon Sep 17 00:00:00 2001 From: Grigoriy Pisarenko Date: Mon, 19 Aug 2024 16:07:53 +0000 Subject: [PATCH 2/8] Fixed location passing --- ydb/core/external_sources/object_storage.cpp | 5 +---- ydb/core/external_sources/object_storage_ut.cpp | 6 +++--- ydb/core/kqp/gateway/utils/scheme_helpers.cpp | 2 +- ydb/core/protos/external_sources.proto | 1 + 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/ydb/core/external_sources/object_storage.cpp b/ydb/core/external_sources/object_storage.cpp index c30cf44dbdc3..a8532830c993 100644 --- a/ydb/core/external_sources/object_storage.cpp +++ b/ydb/core/external_sources/object_storage.cpp @@ -46,7 +46,6 @@ struct TObjectStorageExternalSource : public IExternalSource { virtual TString Pack(const NKikimrExternalSources::TSchema& schema, const NKikimrExternalSources::TGeneral& general) const override { - TString location; NKikimrExternalSources::TObjectStorage objectStorage; for (const auto& [key, value]: general.attributes()) { auto lowerKey = to_lower(key); @@ -63,14 +62,12 @@ struct TObjectStorageExternalSource : public IExternalSource { } } else if (IsIn({"file_pattern"sv, "data.interval.unit"sv, "data.datetime.format_name"sv, "data.datetime.format"sv, "data.timestamp.format_name"sv, "data.timestamp.format"sv, "csv_delimiter"sv}, lowerKey)) { objectStorage.mutable_format_setting()->insert({lowerKey, value}); - } else if (lowerKey == "location") { - location = value; } else { ythrow TExternalSourceException() << "Unknown attribute " << key; } } - if (auto issues = Validate(schema, objectStorage, PathsLimit, location)) { + if (auto issues = Validate(schema, objectStorage, PathsLimit, general.location())) { ythrow TExternalSourceException() << issues.ToString(); } diff --git a/ydb/core/external_sources/object_storage_ut.cpp b/ydb/core/external_sources/object_storage_ut.cpp index 9c82d661b30a..26b8d36058a3 100644 --- a/ydb/core/external_sources/object_storage_ut.cpp +++ b/ydb/core/external_sources/object_storage_ut.cpp @@ -37,7 +37,7 @@ Y_UNIT_TEST_SUITE(ObjectStorageTest) { { // location NKikimrExternalSources::TGeneral general; - general.mutable_attributes()->insert({"location", "}"}); + general.set_location("}"); UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '}' contains invalid wildcard:"); } @@ -45,13 +45,13 @@ Y_UNIT_TEST_SUITE(ObjectStorageTest) { NKikimrExternalSources::TGeneral general; general.mutable_attributes()->insert({"file_pattern", "}"}); UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "File pattern '}' contains invalid wildcard:"); - general.mutable_attributes()->insert({"location", "/test_file"}); + general.set_location("/test_file"); UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Path pattern cannot be used with file_pattern"); } { // partitioned by NKikimrExternalSources::TGeneral general; - general.mutable_attributes()->insert({"location", "*"}); + general.set_location("*"); general.mutable_attributes()->insert({"partitioned_by", "[year]"}); UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '*' contains wildcards"); } diff --git a/ydb/core/kqp/gateway/utils/scheme_helpers.cpp b/ydb/core/kqp/gateway/utils/scheme_helpers.cpp index e4b995fa971b..fd558af336e4 100644 --- a/ydb/core/kqp/gateway/utils/scheme_helpers.cpp +++ b/ydb/core/kqp/gateway/utils/scheme_helpers.cpp @@ -88,11 +88,11 @@ void FillCreateExternalTableColumnDesc(NKikimrSchemeOp::TExternalTableDescriptio columnDesc.SetNotNull(columnIt->second.NotNull); } NKikimrExternalSources::TGeneral general; + general.set_location(settings.Location); auto& attributes = *general.mutable_attributes(); for (const auto& [key, value]: settings.SourceTypeParameters) { attributes.insert({key, value}); } - attributes.insert({"location", settings.Location}); externalTableDesc.SetContent(general.SerializeAsString()); } diff --git a/ydb/core/protos/external_sources.proto b/ydb/core/protos/external_sources.proto index 2115da12de95..9f01d56e7120 100644 --- a/ydb/core/protos/external_sources.proto +++ b/ydb/core/protos/external_sources.proto @@ -11,6 +11,7 @@ message TSchema { message TGeneral { map attributes = 1 [(Ydb.size).le = 100]; + optional string location = 2; } message TObjectStorage { From 001b74b8cbf636c57a8d42207bd271ab42c13169 Mon Sep 17 00:00:00 2001 From: Grigoriy Pisarenko Date: Mon, 19 Aug 2024 16:10:08 +0000 Subject: [PATCH 3/8] Removed unused include --- ydb/core/external_sources/object_storage_ut.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ydb/core/external_sources/object_storage_ut.cpp b/ydb/core/external_sources/object_storage_ut.cpp index 26b8d36058a3..b75df34c7fa1 100644 --- a/ydb/core/external_sources/object_storage_ut.cpp +++ b/ydb/core/external_sources/object_storage_ut.cpp @@ -1,7 +1,6 @@ #include "object_storage.h" #include -#include #include #include From fc7c07ccc7440dda3dcac751eeb233ce2b3c08c7 Mon Sep 17 00:00:00 2001 From: Grigoriy Pisarenko Date: Mon, 19 Aug 2024 16:23:31 +0000 Subject: [PATCH 4/8] Fixed unit test --- .../kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp index 9f7b9f39e4c4..2ea956b663d0 100644 --- a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp +++ b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp @@ -35,7 +35,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) { const TString externalDataSourceName = "/Root/external_data_source"; const TString externalTableName = "/Root/test_binding_resolve"; const TString bucket = "test_bucket1"; - const TString object = TStringBuilder() << "test_" << GetSymbolsString(' ', '~', "{}") << "_object"; + const TString object = TStringBuilder() << "test_" << GetSymbolsString(' ', '~', "*?{}") << "_object"; CreateBucketWithObject(bucket, object, TEST_CONTENT); @@ -2143,7 +2143,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) { Y_UNIT_TEST(TestReadEmptyFileWithCsvFormat) { const TString externalDataSourceName = "/Root/external_data_source"; - const TString bucket = "test_bucket1"; + const TString bucket = "test_bucket12"; CreateBucketWithObject(bucket, "test_object", ""); @@ -2183,7 +2183,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) { } Y_UNIT_TEST(TestWildcardValidation) { - const TString bucket = "test_bucket1"; + const TString bucket = "test_bucket13"; CreateBucket(bucket); From 178bb8595c7f500f9f8fb01a34ae9d3cc6775a7b Mon Sep 17 00:00:00 2001 From: Grigoriy Pisarenko Date: Tue, 20 Aug 2024 04:35:08 +0000 Subject: [PATCH 5/8] Fixed path pattern validation --- ydb/core/external_sources/object_storage.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/ydb/core/external_sources/object_storage.cpp b/ydb/core/external_sources/object_storage.cpp index a8532830c993..fe7fdb87db30 100644 --- a/ydb/core/external_sources/object_storage.cpp +++ b/ydb/core/external_sources/object_storage.cpp @@ -141,12 +141,13 @@ struct TObjectStorageExternalSource : public IExternalSource { if (TString errorString; !NYql::NS3::ValidateWildcards(location, errorString)) { issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains invalid wildcard: " << errorString)); } - if (objectStorage.partitioned_by_size() && NYql::NS3::HasWildcards(location)) { - issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains wildcards")); - } - issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting(), location)); + const bool hasPartitioning = objectStorage.projection_size() || objectStorage.partitioned_by_size(); + issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting(), location, hasPartitioning)); issues.AddIssues(ValidateRawFormat(objectStorage.format(), schema, objectStorage.partitioned_by())); - if (objectStorage.projection_size() || objectStorage.partitioned_by_size()) { + if (hasPartitioning) { + if (NYql::NS3::HasWildcards(location)) { + issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains wildcards")); + } try { TVector partitionedBy{objectStorage.partitioned_by().begin(), objectStorage.partitioned_by().end()}; issues.AddIssues(ValidateProjectionColumns(schema, partitionedBy)); @@ -166,7 +167,7 @@ struct TObjectStorageExternalSource : public IExternalSource { return issues; } - static NYql::TIssues ValidateFormatSetting(const TString& format, const google::protobuf::Map& formatSetting, const TString& location) { + static NYql::TIssues ValidateFormatSetting(const TString& format, const google::protobuf::Map& formatSetting, const TString& location, bool hasPartitioning) { NYql::TIssues issues; issues.AddIssues(ValidateDateFormatSetting(formatSetting)); for (const auto& [key, value]: formatSetting) { @@ -174,7 +175,7 @@ struct TObjectStorageExternalSource : public IExternalSource { if (TString errorString; !NYql::NS3::ValidateWildcards(value, errorString)) { issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "File pattern '" << value << "' contains invalid wildcard: " << errorString)); } - if (value && !location.EndsWith("/")) { + if (value && !hasPartitioning && !location.EndsWith("/")) { issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, "Path pattern cannot be used with file_pattern")); } continue; From 99e713227ccaada54baa2f62e50bed915a0d0a33 Mon Sep 17 00:00:00 2001 From: Grigoriy Pisarenko Date: Tue, 20 Aug 2024 04:56:10 +0000 Subject: [PATCH 6/8] Fixed unit test --- ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp b/ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp index 86377719e4bd..1241dd465275 100644 --- a/ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp +++ b/ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp @@ -5518,7 +5518,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) { month Int64 NOT NULL ) WITH ( DATA_SOURCE=")" << externalDataSourceName << R"(", - LOCATION="/folder1/*", + LOCATION="/folder1/", FORMAT="json_as_string", `projection.enabled`="true", `projection.year.type`="integer", @@ -5543,7 +5543,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) { UNIT_ASSERT(externalTable.ExternalTableInfo); UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.ColumnsSize(), 4); UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetDataSourcePath(), externalDataSourceName); - UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/*"); + UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/"); } Y_UNIT_TEST(CreateExternalTableWithUpperCaseSettings) { @@ -5566,7 +5566,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) { Month Int64 NOT NULL ) WITH ( DATA_SOURCE=")" << externalDataSourceName << R"(", - LOCATION="/folder1/*", + LOCATION="/folder1/", FORMAT="json_as_string", `projection.enabled`="true", `projection.Year.type`="integer", @@ -5591,7 +5591,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) { UNIT_ASSERT(externalTable.ExternalTableInfo); UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.ColumnsSize(), 4); UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetDataSourcePath(), externalDataSourceName); - UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/*"); + UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/"); } Y_UNIT_TEST(DoubleCreateExternalTable) { From d02fb489bf353122285e4e3c7678f74994b8f8df Mon Sep 17 00:00:00 2001 From: Grigoriy Pisarenko Date: Fri, 23 Aug 2024 07:21:32 +0000 Subject: [PATCH 7/8] Fixed issues --- ydb/core/external_sources/object_storage.cpp | 4 +-- .../s3/object_listers/yql_s3_path.cpp | 36 ++++++++++--------- .../providers/s3/object_listers/yql_s3_path.h | 2 +- .../s3/object_listers/yql_s3_path_ut.cpp | 13 ++++--- .../s3/provider/yql_s3_io_discovery.cpp | 4 +-- 5 files changed, 30 insertions(+), 29 deletions(-) diff --git a/ydb/core/external_sources/object_storage.cpp b/ydb/core/external_sources/object_storage.cpp index fe7fdb87db30..1dc86b169190 100644 --- a/ydb/core/external_sources/object_storage.cpp +++ b/ydb/core/external_sources/object_storage.cpp @@ -138,7 +138,7 @@ struct TObjectStorageExternalSource : public IExternalSource { template static NYql::TIssues Validate(const TScheme& schema, const TObjectStorage& objectStorage, size_t pathsLimit, const TString& location) { NYql::TIssues issues; - if (TString errorString; !NYql::NS3::ValidateWildcards(location, errorString)) { + if (TString errorString = NYql::NS3::ValidateWildcards(location)) { issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains invalid wildcard: " << errorString)); } const bool hasPartitioning = objectStorage.projection_size() || objectStorage.partitioned_by_size(); @@ -172,7 +172,7 @@ struct TObjectStorageExternalSource : public IExternalSource { issues.AddIssues(ValidateDateFormatSetting(formatSetting)); for (const auto& [key, value]: formatSetting) { if (key == "file_pattern"sv) { - if (TString errorString; !NYql::NS3::ValidateWildcards(value, errorString)) { + if (TString errorString = NYql::NS3::ValidateWildcards(value)) { issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "File pattern '" << value << "' contains invalid wildcard: " << errorString)); } if (value && !hasPartitioning && !location.EndsWith("/")) { diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp b/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp index 2912c09174cd..fa4d945508c6 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp @@ -47,23 +47,24 @@ TString RegexFromWildcards(const std::string_view& pattern) { const auto& escaped = EscapeRegex(pattern); TStringBuilder result; bool slash = false; - size_t groupCount = 0; + bool group = false; for (const char& c : escaped) { switch (c) { case '{': + Y_ENSURE(!group, "Found unterminated group"); result << "(?:"; - groupCount++; + group = true; slash = false; break; case '}': - Y_ENSURE(groupCount, "Unexpected group end"); + Y_ENSURE(group, "Unexpected group end"); result << ')'; - groupCount--; + group = false; slash = false; break; case ',': - if (groupCount) + if (group) result << '|'; else result << "\\,"; @@ -90,29 +91,30 @@ TString RegexFromWildcards(const std::string_view& pattern) { break; } } - Y_ENSURE(!groupCount, "Found " << groupCount << " unterminated groups"); + Y_ENSURE(!group, "Found unterminated group"); Y_ENSURE(!slash, "Expected symbol after slash"); return result; } -bool ValidateWildcards(const std::string_view& pattern, TString& errorString) { - size_t groupCount = 0; +TString ValidateWildcards(const std::string_view& pattern) { + std::optional groupStart = std::nullopt; for (size_t i = 0; i < pattern.size(); ++i) { if (pattern[i] == '{') { - groupCount++; + if (groupStart) { + return TStringBuilder() << "found unterminated group start at position " << *groupStart; + } + groupStart = i; } else if (pattern[i] == '}') { - if (!groupCount) { - errorString = TStringBuilder() << "found unexpected group end at position " << i; - return false; + if (!groupStart) { + return TStringBuilder() << "found unexpected group end at position " << i; } - groupCount--; + groupStart = std::nullopt; } } - if (groupCount) { - errorString = TStringBuilder() << "found " << groupCount << " unterminated groups"; - return false; + if (groupStart) { + return TStringBuilder() << "found unterminated group start at position " << *groupStart; } - return true; + return {}; } } diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_path.h b/ydb/library/yql/providers/s3/object_listers/yql_s3_path.h index 1e57e6867093..b5266607558f 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_path.h +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_path.h @@ -19,6 +19,6 @@ TString EscapeRegex(const TString& str); TString EscapeRegex(const std::string_view& str); TString RegexFromWildcards(const std::string_view& pattern); -bool ValidateWildcards(const std::string_view& pattern, TString& errorString); +TString ValidateWildcards(const std::string_view& pattern); } diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp b/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp index f078b099c7d0..355207a9aca1 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp @@ -30,25 +30,24 @@ Y_UNIT_TEST_SUITE(TPathTests) { } void TestRegexFromWildcardsSuccess(const TString& wildcards, const TString& expectedRegex) { - TString errorString; - UNIT_ASSERT_C(ValidateWildcards(wildcards, errorString), errorString); + TString errorString = ValidateWildcards(wildcards); + UNIT_ASSERT_C(errorString.empty(), errorString); UNIT_ASSERT_VALUES_EQUAL(RegexFromWildcards(wildcards), expectedRegex); } void TestRegexFromWildcardsFail(const TString& wildcards, const TString& expectedException, const TString& expectedError) { - TString errorString; - UNIT_ASSERT(!ValidateWildcards(wildcards, errorString)); - UNIT_ASSERT_STRING_CONTAINS(errorString, expectedError); + UNIT_ASSERT_STRING_CONTAINS(ValidateWildcards(wildcards), expectedError); UNIT_ASSERT_EXCEPTION_CONTAINS(RegexFromWildcards(wildcards), yexception, expectedException); } Y_UNIT_TEST(TestRegexFromWildcards) { TestRegexFromWildcardsSuccess("first,test\\_{alt1,alt2}_text", "first\\,test\\\\_(?:alt1|alt2)_text"); TestRegexFromWildcardsSuccess("hello.*world?str", "hello\\..*world.str"); - TestRegexFromWildcardsSuccess("many_{{alt1,alt2},{alt3,{alt4}},alt5,{}}_alts", "many_(?:(?:alt1|alt2)|(?:alt3|(?:alt4))|alt5|(?:))_alts"); + TestRegexFromWildcardsSuccess("many_{},{alt1,al?t2,al*t3},{alt4}_alts", "many_(?:)\\,(?:alt1|al.t2|al.*t3)\\,(?:alt4)_alts"); TestRegexFromWildcardsFail("hello{}}world", "Unexpected group end", "found unexpected group end at position 7"); - TestRegexFromWildcardsFail("hello{{{}world", "Found 2 unterminated groups", "found 2 unterminated groups"); + TestRegexFromWildcardsFail("hello{{{}world", "Found unterminated group", "found unterminated group start at position 5"); + TestRegexFromWildcardsFail("hello{},{{}}world", "Found unterminated group", "found unterminated group start at position 8"); } } diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp index f35c812f2149..436a116ecdf1 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp @@ -732,7 +732,7 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase { if (!FindFilePattern(settings, ctx, filePattern)) { return false; } - if (TString errorString; !NS3::ValidateWildcards(filePattern, errorString)) { + if (TString errorString = NS3::ValidateWildcards(filePattern)) { ctx.AddError(TIssue(ctx.GetPosition(read.Pos()), TStringBuilder() << "File pattern '" << filePattern << "' contains invalid wildcard: " << errorString)); return false; } @@ -767,7 +767,7 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase { } for (const auto& path : paths) { - if (TString errorString; !NS3::ValidateWildcards(path, errorString)) { + if (TString errorString = NS3::ValidateWildcards(path)) { ctx.AddError(TIssue(ctx.GetPosition(read.Pos()), TStringBuilder() << "Path '" << path << "' contains invalid wildcard: " << errorString)); return false; } From 364c3e25f2c9f0ca7eaafebc6ec4b74acfb8a78c Mon Sep 17 00:00:00 2001 From: Grigoriy Pisarenko Date: Fri, 23 Aug 2024 11:27:23 +0000 Subject: [PATCH 8/8] Fixed issues 2 --- .../external_sources/object_storage_ut.cpp | 8 +++--- .../s3/kqp_federated_query_ut.cpp | 8 +++--- .../s3/kqp_federated_scheme_ut.cpp | 4 +-- .../s3/object_listers/yql_s3_path.cpp | 28 +++++++++---------- .../s3/object_listers/yql_s3_path_ut.cpp | 6 ++-- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/ydb/core/external_sources/object_storage_ut.cpp b/ydb/core/external_sources/object_storage_ut.cpp index b75df34c7fa1..23fcc0e214a6 100644 --- a/ydb/core/external_sources/object_storage_ut.cpp +++ b/ydb/core/external_sources/object_storage_ut.cpp @@ -36,14 +36,14 @@ Y_UNIT_TEST_SUITE(ObjectStorageTest) { { // location NKikimrExternalSources::TGeneral general; - general.set_location("}"); - UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '}' contains invalid wildcard:"); + general.set_location("{"); + UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '{' contains invalid wildcard:"); } { // file pattern NKikimrExternalSources::TGeneral general; - general.mutable_attributes()->insert({"file_pattern", "}"}); - UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "File pattern '}' contains invalid wildcard:"); + general.mutable_attributes()->insert({"file_pattern", "{"}); + UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "File pattern '{' contains invalid wildcard:"); general.set_location("/test_file"); UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Path pattern cannot be used with file_pattern"); } diff --git a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp index 2ea956b663d0..cc5ec37a0215 100644 --- a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp +++ b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp @@ -2206,7 +2206,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) { { // path validation const TString sql = R"( - SELECT * FROM `/Root/external_data_source`.`/}` WITH ( + SELECT * FROM `/Root/external_data_source`.`/{` WITH ( SCHEMA = (data String), FORMAT = "csv_with_names" ))"; @@ -2216,7 +2216,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) { NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver()); UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Failed, readyOp.Status().GetIssues().ToString()); - UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "Path '/}' contains invalid wildcard:"); + UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "Path '/{' contains invalid wildcard:"); } { // file pattern validation @@ -2224,7 +2224,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) { SELECT * FROM `/Root/external_data_source`.`/` WITH ( SCHEMA = (data String), FORMAT = "csv_with_names", - FILE_PATTERN = "}" + FILE_PATTERN = "{" ))"; auto scriptExecutionOperation = db.ExecuteScript(sql).ExtractValueSync(); @@ -2232,7 +2232,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) { NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver()); UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Failed, readyOp.Status().GetIssues().ToString()); - UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "File pattern '}' contains invalid wildcard:"); + UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "File pattern '{' contains invalid wildcard:"); } } } diff --git a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp index a4751d3741a0..57f1694e1ea4 100644 --- a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp +++ b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp @@ -231,11 +231,11 @@ Y_UNIT_TEST_SUITE(KqpFederatedSchemeTest) { Value String ) WITH ( DATA_SOURCE="/Root/ExternalDataSource", - LOCATION="}" + LOCATION="{" );)"; auto result = session.ExecuteSchemeQuery(query).GetValueSync(); UNIT_ASSERT_VALUES_EQUAL(result.GetStatus(), EStatus::SCHEME_ERROR); - UNIT_ASSERT_STRING_CONTAINS(result.GetIssues().ToString(), "Location '}' contains invalid wildcard:"); + UNIT_ASSERT_STRING_CONTAINS(result.GetIssues().ToString(), "Location '{' contains invalid wildcard:"); } } diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp b/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp index fa4d945508c6..e472caef152f 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp @@ -52,15 +52,21 @@ TString RegexFromWildcards(const std::string_view& pattern) { for (const char& c : escaped) { switch (c) { case '{': - Y_ENSURE(!group, "Found unterminated group"); - result << "(?:"; - group = true; + if (group) { + result << "\\{"; + } else { + result << "(?:"; + group = true; + } slash = false; break; case '}': - Y_ENSURE(group, "Unexpected group end"); - result << ')'; - group = false; + if (group) { + result << ')'; + group = false; + } else { + result << "\\}"; + } slash = false; break; case ',': @@ -97,17 +103,11 @@ TString RegexFromWildcards(const std::string_view& pattern) { } TString ValidateWildcards(const std::string_view& pattern) { - std::optional groupStart = std::nullopt; + std::optional groupStart; for (size_t i = 0; i < pattern.size(); ++i) { - if (pattern[i] == '{') { - if (groupStart) { - return TStringBuilder() << "found unterminated group start at position " << *groupStart; - } + if (pattern[i] == '{' && !groupStart) { groupStart = i; } else if (pattern[i] == '}') { - if (!groupStart) { - return TStringBuilder() << "found unexpected group end at position " << i; - } groupStart = std::nullopt; } } diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp b/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp index 355207a9aca1..d4aaa47800f5 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp @@ -44,10 +44,10 @@ Y_UNIT_TEST_SUITE(TPathTests) { TestRegexFromWildcardsSuccess("first,test\\_{alt1,alt2}_text", "first\\,test\\\\_(?:alt1|alt2)_text"); TestRegexFromWildcardsSuccess("hello.*world?str", "hello\\..*world.str"); TestRegexFromWildcardsSuccess("many_{},{alt1,al?t2,al*t3},{alt4}_alts", "many_(?:)\\,(?:alt1|al.t2|al.*t3)\\,(?:alt4)_alts"); + TestRegexFromWildcardsSuccess("hello}{}}world", "hello\\}(?:)\\}world"); + TestRegexFromWildcardsSuccess("hello{{{}world", "hello(?:\\{\\{)world"); - TestRegexFromWildcardsFail("hello{}}world", "Unexpected group end", "found unexpected group end at position 7"); - TestRegexFromWildcardsFail("hello{{{}world", "Found unterminated group", "found unterminated group start at position 5"); - TestRegexFromWildcardsFail("hello{},{{}}world", "Found unterminated group", "found unterminated group start at position 8"); + TestRegexFromWildcardsFail("hello{}}{world", "Found unterminated group", "found unterminated group start at position 8"); } }