Skip to content

Commit f04c6ec

Browse files
committed
Added wildcards validations
1 parent 947bad6 commit f04c6ec

File tree

12 files changed

+187
-12
lines changed

12 files changed

+187
-12
lines changed

ydb/core/external_sources/object_storage.cpp

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ struct TObjectStorageExternalSource : public IExternalSource {
4646

4747
virtual TString Pack(const NKikimrExternalSources::TSchema& schema,
4848
const NKikimrExternalSources::TGeneral& general) const override {
49+
TString location;
4950
NKikimrExternalSources::TObjectStorage objectStorage;
5051
for (const auto& [key, value]: general.attributes()) {
5152
auto lowerKey = to_lower(key);
@@ -62,12 +63,14 @@ struct TObjectStorageExternalSource : public IExternalSource {
6263
}
6364
} else if (IsIn({"file_pattern"sv, "data.interval.unit"sv, "data.datetime.format_name"sv, "data.datetime.format"sv, "data.timestamp.format_name"sv, "data.timestamp.format"sv, "csv_delimiter"sv}, lowerKey)) {
6465
objectStorage.mutable_format_setting()->insert({lowerKey, value});
66+
} else if (lowerKey == "location") {
67+
location = value;
6568
} else {
6669
ythrow TExternalSourceException() << "Unknown attribute " << key;
6770
}
6871
}
6972

70-
if (auto issues = Validate(schema, objectStorage, PathsLimit)) {
73+
if (auto issues = Validate(schema, objectStorage, PathsLimit, location)) {
7174
ythrow TExternalSourceException() << issues.ToString();
7275
}
7376

@@ -136,9 +139,15 @@ struct TObjectStorageExternalSource : public IExternalSource {
136139
}
137140

138141
template<typename TScheme, typename TObjectStorage>
139-
static NYql::TIssues Validate(const TScheme& schema, const TObjectStorage& objectStorage, size_t pathsLimit) {
142+
static NYql::TIssues Validate(const TScheme& schema, const TObjectStorage& objectStorage, size_t pathsLimit, const TString& location) {
140143
NYql::TIssues issues;
141-
issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting()));
144+
if (TString errorString; !NYql::NS3::ValidateWildcards(location, errorString)) {
145+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains invalid wildcard: " << errorString));
146+
}
147+
if (objectStorage.partitioned_by_size() && NYql::NS3::HasWildcards(location)) {
148+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains wildcards"));
149+
}
150+
issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting(), location));
142151
issues.AddIssues(ValidateRawFormat(objectStorage.format(), schema, objectStorage.partitioned_by()));
143152
if (objectStorage.projection_size() || objectStorage.partitioned_by_size()) {
144153
try {
@@ -160,11 +169,17 @@ struct TObjectStorageExternalSource : public IExternalSource {
160169
return issues;
161170
}
162171

163-
static NYql::TIssues ValidateFormatSetting(const TString& format, const google::protobuf::Map<TString, TString>& formatSetting) {
172+
static NYql::TIssues ValidateFormatSetting(const TString& format, const google::protobuf::Map<TString, TString>& formatSetting, const TString& location) {
164173
NYql::TIssues issues;
165174
issues.AddIssues(ValidateDateFormatSetting(formatSetting));
166175
for (const auto& [key, value]: formatSetting) {
167176
if (key == "file_pattern"sv) {
177+
if (TString errorString; !NYql::NS3::ValidateWildcards(value, errorString)) {
178+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "File pattern '" << value << "' contains invalid wildcard: " << errorString));
179+
}
180+
if (value && !location.EndsWith("/")) {
181+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, "Path pattern cannot be used with file_pattern"));
182+
}
168183
continue;
169184
}
170185

@@ -631,8 +646,8 @@ IExternalSource::TPtr CreateObjectStorageExternalSource(const std::vector<TRegEx
631646
return MakeIntrusive<TObjectStorageExternalSource>(hostnamePatterns, actorSystem, pathsLimit, std::move(credentialsFactory), enableInfer, allowLocalFiles);
632647
}
633648

634-
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit) {
635-
return TObjectStorageExternalSource::Validate(schema, objectStorage, pathsLimit);
649+
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit, const TString& location) {
650+
return TObjectStorageExternalSource::Validate(schema, objectStorage, pathsLimit, location);
636651
}
637652

638653
NYql::TIssues ValidateDateFormatSetting(const google::protobuf::Map<TString, TString>& formatSetting, bool matchAllSettings) {

ydb/core/external_sources/object_storage.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ IExternalSource::TPtr CreateObjectStorageExternalSource(const std::vector<TRegEx
1616
bool enableInfer,
1717
bool allowLocalFiles);
1818

19-
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit);
19+
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit, const TString& location);
2020

2121
NYql::TIssues ValidateDateFormatSetting(const google::protobuf::Map<TString, TString>& formatSetting, bool matchAllSettings = false);
2222

ydb/core/external_sources/object_storage_ut.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "object_storage.h"
22

33
#include <library/cpp/testing/unittest/registar.h>
4+
#include <library/cpp/scheme/scheme.h>
45
#include <util/random/random.h>
56
#include <ydb/core/protos/external_sources.pb.h>
67

@@ -29,6 +30,32 @@ Y_UNIT_TEST_SUITE(ObjectStorageTest) {
2930
general.mutable_attributes()->insert({"projection.h", "b"});
3031
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Partition by must always be specified");
3132
}
33+
34+
Y_UNIT_TEST(WildcardsValidation) {
35+
auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false, false);
36+
NKikimrExternalSources::TSchema schema;
37+
38+
{ // location
39+
NKikimrExternalSources::TGeneral general;
40+
general.mutable_attributes()->insert({"location", "}"});
41+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '}' contains invalid wildcard:");
42+
}
43+
44+
{ // file pattern
45+
NKikimrExternalSources::TGeneral general;
46+
general.mutable_attributes()->insert({"file_pattern", "}"});
47+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "File pattern '}' contains invalid wildcard:");
48+
general.mutable_attributes()->insert({"location", "/test_file"});
49+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Path pattern cannot be used with file_pattern");
50+
}
51+
52+
{ // partitioned by
53+
NKikimrExternalSources::TGeneral general;
54+
general.mutable_attributes()->insert({"location", "*"});
55+
general.mutable_attributes()->insert({"partitioned_by", "[year]"});
56+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '*' contains wildcards");
57+
}
58+
}
3259
}
3360

3461
} // NKikimr

ydb/core/fq/libs/control_plane_storage/request_validators.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ NYql::TIssues ValidateBinding(const T& ev, size_t maxSize, const TSet<FederatedQ
121121
case FederatedQuery::BindingSetting::kObjectStorage:
122122
const FederatedQuery::ObjectStorageBinding objectStorage = setting.object_storage();
123123
for (const auto& subset: objectStorage.subset()) {
124-
issues.AddIssues(NKikimr::NExternalSource::Validate(subset.schema(), subset, pathsLimit));
124+
issues.AddIssues(NKikimr::NExternalSource::Validate(subset.schema(), subset, pathsLimit, subset.path_pattern()));
125125
}
126126
break;
127127
}

ydb/core/kqp/gateway/utils/scheme_helpers.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ void FillCreateExternalTableColumnDesc(NKikimrSchemeOp::TExternalTableDescriptio
9292
for (const auto& [key, value]: settings.SourceTypeParameters) {
9393
attributes.insert({key, value});
9494
}
95+
attributes.insert({"location", settings.Location});
9596
externalTableDesc.SetContent(general.SerializeAsString());
9697
}
9798

ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2181,6 +2181,60 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) {
21812181
NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver());
21822182
UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Completed, readyOp.Status().GetIssues().ToString());
21832183
}
2184+
2185+
Y_UNIT_TEST(TestWildcardValidation) {
2186+
const TString bucket = "test_bucket1";
2187+
2188+
CreateBucket(bucket);
2189+
2190+
auto kikimr = NTestUtils::MakeKikimrRunner();
2191+
2192+
auto tc = kikimr->GetTableClient();
2193+
auto session = tc.CreateSession().GetValueSync().GetSession();
2194+
const TString query = fmt::format(R"(
2195+
CREATE EXTERNAL DATA SOURCE `/Root/external_data_source` WITH (
2196+
SOURCE_TYPE="ObjectStorage",
2197+
LOCATION="{location}",
2198+
AUTH_METHOD="NONE"
2199+
);)",
2200+
"location"_a = GetBucketLocation(bucket)
2201+
);
2202+
auto result = session.ExecuteSchemeQuery(query).GetValueSync();
2203+
UNIT_ASSERT_C(result.GetStatus() == NYdb::EStatus::SUCCESS, result.GetIssues().ToString());
2204+
2205+
auto db = kikimr->GetQueryClient();
2206+
2207+
{ // path validation
2208+
const TString sql = R"(
2209+
SELECT * FROM `/Root/external_data_source`.`/}` WITH (
2210+
SCHEMA = (data String),
2211+
FORMAT = "csv_with_names"
2212+
))";
2213+
2214+
auto scriptExecutionOperation = db.ExecuteScript(sql).ExtractValueSync();
2215+
UNIT_ASSERT_VALUES_EQUAL_C(scriptExecutionOperation.Status().GetStatus(), EStatus::SUCCESS, scriptExecutionOperation.Status().GetIssues().ToString());
2216+
2217+
NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver());
2218+
UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Failed, readyOp.Status().GetIssues().ToString());
2219+
UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "Path '/}' contains invalid wildcard:");
2220+
}
2221+
2222+
{ // file pattern validation
2223+
const TString sql = R"(
2224+
SELECT * FROM `/Root/external_data_source`.`/` WITH (
2225+
SCHEMA = (data String),
2226+
FORMAT = "csv_with_names",
2227+
FILE_PATTERN = "}"
2228+
))";
2229+
2230+
auto scriptExecutionOperation = db.ExecuteScript(sql).ExtractValueSync();
2231+
UNIT_ASSERT_VALUES_EQUAL_C(scriptExecutionOperation.Status().GetStatus(), EStatus::SUCCESS, scriptExecutionOperation.Status().GetIssues().ToString());
2232+
2233+
NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver());
2234+
UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Failed, readyOp.Status().GetIssues().ToString());
2235+
UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "File pattern '}' contains invalid wildcard:");
2236+
}
2237+
}
21842238
}
21852239

21862240
} // namespace NKikimr::NKqp

ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,28 @@ Y_UNIT_TEST_SUITE(KqpFederatedSchemeTest) {
215215
};
216216
TestInvalidDropForExternalTableWithAuth(queryClientExecutor, "generic_query");
217217
}
218+
219+
Y_UNIT_TEST(ExternalTableDdlLocationValidation) {
220+
auto kikimr = NTestUtils::MakeKikimrRunner();
221+
auto db = kikimr->GetTableClient();
222+
auto session = db.CreateSession().GetValueSync().GetSession();
223+
auto query = TStringBuilder() << R"(
224+
CREATE EXTERNAL DATA SOURCE `/Root/ExternalDataSource` WITH (
225+
SOURCE_TYPE="ObjectStorage",
226+
LOCATION="my-bucket",
227+
AUTH_METHOD="NONE"
228+
);
229+
CREATE EXTERNAL TABLE `/Root/ExternalTable` (
230+
Key Uint64,
231+
Value String
232+
) WITH (
233+
DATA_SOURCE="/Root/ExternalDataSource",
234+
LOCATION="}"
235+
);)";
236+
auto result = session.ExecuteSchemeQuery(query).GetValueSync();
237+
UNIT_ASSERT_VALUES_EQUAL(result.GetStatus(), EStatus::SCHEME_ERROR);
238+
UNIT_ASSERT_STRING_CONTAINS(result.GetIssues().ToString(), "Location '}' contains invalid wildcard:");
239+
}
218240
}
219241

220242
} // namespace NKikimr::NKqp

ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ std::pair<TPathFilter, TEarlyStopChecker> MakeFilterRegexp(const TString& regex,
4747
} else {
4848
re = std::make_shared<RE2>(re2::StringPiece(regex), RE2::Options());
4949
}
50+
Y_ENSURE(re->ok());
5051

5152
const size_t numGroups = re->NumberOfCapturingGroups();
5253
YQL_CLOG(DEBUG, ProviderS3)

ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,22 +47,23 @@ TString RegexFromWildcards(const std::string_view& pattern) {
4747
const auto& escaped = EscapeRegex(pattern);
4848
TStringBuilder result;
4949
bool slash = false;
50-
bool group = false;
50+
size_t groupCount = 0;
5151

5252
for (const char& c : escaped) {
5353
switch (c) {
5454
case '{':
5555
result << "(?:";
56-
group = true;
56+
groupCount++;
5757
slash = false;
5858
break;
5959
case '}':
60+
Y_ENSURE(groupCount, "Unexpected group end");
6061
result << ')';
61-
group = false;
62+
groupCount--;
6263
slash = false;
6364
break;
6465
case ',':
65-
if (group)
66+
if (groupCount)
6667
result << '|';
6768
else
6869
result << "\\,";
@@ -89,7 +90,29 @@ TString RegexFromWildcards(const std::string_view& pattern) {
8990
break;
9091
}
9192
}
93+
Y_ENSURE(!groupCount, "Found " << groupCount << " unterminated groups");
94+
Y_ENSURE(!slash, "Expected symbol after slash");
9295
return result;
9396
}
9497

98+
bool ValidateWildcards(const std::string_view& pattern, TString& errorString) {
99+
size_t groupCount = 0;
100+
for (size_t i = 0; i < pattern.size(); ++i) {
101+
if (pattern[i] == '{') {
102+
groupCount++;
103+
} else if (pattern[i] == '}') {
104+
if (!groupCount) {
105+
errorString = TStringBuilder() << "found unexpected group end at position " << i;
106+
return false;
107+
}
108+
groupCount--;
109+
}
110+
}
111+
if (groupCount) {
112+
errorString = TStringBuilder() << "found " << groupCount << " unterminated groups";
113+
return false;
114+
}
115+
return true;
116+
}
117+
95118
}

ydb/library/yql/providers/s3/object_listers/yql_s3_path.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,6 @@ TString EscapeRegex(const TString& str);
1919
TString EscapeRegex(const std::string_view& str);
2020

2121
TString RegexFromWildcards(const std::string_view& pattern);
22+
bool ValidateWildcards(const std::string_view& pattern, TString& errorString);
2223

2324
}

ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,28 @@ Y_UNIT_TEST_SUITE(TPathTests) {
2828
UNIT_ASSERT_VALUES_EQUAL(NormalizePath("/a/b/c/"), "a/b/c/");
2929
UNIT_ASSERT_VALUES_EQUAL(NormalizePath("///a/b/c///"), "a/b/c/");
3030
}
31+
32+
void TestRegexFromWildcardsSuccess(const TString& wildcards, const TString& expectedRegex) {
33+
TString errorString;
34+
UNIT_ASSERT_C(ValidateWildcards(wildcards, errorString), errorString);
35+
UNIT_ASSERT_VALUES_EQUAL(RegexFromWildcards(wildcards), expectedRegex);
36+
}
37+
38+
void TestRegexFromWildcardsFail(const TString& wildcards, const TString& expectedException, const TString& expectedError) {
39+
TString errorString;
40+
UNIT_ASSERT(!ValidateWildcards(wildcards, errorString));
41+
UNIT_ASSERT_STRING_CONTAINS(errorString, expectedError);
42+
UNIT_ASSERT_EXCEPTION_CONTAINS(RegexFromWildcards(wildcards), yexception, expectedException);
43+
}
44+
45+
Y_UNIT_TEST(TestRegexFromWildcards) {
46+
TestRegexFromWildcardsSuccess("first,test\\_{alt1,alt2}_text", "first\\,test\\\\_(?:alt1|alt2)_text");
47+
TestRegexFromWildcardsSuccess("hello.*world?str", "hello\\..*world.str");
48+
TestRegexFromWildcardsSuccess("many_{{alt1,alt2},{alt3,{alt4}},alt5,{}}_alts", "many_(?:(?:alt1|alt2)|(?:alt3|(?:alt4))|alt5|(?:))_alts");
49+
50+
TestRegexFromWildcardsFail("hello{}}world", "Unexpected group end", "found unexpected group end at position 7");
51+
TestRegexFromWildcardsFail("hello{{{}world", "Found 2 unterminated groups", "found 2 unterminated groups");
52+
}
3153
}
3254

3355
}

ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,10 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase {
732732
if (!FindFilePattern(settings, ctx, filePattern)) {
733733
return false;
734734
}
735+
if (TString errorString; !NS3::ValidateWildcards(filePattern, errorString)) {
736+
ctx.AddError(TIssue(ctx.GetPosition(read.Pos()), TStringBuilder() << "File pattern '" << filePattern << "' contains invalid wildcard: " << errorString));
737+
return false;
738+
}
735739
const TString effectiveFilePattern = filePattern ? filePattern : "*";
736740

737741
TVector<TString> paths;
@@ -763,6 +767,11 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase {
763767
}
764768

765769
for (const auto& path : paths) {
770+
if (TString errorString; !NS3::ValidateWildcards(path, errorString)) {
771+
ctx.AddError(TIssue(ctx.GetPosition(read.Pos()), TStringBuilder() << "Path '" << path << "' contains invalid wildcard: " << errorString));
772+
return false;
773+
}
774+
766775
// each path in CONCAT() can generate multiple list requests for explicit partitioning
767776
TVector<TListRequest> reqs;
768777

0 commit comments

Comments
 (0)