Skip to content

Commit 4d16b23

Browse files
authored
YQ-3570 added s3 wildcards validations (ydb-platform#8244)
1 parent 9a2335a commit 4d16b23

File tree

14 files changed

+193
-19
lines changed

14 files changed

+193
-19
lines changed

ydb/core/external_sources/object_storage.cpp

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ struct TObjectStorageExternalSource : public IExternalSource {
6565
}
6666
}
6767

68-
if (auto issues = Validate(schema, objectStorage, PathsLimit)) {
68+
if (auto issues = Validate(schema, objectStorage, PathsLimit, general.location())) {
6969
ythrow TExternalSourceException() << issues.ToString();
7070
}
7171

@@ -134,11 +134,18 @@ struct TObjectStorageExternalSource : public IExternalSource {
134134
}
135135

136136
template<typename TScheme, typename TObjectStorage>
137-
static NYql::TIssues Validate(const TScheme& schema, const TObjectStorage& objectStorage, size_t pathsLimit) {
137+
static NYql::TIssues Validate(const TScheme& schema, const TObjectStorage& objectStorage, size_t pathsLimit, const TString& location) {
138138
NYql::TIssues issues;
139-
issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting()));
139+
if (TString errorString = NYql::NS3::ValidateWildcards(location)) {
140+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains invalid wildcard: " << errorString));
141+
}
142+
const bool hasPartitioning = objectStorage.projection_size() || objectStorage.partitioned_by_size();
143+
issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting(), location, hasPartitioning));
140144
issues.AddIssues(ValidateRawFormat(objectStorage.format(), schema, objectStorage.partitioned_by()));
141-
if (objectStorage.projection_size() || objectStorage.partitioned_by_size()) {
145+
if (hasPartitioning) {
146+
if (NYql::NS3::HasWildcards(location)) {
147+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains wildcards"));
148+
}
142149
try {
143150
TVector<TString> partitionedBy{objectStorage.partitioned_by().begin(), objectStorage.partitioned_by().end()};
144151
issues.AddIssues(ValidateProjectionColumns(schema, partitionedBy));
@@ -158,11 +165,17 @@ struct TObjectStorageExternalSource : public IExternalSource {
158165
return issues;
159166
}
160167

161-
static NYql::TIssues ValidateFormatSetting(const TString& format, const google::protobuf::Map<TString, TString>& formatSetting) {
168+
static NYql::TIssues ValidateFormatSetting(const TString& format, const google::protobuf::Map<TString, TString>& formatSetting, const TString& location, bool hasPartitioning) {
162169
NYql::TIssues issues;
163170
issues.AddIssues(ValidateDateFormatSetting(formatSetting));
164171
for (const auto& [key, value]: formatSetting) {
165172
if (key == "file_pattern"sv) {
173+
if (TString errorString = NYql::NS3::ValidateWildcards(value)) {
174+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "File pattern '" << value << "' contains invalid wildcard: " << errorString));
175+
}
176+
if (value && !hasPartitioning && !location.EndsWith("/")) {
177+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, "Path pattern cannot be used with file_pattern"));
178+
}
166179
continue;
167180
}
168181

@@ -627,8 +640,8 @@ IExternalSource::TPtr CreateObjectStorageExternalSource(const std::vector<TRegEx
627640
return MakeIntrusive<TObjectStorageExternalSource>(hostnamePatterns, actorSystem, pathsLimit, std::move(credentialsFactory), enableInfer);
628641
}
629642

630-
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit) {
631-
return TObjectStorageExternalSource::Validate(schema, objectStorage, pathsLimit);
643+
// Free-function entry point for validating an object storage binding subset.
// Forwards schema, objectStorage, pathsLimit and location unchanged to
// TObjectStorageExternalSource::Validate and returns the issues it produces.
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit, const TString& location) {
644+
return TObjectStorageExternalSource::Validate(schema, objectStorage, pathsLimit, location);
632645
}
633646

634647
NYql::TIssues ValidateDateFormatSetting(const google::protobuf::Map<TString, TString>& formatSetting, bool matchAllSettings) {

ydb/core/external_sources/object_storage.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ IExternalSource::TPtr CreateObjectStorageExternalSource(const std::vector<TRegEx
1515
std::shared_ptr<NYql::ISecuredServiceAccountCredentialsFactory> credentialsFactory,
1616
bool enableInfer);
1717

18-
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit);
18+
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit, const TString& location);
1919

2020
NYql::TIssues ValidateDateFormatSetting(const google::protobuf::Map<TString, TString>& formatSetting, bool matchAllSettings = false);
2121

ydb/core/external_sources/object_storage_ut.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,32 @@ Y_UNIT_TEST_SUITE(ObjectStorageTest) {
2929
general.mutable_attributes()->insert({"projection.h", "b"});
3030
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Partition by must always be specified");
3131
}
32+
33+
// Checks that Pack() throws TExternalSourceException for malformed or disallowed
// wildcards in the location and in the file_pattern attribute.
Y_UNIT_TEST(WildcardsValidation) {
34+
auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false);
35+
NKikimrExternalSources::TSchema schema;
36+
37+
// A location consisting of a lone '{' is expected to be reported as an invalid wildcard.
{ // location
38+
NKikimrExternalSources::TGeneral general;
39+
general.set_location("{");
40+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '{' contains invalid wildcard:");
41+
}
42+
43+
// The same malformed pattern passed as file_pattern is expected to be rejected too.
{ // file pattern
44+
NKikimrExternalSources::TGeneral general;
45+
general.mutable_attributes()->insert({"file_pattern", "{"});
46+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "File pattern '{' contains invalid wildcard:");
47+
// With file_pattern still set, a location without a trailing '/' is expected
// to trigger the "Path pattern cannot be used with file_pattern" error.
general.set_location("/test_file");
48+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Path pattern cannot be used with file_pattern");
49+
}
50+
51+
// A wildcard location combined with the partitioned_by attribute is expected to be rejected.
{ // partitioned by
52+
NKikimrExternalSources::TGeneral general;
53+
general.set_location("*");
54+
general.mutable_attributes()->insert({"partitioned_by", "[year]"});
55+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '*' contains wildcards");
56+
}
57+
}
3258
}
3359

3460
} // NKikimr

ydb/core/fq/libs/control_plane_storage/request_validators.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ NYql::TIssues ValidateBinding(const T& ev, size_t maxSize, const TSet<FederatedQ
121121
case FederatedQuery::BindingSetting::kObjectStorage:
122122
const FederatedQuery::ObjectStorageBinding objectStorage = setting.object_storage();
123123
for (const auto& subset: objectStorage.subset()) {
124-
issues.AddIssues(NKikimr::NExternalSource::Validate(subset.schema(), subset, pathsLimit));
124+
issues.AddIssues(NKikimr::NExternalSource::Validate(subset.schema(), subset, pathsLimit, subset.path_pattern()));
125125
}
126126
break;
127127
}

ydb/core/kqp/gateway/utils/scheme_helpers.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ void FillCreateExternalTableColumnDesc(NKikimrSchemeOp::TExternalTableDescriptio
8181
columnDesc.SetNotNull(columnIt->second.NotNull);
8282
}
8383
NKikimrExternalSources::TGeneral general;
84+
general.set_location(settings.Location);
8485
auto& attributes = *general.mutable_attributes();
8586
for (const auto& [key, value]: settings.SourceTypeParameters) {
8687
attributes.insert({key, value});

ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) {
3636
const TString externalDataSourceName = "/Root/external_data_source";
3737
const TString externalTableName = "/Root/test_binding_resolve";
3838
const TString bucket = "test_bucket1";
39-
const TString object = TStringBuilder() << "test_" << GetSymbolsString(' ', '~', "{}") << "_object";
39+
const TString object = TStringBuilder() << "test_" << GetSymbolsString(' ', '~', "*?{}") << "_object";
4040

4141
CreateBucketWithObject(bucket, object, TEST_CONTENT);
4242

@@ -1794,7 +1794,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) {
17941794

17951795
Y_UNIT_TEST(TestReadEmptyFileWithCsvFormat) {
17961796
const TString externalDataSourceName = "/Root/external_data_source";
1797-
const TString bucket = "test_bucket1";
1797+
const TString bucket = "test_bucket12";
17981798

17991799
CreateBucketWithObject(bucket, "test_object", "");
18001800

@@ -1832,6 +1832,60 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) {
18321832
NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver());
18331833
UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Completed, readyOp.Status().GetIssues().ToString());
18341834
}
1835+
1836+
// End-to-end check: script queries with a malformed wildcard in the path or in
// FILE_PATTERN must end in the Failed state with a descriptive issue.
Y_UNIT_TEST(TestWildcardValidation) {
1837+
const TString bucket = "test_bucket13";
1838+
1839+
CreateBucket(bucket);
1840+
1841+
auto kikimr = MakeKikimrRunner(NYql::IHTTPGateway::Make());
1842+
1843+
auto tc = kikimr->GetTableClient();
1844+
auto session = tc.CreateSession().GetValueSync().GetSession();
1845+
// Create the external data source that both sub-cases below query.
const TString query = fmt::format(R"(
1846+
CREATE EXTERNAL DATA SOURCE `/Root/external_data_source` WITH (
1847+
SOURCE_TYPE="ObjectStorage",
1848+
LOCATION="{location}",
1849+
AUTH_METHOD="NONE"
1850+
);)",
1851+
"location"_a = GetBucketLocation(bucket)
1852+
);
1853+
auto result = session.ExecuteSchemeQuery(query).GetValueSync();
1854+
UNIT_ASSERT_C(result.GetStatus() == NYdb::EStatus::SUCCESS, result.GetIssues().ToString());
1855+
1856+
auto db = kikimr->GetQueryClient();
1857+
1858+
// Sub-case 1: the object path itself contains an unbalanced '{'.
{ // path validation
1859+
const TString sql = R"(
1860+
SELECT * FROM `/Root/external_data_source`.`/{` WITH (
1861+
SCHEMA = (data String),
1862+
FORMAT = "csv_with_names"
1863+
))";
1864+
1865+
// Submitting the script is expected to succeed; the failure is asserted on the completed operation.
auto scriptExecutionOperation = db.ExecuteScript(sql).ExtractValueSync();
1866+
UNIT_ASSERT_VALUES_EQUAL_C(scriptExecutionOperation.Status().GetStatus(), EStatus::SUCCESS, scriptExecutionOperation.Status().GetIssues().ToString());
1867+
1868+
NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver());
1869+
UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Failed, readyOp.Status().GetIssues().ToString());
1870+
UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "Path '/{' contains invalid wildcard:");
1871+
}
1872+
1873+
// Sub-case 2: the path is valid but FILE_PATTERN contains an unbalanced '{'.
{ // file pattern validation
1874+
const TString sql = R"(
1875+
SELECT * FROM `/Root/external_data_source`.`/` WITH (
1876+
SCHEMA = (data String),
1877+
FORMAT = "csv_with_names",
1878+
FILE_PATTERN = "{"
1879+
))";
1880+
1881+
// Same flow as above: submission succeeds, the finished operation must be Failed.
auto scriptExecutionOperation = db.ExecuteScript(sql).ExtractValueSync();
1882+
UNIT_ASSERT_VALUES_EQUAL_C(scriptExecutionOperation.Status().GetStatus(), EStatus::SUCCESS, scriptExecutionOperation.Status().GetIssues().ToString());
1883+
1884+
NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver());
1885+
UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Failed, readyOp.Status().GetIssues().ToString());
1886+
UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "File pattern '{' contains invalid wildcard:");
1887+
}
1888+
}
18351889
}
18361890

18371891
} // namespace NKikimr::NKqp

ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,28 @@ Y_UNIT_TEST_SUITE(KqpFederatedSchemeTest) {
216216
};
217217
TestInvalidDropForExternalTableWithAuth(queryClientExecutor, "generic_query");
218218
}
219+
220+
// Checks that CREATE EXTERNAL TABLE with LOCATION="{" is rejected at DDL time:
// the scheme query must return SCHEME_ERROR with an invalid-wildcard issue.
Y_UNIT_TEST(ExternalTableDdlLocationValidation) {
221+
auto kikimr = MakeKikimrRunner(NYql::IHTTPGateway::Make());
222+
auto db = kikimr->GetTableClient();
223+
auto session = db.CreateSession().GetValueSync().GetSession();
224+
// One scheme query creates the data source and then the table with the malformed LOCATION.
auto query = TStringBuilder() << R"(
225+
CREATE EXTERNAL DATA SOURCE `/Root/ExternalDataSource` WITH (
226+
SOURCE_TYPE="ObjectStorage",
227+
LOCATION="my-bucket",
228+
AUTH_METHOD="NONE"
229+
);
230+
CREATE EXTERNAL TABLE `/Root/ExternalTable` (
231+
Key Uint64,
232+
Value String
233+
) WITH (
234+
DATA_SOURCE="/Root/ExternalDataSource",
235+
LOCATION="{"
236+
);)";
237+
auto result = session.ExecuteSchemeQuery(query).GetValueSync();
238+
UNIT_ASSERT_VALUES_EQUAL(result.GetStatus(), EStatus::SCHEME_ERROR);
239+
UNIT_ASSERT_STRING_CONTAINS(result.GetIssues().ToString(), "Location '{' contains invalid wildcard:");
240+
}
219241
}
220242

221243
} // namespace NKikimr::NKqp

ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5000,7 +5000,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) {
50005000
month Int64 NOT NULL
50015001
) WITH (
50025002
DATA_SOURCE=")" << externalDataSourceName << R"(",
5003-
LOCATION="/folder1/*",
5003+
LOCATION="/folder1/",
50045004
FORMAT="json_as_string",
50055005
`projection.enabled`="true",
50065006
`projection.year.type`="integer",
@@ -5025,7 +5025,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) {
50255025
UNIT_ASSERT(externalTable.ExternalTableInfo);
50265026
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.ColumnsSize(), 4);
50275027
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetDataSourcePath(), externalDataSourceName);
5028-
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/*");
5028+
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/");
50295029
}
50305030

50315031
Y_UNIT_TEST(CreateExternalTableWithUpperCaseSettings) {
@@ -5048,7 +5048,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) {
50485048
Month Int64 NOT NULL
50495049
) WITH (
50505050
DATA_SOURCE=")" << externalDataSourceName << R"(",
5051-
LOCATION="/folder1/*",
5051+
LOCATION="/folder1/",
50525052
FORMAT="json_as_string",
50535053
`projection.enabled`="true",
50545054
`projection.Year.type`="integer",
@@ -5073,7 +5073,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) {
50735073
UNIT_ASSERT(externalTable.ExternalTableInfo);
50745074
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.ColumnsSize(), 4);
50755075
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetDataSourcePath(), externalDataSourceName);
5076-
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/*");
5076+
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/");
50775077
}
50785078

50795079
Y_UNIT_TEST(DoubleCreateExternalTable) {

ydb/core/protos/external_sources.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ message TSchema {
1111

1212
// General description of an external source object: string attributes plus an optional location.
message TGeneral {
1313
map<string, string> attributes = 1 [(Ydb.size).le = 100];
14+
// Location string of the object. NOTE(review): appears to be filled from the external
// table's Location setting and checked for wildcards during validation - confirm against callers.
optional string location = 2;
1415
}
1516

1617
message TObjectStorage {

ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ std::pair<TPathFilter, TEarlyStopChecker> MakeFilterRegexp(const TString& regex,
4747
} else {
4848
re = std::make_shared<RE2>(re2::StringPiece(regex), RE2::Options());
4949
}
50+
Y_ENSURE(re->ok());
5051

5152
const size_t numGroups = re->NumberOfCapturingGroups();
5253
YQL_CLOG(DEBUG, ProviderS3)

0 commit comments

Comments
 (0)