Skip to content

Commit db03225

Browse files
authored
YQ-3570 added s3 wildcards validations (ydb-platform#8245)
1 parent 434b4dd commit db03225

File tree

14 files changed

+194
-19
lines changed

14 files changed

+194
-19
lines changed

ydb/core/external_sources/object_storage.cpp

Lines changed: 21 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
#include <ydb/library/yql/providers/common/structured_token/yql_token_builder.h>
1414
#include <ydb/library/yql/providers/s3/credentials/credentials.h>
1515
#include <ydb/library/yql/providers/s3/object_listers/yql_s3_list.h>
16+
#include <ydb/library/yql/providers/s3/object_listers/yql_s3_path.h>
1617
#include <ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.h>
1718
#include <ydb/library/yql/providers/s3/proto/credentials.pb.h>
1819
#include <ydb/public/api/protos/ydb_status_codes.pb.h>
@@ -64,7 +65,7 @@ struct TObjectStorageExternalSource : public IExternalSource {
6465
}
6566
}
6667

67-
if (auto issues = Validate(schema, objectStorage, PathsLimit)) {
68+
if (auto issues = Validate(schema, objectStorage, PathsLimit, general.location())) {
6869
ythrow TExternalSourceException() << issues.ToString();
6970
}
7071

@@ -133,11 +134,18 @@ struct TObjectStorageExternalSource : public IExternalSource {
133134
}
134135

135136
template<typename TScheme, typename TObjectStorage>
136-
static NYql::TIssues Validate(const TScheme& schema, const TObjectStorage& objectStorage, size_t pathsLimit) {
137+
static NYql::TIssues Validate(const TScheme& schema, const TObjectStorage& objectStorage, size_t pathsLimit, const TString& location) {
137138
NYql::TIssues issues;
138-
issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting()));
139+
if (TString errorString = NYql::NS3::ValidateWildcards(location)) {
140+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains invalid wildcard: " << errorString));
141+
}
142+
const bool hasPartitioning = objectStorage.projection_size() || objectStorage.partitioned_by_size();
143+
issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting(), location, hasPartitioning));
139144
issues.AddIssues(ValidateRawFormat(objectStorage.format(), schema, objectStorage.partitioned_by()));
140-
if (objectStorage.projection_size() || objectStorage.partitioned_by_size()) {
145+
if (hasPartitioning) {
146+
if (NYql::NS3::HasWildcards(location)) {
147+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains wildcards"));
148+
}
141149
try {
142150
TVector<TString> partitionedBy{objectStorage.partitioned_by().begin(), objectStorage.partitioned_by().end()};
143151
issues.AddIssues(ValidateProjectionColumns(schema, partitionedBy));
@@ -157,11 +165,17 @@ struct TObjectStorageExternalSource : public IExternalSource {
157165
return issues;
158166
}
159167

160-
static NYql::TIssues ValidateFormatSetting(const TString& format, const google::protobuf::Map<TString, TString>& formatSetting) {
168+
static NYql::TIssues ValidateFormatSetting(const TString& format, const google::protobuf::Map<TString, TString>& formatSetting, const TString& location, bool hasPartitioning) {
161169
NYql::TIssues issues;
162170
issues.AddIssues(ValidateDateFormatSetting(formatSetting));
163171
for (const auto& [key, value]: formatSetting) {
164172
if (key == "file_pattern"sv) {
173+
if (TString errorString = NYql::NS3::ValidateWildcards(value)) {
174+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "File pattern '" << value << "' contains invalid wildcard: " << errorString));
175+
}
176+
if (value && !hasPartitioning && !location.EndsWith("/")) {
177+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, "Path pattern cannot be used with file_pattern"));
178+
}
165179
continue;
166180
}
167181

@@ -616,8 +630,8 @@ IExternalSource::TPtr CreateObjectStorageExternalSource(const std::vector<TRegEx
616630
return MakeIntrusive<TObjectStorageExternalSource>(hostnamePatterns, actorSystem, pathsLimit, std::move(credentialsFactory), enableInfer);
617631
}
618632

619-
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit) {
620-
return TObjectStorageExternalSource::Validate(schema, objectStorage, pathsLimit);
633+
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit, const TString& location) {
634+
return TObjectStorageExternalSource::Validate(schema, objectStorage, pathsLimit, location);
621635
}
622636

623637
NYql::TIssues ValidateDateFormatSetting(const google::protobuf::Map<TString, TString>& formatSetting, bool matchAllSettings) {

ydb/core/external_sources/object_storage.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ IExternalSource::TPtr CreateObjectStorageExternalSource(const std::vector<TRegEx
1515
std::shared_ptr<NYql::ISecuredServiceAccountCredentialsFactory> credentialsFactory,
1616
bool enableInfer);
1717

18-
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit);
18+
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit, const TString& location);
1919

2020
NYql::TIssues ValidateDateFormatSetting(const google::protobuf::Map<TString, TString>& formatSetting, bool matchAllSettings = false);
2121

ydb/core/external_sources/object_storage_ut.cpp

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,32 @@ Y_UNIT_TEST_SUITE(ObjectStorageTest) {
2929
general.mutable_attributes()->insert({"projection.h", "b"});
3030
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Partition by must always be specified");
3131
}
32+
33+
Y_UNIT_TEST(WildcardsValidation) {
34+
auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false);
35+
NKikimrExternalSources::TSchema schema;
36+
37+
{ // location
38+
NKikimrExternalSources::TGeneral general;
39+
general.set_location("{");
40+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '{' contains invalid wildcard:");
41+
}
42+
43+
{ // file pattern
44+
NKikimrExternalSources::TGeneral general;
45+
general.mutable_attributes()->insert({"file_pattern", "{"});
46+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "File pattern '{' contains invalid wildcard:");
47+
general.set_location("/test_file");
48+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Path pattern cannot be used with file_pattern");
49+
}
50+
51+
{ // partitioned by
52+
NKikimrExternalSources::TGeneral general;
53+
general.set_location("*");
54+
general.mutable_attributes()->insert({"partitioned_by", "[year]"});
55+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '*' contains wildcards");
56+
}
57+
}
3258
}
3359

3460
} // NKikimr

ydb/core/fq/libs/control_plane_storage/request_validators.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -121,7 +121,7 @@ NYql::TIssues ValidateBinding(const T& ev, size_t maxSize, const TSet<FederatedQ
121121
case FederatedQuery::BindingSetting::kObjectStorage:
122122
const FederatedQuery::ObjectStorageBinding objectStorage = setting.object_storage();
123123
for (const auto& subset: objectStorage.subset()) {
124-
issues.AddIssues(NKikimr::NExternalSource::Validate(subset.schema(), subset, pathsLimit));
124+
issues.AddIssues(NKikimr::NExternalSource::Validate(subset.schema(), subset, pathsLimit, subset.path_pattern()));
125125
}
126126
break;
127127
}

ydb/core/kqp/gateway/utils/scheme_helpers.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -81,6 +81,7 @@ void FillCreateExternalTableColumnDesc(NKikimrSchemeOp::TExternalTableDescriptio
8181
columnDesc.SetNotNull(columnIt->second.NotNull);
8282
}
8383
NKikimrExternalSources::TGeneral general;
84+
general.set_location(settings.Location);
8485
auto& attributes = *general.mutable_attributes();
8586
for (const auto& [key, value]: settings.SourceTypeParameters) {
8687
attributes.insert({key, value});

ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp

Lines changed: 56 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) {
3535
const TString externalDataSourceName = "/Root/external_data_source";
3636
const TString externalTableName = "/Root/test_binding_resolve";
3737
const TString bucket = "test_bucket1";
38-
const TString object = TStringBuilder() << "test_" << GetSymbolsString(' ', '~', "{}") << "_object";
38+
const TString object = TStringBuilder() << "test_" << GetSymbolsString(' ', '~', "*?{}") << "_object";
3939

4040
CreateBucketWithObject(bucket, object, TEST_CONTENT);
4141

@@ -1802,7 +1802,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) {
18021802

18031803
Y_UNIT_TEST(TestReadEmptyFileWithCsvFormat) {
18041804
const TString externalDataSourceName = "/Root/external_data_source";
1805-
const TString bucket = "test_bucket1";
1805+
const TString bucket = "test_bucket12";
18061806

18071807
CreateBucketWithObject(bucket, "test_object", "");
18081808

@@ -1840,6 +1840,60 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) {
18401840
NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver());
18411841
UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Completed, readyOp.Status().GetIssues().ToString());
18421842
}
1843+
1844+
Y_UNIT_TEST(TestWildcardValidation) {
1845+
const TString bucket = "test_bucket13";
1846+
1847+
CreateBucket(bucket);
1848+
1849+
auto kikimr = NTestUtils::MakeKikimrRunner();
1850+
1851+
auto tc = kikimr->GetTableClient();
1852+
auto session = tc.CreateSession().GetValueSync().GetSession();
1853+
const TString query = fmt::format(R"(
1854+
CREATE EXTERNAL DATA SOURCE `/Root/external_data_source` WITH (
1855+
SOURCE_TYPE="ObjectStorage",
1856+
LOCATION="{location}",
1857+
AUTH_METHOD="NONE"
1858+
);)",
1859+
"location"_a = GetBucketLocation(bucket)
1860+
);
1861+
auto result = session.ExecuteSchemeQuery(query).GetValueSync();
1862+
UNIT_ASSERT_C(result.GetStatus() == NYdb::EStatus::SUCCESS, result.GetIssues().ToString());
1863+
1864+
auto db = kikimr->GetQueryClient();
1865+
1866+
{ // path validation
1867+
const TString sql = R"(
1868+
SELECT * FROM `/Root/external_data_source`.`/{` WITH (
1869+
SCHEMA = (data String),
1870+
FORMAT = "csv_with_names"
1871+
))";
1872+
1873+
auto scriptExecutionOperation = db.ExecuteScript(sql).ExtractValueSync();
1874+
UNIT_ASSERT_VALUES_EQUAL_C(scriptExecutionOperation.Status().GetStatus(), EStatus::SUCCESS, scriptExecutionOperation.Status().GetIssues().ToString());
1875+
1876+
NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver());
1877+
UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Failed, readyOp.Status().GetIssues().ToString());
1878+
UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "Path '/{' contains invalid wildcard:");
1879+
}
1880+
1881+
{ // file pattern validation
1882+
const TString sql = R"(
1883+
SELECT * FROM `/Root/external_data_source`.`/` WITH (
1884+
SCHEMA = (data String),
1885+
FORMAT = "csv_with_names",
1886+
FILE_PATTERN = "{"
1887+
))";
1888+
1889+
auto scriptExecutionOperation = db.ExecuteScript(sql).ExtractValueSync();
1890+
UNIT_ASSERT_VALUES_EQUAL_C(scriptExecutionOperation.Status().GetStatus(), EStatus::SUCCESS, scriptExecutionOperation.Status().GetIssues().ToString());
1891+
1892+
NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver());
1893+
UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Failed, readyOp.Status().GetIssues().ToString());
1894+
UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "File pattern '{' contains invalid wildcard:");
1895+
}
1896+
}
18431897
}
18441898

18451899
} // namespace NKikimr::NKqp

ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -215,6 +215,28 @@ Y_UNIT_TEST_SUITE(KqpFederatedSchemeTest) {
215215
};
216216
TestInvalidDropForExternalTableWithAuth(queryClientExecutor, "generic_query");
217217
}
218+
219+
Y_UNIT_TEST(ExternalTableDdlLocationValidation) {
220+
auto kikimr = NTestUtils::MakeKikimrRunner();
221+
auto db = kikimr->GetTableClient();
222+
auto session = db.CreateSession().GetValueSync().GetSession();
223+
auto query = TStringBuilder() << R"(
224+
CREATE EXTERNAL DATA SOURCE `/Root/ExternalDataSource` WITH (
225+
SOURCE_TYPE="ObjectStorage",
226+
LOCATION="my-bucket",
227+
AUTH_METHOD="NONE"
228+
);
229+
CREATE EXTERNAL TABLE `/Root/ExternalTable` (
230+
Key Uint64,
231+
Value String
232+
) WITH (
233+
DATA_SOURCE="/Root/ExternalDataSource",
234+
LOCATION="{"
235+
);)";
236+
auto result = session.ExecuteSchemeQuery(query).GetValueSync();
237+
UNIT_ASSERT_VALUES_EQUAL(result.GetStatus(), EStatus::SCHEME_ERROR);
238+
UNIT_ASSERT_STRING_CONTAINS(result.GetIssues().ToString(), "Location '{' contains invalid wildcard:");
239+
}
218240
}
219241

220242
} // namespace NKikimr::NKqp

ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -5146,7 +5146,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) {
51465146
month Int64 NOT NULL
51475147
) WITH (
51485148
DATA_SOURCE=")" << externalDataSourceName << R"(",
5149-
LOCATION="/folder1/*",
5149+
LOCATION="/folder1/",
51505150
FORMAT="json_as_string",
51515151
`projection.enabled`="true",
51525152
`projection.year.type`="integer",
@@ -5171,7 +5171,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) {
51715171
UNIT_ASSERT(externalTable.ExternalTableInfo);
51725172
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.ColumnsSize(), 4);
51735173
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetDataSourcePath(), externalDataSourceName);
5174-
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/*");
5174+
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/");
51755175
}
51765176

51775177
Y_UNIT_TEST(CreateExternalTableWithUpperCaseSettings) {
@@ -5194,7 +5194,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) {
51945194
Month Int64 NOT NULL
51955195
) WITH (
51965196
DATA_SOURCE=")" << externalDataSourceName << R"(",
5197-
LOCATION="/folder1/*",
5197+
LOCATION="/folder1/",
51985198
FORMAT="json_as_string",
51995199
`projection.enabled`="true",
52005200
`projection.Year.type`="integer",
@@ -5219,7 +5219,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) {
52195219
UNIT_ASSERT(externalTable.ExternalTableInfo);
52205220
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.ColumnsSize(), 4);
52215221
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetDataSourcePath(), externalDataSourceName);
5222-
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/*");
5222+
UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/");
52235223
}
52245224

52255225
Y_UNIT_TEST(DoubleCreateExternalTable) {

ydb/core/protos/external_sources.proto

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ message TSchema {
1111

1212
message TGeneral {
1313
map<string, string> attributes = 1 [(Ydb.size).le = 100];
14+
optional string location = 2;
1415
}
1516

1617
message TObjectStorage {

ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,7 @@ std::pair<TPathFilter, TEarlyStopChecker> MakeFilterRegexp(const TString& regex,
4747
} else {
4848
re = std::make_shared<RE2>(re2::StringPiece(regex), RE2::Options());
4949
}
50+
Y_ENSURE(re->ok());
5051

5152
const size_t numGroups = re->NumberOfCapturingGroups();
5253
YQL_CLOG(DEBUG, ProviderS3)

0 commit comments

Comments
 (0)