Skip to content

Commit 293e60f

Browse files
Additional validation for queries with "json_list" format and datetime types (#11346)
1 parent d6142ce commit 293e60f

File tree

4 files changed

+181
-0
lines changed

4 files changed

+181
-0
lines changed

ydb/core/external_sources/object_storage.cpp

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ struct TObjectStorageExternalSource : public IExternalSource {
150150
}
151151
const bool hasPartitioning = objectStorage.projection_size() || objectStorage.partitioned_by_size();
152152
issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting(), location, hasPartitioning));
153+
issues.AddIssues(ValidateJsonListFormat(objectStorage.format(), schema, objectStorage.partitioned_by()));
153154
issues.AddIssues(ValidateRawFormat(objectStorage.format(), schema, objectStorage.partitioned_by()));
154155
if (hasPartitioning) {
155156
if (NYql::NS3::HasWildcards(location)) {
@@ -263,6 +264,30 @@ struct TObjectStorageExternalSource : public IExternalSource {
263264
return issues;
264265
}
265266

267+
template<typename TScheme>
268+
static NYql::TIssues ValidateJsonListFormat(const TString& format, const TScheme& schema, const google::protobuf::RepeatedPtrField<TString>& partitionedBy) {
269+
NYql::TIssues issues;
270+
if (format != "json_list"sv) {
271+
return issues;
272+
}
273+
274+
TSet<TString> partitionedBySet{partitionedBy.begin(), partitionedBy.end()};
275+
276+
for (const auto& column: schema.column()) {
277+
if (partitionedBySet.contains(column.name())) {
278+
continue;
279+
}
280+
if (ValidateDateOrTimeType(column.type())) {
281+
issues.AddIssue(MakeErrorIssue(
282+
Ydb::StatusIds::BAD_REQUEST,
283+
TStringBuilder{} << "Date, Timestamp and Interval types are not allowed in json_list format (you have '"
284+
<< column.name() << " " << NYdb::TType(column.type()).ToString() << "' field)"));
285+
}
286+
}
287+
288+
return issues;
289+
}
290+
266291
template<typename TScheme>
267292
static NYql::TIssues ValidateRawFormat(const TString& format, const TScheme& schema, const google::protobuf::RepeatedPtrField<TString>& partitionedBy) {
268293
NYql::TIssues issues;
@@ -800,6 +825,50 @@ struct TObjectStorageExternalSource : public IExternalSource {
800825
return FindIf(availableTypes, [&columnType](const auto& availableType) { return NYdb::TypesEqual(availableType, columnType); }) != availableTypes.end();
801826
}
802827

828+
static std::vector<NYdb::TType> GetDateOrTimeTypes() {
829+
NYdb::TType dateType = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Date).Build();
830+
NYdb::TType datetimeType = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Datetime).Build();
831+
NYdb::TType timestampType = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Timestamp).Build();
832+
NYdb::TType intervalType = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Interval).Build();
833+
NYdb::TType date32Type = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Date32).Build();
834+
NYdb::TType datetime64Type = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Datetime64).Build();
835+
NYdb::TType timestamp64Type = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Timestamp64).Build();
836+
NYdb::TType interval64Type = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Interval64).Build();
837+
NYdb::TType tzdateType = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::TzDate).Build();
838+
NYdb::TType tzdatetimeType = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::TzDatetime).Build();
839+
NYdb::TType tztimestampType = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::TzTimestamp).Build();
840+
const std::vector<NYdb::TType> result {
841+
dateType,
842+
datetimeType,
843+
timestampType,
844+
intervalType,
845+
date32Type,
846+
datetime64Type,
847+
timestamp64Type,
848+
interval64Type,
849+
tzdateType,
850+
tzdatetimeType,
851+
tztimestampType,
852+
NYdb::TTypeBuilder{}.Optional(dateType).Build(),
853+
NYdb::TTypeBuilder{}.Optional(datetimeType).Build(),
854+
NYdb::TTypeBuilder{}.Optional(timestampType).Build(),
855+
NYdb::TTypeBuilder{}.Optional(intervalType).Build(),
856+
NYdb::TTypeBuilder{}.Optional(date32Type).Build(),
857+
NYdb::TTypeBuilder{}.Optional(datetime64Type).Build(),
858+
NYdb::TTypeBuilder{}.Optional(timestamp64Type).Build(),
859+
NYdb::TTypeBuilder{}.Optional(interval64Type).Build(),
860+
NYdb::TTypeBuilder{}.Optional(tzdateType).Build(),
861+
NYdb::TTypeBuilder{}.Optional(tzdatetimeType).Build(),
862+
NYdb::TTypeBuilder{}.Optional(tztimestampType).Build()
863+
};
864+
return result;
865+
}
866+
867+
static bool ValidateDateOrTimeType(const NYdb::TType& columnType) {
868+
static const std::vector<NYdb::TType> availableTypes = GetDateOrTimeTypes();
869+
return FindIf(availableTypes, [&columnType](const auto& availableType) { return NYdb::TypesEqual(availableType, columnType); }) != availableTypes.end();
870+
}
871+
803872
private:
804873
const std::vector<TRegExMatch> HostnamePatterns;
805874
const size_t PathsLimit;

ydb/core/external_sources/object_storage_ut.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,31 @@ Y_UNIT_TEST_SUITE(ObjectStorageTest) {
3030
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Partition by must always be specified");
3131
}
3232

33+
Y_UNIT_TEST(FailedJsonListValidation) {
34+
static auto invalidTypes = {
35+
Ydb::Type::DATE,
36+
Ydb::Type::DATETIME,
37+
Ydb::Type::TIMESTAMP,
38+
Ydb::Type::INTERVAL,
39+
Ydb::Type::DATE32,
40+
Ydb::Type::DATETIME64,
41+
Ydb::Type::TIMESTAMP64,
42+
Ydb::Type::INTERVAL64,
43+
Ydb::Type::TZ_DATE,
44+
Ydb::Type::TZ_DATETIME,
45+
Ydb::Type::TZ_TIMESTAMP,
46+
};
47+
auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false, false);
48+
NKikimrExternalSources::TSchema schema;
49+
for (const auto typeId : invalidTypes) {
50+
auto newColumn = schema.add_column();
51+
newColumn->mutable_type()->set_type_id(typeId);
52+
}
53+
NKikimrExternalSources::TGeneral general;
54+
general.mutable_attributes()->insert({"format", "json_list"});
55+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Date, Timestamp and Interval types are not allowed in json_list format");
56+
}
57+
3358
Y_UNIT_TEST(WildcardsValidation) {
3459
auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false, false);
3560
NKikimrExternalSources::TSchema schema;

ydb/library/yql/providers/common/provider/yql_provider.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1540,6 +1540,30 @@ bool ValidateFormatForInput(
15401540
return false;
15411541
}
15421542
}
1543+
else if (schemaStructRowType && format == TStringBuf("json_list")) {
1544+
bool failedSchemaColumns = false;
1545+
1546+
for (const TItemExprType* item : schemaStructRowType->GetItems()) {
1547+
if (excludeFields && excludeFields(item->GetName())) {
1548+
continue;
1549+
}
1550+
const TTypeAnnotationNode* rowType = item->GetItemType();
1551+
if (rowType->GetKind() == ETypeAnnotationKind::Optional) {
1552+
rowType = rowType->Cast<TOptionalExprType>()->GetItemType();
1553+
}
1554+
1555+
if (rowType->GetKind() == ETypeAnnotationKind::Data
1556+
&& IsDataTypeDateOrTzDateOrInterval(rowType->Cast<TDataExprType>()->GetSlot())) {
1557+
ctx.AddError(TIssue(TStringBuilder() << "Date, Timestamp and Interval types are not allowed in json_list format (you have '"
1558+
<< item->GetName() << " " << FormatType(rowType) << "' field)"));
1559+
failedSchemaColumns = true;
1560+
}
1561+
}
1562+
1563+
if (failedSchemaColumns) {
1564+
return false;
1565+
}
1566+
}
15431567
return true;
15441568
}
15451569

ydb/tests/fq/s3/test_s3_0.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -668,6 +668,69 @@ def test_inference_unsupported_types(self, kikimr, s3, client, unique_prefix):
668668
assert result_set.rows[2].items[0].int64_value == 30
669669
assert sum(kikimr.control_plane.get_metering(1)) == 10
670670

671+
@yq_v2
672+
@pytest.mark.parametrize("client", [{"folder_id": "my_folder"}], indirect=True)
673+
def test_json_list_formats(self, kikimr, s3, client, unique_prefix):
674+
resource = boto3.resource(
675+
"s3", endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key"
676+
)
677+
678+
bucket = resource.Bucket("fbucket")
679+
bucket.create(ACL='public-read')
680+
bucket.objects.all().delete()
681+
682+
s3_client = boto3.client(
683+
"s3", endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key"
684+
)
685+
686+
fruits = '''[
687+
{ "date" : "", "datetime" : "", "timestamp" : "", "interval" : "", "date32" : "", "datetime64" : "", "timestamp64" : "", "interval64" : "", "tzDate" : "", "tzDateTime" : "", "tzTimestamp" : "" },
688+
{ "date" : "", "datetime" : "", "timestamp" : "", "interval" : "", "date32" : "", "datetime64" : "", "timestamp64" : "", "interval64" : "", "tzDate" : "", "tzDateTime" : "", "tzTimestamp" : "" },
689+
{ "date" : "", "datetime" : "", "timestamp" : "", "interval" : "", "date32" : "", "datetime64" : "", "timestamp64" : "", "interval64" : "", "tzDate" : "", "tzDateTime" : "", "tzTimestamp" : "" }
690+
]'''
691+
s3_client.put_object(Body=fruits, Bucket='fbucket', Key='timestamp.json', ContentType='text/plain')
692+
693+
kikimr.control_plane.wait_bootstrap(1)
694+
storage_connection_name = unique_prefix + "fruitbucket"
695+
client.create_storage_connection(storage_connection_name, "fbucket")
696+
697+
sql = f'''
698+
SELECT *
699+
FROM `{storage_connection_name}`.`/timestamp.json`
700+
WITH (
701+
format="json_list",
702+
schema=(
703+
`date` date,
704+
`datetime` datetime,
705+
`timestamp` timestamp,
706+
`interval` interval,
707+
`date32` date32,
708+
`datetime64` datetime64,
709+
`timestamp64` timestamp64,
710+
`interval64` interval64,
711+
`tzDate` tzDate,
712+
`tzDateTime` tzDateTime,
713+
`tzTimestamp` tzTimestamp
714+
));
715+
'''
716+
717+
query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id
718+
client.wait_query_status(query_id, fq.QueryMeta.FAILED)
719+
720+
error_message = str(client.describe_query(query_id).result)
721+
assert "Date, Timestamp and Interval types are not allowed in json_list format" in error_message
722+
assert "Date" in error_message
723+
assert "Datetime" in error_message
724+
assert "Timestamp" in error_message
725+
assert "Interval" in error_message
726+
assert "Date32" in error_message
727+
assert "Datetime64" in error_message
728+
assert "Timestamp64" in error_message
729+
assert "Interval64" in error_message
730+
assert "TzDate" in error_message
731+
assert "TzDatetime" in error_message
732+
assert "TzTimestamp" in error_message
733+
671734
@yq_all
672735
@pytest.mark.parametrize("client", [{"folder_id": "my_folder"}], indirect=True)
673736
def test_csv_with_hopping(self, kikimr, s3, client, unique_prefix):

0 commit comments

Comments
 (0)