Commit 243cef0

Timestamp arrow -> clickhouse parsing fix (#8547)
1 parent 5f2c357

4 files changed (+82, -1 lines)

ydb/core/external_sources/object_storage/inference/arrow_inferencinator.cpp

Lines changed: 5 additions & 1 deletion
@@ -101,7 +101,11 @@ bool ArrowToYdbType(Ydb::Type& maybeOptionalType, const arrow::DataType& type, s
         resType.set_type_id(Ydb::Type::DATETIME64);
         return true;
     case arrow::Type::TIMESTAMP:
-        resType.set_type_id(Ydb::Type::TIMESTAMP);
+        if (config->Format == EFileFormat::JsonEachRow || config->Format == EFileFormat::JsonList) {
+            maybeOptionalType.set_type_id(Ydb::Type::UTF8);
+        } else {
+            resType.set_type_id(Ydb::Type::TIMESTAMP);
+        }
         return true;
     case arrow::Type::TIME32: // TODO: is there anything?
         return false;
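
Why the JSON branch downgrades to UTF8: Arrow's readers can infer TIMESTAMP for timestamp-like strings during schema sampling, but the ClickHouse-based JSON reader used at execution time cannot parse those values back (the mismatch this commit fixes), so JSON formats advertise plain UTF8 instead. A minimal pyarrow sketch of the inference side (pyarrow stands in here for the vendored C++ Arrow; the exact inferred unit may vary by version):

import io

import pyarrow.json as paj

# One of the JSON rows from the test below; Arrow's JSON reader attempts
# ISO8601-style timestamp inference on string values during sampling.
data = b'{ "b" : "2024-08-29 10:20:30" }\n' * 3
table = paj.read_json(io.BytesIO(data))
print(table.schema)  # "b" may come out as timestamp[s] rather than string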

ydb/core/external_sources/object_storage/inference/infer_config.cpp

Lines changed: 4 additions & 0 deletions
@@ -1,5 +1,7 @@
 #include "infer_config.h"
 
+#include <contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h>
+
 namespace NKikimr::NExternalSource::NObjectStorage::NInference {
 
 namespace {
@@ -12,12 +14,14 @@ std::shared_ptr<FormatConfig> MakeCsvConfig(const THashMap<TString, TString>& pa
         }
         config->ParseOpts.delimiter = (*delimiter)[0];
     }
+    config->ConvOpts.timestamp_parsers.push_back(arrow::TimestampParser::MakeStrptime("%Y-%m-%d %H:%M:%S"));
     return config;
 }
 
 std::shared_ptr<FormatConfig> MakeTsvConfig(const THashMap<TString, TString>&) {
     auto config = std::make_shared<TsvConfig>();
     config->ParseOpts.delimiter = '\t';
+    config->ConvOpts.timestamp_parsers.push_back(arrow::TimestampParser::MakeStrptime("%Y-%m-%d %H:%M:%S"));
     return config;
 }
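
The two push_back calls register an extra strptime pattern alongside Arrow's default ISO8601 parser, so space-separated timestamps such as "2024-08-29 10:20:30" are inferred as TIMESTAMP when sampling CSV/TSV data. A hedged sketch of the same mechanism via pyarrow, whose ConvertOptions.timestamp_parsers mirrors the C++ ConvOpts.timestamp_parsers used above:

import io

import pyarrow.csv as pacsv

# Parsers are tried in order: Arrow's built-in ISO8601 parser first, then
# the strptime pattern registered in the diff above.
opts = pacsv.ConvertOptions(timestamp_parsers=[pacsv.ISO8601, "%Y-%m-%d %H:%M:%S"])
data = b"b\n2024-08-29 10:20:30\n"
table = pacsv.read_csv(io.BytesIO(data), convert_options=opts)
print(table.schema)  # b: timestamp[s], inferred via a matching parser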

ydb/core/external_sources/object_storage/inference/ya.make

Lines changed: 5 additions & 0 deletions
@@ -6,6 +6,11 @@ ADDINCL(
     ydb/library/yql/udfs/common/clickhouse/client/src
 )
 
+# Added because of library header contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h
+CFLAGS(
+    -Wno-unused-parameter
+)
+
 SRCS(
     arrow_fetcher.cpp
     arrow_inferencinator.cpp

ydb/tests/fq/s3/test_s3_0.py

Lines changed: 68 additions & 0 deletions
@@ -420,6 +420,74 @@ def test_inference_parameters(self, kikimr, s3, client, unique_prefix):
         assert result_set.rows[2].items[5].int64_value == 10
         assert sum(kikimr.control_plane.get_metering(1)) == 10
 
+    @yq_v2
+    @pytest.mark.parametrize("client", [{"folder_id": "my_folder"}], indirect=True)
+    def test_inference_timestamp(self, kikimr, s3, client, unique_prefix):
+        resource = boto3.resource(
+            "s3", endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key"
+        )
+
+        bucket = resource.Bucket("fbucket")
+        bucket.create(ACL='public-read')
+        bucket.objects.all().delete()
+
+        s3_client = boto3.client(
+            "s3", endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key"
+        )
+
+        csv_data = '''a,b,c
+2024-08-29,2024-08-29 10:20:30,2024-08-29T10:20:30.01
+2024-08-29,2024-08-29 10:20:30,2024-08-29T10:20:30.01
+2024-08-29,2024-08-29 10:20:30,2024-08-29T10:20:30.01'''
+        json_data = '''{ "a" : "2024-08-29", "b" : "2024-08-29 10:20:30", "c" : "2024-08-29T10:20:30.01" }
+{ "a" : "2024-08-29", "b" : "2024-08-29 10:20:30", "c" : "2024-08-29T10:20:30.01" }
+{ "a" : "2024-08-29", "b" : "2024-08-29 10:20:30", "c" : "2024-08-29T10:20:30.01" }'''
+        s3_client.put_object(Body=csv_data, Bucket='fbucket', Key='timestamp.csv', ContentType='text/plain')
+        s3_client.put_object(Body=json_data, Bucket='fbucket', Key='timestamp.json', ContentType='text/plain')
+        kikimr.control_plane.wait_bootstrap(1)
+        storage_connection_name = unique_prefix + "fruitbucket"
+        client.create_storage_connection(storage_connection_name, "fbucket")
+
+        sql = f'''
+            SELECT *
+            FROM `{storage_connection_name}`.`timestamp.csv`
+            WITH (format=csv_with_names, with_infer='true');
+            '''
+
+        query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id
+        client.wait_query_status(query_id, fq.QueryMeta.COMPLETED)
+
+        data = client.get_result_data(query_id)
+        result_set = data.result.result_set
+        logging.debug(str(result_set))
+        assert len(result_set.columns) == 3
+        assert result_set.columns[0].name == "a"
+        assert result_set.columns[0].type.optional_type.item.type_id == ydb.Type.DATE
+        assert result_set.columns[1].name == "b"
+        assert result_set.columns[1].type.optional_type.item.type_id == ydb.Type.TIMESTAMP
+        assert result_set.columns[2].name == "c"
+        assert result_set.columns[2].type.type_id == ydb.Type.UTF8
+
+        sql = f'''
+            SELECT *
+            FROM `{storage_connection_name}`.`timestamp.json`
+            WITH (format=json_each_row, with_infer='true');
+            '''
+
+        query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id
+        client.wait_query_status(query_id, fq.QueryMeta.COMPLETED)
+
+        data = client.get_result_data(query_id)
+        result_set = data.result.result_set
+        logging.debug(str(result_set))
+        assert len(result_set.columns) == 3
+        assert result_set.columns[0].name == "a"
+        assert result_set.columns[0].type.type_id == ydb.Type.UTF8
+        assert result_set.columns[1].name == "b"
+        assert result_set.columns[1].type.type_id == ydb.Type.UTF8
+        assert result_set.columns[2].name == "c"
+        assert result_set.columns[2].type.type_id == ydb.Type.UTF8
+
     @yq_all
     @pytest.mark.parametrize("client", [{"folder_id": "my_folder"}], indirect=True)
     def test_csv_with_hopping(self, kikimr, s3, client, unique_prefix):
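
Since JSON inference now surfaces timestamp-like strings as Utf8 (as the assertions above check), a query that needs real timestamps has to parse them explicitly. An illustrative follow-up in the test's style, not part of the commit; DateTime::Parse and DateTime::MakeTimestamp are standard YQL UDF calls, but this particular query is an assumption:

# Hypothetical helper, not part of the commit: parse the Utf8 column "b"
# back into a Timestamp with YQL's DateTime UDF.
def timestamp_parse_sql(storage_connection_name):
    return f'''
        $parse = DateTime::Parse("%Y-%m-%d %H:%M:%S");
        SELECT DateTime::MakeTimestamp($parse(b)) AS b
        FROM `{storage_connection_name}`.`timestamp.json`
        WITH (format=json_each_row, with_infer='true');
    '''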
