Skip to content

Commit 735f365

Browse files
Ignore columns with empty names during type inference (#9278)
1 parent e6d795f commit 735f365

File tree

2 files changed

+53
-0
lines changed

2 files changed

+53
-0
lines changed

ydb/core/external_sources/object_storage/inference/arrow_inferencinator.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,9 @@ class TArrowInferencinator : public NActors::TActorBootstrapped<TArrowInferencin
325325
auto& arrowFields = std::get<ArrowFields>(mbArrowFields);
326326
std::vector<Ydb::Column> ydbFields;
327327
for (const auto& field : arrowFields) {
328+
if (field->name().empty()) {
329+
continue;
330+
}
328331
ydbFields.emplace_back();
329332
auto& ydbField = ydbFields.back();
330333
if (!ArrowToYdbType(*ydbField.mutable_type(), *field->type(), file.Config)) {

ydb/tests/fq/s3/test_s3_0.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,56 @@ def test_inference_projection(self, kikimr, s3, client, unique_prefix):
574574
assert result_set.rows[2].items[3].uint32_value == 19663
575575
assert sum(kikimr.control_plane.get_metering(1)) == 10
576576

577+
@yq_v2
578+
@pytest.mark.parametrize("client", [{"folder_id": "my_folder"}], indirect=True)
579+
def test_inference_null_column_name(self, kikimr, s3, client, unique_prefix):
580+
resource = boto3.resource(
581+
"s3", endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key"
582+
)
583+
584+
bucket = resource.Bucket("fbucket")
585+
bucket.create(ACL='public-read')
586+
bucket.objects.all().delete()
587+
588+
s3_client = boto3.client(
589+
"s3", endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key"
590+
)
591+
592+
fruits = ''',Fruit,Price
593+
1,Banana,3
594+
2,Apple,2
595+
3,Pear,15'''
596+
s3_client.put_object(Body=fruits, Bucket='fbucket', Key='fruits.csv', ContentType='text/plain')
597+
kikimr.control_plane.wait_bootstrap(1)
598+
storage_connection_name = unique_prefix + "fruitbucket"
599+
client.create_storage_connection(storage_connection_name, "fbucket")
600+
601+
sql = f'''
602+
SELECT *
603+
FROM `{storage_connection_name}`.`fruits.csv`
604+
WITH (format=csv_with_names, with_infer='true');
605+
'''
606+
607+
query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id
608+
client.wait_query_status(query_id, fq.QueryMeta.COMPLETED)
609+
610+
data = client.get_result_data(query_id)
611+
result_set = data.result.result_set
612+
logging.debug(str(result_set))
613+
assert len(result_set.columns) == 2
614+
assert result_set.columns[0].name == "Fruit"
615+
assert result_set.columns[0].type.type_id == ydb.Type.UTF8
616+
assert result_set.columns[1].name == "Price"
617+
assert result_set.columns[1].type.optional_type.item.type_id == ydb.Type.INT64
618+
assert len(result_set.rows) == 3
619+
assert result_set.rows[0].items[0].text_value == "Banana"
620+
assert result_set.rows[0].items[1].int64_value == 3
621+
assert result_set.rows[1].items[0].text_value == "Apple"
622+
assert result_set.rows[1].items[1].int64_value == 2
623+
assert result_set.rows[2].items[0].text_value == "Pear"
624+
assert result_set.rows[2].items[1].int64_value == 15
625+
assert sum(kikimr.control_plane.get_metering(1)) == 10
626+
577627
@yq_all
578628
@pytest.mark.parametrize("client", [{"folder_id": "my_folder"}], indirect=True)
579629
def test_csv_with_hopping(self, kikimr, s3, client, unique_prefix):

0 commit comments

Comments
 (0)