18
18
#include < ydb/library/yql/providers/s3/object_listers/yql_s3_path.h>
19
19
#include < ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.h>
20
20
#include < ydb/library/yql/providers/s3/proto/credentials.pb.h>
21
+ #include < ydb/library/yql/utils/yql_panic.h>
21
22
#include < ydb/public/api/protos/ydb_status_codes.pb.h>
22
23
#include < ydb/public/sdk/cpp/client/ydb_value/value.h>
23
24
@@ -332,44 +333,87 @@ struct TObjectStorageExternalSource : public IExternalSource {
332
333
333
334
const TString path = meta->TableLocation ;
334
335
const TString filePattern = meta->Attributes .Value (" filepattern" , TString{});
336
+ const TString projection = meta->Attributes .Value (" projection" , TString{});
335
337
const TVector<TString> partitionedBy = GetPartitionedByConfig (meta);
338
+
339
+ NYql::NPathGenerator::TPathGeneratorPtr pathGenerator;
340
+
341
+ bool shouldInferPartitions = !partitionedBy.empty () && !projection;
342
+ bool ignoreEmptyListings = !projection.empty ();
343
+
336
344
NYql::NS3Lister::TListingRequest request {
337
345
.Url = meta->DataSourceLocation ,
338
346
.Credentials = credentials
339
347
};
348
+ TVector<NYql::NS3Lister::TListingRequest> requests;
349
+
350
+ if (!projection) {
351
+ auto error = NYql::NS3::BuildS3FilePattern (path, filePattern, partitionedBy, request);
352
+ if (error) {
353
+ throw yexception () << *error;
354
+ }
355
+ requests.push_back (request);
356
+ } else {
357
+ if (NYql::NS3::HasWildcards (path)) {
358
+ throw yexception () << " Path prefix: '" << path << " ' contains wildcards" ;
359
+ }
340
360
341
- auto error = NYql::NS3::BuildS3FilePattern (path, filePattern, partitionedBy, request);
342
- if (error) {
343
- throw yexception () << *error;
361
+ pathGenerator = NYql::NPathGenerator::CreatePathGenerator (projection, partitionedBy);
362
+ for (const auto & rule : pathGenerator->GetRules ()) {
363
+ YQL_ENSURE (rule.ColumnValues .size () == partitionedBy.size ());
364
+
365
+ request.Pattern = NYql::NS3::NormalizePath (TStringBuilder () << path << " /" << rule.Path << " /*" );
366
+ request.PatternType = NYql::NS3Lister::ES3PatternType::Wildcard;
367
+ request.Prefix = request.Pattern .substr (0 , NYql::NS3::GetFirstWildcardPos (request.Pattern ));
368
+
369
+ requests.push_back (request);
370
+ }
344
371
}
345
372
346
373
auto partByData = std::make_shared<TStringBuilder>();
374
+ if (shouldInferPartitions) {
375
+ *partByData << JoinSeq (" ," , partitionedBy);
376
+ }
347
377
378
+ TVector<NThreading::TFuture<NYql::NS3Lister::TListResult>> futures;
348
379
auto httpGateway = NYql::IHTTPGateway::Make ();
349
380
auto httpRetryPolicy = NYql::GetHTTPDefaultRetryPolicy (NYql::THttpRetryPolicyOptions{.RetriedCurlCodes = NYql::FqRetriedCurlCodes ()});
350
- auto s3Lister = NYql::NS3Lister::MakeS3Lister (httpGateway, httpRetryPolicy, request, Nothing (), AllowLocalFiles, ActorSystem);
351
- auto afterListing = s3Lister->Next ().Apply ([partByData, partitionedBy, path = request.Pattern ](const NThreading::TFuture<NYql::NS3Lister::TListResult>& listResFut) {
352
- auto & listRes = listResFut.GetValue ();
353
- auto & partByRef = *partByData;
354
- if (std::holds_alternative<NYql::NS3Lister::TListError>(listRes)) {
355
- auto & error = std::get<NYql::NS3Lister::TListError>(listRes);
356
- throw yexception () << error.Issues .ToString ();
357
- }
358
- auto & entries = std::get<NYql::NS3Lister::TListEntries>(listRes);
359
- if (entries.Objects .empty ()) {
360
- throw yexception () << " couldn't find files at " << path;
361
- }
381
+ for (const auto & req : requests) {
382
+ auto s3Lister = NYql::NS3Lister::MakeS3Lister (httpGateway, httpRetryPolicy, req, Nothing (), AllowLocalFiles, ActorSystem);
383
+ futures.push_back (s3Lister->Next ());
384
+ }
362
385
363
- partByRef << JoinSeq (" ," , partitionedBy);
364
- for (const auto & entry : entries.Objects ) {
365
- Y_ENSURE (entry.MatchedGlobs .size () == partitionedBy.size ());
366
- partByRef << Endl << JoinSeq (" ," , entry.MatchedGlobs );
367
- }
368
- for (const auto & entry : entries.Objects ) {
369
- if (entry.Size > 0 ) {
370
- return entry;
386
+ auto allFuture = NThreading::WaitExceptionOrAll (futures);
387
+ auto afterListing = allFuture.Apply ([partByData, shouldInferPartitions, ignoreEmptyListings, futures = std::move (futures), requests = std::move (requests)](const NThreading::TFuture<void >& result) {
388
+ result.GetValue ();
389
+ for (size_t i = 0 ; i < futures.size (); ++i) {
390
+ auto & listRes = futures[i].GetValue ();
391
+ if (std::holds_alternative<NYql::NS3Lister::TListError>(listRes)) {
392
+ auto & error = std::get<NYql::NS3Lister::TListError>(listRes);
393
+ throw yexception () << error.Issues .ToString ();
394
+ }
395
+ auto & entries = std::get<NYql::NS3Lister::TListEntries>(listRes);
396
+ if (entries.Objects .empty () && !ignoreEmptyListings) {
397
+ throw yexception () << " couldn't find files at " << requests[i].Pattern ;
398
+ }
399
+
400
+ if (shouldInferPartitions) {
401
+ for (const auto & entry : entries.Objects ) {
402
+ *partByData << Endl << JoinSeq (" ," , entry.MatchedGlobs );
403
+ }
404
+ }
405
+
406
+ for (const auto & entry : entries.Objects ) {
407
+ if (entry.Size > 0 ) {
408
+ return entry;
409
+ }
410
+ }
411
+
412
+ if (!ignoreEmptyListings) {
413
+ throw yexception () << " couldn't find any files for type inference, please check that the right path is provided" ;
371
414
}
372
415
}
416
+
373
417
throw yexception () << " couldn't find any files for type inference, please check that the right path is provided" ;
374
418
});
375
419
@@ -412,13 +456,45 @@ struct TObjectStorageExternalSource : public IExternalSource {
412
456
));
413
457
414
458
return promise.GetFuture ();
415
- }).Apply ([arrowInferencinatorId, meta, partByData, partitionedBy, this ](const NThreading::TFuture<TMetadataResult>& result) {
459
+ }).Apply ([arrowInferencinatorId, meta, partByData, partitionedBy, pathGenerator, this ](const NThreading::TFuture<TMetadataResult>& result) {
416
460
auto & value = result.GetValue ();
417
461
if (!value.Success ()) {
418
462
return result;
419
463
}
420
464
421
- return InferPartitionedColumnsTypes (arrowInferencinatorId, partByData, partitionedBy, result);
465
+ auto meta = value.Metadata ;
466
+ if (pathGenerator) {
467
+ for (const auto & rule : pathGenerator->GetConfig ().Rules ) {
468
+ auto & destColumn = *meta->Schema .add_column ();
469
+ destColumn.mutable_name ()->assign (rule.Name );
470
+ switch (rule.Type ) {
471
+ case NYql::NPathGenerator::IPathGenerator::EType::INTEGER:
472
+ destColumn.mutable_type ()->set_type_id (Ydb::Type::INT64);
473
+ break ;
474
+
475
+ case NYql::NPathGenerator::IPathGenerator::EType::DATE:
476
+ destColumn.mutable_type ()->set_type_id (Ydb::Type::DATE);
477
+ break ;
478
+
479
+ case NYql::NPathGenerator::IPathGenerator::EType::ENUM:
480
+ default :
481
+ destColumn.mutable_type ()->set_type_id (Ydb::Type::STRING);
482
+ break ;
483
+ }
484
+ }
485
+ } else {
486
+ for (const auto & partitionName : partitionedBy) {
487
+ auto & destColumn = *meta->Schema .add_column ();
488
+ destColumn.mutable_name ()->assign (partitionName);
489
+ destColumn.mutable_type ()->set_type_id (Ydb::Type::UTF8);
490
+ }
491
+ }
492
+
493
+ if (!partitionedBy.empty () && !pathGenerator) {
494
+ return InferPartitionedColumnsTypes (arrowInferencinatorId, partByData, result);
495
+ }
496
+
497
+ return result;
422
498
}).Apply ([](const NThreading::TFuture<TMetadataResult>& result) {
423
499
auto & value = result.GetValue ();
424
500
if (value.Success ()) {
@@ -436,20 +512,10 @@ struct TObjectStorageExternalSource : public IExternalSource {
436
512
NThreading::TFuture<TMetadataResult> InferPartitionedColumnsTypes (
437
513
NActors::TActorId arrowInferencinatorId,
438
514
std::shared_ptr<TStringBuilder> partByData,
439
- const TVector<TString>& partitionedBy,
440
515
const NThreading::TFuture<TMetadataResult>& result) const {
441
516
442
517
auto & value = result.GetValue ();
443
- if (partitionedBy.empty ()) {
444
- return result;
445
- }
446
-
447
518
auto meta = value.Metadata ;
448
- for (const auto & partitionName : partitionedBy) {
449
- auto & destColumn = *meta->Schema .add_column ();
450
- destColumn.mutable_name ()->assign (partitionName);
451
- destColumn.mutable_type ()->set_type_id (Ydb::Type::UTF8);
452
- }
453
519
454
520
arrow::BufferBuilder builder;
455
521
auto partitionBuffer = std::make_shared<arrow::Buffer>(nullptr , 0 );
@@ -500,15 +566,19 @@ struct TObjectStorageExternalSource : public IExternalSource {
500
566
THashSet<TString> columns;
501
567
if (auto partitioned = meta->Attributes .FindPtr (" partitionedby" ); partitioned) {
502
568
NJson::TJsonValue values;
503
- Y_ENSURE (NJson::ReadJsonTree (*partitioned, &values));
504
- Y_ENSURE (values.GetType () == NJson::JSON_ARRAY);
569
+ auto successful = NJson::ReadJsonTree (*partitioned, &values);
570
+ if (!successful) {
571
+ columns.insert (*partitioned);
572
+ } else {
573
+ Y_ENSURE (values.GetType () == NJson::JSON_ARRAY);
505
574
506
- for (const auto & value : values.GetArray ()) {
507
- Y_ENSURE (value.GetType () == NJson::JSON_STRING);
508
- if (columns.contains (value.GetString ())) {
509
- throw yexception () << " invalid partitioned_by parameter, column " << value.GetString () << " mentioned twice" ;
575
+ for (const auto & value : values.GetArray ()) {
576
+ Y_ENSURE (value.GetType () == NJson::JSON_STRING);
577
+ if (columns.contains (value.GetString ())) {
578
+ throw yexception () << " invalid partitioned_by parameter, column " << value.GetString () << " mentioned twice" ;
579
+ }
580
+ columns.insert (value.GetString ());
510
581
}
511
- columns.insert (value.GetString ());
512
582
}
513
583
}
514
584
0 commit comments