Skip to content

Commit f179f77

Browse files
Merging inference updates (ydb-platform#7929)
1 parent 9e45694 commit f179f77

24 files changed

+346
-122
lines changed

ydb/core/external_sources/object_storage.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,7 @@ struct TObjectStorageExternalSource : public IExternalSource {
333333
}
334334
for (const auto& entry : entries.Objects) {
335335
if (entry.Size > 0) {
336-
return entry.Path;
336+
return entry;
337337
}
338338
}
339339
throw yexception() << "couldn't find any files for type inference, please check that the right path is provided";
@@ -349,30 +349,31 @@ struct TObjectStorageExternalSource : public IExternalSource {
349349
meta->Attributes.erase("withinfer");
350350

351351
auto fileFormat = NObjectStorage::NInference::ConvertFileFormat(*format);
352-
auto arrowFetcherId = ActorSystem->Register(NObjectStorage::NInference::CreateArrowFetchingActor(s3FetcherId, fileFormat));
352+
auto arrowFetcherId = ActorSystem->Register(NObjectStorage::NInference::CreateArrowFetchingActor(s3FetcherId, fileFormat, meta->Attributes));
353353
auto arrowInferencinatorId = ActorSystem->Register(NObjectStorage::NInference::CreateArrowInferencinator(arrowFetcherId, fileFormat, meta->Attributes));
354354

355-
return afterListing.Apply([arrowInferencinatorId, meta, actorSystem = ActorSystem](const NThreading::TFuture<TString>& pathFut) {
355+
return afterListing.Apply([arrowInferencinatorId, meta, actorSystem = ActorSystem](const NThreading::TFuture<NYql::NS3Lister::TObjectListEntry>& entryFut) {
356356
auto promise = NThreading::NewPromise<TMetadataResult>();
357357
auto schemaToMetadata = [meta](NThreading::TPromise<TMetadataResult> metaPromise, NObjectStorage::TEvInferredFileSchema&& response) {
358358
if (!response.Status.IsSuccess()) {
359359
metaPromise.SetValue(NYql::NCommon::ResultFromError<TMetadataResult>(response.Status.GetIssues()));
360360
return;
361361
}
362-
TMetadataResult result;
363362
meta->Changed = true;
364363
meta->Schema.clear_column();
365364
for (const auto& column : response.Fields) {
366365
auto& destColumn = *meta->Schema.add_column();
367366
destColumn = column;
368367
}
368+
TMetadataResult result;
369369
result.SetSuccess();
370370
result.Metadata = meta;
371371
metaPromise.SetValue(std::move(result));
372372
};
373+
auto [path, size, _] = entryFut.GetValue();
373374
actorSystem->Register(new NKqp::TActorRequestHandler<NObjectStorage::TEvInferFileSchema, NObjectStorage::TEvInferredFileSchema, TMetadataResult>(
374375
arrowInferencinatorId,
375-
new NObjectStorage::TEvInferFileSchema(TString{pathFut.GetValue()}),
376+
new NObjectStorage::TEvInferFileSchema(TString{path}, size),
376377
promise,
377378
std::move(schemaToMetadata)
378379
));

ydb/core/external_sources/object_storage/events.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,11 +119,13 @@ struct TEvArrowFile : public NActors::TEventLocal<TEvArrowFile, EvArrowFile> {
119119
};
120120

121121
struct TEvInferFileSchema : public NActors::TEventLocal<TEvInferFileSchema, EvInferFileSchema> {
122-
explicit TEvInferFileSchema(TString&& path)
122+
explicit TEvInferFileSchema(TString&& path, ui64 size)
123123
: Path{std::move(path)}
124+
, Size{size}
124125
{}
125126

126127
TString Path;
128+
ui64 Size = 0;
127129
};
128130

129131
struct TEvInferredFileSchema : public NActors::TEventLocal<TEvInferredFileSchema, EvInferredFileSchema> {

ydb/core/external_sources/object_storage/inference/arrow_fetcher.cpp

Lines changed: 132 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,24 +6,32 @@
66
#include <arrow/csv/chunker.h>
77
#include <arrow/csv/options.h>
88
#include <arrow/io/memory.h>
9+
#include <arrow/util/endian.h>
910

1011
#include <util/generic/guid.h>
1112
#include <util/generic/size_literals.h>
1213

1314
#include <ydb/core/external_sources/object_storage/events.h>
1415
#include <ydb/library/actors/core/actor_bootstrapped.h>
1516
#include <ydb/library/actors/core/hfunc.h>
17+
#include <ydb/library/yql/providers/s3/compressors/factory.h>
18+
#include <ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadBufferFromString.h>
1619

1720
namespace NKikimr::NExternalSource::NObjectStorage::NInference {
1821

1922
class TArrowFileFetcher : public NActors::TActorBootstrapped<TArrowFileFetcher> {
2023
static constexpr uint64_t PrefixSize = 10_MB;
2124
public:
22-
TArrowFileFetcher(NActors::TActorId s3FetcherId, EFileFormat format)
25+
TArrowFileFetcher(NActors::TActorId s3FetcherId, EFileFormat format, const THashMap<TString, TString>& params)
2326
: S3FetcherId_{s3FetcherId}
2427
, Format_{format}
2528
{
2629
Y_ABORT_UNLESS(IsArrowInferredFormat(Format_));
30+
31+
auto decompression = params.FindPtr("compression");
32+
if (decompression) {
33+
DecompressionFormat_ = *decompression;
34+
}
2735
}
2836

2937
void Bootstrap() {
@@ -40,15 +48,20 @@ class TArrowFileFetcher : public NActors::TActorBootstrapped<TArrowFileFetcher>
4048
const auto& request = *ev->Get();
4149
TRequest localRequest{
4250
.Path = request.Path,
43-
.RequestId = {},
51+
.RequestId = TGUID::Create(),
4452
.Requester = ev->Sender,
53+
.MetadataRequest = false,
4554
};
46-
CreateGuid(&localRequest.RequestId);
4755

4856
switch (Format_) {
4957
case EFileFormat::CsvWithNames:
5058
case EFileFormat::TsvWithNames: {
51-
HandleAsPrefixFile(std::move(localRequest), ctx);
59+
RequestPartialFile(std::move(localRequest), ctx, 0, 10_MB);
60+
break;
61+
}
62+
case EFileFormat::Parquet: {
63+
localRequest.MetadataRequest = true;
64+
RequestPartialFile(std::move(localRequest), ctx, request.Size - 8, request.Size - 4);
5265
break;
5366
}
5467
default: {
@@ -67,6 +80,15 @@ class TArrowFileFetcher : public NActors::TActorBootstrapped<TArrowFileFetcher>
6780

6881
const auto& request = requestIt->second;
6982

83+
TString data = std::move(response.Data);
84+
if (DecompressionFormat_) {
85+
auto decompressedData = DecompressFile(data, request, ctx);
86+
if (!decompressedData) {
87+
return;
88+
}
89+
data = std::move(*decompressedData);
90+
}
91+
7092
std::shared_ptr<arrow::io::RandomAccessFile> file;
7193
switch (Format_) {
7294
case EFileFormat::CsvWithNames:
@@ -76,7 +98,16 @@ class TArrowFileFetcher : public NActors::TActorBootstrapped<TArrowFileFetcher>
7698
if (Format_ == EFileFormat::TsvWithNames) {
7799
options.delimiter = '\t';
78100
}
79-
file = CleanupCsvFile(response.Data, request, options, ctx);
101+
file = CleanupCsvFile(data, request, options, ctx);
102+
ctx.Send(request.Requester, new TEvArrowFile(std::move(file), request.Path));
103+
break;
104+
}
105+
case EFileFormat::Parquet: {
106+
if (request.MetadataRequest) {
107+
HandleMetadataSizeRequest(data, request, ctx);
108+
return;
109+
}
110+
file = BuildParquetFileFromMetadata(data, request, ctx);
80111
ctx.Send(request.Requester, new TEvArrowFile(std::move(file), request.Path));
81112
break;
82113
}
@@ -104,14 +135,15 @@ class TArrowFileFetcher : public NActors::TActorBootstrapped<TArrowFileFetcher>
104135
uint64_t From = 0;
105136
uint64_t To = 0;
106137
NActors::TActorId Requester;
138+
bool MetadataRequest;
107139
};
108140

109141
// Reading file
110142

111-
void HandleAsPrefixFile(TRequest&& insertedRequest, const NActors::TActorContext& ctx) {
143+
void RequestPartialFile(TRequest&& insertedRequest, const NActors::TActorContext& ctx, uint64_t from, uint64_t to) {
112144
auto path = insertedRequest.Path;
113-
insertedRequest.From = 0;
114-
insertedRequest.To = 10_MB;
145+
insertedRequest.From = from;
146+
insertedRequest.To = to;
115147
auto it = InflightRequests_.try_emplace(path, std::move(insertedRequest));
116148
Y_ABORT_UNLESS(it.second, "couldn't insert request for path: %s", insertedRequest.RequestId.AsGuidString().c_str());
117149

@@ -135,6 +167,43 @@ class TArrowFileFetcher : public NActors::TActorBootstrapped<TArrowFileFetcher>
135167

136168
// Cutting file
137169

170+
// Decompresses `data` using the codec named by DecompressionFormat_.
//
// Precondition: DecompressionFormat_ is set (it is dereferenced unconditionally).
//
// Returns the decompressed bytes, truncated to at most 10 MB: the loop stops as soon as
// that many bytes were produced, even if the compressed stream has more data.
// Returns an empty TMaybe after reporting an error for `request.Path` through SendError when
//   * MakeDecompressor yields no decompressor for the configured codec name, or
//   * an exception is thrown while decompressing.
TMaybe<TString> DecompressFile(const TString& data, const TRequest& request, const NActors::TActorContext& ctx) {
    // Upper bound for the decompressed output kept in memory.
    constexpr auto maxDecompressedSize = 10_MB;

    try {
        NDB::ReadBufferFromString dataBuffer(data);
        auto decompressorBuffer = NYql::MakeDecompressor(dataBuffer, *DecompressionFormat_);
        if (!decompressorBuffer) {
            auto error = MakeError(
                request.Path,
                NFq::TIssuesIds::INTERNAL_ERROR,
                TStringBuilder{} << "unknown compression: " << *DecompressionFormat_ << ". Use one of: gzip, zstd, lz4, brotli, bzip2, xz"
            );
            SendError(ctx, error);
            return {};
        }

        TStringBuilder decompressedData;
        while (!decompressorBuffer->eof() && decompressedData.size() < maxDecompressedSize) {
            decompressorBuffer->nextIfAtEnd();
            // Never take more than what is still allowed by the output limit.
            size_t maxDecompressedChunkSize = std::min(
                decompressorBuffer->available(),
                maxDecompressedSize - decompressedData.size()
            );
            TString decompressedChunk{maxDecompressedChunkSize, ' '};
            decompressorBuffer->read(&decompressedChunk.front(), maxDecompressedChunkSize);
            decompressedData << decompressedChunk;
        }
        // Explicit move: the local is a TStringBuilder while the return type is TMaybe<TString>.
        return std::move(decompressedData);
    } catch (const std::exception& error) {
        // Catch the common base rather than yexception only: an exception of any other
        // std::exception-derived type would otherwise leave the handler unhandled instead of
        // being reported to the requester. yexception is still covered (it derives from std::exception).
        auto errorEv = MakeError(
            request.Path,
            NFq::TIssuesIds::INTERNAL_ERROR,
            TStringBuilder{} << "couldn't decompress file, check compression params: " << error.what()
        );
        SendError(ctx, errorEv);
        return {};
    }
}
206+
138207
std::shared_ptr<arrow::io::RandomAccessFile> CleanupCsvFile(const TString& data, const TRequest& request, const arrow::csv::ParseOptions& options, const NActors::TActorContext& ctx) {
139208
auto chunker = arrow::csv::MakeChunker(options);
140209
std::shared_ptr<arrow::Buffer> whole, partial;
@@ -170,6 +239,58 @@ class TArrowFileFetcher : public NActors::TActorBootstrapped<TArrowFileFetcher>
170239
return std::make_shared<arrow::io::BufferReader>(std::move(whole));
171240
}
172241

242+
// Handles the reply to a parquet request flagged as MetadataRequest: `data` is expected to
// start with the 4-byte little-endian size of the file metadata.
//
// On success the in-flight entry for the path is replaced by a new request (fresh RequestId,
// MetadataRequest = false) covering [request.From - metadataSize, request.To + 4].
// An error is reported through SendError, and no further request is issued, when
//   * `data` is too short to contain the size field,
//   * the declared metadata size exceeds 10 MB,
//   * the declared metadata size is larger than the offset it is subtracted from.
//
// `request` is deliberately taken by value: the entry for request.Path is erased from
// InflightRequests_ below, and the caller presumably passes a reference to that very entry.
void HandleMetadataSizeRequest(const TString& data, TRequest request, const NActors::TActorContext& ctx) {
    // Guard the unaligned 4-byte read below against a short (truncated or empty) reply.
    if (data.size() < sizeof(uint32_t)) {
        auto error = MakeError(
            request.Path,
            NFq::TIssuesIds::INTERNAL_ERROR,
            TStringBuilder{} << "couldn't load parquet metadata, not enough data to read metadata size : " << data.size()
        );
        SendError(ctx, error);
        return;
    }

    uint32_t metadataSize = arrow::BitUtil::FromLittleEndian<uint32_t>(ReadUnaligned<uint32_t>(data.data()));

    if (metadataSize > 10_MB) {
        auto error = MakeError(
            request.Path,
            NFq::TIssuesIds::INTERNAL_ERROR,
            TStringBuilder{} << "couldn't load parquet metadata, size is bigger than 10MB : " << metadataSize
        );
        SendError(ctx, error);
        return;
    }

    // request.From is unsigned: a size larger than the offset would wrap around and
    // produce a bogus range for the follow-up request.
    if (metadataSize > request.From) {
        auto error = MakeError(
            request.Path,
            NFq::TIssuesIds::INTERNAL_ERROR,
            TStringBuilder{} << "couldn't load parquet metadata, size is bigger than the preceding part of the file : " << metadataSize
        );
        SendError(ctx, error);
        return;
    }

    // RequestPartialFile requires that no request for this path is registered yet.
    InflightRequests_.erase(request.Path);

    TRequest localRequest{
        .Path = request.Path,
        .RequestId = TGUID::Create(),
        .Requester = request.Requester,
        .MetadataRequest = false,
    };
    RequestPartialFile(std::move(localRequest), ctx, request.From - metadataSize, request.To + 4);
}
265+
266+
// Copies `data` into an arrow buffer and exposes it as an arrow::io::BufferReader.
//
// Returns nullptr after reporting an error for `request.Path` through SendError if either
// appending the bytes to the builder or finishing the buffer fails.
std::shared_ptr<arrow::io::RandomAccessFile> BuildParquetFileFromMetadata(const TString& data, const TRequest& request, const NActors::TActorContext& ctx) {
    // Shared failure path: send the error built from `prefix` + status text, yield no file.
    const auto fail = [&](const char* prefix, const arrow::Status& status) -> std::shared_ptr<arrow::io::RandomAccessFile> {
        auto error = MakeError(
            request.Path,
            NFq::TIssuesIds::INTERNAL_ERROR,
            TStringBuilder{} << prefix << status.ToString()
        );
        SendError(ctx, error);
        return nullptr;
    };

    auto contents = std::make_shared<arrow::Buffer>(nullptr, 0);
    arrow::BufferBuilder bufferBuilder;

    const auto appendStatus = bufferBuilder.Append(data.data(), data.size());
    if (!appendStatus.ok()) {
        return fail("couldn't read data from S3Fetcher: ", appendStatus);
    }

    const auto finishStatus = bufferBuilder.Finish(&contents);
    if (!finishStatus.ok()) {
        return fail("couldn't copy data from S3Fetcher: ", finishStatus);
    }

    return std::make_shared<arrow::io::BufferReader>(std::move(contents));
}
293+
173294
// Utility
174295
void SendError(const NActors::TActorContext& ctx, TEvFileError* error) {
175296
auto requestIt = InflightRequests_.find(error->Path);
@@ -183,10 +304,11 @@ class TArrowFileFetcher : public NActors::TActorBootstrapped<TArrowFileFetcher>
183304
// Fields
184305
NActors::TActorId S3FetcherId_;
185306
EFileFormat Format_;
307+
TMaybe<TString> DecompressionFormat_;
186308
std::unordered_map<TString, TRequest> InflightRequests_; // Path -> Request
187309
};
188310

189-
NActors::IActor* CreateArrowFetchingActor(NActors::TActorId s3FetcherId, EFileFormat format) {
190-
return new TArrowFileFetcher{s3FetcherId, format};
311+
NActors::IActor* CreateArrowFetchingActor(NActors::TActorId s3FetcherId, EFileFormat format, const THashMap<TString, TString>& params) {
312+
return new TArrowFileFetcher{s3FetcherId, format, params};
191313
}
192314
} // namespace NKikimr::NExternalSource::NObjectStorage::NInference

ydb/core/external_sources/object_storage/inference/arrow_fetcher.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@
55

66
namespace NKikimr::NExternalSource::NObjectStorage::NInference {
77

8-
NActors::IActor* CreateArrowFetchingActor(NActors::TActorId s3FetcherId, EFileFormat format);
8+
NActors::IActor* CreateArrowFetchingActor(NActors::TActorId s3FetcherId, EFileFormat format, const THashMap<TString, TString>& params);
99
} // namespace NKikimr::NExternalSource::NObjectStorage::NInference

ydb/core/external_sources/object_storage/inference/arrow_inferencinator.cpp

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,22 @@
33
#include <arrow/table.h>
44
#include <arrow/csv/options.h>
55
#include <arrow/csv/reader.h>
6+
#include <parquet/arrow/reader.h>
67

78
#include <ydb/core/external_sources/object_storage/events.h>
89
#include <ydb/library/actors/core/actor_bootstrapped.h>
910
#include <ydb/library/actors/core/hfunc.h>
11+
#include <ydb/library/actors/core/log.h>
1012
#include <ydb/public/api/protos/ydb_value.pb.h>
1113

14+
// Logging shortcuts for the OBJECT_STORAGE_INFERENCINATOR component.
// Every record has the form "<name>: <SelfId()>. <stream>", so the macros may only be used
// where `this->SelfId()` is available.
#define INFERENCINATOR_LOG_IMPL(logMacro, name, stream) \
    logMacro(*NActors::TlsActivationContext, NKikimrServices::OBJECT_STORAGE_INFERENCINATOR, name << ": " << this->SelfId() << ". " << stream)

#define LOG_E(name, stream) INFERENCINATOR_LOG_IMPL(LOG_ERROR_S, name, stream)
#define LOG_I(name, stream) INFERENCINATOR_LOG_IMPL(LOG_INFO_S, name, stream)
#define LOG_D(name, stream) INFERENCINATOR_LOG_IMPL(LOG_DEBUG_S, name, stream)
#define LOG_T(name, stream) INFERENCINATOR_LOG_IMPL(LOG_TRACE_S, name, stream)
1222

1323
namespace NKikimr::NExternalSource::NObjectStorage::NInference {
1424

@@ -202,12 +212,37 @@ std::variant<ArrowFields, TString> InferCsvTypes(std::shared_ptr<arrow::io::Rand
202212
return table->fields();
203213
}
204214

215+
// Reads the schema of a parquet file and returns its arrow fields.
//
// `file` is handed over to parquet::arrow::FileReaderBuilder. The function returns
//   * the fields of the schema reported by the reader on success;
//   * an error description otherwise, which carries the text of the status of the step
//     that actually failed (opening the file, building the reader or reading the schema).
std::variant<ArrowFields, TString> InferParquetTypes(std::shared_ptr<arrow::io::RandomAccessFile> file) {
    parquet::arrow::FileReaderBuilder builder;
    // The argument is the reader's use_threads switch (see the Arrow API reference).
    builder.properties(parquet::ArrowReaderProperties(false));
    auto openStatus = builder.Open(std::move(file));
    if (!openStatus.ok()) {
        return TStringBuilder{} << "couldn't parse parquet file, check format params: " << openStatus.ToString();
    }

    std::unique_ptr<parquet::arrow::FileReader> reader;
    auto readerStatus = builder.Build(&reader);
    if (!readerStatus.ok()) {
        // openStatus is OK here; the useful diagnostics are in readerStatus.
        return TStringBuilder{} << "couldn't parse parquet file, check format params: " << readerStatus.ToString();
    }

    std::shared_ptr<arrow::Schema> schema;
    auto schemaRes = reader->GetSchema(&schema);
    if (!schemaRes.ok()) {
        // Same as above: report the status of the step that failed.
        return TStringBuilder{} << "couldn't parse parquet file, check format params: " << schemaRes.ToString();
    }

    return schema->fields();
}
237+
205238
std::variant<ArrowFields, TString> InferType(EFileFormat format, std::shared_ptr<arrow::io::RandomAccessFile> file, const FormatConfig& config) {
206239
switch (format) {
207240
case EFileFormat::CsvWithNames:
208241
return InferCsvTypes(std::move(file), static_cast<const CsvConfig&>(config));
209242
case EFileFormat::TsvWithNames:
210243
return InferCsvTypes(std::move(file), static_cast<const TsvConfig&>(config));
244+
case EFileFormat::Parquet:
245+
return InferParquetTypes(std::move(file));
211246
case EFileFormat::Undefined:
212247
default:
213248
return std::variant<ArrowFields, TString>{std::in_place_type_t<TString>{}, TStringBuilder{} << "unexpected format: " << ConvertFileFormat(format)};
@@ -240,7 +275,10 @@ std::unique_ptr<FormatConfig> MakeFormatConfig(EFileFormat format, const THashMa
240275

241276
class TArrowInferencinator : public NActors::TActorBootstrapped<TArrowInferencinator> {
242277
public:
243-
TArrowInferencinator(NActors::TActorId arrowFetcher, EFileFormat format, const THashMap<TString, TString>& params)
278+
TArrowInferencinator(
279+
NActors::TActorId arrowFetcher,
280+
EFileFormat format,
281+
const THashMap<TString, TString>& params)
244282
: Format_{format}
245283
, Config_{MakeFormatConfig(Format_, params)}
246284
, ArrowFetcherId_{arrowFetcher}
@@ -270,7 +308,6 @@ class TArrowInferencinator : public NActors::TActorBootstrapped<TArrowInferencin
270308
ctx.Send(RequesterId_, MakeErrorSchema(file.Path, NFq::TIssuesIds::INTERNAL_ERROR, std::get<TString>(mbArrowFields)));
271309
return;
272310
}
273-
274311
auto& arrowFields = std::get<ArrowFields>(mbArrowFields);
275312
std::vector<Ydb::Column> ydbFields;
276313
for (const auto& field : arrowFields) {
@@ -286,7 +323,7 @@ class TArrowInferencinator : public NActors::TActorBootstrapped<TArrowInferencin
286323
}
287324

288325
void HandleFileError(TEvFileError::TPtr& ev, const NActors::TActorContext& ctx) {
289-
Cout << "TArrowInferencinator::HandleFileError" << Endl;
326+
LOG_D("TArrowInferencinator", "HandleFileError: " << ev->Get()->Issues.ToOneLineString());
290327
ctx.Send(RequesterId_, new TEvInferredFileSchema(ev->Get()->Path, std::move(ev->Get()->Issues)));
291328
}
292329

@@ -297,7 +334,11 @@ class TArrowInferencinator : public NActors::TActorBootstrapped<TArrowInferencin
297334
NActors::TActorId RequesterId_;
298335
};
299336

300-
NActors::IActor* CreateArrowInferencinator(NActors::TActorId arrowFetcher, EFileFormat format, const THashMap<TString, TString>& params) {
337+
NActors::IActor* CreateArrowInferencinator(
338+
NActors::TActorId arrowFetcher,
339+
EFileFormat format,
340+
const THashMap<TString, TString>& params) {
341+
301342
return new TArrowInferencinator{arrowFetcher, format, params};
302343
}
303344
} // namespace NKikimr::NExternalSource::NObjectStorage::NInference

0 commit comments

Comments
 (0)