Skip to content

Commit 4a26872

Browse files
more conservative retry policy for solomon read actor (#20439)
1 parent 344960e commit 4a26872

File tree

7 files changed

+152
-89
lines changed

7 files changed

+152
-89
lines changed

ydb/library/yql/providers/solomon/actors/dq_solomon_read_actor.cpp

Lines changed: 107 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#include "dq_solomon_read_actor.h"
22
#include "dq_solomon_actors_util.h"
33

4+
#include <library/cpp/json/json_reader.h>
45
#include <library/cpp/protobuf/util/pb_io.h>
6+
#include <library/cpp/retry/retry.h>
57

68
#include <util/string/join.h>
79
#include <ydb/library/yql/dq/actors/common/retry_queue.h>
@@ -34,7 +36,6 @@
3436
#include <ydb/library/actors/core/hfunc.h>
3537
#include <ydb/library/actors/core/log.h>
3638
#include <ydb/library/actors/http/http_proxy.h>
37-
#include <library/cpp/json/json_reader.h>
3839

3940

4041
#include <util/generic/algorithm.h>
@@ -67,13 +68,6 @@ using namespace NKikimr::NMiniKQL;
6768
namespace {
6869

6970
class TDqSolomonReadActor : public NActors::TActorBootstrapped<TDqSolomonReadActor>, public IDqComputeActorAsyncInput {
70-
private:
71-
struct TMetricTimeRange {
72-
NSo::TMetric Metric;
73-
TInstant From;
74-
TInstant To;
75-
};
76-
7771
public:
7872
static constexpr char ActorName[] = "DQ_SOLOMON_READ_ACTOR";
7973

@@ -109,6 +103,19 @@ class TDqSolomonReadActor : public NActors::TActorBootstrapped<TDqSolomonReadAct
109103
SOURCE_LOG_D("Init");
110104
IngressStats.Level = statsLevel;
111105

106+
RetryPolicy = IRetryPolicy<NSo::TGetDataResponse>::GetExponentialBackoffPolicy(
107+
[](const NSo::TGetDataResponse& response) {
108+
if (response.Status == NSo::EStatus::STATUS_RETRIABLE_ERROR) {
109+
return ERetryErrorClass::ShortRetry;
110+
}
111+
return ERetryErrorClass::NoRetry;
112+
},
113+
TDuration::MilliSeconds(25),
114+
TDuration::MilliSeconds(200),
115+
TDuration::MilliSeconds(500),
116+
5
117+
);
118+
112119
UseMetricsQueue = !ReadParams.Source.HasProgram();
113120

114121
auto stringType = ProgramBuilder.NewDataType(NYql::NUdf::TDataType<char*>::Id);
@@ -140,6 +147,15 @@ class TDqSolomonReadActor : public NActors::TActorBootstrapped<TDqSolomonReadAct
140147
RequestMetrics();
141148
} else {
142149
Become(&TDqSolomonReadActor::LimitedModeState);
150+
151+
TMetricTimeRange metric {
152+
{},
153+
ReadParams.Source.GetProgram(),
154+
TInstant::Seconds(ReadParams.Source.GetFrom()),
155+
TInstant::Seconds(ReadParams.Source.GetTo())
156+
};
157+
158+
MetricsWithTimeRange.push_back(metric);
143159
RequestData();
144160
}
145161
}
@@ -149,6 +165,7 @@ class TDqSolomonReadActor : public NActors::TActorBootstrapped<TDqSolomonReadAct
149165
hFunc(TEvSolomonProvider::TEvMetricsReadError, HandleMetricsReadError);
150166
hFunc(TEvSolomonProvider::TEvPointsCountBatch, HandlePointsCountBatch);
151167
hFunc(TEvSolomonProvider::TEvNewDataBatch, HandleNewDataBatch);
168+
hFunc(TEvSolomonProvider::TEvRetryDataRequest, HandleRetryDataRequest);
152169
hFunc(TEvSolomonProvider::TEvAck, Handle);
153170
hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvRetry, Handle);
154171
hFunc(NActors::TEvInterconnect::TEvNodeDisconnected, Handle);
@@ -222,35 +239,45 @@ class TDqSolomonReadActor : public NActors::TActorBootstrapped<TDqSolomonReadAct
222239
auto& metric = batch.Metric;
223240
auto& pointsCount = batch.Response.Result.PointsCount;
224241
ParsePointsCount(metric, pointsCount);
242+
CompletedMetricsCount++;
225243

226244
TryRequestData();
227245
}
228246

229247
void HandleNewDataBatch(TEvSolomonProvider::TEvNewDataBatch::TPtr& newDataBatch) {
230-
auto& batch = *newDataBatch->Get();
231-
232-
if (batch.Response.Status == NSo::EStatus::STATUS_FATAL_ERROR) {
233-
TIssues issues { TIssue(batch.Response.Error) };
234-
SOURCE_LOG_W("Got " << "error data response[" << newDataBatch->Cookie << "] from solomon: " << issues.ToOneLineString());
235-
Send(ComputeActorId, new TEvAsyncInputError(InputIndex, issues, NYql::NDqProto::StatusIds::EXTERNAL_ERROR));
248+
if (!SaveDataBatch(newDataBatch)) {
236249
return;
237250
}
238-
if (batch.Response.Status == NSo::EStatus::STATUS_RETRIABLE_ERROR) {
239-
MetricsWithTimeRange.emplace_back(batch.Metric, batch.From, batch.To);
240-
TryRequestData();
241-
return;
242-
}
243-
244-
MetricsData.insert(MetricsData.end(), batch.Response.Result.Timeseries.begin(), batch.Response.Result.Timeseries.end());
245-
CompletedMetricsCount++;
246251

247252
if (!MetricsWithTimeRange.empty()) {
248253
TryRequestData();
249-
} else if (MetricsData.size() >= ComputeActorBatchSize || LastMetricProcessed()) {
254+
}
255+
if (MetricsData.size() >= ComputeActorBatchSize || LastMetricProcessed()) {
250256
NotifyComputeActorWithData();
251257
}
252258
}
253259

260+
// Re-issues a data request that was previously postponed by the retry policy.
// The request carried by the retry event is sent to Solomon again; the response
// is delivered back to this actor as TEvNewDataBatch together with the request.
void HandleRetryDataRequest(TEvSolomonProvider::TEvRetryDataRequest::TPtr& retryDataRequest) {
    // Take ownership of the request stored in the event.
    TMetricTimeRange retried = std::move(retryDataRequest->Get()->Request);

    // Selector-based requests are used in metrics-queue mode, program-based otherwise.
    const auto sendToSolomon = [this](const TMetricTimeRange& range) -> NThreading::TFuture<NSo::TGetDataResponse> {
        if (UseMetricsQueue) {
            return SolomonClient->GetData(range.Selectors, range.From, range.To);
        }
        return SolomonClient->GetData(range.Program, range.From, range.To);
    };

    NThreading::TFuture<NSo::TGetDataResponse> pendingResponse = sendToSolomon(retried);

    auto* actorSystem = TActivationContext::ActorSystem();
    const auto selfId = SelfId();

    // The callback owns the request, so it can be handed over to the resulting event.
    pendingResponse.Subscribe(
        [request = std::move(retried), actorSystem, selfId](NThreading::TFuture<NSo::TGetDataResponse> response) mutable -> void {
            auto* dataBatch = new TEvSolomonProvider::TEvNewDataBatch(response.ExtractValue(), std::move(request));
            actorSystem->Send(selfId, dataBatch);
        });
}
280+
254281
void Handle(TEvSolomonProvider::TEvAck::TPtr& ev) {
255282
MetricsQueueEvents.OnEventReceived(ev);
256283
}
@@ -279,18 +306,10 @@ class TDqSolomonReadActor : public NActors::TActorBootstrapped<TDqSolomonReadAct
279306
}
280307

281308
void HandleNewDataBatchLimited(TEvSolomonProvider::TEvNewDataBatch::TPtr& newDataBatch) {
282-
auto& batch = *newDataBatch->Get();
283-
284-
if (batch.Response.Status != NSo::EStatus::STATUS_OK) {
285-
TIssues issues { TIssue(batch.Response.Error) };
286-
SOURCE_LOG_W("Got " << "error data response[" << newDataBatch->Cookie << "] from solomon: " << issues.ToOneLineString());
287-
Send(ComputeActorId, new TEvAsyncInputError(InputIndex, issues, NYql::NDqProto::StatusIds::EXTERNAL_ERROR));
309+
if (!SaveDataBatch(newDataBatch)) {
288310
return;
289311
}
290312

291-
MetricsData.insert(MetricsData.end(), batch.Response.Result.Timeseries.begin(), batch.Response.Result.Timeseries.end());
292-
CompletedMetricsCount++;
293-
294313
NotifyComputeActorWithData();
295314
}
296315

@@ -384,13 +403,13 @@ class TDqSolomonReadActor : public NActors::TActorBootstrapped<TDqSolomonReadAct
384403

385404
bool LastMetricProcessed() const {
386405
if (UseMetricsQueue) {
387-
return IsMetricsQueueEmpty && CompletedMetricsCount == ListedMetricsCount;
406+
return IsMetricsQueueEmpty && CompletedMetricsCount == ListedMetricsCount && CompletedTimeRanges == ListedTimeRanges;
388407
}
389-
return CompletedMetricsCount == 1;
408+
return CompletedTimeRanges == 1;
390409
}
391410

392411
void TryRequestMetrics() {
393-
if (ListedMetrics.size() < 1000 && !IsMetricsQueueEmpty && !IsWaitingMetricsQueueResponse) {
412+
if (ListedMetrics.empty() && !IsMetricsQueueEmpty && !IsWaitingMetricsQueueResponse) {
394413
RequestMetrics();
395414
}
396415
}
@@ -436,38 +455,27 @@ class TDqSolomonReadActor : public NActors::TActorBootstrapped<TDqSolomonReadAct
436455
}
437456

438457
void RequestData() {
439-
NThreading::TFuture<NSo::TGetDataResponse> getDataFuture;
440-
NSo::TMetric metric;
441-
TInstant from;
442-
TInstant to;
458+
YQL_ENSURE(RetryPolicy);
459+
NThreading::TFuture<NSo::TGetDataResponse> dataRequestFuture;
443460

444-
if (UseMetricsQueue) {
445-
auto request = MetricsWithTimeRange.back();
446-
MetricsWithTimeRange.pop_back();
447-
448-
metric = request.Metric;
449-
from = request.From;
450-
to = request.To;
461+
auto request = MetricsWithTimeRange.back();
462+
MetricsWithTimeRange.pop_back();
451463

452-
getDataFuture = SolomonClient->GetData(metric.Labels, from, to);
464+
if (UseMetricsQueue) {
465+
dataRequestFuture = SolomonClient->GetData(request.Selectors, request.From, request.To);
453466
} else {
454-
getDataFuture = SolomonClient->GetData(
455-
ReadParams.Source.GetProgram(),
456-
TInstant::Seconds(ReadParams.Source.GetFrom()),
457-
TInstant::Seconds(ReadParams.Source.GetTo())
458-
);
467+
dataRequestFuture = SolomonClient->GetData(request.Program, request.From, request.To);
459468
}
460469

461-
NActors::TActorSystem* actorSystem = NActors::TActivationContext::ActorSystem();
462-
getDataFuture.Subscribe([actorSystem, metric, from, to, selfId = SelfId()](
463-
const NThreading::TFuture<NSo::TGetDataResponse>& response) -> void
470+
PendingDataRequests_[request] = RetryPolicy->CreateRetryState();
471+
472+
dataRequestFuture.Subscribe([request = std::move(request), actorSystem = TActivationContext::ActorSystem(), selfId = SelfId()](
473+
NThreading::TFuture<NSo::TGetDataResponse> response) mutable -> void
464474
{
465475
actorSystem->Send(selfId, new TEvSolomonProvider::TEvNewDataBatch(
466-
metric,
467-
from,
468-
to,
469-
response.GetValue())
470-
);
476+
response.ExtractValue(),
477+
std::move(request)
478+
));
471479
});
472480
}
473481

@@ -477,14 +485,10 @@ class TDqSolomonReadActor : public NActors::TActorBootstrapped<TDqSolomonReadAct
477485

478486
auto ranges = SplitTimeIntervalIntoRanges(from, to, pointsCount);
479487

480-
if (ranges.empty()) {
481-
CompletedMetricsCount++;
482-
return;
483-
}
484-
485488
for (const auto& [fromRange, toRange] : ranges) {
486-
MetricsWithTimeRange.emplace_back(metric, fromRange, toRange);
489+
MetricsWithTimeRange.emplace_back(metric.Labels, "", fromRange, toRange);
487490
}
491+
ListedTimeRanges += ranges.size();
488492
}
489493

490494
std::vector<std::pair<TInstant, TInstant>> SplitTimeIntervalIntoRanges(TInstant from, TInstant to, ui64 pointsCount) const {
@@ -507,6 +511,37 @@ class TDqSolomonReadActor : public NActors::TActorBootstrapped<TDqSolomonReadAct
507511
return result;
508512
}
509513

514+
// Processes a data response received from Solomon.
//
// - On a retriable error, asks the request's retry state for the next delay and,
//   if one is granted, schedules TEvRetryDataRequest and returns false.
// - On any other non-OK status (or when retries are exhausted), reports
//   TEvAsyncInputError to the compute actor and returns false.
// - On success, moves the received timeseries into MetricsData, increments
//   CompletedTimeRanges and returns true.
//
// The retry state of the request is dropped from PendingDataRequests_ as soon as
// the request is finished (successfully or not).
bool SaveDataBatch(TEvSolomonProvider::TEvNewDataBatch::TPtr& newDataBatch) {
    auto& batch = *newDataBatch->Get();

    // Use find() instead of operator[]: operator[] would default-insert a null retry
    // state for a request that is not registered and the dereference would crash.
    const auto pendingIt = PendingDataRequests_.find(batch.Request);
    const bool hasRetryState = pendingIt != PendingDataRequests_.end() && pendingIt->second;

    if (batch.Response.Status == NSo::EStatus::STATUS_RETRIABLE_ERROR && hasRetryState) {
        if (auto delay = pendingIt->second->GetNextRetryDelay(batch.Response)) {
            SOURCE_LOG_D("HandleNewDataBatch: retrying data request, delay: " << delay->MilliSeconds());
            // The map keeps its own copy of the key, so the request stored in the
            // event can be handed over to the retry event without copying.
            Schedule(*delay, new TEvSolomonProvider::TEvRetryDataRequest(std::move(batch.Request)));
            return false;
        }
    }

    // The request is finished: either it succeeded or it will not be retried anymore.
    if (pendingIt != PendingDataRequests_.end()) {
        PendingDataRequests_.erase(pendingIt);
    }

    if (batch.Response.Status != NSo::EStatus::STATUS_OK) {
        TIssues issues { TIssue(batch.Response.Error) };
        SOURCE_LOG_W("Got " << "error data response[" << newDataBatch->Cookie << "] from solomon: " << issues.ToOneLineString());
        Send(ComputeActorId, new TEvAsyncInputError(InputIndex, issues, NYql::NDqProto::StatusIds::EXTERNAL_ERROR));
        return false;
    }

    // The event is consumed here, so the timeseries are moved rather than copied.
    MetricsData.insert(
        MetricsData.end(),
        std::make_move_iterator(batch.Response.Result.Timeseries.begin()),
        std::make_move_iterator(batch.Response.Result.Timeseries.end())
    );
    CompletedTimeRanges++;

    return true;
}
544+
510545
private:
511546
const ui64 InputIndex;
512547
TDqAsyncStats IngressStats;
@@ -518,6 +553,7 @@ class TDqSolomonReadActor : public NActors::TActorBootstrapped<TDqSolomonReadAct
518553
const TDqSolomonReadParams ReadParams;
519554
const ui64 ComputeActorBatchSize;
520555
const ui64 MetricsQueueConsumersCountDelta;
556+
IRetryPolicy<NSo::TGetDataResponse>::TPtr RetryPolicy;
521557

522558
bool UseMetricsQueue;
523559
TRetryEventsQueue MetricsQueueEvents;
@@ -526,11 +562,14 @@ class TDqSolomonReadActor : public NActors::TActorBootstrapped<TDqSolomonReadAct
526562
bool IsMetricsQueueEmpty = false;
527563
bool IsConfirmedMetricsQueueFinish = false;
528564

565+
std::map<TMetricTimeRange, IRetryPolicy<NSo::TGetDataResponse>::IRetryState::TPtr> PendingDataRequests_;
529566
std::deque<NSo::TMetric> ListedMetrics;
530567
std::deque<TMetricTimeRange> MetricsWithTimeRange;
531568
std::deque<NSo::TTimeseries> MetricsData;
532-
size_t ListedMetricsCount = 0;
533-
size_t CompletedMetricsCount = 0;
569+
ui64 ListedMetricsCount = 0;
570+
ui64 CompletedMetricsCount = 0;
571+
ui64 ListedTimeRanges = 0;
572+
ui64 CompletedTimeRanges = 0;
534573
const ui64 MaxPointsPerOneRequest = 10000;
535574

536575
TString SourceId;
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,10 @@
11
#include "events.h"
2+
3+
namespace NYql::NDq {
4+
5+
// Strict weak ordering for TMetricTimeRange: lexicographic comparison of
// (Selectors, Program, From, To). Lets the type be used as a std::map key.
bool operator<(const TMetricTimeRange& a, const TMetricTimeRange& b) {
    // Builds a tuple of references to the compared fields; nothing is copied.
    const auto asKey = [](const TMetricTimeRange& range) {
        return std::tie(range.Selectors, range.Program, range.From, range.To);
    };
    return asKey(a) < asKey(b);
}
9+
10+
} // namespace NYql::NDq

ydb/library/yql/providers/solomon/events/events.h

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,19 @@
44
#include <ydb/library/yql/providers/solomon/proto/metrics_queue.pb.h>
55
#include <ydb/library/yql/providers/solomon/solomon_accessor/client/solomon_accessor_client.h>
66

7+
#include <library/cpp/retry/retry_policy.h>
8+
79
namespace NYql::NDq {
810

11+
struct TMetricTimeRange {
12+
std::map<TString, TString> Selectors;
13+
TString Program;
14+
TInstant From;
15+
TInstant To;
16+
};
17+
18+
bool operator<(const TMetricTimeRange& a, const TMetricTimeRange& b);
19+
920
struct TEvSolomonProvider {
1021

1122
enum EEv : ui32 {
@@ -21,6 +32,7 @@ struct TEvSolomonProvider {
2132
// read actor events
2233
EvPointsCountBatch,
2334
EvNewDataBatch,
35+
EvRetryDataRequest,
2436

2537
EvEnd
2638
};
@@ -80,14 +92,18 @@ struct TEvSolomonProvider {
8092
};
8193

8294
struct TEvNewDataBatch: public NActors::TEventLocal<TEvNewDataBatch, EvNewDataBatch> {
83-
NSo::TMetric Metric;
84-
TInstant From, To;
8595
NSo::TGetDataResponse Response;
86-
TEvNewDataBatch(NSo::TMetric metric, TInstant from, TInstant to, const NSo::TGetDataResponse& response)
87-
: Metric(metric)
88-
, From(from)
89-
, To(to)
90-
, Response(response)
96+
TMetricTimeRange Request;
97+
TEvNewDataBatch(NSo::TGetDataResponse&& response, TMetricTimeRange&& request)
98+
: Response(std::move(response))
99+
, Request(std::move(request))
100+
{}
101+
};
102+
103+
// Local event carrying a data request that has to be sent to Solomon again.
struct TEvRetryDataRequest: public NActors::TEventLocal<TEvRetryDataRequest, EvRetryDataRequest> {
    // The request to repeat.
    TMetricTimeRange Request;

    // Takes ownership of the given request.
    explicit TEvRetryDataRequest(TMetricTimeRange&& request)
        : Request(std::move(request))
    {
    }
};
93109
};

ydb/library/yql/providers/solomon/provider/yql_solomon_dq_integration.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -349,13 +349,13 @@ class TSolomonDqIntegration: public TDqIntegrationBase {
349349
auto metricsQueuePrefetchSize = solomonConfig->MetricsQueuePrefetchSize.Get().OrElse(10000);
350350
sourceSettings.insert({"metricsQueuePrefetchSize", ToString(metricsQueuePrefetchSize)});
351351

352-
auto metricsQueueBatchCountLimit = solomonConfig->MetricsQueueBatchCountLimit.Get().OrElse(250);
352+
auto metricsQueueBatchCountLimit = solomonConfig->MetricsQueueBatchCountLimit.Get().OrElse(125);
353353
sourceSettings.insert({"metricsQueueBatchCountLimit", ToString(metricsQueueBatchCountLimit)});
354354

355355
auto solomonClientDefaultReplica = solomonConfig->SolomonClientDefaultReplica.Get().OrElse(defaultReplica);
356356
sourceSettings.insert({"solomonClientDefaultReplica", ToString(solomonClientDefaultReplica)});
357357

358-
auto computeActorBatchSize = solomonConfig->ComputeActorBatchSize.Get().OrElse(1000);
358+
auto computeActorBatchSize = solomonConfig->ComputeActorBatchSize.Get().OrElse(100);
359359
sourceSettings.insert({"computeActorBatchSize", ToString(computeActorBatchSize)});
360360

361361
if (!selectors.empty()) {

0 commit comments

Comments
 (0)