Skip to content

Commit fe5e16f

Browse files
authored
YQ-3405 fixed endless retries for external error (ydb-platform#6722)
1 parent 4968dab commit fe5e16f

File tree

5 files changed

+24
-7
lines changed

5 files changed

+24
-7
lines changed

ydb/core/fq/libs/config/protos/control_plane_storage.proto

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,15 @@ message TQueryMapping {
2424

2525
// 1. StatusCode(s) are handled with defined policies, non-unique StatusCode(s) across all policies is UB
2626
// 2. RetryCount and RetryPeriod are used to calculate the actual RetryRate; if it reaches RetryCount, the query is aborted
27+
// - Because of how RetryRate is calculated, the number of retries during RetryPeriod is less than 2 * RetryCount
2728
// 3. BackoffPeriod is multiplied by RetryRate to delay query execution before the next retry
28-
// 4. There are no default retry policy, all unhandled statuses are fatal
29+
// 4. RetryLimit is a hard limit on the total number of query retries; once it is reached, the query is aborted
30+
// - If RetryLimit = 0, the query can be aborted only by RetryRate
31+
// 5. There is no default retry policy; all unhandled statuses are fatal
2932

3033
message TRetryPolicy {
3134
uint64 RetryCount = 1;
35+
uint64 RetryLimit = 4;
3236
string RetryPeriod = 2;
3337
string BackoffPeriod = 3;
3438
}

ydb/core/fq/libs/control_plane_storage/config.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,11 @@ TControlPlaneStorageConfig::TControlPlaneStorageConfig(const NConfig::TControlPl
5050
for (const auto& mapping : Proto.GetRetryPolicyMapping()) {
5151
auto& retryPolicy = mapping.GetPolicy();
5252
auto retryCount = retryPolicy.GetRetryCount();
53+
auto retryLimit = retryPolicy.GetRetryLimit();
5354
auto retryPeriod = GetDuration(retryPolicy.GetRetryPeriod(), TDuration::Hours(1));
5455
auto backoffPeriod = GetDuration(retryPolicy.GetBackoffPeriod(), TDuration::Zero());
5556
for (const auto statusCode: mapping.GetStatusCode()) {
56-
RetryPolicies.emplace(statusCode, TRetryPolicyItem(retryCount, retryPeriod, backoffPeriod));
57+
RetryPolicies.emplace(statusCode, TRetryPolicyItem(retryCount, retryLimit, retryPeriod, backoffPeriod));
5758
}
5859
}
5960

ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ TPingTaskParams ConstructHardPingTask(
156156
internal.clear_operation_id();
157157
}
158158

159-
TRetryPolicyItem policy(0, TDuration::Seconds(1), TDuration::Zero());
159+
TRetryPolicyItem policy(0, 0, TDuration::Seconds(1), TDuration::Zero());
160160
auto it = retryPolicies.find(request.status_code());
161161
auto policyFound = it != retryPolicies.end();
162162
if (policyFound) {
@@ -183,7 +183,7 @@ TPingTaskParams ConstructHardPingTask(
183183
TStringBuilder builder;
184184
builder << "Query failed with code " << NYql::NDqProto::StatusIds_StatusCode_Name(request.status_code());
185185
if (policy.RetryCount) {
186-
builder << " (failure rate " << retryLimiter.RetryRate << " exceeds limit of " << policy.RetryCount << ")";
186+
builder << " (" << retryLimiter.LastError << ")";
187187
}
188188
builder << " at " << Now();
189189

ydb/core/fq/libs/control_plane_storage/util.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,16 @@ bool TRetryLimiter::UpdateOnRetry(const TInstant& lastSeenAt, const TRetryPolicy
2828
RetryRate = 0.0;
2929
}
3030
}
31-
bool shouldRetry = RetryRate < policy.RetryCount;
31+
32+
bool shouldRetry = true;
33+
if (RetryRate >= policy.RetryCount) {
34+
shouldRetry = false;
35+
LastError = TStringBuilder() << "failure rate " << RetryRate << " exceeds limit of " << policy.RetryCount;
36+
} else if (policy.RetryLimit && RetryCount >= policy.RetryLimit) {
37+
shouldRetry = false;
38+
LastError = TStringBuilder() << "retry count reached limit of " << policy.RetryLimit;
39+
}
40+
3241
if (shouldRetry) {
3342
RetryCount++;
3443
RetryCounterUpdatedAt = now;
@@ -140,6 +149,7 @@ NConfig::TControlPlaneStorageConfig FillDefaultParameters(NConfig::TControlPlane
140149
policyMapping.AddStatusCode(NYql::NDqProto::StatusIds::EXTERNAL_ERROR);
141150
auto& policy = *policyMapping.MutablePolicy();
142151
policy.SetRetryCount(10);
152+
policy.SetRetryLimit(40);
143153
policy.SetRetryPeriod("1m");
144154
policy.SetBackoffPeriod("1s");
145155
}

ydb/core/fq/libs/control_plane_storage/util.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,11 @@ namespace NFq {
1515
class TRetryPolicyItem {
1616
public:
1717
TRetryPolicyItem() = default;
18-
TRetryPolicyItem(ui64 retryCount, const TDuration& retryPeriod, const TDuration& backoffPeriod)
19-
: RetryCount(retryCount), RetryPeriod(retryPeriod), BackoffPeriod(backoffPeriod)
18+
TRetryPolicyItem(ui64 retryCount, ui64 retryLimit, const TDuration& retryPeriod, const TDuration& backoffPeriod)
19+
: RetryCount(retryCount), RetryLimit(retryLimit), RetryPeriod(retryPeriod), BackoffPeriod(backoffPeriod)
2020
{ }
2121
ui64 RetryCount = 0;
22+
ui64 RetryLimit = 0;
2223
TDuration RetryPeriod = TDuration::Zero();
2324
TDuration BackoffPeriod = TDuration::Zero();
2425
};
@@ -32,6 +33,7 @@ class TRetryLimiter {
3233
ui64 RetryCount = 0;
3334
TInstant RetryCounterUpdatedAt = TInstant::Zero();
3435
double RetryRate = 0.0;
36+
TString LastError;
3537
};
3638

3739
bool IsTerminalStatus(FederatedQuery::QueryMeta::ComputeStatus status);

0 commit comments

Comments
 (0)