Skip to content

Commit 6bec40a

Browse files
authored
Bootstrapper: don't restart healthy tablets (#9659)
1 parent 87ff4fa commit 6bec40a

File tree

16 files changed

+1183
-417
lines changed

16 files changed

+1183
-417
lines changed

ydb/core/base/statestorage.cpp

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -164,13 +164,14 @@ void TStateStorageInfo::TSelection::MergeReply(EStatus status, EStatus *owner, u
164164
ui32 unknown = 0;
165165
ui32 ok = 0;
166166
ui32 outdated = 0;
167+
ui32 unavailable = 0;
167168

168169
const ui32 majority = Sz / 2 + 1;
169170

170171
ui32 cookie = 0;
171172
for (ui32 i = 0; i < Sz; ++i) {
172173
EStatus &st = Status[i];
173-
if (resetOld && st != StatusUnknown)
174+
if (resetOld && st != StatusUnknown && st != StatusUnavailable)
174175
st = StatusOutdated;
175176

176177
if (cookie == targetCookie)
@@ -190,16 +191,19 @@ void TStateStorageInfo::TSelection::MergeReply(EStatus status, EStatus *owner, u
190191
case StatusOutdated:
191192
++outdated;
192193
break;
194+
case StatusUnavailable:
195+
++unavailable;
196+
break;
193197
}
194198
}
195199

196200
if (owner) {
197201
if (ok >= majority) {
198202
*owner = StatusOk;
199-
} else if (outdated >= majority) {
200-
*owner = StatusOutdated;
201203
} else if (ok + unknown < majority) {
202-
if (outdated)
204+
if (unavailable > (Sz - majority))
205+
*owner = StatusUnavailable;
206+
else if (outdated)
203207
*owner = StatusOutdated;
204208
else
205209
*owner = StatusNoInfo;

ydb/core/base/statestorage.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -470,6 +470,7 @@ struct TStateStorageInfo : public TThrRefBase {
470470
StatusOk,
471471
StatusNoInfo,
472472
StatusOutdated,
473+
StatusUnavailable,
473474
};
474475

475476
ui32 Sz;

ydb/core/base/statestorage_proxy.cpp

Lines changed: 22 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -177,7 +177,7 @@ class TStateStorageProxyRequest : public TActor<TStateStorageProxyRequest> {
177177
Signature[cookie] = Max<ui64>();
178178
++RepliesMerged;
179179

180-
ReplicaSelection->MergeReply(TStateStorageInfo::TSelection::StatusNoInfo, &ReplyStatus, cookie, false);
180+
ReplicaSelection->MergeReply(TStateStorageInfo::TSelection::StatusUnavailable, &ReplyStatus, cookie, false);
181181
}
182182
}
183183

@@ -192,7 +192,8 @@ class TStateStorageProxyRequest : public TActor<TStateStorageProxyRequest> {
192192
++RepliesMerged;
193193
++SignaturesMerged;
194194

195-
if (status == NKikimrProto::OK) {
195+
switch (status) {
196+
case NKikimrProto::OK: {
196197
const ui32 gen = record.GetCurrentGeneration();
197198
const ui32 step = record.GetCurrentStep();
198199
const TActorId leader = ActorIdFromProto(record.GetCurrentLeader());
@@ -221,9 +222,14 @@ class TStateStorageProxyRequest : public TActor<TStateStorageProxyRequest> {
221222

222223
ReplicaSelection->MergeReply(TStateStorageInfo::TSelection::StatusOk, &ReplyStatus, cookie, reset);
223224
}
224-
} else if (status == NKikimrProto::ERROR) {
225+
break;
226+
}
227+
// NOTE: replicas currently reply with ERROR when there is no data for the tablet
228+
case NKikimrProto::ERROR:
229+
case NKikimrProto::NODATA:
225230
ReplicaSelection->MergeReply(TStateStorageInfo::TSelection::StatusNoInfo, &ReplyStatus, cookie, false);
226-
} else {
231+
break;
232+
default:
227233
Y_ABORT();
228234
}
229235

@@ -307,11 +313,14 @@ class TStateStorageProxyRequest : public TActor<TStateStorageProxyRequest> {
307313
ReplyAndDie(NKikimrProto::OK);
308314
return;
309315
case TStateStorageInfo::TSelection::StatusNoInfo:
310-
ReplyAndDie(NKikimrProto::ERROR);
316+
ReplyAndDie(NKikimrProto::NODATA);
311317
return;
312318
case TStateStorageInfo::TSelection::StatusOutdated:
313319
ReplyAndDie(NKikimrProto::RACE);
314320
return;
321+
case TStateStorageInfo::TSelection::StatusUnavailable:
322+
ReplyAndDie(NKikimrProto::ERROR);
323+
return;
315324
}
316325
Y_DEBUG_ABORT_UNLESS(false);
317326
PassAway();
@@ -332,12 +341,15 @@ class TStateStorageProxyRequest : public TActor<TStateStorageProxyRequest> {
332341
return;
333342
case TStateStorageInfo::TSelection::StatusNoInfo:
334343
if (RepliesMerged == Replicas) { // for negative response always waits for full reply set to avoid herding of good replicas by fast retry cycle
335-
ReplyAndSig(NKikimrProto::ERROR);
344+
ReplyAndSig(NKikimrProto::NODATA);
336345
}
337346
return;
338347
case TStateStorageInfo::TSelection::StatusOutdated:
339348
ReplyAndSig(NKikimrProto::RACE);
340349
return;
350+
case TStateStorageInfo::TSelection::StatusUnavailable:
351+
ReplyAndSig(NKikimrProto::ERROR);
352+
return;
341353
}
342354
}
343355
}
@@ -379,6 +391,8 @@ class TStateStorageProxyRequest : public TActor<TStateStorageProxyRequest> {
379391
}
380392
return;
381393
case TStateStorageInfo::TSelection::StatusNoInfo:
394+
case TStateStorageInfo::TSelection::StatusUnavailable:
395+
// Note: StatusNoInfo shouldn't really happen for update queries
382396
ReplyAndDie(NKikimrProto::ERROR);
383397
return;
384398
case TStateStorageInfo::TSelection::StatusOutdated:
@@ -404,7 +418,8 @@ class TStateStorageProxyRequest : public TActor<TStateStorageProxyRequest> {
404418
}
405419
return;
406420
case TStateStorageInfo::TSelection::StatusNoInfo:
407-
// should not happens for update queries
421+
case TStateStorageInfo::TSelection::StatusUnavailable:
422+
// Note: StatusNoInfo shouldn't really happen for update queries
408423
ReplyAndSig(NKikimrProto::ERROR);
409424
return;
410425
case TStateStorageInfo::TSelection::StatusOutdated:

ydb/core/base/statestorage_replica.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -104,6 +104,7 @@ class TStateStorageReplica : public TActorBootstrapped<TStateStorageReplica> {
104104
}
105105
}
106106
} else {
107+
// FIXME: change to NODATA in a future version
107108
msg.Reset(new TEvStateStorage::TEvReplicaInfo(tabletId, NKikimrProto::ERROR));
108109
}
109110
msg->Record.SetCookie(cookie);

0 commit comments

Comments
 (0)