Skip to content

Commit 2414fb7

Browse files
authored
Fix VDisk replication token handling, add some extra checks and log points (#10371)
1 parent 39173fc commit 2414fb7

File tree

2 files changed: +41 -6 lines changed

ydb/core/blobstorage/vdisk/repl/blobstorage_repl.cpp

Lines changed: 24 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -175,6 +175,8 @@ namespace NKikimr {
175175
TEvResumeForce *ResumeForceToken = nullptr;
176176
TInstant ReplicationEndTime;
177177
bool UnrecoveredNonphantomBlobs = false;
178+
bool RequestedReplicationToken = false;
179+
bool HoldingReplicationToken = false;
178180

179181
TWatchdogTimer<TEvReplCheckProgress> ReplProgressWatchdog;
180182

@@ -288,6 +290,12 @@ namespace NKikimr {
288290
case Plan:
289291
// this is a first quantum of replication, so we have to register it in the broker
290292
State = AwaitToken;
293+
Y_DEBUG_ABORT_UNLESS(!RequestedReplicationToken);
294+
if (RequestedReplicationToken) {
295+
STLOG(PRI_CRIT, BS_REPL, BSVR38, ReplCtx->VCtx->VDiskLogPrefix << "excessive replication token requested");
296+
break;
297+
}
298+
RequestedReplicationToken = true;
291299
if (!Send(MakeBlobStorageReplBrokerID(), new TEvQueryReplToken(ReplCtx->VDiskCfg->BaseInfo.PDiskId))) {
292300
HandleReplToken();
293301
}
@@ -304,6 +312,10 @@ namespace NKikimr {
304312
}
305313

306314
void HandleReplToken() {
315+
Y_ABORT_UNLESS(RequestedReplicationToken);
316+
RequestedReplicationToken = false;
317+
HoldingReplicationToken = true;
318+
307319
// switch to replication state
308320
Transition(AwaitToken, Replication);
309321
if (!ResumeIfReady()) {
@@ -410,6 +422,9 @@ namespace NKikimr {
410422
if (State == WaitQueues || State == Replication) {
411423
// release token as we have finished replicating
412424
Send(MakeBlobStorageReplBrokerID(), new TEvReleaseReplToken);
425+
Y_DEBUG_ABORT_UNLESS(!RequestedReplicationToken);
426+
Y_DEBUG_ABORT_UNLESS(HoldingReplicationToken);
427+
HoldingReplicationToken = false;
413428
}
414429
ResetReplProgressTimer(true);
415430

@@ -638,7 +653,15 @@ namespace NKikimr {
638653

639654
// return replication token if we have one
640655
if (State == AwaitToken || State == WaitQueues || State == Replication) {
641-
Send(MakeBlobStorageReplBrokerID(), new TEvReleaseReplToken);
656+
Y_DEBUG_ABORT_UNLESS(RequestedReplicationToken || HoldingReplicationToken);
657+
if (RequestedReplicationToken || HoldingReplicationToken) {
658+
Send(MakeBlobStorageReplBrokerID(), new TEvReleaseReplToken);
659+
}
660+
} else {
661+
Y_DEBUG_ABORT_UNLESS(!RequestedReplicationToken && !HoldingReplicationToken);
662+
if (RequestedReplicationToken || HoldingReplicationToken) {
663+
STLOG(PRI_CRIT, BS_REPL, BSVR37, ReplCtx->VCtx->VDiskLogPrefix << "stuck replication token");
664+
}
642665
}
643666

644667
if (ReplJobActorId) {

ydb/core/blobstorage/vdisk/repl/blobstorage_replproxy.cpp

Lines changed: 17 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -129,6 +129,7 @@ namespace NKikimr {
129129
ui64 NextReceiveCookie;
130130
TResultQueue ResultQueue;
131131
std::shared_ptr<TMessageRelevanceTracker> Tracker = std::make_shared<TMessageRelevanceTracker>();
132+
bool Terminated = false;
132133

133134
TQueue<std::unique_ptr<TEvBlobStorage::TEvVGet>> SchedulerRequestQ;
134135
THashMap<ui64, TReplMemTokenId> RequestTokens;
@@ -227,9 +228,7 @@ namespace NKikimr {
227228
PrefetchDataSize = 0;
228229
RequestFromVDiskProxyPending = false;
229230
if (Finished) {
230-
Send(MakeBlobStorageReplBrokerID(), new TEvPruneQueue);
231-
RequestTokens.clear();
232-
return PassAway(); // TODO(alexvru): check correctness of invocations
231+
return PassAway();
233232
}
234233
}
235234
// send request(s) if prefetch queue is not full
@@ -297,6 +296,9 @@ namespace NKikimr {
297296
if (msg->Record.GetCookie() == NextReceiveCookie) {
298297
ui64 cookie = NextReceiveCookie;
299298
ProcessResult(msg);
299+
if (Terminated) {
300+
return;
301+
}
300302
ReleaseMemToken(cookie);
301303
while (!ResultQueue.empty()) {
302304
const TQueueItem& top = ResultQueue.top();
@@ -305,6 +307,9 @@ namespace NKikimr {
305307
}
306308
ui64 cookie = NextReceiveCookie;
307309
ProcessResult(top.get());
310+
if (Terminated) {
311+
return;
312+
}
308313
ReleaseMemToken(cookie);
309314
ResultQueue.pop();
310315
}
@@ -314,6 +319,7 @@ namespace NKikimr {
314319
}
315320

316321
void ReleaseMemToken(ui64 cookie) {
322+
Y_ABORT_UNLESS(!Terminated);
317323
if (RequestTokens) {
318324
auto it = RequestTokens.find(cookie);
319325
Y_ABORT_UNLESS(it != RequestTokens.end());
@@ -428,6 +434,13 @@ namespace NKikimr {
428434
}
429435
}
430436

437+
void PassAway() override {
438+
Y_ABORT_UNLESS(!Terminated);
439+
Terminated = true;
440+
Send(MakeBlobStorageReplBrokerID(), new TEvPruneQueue);
441+
TActorBootstrapped::PassAway();
442+
}
443+
431444
STRICT_STFUNC(StateFunc,
432445
hFunc(TEvReplProxyNext, Handle)
433446
hFunc(TEvReplMemToken, Handle)
@@ -446,8 +459,7 @@ namespace NKikimr {
446459
TTrackableVector<TVDiskProxy::TScheduledBlob>&& ids,
447460
const TVDiskID& vdiskId,
448461
const TActorId& serviceId)
449-
: TActorBootstrapped<TVDiskProxyActor>()
450-
, ReplCtx(std::move(replCtx))
462+
: ReplCtx(std::move(replCtx))
451463
, GType(ReplCtx->VCtx->Top->GType)
452464
, Ids(std::move(ids))
453465
, VDiskId(vdiskId)

0 commit comments

Comments
 (0)