Skip to content

Commit b4dc680

Browse files
committed
Finish LocalRecovery with error if ChunkReadResult is erroneous
1 parent 5aa410f commit b4dc680

File tree

5 files changed

+168
-27
lines changed

5 files changed

+168
-27
lines changed

ydb/core/blobstorage/ut_blobstorage/recovery.cpp

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,14 @@
33
#include <ydb/core/driver_lib/version/version.h>
44
#include <ydb/core/driver_lib/version/ut/ut_helpers.h>
55

6+
#include "ut_helpers.h"
7+
68
#include <library/cpp/testing/unittest/registar.h>
79

810
#include <google/protobuf/text_format.h>
911

12+
using namespace NKikimr;
13+
1014
Y_UNIT_TEST_SUITE(CompatibilityInfo) {
1115
using EComponentId = NKikimrConfig::TCompatibilityRule::EComponentId;
1216

@@ -230,5 +234,82 @@ Y_UNIT_TEST_SUITE(CompatibilityInfo) {
230234
Y_UNIT_TEST(BSControllerMigration) {
231235
TestMigration(TVersion{ 23, 3, 13, 0 }, TVersion{ 23, 4, 1, 0 }, TVersion{ 23, 5, 1, 0 }, componentBSController, ValidateForBSController);
232236
}
237+
}
238+
239+
240+
Y_UNIT_TEST_SUITE(VDiskRecovery) {
241+
struct TTestCtx : public TTestCtxBase {
242+
TTestCtx()
243+
: TTestCtxBase(TEnvironmentSetup::TSettings{
244+
.NodeCount = 8,
245+
.Erasure = TBlobStorageGroupType::Erasure4Plus2Block,
246+
.ControllerNodeId = 1,
247+
})
248+
{}
249+
250+
void RestartNode() {
251+
Env->StopNode(NodeToRestart);
252+
Env->Sim(TDuration::Minutes(1));
253+
Env->StartNode(NodeToRestart);
254+
}
255+
256+
ui32 NodeToRestart = 2;
257+
};
258+
259+
void TestVDiskRecovery() {
260+
TTestCtx ctx;
261+
ctx.Initialize();
262+
TVDiskID vdiskId;
263+
for (const auto& vslot : ctx.BaseConfig.GetVSlot()) {
264+
if (vslot.GetVSlotId().GetNodeId() == ctx.NodeToRestart) {
265+
vdiskId = TVDiskID(vslot.GetGroupId(), vslot.GetGroupGeneration(), vslot.GetFailRealmIdx(),
266+
vslot.GetFailDomainIdx(), vslot.GetVDiskIdx());
267+
break;
268+
}
269+
}
270+
271+
ctx.WriteCompressedData(TTestCtxBase::TDataProfile{
272+
.GroupId = ctx.GroupId,
273+
.TotalSize = 10_MB,
274+
.BlobSize = 100,
275+
});
276+
277+
ctx.WriteCompressedData(TTestCtxBase::TDataProfile{
278+
.GroupId = ctx.GroupId,
279+
.TotalBlobs = 100,
280+
.BlobSize = 4_MB,
281+
});
282+
283+
ctx.Env->Runtime->FilterFunction = [&](ui32 nodeId, std::unique_ptr<IEventHandle>& ev) {
284+
if (nodeId == ctx.NodeToRestart) {
285+
if (ev->GetTypeRewrite() == NPDisk::TEvChunkReadResult::EventType) {
286+
NPDisk::TEvChunkReadResult* result = ev->Get<NPDisk::TEvChunkReadResult>();
287+
result->Status = NKikimrProto::ERROR;
288+
}
289+
}
290+
291+
return true;
292+
};
293+
294+
ctx.RestartNode();
295+
296+
ctx.Env->Sim(TDuration::Minutes(5));
297+
298+
NKikimrBlobStorage::EVDiskQueueId queueId = NKikimrBlobStorage::GetFastRead;
299+
300+
TAutoPtr<TEventHandle<TEvBlobStorage::TEvVStatusResult>> res;
301+
ctx.Env->WithQueueId(vdiskId, queueId, [&](const TActorId& actorId) {
302+
ctx.Env->Runtime->Send(new IEventHandle(actorId, ctx.Edge, new TEvBlobStorage::TEvVStatus()), actorId.NodeId());
303+
res = ctx.Env->WaitForEdgeActorEvent<TEvBlobStorage::TEvVStatusResult>(ctx.Edge, false, ctx.Env->Now() + TDuration::Seconds(10));
304+
});
305+
306+
UNIT_ASSERT(res);
307+
Cerr << res->Get()->ToString() << Endl;
308+
NKikimrProto::EReplyStatus status = res->Get()->Record.GetStatus();
309+
UNIT_ASSERT_C(status == NKikimrProto::VDISK_ERROR_STATE, res->Get()->ToString());
310+
}
233311

312+
Y_UNIT_TEST(ChunkReadErrorOnVDiskRecovery) {
313+
TestVDiskRecovery();
314+
}
234315
}

ydb/core/blobstorage/vdisk/hulldb/generic/hulldb_bulksstmngr.cpp

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -99,15 +99,24 @@ namespace NKikimr {
9999
Die(ctx);
100100
}
101101

102-
void HandlePoison(TEvents::TEvPoisonPill::TPtr &ev, const TActorContext &ctx) {
103-
Y_UNUSED(ev);
104-
ActiveActors.KillAndClear(ctx);
105-
Die(ctx);
102+
void HandlePoison() {
103+
ActiveActors.KillAndClear(TActivationContext::AsActorContext());
104+
PassAway();
105+
}
106+
107+
void Handle(const TEvents::TEvActorDied::TPtr& ev) {
108+
// One LevelSegmentLoader terminated unsuccessfully, kill all other actors,
109+
// send TEvActorDied to the parent and Die
110+
ActiveActors.Erase(ev->Sender);
111+
ActiveActors.KillAndClear(TActivationContext::AsActorContext());
112+
Send(LocalRecoveryActorId, new TEvents::TEvActorDied);
113+
PassAway();
106114
}
107115

108116
STRICT_STFUNC(StateFunc,
109117
HFunc(THullSegLoaded, Handle)
110-
HFunc(TEvents::TEvPoisonPill, HandlePoison)
118+
hFunc(TEvents::TEvActorDied, Handle)
119+
cFunc(TEvents::TEvPoisonPill::EventType, HandlePoison)
111120
)
112121
};
113122
} // NLoaderActor

ydb/core/blobstorage/vdisk/hullop/blobstorage_hullload.h

Lines changed: 50 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,14 @@ namespace NKikimr {
224224

225225
void Handle(NPDisk::TEvChunkReadResult::TPtr &ev, const TActorContext &ctx) {
226226
auto *msg = ev->Get();
227-
CHECK_PDISK_RESPONSE_READABLE_MSG(VCtx, ev, ctx, TStringBuilder() << "{Origin# '" << Origin << "'}");
227+
228+
TString errorString = TStringBuilder() << "{Origin# '" << Origin << "'}";
229+
if (!VCtx->CheckPDiskResponse(ctx, *ev->Get(), errorString) ||
230+
!VCtx->CheckPDiskResponseReadable(ctx, *ev->Get(), errorString)) {
231+
this->Send(Recipient, new TEvents::TEvActorDied);
232+
this->PassAway();
233+
return;
234+
}
228235

229236
const TBufferWithGaps &data = msg->Data;
230237
LevelSegment->IndexParts.push_back({msg->ChunkIdx, msg->Offset, msg->Data.Size()});
@@ -387,15 +394,24 @@ namespace NKikimr {
387394
Process(ctx);
388395
}
389396

390-
void HandlePoison(TEvents::TEvPoisonPill::TPtr &ev, const TActorContext &ctx) {
391-
Y_UNUSED(ev);
392-
ActiveActors.KillAndClear(ctx);
393-
TThis::Die(ctx);
397+
void HandlePoison() {
398+
ActiveActors.KillAndClear(TActivationContext::AsActorContext());
399+
this->PassAway();
400+
}
401+
402+
void Handle(const TEvents::TEvActorDied::TPtr& ev) {
403+
// One LevelSegmentLoader terminated unsuccessfully
404+
// send TEvActorDied to the parent and Die
405+
// This actor only has one child actor at a time, no need to clear ActiveActors
406+
ActiveActors.Erase(ev->Sender);
407+
this->Send(Recipient, new TEvents::TEvActorDied);
408+
this->PassAway();
394409
}
395410

396411
STRICT_STFUNC(StateFunc,
397412
HTemplFunc(THullSegLoaded, Handle)
398-
HFunc(TEvents::TEvPoisonPill, HandlePoison)
413+
hFunc(TEvents::TEvActorDied, Handle)
414+
cFunc(TEvents::TEvPoisonPill::EventType, HandlePoison)
399415
)
400416

401417
public:
@@ -466,15 +482,24 @@ namespace NKikimr {
466482
Process(ctx);
467483
}
468484

469-
void HandlePoison(TEvents::TEvPoisonPill::TPtr &ev, const TActorContext &ctx) {
470-
Y_UNUSED(ev);
471-
ActiveActors.KillAndClear(ctx);
472-
TThis::Die(ctx);
485+
void HandlePoison() {
486+
ActiveActors.KillAndClear(TActivationContext::AsActorContext());
487+
this->PassAway();
488+
}
489+
490+
void Handle(const TEvents::TEvActorDied::TPtr& ev) {
491+
// One LevelSegmentLoader terminated unsuccessfully,
492+
// send TEvActorDied to the parent and Die
493+
// This actor only has one child actor at a time, no need to clear ActiveActors
494+
ActiveActors.Erase(ev->Sender);
495+
this->Send(Recipient, new TEvents::TEvActorDied);
496+
this->PassAway();
473497
}
474498

475499
STRICT_STFUNC(StateFunc,
476500
HTemplFunc(THullSegLoaded, Handle)
477-
HFunc(TEvents::TEvPoisonPill, HandlePoison)
501+
hFunc(TEvents::TEvActorDied, Handle)
502+
cFunc(TEvents::TEvPoisonPill::EventType, HandlePoison)
478503
)
479504

480505
public:
@@ -558,15 +583,24 @@ namespace NKikimr {
558583
Process(ctx);
559584
}
560585

561-
void HandlePoison(TEvents::TEvPoisonPill::TPtr &ev, const TActorContext &ctx) {
562-
Y_UNUSED(ev);
563-
ActiveActors.KillAndClear(ctx);
564-
TThis::Die(ctx);
586+
void HandlePoison() {
587+
ActiveActors.KillAndClear(TActivationContext::AsActorContext());
588+
this->PassAway();
589+
}
590+
591+
void Handle(const TEvents::TEvActorDied::TPtr& ev) {
592+
// One LevelSegmentLoader terminated unsuccessfully,
593+
// send TEvActorDied to the parent and Die
594+
// This actor only has one child actor at a time, no need to clear ActiveActors
595+
ActiveActors.Erase(ev->Sender);
596+
this->Send(Recipient, new TEvents::TEvActorDied);
597+
this->PassAway();
565598
}
566599

567600
STRICT_STFUNC(StateFunc,
568601
HTemplFunc(THullSegLoaded, Handle)
569-
HFunc(TEvents::TEvPoisonPill, HandlePoison)
602+
hFunc(TEvents::TEvActorDied, Handle)
603+
cFunc(TEvents::TEvPoisonPill::EventType, HandlePoison)
570604
)
571605

572606
public:

ydb/core/blobstorage/vdisk/localrecovery/localrecovery_public.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ namespace NKikimr {
9797
VDiskMonGroup.VDiskLocalRecoveryState() = TDbMon::TDbLocalRecovery::Error;
9898
LOG_CRIT(ctx, BS_LOCALRECOVERY,
9999
VDISKP(LocRecCtx->VCtx->VDiskLogPrefix,
100-
"LocalRecovery FINISHED: %s reason# %s status# %s;"
100+
"LocalRecovery FINISHED: %s reason# %s status# %s; "
101101
"VDISK LOCAL RECOVERY FAILURE DUE TO LOGICAL ERROR",
102102
LocRecCtx->RecovInfo->ToString().data(), reason.data(),
103103
NKikimrProto::EReplyStatus_Name(status).data()));
@@ -668,6 +668,13 @@ namespace NKikimr {
668668
ctx.Send(ev->Sender, new NMon::TEvHttpInfoRes(str.Str(), TDbMon::LocalRecovInfoId));
669669
}
670670

671+
void Handle(const TEvents::TEvActorDied::TPtr& ev) {
672+
ActiveActors.Erase(ev->Sender);
673+
ActiveActors.KillAndClear(TActivationContext::AsActorContext());
674+
SignalErrorAndDie(TActivationContext::AsActorContext(), NKikimrProto::ERROR,
675+
"Auxiliary actor terminated unexpectedly");
676+
}
677+
671678

672679
STRICT_STFUNC(StateInitialize,
673680
HFunc(NPDisk::TEvYardInitResult, Handle)
@@ -678,6 +685,7 @@ namespace NKikimr {
678685

679686
STRICT_STFUNC(StateLoadDatabase,
680687
HFunc(THullIndexLoaded, Handle)
688+
hFunc(TEvents::TEvActorDied, Handle)
681689
CFunc(NActors::TEvents::TSystem::PoisonPill, HandlePoison)
682690
HFunc(NMon::TEvHttpInfo, Handle)
683691
)

ydb/core/blobstorage/vdisk/localrecovery/localrecovery_readbulksst.cpp

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,15 +57,24 @@ namespace NKikimr {
5757
Process(ctx);
5858
}
5959

60-
void HandlePoison(TEvents::TEvPoisonPill::TPtr &ev, const TActorContext &ctx) {
61-
Y_UNUSED(ev);
62-
ActiveActors.KillAndClear(ctx);
63-
TThis::Die(ctx);
60+
void HandlePoison() {
61+
ActiveActors.KillAndClear(TActivationContext::AsActorContext());
62+
this->PassAway();
63+
}
64+
65+
void Handle(const TEvents::TEvActorDied::TPtr& ev) {
66+
// One LevelSegmentLoader terminated unsuccessfully
67+
// send TEvActorDied to the parent and Die
68+
// This actor only has one child actor at a time, no need to clear ActiveActors
69+
ActiveActors.Erase(ev->Sender);
70+
this->Send(Recipient, new TEvents::TEvActorDied);
71+
this->PassAway();
6472
}
6573

6674
STRICT_STFUNC(StateFunc,
6775
HTemplFunc(THullSegLoaded, Handle)
68-
HFunc(TEvents::TEvPoisonPill, HandlePoison)
76+
hFunc(TEvents::TEvActorDied, Handle)
77+
cFunc(TEvents::TEvPoisonPill::EventType, HandlePoison)
6978
)
7079

7180
public:

0 commit comments

Comments
 (0)