From 507735a43f0bbd4b3aa0a92b9a68f919c2c18a12 Mon Sep 17 00:00:00 2001 From: Sergey Belyakov Date: Wed, 2 Jul 2025 13:06:52 +0000 Subject: [PATCH 1/2] Finish LocalRecovery with error if ChunkReadResult is erroneous --- .../blobstorage/ut_blobstorage/recovery.cpp | 81 +++++++++++++++++++ .../hulldb/generic/hulldb_bulksstmngr.cpp | 19 +++-- .../vdisk/hullop/blobstorage_hullload.h | 63 +++++++++++---- .../localrecovery/localrecovery_public.cpp | 10 ++- .../localrecovery_readbulksst.cpp | 18 +++-- 5 files changed, 164 insertions(+), 27 deletions(-) diff --git a/ydb/core/blobstorage/ut_blobstorage/recovery.cpp b/ydb/core/blobstorage/ut_blobstorage/recovery.cpp index 26be48e53d5b..b018f5c80930 100644 --- a/ydb/core/blobstorage/ut_blobstorage/recovery.cpp +++ b/ydb/core/blobstorage/ut_blobstorage/recovery.cpp @@ -3,10 +3,14 @@ #include #include +#include "ut_helpers.h" + #include #include +using namespace NKikimr; + Y_UNIT_TEST_SUITE(CompatibilityInfo) { using EComponentId = NKikimrConfig::TCompatibilityRule::EComponentId; @@ -230,5 +234,82 @@ Y_UNIT_TEST_SUITE(CompatibilityInfo) { Y_UNIT_TEST(BSControllerMigration) { TestMigration(TVersion{ 23, 3, 13, 0 }, TVersion{ 23, 4, 1, 0 }, TVersion{ 23, 5, 1, 0 }, componentBSController, ValidateForBSController); } +} + + +Y_UNIT_TEST_SUITE(VDiskRecovery) { + struct TTestCtx : public TTestCtxBase { + TTestCtx() + : TTestCtxBase(TEnvironmentSetup::TSettings{ + .NodeCount = 8, + .Erasure = TBlobStorageGroupType::Erasure4Plus2Block, + .ControllerNodeId = 1, + }) + {} + + void RestartNode() { + Env->StopNode(NodeToRestart); + Env->Sim(TDuration::Minutes(1)); + Env->StartNode(NodeToRestart); + } + + ui32 NodeToRestart = 2; + }; + + void TestVDiskRecovery() { + TTestCtx ctx; + ctx.Initialize(); + TVDiskID vdiskId; + for (const auto& vslot : ctx.BaseConfig.GetVSlot()) { + if (vslot.GetVSlotId().GetNodeId() == ctx.NodeToRestart) { + vdiskId = TVDiskID(vslot.GetGroupId(), vslot.GetGroupGeneration(), vslot.GetFailRealmIdx(), + vslot.GetFailDomainIdx(), vslot.GetVDiskIdx()); + break; + } + } + + ctx.WriteCompressedData(TTestCtxBase::TDataProfile{ + .GroupId = ctx.GroupId, + .TotalSize = 10_MB, + .BlobSize = 100, + }); + + ctx.WriteCompressedData(TTestCtxBase::TDataProfile{ + .GroupId = ctx.GroupId, + .TotalBlobs = 100, + .BlobSize = 4_MB, + }); + + ctx.Env->Runtime->FilterFunction = [&](ui32 nodeId, std::unique_ptr& ev) { + if (nodeId == ctx.NodeToRestart) { + if (ev->GetTypeRewrite() == NPDisk::TEvChunkReadResult::EventType) { + NPDisk::TEvChunkReadResult* result = ev->Get(); + result->Status = NKikimrProto::ERROR; + } + } + + return true; + }; + + ctx.RestartNode(); + + ctx.Env->Sim(TDuration::Minutes(5)); + + NKikimrBlobStorage::EVDiskQueueId queueId = NKikimrBlobStorage::GetFastRead; + + TAutoPtr> res; + ctx.Env->WithQueueId(vdiskId, queueId, [&](const TActorId& actorId) { + ctx.Env->Runtime->Send(new IEventHandle(actorId, ctx.Edge, new TEvBlobStorage::TEvVStatus()), actorId.NodeId()); + res = ctx.Env->WaitForEdgeActorEvent(ctx.Edge, false, ctx.Env->Now() + TDuration::Seconds(10)); + }); + + UNIT_ASSERT(res); + Cerr << res->Get()->ToString() << Endl; + NKikimrProto::EReplyStatus status = res->Get()->Record.GetStatus(); + UNIT_ASSERT_C(status == NKikimrProto::VDISK_ERROR_STATE, res->Get()->ToString()); + } + Y_UNIT_TEST(ChunkReadErrorOnVDiskRecovery) { + TestVDiskRecovery(); + } } diff --git a/ydb/core/blobstorage/vdisk/hulldb/generic/hulldb_bulksstmngr.cpp b/ydb/core/blobstorage/vdisk/hulldb/generic/hulldb_bulksstmngr.cpp index 2b8c1f16e31a..a5dc69dd71e2 100644 --- a/ydb/core/blobstorage/vdisk/hulldb/generic/hulldb_bulksstmngr.cpp +++ b/ydb/core/blobstorage/vdisk/hulldb/generic/hulldb_bulksstmngr.cpp @@ -99,15 +99,24 @@ namespace NKikimr { Die(ctx); } - void HandlePoison(TEvents::TEvPoisonPill::TPtr &ev, const TActorContext &ctx) { - Y_UNUSED(ev); - ActiveActors.KillAndClear(ctx); - Die(ctx); + void HandlePoison() { + ActiveActors.KillAndClear(TActivationContext::AsActorContext()); + PassAway(); + } + + void Handle(const TEvents::TEvActorDied::TPtr& ev) { + // One LevelSegmentLoader termintaed unsuccessfully, kill all other actors, + // send TEvActorDied to the parent and Die + ActiveActors.Erase(ev->Sender); + ActiveActors.KillAndClear(TActivationContext::AsActorContext()); + Send(LocalRecoveryActorId, new TEvents::TEvActorDied); + PassAway(); } STRICT_STFUNC(StateFunc, HFunc(THullSegLoaded, Handle) - HFunc(TEvents::TEvPoisonPill, HandlePoison) + hFunc(TEvents::TEvActorDied, Handle) + cFunc(TEvents::TEvPoisonPill::EventType, HandlePoison) ) }; } // NLoaderActor diff --git a/ydb/core/blobstorage/vdisk/hullop/blobstorage_hullload.h b/ydb/core/blobstorage/vdisk/hullop/blobstorage_hullload.h index 1f173aeed713..26cb0de92812 100644 --- a/ydb/core/blobstorage/vdisk/hullop/blobstorage_hullload.h +++ b/ydb/core/blobstorage/vdisk/hullop/blobstorage_hullload.h @@ -224,7 +224,14 @@ namespace NKikimr { void Handle(NPDisk::TEvChunkReadResult::TPtr &ev, const TActorContext &ctx) { auto *msg = ev->Get(); - CHECK_PDISK_RESPONSE_READABLE_MSG(VCtx, ev, ctx, TStringBuilder() << "{Origin# '" << Origin << "'}"); + + TString errorString = TStringBuilder() << "{Origin# '" << Origin << "'}"; + if (!VCtx->CheckPDiskResponse(ctx, *ev->Get(), errorString) || + !VCtx->CheckPDiskResponseReadable(ctx, *ev->Get(), errorString)) { + this->Send(Recipient, new TEvents::TEvActorDied); + this->PassAway(); + return; + } const TBufferWithGaps &data = msg->Data; LevelSegment->IndexParts.push_back({msg->ChunkIdx, msg->Offset, msg->Data.Size()}); @@ -387,15 +394,23 @@ namespace NKikimr { Process(ctx); } - void HandlePoison(TEvents::TEvPoisonPill::TPtr &ev, const TActorContext &ctx) { - Y_UNUSED(ev); - ActiveActors.KillAndClear(ctx); - TThis::Die(ctx); + void HandlePoison() { + ActiveActors.KillAndClear(TActivationContext::AsActorContext()); + this->PassAway(); + } + + void Handle(const TEvents::TEvActorDied::TPtr& ev) { + // One LevelSegmentLoader termintaed unsuccessfully + // send TEvActorDied to the parent and Die + // This actor only has one child actor at a time, no need to clear ActiveActors + this->Send(Recipient, new TEvents::TEvActorDied); + this->PassAway(); } STRICT_STFUNC(StateFunc, HTemplFunc(THullSegLoaded, Handle) - HFunc(TEvents::TEvPoisonPill, HandlePoison) + hFunc(TEvents::TEvActorDied, Handle) + cFunc(TEvents::TEvPoisonPill::EventType, HandlePoison) ) public: @@ -466,15 +481,23 @@ namespace NKikimr { Process(ctx); } - void HandlePoison(TEvents::TEvPoisonPill::TPtr &ev, const TActorContext &ctx) { - Y_UNUSED(ev); - ActiveActors.KillAndClear(ctx); - TThis::Die(ctx); + void HandlePoison() { + ActiveActors.KillAndClear(TActivationContext::AsActorContext()); + this->PassAway(); + } + + void Handle(const TEvents::TEvActorDied::TPtr& ev) { + // One LevelSegmentLoader termintaed unsuccessfully, kill all other actors, + // send TEvActorDied to the parent and Die + // This actor only has one child actor at a time, no need to clear ActiveActors + this->Send(Recipient, new TEvents::TEvActorDied); + this->PassAway(); } STRICT_STFUNC(StateFunc, HTemplFunc(THullSegLoaded, Handle) - HFunc(TEvents::TEvPoisonPill, HandlePoison) + hFunc(TEvents::TEvActorDied, Handle) + cFunc(TEvents::TEvPoisonPill::EventType, HandlePoison) ) public: @@ -558,15 +581,23 @@ namespace NKikimr { Process(ctx); } - void HandlePoison(TEvents::TEvPoisonPill::TPtr &ev, const TActorContext &ctx) { - Y_UNUSED(ev); - ActiveActors.KillAndClear(ctx); - TThis::Die(ctx); + void HandlePoison() { + ActiveActors.KillAndClear(TActivationContext::AsActorContext()); + this->PassAway(); + } + + void Handle(const TEvents::TEvActorDied::TPtr& ev) { + // One LevelSegmentLoader termintaed unsuccessfully, kill all other actors, + // send TEvActorDied to the parent and Die + // This actor only has one child actor at a time, no need to clear ActiveActors + this->Send(Recipient, new TEvents::TEvActorDied); + this->PassAway(); } STRICT_STFUNC(StateFunc, HTemplFunc(THullSegLoaded, Handle) - HFunc(TEvents::TEvPoisonPill, HandlePoison) + hFunc(TEvents::TEvActorDied, Handle) + cFunc(TEvents::TEvPoisonPill::EventType, HandlePoison) ) public: diff --git a/ydb/core/blobstorage/vdisk/localrecovery/localrecovery_public.cpp b/ydb/core/blobstorage/vdisk/localrecovery/localrecovery_public.cpp index d0128cf3d414..de1907476a1c 100644 --- a/ydb/core/blobstorage/vdisk/localrecovery/localrecovery_public.cpp +++ b/ydb/core/blobstorage/vdisk/localrecovery/localrecovery_public.cpp @@ -97,7 +97,7 @@ namespace NKikimr { VDiskMonGroup.VDiskLocalRecoveryState() = TDbMon::TDbLocalRecovery::Error; LOG_CRIT(ctx, BS_LOCALRECOVERY, VDISKP(LocRecCtx->VCtx->VDiskLogPrefix, - "LocalRecovery FINISHED: %s reason# %s status# %s;" + "LocalRecovery FINISHED: %s reason# %s status# %s; " "VDISK LOCAL RECOVERY FAILURE DUE TO LOGICAL ERROR", LocRecCtx->RecovInfo->ToString().data(), reason.data(), NKikimrProto::EReplyStatus_Name(status).data())); @@ -668,6 +668,13 @@ namespace NKikimr { ctx.Send(ev->Sender, new NMon::TEvHttpInfoRes(str.Str(), TDbMon::LocalRecovInfoId)); } + void Handle(const TEvents::TEvActorDied::TPtr& ev) { + ActiveActors.Erase(ev->Sender); + ActiveActors.KillAndClear(TActivationContext::AsActorContext()); + SignalErrorAndDie(TActivationContext::AsActorContext(), NKikimrProto::ERROR, + "Auxiliary actor terminated unexpectedly"); + } + STRICT_STFUNC(StateInitialize, HFunc(NPDisk::TEvYardInitResult, Handle) @@ -678,6 +685,7 @@ namespace NKikimr { STRICT_STFUNC(StateLoadDatabase, HFunc(THullIndexLoaded, Handle) + hFunc(TEvents::TEvActorDied, Handle) CFunc(NActors::TEvents::TSystem::PoisonPill, HandlePoison) HFunc(NMon::TEvHttpInfo, Handle) ) diff --git a/ydb/core/blobstorage/vdisk/localrecovery/localrecovery_readbulksst.cpp b/ydb/core/blobstorage/vdisk/localrecovery/localrecovery_readbulksst.cpp index d61fbb7ce944..5951157fb0d0 100644 --- a/ydb/core/blobstorage/vdisk/localrecovery/localrecovery_readbulksst.cpp +++ b/ydb/core/blobstorage/vdisk/localrecovery/localrecovery_readbulksst.cpp @@ -57,15 +57,23 @@ namespace NKikimr { Process(ctx); } - void HandlePoison(TEvents::TEvPoisonPill::TPtr &ev, const TActorContext &ctx) { - Y_UNUSED(ev); - ActiveActors.KillAndClear(ctx); - TThis::Die(ctx); + void HandlePoison() { + ActiveActors.KillAndClear(TActivationContext::AsActorContext()); + this->PassAway(); + } + + void Handle(const TEvents::TEvActorDied::TPtr& ev) { + // One LevelSegmentLoader termintaed unsuccessfully + // send TEvActorDied to the parent and Die + // This actor only has one child actor at a time, no need to clear ActiveActors + this->Send(Recipient, new TEvents::TEvActorDied); + this->PassAway(); } STRICT_STFUNC(StateFunc, HTemplFunc(THullSegLoaded, Handle) - HFunc(TEvents::TEvPoisonPill, HandlePoison) + hFunc(TEvents::TEvActorDied, Handle) + cFunc(TEvents::TEvPoisonPill::EventType, HandlePoison) ) public: From b5d52039aa2c365b788591e76e14371e9d256b30 Mon Sep 17 00:00:00 2001 From: Sergey Belyakov Date: Wed, 2 Jul 2025 16:18:17 +0000 Subject: [PATCH 2/2] Fix build --- ydb/core/blobstorage/vdisk/hullop/blobstorage_hullload.h | 6 +++--- .../vdisk/localrecovery/localrecovery_readbulksst.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ydb/core/blobstorage/vdisk/hullop/blobstorage_hullload.h b/ydb/core/blobstorage/vdisk/hullop/blobstorage_hullload.h index 26cb0de92812..163d2c8a36fe 100644 --- a/ydb/core/blobstorage/vdisk/hullop/blobstorage_hullload.h +++ b/ydb/core/blobstorage/vdisk/hullop/blobstorage_hullload.h @@ -399,7 +399,7 @@ namespace NKikimr { this->PassAway(); } - void Handle(const TEvents::TEvActorDied::TPtr& ev) { + void Handle(const TEvents::TEvActorDied::TPtr&) { // One LevelSegmentLoader termintaed unsuccessfully // send TEvActorDied to the parent and Die // This actor only has one child actor at a time, no need to clear ActiveActors @@ -486,7 +486,7 @@ namespace NKikimr { this->PassAway(); } - void Handle(const TEvents::TEvActorDied::TPtr& ev) { + void Handle(const TEvents::TEvActorDied::TPtr&) { // One LevelSegmentLoader termintaed unsuccessfully, kill all other actors, // send TEvActorDied to the parent and Die // This actor only has one child actor at a time, no need to clear ActiveActors @@ -586,7 +586,7 @@ namespace NKikimr { this->PassAway(); } - void Handle(const TEvents::TEvActorDied::TPtr& ev) { + void Handle(const TEvents::TEvActorDied::TPtr&) { // One LevelSegmentLoader termintaed unsuccessfully, kill all other actors, // send TEvActorDied to the parent and Die // This actor only has one child actor at a time, no need to clear ActiveActors diff --git a/ydb/core/blobstorage/vdisk/localrecovery/localrecovery_readbulksst.cpp b/ydb/core/blobstorage/vdisk/localrecovery/localrecovery_readbulksst.cpp index 5951157fb0d0..945fd33dc9e9 100644 --- a/ydb/core/blobstorage/vdisk/localrecovery/localrecovery_readbulksst.cpp +++ b/ydb/core/blobstorage/vdisk/localrecovery/localrecovery_readbulksst.cpp @@ -62,7 +62,7 @@ namespace NKikimr { this->PassAway(); } - void Handle(const TEvents::TEvActorDied::TPtr& ev) { + void Handle(const TEvents::TEvActorDied::TPtr&) { // One LevelSegmentLoader termintaed unsuccessfully // send TEvActorDied to the parent and Die // This actor only has one child actor at a time, no need to clear ActiveActors