Skip to content

Commit bc2aa96

Browse files
Vadim Averin authored and blinkov committed
Kill tablet on BS failures (#13766)
1 parent e4bad6f commit bc2aa96

File tree

7 files changed

+66
-8
lines changed

7 files changed

+66
-8
lines changed

ydb/core/tx/columnshard/blobs_action/bs/gc.cpp

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,9 @@ TGCTask::TGCTask(const TString& storageId, TGCListsByGroup&& listsByGroupId, con
4545
}
4646

4747
void TGCTask::OnGCResult(TEvBlobStorage::TEvCollectGarbageResult::TPtr ev) {
48-
AFL_VERIFY(ev->Get()->Status == NKikimrProto::OK)("status", ev->Get()->Status)("details", ev->Get()->ToString())("action_id", GetActionGuid());
48+
if (ev->Get()->Status != NKikimrProto::OK) {
49+
Failures++;
50+
}
4951
TBlobAddress bAddress(ev->Cookie, ev->Get()->Channel);
5052
auto itGroup = ListsByGroupId.find(bAddress);
5153
AFL_VERIFY(itGroup != ListsByGroupId.end())("address", bAddress.DebugString());
@@ -59,8 +61,14 @@ static TAtomicCounter PerGenerationCounter = 1;
5961
std::unique_ptr<TEvBlobStorage::TEvCollectGarbage> TGCTask::BuildRequest(const TBlobAddress& address) const {
6062
auto it = ListsByGroupId.find(address);
6163
AFL_VERIFY(it != ListsByGroupId.end());
62-
AFL_VERIFY(++it->second.RequestsCount < 10)("event", "build_gc_request")("address", address.DebugString())("current_gen", CurrentGen)("gen", CollectGenStepInFlight)
63-
("count", it->second.RequestsCount);
64+
if (++it->second.RequestsCount >= TGCLists::RequestsLimit) {
65+
AFL_CRIT(NKikimrServices::TX_COLUMNSHARD_BLOBS_BS)
66+
("event", "build_gc_request")
67+
("address", address.DebugString())("current_gen", CurrentGen)
68+
("gen", CollectGenStepInFlight)
69+
("count", it->second.RequestsCount);
70+
return nullptr;
71+
}
6472
AFL_DEBUG(NKikimrServices::TX_COLUMNSHARD_BLOBS_BS)("event", "build_gc_request")("address", address.DebugString())("current_gen", CurrentGen)("gen", CollectGenStepInFlight)
6573
("count", it->second.RequestsCount);
6674
auto result = std::make_unique<TEvBlobStorage::TEvCollectGarbage>(

ydb/core/tx/columnshard/blobs_action/bs/gc.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ class TGCTask: public IBlobsGCAction {
1717
THashSet<TLogoBlobID> KeepList;
1818
THashSet<TLogoBlobID> DontKeepList;
1919
mutable ui32 RequestsCount = 0;
20+
21+
constexpr static ui32 RequestsLimit = 10;
2022
};
2123
using TGCListsByGroup = THashMap<TBlobAddress, TGCLists>;
2224
private:
@@ -26,6 +28,7 @@ class TGCTask: public IBlobsGCAction {
2628
const ui64 CurrentGen;
2729
std::deque<TUnifiedBlobId> KeepsToErase;
2830
std::shared_ptr<TBlobManager> Manager;
31+
size_t Failures = 0;
2932
protected:
3033
virtual void RemoveBlobIdFromDB(const TTabletId tabletId, const TUnifiedBlobId& blobId, TBlobManagerDb& dbBlobs) override;
3134
virtual void DoOnExecuteTxAfterCleaning(NColumnShard::TColumnShard& self, TBlobManagerDb& dbBlobs) override;
@@ -54,6 +57,10 @@ class TGCTask: public IBlobsGCAction {
5457
return ListsByGroupId.empty();
5558
}
5659

60+
bool HasFailures() const {
61+
return Failures != 0;
62+
}
63+
5764
void OnGCResult(TEvBlobStorage::TEvCollectGarbageResult::TPtr ev);
5865

5966
std::unique_ptr<TEvBlobStorage::TEvCollectGarbage> BuildRequest(const TBlobAddress& address) const;

ydb/core/tx/columnshard/blobs_action/bs/gc_actor.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
#include "gc_actor.h"
22
#include <ydb/core/tx/columnshard/columnshard_private_events.h>
3+
#include <ydb/core/tx/columnshard/hooks/abstract/abstract.h>
34

45
namespace NKikimr::NOlap::NBlobOperations::NBlobStorage {
56

67
void TGarbageCollectionActor::Handle(TEvBlobStorage::TEvCollectGarbageResult::TPtr& ev) {
8+
NYDBTest::TControllers::GetColumnShardController()->OnCollectGarbageResult(ev);
79
ACFL_DEBUG("actor", "TEvCollectGarbageResult");
810
if (ev->Get()->Status == NKikimrProto::BLOCKED) {
911
auto g = PassAwayGuard();
@@ -14,15 +16,25 @@ void TGarbageCollectionActor::Handle(TEvBlobStorage::TEvCollectGarbageResult::TP
1416
CheckFinished();
1517
} else {
1618
ACFL_ERROR()("event", "GC_ERROR")("details", ev->Get()->Print(true));
17-
SendToBSProxy(NActors::TActivationContext::AsActorContext(), ev->Cookie, GCTask->BuildRequest(TBlobAddress(ev->Cookie, ev->Get()->Channel)).release(), ev->Cookie);
19+
auto request = GCTask->BuildRequest(TBlobAddress(ev->Cookie, ev->Get()->Channel));
20+
if (request) {
21+
SendToBSProxy(NActors::TActivationContext::AsActorContext(), ev->Cookie, request.release(), ev->Cookie);
22+
} else {
23+
GCTask->OnGCResult(ev);
24+
CheckFinished();
25+
}
1826
}
1927
}
2028

2129
void TGarbageCollectionActor::CheckFinished() {
2230
if (SharedRemovingFinished && GCTask->IsFinished()) {
2331
auto g = PassAwayGuard();
2432
ACFL_DEBUG("actor", "TGarbageCollectionActor")("event", "finished");
25-
TActorContext::AsActorContext().Send(TabletActorId, std::make_unique<NColumnShard::TEvPrivate::TEvGarbageCollectionFinished>(GCTask));
33+
if (GCTask->HasFailures()) {
34+
Send(TabletActorId, new TEvents::TEvPoison);
35+
} else {
36+
TActorContext::AsActorContext().Send(TabletActorId, std::make_unique<NColumnShard::TEvPrivate::TEvGarbageCollectionFinished>(GCTask));
37+
}
2638
}
2739
}
2840

ydb/core/tx/columnshard/blobs_action/bs/gc_actor.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ class TGarbageCollectionActor: public TSharedBlobsCollectionActor<TGarbageCollec
1212
using TBase = TSharedBlobsCollectionActor<TGarbageCollectionActor>;
1313
const NActors::TActorId TabletActorId;
1414
std::shared_ptr<TGCTask> GCTask;
15+
1516
void Handle(TEvBlobStorage::TEvCollectGarbageResult::TPtr& ev);
1617
void CheckFinished();
1718

@@ -41,7 +42,7 @@ class TGarbageCollectionActor: public TSharedBlobsCollectionActor<TGarbageCollec
4142
AFL_DEBUG(NKikimrServices::TX_COLUMNSHARD_BLOBS_BS)("actor", "TGarbageCollectionActor")("event", "starting")("action_id", GCTask->GetActionGuid());
4243
for (auto&& i : GCTask->GetListsByGroupId()) {
4344
auto request = GCTask->BuildRequest(i.first);
44-
AFL_VERIFY(request);
45+
AFL_VERIFY(request); // Cannot fail on the first time
4546
SendToBSProxy(ctx, i.first.GetGroupId(), request.release(), i.first.GetGroupId());
4647
}
4748
TBase::Bootstrap(ctx);

ydb/core/tx/columnshard/hooks/abstract/abstract.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ class ICSController {
9999
}
100100
virtual void DoOnDataSharingStarted(const ui64 /*tabletId*/, const TString& /*sessionId*/) {
101101
}
102+
virtual void DoOnCollectGarbageResult(TEvBlobStorage::TEvCollectGarbageResult::TPtr& /*result*/) {
103+
}
102104

103105
virtual TDuration DoGetUsedSnapshotLivetime(const TDuration defaultValue) const {
104106
return defaultValue;
@@ -282,6 +284,10 @@ class ICSController {
282284
DoOnAfterGCAction(shard, action);
283285
}
284286

287+
void OnCollectGarbageResult(TEvBlobStorage::TEvCollectGarbageResult::TPtr& result) {
288+
DoOnCollectGarbageResult(result);
289+
}
290+
285291
bool OnAfterFilterAssembling(const std::shared_ptr<arrow::RecordBatch>& batch) {
286292
return DoOnAfterFilterAssembling(batch);
287293
}

ydb/core/tx/columnshard/test_helper/controllers.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include <ydb/core/testlib/basics/runtime.h>
33
#include <ydb/core/tx/columnshard/hooks/testing/controller.h>
44
#include <ydb/core/tx/tiering/manager.h>
5+
#include <ydb/core/tx/columnshard/blobs_action/bs/address.h>
56

67
namespace NKikimr::NOlap {
78

@@ -72,4 +73,22 @@ class TWaitCompactionController: public NYDBTest::NColumnShard::TController {
7273
}
7374
};
7475

76+
class TFailingBSController: public NKikimr::NYDBTest::NColumnShard::TController {
77+
void DoOnCollectGarbageResult(TEvBlobStorage::TEvCollectGarbageResult::TPtr& result) override {
78+
NBlobOperations::NBlobStorage::TBlobAddress group(result->Cookie, result->Get()->Channel);
79+
if (!FailingGroup.has_value()) {
80+
FailingGroup = group;
81+
}
82+
if (group == FailingGroup.value() && FailsCount < 15) {
83+
Cerr << "Dropped EvCollectGarbageResult" << Endl;
84+
result->Get()->Status = NKikimrProto::ERROR;
85+
FailsCount++;
86+
}
87+
}
88+
89+
private:
90+
std::optional<NBlobOperations::NBlobStorage::TBlobAddress> FailingGroup = std::nullopt;
91+
size_t FailsCount = 0;
92+
};
93+
7594
}

ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2421,9 +2421,10 @@ Y_UNIT_TEST_SUITE(TColumnShardTestReadWrite) {
24212421
}
24222422
}
24232423

2424+
template<typename Controller>
24242425
void TestCompactionGC() {
24252426
TTestBasicRuntime runtime;
2426-
auto csDefaultControllerGuard = NKikimr::NYDBTest::TControllers::RegisterCSControllerGuard<TDefaultTestsController>();
2427+
auto csDefaultControllerGuard = NKikimr::NYDBTest::TControllers::RegisterCSControllerGuard<Controller>();
24272428
csDefaultControllerGuard->DisableBackground(NKikimr::NYDBTest::ICSController::EBackground::Indexation);
24282429
csDefaultControllerGuard->SetOverridePeriodicWakeupActivationPeriod(TDuration::Seconds(1));
24292430
csDefaultControllerGuard->SetOverrideBlobSplitSettings(NOlap::NSplitter::TSplitSettings());
@@ -2687,7 +2688,11 @@ Y_UNIT_TEST_SUITE(TColumnShardTestReadWrite) {
26872688
}
26882689

26892690
Y_UNIT_TEST(CompactionGC) {
2690-
TestCompactionGC();
2691+
TestCompactionGC<TDefaultTestsController>();
2692+
}
2693+
2694+
Y_UNIT_TEST(CompactionGCFailingBs) {
2695+
TestCompactionGC<NOlap::TFailingBSController>();
26912696
}
26922697

26932698
Y_UNIT_TEST(PortionInfoSize) {

0 commit comments

Comments
 (0)