Skip to content

Commit 743e092

Browse files
authored
delete nodes from local db every time we delete them from memory (#10051)
1 parent b8023f6 commit 743e092

File tree

9 files changed: +157 -10 lines changed

ydb/core/mind/hive/hive_impl.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ void THive::RestartPipeTx(ui64 tabletId) {
9696
}
9797

9898
bool THive::TryToDeleteNode(TNodeInfo* node) {
99-
if (node->CanBeDeleted()) {
99+
if (node->CanBeDeleted(TActivationContext::Now())) {
100100
BLOG_I("TryToDeleteNode(" << node->Id << "): deleting");
101101
DeleteNode(node->Id);
102102
return true;
@@ -120,12 +120,15 @@ void THive::Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev) {
120120
void THive::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev) {
121121
if (ev->Get()->TabletId == TabletID()) {
122122
BLOG_TRACE("Handle TEvTabletPipe::TEvServerDisconnected(" << ev->Get()->ClientId << ") " << ev->Get()->ServerId);
123-
TNodeInfo* node = FindNode(ev->Get()->ClientId.NodeId());
123+
auto nodeId = ev->Get()->ClientId.NodeId();
124+
TNodeInfo* node = FindNode(nodeId);
124125
if (node != nullptr) {
125126
Erase(node->PipeServers, ev->Get()->ServerId);
126127
if (node->PipeServers.empty() && node->IsUnknown()) {
127128
ObjectDistributions.RemoveNode(*node);
128-
TryToDeleteNode(node);
129+
if (TryToDeleteNode(node)) {
130+
Execute(CreateDeleteNode(nodeId));
131+
}
129132
}
130133
}
131134
}
@@ -3427,13 +3430,16 @@ void THive::Handle(TEvPrivate::TEvLogTabletMoves::TPtr&) {
34273430
}
34283431

34293432
void THive::Handle(TEvPrivate::TEvDeleteNode::TPtr& ev) {
3430-
auto node = FindNode(ev->Get()->NodeId);
3433+
auto nodeId = ev->Get()->NodeId;
3434+
auto node = FindNode(nodeId);
34313435
if (node == nullptr) {
34323436
return;
34333437
}
34343438
node->DeletionScheduled = false;
34353439
if (!node->IsAlive()) {
3436-
TryToDeleteNode(node);
3440+
if (TryToDeleteNode(node)) {
3441+
Execute(CreateDeleteNode(nodeId));
3442+
}
34373443
}
34383444
}
34393445

ydb/core/mind/hive/hive_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
304304
ITransaction* CreateUpdateTabletsObject(TEvHive::TEvUpdateTabletsObject::TPtr event);
305305
ITransaction* CreateUpdateDomain(TSubDomainKey subdomainKey, TEvHive::TEvUpdateDomain::TPtr event = {});
306306
ITransaction* CreateUpdateDcFollowers(const TDataCenterId& dc);
307+
ITransaction* CreateDeleteNode(TNodeId nodeId);
307308

308309
public:
309310
TDomainsView DomainsView;

ydb/core/mind/hive/hive_ut.cpp

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1069,6 +1069,105 @@ Y_UNIT_TEST_SUITE(THiveTest) {
10691069
UNIT_ASSERT(!isNodeEmpty(nodeId));
10701070
}
10711071

1072+
Y_UNIT_TEST(DrainWithHiveRestart) {
1073+
// 1. Drain a node
1074+
// 2. Kill it & wait for hive to delete it
1075+
// 3. Start the node again
1076+
// 4. Restart hive
1077+
// 5. Ensure node is not down (by creating tablets)
1078+
const int NUM_NODES = 3;
1079+
const int NUM_TABLETS = 10;
1080+
TTestBasicRuntime runtime(NUM_NODES, false);
1081+
Setup(runtime, true, 2, [](TAppPrepare& app) {
1082+
app.HiveConfig.SetNodeDeletePeriod(1);
1083+
});
1084+
const ui64 hiveTablet = MakeDefaultHiveID();
1085+
const ui64 testerTablet = MakeTabletID(false, 1);
1086+
const TActorId hiveActor = CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);
1087+
runtime.EnableScheduleForActor(hiveActor);
1088+
{
1089+
TDispatchOptions options;
1090+
options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES);
1091+
runtime.DispatchEvents(options);
1092+
}
1093+
TTabletTypes::EType tabletType = TTabletTypes::Dummy;
1094+
std::unordered_set<TTabletId> tablets;
1095+
TActorId senderA = runtime.AllocateEdgeActor(0);
1096+
auto createTablets = [&] {
1097+
for (int i = 0; i < NUM_TABLETS; ++i) {
1098+
THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + tablets.size(), tabletType, BINDED_CHANNELS));
1099+
runtime.SendToPipe(hiveTablet, senderA, ev.Release(), 0, GetPipeConfigWithRetries());
1100+
TAutoPtr<IEventHandle> handle;
1101+
auto createTabletReply = runtime.GrabEdgeEventRethrow<TEvHive::TEvCreateTabletReply>(handle);
1102+
ui64 tabletId = createTabletReply->Record.GetTabletID();
1103+
tablets.insert(tabletId);
1104+
}
1105+
NTabletPipe::TClientConfig pipeConfig;
1106+
pipeConfig.RetryPolicy = NTabletPipe::TClientRetryPolicy::WithRetries();
1107+
for (TTabletId tabletId : tablets) {
1108+
MakeSureTabletIsUp(runtime, tabletId, 0, &pipeConfig);
1109+
}
1110+
};
1111+
1112+
createTablets();
1113+
1114+
ui32 nodeId = runtime.GetNodeId(2);
1115+
{
1116+
Ctest << "1. Drain a node\n";
1117+
1118+
runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvDrainNode(nodeId));
1119+
1120+
Ctest << "2. Kill it & wait for hive to delete it\n";
1121+
1122+
SendKillLocal(runtime, 0);
1123+
{
1124+
TDispatchOptions options;
1125+
options.FinalEvents.emplace_back(NHive::TEvPrivate::EvDeleteNode);
1126+
runtime.DispatchEvents(options, TDuration::Seconds(6));
1127+
}
1128+
}
1129+
1130+
auto isNodeEmpty = [&](ui32 nodeId) -> bool {
1131+
bool empty = true;
1132+
TAutoPtr<IEventHandle> handle;
1133+
TActorId whiteboard = NNodeWhiteboard::MakeNodeWhiteboardServiceId(nodeId);
1134+
runtime.Send(new IEventHandle(whiteboard, senderA, new NNodeWhiteboard::TEvWhiteboard::TEvTabletStateRequest()));
1135+
NNodeWhiteboard::TEvWhiteboard::TEvTabletStateResponse* wbResponse = runtime.GrabEdgeEventRethrow<NNodeWhiteboard::TEvWhiteboard::TEvTabletStateResponse>(handle);
1136+
for (const NKikimrWhiteboard::TTabletStateInfo& tabletInfo : wbResponse->Record.GetTabletStateInfo()) {
1137+
if (tablets.contains(tabletInfo.GetTabletId()) && tabletInfo.GetState() != NKikimrWhiteboard::TTabletStateInfo::Dead) {
1138+
Ctest << "Tablet " << tabletInfo.GetTabletId() << "." << tabletInfo.GetFollowerId()
1139+
<< " is not dead yet (" << NKikimrWhiteboard::TTabletStateInfo::ETabletState_Name(tabletInfo.GetState()) << ")" << Endl;
1140+
empty = false;
1141+
}
1142+
}
1143+
return empty;
1144+
};
1145+
1146+
Ctest << "3. Start the node again\n";
1147+
CreateLocal(runtime, 0);
1148+
1149+
{
1150+
TDispatchOptions options;
1151+
options.FinalEvents.emplace_back(TEvLocal::EvStatus);
1152+
runtime.DispatchEvents(options);
1153+
}
1154+
1155+
Ctest << "4. Restart hive\n";
1156+
1157+
runtime.Register(CreateTabletKiller(hiveTablet));
1158+
{
1159+
TDispatchOptions options;
1160+
options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES);
1161+
runtime.DispatchEvents(options);
1162+
}
1163+
1164+
Ctest << "5. Ensure node is not down (by creating tablets)\n";
1165+
1166+
createTablets();
1167+
1168+
UNIT_ASSERT(!isNodeEmpty(nodeId));
1169+
}
1170+
10721171
Y_UNIT_TEST(TestCreateSubHiveCreateTablet) {
10731172
TTestBasicRuntime runtime(1, false);
10741173
Setup(runtime, true);

ydb/core/mind/hive/node_info.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -453,15 +453,15 @@ TResourceRawValues TNodeInfo::GetStDevResourceValues() {
453453
return GetStDev(values);
454454
}
455455

456-
bool TNodeInfo::CanBeDeleted() const {
456+
bool TNodeInfo::CanBeDeleted(TInstant now) const {
457457
TInstant lastAlive(TInstant::MilliSeconds(Statistics.GetLastAliveTimestamp()));
458458
if (lastAlive) {
459459
return (IsDisconnected() || IsUnknown())
460460
&& !Local
461461
&& GetTabletsTotal() == 0
462462
&& LockedTablets.empty()
463463
&& !Freeze
464-
&& (lastAlive + Hive.GetNodeDeletePeriod() < TInstant::Now());
464+
&& (lastAlive + Hive.GetNodeDeletePeriod() < now);
465465
} else {
466466
return (IsDisconnected() || IsUnknown()) && !Local && GetTabletsTotal() == 0 && LockedTablets.empty() && !Freeze;
467467
}

ydb/core/mind/hive/node_info.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ struct TNodeInfo {
232232
}
233233
}
234234

235-
bool CanBeDeleted() const;
235+
bool CanBeDeleted(TInstant now) const;
236236
void RegisterInDomains();
237237
void DeregisterInDomains();
238238
void Ping();
ydb/core/mind/hive/tx__delete_node.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#include "hive_impl.h"
2+
#include "hive_log.h"
3+
4+
namespace NKikimr {
5+
namespace NHive {
6+
7+
class TTxDeleteNode : public TTransactionBase<THive> {
8+
protected:
9+
TNodeId NodeId;
10+
public:
11+
TTxDeleteNode(TNodeId nodeId, THive *hive)
12+
: TBase(hive)
13+
, NodeId(nodeId)
14+
{}
15+
16+
bool Execute(TTransactionContext &txc, const TActorContext&) override {
17+
NIceDb::TNiceDb db(txc.DB);
18+
db.Table<Schema::Node>().Key(NodeId).Delete();
19+
auto restrictionsRowset = db.Table<Schema::TabletAvailabilityRestrictions>().Range(NodeId).Select();
20+
while (!restrictionsRowset.EndOfSet()) {
21+
db.Table<Schema::TabletAvailabilityRestrictions>().Key(restrictionsRowset.GetKey()).Delete();
22+
if (!restrictionsRowset.Next()) {
23+
return false;
24+
}
25+
}
26+
return true;
27+
}
28+
29+
void Complete(const TActorContext&) override {
30+
}
31+
};
32+
33+
ITransaction* THive::CreateDeleteNode(TNodeId nodeId) {
34+
return new TTxDeleteNode(nodeId, this);
35+
}
36+
37+
} // NHive
38+
} // NKikimr

ydb/core/mind/hive/tx__load_everything.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -750,8 +750,9 @@ class TTxLoadEverything : public TTransactionBase<THive> {
750750

751751
size_t numDeletedNodes = 0;
752752
size_t numDeletedRestrictions = 0;
753+
TInstant now = TActivationContext::Now();
753754
for (auto itNode = Self->Nodes.begin(); itNode != Self->Nodes.end();) {
754-
if (itNode->second.CanBeDeleted()) {
755+
if (itNode->second.CanBeDeleted(now)) {
755756
++numDeletedNodes;
756757
auto restrictionsRowset = db.Table<Schema::TabletAvailabilityRestrictions>().Range(itNode->first).Select();
757758
while (!restrictionsRowset.EndOfSet()) {

ydb/core/mind/hive/tx__register_node.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class TTxRegisterNode : public TTransactionBase<THive> {
2323
TNodeId nodeId = Local.NodeId();
2424
TNodeInfo& node = Self->GetNode(nodeId);
2525
if (node.Local != Local) {
26-
TInstant now = TInstant::Now();
26+
TInstant now = TActivationContext::Now();
2727
node.Statistics.AddRestartTimestamp(now.MilliSeconds());
2828
node.ActualizeNodeStatistics(now);
2929
for (const auto& t : node.Tablets) {
@@ -57,6 +57,7 @@ class TTxRegisterNode : public TTransactionBase<THive> {
5757
db.Table<Schema::Node>().Key(nodeId).Update<Schema::Node::Down, Schema::Node::Freeze>(false, false);
5858
}
5959
if (node.BecomeUpOnRestart) {
60+
BLOG_TRACE("THive::TTxRegisterNode(" << Local.NodeId() << ")::Execute - node became up on restart");
6061
node.SetDown(false);
6162
node.BecomeUpOnRestart = false;
6263
db.Table<Schema::Node>().Key(nodeId).Update<Schema::Node::Down, Schema::Node::BecomeUpOnRestart>(false, false);

ydb/core/mind/hive/ya.make

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ SRCS(
4848
tx__configure_subdomain.cpp
4949
tx__create_tablet.cpp
5050
tx__cut_tablet_history.cpp
51+
tx__delete_node.cpp
5152
tx__delete_tablet.cpp
5253
tx__delete_tablet_result.cpp
5354
tx__disconnect_node.cpp

0 commit comments

Comments
 (0)