Skip to content

Commit ad0cdee

Browse files
authored
Bootstrapper: full node list in TBootstrapperInfo, handle possible duplicates (#9848)
1 parent 3b280bb commit ad0cdee

File tree

5 files changed

+97
-59
lines changed

5 files changed

+97
-59
lines changed

ydb/core/driver_lib/run/kikimr_services_initializers.cpp

Lines changed: 22 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1420,33 +1420,28 @@ void TBootstrapperInitializer::InitializeServices(
14201420
TActorSetupCmd(CreateConfiguredTabletBootstrapper(boot), TMailboxType::HTSwap, appData->SystemPoolId)));
14211421
} else {
14221422
const bool standby = boot.HasStandBy() && boot.GetStandBy();
1423-
for (const ui32 bootstrapperNode : boot.GetNode()) {
1424-
if (bootstrapperNode == NodeId) {
1425-
1426-
TIntrusivePtr<TTabletStorageInfo> info(TabletStorageInfoFromProto(boot.GetInfo()));
1427-
1428-
auto tabletType = BootstrapperTypeToTabletType(boot.GetType());
1429-
1430-
auto tabletSetupInfo = CreateTablet(
1431-
TTabletTypes::TypeToStr(tabletType),
1432-
info,
1433-
appData);
1434-
1435-
TIntrusivePtr<TBootstrapperInfo> bi = new TBootstrapperInfo(tabletSetupInfo.Get());
1436-
1437-
if (boot.NodeSize() != 1) {
1438-
bi->OtherNodes.reserve(boot.NodeSize() - 1);
1439-
for (ui32 x : boot.GetNode())
1440-
if (x != NodeId)
1441-
bi->OtherNodes.push_back(x);
1442-
if (boot.HasWatchThreshold())
1443-
bi->WatchThreshold = TDuration::MilliSeconds(boot.GetWatchThreshold());
1444-
if (boot.HasStartFollowers())
1445-
bi->StartFollowers = boot.GetStartFollowers();
1446-
}
1447-
1448-
setup->LocalServices.push_back(std::pair<TActorId, TActorSetupCmd>(MakeBootstrapperID(info->TabletID, bootstrapperNode), TActorSetupCmd(CreateBootstrapper(info.Get(), bi.Get(), standby), TMailboxType::HTSwap, appData->SystemPoolId)));
1449-
}
1423+
if (Find(boot.GetNode(), NodeId) != boot.GetNode().end()) {
1424+
TIntrusivePtr<TTabletStorageInfo> info(TabletStorageInfoFromProto(boot.GetInfo()));
1425+
1426+
auto tabletType = BootstrapperTypeToTabletType(boot.GetType());
1427+
1428+
auto tabletSetupInfo = CreateTablet(
1429+
TTabletTypes::TypeToStr(tabletType),
1430+
info,
1431+
appData);
1432+
1433+
TIntrusivePtr<TBootstrapperInfo> bi = new TBootstrapperInfo(tabletSetupInfo.Get());
1434+
bi->Nodes.reserve(boot.NodeSize());
1435+
for (ui32 x : boot.GetNode())
1436+
bi->Nodes.push_back(x);
1437+
if (boot.HasWatchThreshold())
1438+
bi->WatchThreshold = TDuration::MilliSeconds(boot.GetWatchThreshold());
1439+
if (boot.HasStartFollowers())
1440+
bi->StartFollowers = boot.GetStartFollowers();
1441+
1442+
setup->LocalServices.push_back(std::pair<TActorId, TActorSetupCmd>(
1443+
MakeBootstrapperID(info->TabletID, NodeId),
1444+
TActorSetupCmd(CreateBootstrapper(info.Get(), bi.Get(), standby), TMailboxType::HTSwap, appData->SystemPoolId)));
14501445
}
14511446
}
14521447
}

ydb/core/mind/configured_tablet_bootstrapper.cpp

Lines changed: 6 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -82,16 +82,13 @@ class TConfiguredTabletBootstrapper : public TActorBootstrapped<TConfiguredTable
8282
TIntrusivePtr<TTabletSetupInfo> tabletSetupInfo = MakeTabletSetupInfo(tabletType, appData->UserPoolId, appData->SystemPoolId);
8383

8484
TIntrusivePtr<TBootstrapperInfo> bi = new TBootstrapperInfo(tabletSetupInfo.Get());
85-
if (config.NodeSize() != 1) {
86-
for (ui32 node : config.GetNode()) {
87-
if (node != selfNode)
88-
bi->OtherNodes.emplace_back(node);
89-
}
90-
if (config.HasWatchThreshold())
91-
bi->WatchThreshold = TDuration::MilliSeconds(config.GetWatchThreshold());
92-
if (config.HasStartFollowers())
93-
bi->StartFollowers = config.GetStartFollowers();
85+
for (ui32 node : config.GetNode()) {
86+
bi->Nodes.emplace_back(node);
9487
}
88+
if (config.HasWatchThreshold())
89+
bi->WatchThreshold = TDuration::MilliSeconds(config.GetWatchThreshold());
90+
if (config.HasStartFollowers())
91+
bi->StartFollowers = config.GetStartFollowers();
9592

9693
BootstrapperInstance = Register(CreateBootstrapper(storageInfo.Get(), bi.Get(), false), TMailboxType::HTSwap, appData->SystemPoolId);
9794

ydb/core/tablet/bootstrapper.cpp

Lines changed: 36 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,8 @@ class TBootstrapper : public TActorBootstrapped<TBootstrapper> {
2020
const TIntrusivePtr<TTabletStorageInfo> TabletInfo;
2121
const TIntrusivePtr<TBootstrapperInfo> BootstrapperInfo;
2222
bool ModeStandby;
23+
TVector<ui32> OtherNodes;
24+
THashMap<ui32, size_t> OtherNodesIndex;
2325

2426
TActorId KnownLeaderPipe;
2527

@@ -63,10 +65,12 @@ class TBootstrapper : public TActorBootstrapped<TBootstrapper> {
6365

6466
TVector<TAlien> Aliens;
6567
TVector<TWatcher> Watchers;
68+
size_t Waiting;
6669

6770
explicit TRound(size_t count)
6871
: Aliens(count)
6972
, Watchers(count)
73+
, Waiting(count)
7074
{}
7175
};
7276

@@ -134,10 +138,22 @@ class TBootstrapper : public TActorBootstrapped<TBootstrapper> {
134138
return NKikimrBootstrapper::TEvWatchResult::EState_Name(state).c_str();
135139
}
136140

141+
// Fills OtherNodes and OtherNodesIndex from BootstrapperInfo->Nodes.
// Each node id that differs from this actor's own node id is appended to
// OtherNodes at most once, and OtherNodesIndex maps that id to its position
// in OtherNodes.
void BuildOtherNodes() {
142+
ui32 selfNodeId = SelfId().NodeId();
143+
for (ui32 nodeId : BootstrapperInfo->Nodes) {
144+
// Skip our own node and any id that was already recorded, so repeated
// entries in Nodes do not produce repeated entries in OtherNodes.
if (nodeId != selfNodeId && !OtherNodesIndex.contains(nodeId)) {
145+
// The index of the new entry is the size of OtherNodes before the push.
size_t index = OtherNodes.size();
146+
OtherNodes.push_back(nodeId);
147+
OtherNodesIndex[nodeId] = index;
148+
}
149+
}
150+
}
151+
137152
size_t AlienIndex(ui32 alienNodeId) {
138-
for (size_t i = 0, e = BootstrapperInfo->OtherNodes.size(); i != e; ++i)
139-
if (BootstrapperInfo->OtherNodes[i] == alienNodeId)
140-
return i;
153+
auto it = OtherNodesIndex.find(alienNodeId);
154+
if (it != OtherNodesIndex.end()) {
155+
return it->second;
156+
}
141157
return Max<size_t>();
142158
}
143159

@@ -300,7 +316,7 @@ class TBootstrapper : public TActorBootstrapped<TBootstrapper> {
300316
// Note: make sure notifications from previous states don't interfere
301317
++RoundCounter;
302318

303-
if (BootstrapperInfo->OtherNodes.empty()) {
319+
if (OtherNodes.empty()) {
304320
return Boot();
305321
}
306322

@@ -311,8 +327,8 @@ class TBootstrapper : public TActorBootstrapped<TBootstrapper> {
311327

312328
const ui64 tabletId = TabletInfo->TabletID;
313329

314-
Round.emplace(BootstrapperInfo->OtherNodes.size());
315-
for (ui32 alienNode : BootstrapperInfo->OtherNodes) {
330+
Round.emplace(OtherNodes.size());
331+
for (ui32 alienNode : OtherNodes) {
316332
Send(MakeBootstrapperID(tabletId, alienNode),
317333
new TEvBootstrapper::TEvWatch(tabletId, SelfSeed, RoundCounter),
318334
IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession,
@@ -407,6 +423,10 @@ class TBootstrapper : public TActorBootstrapped<TBootstrapper> {
407423
"tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName()
408424
<< ", apply alien " << alien.NodeId() << " state: " << GetStateName(state));
409425

426+
if (alienEntry.State == TRound::EAlienState::Wait) {
427+
Y_ABORT_UNLESS(Round->Waiting-- > 0);
428+
}
429+
410430
alienEntry.Seed = seed;
411431

412432
switch (state) {
@@ -434,17 +454,22 @@ class TBootstrapper : public TActorBootstrapped<TBootstrapper> {
434454
}
435455

436456
void CheckRoundCompletion() {
457+
auto& round = Round.value();
458+
if (round.Waiting > 0) {
459+
return;
460+
}
461+
437462
ui64 winnerSeed = SelfSeed;
438463
ui32 winner = SelfId().NodeId();
439464

440465
size_t undelivered = 0;
441466
size_t disconnected = 0;
442-
auto& round = Round.value();
443467
for (size_t i = 0, e = round.Aliens.size(); i != e; ++i) {
444468
const auto& alien = round.Aliens[i];
445-
const ui32 node = BootstrapperInfo->OtherNodes.at(i);
469+
const ui32 node = OtherNodes.at(i);
446470
switch (alien.State) {
447471
case TRound::EAlienState::Wait:
472+
Y_DEBUG_ABORT("Unexpected Wait state");
448473
return;
449474
case TRound::EAlienState::Unknown:
450475
break;
@@ -485,7 +510,7 @@ class TBootstrapper : public TActorBootstrapped<TBootstrapper> {
485510

486511
bool CheckBootPermitted(size_t undelivered, size_t disconnected) {
487512
// Total number of nodes that participate in tablet booting
488-
size_t total = 1 + BootstrapperInfo->OtherNodes.size();
513+
size_t total = 1 + OtherNodes.size();
489514
Y_DEBUG_ABORT_UNLESS(total >= 1 + undelivered + disconnected);
490515

491516
// Ignore nodes that don't have bootstrapper running
@@ -785,7 +810,7 @@ class TBootstrapper : public TActorBootstrapped<TBootstrapper> {
785810
}
786811

787812
void PassAway() override {
788-
for (ui32 nodeId : BootstrapperInfo->OtherNodes) {
813+
for (ui32 nodeId : OtherNodes) {
789814
Send(TActivationContext::InterconnectProxy(nodeId), new TEvents::TEvUnsubscribe);
790815
}
791816
NotifyWatchers();
@@ -806,6 +831,7 @@ class TBootstrapper : public TActorBootstrapped<TBootstrapper> {
806831
}
807832

808833
void Bootstrap() {
834+
BuildOtherNodes();
809835
if (ModeStandby) {
810836
Become(&TThis::StateStandBy);
811837
} else {

ydb/core/tablet/bootstrapper.h

Lines changed: 5 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -33,16 +33,13 @@ struct TEvBootstrapper {
3333

3434
struct TBootstrapperInfo : public TThrRefBase {
3535
TIntrusivePtr<TTabletSetupInfo> SetupInfo;
36-
TVector<ui32> OtherNodes;
37-
TDuration WatchThreshold;
38-
TDuration OfflineDelay;
39-
bool StartFollowers;
36+
TVector<ui32> Nodes;
37+
TDuration WatchThreshold = TDuration::MilliSeconds(200);
38+
TDuration OfflineDelay = TDuration::Seconds(3);
39+
bool StartFollowers = false;
4040

41-
TBootstrapperInfo(TTabletSetupInfo* setupInfo)
41+
explicit TBootstrapperInfo(TTabletSetupInfo* setupInfo)
4242
: SetupInfo(setupInfo)
43-
, WatchThreshold(TDuration::MilliSeconds(200))
44-
, OfflineDelay(TDuration::Seconds(3))
45-
, StartFollowers(false)
4643
{}
4744
};
4845

ydb/core/tablet/bootstrapper_ut.cpp

Lines changed: 28 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -95,12 +95,16 @@ Y_UNIT_TEST_SUITE(BootstrapperTest) {
9595
std::vector<TActorId> boots;
9696
auto tabletInfo = CreateSimpleTabletStorageInfo(tabletId);
9797
auto setupInfo = CreateSimpleTabletSetupInfo();
98+
auto bootInfo = MakeIntrusive<TBootstrapperInfo>(setupInfo.Get());
9899
for (ui32 nodeIdx : nodeIdxs) {
99-
auto bootInfo = MakeIntrusive<TBootstrapperInfo>(setupInfo.Get());
100-
for (ui32 otherNodeIdx : nodeIdxs) {
101-
if (otherNodeIdx != nodeIdx) {
102-
bootInfo->OtherNodes.push_back(runtime.GetNodeId(otherNodeIdx));
103-
}
100+
bootInfo->Nodes.push_back(runtime.GetNodeId(nodeIdx));
101+
}
102+
THashMap<ui32, TActorId> started;
103+
for (ui32 nodeIdx : nodeIdxs) {
104+
if (started.contains(nodeIdx)) {
105+
// Start one bootstrapper per node
106+
boots.push_back(started.at(nodeIdx));
107+
continue;
104108
}
105109
boots.push_back(runtime.Register(CreateBootstrapper(tabletInfo.Get(), bootInfo.Get()), nodeIdx));
106110
runtime.EnableScheduleForActor(boots.back());
@@ -109,6 +113,7 @@ Y_UNIT_TEST_SUITE(BootstrapperTest) {
109113
MakeBootstrapperID(tabletId, runtime.GetNodeId(nodeIdx)),
110114
boots.back(),
111115
nodeIdx);
116+
started[nodeIdx] = boots.back();
112117
}
113118
return boots;
114119
}
@@ -423,6 +428,24 @@ Y_UNIT_TEST_SUITE(BootstrapperTest) {
423428
Y_UNUSED(client2);
424429
}
425430

431+
// Starts bootstrappers from a node index list that contains repeated entries
// ({1, 1, 2, 2}) and checks that exactly one TEvTablet::TEvBoot event is
// observed within one simulated second.
Y_UNIT_TEST(DuplicateNodes) {
432+
TTestBasicRuntime runtime(3);
433+
SetupTabletServices(runtime);
434+
runtime.SetLogPriority(NKikimrServices::BOOTSTRAPPER, NActors::NLog::PRI_DEBUG);
435+
436+
// Node indexes 1 and 2 are each listed twice on purpose.
StartSimpleTabletBootstrappers(runtime, {1, 1, 2, 2});
437+
438+
// Count every TEvTablet::TEvBoot event that the observer sees.
size_t boots = 0;
439+
auto observer = runtime.AddObserver<TEvTablet::TEvBoot>([&](auto&) {
440+
++boots;
441+
});
442+
443+
runtime.SimulateSleep(TDuration::Seconds(1));
444+
445+
// Tablet must boot exactly once
446+
UNIT_ASSERT_VALUES_EQUAL(boots, 1u);
447+
}
448+
426449
} // Y_UNIT_TEST_SUITE(BootstrapperTest)
427450

428451
} // namespace NKikimr

0 commit comments

Comments
 (0)