Skip to content

Commit f216f16

Browse files
authored
25-1: follower compatability issues (#15974) (#18377)
2 parents f161e2b + 4500386 commit f216f16

File tree

4 files changed

+136
-2
lines changed

4 files changed

+136
-2
lines changed

ydb/core/mind/hive/hive_impl.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3089,6 +3089,7 @@ STFUNC(THive::StateInit) {
30893089
hFunc(TEvInterconnect::TEvNodesInfo, Handle);
30903090
hFunc(TEvPrivate::TEvProcessBootQueue, HandleInit);
30913091
hFunc(TEvPrivate::TEvProcessTabletBalancer, HandleInit);
3092+
hFunc(TEvPrivate::TEvUpdateDataCenterFollowers, HandleInit);
30923093
// We subscribe to config updates before hive is fully loaded
30933094
hFunc(TEvPrivate::TEvProcessIncomingEvent, Handle);
30943095
fFunc(NConsole::TEvConsole::TEvConfigNotificationRequest::EventType, EnqueueIncomingEvent);
@@ -3517,6 +3518,11 @@ void THive::Handle(TEvPrivate::TEvUpdateDataCenterFollowers::TPtr& ev) {
35173518
}
35183519
}
35193520

3521+
// Handler for TEvUpdateDataCenterFollowers that is wired into StateInit.
// It does not act on the event's contents: it logs a warning and schedules the
// same event object for delivery again after one second.
void THive::HandleInit(TEvPrivate::TEvUpdateDataCenterFollowers::TPtr& ev) {
3522+
BLOG_W("Received TEvUpdateDataCenterFollowers while in StateInit");
3523+
// Release().Release() presumably detaches the event from its handle so that the
// same object (not a copy) can be handed to Schedule() - TODO confirm ownership rules.
Schedule(TDuration::Seconds(1), ev->Release().Release());
3524+
}
3525+
35203526
// Handler for TEvUpdateFollowers. The event itself is not inspected (the parameter
// is unnamed); the handler only passes the result of CreateProcessUpdateFollowers()
// to Execute().
void THive::Handle(TEvPrivate::TEvUpdateFollowers::TPtr&) {
35213527
Execute(CreateProcessUpdateFollowers());
35223528
}

ydb/core/mind/hive/hive_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
591591
void Handle(TEvHive::TEvUpdateDomain::TPtr& ev);
592592
void Handle(TEvPrivate::TEvDeleteNode::TPtr& ev);
593593
void Handle(TEvHive::TEvRequestTabletDistribution::TPtr& ev);
594+
void HandleInit(TEvPrivate::TEvUpdateDataCenterFollowers::TPtr& ev);
594595
void Handle(TEvPrivate::TEvUpdateDataCenterFollowers::TPtr& ev);
595596
void Handle(TEvHive::TEvRequestScaleRecommendation::TPtr& ev);
596597
void Handle(TEvPrivate::TEvGenerateTestData::TPtr& ev);

ydb/core/mind/hive/hive_ut.cpp

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <ydb/core/protos/counters_hive.pb.h>
1313
#include <ydb/core/protos/follower_group.pb.h>
1414
#include <ydb/core/protos/schemeshard/operations.pb.h>
15+
#include <ydb/core/protos/tx_proxy.pb.h>
1516
#include <ydb/core/mind/bscontroller/bsc.h>
1617
#include <ydb/core/mind/tenant_pool.h>
1718
#include <ydb/core/tablet_flat/tablet_flat_executed.h>
@@ -5919,6 +5920,119 @@ Y_UNIT_TEST_SUITE(THiveTest) {
59195920
}
59205921
}
59215922

5923+
// Scenario: a tablet is created with a follower group that has
// RequireAllDataCenters = true and FollowerCountPerDataCenter = false on a runtime
// with NUM_NODES nodes (and as many data centers). All locals are then killed and
// re-created and a tablet killer is registered for Hive. Finally the test verifies
// that a follower-only, local-only pipe to the tablet can be opened on every node.
Y_UNIT_TEST(TestFollowerCompatability1) {
5924+
static constexpr ui32 NUM_NODES = 3;
5925+
TTestBasicRuntime runtime(NUM_NODES, NUM_NODES); // num nodes = num dcs
5926+
Setup(runtime, true);
5927+
// NOTE(review): tabletIds is declared but never used in this test.
TVector<ui64> tabletIds;
5928+
const ui64 hiveTablet = MakeDefaultHiveID();
5929+
const ui64 testerTablet = MakeTabletID(false, 1);
5930+
CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive, 0);
5931+
{
5932+
// Wait until one EvSyncTablets per node has been observed.
TDispatchOptions options;
5933+
options.FinalEvents.emplace_back(TEvLocal::EvSyncTablets, runtime.GetNodeCount());
5934+
runtime.DispatchEvents(options);
5935+
}
5936+
TTabletTypes::EType tabletType = TTabletTypes::Dummy;
5937+
// RequireAllDataCenters = true, FollowerCountPerDataCenter = false
5938+
// This configuration is nonsensical, and followers are never created like that
5939+
// Yet, there might be some followers like that remaining from the olden pre-follower-groups days
5940+
THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500, tabletType, BINDED_CHANNELS));
5941+
ev->Record.SetObjectId(1);
5942+
auto* followerGroup = ev->Record.AddFollowerGroups();
5943+
followerGroup->SetFollowerCount(NUM_NODES);
5944+
followerGroup->SetFollowerCountPerDataCenter(false);
5945+
followerGroup->SetRequireAllDataCenters(true);
5946+
ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
5947+
// restart everything
5948+
for (ui32 i = 0; i < NUM_NODES; ++i) {
5949+
SendKillLocal(runtime, i);
5950+
}
5951+
runtime.Register(CreateTabletKiller(hiveTablet));
5952+
for (ui32 i = 0; i < NUM_NODES; ++i) {
5953+
CreateLocal(runtime, i);
5954+
}
5955+
{
5956+
// Wait again for one EvSyncTablets per node after the restart.
TDispatchOptions options;
5957+
options.FinalEvents.emplace_back(TEvLocal::EvSyncTablets, runtime.GetNodeCount());
5958+
runtime.DispatchEvents(options);
5959+
}
5960+
{
5961+
// Wait for three EvTabletStatus events, giving up after one second.
// NOTE(review): the literal 3 presumably tracks NUM_NODES - confirm.
TDispatchOptions options;
5962+
options.FinalEvents.emplace_back(TEvLocal::EvTabletStatus, 3);
5963+
runtime.DispatchEvents(options, TDuration::Seconds(1));
5964+
}
5965+
// test every node has a follower running
5966+
NTabletPipe::TClientConfig pipeConfig;
5967+
pipeConfig.ForceLocal = true;
5968+
pipeConfig.AllowFollower = true;
5969+
pipeConfig.ForceFollower = true;
5970+
for (ui32 node = 0; node < NUM_NODES; ++node) {
5971+
MakeSureTabletIsUp(runtime, tabletId, node, &pipeConfig);
5972+
}
5973+
}
5974+
5975+
// Scenario: on a 3-node runtime the local on node 0 is killed before Hive starts,
// then a tablet is created with one follower per data center and
// RequireAllDataCenters = true. The DataCenter column of follower rows 1 and 2 is
// cleared through a local MiniKQL program, a tablet killer is registered for Hive,
// and the test then asks Hive for its tablet list and expects exactly two entries
// with a non-zero follower id, each of them <= 2.
Y_UNIT_TEST(TestFollowerCompatability2) {
5976+
static constexpr ui32 NUM_NODES = 3;
5977+
TTestBasicRuntime runtime(NUM_NODES, NUM_NODES); // num nodes = num dcs
5978+
Setup(runtime, true);
5979+
// NOTE(review): tabletIds is declared but never used in this test.
TVector<ui64> tabletIds;
5980+
const ui64 hiveTablet = MakeDefaultHiveID();
5981+
const ui64 testerTablet = MakeTabletID(false, 1);
5982+
const TActorId senderA = runtime.AllocateEdgeActor(0);
5983+
SendKillLocal(runtime, 0); // node 0 exists but does not run local - to simulate a case where a db does not have nodes in every dc
5984+
CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive, 0);
5985+
{
5986+
// Only two EvSyncTablets are awaited here, since the local on node 0 was killed above.
TDispatchOptions options;
5987+
options.FinalEvents.emplace_back(TEvLocal::EvSyncTablets, 2);
5988+
runtime.DispatchEvents(options);
5989+
}
5990+
TTabletTypes::EType tabletType = TTabletTypes::Dummy;
5991+
THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500, tabletType, BINDED_CHANNELS));
5992+
ev->Record.SetObjectId(1);
5993+
auto* followerGroup = ev->Record.AddFollowerGroups();
5994+
followerGroup->SetFollowerCount(1);
5995+
followerGroup->SetFollowerCountPerDataCenter(true);
5996+
followerGroup->SetRequireAllDataCenters(true);
5997+
ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
5998+
5999+
// drop dc column from followers - to imitate that they were created on an older version
6000+
TStringBuilder program;
6001+
program << "((let result (AsList ";
6002+
// One UpdateRow per follower id 1 and 2.
// NOTE(review): the bounds are hard-coded; presumably they correspond to the two
// nodes that still run a local - confirm.
for (unsigned i = 1; i < 3; ++i) {
6003+
program << "(UpdateRow 'TabletFollowerTablet '('('TabletID (Uint64 '" << tabletId <<")) '('FollowerID (Uint64 '" << i << "))) '('('DataCenter)))";
6004+
}
6005+
program << ")) (return result))";
6006+
auto mkql = std::make_unique<TEvTablet::TEvLocalMKQL>();
6007+
mkql->Record.MutableProgram()->MutableProgram()->SetText(program);
6008+
runtime.SendToPipe(hiveTablet, senderA, mkql.release());
6009+
{
6010+
// Wait for the MiniKQL response; its contents are not inspected.
TAutoPtr<IEventHandle> handle;
6011+
runtime.GrabEdgeEvent<TEvTablet::TEvLocalMKQLResponse>(handle);
6012+
}
6013+
6014+
runtime.Register(CreateTabletKiller(hiveTablet));
6015+
runtime.DispatchEvents({}, TDuration::MilliSeconds(50));
6016+
6017+
// There should be exactly 2 followers, with ids 1 and 2
6018+
// (that is, there should not be a follower created for the dc that node 0 is in)
6019+
6020+
THolder<TEvHive::TEvRequestHiveInfo> request = MakeHolder<TEvHive::TEvRequestHiveInfo>();
6021+
request->Record.SetReturnFollowers(true);
6022+
runtime.SendToPipe(hiveTablet, senderA, request.Release());
6023+
TAutoPtr<IEventHandle> handle;
6024+
TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
6025+
// Count the entries in the response that have a non-zero follower id.
unsigned followers = 0;
6026+
for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) {
6027+
auto followerId = tablet.GetFollowerID();
6028+
if (followerId > 0) {
6029+
UNIT_ASSERT_LE(followerId, 2);
6030+
++followers;
6031+
}
6032+
}
6033+
UNIT_ASSERT_VALUES_EQUAL(followers, 2);
6034+
}
6035+
59226036
Y_UNIT_TEST(TestCreateExternalTablet) {
59236037
TTestBasicRuntime runtime(1, false);
59246038
Setup(runtime, true);

ydb/core/mind/hive/tx__load_everything.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,11 @@ class TTxLoadEverything : public TTransactionBase<THive> {
605605
followerGroup.LocalNodeOnly = tabletFollowerGroupRowset.GetValueOrDefault<Schema::TabletFollowerGroup::LocalNodeOnly>();
606606
followerGroup.FollowerCountPerDataCenter = tabletFollowerGroupRowset.GetValueOrDefault<Schema::TabletFollowerGroup::FollowerCountPerDataCenter>();
607607
followerGroup.RequireDifferentNodes = tabletFollowerGroupRowset.GetValueOrDefault<Schema::TabletFollowerGroup::RequireDifferentNodes>();
608+
609+
if (followerGroup.RequireAllDataCenters && !followerGroup.FollowerCountPerDataCenter) {
610+
followerGroup.FollowerCountPerDataCenter = true;
611+
followerGroup.SetFollowerCount((followerGroup.GetRawFollowerCount() - 1) / Self->DataCenters.size() + 1);
612+
}
608613
} else {
609614
++numMissingTablets;
610615
}
@@ -665,8 +670,10 @@ class TTxLoadEverything : public TTransactionBase<THive> {
665670
continue;
666671
}
667672
std::map<TDataCenterId, i32> dataCentersToCover; // dc -> need x more followers in dc
668-
for (const auto& [dc, _] : Self->DataCenters) {
669-
dataCentersToCover[dc] = group.GetFollowerCountForDataCenter(dc);
673+
for (const auto& [dcId, dcInfo] : Self->DataCenters) {
674+
if (dcInfo.IsRegistered()) {
675+
dataCentersToCover[dcId] = group.GetFollowerCountForDataCenter(dcId);
676+
}
670677
}
671678
auto groupId = group.Id;
672679
auto filterGroup = [groupId](auto&& follower) { return follower->FollowerGroup.Id == groupId;};
@@ -876,6 +883,12 @@ class TTxLoadEverything : public TTransactionBase<THive> {
876883
Self->ProcessFollowerUpdatesScheduled = true;
877884
}
878885

886+
for (const auto& [dcId, dcInfo] : Self->DataCenters) {
887+
if (!dcInfo.IsRegistered()) {
888+
Self->Schedule(TDuration::Seconds(1), new TEvPrivate::TEvUpdateDataCenterFollowers(dcId));
889+
}
890+
}
891+
879892
Self->ProcessPendingStopTablet();
880893
Self->ProcessPendingResumeTablet();
881894

0 commit comments

Comments
 (0)