Skip to content

Commit 4500386

Browse files
committed
follower compatibility issues (#15974)
1 parent 4b8e06e commit 4500386

File tree

4 files changed

+136
-2
lines changed

4 files changed

+136
-2
lines changed

ydb/core/mind/hive/hive_impl.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3085,6 +3085,7 @@ STFUNC(THive::StateInit) {
30853085
hFunc(TEvInterconnect::TEvNodesInfo, Handle);
30863086
hFunc(TEvPrivate::TEvProcessBootQueue, HandleInit);
30873087
hFunc(TEvPrivate::TEvProcessTabletBalancer, HandleInit);
3088+
hFunc(TEvPrivate::TEvUpdateDataCenterFollowers, HandleInit);
30883089
// We subscribe to config updates before hive is fully loaded
30893090
hFunc(TEvPrivate::TEvProcessIncomingEvent, Handle);
30903091
fFunc(NConsole::TEvConsole::TEvConfigNotificationRequest::EventType, EnqueueIncomingEvent);
@@ -3513,6 +3514,11 @@ void THive::Handle(TEvPrivate::TEvUpdateDataCenterFollowers::TPtr& ev) {
35133514
}
35143515
}
35153516

3517+
// Handler used for TEvUpdateDataCenterFollowers while the hive is in StateInit
// (StateInit routes this event type to HandleInit).
// The event is not processed here: a warning is logged and the same event is
// scheduled again one second later.
// NOTE(review): presumably this defers the update until the hive has left
// StateInit and the regular Handle() overload receives it - confirm.
void THive::HandleInit(TEvPrivate::TEvUpdateDataCenterFollowers::TPtr& ev) {
3518+
BLOG_W("Received TEvUpdateDataCenterFollowers while in StateInit");
3519+
// ev->Release().Release() hands the event object itself to Schedule();
// presumably this detaches it from the incoming handle so it can be re-delivered - confirm.
Schedule(TDuration::Seconds(1), ev->Release().Release());
3520+
}
3521+
35163522
// Handler for TEvUpdateFollowers: passes the object returned by
// CreateProcessUpdateFollowers() to Execute(). The event itself is not
// inspected (the parameter is unnamed).
void THive::Handle(TEvPrivate::TEvUpdateFollowers::TPtr&) {
35173523
Execute(CreateProcessUpdateFollowers());
35183524
}

ydb/core/mind/hive/hive_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
591591
void Handle(TEvHive::TEvUpdateDomain::TPtr& ev);
592592
void Handle(TEvPrivate::TEvDeleteNode::TPtr& ev);
593593
void Handle(TEvHive::TEvRequestTabletDistribution::TPtr& ev);
594+
void HandleInit(TEvPrivate::TEvUpdateDataCenterFollowers::TPtr& ev);
594595
void Handle(TEvPrivate::TEvUpdateDataCenterFollowers::TPtr& ev);
595596
void Handle(TEvHive::TEvRequestScaleRecommendation::TPtr& ev);
596597
void Handle(TEvPrivate::TEvGenerateTestData::TPtr& ev);

ydb/core/mind/hive/hive_ut.cpp

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <ydb/core/protos/counters_hive.pb.h>
1313
#include <ydb/core/protos/follower_group.pb.h>
1414
#include <ydb/core/protos/schemeshard/operations.pb.h>
15+
#include <ydb/core/protos/tx_proxy.pb.h>
1516
#include <ydb/core/mind/bscontroller/bsc.h>
1617
#include <ydb/core/mind/tenant_pool.h>
1718
#include <ydb/core/tablet_flat/tablet_flat_executed.h>
@@ -5876,6 +5877,119 @@ Y_UNIT_TEST_SUITE(THiveTest) {
58765877
}
58775878
}
58785879

5880+
// Scenario: a tablet is created with a follower group that has
// RequireAllDataCenters = true, FollowerCountPerDataCenter = false and
// FollowerCount = NUM_NODES. All locals and the hive are then restarted, and the
// test checks that a follower of the tablet is reachable on every node.
Y_UNIT_TEST(TestFollowerCompatability1) {
5881+
static constexpr ui32 NUM_NODES = 3;
5882+
TTestBasicRuntime runtime(NUM_NODES, NUM_NODES); // num nodes = num dcs
5883+
Setup(runtime, true);
5884+
// NOTE(review): tabletIds is never used in this test.
TVector<ui64> tabletIds;
5885+
const ui64 hiveTablet = MakeDefaultHiveID();
5886+
const ui64 testerTablet = MakeTabletID(false, 1);
5887+
CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive, 0);
5888+
{
5889+
// Wait for one EvSyncTablets per node before creating the tablet.
TDispatchOptions options;
5890+
options.FinalEvents.emplace_back(TEvLocal::EvSyncTablets, runtime.GetNodeCount());
5891+
runtime.DispatchEvents(options);
5892+
}
5893+
TTabletTypes::EType tabletType = TTabletTypes::Dummy;
5894+
// RequireAllDataCenters = true, FollowerCountPerDataCenter = false
5895+
// This configuration is nonsensical, and followers are never created like that nowadays.
5896+
// Yet, there might be some followers like that remaining from the old pre-follower-groups days
5897+
THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500, tabletType, BINDED_CHANNELS));
5898+
ev->Record.SetObjectId(1);
5899+
auto* followerGroup = ev->Record.AddFollowerGroups();
5900+
followerGroup->SetFollowerCount(NUM_NODES);
5901+
followerGroup->SetFollowerCountPerDataCenter(false);
5902+
followerGroup->SetRequireAllDataCenters(true);
5903+
ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
5904+
// restart everything
5905+
for (ui32 i = 0; i < NUM_NODES; ++i) {
5906+
SendKillLocal(runtime, i);
5907+
}
5908+
runtime.Register(CreateTabletKiller(hiveTablet));
5909+
for (ui32 i = 0; i < NUM_NODES; ++i) {
5910+
CreateLocal(runtime, i);
5911+
}
5912+
{
5913+
// Wait until every re-created local has synced with the hive again.
TDispatchOptions options;
5914+
options.FinalEvents.emplace_back(TEvLocal::EvSyncTablets, runtime.GetNodeCount());
5915+
runtime.DispatchEvents(options);
5916+
}
5917+
{
5918+
// NOTE(review): the literal 3 presumably equals NUM_NODES (one EvTabletStatus
// per follower) - confirm. The wait is bounded by a one-second timeout.
TDispatchOptions options;
5919+
options.FinalEvents.emplace_back(TEvLocal::EvTabletStatus, 3);
5920+
runtime.DispatchEvents(options, TDuration::Seconds(1));
5921+
}
5922+
// test every node has a follower running
5923+
// The pipe config is set to force a local, follower-only connection.
NTabletPipe::TClientConfig pipeConfig;
5924+
pipeConfig.ForceLocal = true;
5925+
pipeConfig.AllowFollower = true;
5926+
pipeConfig.ForceFollower = true;
5927+
for (ui32 node = 0; node < NUM_NODES; ++node) {
5928+
MakeSureTabletIsUp(runtime, tabletId, node, &pipeConfig);
5929+
}
5930+
}
5931+
5932+
// Scenario: node 0 does not run a local, and a tablet is created with one
// follower per data center (RequireAllDataCenters = true). The DataCenter column
// of followers 1 and 2 is then updated through a local MKQL program to imitate
// followers created by an older version, and the hive is restarted. The test
// checks that the hive still reports exactly two followers, with ids not above 2.
Y_UNIT_TEST(TestFollowerCompatability2) {
5933+
static constexpr ui32 NUM_NODES = 3;
5934+
TTestBasicRuntime runtime(NUM_NODES, NUM_NODES); // num nodes = num dcs
5935+
Setup(runtime, true);
5936+
// NOTE(review): tabletIds is never used in this test.
TVector<ui64> tabletIds;
5937+
const ui64 hiveTablet = MakeDefaultHiveID();
5938+
const ui64 testerTablet = MakeTabletID(false, 1);
5939+
const TActorId senderA = runtime.AllocateEdgeActor(0);
5940+
SendKillLocal(runtime, 0); // node 0 exists but does not run local - to simulate a case where a db does not have nodes in every dc
5941+
CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive, 0);
5942+
{
5943+
// NOTE(review): the literal 2 presumably is NUM_NODES - 1, i.e. the nodes
// that still run a local after node 0's local was killed - confirm.
TDispatchOptions options;
5944+
options.FinalEvents.emplace_back(TEvLocal::EvSyncTablets, 2);
5945+
runtime.DispatchEvents(options);
5946+
}
5947+
TTabletTypes::EType tabletType = TTabletTypes::Dummy;
5948+
THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500, tabletType, BINDED_CHANNELS));
5949+
ev->Record.SetObjectId(1);
5950+
auto* followerGroup = ev->Record.AddFollowerGroups();
5951+
followerGroup->SetFollowerCount(1);
5952+
followerGroup->SetFollowerCountPerDataCenter(true);
5953+
followerGroup->SetRequireAllDataCenters(true);
5954+
ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
5955+
5956+
// drop dc column from followers - to imitate that they were created on an older version
5957+
// The program below emits one UpdateRow per follower id (1 and 2) on the
// TabletFollowerTablet table.
TStringBuilder program;
5958+
program << "((let result (AsList ";
5959+
for (unsigned i = 1; i < 3; ++i) {
5960+
program << "(UpdateRow 'TabletFollowerTablet '('('TabletID (Uint64 '" << tabletId <<")) '('FollowerID (Uint64 '" << i << "))) '('('DataCenter)))";
5961+
}
5962+
program << ")) (return result))";
5963+
auto mkql = std::make_unique<TEvTablet::TEvLocalMKQL>();
5964+
mkql->Record.MutableProgram()->MutableProgram()->SetText(program);
5965+
runtime.SendToPipe(hiveTablet, senderA, mkql.release());
5966+
{
5967+
// Wait for the MKQL response; its contents are not inspected here.
TAutoPtr<IEventHandle> handle;
5968+
runtime.GrabEdgeEvent<TEvTablet::TEvLocalMKQLResponse>(handle);
5969+
}
5970+
5971+
// Kill the hive tablet, then dispatch events for up to 50 ms.
// NOTE(review): presumably the hive reloads its state during this window;
// the fixed delay may be timing-sensitive - confirm.
runtime.Register(CreateTabletKiller(hiveTablet));
5972+
runtime.DispatchEvents({}, TDuration::MilliSeconds(50));
5973+
5974+
// There should be exactly 2 followers, with ids 1 and 2
5975+
// (that is, there should not be a follower created for the dc that node 0 is in)
5976+
5977+
THolder<TEvHive::TEvRequestHiveInfo> request = MakeHolder<TEvHive::TEvRequestHiveInfo>();
5978+
request->Record.SetReturnFollowers(true);
5979+
runtime.SendToPipe(hiveTablet, senderA, request.Release());
5980+
TAutoPtr<IEventHandle> handle;
5981+
TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
5982+
// Count the entries with a non-zero follower id and check each id is at most 2.
unsigned followers = 0;
5983+
for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) {
5984+
auto followerId = tablet.GetFollowerID();
5985+
if (followerId > 0) {
5986+
UNIT_ASSERT_LE(followerId, 2);
5987+
++followers;
5988+
}
5989+
}
5990+
UNIT_ASSERT_VALUES_EQUAL(followers, 2);
5991+
}
5992+
58795993
Y_UNIT_TEST(TestCreateExternalTablet) {
58805994
TTestBasicRuntime runtime(1, false);
58815995
Setup(runtime, true);

ydb/core/mind/hive/tx__load_everything.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,11 @@ class TTxLoadEverything : public TTransactionBase<THive> {
605605
followerGroup.LocalNodeOnly = tabletFollowerGroupRowset.GetValueOrDefault<Schema::TabletFollowerGroup::LocalNodeOnly>();
606606
followerGroup.FollowerCountPerDataCenter = tabletFollowerGroupRowset.GetValueOrDefault<Schema::TabletFollowerGroup::FollowerCountPerDataCenter>();
607607
followerGroup.RequireDifferentNodes = tabletFollowerGroupRowset.GetValueOrDefault<Schema::TabletFollowerGroup::RequireDifferentNodes>();
608+
609+
if (followerGroup.RequireAllDataCenters && !followerGroup.FollowerCountPerDataCenter) {
610+
followerGroup.FollowerCountPerDataCenter = true;
611+
followerGroup.SetFollowerCount((followerGroup.GetRawFollowerCount() - 1) / Self->DataCenters.size() + 1);
612+
}
608613
} else {
609614
++numMissingTablets;
610615
}
@@ -665,8 +670,10 @@ class TTxLoadEverything : public TTransactionBase<THive> {
665670
continue;
666671
}
667672
std::map<TDataCenterId, i32> dataCentersToCover; // dc -> need x more followers in dc
668-
for (const auto& [dc, _] : Self->DataCenters) {
669-
dataCentersToCover[dc] = group.GetFollowerCountForDataCenter(dc);
673+
for (const auto& [dcId, dcInfo] : Self->DataCenters) {
674+
if (dcInfo.IsRegistered()) {
675+
dataCentersToCover[dcId] = group.GetFollowerCountForDataCenter(dcId);
676+
}
670677
}
671678
auto groupId = group.Id;
672679
auto filterGroup = [groupId](auto&& follower) { return follower->FollowerGroup.Id == groupId;};
@@ -876,6 +883,12 @@ class TTxLoadEverything : public TTransactionBase<THive> {
876883
Self->ProcessFollowerUpdatesScheduled = true;
877884
}
878885

886+
for (const auto& [dcId, dcInfo] : Self->DataCenters) {
887+
if (!dcInfo.IsRegistered()) {
888+
Self->Schedule(TDuration::Seconds(1), new TEvPrivate::TEvUpdateDataCenterFollowers(dcId));
889+
}
890+
}
891+
879892
Self->ProcessPendingStopTablet();
880893
Self->ProcessPendingResumeTablet();
881894

0 commit comments

Comments
 (0)