Skip to content

Commit c48bfb6

Browse files
authored
maybe skip a balancer if it did nothing last time (#14868)
1 parent 3a1c845 commit c48bfb6

File tree

2 files changed

+128
-39
lines changed

2 files changed

+128
-39
lines changed

ydb/core/mind/hive/hive_impl.cpp

Lines changed: 37 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2353,6 +2353,17 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) {
23532353
nodeUsageHistogram.IncrementFor(record.Usage * 100);
23542354
}
23552355

2356+
std::optional<TBalancerSettings> settings;
2357+
const auto maybeStartBalancer = [&settings, this]() -> bool {
2358+
Y_DEBUG_ABORT_UNLESS(settings);
2359+
if (LastBalancerTrigger == settings->Type
2360+
&& BalancerStats[static_cast<size_t>(settings->Type)].LastRunMovements == 0) {
2361+
return false;
2362+
}
2363+
StartHiveBalancer(std::move(*settings));
2364+
return true;
2365+
};
2366+
23562367
double minUsageToKick = GetMaxNodeUsageToKick() - GetNodeUsageRangeToKick();
23572368
if (stats.MaxUsage >= GetMaxNodeUsageToKick() && stats.MinUsage < minUsageToKick) {
23582369
std::vector<TNodeId> overloadedNodes;
@@ -2363,15 +2374,17 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) {
23632374
}
23642375

23652376
if (!overloadedNodes.empty()) {
2366-
BLOG_D("Nodes " << overloadedNodes << " with usage over limit " << GetMaxNodeUsageToKick() << " - starting balancer");
2367-
StartHiveBalancer({
2377+
BLOG_D("Nodes " << overloadedNodes << " with usage over limit " << GetMaxNodeUsageToKick() << " - triggered balancer");
2378+
settings.emplace(TBalancerSettings{
23682379
.Type = EBalancerType::Emergency,
23692380
.MaxMovements = (int)CurrentConfig.GetMaxMovementsOnEmergencyBalancer(),
23702381
.RecheckOnFinish = CurrentConfig.GetContinueEmergencyBalancer(),
23712382
.MaxInFlight = GetEmergencyBalancerInflight(),
23722383
.FilterNodeIds = std::move(overloadedNodes),
23732384
});
2374-
return;
2385+
if (maybeStartBalancer()) {
2386+
return;
2387+
}
23752388
}
23762389
}
23772390

@@ -2380,24 +2393,19 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) {
23802393
}
23812394

23822395
if (ObjectDistributions.GetMaxImbalance() > GetObjectImbalanceToBalance()) {
2383-
TInstant now = TActivationContext::Now();
2384-
if (LastBalancerTrigger != EBalancerType::SpreadNeighbours
2385-
|| BalancerStats[static_cast<std::size_t>(EBalancerType::SpreadNeighbours)].LastRunMovements != 0
2386-
|| BalancerStats[static_cast<std::size_t>(EBalancerType::SpreadNeighbours)].LastRunTimestamp + TDuration::Seconds(1) < now) {
2387-
auto objectToBalance = ObjectDistributions.GetObjectToBalance();
2388-
BLOG_D("Max imbalance " << ObjectDistributions.GetMaxImbalance() << " - starting balancer for object " << objectToBalance.ObjectId);
2389-
StartHiveBalancer({
2390-
.Type = EBalancerType::SpreadNeighbours,
2391-
.MaxMovements = (int)CurrentConfig.GetMaxMovementsOnAutoBalancer(),
2392-
.RecheckOnFinish = CurrentConfig.GetContinueAutoBalancer(),
2393-
.MaxInFlight = GetBalancerInflight(),
2394-
.FilterNodeIds = std::move(objectToBalance.Nodes),
2395-
.ResourceToBalance = EResourceToBalance::Counter,
2396-
.FilterObjectId = objectToBalance.ObjectId,
2397-
});
2396+
auto objectToBalance = ObjectDistributions.GetObjectToBalance();
2397+
BLOG_D("Max imbalance " << ObjectDistributions.GetMaxImbalance() << " - triggered balancer for object " << objectToBalance.ObjectId);
2398+
settings.emplace(TBalancerSettings{
2399+
.Type = EBalancerType::SpreadNeighbours,
2400+
.MaxMovements = (int)CurrentConfig.GetMaxMovementsOnAutoBalancer(),
2401+
.RecheckOnFinish = CurrentConfig.GetContinueAutoBalancer(),
2402+
.MaxInFlight = GetBalancerInflight(),
2403+
.FilterNodeIds = std::move(objectToBalance.Nodes),
2404+
.ResourceToBalance = EResourceToBalance::Counter,
2405+
.FilterObjectId = objectToBalance.ObjectId,
2406+
});
2407+
if (maybeStartBalancer()) {
23982408
return;
2399-
} else {
2400-
BLOG_D("Skipping SpreadNeigbours Balancer, now: " << now << ", allowed: " << BalancerStats[static_cast<std::size_t>(EBalancerType::SpreadNeighbours)].LastRunTimestamp + TDuration::Seconds(1));
24012409
}
24022410
}
24032411

@@ -2422,14 +2430,21 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) {
24222430
break;
24232431
}
24242432
BLOG_TRACE("Scatter " << stats.ScatterByResource << " over limit "
2425-
<< GetMinScatterToBalance() << " - starting balancer " << EBalancerTypeName(balancerType));
2426-
StartHiveBalancer({
2433+
<< GetMinScatterToBalance() << " - triggered balancer " << EBalancerTypeName(balancerType));
2434+
settings.emplace(TBalancerSettings{
24272435
.Type = balancerType,
24282436
.MaxMovements = (int)CurrentConfig.GetMaxMovementsOnAutoBalancer(),
24292437
.RecheckOnFinish = CurrentConfig.GetContinueAutoBalancer(),
24302438
.MaxInFlight = GetBalancerInflight(),
24312439
.ResourceToBalance = *scatteredResource,
24322440
});
2441+
if (maybeStartBalancer()) {
2442+
return;
2443+
}
2444+
}
2445+
2446+
if (settings) {
2447+
StartHiveBalancer(std::move(*settings));
24332448
return;
24342449
}
24352450

ydb/core/mind/hive/hive_ut.cpp

Lines changed: 91 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include <ydb/core/testlib/basics/helpers.h>
2222
#include <ydb/core/testlib/tablet_helpers.h>
2323
#include <ydb/core/testlib/tenant_runtime.h>
24+
#include <ydb/core/tx/columnshard/columnshard.h>
2425
#include <ydb/core/tx/schemeshard/schemeshard.h>
2526
#include <ydb/core/tx/mediator/mediator.h>
2627
#include <ydb/core/util/random.h>
@@ -92,6 +93,7 @@ namespace {
9293
runtime.SetLogPriority(NKikimrServices::TABLET_RESOLVER, otherPriority);
9394
runtime.SetLogPriority(NKikimrServices::STATESTORAGE, otherPriority);
9495
runtime.SetLogPriority(NKikimrServices::BOOTSTRAPPER, otherPriority);
96+
runtime.SetLogPriority(NKikimrServices::TX_COLUMNSHARD, otherPriority);
9597
}
9698

9799
THashMap<ui32, TIntrusivePtr<TNodeWardenConfig>> NodeWardenConfigs;
@@ -215,25 +217,34 @@ namespace {
215217
false);
216218
}
217219

220+
// Builds a fresh TLocalConfig with tablet setup info registered for the Dummy,
// Hive, Mediator and ColumnShard tablet types. Every entry is created with
// TMailboxType::Simple and 0 for both of its mailbox-type/number argument pairs.
TLocalConfig::TPtr MakeDefaultLocalConfig() {
    TLocalConfig::TPtr config(new TLocalConfig());
    auto& classInfo = config->TabletClassInfo;

    classInfo[TTabletTypes::Dummy].SetupInfo =
        new TTabletSetupInfo(&CreateFlatDummyTablet, TMailboxType::Simple, 0, TMailboxType::Simple, 0);
    classInfo[TTabletTypes::Hive].SetupInfo =
        new TTabletSetupInfo(&CreateDefaultHive, TMailboxType::Simple, 0, TMailboxType::Simple, 0);
    classInfo[TTabletTypes::Mediator].SetupInfo =
        new TTabletSetupInfo(&CreateTxMediator, TMailboxType::Simple, 0, TMailboxType::Simple, 0);
    classInfo[TTabletTypes::ColumnShard].SetupInfo =
        new TTabletSetupInfo(&CreateColumnShard, TMailboxType::Simple, 0, TMailboxType::Simple, 0);

    return config;
}
240+
218241
void SetupLocals(TTestActorRuntime &runtime, bool isLocalEnabled) {
219242
if (!isLocalEnabled) {
220243
return;
221244
}
222245

223246
for (ui32 nodeIndex = 0; nodeIndex < runtime.GetNodeCount(); ++nodeIndex) {
224-
TLocalConfig::TPtr localConfig(new TLocalConfig());
225-
localConfig->TabletClassInfo[TTabletTypes::Dummy].SetupInfo = new TTabletSetupInfo(
226-
&CreateFlatDummyTablet,
227-
TMailboxType::Simple, 0,
228-
TMailboxType::Simple, 0);
229-
localConfig->TabletClassInfo[TTabletTypes::Hive].SetupInfo = new TTabletSetupInfo(
230-
&CreateDefaultHive,
231-
TMailboxType::Simple, 0,
232-
TMailboxType::Simple, 0);
233-
localConfig->TabletClassInfo[TTabletTypes::Mediator].SetupInfo = new TTabletSetupInfo(
234-
&CreateTxMediator,
235-
TMailboxType::Simple, 0,
236-
TMailboxType::Simple, 0);
247+
auto localConfig = MakeDefaultLocalConfig();
237248
TTenantPoolConfig::TPtr tenantPoolConfig = new TTenantPoolConfig(localConfig);
238249
tenantPoolConfig->AddStaticSlot(DOMAIN_NAME);
239250

@@ -653,10 +664,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
653664

654665
void CreateLocal(TTestActorRuntime &runtime, ui32 nodeIndex, TLocalConfig::TPtr localConfig = {}) {
655666
if (localConfig == nullptr) {
656-
localConfig = new TLocalConfig();
657-
localConfig->TabletClassInfo[TTabletTypes::Dummy].SetupInfo = new TTabletSetupInfo(&CreateFlatDummyTablet,
658-
TMailboxType::Simple, 0,
659-
TMailboxType::Simple, 0);
667+
localConfig = MakeDefaultLocalConfig();
660668
}
661669
TTenantPoolConfig::TPtr tenantPoolConfig = new TTenantPoolConfig(localConfig);
662670
tenantPoolConfig->AddStaticSlot(DOMAIN_NAME);
@@ -5329,6 +5337,72 @@ Y_UNIT_TEST_SUITE(THiveTest) {
53295337
UNIT_ASSERT_EQUAL(initialDistribution, newDistribution);
53305338
}
53315339

5340+
// Scenario exercised below:
//   1. start a two-node runtime and a Hive, wait for the locals to report status;
//   2. kill the local with index 1 and create two ColumnShard tablets with
//      different object ids, making sure each one is up;
//   3. send Hive a TEvTabletMetrics with total node usage 0.95 and re-create
//      the local with index 1;
//   4. after two EvBalancerOut events (or a 10 second timeout) expect both
//      nodes to hold the same number of tablets.
Y_UNIT_TEST(TestHiveBalancerHighUsageAndColumnShards) {
    static constexpr ui64 NUM_NODES = 2;
    TTestBasicRuntime runtime(2, false);
    Setup(runtime, true, 1, [](TAppPrepare& app) {
        // Both periods are zeroed - presumably so Hive reacts to the usage
        // change without delay (confirm against the Hive config semantics).
        app.HiveConfig.SetTabletKickCooldownPeriod(0);
        app.HiveConfig.SetResourceChangeReactionPeriod(0);
    });
    const int nodeBase = runtime.GetNodeId(0);
    const TActorId senderA = runtime.AllocateEdgeActor();
    const ui64 hiveTablet = MakeDefaultHiveID();
    const ui64 testerTablet = MakeTabletID(false, 1);

    // Requests the tablet list from Hive and groups the tablet ids by node
    // index, i.e. by (node id - nodeBase).
    auto getDistribution = [&]() -> std::array<std::vector<ui64>, NUM_NODES> {
        std::array<std::vector<ui64>, NUM_NODES> nodeTablets = {};
        runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvRequestHiveInfo());
        TAutoPtr<IEventHandle> handle;
        auto* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
        for (const auto& tablet : response->Record.GetTablets()) {
            // Every reported tablet must sit on one of the NUM_NODES test nodes.
            UNIT_ASSERT_C(((int)tablet.GetNodeID() - nodeBase >= 0) && (tablet.GetNodeID() - nodeBase < NUM_NODES),
                    "nodeId# " << tablet.GetNodeID() << " nodeBase# " << nodeBase);
            nodeTablets[tablet.GetNodeID() - nodeBase].push_back(tablet.GetTabletID());
        }
        return nodeTablets;
    };

    CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);

    // Block until NUM_NODES TEvLocal::EvStatus events have been seen.
    {
        TDispatchOptions waitForLocals;
        waitForLocals.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES);
        runtime.DispatchEvents(waitForLocals);
    }
    SendKillLocal(runtime, 1);

    // Create the two ColumnShard tablets, each with its own object id.
    const TTabletTypes::EType tabletType = TTabletTypes::ColumnShard;
    for (size_t objectIdx = 0; objectIdx < 2; ++objectIdx) {
        THolder<TEvHive::TEvCreateTablet> ev(
            new TEvHive::TEvCreateTablet(testerTablet, 100500 + objectIdx, tabletType, BINDED_CHANNELS));
        ev->Record.SetObjectId(objectIdx);
        const ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
        MakeSureTabletIsUp(runtime, tabletId, 0);
    }

    // Report a total node usage of 0.95 to Hive from an edge actor allocated with index 0.
    {
        const TActorId sender = runtime.AllocateEdgeActor(0);
        auto metrics = MakeHolder<TEvHive::TEvTabletMetrics>();
        metrics->Record.SetTotalNodeUsage(.95);

        runtime.SendToPipe(hiveTablet, sender, metrics.Release(), 0);
    }
    CreateLocal(runtime, 1);

    // Wait for two EvBalancerOut events, giving up after 10 seconds.
    {
        TDispatchOptions waitForBalancer;
        waitForBalancer.FinalEvents.emplace_back(NHive::TEvPrivate::EvBalancerOut, 2);
        runtime.DispatchEvents(waitForBalancer, TDuration::Seconds(10));
    }

    // The balancer is expected to have moved a tablet, leaving both nodes
    // with the same number of tablets.
    auto newDistribution = getDistribution();

    UNIT_ASSERT_VALUES_EQUAL(newDistribution[0].size(), newDistribution[1].size());
}
5405+
53325406
Y_UNIT_TEST(TestUpdateTabletsObjectUpdatesMetrics) {
53335407
TTestBasicRuntime runtime(1, false);
53345408
Setup(runtime, true);

0 commit comments

Comments
 (0)