Skip to content

Commit ebe630e

Browse files
authored
guess tablet's impact on node usage when moving it (#18376)
1 parent 2e9d176 commit ebe630e

File tree

14 files changed

+215
-26
lines changed

14 files changed

+215
-26
lines changed

ydb/core/mind/hive/hive_impl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,6 +1009,10 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
10091009
return CurrentConfig.GetNodeRestartsForPenalty() ?: Max<ui64>();
10101010
}
10111011

1012+
bool GetUseTabletUsageEstimate() const {
1013+
return CurrentConfig.GetUseTabletUsageEstimate();
1014+
}
1015+
10121016
static void ActualizeRestartStatistics(google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
10131017
static ui64 GetRestartsPerPeriod(const google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
10141018
static bool IsSystemTablet(TTabletTypes::EType type);

ydb/core/mind/hive/hive_schema.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,13 +229,14 @@ struct Schema : NIceDb::Schema {
229229
struct TabletID : Column<1, Tablet::ID::ColumnType> {};
230230
struct FollowerID : Column<2, TabletFollowerTablet::FollowerID::ColumnType> {};
231231
struct ProtoMetrics : Column<3, NScheme::NTypeIds::String> { using Type = NKikimrTabletBase::TMetrics; };
232+
struct UsageImpact : Column<4, NScheme::NTypeIds::Double> {};
232233

233234
struct MaximumCPU : Column<100 + (int)NMetrics::EResource::CPU, NScheme::NTypeIds::String> { using Type = NKikimrMetricsProto::TMaximumValueUI64; };
234235
struct MaximumMemory : Column<100 + (int)NMetrics::EResource::Memory, NScheme::NTypeIds::String> { using Type = NKikimrMetricsProto::TMaximumValueUI64; };
235236
struct MaximumNetwork : Column<100 + (int)NMetrics::EResource::Network, NScheme::NTypeIds::String> { using Type = NKikimrMetricsProto::TMaximumValueUI64; };
236237

237238
using TKey = TableKey<TabletID, FollowerID>;
238-
using TColumns = TableColumns<TabletID, FollowerID, ProtoMetrics, MaximumCPU, MaximumMemory, MaximumNetwork>;
239+
using TColumns = TableColumns<TabletID, FollowerID, ProtoMetrics, UsageImpact, MaximumCPU, MaximumMemory, MaximumNetwork>;
239240
};
240241

241242
struct TabletTypeMetrics : Table<13> {

ydb/core/mind/hive/hive_ut.cpp

Lines changed: 135 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4334,9 +4334,9 @@ Y_UNIT_TEST_SUITE(THiveTest) {
43344334
// - change BalancerPolicy to BALANCER_BALANCE for all remaining tablets
43354335
// - test that balancer also moved out former BALANCER_IGNORE tablets
43364336
//
4337-
static const int NUM_NODES = 4;
4338-
static const int NUM_TABLETS = 3;
4339-
static const ui64 SINGLE_TABLET_NETWORK_USAGE = 15'000'000;
4337+
static const int NUM_NODES = 6;
4338+
static const int NUM_TABLETS = 6;
4339+
static const ui64 SINGLE_TABLET_NETWORK_USAGE = 5'000'000;
43404340

43414341
TTestBasicRuntime runtime(NUM_NODES, false);
43424342

@@ -4350,6 +4350,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
43504350
app.HiveConfig.SetMaxNodeUsageToKick(0.01);
43514351
app.HiveConfig.SetNodeUsageRangeToKick(0);
43524352
app.HiveConfig.SetEmergencyBalancerInflight(1); // to ensure fair distribution
4353+
app.HiveConfig.SetResourceOvercommitment(1);
43534354
});
43544355

43554356
TActorId senderA = runtime.AllocateEdgeActor();
@@ -4413,7 +4414,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
44134414
for (int i = 0; i < NUM_TABLETS; ++i) {
44144415
THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + i, tabletType, BINDED_CHANNELS));
44154416
ev->Record.SetObjectId(i);
4416-
switch (i % NUM_TABLETS) {
4417+
switch (i % 3) {
44174418
case 0: // policy not explicitly set
44184419
break;
44194420
case 1: // policy explicitly set to default value
@@ -4433,7 +4434,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
44334434
// check that tablets retain their BalancerPolicy flags...
44344435
for (const auto& i : tabletInfos_A) {
44354436
Ctest << "Step A: tablet index " << i.ObjectId << ", tablet id " << i.TabletId << ", node index " << i.NodeIndex << ", balancer policy " << NKikimrHive::EBalancerPolicy_Name(i.BalancerPolicy) << Endl;
4436-
switch (i.ObjectId % NUM_TABLETS) {
4437+
switch (i.ObjectId % 3) {
44374438
case 0:
44384439
case 1:
44394440
UNIT_ASSERT_EQUAL_C(i.BalancerPolicy, NKikimrHive::EBalancerPolicy::POLICY_BALANCE, "objectId# " << i.ObjectId << " value# " << (ui64)i.BalancerPolicy << " name# " << NKikimrHive::EBalancerPolicy_Name(i.BalancerPolicy));
@@ -4547,15 +4548,15 @@ Y_UNIT_TEST_SUITE(THiveTest) {
45474548
Ctest << Endl;
45484549
auto minmax = std::minmax_element(nodeTablets.begin(), nodeTablets.end());
45494550
UNIT_ASSERT_VALUES_EQUAL(*minmax.first, 0);
4550-
UNIT_ASSERT_VALUES_EQUAL(*minmax.second, 1);
4551-
UNIT_ASSERT_VALUES_EQUAL(nodeTablets[0], 1);
4551+
UNIT_ASSERT_VALUES_EQUAL(*minmax.second, NUM_TABLETS / 3);
4552+
UNIT_ASSERT_VALUES_EQUAL(nodeTablets[0], NUM_TABLETS / 3);
45524553
}
45534554

45544555
Ctest << "Step D: change tablets BalancerPolicy" << Endl;
45554556

45564557
// set all tablets with BalancerPolicy "ignore" back to "balance"
45574558
for (int i = 0; i < NUM_TABLETS; ++i) {
4558-
switch(i % NUM_TABLETS) {
4559+
switch(i % 3) {
45594560
case 0:
45604561
case 1:
45614562
break;
@@ -4580,12 +4581,12 @@ Y_UNIT_TEST_SUITE(THiveTest) {
45804581

45814582
Ctest << "Step D: raise metrics for previously ignored tablets" << Endl;
45824583
for (const auto& i: tabletInfos_D) {
4583-
switch(i.ObjectId % NUM_TABLETS) {
4584+
switch(i.ObjectId % 3) {
45844585
case 0:
45854586
case 1:
45864587
break;
45874588
case 2:
4588-
reportTabletMetrics(i.TabletId, NUM_TABLETS * SINGLE_TABLET_NETWORK_USAGE, true);
4589+
reportTabletMetrics(i.TabletId, 2 * SINGLE_TABLET_NETWORK_USAGE, true);
45894590
break;
45904591
}
45914592
}
@@ -4605,7 +4606,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
46054606
bool ignoredTabletsAreMoved = false;
46064607
for (const auto& i : tabletInfos_E) {
46074608
Ctest << "Step E: tablet index " << i.ObjectId << ", tablet id " << i.TabletId << ", node index " << i.NodeIndex << ", balancer policy " << NKikimrHive::EBalancerPolicy_Name(i.BalancerPolicy) << Endl;
4608-
switch (i.ObjectId % NUM_TABLETS) {
4609+
switch (i.ObjectId % 3) {
46094610
case 0:
46104611
case 1:
46114612
break;
@@ -4620,7 +4621,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
46204621
}
46214622
UNIT_ASSERT_VALUES_EQUAL(ignoredTabletsAreMoved, true);
46224623
}
4623-
// ...and that the original node is completely void of tablets
4624+
// ...and that the original node has only one tablet left
46244625
{
46254626
std::array<int, NUM_NODES> nodeTablets = {};
46264627
for (auto& i : tabletInfos_E) {
@@ -4632,9 +4633,9 @@ Y_UNIT_TEST_SUITE(THiveTest) {
46324633
}
46334634
Ctest << Endl;
46344635
auto minmax = std::minmax_element(nodeTablets.begin(), nodeTablets.end());
4635-
UNIT_ASSERT_VALUES_EQUAL(*minmax.first, 0);
4636+
UNIT_ASSERT_VALUES_EQUAL(*minmax.first, 1);
46364637
UNIT_ASSERT_VALUES_EQUAL(*minmax.second, 1);
4637-
UNIT_ASSERT_VALUES_EQUAL(nodeTablets[0], 0);
4638+
UNIT_ASSERT_VALUES_EQUAL(nodeTablets[0], 1);
46384639
}
46394640
}
46404641

@@ -5456,6 +5457,126 @@ Y_UNIT_TEST_SUITE(THiveTest) {
54565457
UNIT_ASSERT_VALUES_EQUAL(newDistribution[0].size(), newDistribution[1].size());
54575458
}
54585459

5460+
// Scenario: NUM_NODES nodes host NUM_TABLETS dummy tablets. On every iteration
// the node that currently hosts one picked tablet reports a total usage of 0.99
// while every other node reports 0.05, i.e. the reported usage "follows" the tablet.
// The test checks that the picked tablet is moved at most twice, that it ends up
// alone on its node, and that all other nodes hold the same number of tablets.
Y_UNIT_TEST(TestHiveBalancerOneTabletHighUsage) {
    static constexpr ui64 NUM_NODES = 4;
    static constexpr ui64 NUM_TABLETS = NUM_NODES * NUM_NODES;
    TTestBasicRuntime runtime(NUM_NODES, false);
    Setup(runtime, true, 1, [](TAppPrepare& app) {
        app.HiveConfig.SetTabletKickCooldownPeriod(3);
        app.HiveConfig.SetResourceChangeReactionPeriod(0);
        app.HiveConfig.SetMinPeriodBetweenEmergencyBalance(0);
    });
    const int nodeBase = runtime.GetNodeId(0);
    TActorId senderA = runtime.AllocateEdgeActor();
    const ui64 hiveTablet = MakeDefaultHiveID();
    const ui64 testerTablet = MakeTabletID(false, 1);

    // distribution[n] lists the ids of the tablets reported on node index n
    using TDistribution = std::array<std::vector<ui64>, NUM_NODES>;

    // Asks Hive for its tablet list and groups tablet ids by node index.
    // Tablets reported with node id 0 are left out.
    auto getDistribution = [hiveTablet, nodeBase, senderA, &runtime]() -> TDistribution {
        TDistribution nodeTablets = {};
        runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvRequestHiveInfo());
        TAutoPtr<IEventHandle> handle;
        TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
        for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) {
            if (tablet.GetNodeID() == 0) {
                continue;
            }
            // validate the index in signed arithmetic before using it as an array subscript
            const int nodeIndex = static_cast<int>(tablet.GetNodeID()) - nodeBase;
            UNIT_ASSERT_C(nodeIndex >= 0 && static_cast<ui64>(nodeIndex) < NUM_NODES,
                "nodeId# " << tablet.GetNodeID() << " nodeBase# " << nodeBase);
            nodeTablets[nodeIndex].push_back(tablet.GetTabletID());
        }
        return nodeTablets;
    };

    // Returns the index of the node whose list contains tabletId, or nullopt if no node has it.
    auto tabletNode = [](const TDistribution& distribution, ui64 tabletId) -> std::optional<size_t> {
        auto hasTablet = [tabletId](const std::vector<ui64>& tablets) {
            return std::find(tablets.begin(), tablets.end(), tabletId) != tablets.end();
        };
        auto it = std::find_if(distribution.begin(), distribution.end(), hasTablet);
        if (it == distribution.end()) {
            return std::nullopt;
        }
        return it - distribution.begin();
    };

    CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);

    // wait for creation of nodes
    {
        TDispatchOptions options;
        options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES);
        runtime.DispatchEvents(options);
    }

    TTabletTypes::EType tabletType = TTabletTypes::Dummy;
    std::vector<ui64> tablets;
    tablets.reserve(NUM_TABLETS);
    for (size_t i = 0; i < NUM_TABLETS; ++i) {
        THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + i, tabletType, BINDED_CHANNELS));
        ev->Record.SetObjectId(i);
        ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
        tablets.push_back(tabletId);
        MakeSureTabletIsUp(runtime, tabletId, 0);
    }

    const ui64 overloadingTablet = tablets.front();
    auto distribution = getDistribution();
    auto nodeWithTablet = tabletNode(distribution, overloadingTablet);
    Ctest << "picked tablet " << overloadingTablet << Endl;
    unsigned moves = 0;

    for (int iteration = 0; iteration < 20; ++iteration) {
        // send several usage reports per node before letting the balancer react
        for (int report = 0; report < 5; ++report) {
            for (ui32 node = 0; node < NUM_NODES; ++node) {
                TActorId sender = runtime.AllocateEdgeActor(node);
                THolder<TEvHive::TEvTabletMetrics> metrics = MakeHolder<TEvHive::TEvTabletMetrics>();
                metrics->Record.SetTotalNodeUsage(node == nodeWithTablet ? .99 : .05);

                runtime.SendToPipe(hiveTablet, sender, metrics.Release(), node);
            }
        }

        TDispatchOptions options;
        options.FinalEvents.emplace_back(NHive::TEvPrivate::EvBalancerOut);
        runtime.DispatchEvents(options, TDuration::MilliSeconds(10));
        runtime.AdvanceCurrentTime(TDuration::MilliSeconds(500));

        distribution = getDistribution();
        auto newNodeWithTablet = tabletNode(distribution, overloadingTablet);
        if (newNodeWithTablet != nodeWithTablet) {
            nodeWithTablet = newNodeWithTablet;
            // only count a move once the tablet is seen on some node again
            if (newNodeWithTablet) {
                ++moves;
            }
        }

        Ctest << "distribution: ";
        for (size_t nodeIdx = 0; nodeIdx < NUM_NODES; ++nodeIdx) {
            if (nodeIdx == nodeWithTablet) {
                Ctest << "*";
            }
            Ctest << distribution[nodeIdx].size() << " ";
        }
        Ctest << Endl;
    }

    UNIT_ASSERT_LE(moves, 2);

    std::set<size_t> tabletsOnNodes;
    Ctest << "Final distribution: ";
    for (size_t nodeIdx = 0; nodeIdx < NUM_NODES; ++nodeIdx) {
        Ctest << distribution[nodeIdx].size() << " ";
        if (nodeIdx != nodeWithTablet) {
            tabletsOnNodes.insert(distribution[nodeIdx].size());
        }
    }
    Ctest << Endl;
    // the optional is dereferenced below; fail with a message instead of invoking
    // undefined behaviour if the tablet was not found on any node in the last snapshot
    UNIT_ASSERT_C(nodeWithTablet.has_value(), "tablet " << overloadingTablet << " was not found on any node");
    UNIT_ASSERT_VALUES_EQUAL(distribution[*nodeWithTablet].size(), 1);
    UNIT_ASSERT_VALUES_EQUAL(tabletsOnNodes.size(), 1);
}
5579+
54595580
Y_UNIT_TEST(TestUpdateTabletsObjectUpdatesMetrics) {
54605581
TTestBasicRuntime runtime(1, false);
54615582
Setup(runtime, true);

ydb/core/mind/hive/monitoring.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -590,6 +590,7 @@ class TTxMonEvent_Resources : public TTransactionBase<THive> {
590590
out << "<th>Storage</th>";
591591
out << "<th>Read</th>";
592592
out << "<th>Write</th>";
593+
out << "<th>Usage impact</th>";
593594
out << "</tr>";
594595
out << "</thead>";
595596

@@ -602,6 +603,7 @@ class TTxMonEvent_Resources : public TTransactionBase<THive> {
602603
out << "<tr title='" << tablet.GetResourceValues().DebugString() << "'>";
603604
out << "<td data-text='" << index << "'><a href='../tablets?TabletID=" << id << "'>" << id << "</a></td>";
604605
out << GetResourceValuesHtml(tablet.GetResourceValues());
606+
out << "<td>" << tablet.UsageImpact << "</td>";
605607
out << "</tr>";
606608
}
607609
out <<"</tbody>";
@@ -3795,6 +3797,7 @@ class TTxMonEvent_TabletInfo : public TTransactionBase<THive> {
37953797
result["ResourceMetricsAggregates"] = MakeFrom(tablet.ResourceMetricsAggregates);
37963798
result["ActorsToNotify"] = MakeFrom(tablet.ActorsToNotify);
37973799
result["ActorsToNotifyOnRestart"] = MakeFrom(tablet.ActorsToNotifyOnRestart);
3800+
result["UsageImpact"] = tablet.UsageImpact;
37983801
return result;
37993802
}
38003803

ydb/core/mind/hive/node_info.cpp

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ bool TNodeInfo::OnTabletChangeVolatileState(TTabletInfo* tablet, TTabletInfo::EV
6969
if (IsResourceDrainingState(oldState)) {
7070
if (Tablets[oldState].erase(tablet) != 0) {
7171
UpdateResourceValues(tablet, tablet->GetResourceValues(), NKikimrTabletBase::TMetrics());
72+
if (!IsResourceDrainingState(newState)) {
73+
LastScheduledTablet.reset();
74+
}
7275
} else {
7376
if (oldState != newState) {
7477
BLOG_W("Node(" << Id << ") could not delete tablet " << tablet->ToString() << " from state " << TTabletInfo::EVolatileStateName(oldState));
@@ -87,6 +90,9 @@ bool TNodeInfo::OnTabletChangeVolatileState(TTabletInfo* tablet, TTabletInfo::EV
8790
if (IsResourceDrainingState(newState)) {
8891
if (Tablets[newState].insert(tablet).second) {
8992
UpdateResourceValues(tablet, NKikimrTabletBase::TMetrics(), tablet->GetResourceValues());
93+
if (!IsResourceDrainingState(oldState)) {
94+
LastScheduledTablet = {.TabletId = tablet->GetFullTabletId(), .UsageBefore = NodeTotalUsage};
95+
}
9096
} else {
9197
BLOG_W("Node(" << Id << ") could not insert tablet " << tablet->ToString() << " to state " << TTabletInfo::EVolatileStateName(newState));
9298
}
@@ -279,6 +285,14 @@ bool TNodeInfo::IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState*
279285
return false;
280286
}
281287

288+
if (tablet.IsAlive() && GetNodeUsageForTablet(tablet, false) > Hive.GetMaxNodeUsageToKick()) {
289+
// ... or when node is not overloaded yet, but would be
290+
if (debugState) {
291+
debugState->NodesWithoutResources++;
292+
}
293+
return false;
294+
}
295+
282296
TResourceRawValues maximumResources = GetResourceMaximumValues() * Hive.GetResourceOvercommitment();
283297
TResourceRawValues allocatedResources = GetResourceCurrentValues() + tablet.GetResourceCurrentValues();
284298
auto cmp = piecewise_compare(allocatedResources, maximumResources);
@@ -408,17 +422,21 @@ void TNodeInfo::UpdateResourceMaximum(const NKikimrTabletBase::TMetrics& metrics
408422
Hive.UpdateTotalResourceValues(nullptr, nullptr, NKikimrTabletBase::TMetrics(), NKikimrTabletBase::TMetrics(), {}, normalizedValues - oldNormalizedValues);
409423
}
410424

411-
double TNodeInfo::GetNodeUsageForTablet(const TTabletInfo& tablet) const {
425+
double TNodeInfo::GetNodeUsageForTablet(const TTabletInfo& tablet, bool neighbourPenalty) const {
412426
// what it would like when tablet will run on this node?
427+
auto maximum = GetResourceMaximumValues();
413428
TResourceRawValues nodeValues = GetResourceCurrentValues();
414429
TResourceRawValues tabletValues = tablet.GetResourceCurrentValues();
430+
if (Hive.GetUseTabletUsageEstimate()) {
431+
auto estimateUsageValues = cast_like(maximum * tablet.UsageImpact, tabletValues);
432+
tabletValues = piecewise_max(tabletValues, estimateUsageValues);
433+
}
415434
tablet.FilterRawValues(nodeValues);
416435
tablet.FilterRawValues(tabletValues);
417436
auto current = tablet.IsAliveOnLocal(Local) ? nodeValues : nodeValues + tabletValues;
418-
auto maximum = GetResourceMaximumValues();
419437
// basically, this is: return max(a / b);
420438
double usage = TTabletInfo::GetUsage(current, maximum);
421-
if (Hive.GetSpreadNeighbours() && usage < 1) {
439+
if (Hive.GetSpreadNeighbours() && usage < 1 && neighbourPenalty) {
422440
auto neighbours = GetTabletNeighboursCount(tablet);
423441
if (neighbours > 0) {
424442
auto remain = 1 - usage;
@@ -478,13 +496,28 @@ bool TNodeInfo::CanBeDeleted(TInstant now) const {
478496
}
479497
}
480498

481-
void TNodeInfo::UpdateResourceTotalUsage(const NKikimrHive::TEvTabletMetrics& metrics) {
499+
void TNodeInfo::UpdateResourceTotalUsage(const NKikimrHive::TEvTabletMetrics& metrics, NIceDb::TNiceDb& db) {
482500
if (metrics.HasTotalResourceUsage()) {
483501
AveragedResourceTotalValues.Push(ResourceRawValuesFromMetrics(metrics.GetTotalResourceUsage()));
484502
ResourceTotalValues = AveragedResourceTotalValues.GetValue();
485503
}
486504
if (metrics.HasTotalNodeUsage()) {
487505
AveragedNodeTotalUsage.Push(metrics.GetTotalNodeUsage());
506+
if (LastScheduledTablet) {
507+
if (LastScheduledTablet->UsageSince.Push(metrics.GetTotalNodeUsage())) {
508+
// we kept enough stats for this tablet
509+
LastScheduledTablet.reset();
510+
} else {
511+
double usageImpact = LastScheduledTablet->UsageSince.GetValue() - LastScheduledTablet->UsageBefore;
512+
usageImpact = std::max<double>(usageImpact, 0);
513+
auto* tablet = Hive.FindTablet(LastScheduledTablet->TabletId);
514+
if (tablet) {
515+
BLOG_D("Estimate impact of tablet " << LastScheduledTablet->TabletId << " on usage of node " << Id << " as " << usageImpact);
516+
tablet->UsageImpact = usageImpact;
517+
db.Table<Schema::Metrics>().Key(LastScheduledTablet->TabletId).Update<Schema::Metrics::UsageImpact>(usageImpact);
518+
}
519+
}
520+
}
488521
NodeTotalUsage = AveragedNodeTotalUsage.GetValue();
489522
}
490523
if (metrics.HasTotalNodeCpuUsage()) {

0 commit comments

Comments
 (0)