Skip to content

Commit 6192c83

Browse files
authored
rework node priorities to make dc preference overridable (#14554)
1 parent 62c2bfa commit 6192c83

File tree

8 files changed

+89
-50
lines changed

8 files changed

+89
-50
lines changed

ydb/core/mind/hive/hive.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ using TResourceRawValues = std::tuple<i64, i64, i64, i64>; // CPU, Memory, Netwo
5555
using TResourceNormalizedValues = std::tuple<double, double, double, double>;
5656
using TOwnerIdxType = NScheme::TPairUi64Ui64;
5757
using TSubActorId = ui64; // = LocalId part of TActorId
58+
// Per-data-center priority, keyed by data center id. FindBestNode fills it from the tablet's
// data center preference groups (earlier groups get larger values), and
// TNodeInfo::GetPriorityForTablet adds the entry for the node's data center to the node priority.
// Data centers that are not present in the map contribute 0.
using TDataCenterPriority = std::unordered_map<TDataCenterId, i32>;
5859

5960
static constexpr std::size_t MAX_TABLET_CHANNELS = 256;
6061

ydb/core/mind/hive/hive_impl.cpp

Lines changed: 15 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1176,15 +1176,15 @@ TNodeInfo* THive::SelectNode<NKikimrConfig::THiveConfig::HIVE_NODE_SELECT_STRATE
11761176
return itNode->Node;
11771177
}
11781178

1179-
TVector<THive::TSelectedNode> THive::SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet) const
1179+
TVector<THive::TSelectedNode> THive::SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const
11801180
{
11811181
i32 priority = std::numeric_limits<i32>::min();
11821182
for (const TSelectedNode& selectedNode : selectedNodes) {
1183-
priority = std::max(priority, selectedNode.Node->GetPriorityForTablet(tablet));
1183+
priority = std::max(priority, selectedNode.Node->GetPriorityForTablet(tablet, dcPriority));
11841184
}
11851185

11861186
auto it = std::partition(selectedNodes.begin(), selectedNodes.end(), [&] (const TSelectedNode& selectedNode) {
1187-
return selectedNode.Node->GetPriorityForTablet(tablet) == priority;
1187+
return selectedNode.Node->GetPriorityForTablet(tablet, dcPriority) == priority;
11881188
});
11891189

11901190
selectedNodes.erase(it, selectedNodes.end());
@@ -1279,53 +1279,21 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet, TNodeId su
12791279
}
12801280
}
12811281

1282-
std::vector<std::vector<TNodeInfo*>> candidateGroups;
1283-
candidateGroups.resize(dataCentersGroups.size() + 1);
1284-
std::unordered_map<TDataCenterId, std::vector<TNodeInfo*>*> indexDC2Group;
1282+
TDataCenterPriority dcPriority;
12851283
for (size_t numGroup = 0; numGroup < dataCentersGroups.size(); ++numGroup) {
12861284
const NKikimrHive::TDataCentersGroup* dcGroup = dataCentersGroups[numGroup];
1287-
if (dcGroup->DataCenterSize()) {
1288-
for (TDataCenterId dc : dcGroup->GetDataCenter()) {
1289-
indexDC2Group[dc] = candidateGroups.data() + numGroup;
1290-
}
1291-
} else {
1292-
for (const ui64 dcId : dcGroup->GetDataCenterNum()) {
1293-
indexDC2Group[DataCenterToString(dcId)] = candidateGroups.data() + numGroup;
1294-
}
1295-
}
1296-
}
1297-
for (auto it = Nodes.begin(); it != Nodes.end(); ++it) {
1298-
TNodeInfo* nodeInfo = &it->second;
1299-
if (nodeInfo->IsAlive()) {
1300-
TDataCenterId dataCenterId = nodeInfo->GetDataCenter();
1301-
auto itDataCenter = indexDC2Group.find(dataCenterId);
1302-
if (itDataCenter != indexDC2Group.end()) {
1303-
itDataCenter->second->push_back(nodeInfo);
1304-
} else {
1305-
candidateGroups.back().push_back(nodeInfo);
1306-
}
1307-
} else {
1308-
BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " node " << nodeInfo->Id << " is not alive");
1309-
debugState.NodesDead++;
1285+
for (TDataCenterId dc : dcGroup->GetDataCenter()) {
1286+
// First group gets largest priority, last group gets +1 priority, dcs not in any groups get 0
1287+
dcPriority[dc] = dataCentersGroups.size() - numGroup;
13101288
}
13111289
}
13121290

13131291
TVector<TSelectedNode> selectedNodes;
1292+
selectedNodes.reserve(Nodes.size());
13141293
bool thereAreNodesWithManyStarts = false;
13151294

1316-
for (auto itCandidateNodes = candidateGroups.begin(); itCandidateNodes != candidateGroups.end(); ++itCandidateNodes) {
1317-
const std::vector<TNodeInfo*>& candidateNodes(*itCandidateNodes);
1318-
if (candidateGroups.size() > 1) {
1319-
BLOG_TRACE("[FBN] Tablet " << tablet.ToString()
1320-
<< " checking candidates group " << (itCandidateNodes - candidateGroups.begin() + 1)
1321-
<< " of " << candidateGroups.size());
1322-
}
1323-
1324-
selectedNodes.clear();
1325-
selectedNodes.reserve(candidateNodes.size());
1326-
1327-
for (auto it = candidateNodes.begin(); it != candidateNodes.end(); ++it) {
1328-
TNodeInfo& nodeInfo = *(*it);
1295+
for (auto& [_, nodeInfo] : Nodes) {
1296+
if (nodeInfo.IsAlive()) {
13291297
if (nodeInfo.IsAllowedToRunTablet(tablet, &debugState)) {
13301298
if (nodeInfo.IsAbleToScheduleTablet()) {
13311299
if (nodeInfo.IsAbleToRunTablet(tablet, &debugState)) {
@@ -1351,11 +1319,12 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet, TNodeId su
13511319
<< " tablet allowed domains " << tablet.GetNodeFilter().AllowedDomains
13521320
<< " tablet effective allowed domains " << tablet.GetNodeFilter().GetEffectiveAllowedDomains());
13531321
}
1354-
}
1355-
if (!selectedNodes.empty()) {
1356-
break;
1322+
} else {
1323+
BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " node " << nodeInfo.Id << " is not alive");
1324+
debugState.NodesDead++;
13571325
}
13581326
}
1327+
13591328
BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " selected nodes count " << selectedNodes.size());
13601329
if (selectedNodes.empty() && thereAreNodesWithManyStarts) {
13611330
BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " all available nodes are booting too many tablets");
@@ -1364,7 +1333,7 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet, TNodeId su
13641333

13651334
TNodeInfo* selectedNode = nullptr;
13661335
if (!selectedNodes.empty()) {
1367-
selectedNodes = SelectMaxPriorityNodes(std::move(selectedNodes), tablet);
1336+
selectedNodes = SelectMaxPriorityNodes(std::move(selectedNodes), tablet, dcPriority);
13681337
BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " selected max priority nodes count " << selectedNodes.size());
13691338

13701339
switch (GetNodeSelectStrategy()) {

ydb/core/mind/hive/hive_impl.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -640,7 +640,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
640640

641641
template <NKikimrConfig::THiveConfig::EHiveNodeSelectStrategy Strategy>
642642
TNodeInfo* SelectNode(const std::vector<TSelectedNode>& selectedNodes);
643-
TVector<TSelectedNode> SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet) const;
643+
TVector<TSelectedNode> SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const;
644644

645645
public:
646646
void AssignTabletGroups(TLeaderTabletInfo& tablet);
@@ -1002,6 +1002,10 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
10021002
return CurrentConfig.GetMaxPingsInFlight();
10031003
}
10041004

1005+
// Returns how many node restarts per period cost a node one point of priority.
//
// The result is used as a divisor in TNodeInfo::GetPriorityForTablet, and the underlying
// NodeRestartsForPenalty setting can be changed at runtime from the monitoring settings page,
// so a configured value of 0 is clamped to 1 here to rule out an integer division by zero.
ui64 GetNodeRestartsForPenalty() const {
    const ui64 configured = CurrentConfig.GetNodeRestartsForPenalty();
    return configured != 0 ? configured : 1;
}
1008+
10051009
static void ActualizeRestartStatistics(google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
10061010
static ui64 GetRestartsPerPeriod(const google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
10071011
static bool IsSystemTablet(TTabletTypes::EType type);

ydb/core/mind/hive/hive_ut.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3810,6 +3810,65 @@ Y_UNIT_TEST_SUITE(THiveTest) {
38103810
}
38113811
}
38123812

3813+
Y_UNIT_TEST(TestHiveBalancerWithPreferredDC3) {
    // Tablet prefers DC 1, but the nodes there are constantly crashing
    // Test that it will be eventually launched in DC 2
    static const int NUM_NODES = 4;
    // Upper bound on restart rounds, so that a regression fails the test with a clear assertion
    // instead of spinning forever. NOTE(review): 100 is meant to be generous relative to the
    // default NodeRestartsForPenalty of 3 - adjust if the penalty settings change.
    static const ui32 MAX_RESTARTS = 100;
    TTestBasicRuntime runtime(NUM_NODES, false);

    runtime.LocationCallback = GetLocation;

    Setup(runtime, true);
    const int nodeBase = runtime.GetNodeId(0);
    TActorId senderA = runtime.AllocateEdgeActor();
    const ui64 hiveTablet = MakeDefaultHiveID();
    const ui64 testerTablet = MakeTabletID(false, 1);
    CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);
    {
        // Presumably waits until every node has reported its status to Hive - TODO confirm.
        TDispatchOptions options;
        options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES);
        runtime.DispatchEvents(options);
    }

    // Create a tablet whose only preferred data center group is DC "1".
    TTabletTypes::EType tabletType = TTabletTypes::Dummy;
    THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500, tabletType, BINDED_CHANNELS));
    ev->Record.SetFollowerCount(3);
    auto* group = ev->Record.MutableDataCentersPreference()->AddDataCentersGroups();
    group->AddDataCenter(ToString(1));
    ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
    MakeSureTabletIsUp(runtime, tabletId, 0);

    // Asks Hive where the tablet currently runs and maps that node to its data center id.
    // Returns std::nullopt when the tablet is not listed or has no node assigned (node id 0).
    auto getTabletDC = [&]() -> std::optional<TString> {
        std::unique_ptr<TEvHive::TEvRequestHiveInfo> request = std::make_unique<TEvHive::TEvRequestHiveInfo>();
        runtime.SendToPipe(hiveTablet, senderA, request.release());
        TAutoPtr<IEventHandle> handle;
        TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
        for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) {
            if (tablet.GetTabletID() == tabletId) {
                ui32 nodeId = tablet.GetNodeID();
                if (nodeId == 0) {
                    return std::nullopt;
                }
                auto location = GetLocation(nodeId - nodeBase);
                return location.GetDataCenterId();
            }
        }
        return std::nullopt;
    };

    UNIT_ASSERT_VALUES_EQUAL(getTabletDC(), "1");

    std::optional<TString> dc;
    for (ui32 i = 0; i < MAX_RESTARTS && dc != "2"; ++i) {
        // restart node in DC 1 (alternating between the first two nodes)
        SendKillLocal(runtime, i % 2);
        CreateLocal(runtime, i % 2);
        dc = getTabletDC();
        Ctest << "tablet is in dc" << dc << Endl;
    }
    // Fails here, rather than hanging, if the tablet never left the crashing data center.
    UNIT_ASSERT_VALUES_EQUAL(dc, "2");
}
3871+
38133872
Y_UNIT_TEST(TestHiveFollowersWithChangingDC) {
38143873
static const int NUM_NODES = 6;
38153874
static const int NUM_TABLETS = 1;

ydb/core/mind/hive/monitoring.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -848,6 +848,7 @@ class TTxMonEvent_Settings : public TTransactionBase<THive>, public TLoggedMonTr
848848
UpdateConfig(db, "ScaleInWindowSize", configUpdates);
849849
UpdateConfig(db, "TargetTrackingCPUMargin", configUpdates);
850850
UpdateConfig(db, "DryRunTargetTrackingCPU", configUpdates);
851+
UpdateConfig(db, "NodeRestartsForPenalty", configUpdates);
851852

852853
if (params.contains("BalancerIgnoreTabletTypes")) {
853854
auto value = params.Get("BalancerIgnoreTabletTypes");
@@ -1201,6 +1202,7 @@ class TTxMonEvent_Settings : public TTransactionBase<THive>, public TLoggedMonTr
12011202
ShowConfig(out, "ScaleInWindowSize");
12021203
ShowConfig(out, "TargetTrackingCPUMargin");
12031204
ShowConfig(out, "DryRunTargetTrackingCPU");
1205+
ShowConfig(out, "NodeRestartsForPenalty");
12041206

12051207
out << "<div class='row' style='margin-top:40px'>";
12061208
out << "<div class='col-sm-2' style='padding-top:30px;text-align:right'><label for='allowedMetrics'>AllowedMetrics:</label></div>";

ydb/core/mind/hive/node_info.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ bool TNodeInfo::IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugStat
209209
return true;
210210
}
211211

212-
i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet) const {
212+
i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const {
213213
i32 priority = 0;
214214

215215
auto it = TabletAvailability.find(tablet.GetTabletType());
@@ -221,6 +221,9 @@ i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet) const {
221221
--priority;
222222
}
223223

224+
priority += dcPriority[GetDataCenter()];
225+
priority -= GetRestartsPerPeriod() / Hive.GetNodeRestartsForPenalty();
226+
224227
return priority;
225228
}
226229

ydb/core/mind/hive/node_info.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ struct TNodeInfo {
160160
bool IsAllowedToRunTablet(TTabletDebugState* debugState = nullptr) const;
161161
bool IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState = nullptr) const;
162162
bool IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState = nullptr) const;
163-
i32 GetPriorityForTablet(const TTabletInfo& tablet) const;
163+
i32 GetPriorityForTablet(const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const;
164164
ui64 GetMaxTabletsScheduled() const;
165165
ui64 GetMaxCountForTabletType(TTabletTypes::EType tabletType) const;
166166

@@ -272,7 +272,7 @@ struct TNodeInfo {
272272

273273
void UpdateResourceTotalUsage(const NKikimrHive::TEvTabletMetrics& metrics);
274274
void ActualizeNodeStatistics(TInstant now);
275-
ui64 GetRestartsPerPeriod(TInstant barrier) const;
275+
ui64 GetRestartsPerPeriod(TInstant barrier = {}) const;
276276

277277
TDataCenterId GetDataCenter() const {
278278
return Location.GetDataCenterId();

ydb/core/protos/config.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1763,6 +1763,7 @@ message THiveConfig {
17631763
optional uint64 ScaleInWindowSize = 82 [default = 5]; // buckets
17641764
optional double TargetTrackingCPUMargin = 83 [default = 0.1]; // percent
17651765
optional double DryRunTargetTrackingCPU = 84; // percent
1766+
optional uint64 NodeRestartsForPenalty = 85 [default = 3];
17661767
}
17671768

17681769
message TBlobCacheConfig {

0 commit comments

Comments
 (0)