Skip to content

Commit 9a1ddb7

Browse files
authored
25-1: add sensor for down nodes (#11592) (#18476)
2 parents f216f16 + 9ef3588 commit 9a1ddb7

File tree

5 files changed

+25
-0
lines changed

5 files changed

+25
-0
lines changed

ydb/core/mind/hive/hive_impl.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1712,6 +1712,22 @@ void THive::UpdateCounterTabletChannelHistorySize() {
17121712
}
17131713
}
17141714

1715+
// Shifts the NHive::COUNTER_NODES_DOWN simple counter by nodesDownDiff.
// The delta is signed, so callers can pass a negative value to decrease it.
// Does nothing while TabletCounters is null.
void THive::UpdateCounterNodesDown(i64 nodesDownDiff) {
    if (TabletCounters == nullptr) {
        return;
    }
    auto& nodesDown = TabletCounters->Simple()[NHive::COUNTER_NODES_DOWN];
    // Read-modify-write: the counter is only accessed through Get()/Set() here.
    nodesDown.Set(nodesDown.Get() + nodesDownDiff);
}
1722+
1723+
// Shifts the NHive::COUNTER_NODES_FROZEN simple counter by nodesFrozenDiff.
// The delta is signed, so callers can pass a negative value to decrease it.
// Does nothing while TabletCounters is null.
void THive::UpdateCounterNodesFrozen(i64 nodesFrozenDiff) {
    if (TabletCounters == nullptr) {
        return;
    }
    auto& nodesFrozen = TabletCounters->Simple()[NHive::COUNTER_NODES_FROZEN];
    // Read-modify-write: the counter is only accessed through Get()/Set() here.
    nodesFrozen.Set(nodesFrozen.Get() + nodesFrozenDiff);
}
1730+
17151731
void THive::RecordTabletMove(const TTabletMoveInfo& moveInfo) {
17161732
TabletMoveHistory.PushBack(moveInfo);
17171733
TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1);

ydb/core/mind/hive/hive_impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -679,6 +679,8 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
679679
void UpdateCounterTabletsStarting(i64 tabletsStartingDiff);
680680
void UpdateCounterPingQueueSize();
681681
void UpdateCounterTabletChannelHistorySize();
682+
// Adds the signed delta nodesDownDiff to the COUNTER_NODES_DOWN simple counter.
void UpdateCounterNodesDown(i64 nodesDownDiff);
683+
// Adds the signed delta nodesFrozenDiff to the COUNTER_NODES_FROZEN simple counter.
void UpdateCounterNodesFrozen(i64 nodesFrozenDiff);
682684
void RecordTabletMove(const TTabletMoveInfo& info);
683685
bool DomainHasNodes(const TSubDomainKey &domainKey) const;
684686
void ProcessBootQueue();

ydb/core/mind/hive/node_info.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,7 @@ void TNodeInfo::SendReconnect(const TActorId& local) {
360360
}
361361

362362
void TNodeInfo::SetDown(bool down) {
363+
Hive.UpdateCounterNodesDown(static_cast<i64>(down) - static_cast<i64>(Down));
363364
Down = down;
364365
if (Down) {
365366
Hive.ObjectDistributions.RemoveNode(*this);
@@ -370,6 +371,7 @@ void TNodeInfo::SetDown(bool down) {
370371
}
371372

372373
void TNodeInfo::SetFreeze(bool freeze) {
374+
Hive.UpdateCounterNodesFrozen(static_cast<i64>(freeze) - static_cast<i64>(Freeze));
373375
Freeze = freeze;
374376
if (Freeze) {
375377
for (const auto& [state, tablets] : Tablets) {

ydb/core/mind/hive/tx__load_everything.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,9 @@ class TTxLoadEverything : public TTransactionBase<THive> {
326326
// That was not persisted to avoid issues with downgrades
327327
node.Down = true;
328328
}
329+
if (node.Down) {
330+
Self->UpdateCounterNodesDown(+1);
331+
}
329332
if (nodeRowset.HaveValue<Schema::Node::Location>()) {
330333
auto location = nodeRowset.GetValue<Schema::Node::Location>();
331334
if (location.HasDataCenter()) {

ydb/core/protos/counters_hive.proto

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ enum ESimpleCounters {
3434
COUNTER_NODES_RECOMMENDED = 24 [(CounterOpts) = {Name: "NodesRecommended"}];
3535
COUNTER_NODES_RECOMMENDED_DRY_RUN = 25 [(CounterOpts) = {Name: "NodesRecommendedDryRun"}];
3636
COUNTER_AVG_CPU_UTILIZATION = 26 [(CounterOpts) = {Name: "AvgCPUUtilization"}];
37+
COUNTER_NODES_DOWN = 27 [(CounterOpts) = {Name: "NodesDown"}];
38+
COUNTER_NODES_FROZEN = 28 [(CounterOpts) = {Name: "NodesFrozen"}];
3739
}
3840

3941
enum ECumulativeCounters {

0 commit comments

Comments
 (0)