Skip to content

Commit 9ef3588

Browse files
committed
add sensor for down nodes (#11592)
1 parent 4b8e06e commit 9ef3588

File tree

5 files changed

+25
-0
lines changed

5 files changed

+25
-0
lines changed

ydb/core/mind/hive/hive_impl.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1708,6 +1708,22 @@ void THive::UpdateCounterTabletChannelHistorySize() {
17081708
}
17091709
}
17101710

1711+
// Applies a signed delta to the COUNTER_NODES_DOWN simple counter.
// Does nothing when TabletCounters is null.
void THive::UpdateCounterNodesDown(i64 nodesDownDiff) {
    if (TabletCounters == nullptr) {
        return;
    }
    auto& nodesDown = TabletCounters->Simple()[NHive::COUNTER_NODES_DOWN];
    // Read-modify-write: the counter is adjusted relative to its current value.
    nodesDown.Set(nodesDown.Get() + nodesDownDiff);
}
1718+
1719+
// Applies a signed delta to the COUNTER_NODES_FROZEN simple counter.
// Does nothing when TabletCounters is null.
void THive::UpdateCounterNodesFrozen(i64 nodesFrozenDiff) {
    if (TabletCounters == nullptr) {
        return;
    }
    auto& nodesFrozen = TabletCounters->Simple()[NHive::COUNTER_NODES_FROZEN];
    // Read-modify-write: the counter is adjusted relative to its current value.
    nodesFrozen.Set(nodesFrozen.Get() + nodesFrozenDiff);
}
1726+
17111727
void THive::RecordTabletMove(const TTabletMoveInfo& moveInfo) {
17121728
TabletMoveHistory.PushBack(moveInfo);
17131729
TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1);

ydb/core/mind/hive/hive_impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,8 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
678678
void UpdateCounterTabletsStarting(i64 tabletsStartingDiff);
679679
void UpdateCounterPingQueueSize();
680680
void UpdateCounterTabletChannelHistorySize();
681+
void UpdateCounterNodesDown(i64 nodesDownDiff);
682+
void UpdateCounterNodesFrozen(i64 nodesFrozenDiff);
681683
void RecordTabletMove(const TTabletMoveInfo& info);
682684
bool DomainHasNodes(const TSubDomainKey &domainKey) const;
683685
void ProcessBootQueue();

ydb/core/mind/hive/node_info.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,7 @@ void TNodeInfo::SendReconnect(const TActorId& local) {
360360
}
361361

362362
void TNodeInfo::SetDown(bool down) {
363+
Hive.UpdateCounterNodesDown(static_cast<i64>(down) - static_cast<i64>(Down));
363364
Down = down;
364365
if (Down) {
365366
Hive.ObjectDistributions.RemoveNode(*this);
@@ -370,6 +371,7 @@ void TNodeInfo::SetDown(bool down) {
370371
}
371372

372373
void TNodeInfo::SetFreeze(bool freeze) {
374+
Hive.UpdateCounterNodesFrozen(static_cast<i64>(freeze) - static_cast<i64>(Freeze));
373375
Freeze = freeze;
374376
if (Freeze) {
375377
for (const auto& [state, tablets] : Tablets) {

ydb/core/mind/hive/tx__load_everything.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,9 @@ class TTxLoadEverything : public TTransactionBase<THive> {
326326
// That was not persisted to avoid issues with downgrades
327327
node.Down = true;
328328
}
329+
if (node.Down) {
330+
Self->UpdateCounterNodesDown(+1);
331+
}
329332
if (nodeRowset.HaveValue<Schema::Node::Location>()) {
330333
auto location = nodeRowset.GetValue<Schema::Node::Location>();
331334
if (location.HasDataCenter()) {

ydb/core/protos/counters_hive.proto

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ enum ESimpleCounters {
3434
COUNTER_NODES_RECOMMENDED = 24 [(CounterOpts) = {Name: "NodesRecommended"}];
3535
COUNTER_NODES_RECOMMENDED_DRY_RUN = 25 [(CounterOpts) = {Name: "NodesRecommendedDryRun"}];
3636
COUNTER_AVG_CPU_UTILIZATION = 26 [(CounterOpts) = {Name: "AvgCPUUtilization"}];
37+
COUNTER_NODES_DOWN = 27 [(CounterOpts) = {Name: "NodesDown"}];
38+
COUNTER_NODES_FROZEN = 28 [(CounterOpts) = {Name: "NodesFrozen"}];
3739
}
3840

3941
enum ECumulativeCounters {

0 commit comments

Comments
 (0)