Skip to content

Commit 98298e9

Browse files
changed healthcheck config (#15693)
1 parent 0367e33 commit 98298e9

File tree

3 files changed

+17
-13
lines changed

3 files changed

+17
-13
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -747,7 +747,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
747747

748748
TTabletRequestsState TabletRequests;
749749

750-
TDuration Timeout = TDuration::MilliSeconds(20000);
750+
TDuration Timeout = TDuration::MilliSeconds(HealthCheckConfig.GetTimeout());
751751
static constexpr TStringBuf STATIC_STORAGE_POOL_NAME = "static";
752752

753753
bool IsSpecificDatabaseFilter() const {
@@ -1509,7 +1509,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15091509
for (const auto& [hiveId, hiveResponse] : HiveInfo) {
15101510
if (hiveResponse.IsOk()) {
15111511
settings.AliveBarrier = TInstant::MilliSeconds(hiveResponse->Record.GetResponseTimestamp()) - TDuration::Minutes(5);
1512-
settings.MaxRestartsPerPeriod = HealthCheckConfig.GetTabletsRestartsPerPeriodOrangeThreshold();
1512+
settings.MaxRestartsPerPeriod = HealthCheckConfig.GetThresholds().GetTabletsRestartsOrange();
15131513
for (const NKikimrHive::TTabletInfo& hiveTablet : hiveResponse->Record.GetTablets()) {
15141514
TSubDomainKey tenantId = TSubDomainKey(hiveTablet.GetObjectDomain());
15151515
auto itDomain = FilterDomainKey.find(tenantId);
@@ -1735,9 +1735,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
17351735
FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());
17361736

17371737
TSelfCheckContext rrContext(&context, "NODE_UPTIME");
1738-
if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetNodeRestartsPerPeriodOrangeThreshold()) {
1738+
if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetThresholds().GetNodeRestartsOrange()) {
17391739
rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Node is restarting too often", ETags::Uptime);
1740-
} else if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetNodeRestartsPerPeriodYellowThreshold()) {
1740+
} else if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetThresholds().GetNodeRestartsYellow()) {
17411741
rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "The number of node restarts has increased", ETags::Uptime);
17421742
} else {
17431743
rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
@@ -1775,9 +1775,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
17751775
long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
17761776
TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs));
17771777
Ydb::Monitoring::StatusFlag::Status status;
1778-
if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetNodesTimeDifferenceUsOrangeThreshold())) {
1778+
if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange())) {
17791779
status = Ydb::Monitoring::StatusFlag::ORANGE;
1780-
} else if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetNodesTimeDifferenceUsYellowThreshold())) {
1780+
} else if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow())) {
17811781
status = Ydb::Monitoring::StatusFlag::YELLOW;
17821782
} else {
17831783
status = Ydb::Monitoring::StatusFlag::GREEN;

ydb/core/health_check/health_check_ut.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1982,8 +1982,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
19821982
void ChangeNodeRestartsPerPeriod(TTestActorRuntime &runtime, const TActorId& sender, const ui32 restartsYellow, const ui32 restartsOrange) {
19831983
NKikimrConfig::TAppConfig ext;
19841984
auto &cfg = *ext.MutableHealthCheckConfig();
1985-
cfg.SetNodeRestartsPerPeriodYellowThreshold(restartsYellow);
1986-
cfg.SetNodeRestartsPerPeriodOrangeThreshold(restartsOrange);
1985+
cfg.MutableThresholds()->SetNodeRestartsYellow(restartsYellow);
1986+
cfg.MutableThresholds()->SetNodeRestartsOrange(restartsOrange);
19871987
SendHealthCheckConfigUpdate(runtime, sender, cfg);
19881988
}
19891989

ydb/core/protos/config.proto

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1770,11 +1770,15 @@ message THiveConfig {
17701770
}
17711771

17721772
message THealthCheckConfig {
1773-
optional uint32 NodeRestartsPerPeriodYellowThreshold = 1 [default = 10];
1774-
optional uint32 NodeRestartsPerPeriodOrangeThreshold = 2 [default = 30];
1775-
optional uint64 NodesTimeDifferenceUsYellowThreshold = 3 [default = 5000];
1776-
optional uint64 NodesTimeDifferenceUsOrangeThreshold = 4 [default = 25000];
1777-
optional uint32 TabletsRestartsPerPeriodOrangeThreshold = 5 [default = 30];
1773+
message TThresholds {
1774+
optional uint32 NodeRestartsYellow = 1 [default = 10]; // per period, see HiveConfig.NodeRestartWatchPeriod
1775+
optional uint32 NodeRestartsOrange = 2 [default = 30]; // per period, see HiveConfig.NodeRestartWatchPeriod
1776+
optional uint64 NodesTimeDifferenceYellow = 3 [default = 5000]; // microseconds
1777+
optional uint64 NodesTimeDifferenceOrange = 4 [default = 25000]; // microseconds
1778+
optional uint32 TabletsRestartsOrange = 5 [default = 30]; // per period, see HiveConfig.TabletRestartWatchPeriod
1779+
}
1780+
optional TThresholds Thresholds = 1;
1781+
optional uint32 Timeout = 2 [default = 20000]; // milliseconds
17781782
}
17791783

17801784
message TBlobCacheConfig {

0 commit comments

Comments
 (0)