Restructure health check config: group thresholds under THealthCheckConfig.Thresholds and add a configurable Timeout (#15693)

StekPerepolnen · StekPerepolnen · commit 98298e9d4d6f · 2025-03-19T12:51:18.000+01:00
diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp
@@ -747,7 +747,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
 
     TTabletRequestsState TabletRequests;
 
-    TDuration Timeout = TDuration::MilliSeconds(20000);
+    TDuration Timeout = TDuration::MilliSeconds(HealthCheckConfig.GetTimeout()); // milliseconds from THealthCheckConfig.Timeout (proto default 20000); NOTE(review): default member initializers run in declaration order, so this assumes HealthCheckConfig is declared before Timeout — confirm
     static constexpr TStringBuf STATIC_STORAGE_POOL_NAME = "static";
 
     bool IsSpecificDatabaseFilter() const {
@@ -1509,7 +1509,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         for (const auto& [hiveId, hiveResponse] : HiveInfo) {
             if (hiveResponse.IsOk()) {
                 settings.AliveBarrier = TInstant::MilliSeconds(hiveResponse->Record.GetResponseTimestamp()) - TDuration::Minutes(5);
-                settings.MaxRestartsPerPeriod = HealthCheckConfig.GetTabletsRestartsPerPeriodOrangeThreshold();
+                settings.MaxRestartsPerPeriod = HealthCheckConfig.GetThresholds().GetTabletsRestartsOrange();
                 for (const NKikimrHive::TTabletInfo& hiveTablet : hiveResponse->Record.GetTablets()) {
                     TSubDomainKey tenantId = TSubDomainKey(hiveTablet.GetObjectDomain());
                     auto itDomain = FilterDomainKey.find(tenantId);
@@ -1735,9 +1735,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());
 
         TSelfCheckContext rrContext(&context, "NODE_UPTIME");
-        if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetNodeRestartsPerPeriodOrangeThreshold()) {
+        if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetThresholds().GetNodeRestartsOrange()) {
             rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Node is restarting too often", ETags::Uptime);
-        } else if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetNodeRestartsPerPeriodYellowThreshold()) {
+        } else if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetThresholds().GetNodeRestartsYellow()) {
             rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "The number of node restarts has increased", ETags::Uptime);
         } else {
             rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
@@ -1775,9 +1775,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
                 long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
                 TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs));
                 Ydb::Monitoring::StatusFlag::Status status;
-                if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetNodesTimeDifferenceUsOrangeThreshold())) {
+                if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange())) {
                     status = Ydb::Monitoring::StatusFlag::ORANGE;
-                } else if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetNodesTimeDifferenceUsYellowThreshold())) {
+                } else if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow())) {
                     status = Ydb::Monitoring::StatusFlag::YELLOW;
                 } else {
                     status = Ydb::Monitoring::StatusFlag::GREEN;
diff --git a/ydb/core/health_check/health_check_ut.cpp b/ydb/core/health_check/health_check_ut.cpp
@@ -1982,8 +1982,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
     void ChangeNodeRestartsPerPeriod(TTestActorRuntime &runtime, const TActorId& sender, const ui32 restartsYellow, const ui32 restartsOrange) {
         NKikimrConfig::TAppConfig ext;
         auto &cfg = *ext.MutableHealthCheckConfig();
-        cfg.SetNodeRestartsPerPeriodYellowThreshold(restartsYellow);
-        cfg.SetNodeRestartsPerPeriodOrangeThreshold(restartsOrange);
+        cfg.MutableThresholds()->SetNodeRestartsYellow(restartsYellow); // NODE_UPTIME reports YELLOW when node restarts >= this value (and below the orange threshold)
+        cfg.MutableThresholds()->SetNodeRestartsOrange(restartsOrange); // NODE_UPTIME reports ORANGE when node restarts >= this value
         SendHealthCheckConfigUpdate(runtime, sender, cfg);
     }
 
diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto
@@ -1770,11 +1770,15 @@ message THiveConfig {
 }
 
 message THealthCheckConfig {
-    optional uint32 NodeRestartsPerPeriodYellowThreshold = 1 [default = 10];
-    optional uint32 NodeRestartsPerPeriodOrangeThreshold = 2 [default = 30];
-    optional uint64 NodesTimeDifferenceUsYellowThreshold = 3 [default = 5000];
-    optional uint64 NodesTimeDifferenceUsOrangeThreshold = 4 [default = 25000];
-    optional uint32 TabletsRestartsPerPeriodOrangeThreshold = 5 [default = 30];
+    message TThresholds {
+        optional uint32 NodeRestartsYellow = 1 [default = 10]; // node restarts per period (see HiveConfig.NodeRestartWatchPeriod); NODE_UPTIME is YELLOW when restarts >= this value
+        optional uint32 NodeRestartsOrange = 2 [default = 30]; // node restarts per period (see HiveConfig.NodeRestartWatchPeriod); NODE_UPTIME is ORANGE when restarts >= this value
+        optional uint64 NodesTimeDifferenceYellow = 3 [default = 5000]; // microseconds; max clock skew with a peer strictly above this yields YELLOW
+        optional uint64 NodesTimeDifferenceOrange = 4 [default = 25000]; // microseconds; max clock skew with a peer strictly above this yields ORANGE
+        optional uint32 TabletsRestartsOrange = 5 [default = 30]; // per period, see HiveConfig.TabletRestartWatchPeriod; used as MaxRestartsPerPeriod when evaluating Hive tablets
+    }
+    optional TThresholds Thresholds = 1; // NOTE(review): tag 1 was the uint32 NodeRestartsPerPeriodYellowThreshold; reusing it for a message changes the wire type — confirm no old serialized configs are read
+    optional uint32 Timeout = 2 [default = 20000]; // milliseconds; NOTE(review): tag 2 was NodeRestartsPerPeriodOrangeThreshold (also uint32), so an old stored value would parse as Timeout — confirm
 }
 
 message TBlobCacheConfig {