16
16
#include < ydb/core/base/path.h>
17
17
#include < ydb/core/base/statestorage.h>
18
18
#include < ydb/core/base/tablet_pipe.h>
19
+ #include < ydb/core/cms/console/configs_dispatcher.h>
19
20
#include < ydb/core/mon/mon.h>
20
21
#include < ydb/core/base/nameservice.h>
21
22
#include < ydb/core/blobstorage/base/blobstorage_events.h>
28
29
#include < ydb/core/util/tuples.h>
29
30
30
31
#include < ydb/core/protos/blobstorage_distributed_config.pb.h>
32
+ #include < ydb/core/protos/config.pb.h>
31
33
#include < ydb/core/sys_view/common/events.h>
32
34
33
35
#include < ydb/public/api/grpc/ydb_monitoring_v1.grpc.pb.h>
@@ -121,11 +123,12 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
121
123
ui64 Cookie;
122
124
NWilson::TSpan Span;
123
125
124
- TSelfCheckRequest (const TActorId& sender, THolder<TEvSelfCheckRequest> request, ui64 cookie, NWilson::TTraceId&& traceId)
126
+ TSelfCheckRequest (const TActorId& sender, THolder<TEvSelfCheckRequest> request, ui64 cookie, NWilson::TTraceId&& traceId, const NKikimrConfig::THealthCheckConfig& config )
125
127
: Sender(sender)
126
128
, Request(std::move(request))
127
129
, Cookie(cookie)
128
130
, Span(TComponentTracingLevels::TTablet::Basic, std::move(traceId), " health_check" , NWilson::EFlags::AUTO_END)
131
+ , HealthCheckConfig(config)
129
132
{}
130
133
131
134
using TGroupId = ui32;
@@ -163,7 +166,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
163
166
struct TNodeTabletState {
164
167
struct TTabletStateSettings {
165
168
TInstant AliveBarrier;
166
- ui32 MaxRestartsPerPeriod = 30 ; // per hour
169
+ ui32 MaxRestartsPerPeriod; // per hour
167
170
ui32 MaxTabletIdsStored = 10 ;
168
171
bool ReportGoodTabletsIds = false ;
169
172
};
@@ -266,6 +269,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
266
269
TString ErasureSpecies;
267
270
std::vector<const NKikimrSysView::TVSlotEntry*> VSlots;
268
271
ui32 Generation;
272
+ bool LayoutCorrect = true ;
269
273
};
270
274
271
275
struct TSelfCheckResult {
@@ -647,6 +651,8 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
647
651
std::optional<TRequestResponse<TEvStateStorage::TEvBoardInfo>> DatabaseBoardInfo;
648
652
THashSet<TNodeId> UnknownStaticGroups;
649
653
654
+ const NKikimrConfig::THealthCheckConfig& HealthCheckConfig;
655
+
650
656
std::vector<TNodeId> SubscribedNodeIds;
651
657
THashSet<TNodeId> StorageNodeIds;
652
658
THashSet<TNodeId> ComputeNodeIds;
@@ -742,7 +748,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
742
748
743
749
TTabletRequestsState TabletRequests;
744
750
745
- TDuration Timeout = TDuration::MilliSeconds(20000 );
751
+ TDuration Timeout = TDuration::MilliSeconds(HealthCheckConfig.GetTimeout() );
746
752
static constexpr TStringBuf STATIC_STORAGE_POOL_NAME = " static" ;
747
753
748
754
bool IsSpecificDatabaseFilter () const {
@@ -1504,6 +1510,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1504
1510
for (const auto & [hiveId, hiveResponse] : HiveInfo) {
1505
1511
if (hiveResponse.IsOk ()) {
1506
1512
settings.AliveBarrier = TInstant::MilliSeconds (hiveResponse->Record .GetResponseTimestamp ()) - TDuration::Minutes (5 );
1513
+ settings.MaxRestartsPerPeriod = HealthCheckConfig.GetThresholds ().GetTabletsRestartsOrange ();
1507
1514
for (const NKikimrHive::TTabletInfo& hiveTablet : hiveResponse->Record .GetTablets ()) {
1508
1515
TSubDomainKey tenantId = TSubDomainKey (hiveTablet.GetObjectDomain ());
1509
1516
auto itDomain = FilterDomainKey.find (tenantId);
@@ -1569,6 +1576,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1569
1576
auto & groupState = GroupState[groupId];
1570
1577
groupState.ErasureSpecies = group.GetInfo ().GetErasureSpeciesV2 ();
1571
1578
groupState.Generation = group.GetInfo ().GetGeneration ();
1579
+ groupState.LayoutCorrect = group.GetInfo ().GetLayoutCorrect ();
1572
1580
StoragePoolState[poolId].Groups .emplace (groupId);
1573
1581
}
1574
1582
for (const auto & vSlot : VSlots->Get ()->Record .GetEntries ()) {
@@ -1729,9 +1737,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1729
1737
FillNodeInfo (nodeId, context.Location .mutable_compute ()->mutable_node ());
1730
1738
1731
1739
TSelfCheckContext rrContext (&context, " NODE_UPTIME" );
1732
- if (databaseState.NodeRestartsPerPeriod [nodeId] >= 30 ) {
1740
+ if (databaseState.NodeRestartsPerPeriod [nodeId] >= HealthCheckConfig. GetThresholds (). GetNodeRestartsOrange () ) {
1733
1741
rrContext.ReportStatus (Ydb::Monitoring::StatusFlag::ORANGE, " Node is restarting too often" , ETags::Uptime);
1734
- } else if (databaseState.NodeRestartsPerPeriod [nodeId] >= 10 ) {
1742
+ } else if (databaseState.NodeRestartsPerPeriod [nodeId] >= HealthCheckConfig. GetThresholds (). GetNodeRestartsYellow () ) {
1735
1743
rrContext.ReportStatus (Ydb::Monitoring::StatusFlag::YELLOW, " The number of node restarts has increased" , ETags::Uptime);
1736
1744
} else {
1737
1745
rrContext.ReportStatus (Ydb::Monitoring::StatusFlag::GREEN);
@@ -1769,9 +1777,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1769
1777
long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs ();
1770
1778
TDuration timeDifferenceDuration = TDuration::MicroSeconds (abs (timeDifferenceUs));
1771
1779
Ydb::Monitoring::StatusFlag::Status status;
1772
- if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME ) {
1780
+ if (timeDifferenceDuration > TDuration::MicroSeconds (HealthCheckConfig. GetThresholds (). GetNodesTimeDifferenceOrange ()) ) {
1773
1781
status = Ydb::Monitoring::StatusFlag::ORANGE;
1774
- } else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME ) {
1782
+ } else if (timeDifferenceDuration > TDuration::MicroSeconds (HealthCheckConfig. GetThresholds (). GetNodesTimeDifferenceYellow ()) ) {
1775
1783
status = Ydb::Monitoring::StatusFlag::YELLOW;
1776
1784
} else {
1777
1785
status = Ydb::Monitoring::StatusFlag::GREEN;
@@ -2343,6 +2351,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
2343
2351
2344
2352
class TGroupChecker {
2345
2353
TString ErasureSpecies;
2354
+ bool LayoutCorrect;
2346
2355
int FailedDisks = 0 ;
2347
2356
std::array<int , Ydb::Monitoring::StatusFlag::Status_ARRAYSIZE> DisksColors = {};
2348
2357
TStackVec<std::pair<ui32, int >> FailedRealms;
@@ -2359,7 +2368,10 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
2359
2368
}
2360
2369
2361
2370
public:
2362
- TGroupChecker (const TString& erasure) : ErasureSpecies(erasure) {}
2371
+ TGroupChecker (const TString& erasure, const bool layoutCorrect = true )
2372
+ : ErasureSpecies(erasure)
2373
+ , LayoutCorrect(layoutCorrect)
2374
+ {}
2363
2375
2364
2376
void AddVDiskStatus (Ydb::Monitoring::StatusFlag::Status status, ui32 realm) {
2365
2377
++DisksColors[status];
@@ -2378,6 +2390,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
2378
2390
2379
2391
void ReportStatus (TSelfCheckContext& context) const {
2380
2392
context.OverallStatus = Ydb::Monitoring::StatusFlag::GREEN;
2393
+ if (!LayoutCorrect) {
2394
+ context.ReportStatus (Ydb::Monitoring::StatusFlag::ORANGE, " Group layout is incorrect" , ETags::GroupState);
2395
+ }
2381
2396
if (ErasureSpecies == NONE) {
2382
2397
if (FailedDisks > 0 ) {
2383
2398
context.ReportStatus (Ydb::Monitoring::StatusFlag::RED, " Group failed" , ETags::GroupState, {ETags::VDiskState});
@@ -2727,7 +2742,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
2727
2742
return ;
2728
2743
}
2729
2744
2730
- TGroupChecker checker (itGroup->second .ErasureSpecies );
2745
+ TGroupChecker checker (itGroup->second .ErasureSpecies , itGroup-> second . LayoutCorrect );
2731
2746
const auto & slots = itGroup->second .VSlots ;
2732
2747
for (const auto * slot : slots) {
2733
2748
const auto & slotInfo = slot->GetInfo ();
@@ -2921,9 +2936,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
2921
2936
}
2922
2937
}
2923
2938
2924
- const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000 );
2925
- const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000 );
2926
-
2927
2939
void FillResult (TOverallStateContext context) {
2928
2940
if (IsSpecificDatabaseFilter ()) {
2929
2941
FillDatabaseResult (context, FilterDatabase, DatabaseState[FilterDatabase]);
@@ -3252,12 +3264,16 @@ void TNodeCheckRequest<NMon::TEvHttpInfo>::Bootstrap() {
3252
3264
class THealthCheckService : public TActorBootstrapped <THealthCheckService> {
3253
3265
public:
3254
3266
static constexpr NKikimrServices::TActivity::EType ActorActivityType () { return NKikimrServices::TActivity::MONITORING_SERVICE; }
3267
+ NKikimrConfig::THealthCheckConfig HealthCheckConfig;
3255
3268
3256
3269
THealthCheckService ()
3257
3270
{
3258
3271
}
3259
3272
3260
3273
void Bootstrap () {
3274
+ HealthCheckConfig.CopyFrom (AppData ()->HealthCheckConfig );
3275
+ Send (NConsole::MakeConfigsDispatcherID (SelfId ().NodeId ()),
3276
+ new NConsole::TEvConfigsDispatcher::TEvSetConfigSubscriptionRequest ({NKikimrConsole::TConfigItem::HealthCheckConfigItem}));
3261
3277
TMon* mon = AppData ()->Mon ;
3262
3278
if (mon) {
3263
3279
mon->RegisterActorPage ({
@@ -3270,8 +3286,16 @@ class THealthCheckService : public TActorBootstrapped<THealthCheckService> {
3270
3286
Become (&THealthCheckService::StateWork);
3271
3287
}
3272
3288
3289
+ void Handle (NConsole::TEvConsole::TEvConfigNotificationRequest::TPtr& ev) {
3290
+ const auto & record = ev->Get ()->Record ;
3291
+ if (record.GetConfig ().HasHealthCheckConfig ()) {
3292
+ HealthCheckConfig.CopyFrom (record.GetConfig ().GetHealthCheckConfig ());
3293
+ }
3294
+ Send (ev->Sender , new NConsole::TEvConsole::TEvConfigNotificationResponse (record), 0 , ev->Cookie );
3295
+ }
3296
+
3273
3297
// Spawns a TSelfCheckRequest actor for the incoming self-check event, handing
// over the event payload, cookie, trace id and the service's current health
// check settings.
void Handle(TEvSelfCheckRequest::TPtr& ev) {
    Register(new TSelfCheckRequest(
        ev->Sender,
        ev.Get()->Release(),
        ev->Cookie,
        std::move(ev->TraceId),
        HealthCheckConfig));
}
3276
3300
3277
3301
std::shared_ptr<NYdbGrpc::TGRpcClientLow> GRpcClientLow;
@@ -3299,6 +3323,7 @@ class THealthCheckService : public TActorBootstrapped<THealthCheckService> {
3299
3323
hFunc (TEvSelfCheckRequest, Handle);
3300
3324
hFunc (TEvNodeCheckRequest, Handle);
3301
3325
hFunc (NMon::TEvHttpInfo, Handle);
3326
+ hFunc (NConsole::TEvConsole::TEvConfigNotificationRequest, Handle);
3302
3327
cFunc (TEvents::TSystem::PoisonPill, PassAway);
3303
3328
}
3304
3329
}
0 commit comments