Skip to content

Commit fc17805

Browse files
Merge pull request #16023 from StekPerepolnen/healthcheck-merge/stable-25-1
healthcheck merge/stable 25 1
2 parents b605930 + b9128ab commit fc17805

25 files changed

+493
-108
lines changed

ydb/core/base/appdata.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -70,6 +70,7 @@ struct TAppData::TImpl {
7070
NKikimrReplication::TReplicationDefaults ReplicationConfig;
7171
NKikimrProto::TDataIntegrityTrailsConfig DataIntegrityTrailsConfig;
7272
NKikimrConfig::TDataErasureConfig DataErasureConfig;
73+
NKikimrConfig::THealthCheckConfig HealthCheckConfig;
7374
};
7475

7576
TAppData::TAppData(
@@ -127,6 +128,7 @@ TAppData::TAppData(
127128
, ReplicationConfig(Impl->ReplicationConfig)
128129
, DataIntegrityTrailsConfig(Impl->DataIntegrityTrailsConfig)
129130
, DataErasureConfig(Impl->DataErasureConfig)
131+
, HealthCheckConfig(Impl->HealthCheckConfig)
130132
, KikimrShouldContinue(kikimrShouldContinue)
131133
, TracingConfigurator(MakeIntrusive<NJaegerTracing::TSamplingThrottlingConfigurator>(TimeProvider, RandomProvider))
132134
{}

ydb/core/base/appdata_fwd.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ namespace NKikimrConfig {
7373
class TMetadataCacheConfig;
7474
class TMemoryControllerConfig;
7575
class TFeatureFlags;
76+
class THealthCheckConfig;
7677
}
7778

7879
namespace NKikimrReplication {
@@ -242,6 +243,7 @@ struct TAppData {
242243
NKikimrReplication::TReplicationDefaults& ReplicationConfig;
243244
NKikimrProto::TDataIntegrityTrailsConfig& DataIntegrityTrailsConfig;
244245
NKikimrConfig::TDataErasureConfig& DataErasureConfig;
246+
NKikimrConfig::THealthCheckConfig& HealthCheckConfig;
245247
bool EnforceUserTokenRequirement = false;
246248
bool EnforceUserTokenCheckRequirement = false; // check token if it was specified
247249
bool AllowHugeKeyValueDeletes = true; // delete when all clients limit deletes per request

ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp

Lines changed: 44 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
2525
}
2626
}
2727

28-
void CreateEnv(std::unique_ptr<TEnvironmentSetup>& env, std::vector<TNodeLocation>& locations) {
29-
TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc;
28+
void CreateEnv(std::unique_ptr<TEnvironmentSetup>& env, std::vector<TNodeLocation>& locations,
29+
TBlobStorageGroupType groupType) {
3030
const ui32 numNodes = locations.size();
3131

3232
env.reset(new TEnvironmentSetup({
@@ -37,39 +37,49 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
3737

3838
const ui32 disksPerNode = 1;
3939
const ui32 slotsPerDisk = 3;
40+
41+
env->Runtime->FilterFunction = CatchSanitizeRequests;
4042
env->CreateBoxAndPool(disksPerNode, numNodes * disksPerNode * slotsPerDisk / 9);
43+
env->Runtime->FilterFunction = {};
4144
}
4245

43-
Y_UNIT_TEST(Test3dc) {
46+
NActorsInterconnect::TNodeLocation LocationGenerator(ui32 dc, ui32 rack, ui32 unit) {
47+
NActorsInterconnect::TNodeLocation proto;
48+
proto.SetDataCenter(ToString(dc));
49+
proto.SetRack(ToString(rack));
50+
proto.SetUnit(ToString(unit));
51+
return proto;
52+
}
53+
54+
void Test(TBlobStorageGroupType groupType, ui32 dcs, ui32 racks, ui32 units) {
4455
std::vector<TNodeLocation> locations;
45-
TLocationGenerator locationGenerator = [](ui32 dc, ui32 rack, ui32 unit) {
46-
NActorsInterconnect::TNodeLocation proto;
47-
proto.SetDataCenter(ToString(dc));
48-
proto.SetRack(ToString(rack));
49-
proto.SetUnit(ToString(unit));
50-
return proto;
51-
};
5256

53-
MakeLocations(locations, 3, 5, 1, locationGenerator);
57+
MakeLocations(locations, dcs, racks, units, LocationGenerator);
5458
std::unique_ptr<TEnvironmentSetup> env;
55-
CreateEnv(env, locations);
5659

57-
TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc;
58-
TGroupGeometryInfo geom = CreateGroupGeometry(groupType);
60+
CreateEnv(env, locations, groupType);
5961

62+
63+
// Ensure that the sanitizer doesn't send requests to initially allocated groups
6064
env->Runtime->FilterFunction = CatchSanitizeRequests;
65+
env->UpdateSettings(true, false, true);
66+
env->Sim(TDuration::Minutes(3));
67+
env->UpdateSettings(false, false, false);
68+
69+
TGroupGeometryInfo geom = CreateGroupGeometry(groupType);
6170

6271
TString error;
6372
auto cfg = env->FetchBaseConfig();
6473
UNIT_ASSERT_C(CheckBaseConfigLayout(geom, cfg, true, error), error);
65-
env->Cleanup();
6674

6775
// Shuffle node locations and make sure that a layout error occurred
68-
std::random_shuffle(locations.begin(), locations.end());
69-
env->Initialize();
70-
env->Sim(TDuration::Seconds(100));
71-
cfg = env->FetchBaseConfig();
72-
CheckBaseConfigLayout(geom, cfg, true, error);
76+
do {
77+
env->Cleanup();
78+
std::random_shuffle(locations.begin(), locations.end());
79+
env->Initialize();
80+
env->Sim(TDuration::Seconds(100));
81+
cfg = env->FetchBaseConfig();
82+
} while (CheckBaseConfigLayout(geom, cfg, true, error));
7383
Cerr << error << Endl;
7484

7585
// Sanitize groups
@@ -86,6 +96,18 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
8696
UNIT_ASSERT_C(CheckBaseConfigLayout(geom, cfg, true, error), error);
8797
}
8898

99+
Y_UNIT_TEST(Test3dc) {
100+
Test(TBlobStorageGroupType::ErasureMirror3dc, 3, 5, 1);
101+
}
102+
103+
Y_UNIT_TEST(TestBlock4Plus2) {
104+
Test(TBlobStorageGroupType::Erasure4Plus2Block, 1, 10, 2);
105+
}
106+
107+
Y_UNIT_TEST(TestMirror3of4) {
108+
Test(TBlobStorageGroupType::ErasureMirror3of4, 1, 10, 2);
109+
}
110+
89111
TString PrintGroups(TBlobStorageGroupType groupType, const NKikimrBlobStorage::TBaseConfig& cfg,
90112
std::vector<TNodeLocation> locations) {
91113
TGroupGeometryInfo geom = CreateGroupGeometry(groupType);
@@ -137,6 +159,7 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
137159
}
138160

139161
void TestMultipleRealmsOccupation(bool allowMultipleRealmsOccupation) {
162+
TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc;
140163
std::vector<TNodeLocation> locations;
141164
TLocationGenerator locationGenerator = [](ui32 dc, ui32 rack, ui32 unit) {
142165
NActorsInterconnect::TNodeLocation proto;
@@ -152,9 +175,8 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
152175
};
153176
MakeLocations(locations, 4, 5, 1, locationGenerator);
154177
std::unique_ptr<TEnvironmentSetup> env;
155-
CreateEnv(env, locations);
178+
CreateEnv(env, locations, groupType);
156179

157-
TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc;
158180
TGroupGeometryInfo geom = CreateGroupGeometry(groupType);
159181

160182
env->Runtime->FilterFunction = CatchSanitizeRequests;

ydb/core/cms/console/configs_dispatcher.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ const THashSet<ui32> DYNAMIC_KINDS({
6767
(ui32)NKikimrConsole::TConfigItem::BlobStorageConfigItem,
6868
(ui32)NKikimrConsole::TConfigItem::MetadataCacheConfigItem,
6969
(ui32)NKikimrConsole::TConfigItem::MemoryControllerConfigItem,
70+
(ui32)NKikimrConsole::TConfigItem::HealthCheckConfigItem,
7071
});
7172

7273
const THashSet<ui32> NON_YAML_KINDS({

ydb/core/driver_lib/run/run.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1205,6 +1205,10 @@ void TKikimrRunner::InitializeAppData(const TKikimrRunConfig& runConfig)
12051205
AppData->ReplicationConfig = runConfig.AppConfig.GetReplicationConfig();
12061206
}
12071207

1208+
if (runConfig.AppConfig.HasHealthCheckConfig()) {
1209+
AppData->HealthCheckConfig = runConfig.AppConfig.GetHealthCheckConfig();
1210+
}
1211+
12081212
// setup resource profiles
12091213
AppData->ResourceProfiles = new TResourceProfiles;
12101214
if (runConfig.AppConfig.GetBootstrapConfig().ResourceProfilesSize())

ydb/core/health_check/health_check.cpp

Lines changed: 38 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <ydb/core/base/path.h>
1717
#include <ydb/core/base/statestorage.h>
1818
#include <ydb/core/base/tablet_pipe.h>
19+
#include <ydb/core/cms/console/configs_dispatcher.h>
1920
#include <ydb/core/mon/mon.h>
2021
#include <ydb/core/base/nameservice.h>
2122
#include <ydb/core/blobstorage/base/blobstorage_events.h>
@@ -28,6 +29,7 @@
2829
#include <ydb/core/util/tuples.h>
2930

3031
#include <ydb/core/protos/blobstorage_distributed_config.pb.h>
32+
#include <ydb/core/protos/config.pb.h>
3133
#include <ydb/core/sys_view/common/events.h>
3234

3335
#include <ydb/public/api/grpc/ydb_monitoring_v1.grpc.pb.h>
@@ -121,11 +123,12 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
121123
ui64 Cookie;
122124
NWilson::TSpan Span;
123125

124-
TSelfCheckRequest(const TActorId& sender, THolder<TEvSelfCheckRequest> request, ui64 cookie, NWilson::TTraceId&& traceId)
126+
TSelfCheckRequest(const TActorId& sender, THolder<TEvSelfCheckRequest> request, ui64 cookie, NWilson::TTraceId&& traceId, const NKikimrConfig::THealthCheckConfig& config)
125127
: Sender(sender)
126128
, Request(std::move(request))
127129
, Cookie(cookie)
128130
, Span(TComponentTracingLevels::TTablet::Basic, std::move(traceId), "health_check", NWilson::EFlags::AUTO_END)
131+
, HealthCheckConfig(config)
129132
{}
130133

131134
using TGroupId = ui32;
@@ -163,7 +166,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
163166
struct TNodeTabletState {
164167
struct TTabletStateSettings {
165168
TInstant AliveBarrier;
166-
ui32 MaxRestartsPerPeriod = 30; // per hour
169+
ui32 MaxRestartsPerPeriod; // per hour
167170
ui32 MaxTabletIdsStored = 10;
168171
bool ReportGoodTabletsIds = false;
169172
};
@@ -266,6 +269,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
266269
TString ErasureSpecies;
267270
std::vector<const NKikimrSysView::TVSlotEntry*> VSlots;
268271
ui32 Generation;
272+
bool LayoutCorrect = true;
269273
};
270274

271275
struct TSelfCheckResult {
@@ -647,6 +651,8 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
647651
std::optional<TRequestResponse<TEvStateStorage::TEvBoardInfo>> DatabaseBoardInfo;
648652
THashSet<TNodeId> UnknownStaticGroups;
649653

654+
const NKikimrConfig::THealthCheckConfig& HealthCheckConfig;
655+
650656
std::vector<TNodeId> SubscribedNodeIds;
651657
THashSet<TNodeId> StorageNodeIds;
652658
THashSet<TNodeId> ComputeNodeIds;
@@ -742,7 +748,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
742748

743749
TTabletRequestsState TabletRequests;
744750

745-
TDuration Timeout = TDuration::MilliSeconds(20000);
751+
TDuration Timeout = TDuration::MilliSeconds(HealthCheckConfig.GetTimeout());
746752
static constexpr TStringBuf STATIC_STORAGE_POOL_NAME = "static";
747753

748754
bool IsSpecificDatabaseFilter() const {
@@ -1504,6 +1510,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15041510
for (const auto& [hiveId, hiveResponse] : HiveInfo) {
15051511
if (hiveResponse.IsOk()) {
15061512
settings.AliveBarrier = TInstant::MilliSeconds(hiveResponse->Record.GetResponseTimestamp()) - TDuration::Minutes(5);
1513+
settings.MaxRestartsPerPeriod = HealthCheckConfig.GetThresholds().GetTabletsRestartsOrange();
15071514
for (const NKikimrHive::TTabletInfo& hiveTablet : hiveResponse->Record.GetTablets()) {
15081515
TSubDomainKey tenantId = TSubDomainKey(hiveTablet.GetObjectDomain());
15091516
auto itDomain = FilterDomainKey.find(tenantId);
@@ -1569,6 +1576,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15691576
auto& groupState = GroupState[groupId];
15701577
groupState.ErasureSpecies = group.GetInfo().GetErasureSpeciesV2();
15711578
groupState.Generation = group.GetInfo().GetGeneration();
1579+
groupState.LayoutCorrect = group.GetInfo().GetLayoutCorrect();
15721580
StoragePoolState[poolId].Groups.emplace(groupId);
15731581
}
15741582
for (const auto& vSlot : VSlots->Get()->Record.GetEntries()) {
@@ -1729,9 +1737,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
17291737
FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());
17301738

17311739
TSelfCheckContext rrContext(&context, "NODE_UPTIME");
1732-
if (databaseState.NodeRestartsPerPeriod[nodeId] >= 30) {
1740+
if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetThresholds().GetNodeRestartsOrange()) {
17331741
rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Node is restarting too often", ETags::Uptime);
1734-
} else if (databaseState.NodeRestartsPerPeriod[nodeId] >= 10) {
1742+
} else if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetThresholds().GetNodeRestartsYellow()) {
17351743
rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "The number of node restarts has increased", ETags::Uptime);
17361744
} else {
17371745
rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
@@ -1769,9 +1777,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
17691777
long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
17701778
TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs));
17711779
Ydb::Monitoring::StatusFlag::Status status;
1772-
if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
1780+
if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange())) {
17731781
status = Ydb::Monitoring::StatusFlag::ORANGE;
1774-
} else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
1782+
} else if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow())) {
17751783
status = Ydb::Monitoring::StatusFlag::YELLOW;
17761784
} else {
17771785
status = Ydb::Monitoring::StatusFlag::GREEN;
@@ -2343,6 +2351,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
23432351

23442352
class TGroupChecker {
23452353
TString ErasureSpecies;
2354+
bool LayoutCorrect;
23462355
int FailedDisks = 0;
23472356
std::array<int, Ydb::Monitoring::StatusFlag::Status_ARRAYSIZE> DisksColors = {};
23482357
TStackVec<std::pair<ui32, int>> FailedRealms;
@@ -2359,7 +2368,10 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
23592368
}
23602369

23612370
public:
2362-
TGroupChecker(const TString& erasure) : ErasureSpecies(erasure) {}
2371+
TGroupChecker(const TString& erasure, const bool layoutCorrect = true)
2372+
: ErasureSpecies(erasure)
2373+
, LayoutCorrect(layoutCorrect)
2374+
{}
23632375

23642376
void AddVDiskStatus(Ydb::Monitoring::StatusFlag::Status status, ui32 realm) {
23652377
++DisksColors[status];
@@ -2378,6 +2390,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
23782390

23792391
void ReportStatus(TSelfCheckContext& context) const {
23802392
context.OverallStatus = Ydb::Monitoring::StatusFlag::GREEN;
2393+
if (!LayoutCorrect) {
2394+
context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Group layout is incorrect", ETags::GroupState);
2395+
}
23812396
if (ErasureSpecies == NONE) {
23822397
if (FailedDisks > 0) {
23832398
context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Group failed", ETags::GroupState, {ETags::VDiskState});
@@ -2727,7 +2742,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
27272742
return;
27282743
}
27292744

2730-
TGroupChecker checker(itGroup->second.ErasureSpecies);
2745+
TGroupChecker checker(itGroup->second.ErasureSpecies, itGroup->second.LayoutCorrect);
27312746
const auto& slots = itGroup->second.VSlots;
27322747
for (const auto* slot : slots) {
27332748
const auto& slotInfo = slot->GetInfo();
@@ -2921,9 +2936,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
29212936
}
29222937
}
29232938

2924-
const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000);
2925-
const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000);
2926-
29272939
void FillResult(TOverallStateContext context) {
29282940
if (IsSpecificDatabaseFilter()) {
29292941
FillDatabaseResult(context, FilterDatabase, DatabaseState[FilterDatabase]);
@@ -3252,12 +3264,16 @@ void TNodeCheckRequest<NMon::TEvHttpInfo>::Bootstrap() {
32523264
class THealthCheckService : public TActorBootstrapped<THealthCheckService> {
32533265
public:
32543266
static constexpr NKikimrServices::TActivity::EType ActorActivityType() { return NKikimrServices::TActivity::MONITORING_SERVICE; }
3267+
NKikimrConfig::THealthCheckConfig HealthCheckConfig;
32553268

32563269
THealthCheckService()
32573270
{
32583271
}
32593272

32603273
void Bootstrap() {
3274+
HealthCheckConfig.CopyFrom(AppData()->HealthCheckConfig);
3275+
Send(NConsole::MakeConfigsDispatcherID(SelfId().NodeId()),
3276+
new NConsole::TEvConfigsDispatcher::TEvSetConfigSubscriptionRequest({NKikimrConsole::TConfigItem::HealthCheckConfigItem}));
32613277
TMon* mon = AppData()->Mon;
32623278
if (mon) {
32633279
mon->RegisterActorPage({
@@ -3270,8 +3286,16 @@ class THealthCheckService : public TActorBootstrapped<THealthCheckService> {
32703286
Become(&THealthCheckService::StateWork);
32713287
}
32723288

3289+
void Handle(NConsole::TEvConsole::TEvConfigNotificationRequest::TPtr& ev) {
3290+
const auto& record = ev->Get()->Record;
3291+
if (record.GetConfig().HasHealthCheckConfig()) {
3292+
HealthCheckConfig.CopyFrom(record.GetConfig().GetHealthCheckConfig());
3293+
}
3294+
Send(ev->Sender, new NConsole::TEvConsole::TEvConfigNotificationResponse(record), 0, ev->Cookie);
3295+
}
3296+
32733297
void Handle(TEvSelfCheckRequest::TPtr& ev) {
3274-
Register(new TSelfCheckRequest(ev->Sender, ev.Get()->Release(), ev->Cookie, std::move(ev->TraceId)));
3298+
Register(new TSelfCheckRequest(ev->Sender, ev.Get()->Release(), ev->Cookie, std::move(ev->TraceId), HealthCheckConfig));
32753299
}
32763300

32773301
std::shared_ptr<NYdbGrpc::TGRpcClientLow> GRpcClientLow;
@@ -3299,6 +3323,7 @@ class THealthCheckService : public TActorBootstrapped<THealthCheckService> {
32993323
hFunc(TEvSelfCheckRequest, Handle);
33003324
hFunc(TEvNodeCheckRequest, Handle);
33013325
hFunc(NMon::TEvHttpInfo, Handle);
3326+
hFunc(NConsole::TEvConsole::TEvConfigNotificationRequest, Handle);
33023327
cFunc(TEvents::TSystem::PoisonPill, PassAway);
33033328
}
33043329
}

0 commit comments

Comments
 (0)