Skip to content

Commit 4dec1e5

Browse files
healthcheck report storage group layout incorrect (#15743)
1 parent 98298e9 commit 4dec1e5

File tree

2 files changed

+79
-2
lines changed

2 files changed

+79
-2
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
269269
TString ErasureSpecies;
270270
std::vector<const NKikimrSysView::TVSlotEntry*> VSlots;
271271
ui32 Generation;
272+
bool LayoutCorrect = true;
272273
};
273274

274275
struct TSelfCheckResult {
@@ -1575,6 +1576,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15751576
auto& groupState = GroupState[groupId];
15761577
groupState.ErasureSpecies = group.GetInfo().GetErasureSpeciesV2();
15771578
groupState.Generation = group.GetInfo().GetGeneration();
1579+
groupState.LayoutCorrect = group.GetInfo().GetLayoutCorrect();
15781580
StoragePoolState[poolId].Groups.emplace(groupId);
15791581
}
15801582
for (const auto& vSlot : VSlots->Get()->Record.GetEntries()) {
@@ -2349,6 +2351,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
23492351

23502352
class TGroupChecker {
23512353
TString ErasureSpecies;
2354+
bool LayoutCorrect;
23522355
int FailedDisks = 0;
23532356
std::array<int, Ydb::Monitoring::StatusFlag::Status_ARRAYSIZE> DisksColors = {};
23542357
TStackVec<std::pair<ui32, int>> FailedRealms;
@@ -2365,7 +2368,10 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
23652368
}
23662369

23672370
public:
2368-
TGroupChecker(const TString& erasure) : ErasureSpecies(erasure) {}
2371+
TGroupChecker(const TString& erasure, const bool layoutCorrect = true) // layoutCorrect defaults to true, so one-argument construction remains valid
2372+
: ErasureSpecies(erasure)
2373+
, LayoutCorrect(layoutCorrect) // read by ReportStatus, which reports ORANGE "Group layout is incorrect" when this is false
2374+
{}
23692375

23702376
void AddVDiskStatus(Ydb::Monitoring::StatusFlag::Status status, ui32 realm) {
23712377
++DisksColors[status];
@@ -2384,6 +2390,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
23842390

23852391
void ReportStatus(TSelfCheckContext& context) const {
23862392
context.OverallStatus = Ydb::Monitoring::StatusFlag::GREEN;
2393+
if (!LayoutCorrect) {
2394+
context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Group layout is incorrect", ETags::GroupState);
2395+
}
23872396
if (ErasureSpecies == NONE) {
23882397
if (FailedDisks > 0) {
23892398
context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Group failed", ETags::GroupState, {ETags::VDiskState});
@@ -2733,7 +2742,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
27332742
return;
27342743
}
27352744

2736-
TGroupChecker checker(itGroup->second.ErasureSpecies);
2745+
TGroupChecker checker(itGroup->second.ErasureSpecies, itGroup->second.LayoutCorrect);
27372746
const auto& slots = itGroup->second.VSlots;
27382747
for (const auto* slot : slots) {
27392748
const auto& slotInfo = slot->GetInfo();

ydb/core/health_check/health_check_ut.cpp

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2113,5 +2113,73 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
21132113
TestConfigUpdateNodeRestartsPerPeriod(runtime, sender, nodeRestarts / 2, nodeRestarts + 5, nodeId, Ydb::Monitoring::StatusFlag::YELLOW);
21142114
TestConfigUpdateNodeRestartsPerPeriod(runtime, sender, nodeRestarts / 5, nodeRestarts / 2, nodeId, Ydb::Monitoring::StatusFlag::ORANGE);
21152115
}
2116+
2117+
Y_UNIT_TEST(LayoutIncorrect) { // Checks that a group whose info has layoutcorrect == false is reported as ORANGE with "Group layout is incorrect"
2118+
TPortManager tp;
2119+
ui16 port = tp.GetPort(2134);
2120+
ui16 grpcPort = tp.GetPort(2135);
2121+
auto settings = TServerSettings(port) // test server: 1 node, 1 dynamic node, no real threads, domain "Root"
2122+
.SetNodeCount(1)
2123+
.SetDynamicNodeCount(1)
2124+
.SetUseRealThreads(false)
2125+
.SetDomainName("Root");
2126+
2127+
TServer server(settings);
2128+
server.EnableGRpc(grpcPort);
2129+
TClient client(settings);
2130+
TTestActorRuntime& runtime = *server.GetRuntime();
2131+
TActorId sender = runtime.AllocateEdgeActor(); // edge actor that sends the request and receives the self-check result
2132+
2133+
auto observerFunc = [&](TAutoPtr<IEventHandle>& ev) { // observer that rewrites EvGetGroupsResponse events in place
2134+
switch (ev->GetTypeRewrite()) {
2135+
case NSysView::TEvSysView::EvGetGroupsResponse: {
2136+
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetGroupsResponse::TPtr*>(&ev); // view the generic handle as the typed TPtr to reach the record
2137+
auto& record = (*x)->Get()->Record;
2138+
for (auto& entry : *record.mutable_entries()) {
2139+
entry.mutable_info()->set_layoutcorrect(false); // mark every group entry in the response as having an incorrect layout
2140+
}
2141+
2142+
break;
2143+
}
2144+
}
2145+
2146+
return TTestActorRuntime::EEventAction::PROCESS; // returned for every event, modified or not
2147+
};
2148+
runtime.SetObserverFunc(observerFunc);
2149+
2150+
TAutoPtr<IEventHandle> handle;
2151+
auto *request = new NHealthCheck::TEvSelfCheckRequest;
2152+
request->Request.set_return_verbose_status(true); // the assertions below read per-database status, which this flag presumably enables; confirm
2153+
runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0));
2154+
auto result = runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
2155+
2156+
UNIT_ASSERT_VALUES_EQUAL(result.self_check_result(), Ydb::Monitoring::SelfCheck::MAINTENANCE_REQUIRED);
2157+
UNIT_ASSERT_VALUES_EQUAL(result.database_status_size(), 1);
2158+
const auto &database_status = result.database_status(0);
2159+
2160+
UNIT_ASSERT_VALUES_EQUAL(database_status.overall(), Ydb::Monitoring::StatusFlag::ORANGE); // ORANGE is expected at database, storage, pool and group level
2161+
UNIT_ASSERT_VALUES_EQUAL(database_status.compute().overall(), Ydb::Monitoring::StatusFlag::GREEN); // compute is expected to stay GREEN
2162+
UNIT_ASSERT_VALUES_EQUAL(database_status.storage().overall(), Ydb::Monitoring::StatusFlag::ORANGE);
2163+
UNIT_ASSERT_VALUES_EQUAL(database_status.storage().pools().size(), 1);
2164+
UNIT_ASSERT_VALUES_EQUAL(database_status.storage().pools()[0].overall(), Ydb::Monitoring::StatusFlag::ORANGE);
2165+
UNIT_ASSERT_VALUES_EQUAL(database_status.storage().pools()[0].groups().size(), 1);
2166+
UNIT_ASSERT_VALUES_EQUAL(database_status.storage().pools()[0].groups()[0].overall(), Ydb::Monitoring::StatusFlag::ORANGE);
2167+
2168+
for (const auto &issue_log : result.issue_log()) { // NOTE(review): only entries that are present get checked; nothing here asserts that a STORAGE_GROUP entry exists
2169+
if (issue_log.level() == 1 && issue_log.type() == "DATABASE") {
2170+
UNIT_ASSERT_VALUES_EQUAL(issue_log.location().database().name(), "/Root");
2171+
UNIT_ASSERT_VALUES_EQUAL(issue_log.message(), "Database has storage issues");
2172+
} else if (issue_log.level() == 2 && issue_log.type() == "STORAGE") {
2173+
UNIT_ASSERT_VALUES_EQUAL(issue_log.location().database().name(), "/Root");
2174+
UNIT_ASSERT_VALUES_EQUAL(issue_log.message(), "Storage has no redundancy");
2175+
} else if (issue_log.level() == 3 && issue_log.type() == "STORAGE_POOL") {
2176+
UNIT_ASSERT_VALUES_EQUAL(issue_log.location().storage().pool().name(), "static");
2177+
UNIT_ASSERT_VALUES_EQUAL(issue_log.message(), "Pool has no redundancy");
2178+
} else if (issue_log.level() == 4 && issue_log.type() == "STORAGE_GROUP") {
2179+
UNIT_ASSERT_VALUES_EQUAL(issue_log.location().storage().pool().name(), "static");
2180+
UNIT_ASSERT_VALUES_EQUAL(issue_log.message(), "Group layout is incorrect"); // the message emitted by TGroupChecker::ReportStatus when LayoutCorrect is false
2181+
}
2182+
}
2183+
}
21162184
}
21172185
}

0 commit comments

Comments
 (0)