Skip to content

Commit b8023f6

Browse files
authored
do not let faulty pdisks make group status dead in healthcheck (#9744)
1 parent da0e694 commit b8023f6

File tree

2 files changed

+22
-5
lines changed

2 files changed

+22
-5
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2006,6 +2006,8 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
20062006
}
20072007
}
20082008

2009+
// do not propagate RED status to vdisk - so that vdisk is not considered down when computing group status
2010+
context.OverallStatus = MinStatus(context.OverallStatus, Ydb::Monitoring::StatusFlag::ORANGE);
20092011
storagePDiskStatus.set_overall(context.GetOverallStatus());
20102012
}
20112013

ydb/core/health_check/health_check_ut.cpp

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
6969

7070
struct TTestVSlotInfo {
7171
std::optional<NKikimrBlobStorage::EVDiskStatus> Status;
72-
ui32 Generation;
72+
ui32 Generation = DEFAULT_GROUP_GENERATION;
73+
NKikimrBlobStorage::EDriveStatus PDiskStatus = NKikimrBlobStorage::ACTIVE;
7374

7475
TTestVSlotInfo(std::optional<NKikimrBlobStorage::EVDiskStatus> status = NKikimrBlobStorage::READY,
7576
ui32 generation = DEFAULT_GROUP_GENERATION)
@@ -78,7 +79,11 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
7879
{
7980
}
8081

81-
TTestVSlotInfo(NKikimrBlobStorage::EVDiskStatus status) : Status(status), Generation(DEFAULT_GROUP_GENERATION) {}
82+
TTestVSlotInfo(NKikimrBlobStorage::EVDiskStatus status, NKikimrBlobStorage::EDriveStatus pDiskStatus = NKikimrBlobStorage::ACTIVE)
83+
: Status(status)
84+
, PDiskStatus(pDiskStatus)
85+
{
86+
}
8287
};
8388

8489
using TVDisks = TVector<TTestVSlotInfo>;
@@ -223,18 +228,20 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
223228
entry->mutable_info()->set_name(STORAGE_POOL_NAME);
224229
}
225230

226-
void AddPDisksToSysViewResponse(NSysView::TEvSysView::TEvGetPDisksResponse::TPtr* ev, size_t count, double occupancy) {
231+
void AddPDisksToSysViewResponse(NSysView::TEvSysView::TEvGetPDisksResponse::TPtr* ev, const TVDisks& vslots, double occupancy) {
227232
auto& record = (*ev)->Get()->Record;
228233
auto entrySample = record.entries(0);
229234
record.clear_entries();
230235
auto pdiskId = PDISK_START_ID;
231236
const size_t totalSize = 3'200'000'000'000ull;
232-
for (size_t i = 0; i < count; ++i) {
237+
const auto *descriptor = NKikimrBlobStorage::EDriveStatus_descriptor();
238+
for (const auto& vslot : vslots) {
233239
auto* entry = record.add_entries();
234240
entry->CopyFrom(entrySample);
235241
entry->mutable_key()->set_pdiskid(pdiskId);
236242
entry->mutable_info()->set_totalsize(totalSize);
237243
entry->mutable_info()->set_availablesize((1 - occupancy) * totalSize);
244+
entry->mutable_info()->set_statusv2(descriptor->FindValueByNumber(vslot.PDiskStatus)->name());
238245
++pdiskId;
239246
}
240247
}
@@ -483,7 +490,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
483490
}
484491
case NSysView::TEvSysView::EvGetPDisksResponse: {
485492
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetPDisksResponse::TPtr*>(&ev);
486-
AddPDisksToSysViewResponse(x, vdisks.size(), occupancy);
493+
AddPDisksToSysViewResponse(x, vdisks, occupancy);
487494
break;
488495
}
489496
case NSysView::TEvSysView::EvGetGroupsResponse: {
@@ -711,6 +718,14 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
711718
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1);
712719
}
713720

721+
// Health-check test: VDisks reported as READY whose PDisks carry the FAULTY
// drive status are expected to produce a YELLOW STORAGE_GROUP issue and no
// ORANGE or RED ones.
Y_UNIT_TEST(YellowIssueReadyVDisksOnFaultyPDisks) {
722+
// Group status is PARTIAL; TVDisks{3, {...}} presumably builds three identical
// slots via the (count, value) vector constructor - TODO confirm.
auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, TVDisks{3, {NKikimrBlobStorage::READY, NKikimrBlobStorage::FAULTY}});
723+
// Dump the response to stderr so a failing run can be inspected.
Cerr << result.ShortDebugString() << Endl;
724+
// NOTE(review): the last argument looks like the expected number of issues
// with the given type and status - verify against the helper's definition.
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
725+
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::ORANGE, 0);
726+
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 0);
727+
}
728+
714729
/* HC currently infers group status on its own, so it's never unknown
715730
Y_UNIT_TEST(RedGroupIssueWhenUnknownGroupStatus) {
716731
auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::UNKNOWN, {});

0 commit comments

Comments
 (0)