Skip to content

Commit 163ff65

Browse files
authored
Report UNKNOWN status to SysView for disks that have not checked in yet (#8144)
1 parent 55eedf7 commit 163ff65

File tree

11 files changed

+46
-32
lines changed

11 files changed

+46
-32
lines changed

ydb/core/mind/bscontroller/bsc.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,7 @@ void TBlobStorageController::ValidateInternalState() {
337337
Y_ABORT_UNLESS(donor->GetShortVDiskId() == vslot->GetShortVDiskId());
338338
}
339339
if (vslot->Group) {
340-
if (vslot->Status == NKikimrBlobStorage::EVDiskStatus::READY) {
340+
if (vslot->GetStatus() == NKikimrBlobStorage::EVDiskStatus::READY) {
341341
Y_DEBUG_ABORT_UNLESS(vslot->IsReady || vslot->IsInVSlotReadyTimestampQ());
342342
} else {
343343
Y_DEBUG_ABORT_UNLESS(!vslot->IsReady && !vslot->IsInVSlotReadyTimestampQ());
@@ -401,7 +401,7 @@ ui32 TBlobStorageController::GetEventPriority(IEventHandle *ev) {
401401
const auto& record = msg->Record;
402402
for (const auto& item : record.GetVDiskStatus()) {
403403
const TVSlotId vslotId(item.GetNodeId(), item.GetPDiskId(), item.GetVSlotId());
404-
if (TVSlotInfo *slot = FindVSlot(vslotId); slot && slot->Status > item.GetStatus()) {
404+
if (TVSlotInfo *slot = FindVSlot(vslotId); slot && slot->GetStatus() > item.GetStatus()) {
405405
return 1;
406406
} else if (const auto it = StaticVSlots.find(vslotId); it != StaticVSlots.end() && it->second.VDiskStatus > item.GetStatus()) {
407407
return 1;

ydb/core/mind/bscontroller/cmds_box.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ namespace NKikimr::NBsController {
213213
for (const auto& [id, slot] : pdisk->VSlotsOnPDisk) {
214214
if (slot->Group) {
215215
auto *m = VSlots.FindForUpdate(slot->VSlotId);
216-
m->Status = NKikimrBlobStorage::EVDiskStatus::ERROR;
216+
m->VDiskStatus = NKikimrBlobStorage::EVDiskStatus::ERROR;
217217
m->IsReady = false;
218218
TGroupInfo *group = Groups.FindForUpdate(slot->Group->ID);
219219
GroupFailureModelChanged.insert(slot->Group->ID);

ydb/core/mind/bscontroller/cmds_storage_pool.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,7 @@ namespace NKikimr::NBsController {
557557
x->MutableVDiskMetrics()->CopyFrom(*vslot.VDiskMetrics);
558558
x->MutableVDiskMetrics()->ClearVDiskId();
559559
}
560-
x->SetStatus(NKikimrBlobStorage::EVDiskStatus_Name(vslot.VDiskStatus));
560+
x->SetStatus(NKikimrBlobStorage::EVDiskStatus_Name(vslot.VDiskStatus.value_or(NKikimrBlobStorage::EVDiskStatus::ERROR)));
561561
x->SetReady(vslot.ReadySince <= mono);
562562
}
563563
if (const auto& s = Self.StorageConfig; s.HasBlobStorageConfig()) {
@@ -698,7 +698,7 @@ namespace NKikimr::NBsController {
698698

699699
TGroupInfo *group = Groups.FindForUpdate(vslot->GroupId);
700700
vslot->Mood = TMood::Wipe;
701-
vslot->Status = NKikimrBlobStorage::EVDiskStatus::ERROR;
701+
vslot->VDiskStatus = NKikimrBlobStorage::EVDiskStatus::ERROR;
702702
vslot->IsReady = false;
703703
GroupFailureModelChanged.insert(group->ID);
704704
group->CalculateGroupStatus();
@@ -744,7 +744,7 @@ namespace NKikimr::NBsController {
744744

745745
TGroupInfo *group = Groups.FindForUpdate(vslot->GroupId);
746746
vslot->Mood = targetMood;
747-
vslot->Status = NKikimrBlobStorage::EVDiskStatus::ERROR;
747+
vslot->VDiskStatus = NKikimrBlobStorage::EVDiskStatus::ERROR;
748748
vslot->IsReady = false;
749749
GroupFailureModelChanged.insert(group->ID);
750750
group->CalculateGroupStatus();

ydb/core/mind/bscontroller/config.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -486,9 +486,9 @@ namespace NKikimr::NBsController {
486486
if (!overlay->second || !overlay->second->Group) { // deleted one
487487
(overlay->second ? overlay->second : base->second)->DropFromVSlotReadyTimestampQ();
488488
NotReadyVSlotIds.erase(overlay->first);
489-
} else if (overlay->second->Status != NKikimrBlobStorage::EVDiskStatus::READY) {
489+
} else if (overlay->second->GetStatus() != NKikimrBlobStorage::EVDiskStatus::READY) {
490490
overlay->second->DropFromVSlotReadyTimestampQ();
491-
} else if (!base || base->second->Status != NKikimrBlobStorage::EVDiskStatus::READY) {
491+
} else if (!base || base->second->GetStatus() != NKikimrBlobStorage::EVDiskStatus::READY) {
492492
overlay->second->PutInVSlotReadyTimestampQ(now);
493493
} else {
494494
Y_DEBUG_ABORT_UNLESS(overlay->second->IsReady || overlay->second->IsInVSlotReadyTimestampQ());
@@ -998,7 +998,7 @@ namespace NKikimr::NBsController {
998998
pb->SetAllocatedSize(vslot.Metrics.GetAllocatedSize());
999999
pb->MutableVDiskMetrics()->CopyFrom(vslot.Metrics);
10001000
pb->MutableVDiskMetrics()->ClearVDiskId();
1001-
pb->SetStatus(NKikimrBlobStorage::EVDiskStatus_Name(vslot.Status));
1001+
pb->SetStatus(NKikimrBlobStorage::EVDiskStatus_Name(vslot.GetStatus()));
10021002
for (const TVSlotId& vslotId : vslot.Donors) {
10031003
auto *item = pb->AddDonors();
10041004
Serialize(item->MutableVSlotId(), vslotId);

ydb/core/mind/bscontroller/config_fit_groups.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -524,7 +524,7 @@ namespace NKikimr {
524524
// also we have to find replicating VSlots on this PDisk and assume they consume up to
525525
// max(vslotSize for every slot in group), not their actual AllocatedSize
526526
for (const auto& [id, slot] : info.VSlotsOnPDisk) {
527-
if (slot->Group && slot->Status != NKikimrBlobStorage::EVDiskStatus::READY) {
527+
if (slot->Group && slot->GetStatus() != NKikimrBlobStorage::EVDiskStatus::READY) {
528528
ui64 maxGroupSlotSize = 0;
529529
for (const TVSlotInfo *peer : slot->Group->VDisksInGroup) {
530530
maxGroupSlotSize = Max(maxGroupSlotSize, peer->Metrics.GetAllocatedSize());

ydb/core/mind/bscontroller/impl.h

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -124,16 +124,17 @@ class TBlobStorageController : public TActor<TBlobStorageController>, public TTa
124124
TVSlotReadyTimestampQ::iterator VSlotReadyTimestampIter;
125125

126126
public:
127-
NKikimrBlobStorage::EVDiskStatus Status = NKikimrBlobStorage::EVDiskStatus::ERROR;
127+
std::optional<NKikimrBlobStorage::EVDiskStatus> VDiskStatus;
128+
NHPTimer::STime VDiskStatusTimestamp = GetCycleCountFast();
128129
bool IsReady = false;
129130
bool OnlyPhantomsRemain = false;
130131

131132
public:
132133
void SetStatus(NKikimrBlobStorage::EVDiskStatus status, TMonotonic now, TInstant instant, bool onlyPhantomsRemain) {
133-
if (status != Status) {
134+
if (status != VDiskStatus) {
134135
if (status == NKikimrBlobStorage::EVDiskStatus::REPLICATING) { // became "replicating"
135136
LastGotReplicating = instant;
136-
} else if (Status == NKikimrBlobStorage::EVDiskStatus::REPLICATING) { // was "replicating"
137+
} else if (VDiskStatus == NKikimrBlobStorage::EVDiskStatus::REPLICATING) { // was "replicating"
137138
Y_DEBUG_ABORT_UNLESS(LastGotReplicating != TInstant::Zero());
138139
ReplicationTime += instant - LastGotReplicating;
139140
LastGotReplicating = {};
@@ -145,7 +146,7 @@ class TBlobStorageController : public TActor<TBlobStorageController>, public TTa
145146
LastSeenReady = instant;
146147
}
147148

148-
Status = status;
149+
VDiskStatus = status;
149150
IsReady = false;
150151
if (status == NKikimrBlobStorage::EVDiskStatus::READY) {
151152
PutInVSlotReadyTimestampQ(now);
@@ -159,6 +160,10 @@ class TBlobStorageController : public TActor<TBlobStorageController>, public TTa
159160
}
160161
}
161162

163+
NKikimrBlobStorage::EVDiskStatus GetStatus() const {
164+
return VDiskStatus.value_or(NKikimrBlobStorage::EVDiskStatus::ERROR);
165+
}
166+
162167
void PutInVSlotReadyTimestampQ(TMonotonic now) {
163168
const TMonotonic readyAfter = now + ReadyStablePeriod; // vdisk will be treated as READY one shortly, but not now
164169
Y_ABORT_UNLESS(VSlotReadyTimestampIter == TVSlotReadyTimestampQ::iterator());
@@ -291,15 +296,16 @@ class TBlobStorageController : public TActor<TBlobStorageController>, public TTa
291296

292297
TString GetStatusString() const {
293298
TStringStream s;
294-
s << NKikimrBlobStorage::EVDiskStatus_Name(Status);
295-
if (Status == NKikimrBlobStorage::REPLICATING && OnlyPhantomsRemain) {
299+
const auto status = GetStatus();
300+
s << NKikimrBlobStorage::EVDiskStatus_Name(status);
301+
if (status == NKikimrBlobStorage::REPLICATING && OnlyPhantomsRemain) {
296302
s << "/p";
297303
}
298304
return s.Str();
299305
}
300306

301307
bool IsOperational() const {
302-
return Status >= NKikimrBlobStorage::REPLICATING;
308+
return GetStatus() >= NKikimrBlobStorage::REPLICATING;
303309
}
304310

305311
void OnCommit();
@@ -2276,7 +2282,7 @@ class TBlobStorageController : public TActor<TBlobStorageController>, public TTa
22762282
histo.IncrementFor(passed.Seconds());
22772283

22782284
TDuration timeBeingReplicating = slot->ReplicationTime;
2279-
if (slot->Status == NKikimrBlobStorage::EVDiskStatus::REPLICATING) {
2285+
if (slot->GetStatus() == NKikimrBlobStorage::EVDiskStatus::REPLICATING) {
22802286
timeBeingReplicating += now - slot->LastGotReplicating;
22812287
}
22822288

@@ -2301,7 +2307,8 @@ class TBlobStorageController : public TActor<TBlobStorageController>, public TTa
23012307
const NKikimrBlobStorage::TVDiskKind::EVDiskKind VDiskKind;
23022308

23032309
std::optional<NKikimrBlobStorage::TVDiskMetrics> VDiskMetrics;
2304-
NKikimrBlobStorage::EVDiskStatus VDiskStatus = NKikimrBlobStorage::EVDiskStatus::ERROR;
2310+
std::optional<NKikimrBlobStorage::EVDiskStatus> VDiskStatus;
2311+
NHPTimer::STime VDiskStatusTimestamp = GetCycleCountFast();
23052312
TMonotonic ReadySince = TMonotonic::Max(); // when IsReady becomes true for this disk; Max() in non-READY state
23062313

23072314
TStaticVSlotInfo(const NKikimrBlobStorage::TNodeWardenServiceSet::TVDisk& vdisk,
@@ -2315,6 +2322,7 @@ class TBlobStorageController : public TActor<TBlobStorageController>, public TTa
23152322
TStaticVSlotInfo& item = it->second;
23162323
VDiskMetrics = std::move(item.VDiskMetrics);
23172324
VDiskStatus = item.VDiskStatus;
2325+
VDiskStatusTimestamp = item.VDiskStatusTimestamp;
23182326
ReadySince = item.ReadySince;
23192327
}
23202328
}

ydb/core/mind/bscontroller/monitoring.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1296,7 +1296,7 @@ void TBlobStorageController::RenderVSlotRow(IOutputStream& out, const TVSlotInfo
12961296
}
12971297
TABLED() {
12981298
TDuration time = vslot.ReplicationTime;
1299-
if (vslot.Status == NKikimrBlobStorage::EVDiskStatus::REPLICATING) {
1299+
if (vslot.GetStatus() == NKikimrBlobStorage::EVDiskStatus::REPLICATING) {
13001300
time += TActivationContext::Now() - vslot.LastGotReplicating;
13011301
}
13021302
out << time;

ydb/core/mind/bscontroller/register_node.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -547,7 +547,7 @@ void TBlobStorageController::OnWardenDisconnected(TNodeId nodeId, TActorId serve
547547
updates.push_back({
548548
.VDiskId = it->second->GetVDiskId(),
549549
.IsReady = it->second->IsReady,
550-
.VDiskStatus = it->second->Status,
550+
.VDiskStatus = it->second->GetStatus(),
551551
});
552552
ScrubState.UpdateVDiskState(&*it->second);
553553
SysViewChangedVSlots.insert(it->second->VSlotId);

ydb/core/mind/bscontroller/self_heal.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -913,7 +913,7 @@ namespace NKikimr::NBsController {
913913
slot->OnlyPhantomsRemain,
914914
slot->IsReady,
915915
TMonotonic::Zero(),
916-
slot->Status,
916+
slot->GetStatus(),
917917
};
918918
}
919919
}
@@ -960,7 +960,7 @@ namespace NKikimr::NBsController {
960960
false, /* OnlyPhantomsRemain */
961961
true, /* IsReady; decision is based on ReadySince */
962962
info.ReadySince,
963-
info.VDiskStatus,
963+
info.VDiskStatus.value_or(NKikimrBlobStorage::EVDiskStatus::ERROR),
964964
};
965965
}
966966
}
@@ -987,7 +987,7 @@ namespace NKikimr::NBsController {
987987
const bool was = slot->IsOperational();
988988
if (const TGroupInfo *group = slot->Group) {
989989
const bool wasReady = slot->IsReady;
990-
if (slot->Status != m.GetStatus() || slot->OnlyPhantomsRemain != m.GetOnlyPhantomsRemain()) {
990+
if (slot->GetStatus() != m.GetStatus() || slot->OnlyPhantomsRemain != m.GetOnlyPhantomsRemain()) {
991991
slot->SetStatus(m.GetStatus(), mono, now, m.GetOnlyPhantomsRemain());
992992
if (slot->IsReady != wasReady) {
993993
ScrubState.UpdateVDiskState(slot);
@@ -1001,14 +1001,14 @@ namespace NKikimr::NBsController {
10011001
.VDiskId = vdiskId,
10021002
.OnlyPhantomsRemain = slot->OnlyPhantomsRemain,
10031003
.IsReady = slot->IsReady,
1004-
.VDiskStatus = slot->Status,
1004+
.VDiskStatus = slot->GetStatus(),
10051005
});
10061006
if (!was && slot->IsOperational() && !group->SeenOperational) {
10071007
groups.insert(const_cast<TGroupInfo*>(group));
10081008
}
10091009
SysViewChangedVSlots.insert(vslotId);
10101010
}
1011-
if (slot->Status == NKikimrBlobStorage::EVDiskStatus::READY) {
1011+
if (slot->GetStatus() == NKikimrBlobStorage::EVDiskStatus::READY) {
10121012
// we can release donor slots without further notice when the VDisk is completely replicated; we
10131013
// intentionally use GetStatus() here instead of IsReady() to prevent waiting
10141014
for (const TVSlotId& donorVSlotId : slot->Donors) {

ydb/core/mind/bscontroller/sys_view.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,8 @@ void CopyInfo(NKikimrSysView::TPDiskInfo* info, const THolder<TBlobStorageContro
325325
}
326326

327327
void SerializeVSlotInfo(NKikimrSysView::TVSlotInfo *pb, const TVDiskID& vdiskId, const NKikimrBlobStorage::TVDiskMetrics& m,
328-
NKikimrBlobStorage::EVDiskStatus status, NKikimrBlobStorage::TVDiskKind::EVDiskKind kind, bool isBeingDeleted) {
328+
std::optional<NKikimrBlobStorage::EVDiskStatus> status, NHPTimer::STime statusTimestamp,
329+
NKikimrBlobStorage::TVDiskKind::EVDiskKind kind, bool isBeingDeleted) {
329330
pb->SetGroupId(vdiskId.GroupID.GetRawId());
330331
pb->SetGroupGeneration(vdiskId.GroupGeneration);
331332
pb->SetFailRealm(vdiskId.FailRealm);
@@ -337,16 +338,21 @@ void SerializeVSlotInfo(NKikimrSysView::TVSlotInfo *pb, const TVDiskID& vdiskId,
337338
if (m.HasAvailableSize()) {
338339
pb->SetAvailableSize(m.GetAvailableSize());
339340
}
340-
pb->SetStatusV2(NKikimrBlobStorage::EVDiskStatus_Name(status));
341+
if (!status && CyclesToDuration(GetCycleCountFast() - statusTimestamp) > TDuration::Seconds(15)) {
342+
status = NKikimrBlobStorage::EVDiskStatus::ERROR;
343+
}
344+
if (status) {
345+
pb->SetStatusV2(NKikimrBlobStorage::EVDiskStatus_Name(*status));
346+
}
341347
pb->SetKind(NKikimrBlobStorage::TVDiskKind::EVDiskKind_Name(kind));
342348
if (isBeingDeleted) {
343349
pb->SetIsBeingDeleted(true);
344350
}
345351
}
346352

347353
void CopyInfo(NKikimrSysView::TVSlotInfo* info, const THolder<TBlobStorageController::TVSlotInfo>& vSlotInfo) {
348-
SerializeVSlotInfo(info, vSlotInfo->GetVDiskId(), vSlotInfo->Metrics, vSlotInfo->Status, vSlotInfo->Kind,
349-
vSlotInfo->IsBeingDeleted());
354+
SerializeVSlotInfo(info, vSlotInfo->GetVDiskId(), vSlotInfo->Metrics, vSlotInfo->VDiskStatus,
355+
vSlotInfo->VDiskStatusTimestamp, vSlotInfo->Kind, vSlotInfo->IsBeingDeleted());
350356
}
351357

352358
void CopyInfo(NKikimrSysView::TGroupInfo* info, const THolder<TBlobStorageController::TGroupInfo>& groupInfo) {
@@ -462,7 +468,7 @@ void TBlobStorageController::UpdateSystemViews() {
462468
if (SysViewChangedVSlots.count(vslotId)) {
463469
static const NKikimrBlobStorage::TVDiskMetrics zero;
464470
SerializeVSlotInfo(&state.VSlots[vslotId], vslot.VDiskId, vslot.VDiskMetrics ? *vslot.VDiskMetrics : zero,
465-
vslot.VDiskStatus, vslot.VDiskKind, false);
471+
vslot.VDiskStatus, vslot.VDiskStatusTimestamp, vslot.VDiskKind, false);
466472
}
467473
}
468474
if (StorageConfig.HasBlobStorageConfig()) {

0 commit comments

Comments
 (0)