Skip to content

Commit 91312c6

Browse files
authored
fix whiteboard retries (#19765) (#20421)
2 parents a39a335 + bbe135c commit 91312c6

File tree

1 file changed: 76 additions and 52 deletions.

ydb/core/health_check/health_check.cpp

Lines changed: 76 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -1108,28 +1108,16 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11081108
auto nodeId = ev->Get()->NodeId;
11091109
switch (eventId) {
11101110
case TEvWhiteboard::EvSystemStateRequest:
1111-
if (!NodeSystemState[nodeId].IsDone()) {
1112-
NodeSystemState.erase(nodeId);
1113-
NodeSystemState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId, {-1});
1114-
}
1111+
NodeSystemState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId, {-1});
11151112
break;
11161113
case TEvWhiteboard::EvVDiskStateRequest:
1117-
if (!NodeVDiskState[nodeId].IsDone()) {
1118-
NodeVDiskState.erase(nodeId);
1119-
NodeVDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvVDiskStateRequest>(nodeId);
1120-
}
1114+
NodeVDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvVDiskStateRequest>(nodeId);
11211115
break;
11221116
case TEvWhiteboard::EvPDiskStateRequest:
1123-
if (!NodePDiskState[nodeId].IsDone()) {
1124-
NodePDiskState.erase(nodeId);
1125-
NodePDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvPDiskStateRequest>(nodeId);
1126-
}
1117+
NodePDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvPDiskStateRequest>(nodeId);
11271118
break;
11281119
case TEvWhiteboard::EvBSGroupStateRequest:
1129-
if (!NodeBSGroupState[nodeId].IsDone()) {
1130-
NodeBSGroupState.erase(nodeId);
1131-
NodeBSGroupState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvBSGroupStateRequest>(nodeId);
1132-
}
1120+
NodeBSGroupState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvBSGroupStateRequest>(nodeId);
11331121
break;
11341122
default:
11351123
RequestDone("unsupported event scheduled");
@@ -1151,31 +1139,39 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11511139
TString error = "Undelivered";
11521140
if (ev->Get()->SourceType == TEvWhiteboard::EvSystemStateRequest) {
11531141
if (NodeSystemState.count(nodeId) && NodeSystemState[nodeId].Error(error)) {
1154-
if (!RetryRequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId)) {
1142+
if (RetryRequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId)) {
1143+
NodeSystemState.erase(nodeId);
1144+
} else {
11551145
RequestDone("undelivered of TEvSystemStateRequest");
11561146
UnavailableComputeNodes.insert(nodeId);
11571147
}
11581148
}
11591149
}
11601150
if (ev->Get()->SourceType == TEvWhiteboard::EvVDiskStateRequest) {
11611151
if (NodeVDiskState.count(nodeId) && NodeVDiskState[nodeId].Error(error)) {
1162-
if (!RetryRequestNodeWhiteboard<TEvWhiteboard::TEvVDiskStateRequest>(nodeId)) {
1152+
if (RetryRequestNodeWhiteboard<TEvWhiteboard::TEvVDiskStateRequest>(nodeId)) {
1153+
NodeVDiskState.erase(nodeId);
1154+
} else {
11631155
RequestDone("undelivered of TEvVDiskStateRequest");
11641156
UnavailableStorageNodes.insert(nodeId);
11651157
}
11661158
}
11671159
}
11681160
if (ev->Get()->SourceType == TEvWhiteboard::EvPDiskStateRequest) {
11691161
if (NodePDiskState.count(nodeId) && NodePDiskState[nodeId].Error(error)) {
1170-
if (!RetryRequestNodeWhiteboard<TEvWhiteboard::TEvPDiskStateRequest>(nodeId)) {
1162+
if (RetryRequestNodeWhiteboard<TEvWhiteboard::TEvPDiskStateRequest>(nodeId)) {
1163+
NodePDiskState.erase(nodeId);
1164+
} else {
11711165
RequestDone("undelivered of TEvPDiskStateRequest");
11721166
UnavailableStorageNodes.insert(nodeId);
11731167
}
11741168
}
11751169
}
11761170
if (ev->Get()->SourceType == TEvWhiteboard::EvBSGroupStateRequest) {
11771171
if (NodeBSGroupState.count(nodeId) && NodeBSGroupState[nodeId].Error(error)) {
1178-
if (!RetryRequestNodeWhiteboard<TEvWhiteboard::TEvBSGroupStateRequest>(nodeId)) {
1172+
if (RetryRequestNodeWhiteboard<TEvWhiteboard::TEvBSGroupStateRequest>(nodeId)) {
1173+
NodeBSGroupState.erase(nodeId);
1174+
} else {
11791175
RequestDone("undelivered of TEvBSGroupStateRequest");
11801176
}
11811177
}
@@ -1186,25 +1182,33 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11861182
ui32 nodeId = ev->Get()->NodeId;
11871183
TString error = "NodeDisconnected";
11881184
if (NodeSystemState.count(nodeId) && NodeSystemState[nodeId].Error(error)) {
1189-
if (!RetryRequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId)) {
1185+
if (RetryRequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId)) {
1186+
NodeSystemState.erase(nodeId);
1187+
} else {
11901188
RequestDone("node disconnected with TEvSystemStateRequest");
11911189
UnavailableComputeNodes.insert(nodeId);
11921190
}
11931191
}
11941192
if (NodeVDiskState.count(nodeId) && NodeVDiskState[nodeId].Error(error)) {
1195-
if (!RetryRequestNodeWhiteboard<TEvWhiteboard::TEvVDiskStateRequest>(nodeId)) {
1193+
if (RetryRequestNodeWhiteboard<TEvWhiteboard::TEvVDiskStateRequest>(nodeId)) {
1194+
NodeVDiskState.erase(nodeId);
1195+
} else {
11961196
RequestDone("node disconnected with TEvVDiskStateRequest");
11971197
UnavailableStorageNodes.insert(nodeId);
11981198
}
11991199
}
12001200
if (NodePDiskState.count(nodeId) && NodePDiskState[nodeId].Error(error)) {
1201-
if (!RetryRequestNodeWhiteboard<TEvWhiteboard::TEvPDiskStateRequest>(nodeId)) {
1201+
if (RetryRequestNodeWhiteboard<TEvWhiteboard::TEvPDiskStateRequest>(nodeId)) {
1202+
NodePDiskState.erase(nodeId);
1203+
} else {
12021204
RequestDone("node disconnected with TEvPDiskStateRequest");
12031205
UnavailableStorageNodes.insert(nodeId);
12041206
}
12051207
}
12061208
if (NodeBSGroupState.count(nodeId) && NodeBSGroupState[nodeId].Error(error)) {
1207-
if (!RetryRequestNodeWhiteboard<TEvWhiteboard::TEvBSGroupStateRequest>(nodeId)) {
1209+
if (RetryRequestNodeWhiteboard<TEvWhiteboard::TEvBSGroupStateRequest>(nodeId)) {
1210+
NodeBSGroupState.erase(nodeId);
1211+
} else {
12081212
RequestDone("node disconnected with TEvBSGroupStateRequest");
12091213
}
12101214
}
@@ -1509,10 +1513,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15091513
TNodeId nodeId = ev.Get()->Cookie;
15101514
auto& nodeSystemState(NodeSystemState[nodeId]);
15111515
nodeSystemState.Set(std::move(ev));
1512-
for (NKikimrWhiteboard::TSystemStateInfo& state : *nodeSystemState->Record.MutableSystemStateInfo()) {
1513-
state.set_nodeid(nodeId);
1514-
MergedNodeSystemState[nodeId] = &state;
1515-
}
15161516
RequestDone("TEvSystemStateResponse");
15171517
}
15181518

@@ -1616,6 +1616,53 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
16161616
}
16171617
}
16181618

1619+
void AggregateWhiteboard() {
1620+
for (auto& [nodeId, nodeSystemState] : NodeSystemState) {
1621+
if (nodeSystemState.IsOk()) {
1622+
for (NKikimrWhiteboard::TSystemStateInfo& state : *nodeSystemState->Record.MutableSystemStateInfo()) {
1623+
state.set_nodeid(nodeId);
1624+
MergedNodeSystemState[nodeId] = &state;
1625+
}
1626+
}
1627+
}
1628+
for (auto& [nodeId, nodeVDiskState] : NodeVDiskState) {
1629+
if (nodeVDiskState.IsOk()) {
1630+
for (NKikimrWhiteboard::TVDiskStateInfo& state : *nodeVDiskState->Record.MutableVDiskStateInfo()) {
1631+
state.set_nodeid(nodeId);
1632+
auto id = GetVDiskId(state.vdiskid());
1633+
MergedVDiskState[id] = &state;
1634+
}
1635+
}
1636+
}
1637+
for (auto& [nodeId, nodePDiskState] : NodePDiskState) {
1638+
if (nodePDiskState.IsOk()) {
1639+
for (NKikimrWhiteboard::TPDiskStateInfo& state : *nodePDiskState->Record.MutablePDiskStateInfo()) {
1640+
state.set_nodeid(nodeId);
1641+
auto id = GetPDiskId(state);
1642+
MergedPDiskState[id] = &state;
1643+
}
1644+
}
1645+
}
1646+
for (auto& [nodeId, nodeBSGroupState] : NodeBSGroupState) {
1647+
if (nodeBSGroupState.IsOk()) {
1648+
for (NKikimrWhiteboard::TBSGroupStateInfo& state : *nodeBSGroupState->Record.MutableBSGroupStateInfo()) {
1649+
state.set_nodeid(nodeId);
1650+
TString storagePoolName = state.storagepoolname();
1651+
TGroupID groupId(state.groupid());
1652+
const NKikimrWhiteboard::TBSGroupStateInfo*& current(MergedBSGroupState[state.groupid()]);
1653+
if (current == nullptr || current->GetGroupGeneration() < state.GetGroupGeneration()) {
1654+
current = &state;
1655+
}
1656+
if (storagePoolName.empty() && groupId.ConfigurationType() != EGroupConfigurationType::Static) {
1657+
continue;
1658+
}
1659+
StoragePoolStateByName[storagePoolName].Groups.emplace(state.groupid());
1660+
StoragePoolStateByName[storagePoolName].Name = storagePoolName;
1661+
}
1662+
}
1663+
}
1664+
}
1665+
16191666
static Ydb::Monitoring::StatusFlag::Status MaxStatus(Ydb::Monitoring::StatusFlag::Status a, Ydb::Monitoring::StatusFlag::Status b) {
16201667
return static_cast<Ydb::Monitoring::StatusFlag::Status>(std::max<int>(a, b));
16211668
}
@@ -2157,44 +2204,20 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
21572204
TNodeId nodeId = ev.Get()->Cookie;
21582205
auto& nodeVDiskState(NodeVDiskState[nodeId]);
21592206
nodeVDiskState.Set(std::move(ev));
2160-
for (NKikimrWhiteboard::TVDiskStateInfo& state : *nodeVDiskState->Record.MutableVDiskStateInfo()) {
2161-
state.set_nodeid(nodeId);
2162-
auto id = GetVDiskId(state.vdiskid());
2163-
MergedVDiskState[id] = &state;
2164-
}
21652207
RequestDone("TEvVDiskStateResponse");
21662208
}
21672209

21682210
void Handle(TEvWhiteboard::TEvPDiskStateResponse::TPtr& ev) {
21692211
TNodeId nodeId = ev.Get()->Cookie;
21702212
auto& nodePDiskState(NodePDiskState[nodeId]);
21712213
nodePDiskState.Set(std::move(ev));
2172-
for (NKikimrWhiteboard::TPDiskStateInfo& state : *nodePDiskState->Record.MutablePDiskStateInfo()) {
2173-
state.set_nodeid(nodeId);
2174-
auto id = GetPDiskId(state);
2175-
MergedPDiskState[id] = &state;
2176-
}
21772214
RequestDone("TEvPDiskStateResponse");
21782215
}
21792216

21802217
void Handle(TEvWhiteboard::TEvBSGroupStateResponse::TPtr& ev) {
21812218
ui64 nodeId = ev.Get()->Cookie;
21822219
auto& nodeBSGroupState(NodeBSGroupState[nodeId]);
21832220
nodeBSGroupState.Set(std::move(ev));
2184-
for (NKikimrWhiteboard::TBSGroupStateInfo& state : *nodeBSGroupState->Record.MutableBSGroupStateInfo()) {
2185-
state.set_nodeid(nodeId);
2186-
TString storagePoolName = state.storagepoolname();
2187-
TGroupID groupId(state.groupid());
2188-
const NKikimrWhiteboard::TBSGroupStateInfo*& current(MergedBSGroupState[state.groupid()]);
2189-
if (current == nullptr || current->GetGroupGeneration() < state.GetGroupGeneration()) {
2190-
current = &state;
2191-
}
2192-
if (storagePoolName.empty() && groupId.ConfigurationType() != EGroupConfigurationType::Static) {
2193-
continue;
2194-
}
2195-
StoragePoolStateByName[storagePoolName].Groups.emplace(state.groupid());
2196-
StoragePoolStateByName[storagePoolName].Name = storagePoolName;
2197-
}
21982221
RequestDone("TEvBSGroupStateResponse");
21992222
}
22002223

@@ -3049,6 +3072,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
30493072
AggregateHiveInfo();
30503073
AggregateHiveNodeStats();
30513074
AggregateStoragePools();
3075+
AggregateWhiteboard();
30523076

30533077
for (auto& [requestId, request] : TabletRequests.RequestsInFlight) {
30543078
auto tabletId = request.TabletId;

0 commit comments

Comments
 (0)