Skip to content

Commit ac60eb4

Browse files
serbel324 authored and blinkov committed
Save ErrorReason and print it on EBS_DISINTEGRATED (#14425)
1 parent efcb5e6 commit ac60eb4

12 files changed

+105
-24
lines changed

ydb/core/blobstorage/dsproxy/dsproxy_blackboard.cpp

Lines changed: 70 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,8 @@ void TBlobState::AddPutOkResponse(const TBlobStorageGroupInfo &info, const TLogo
139139
diskPart.Situation = ESituation::Present;
140140
}
141141

142-
void TBlobState::AddErrorResponse(const TBlobStorageGroupInfo &info, const TLogoBlobID &id, ui32 orderNumber) {
142+
void TBlobState::AddErrorResponse(const TBlobStorageGroupInfo &info, const TLogoBlobID &id, ui32 orderNumber,
143+
const TString& errorReason) {
143144
Y_ABORT_UNLESS(id.PartId() != 0);
144145
ui32 partIdx = id.PartId() - 1;
145146
IsChanged = true;
@@ -153,6 +154,7 @@ void TBlobState::AddErrorResponse(const TBlobStorageGroupInfo &info, const TLogo
153154
TDiskPart &diskPart = disk.DiskParts[partIdx];
154155
diskPart.Situation = ESituation::Error;
155156
diskPart.Requested.Clear();
157+
diskPart.ErrorReason = errorReason;
156158
}
157159

158160
void TBlobState::AddNotYetResponse(const TBlobStorageGroupInfo &info, const TLogoBlobID &id, ui32 orderNumber) {
@@ -238,7 +240,26 @@ TString TBlobState::SituationToString(ESituation situation) {
238240
case ESituation::Sent:
239241
return "ESituation::Sent";
240242
}
241-
Y_ABORT_UNLESS(false, "Unexpected situation# %" PRIu64, ui64(situation));
243+
Y_DEBUG_ABORT("Unexpected situation# %" PRIu64, ui64(situation));
244+
return "";
245+
}
246+
247+
// Maps a part situation to a one-letter code:
// Unknown -> "U", Error -> "E", Absent -> "A", Lost -> "L", Present -> "P", Sent -> "S".
// Returns an empty string for a value that matches none of the cases.
TString TBlobState::SituationToShortString(ESituation situation) {
248+
switch (situation) {
249+
case ESituation::Unknown:
250+
return "U";
251+
case ESituation::Error:
252+
return "E";
253+
case ESituation::Absent:
254+
return "A";
255+
case ESituation::Lost:
256+
return "L";
257+
case ESituation::Present:
258+
return "P";
259+
case ESituation::Sent:
260+
return "S";
261+
}
262+
// Reached only when `situation` matches none of the cases above.
// NOTE(review): Y_DEBUG_ABORT presumably fires only in debug builds, so the
// empty-string return below is the release-build fallback; confirm against the macro.
Y_DEBUG_ABORT("Unexpected situation# %" PRIu64, ui64(situation));
242263
return "";
243264
}
244265

@@ -258,6 +279,9 @@ TString TBlobState::TDiskPart::ToString() const {
258279
TStringStream str;
259280
str << "{Requested# " << Requested.ToString();
260281
str << " Situation# " << SituationToString(Situation);
282+
if (ErrorReason) {
283+
str << " ErrorReason# " << ErrorReason;
284+
}
261285
str << "}";
262286
return str.Str();
263287
}
@@ -280,6 +304,48 @@ TString TBlobState::TWholeState::ToString() const {
280304
return str.Str();
281305
}
282306

307+
// Builds a human-readable summary of the errors recorded for this blob.
// The result has two sections:
//   1. "[ ... ]": one entry per disk that has at least one non-empty ErrorReason,
//      listing its OrderNumber, VDiskId, NodeId and the collected error reasons.
//   2. " Part situations# [ ... ]": one entry per disk with its OrderNumber and the
//      one-letter situation code of each of its parts.
// `info` is used to resolve VDisk ids, subgroup indexes and actor ids.
TString TBlobState::ReportProblems(const TBlobStorageGroupInfo& info) const {
308+
// One bucket of error reasons per entry of Disks.
TStackVec<TStackVec<TString, 3>, TypicalDisksInSubring> errorsByDisk(Disks.size());
309+
for (const TDisk& disk : Disks) {
310+
for (const TDiskPart& part : disk.DiskParts) {
311+
// Only parts that carry a non-empty error reason are reported.
if (part.ErrorReason) {
312+
TVDiskID vdiskId = info.GetVDiskId(disk.OrderNumber);
313+
// The bucket is selected by the disk's index in the blob's subgroup.
ui32 diskIdx = info.GetIdxInSubgroup(vdiskId, Id.Hash());
314+
errorsByDisk[diskIdx].push_back(part.ErrorReason);
315+
}
316+
}
317+
}
318+
319+
TStringStream str;
320+
str << "[ ";
321+
for (ui32 diskIdx = 0; diskIdx < errorsByDisk.size(); ++diskIdx) {
322+
// Disks without any recorded error reason are omitted from the first section.
if (!errorsByDisk[diskIdx].empty()) {
323+
// NOTE(review): the bucket index (a subgroup index) is reused as an index into
// Disks; this assumes Disks is ordered by subgroup index. Confirm.
ui32 orderNumber = Disks[diskIdx].OrderNumber;
324+
str << "{ OrderNumber# " << orderNumber;
325+
str << " VDiskId# " << info.GetVDiskId(orderNumber);
326+
str << " NodeId# " << info.GetActorId(orderNumber).NodeId();
327+
str << " ErrorReasons# [ ";
328+
// Every reason is quoted and followed by ", ", including the last one.
for (const TString& errorReason : errorsByDisk[diskIdx]) {
329+
str << "\"" << errorReason << "\", ";
330+
}
331+
str << "] } ";
332+
}
333+
}
334+
str << "] ";
335+
336+
// Second section: situation codes for every part of every disk, errors or not.
str << " Part situations# [ ";
337+
for (const TDisk& disk : Disks) {
338+
str << "{ OrderNumber# " << disk.OrderNumber << " Situations# ";
339+
for (const TDiskPart& part : disk.DiskParts) {
340+
str << SituationToShortString(part.Situation);
341+
}
342+
str << " } ";
343+
}
344+
str << "] ";
345+
346+
return str.Str();
347+
}
348+
283349
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
284350
// TGroupDiskRequests
285351
//
@@ -359,11 +425,11 @@ void TBlackboard::AddNotYetResponse(const TLogoBlobID &id, ui32 orderNumber) {
359425
state.AddNotYetResponse(*Info, id, orderNumber);
360426
}
361427

362-
void TBlackboard::AddErrorResponse(const TLogoBlobID &id, ui32 orderNumber) {
428+
void TBlackboard::AddErrorResponse(const TLogoBlobID &id, ui32 orderNumber, const TString& errorReason) {
363429
Y_ABORT_UNLESS(bool(id));
364430
Y_ABORT_UNLESS(id.PartId() != 0);
365431
TBlobState &state = GetState(id);
366-
state.AddErrorResponse(*Info, id, orderNumber);
432+
state.AddErrorResponse(*Info, id, orderNumber, errorReason);
367433
}
368434

369435
EStrategyOutcome TBlackboard::RunStrategies(TLogContext &logCtx, const TStackVec<IStrategy*, 1>& s,

ydb/core/blobstorage/dsproxy/dsproxy_blackboard.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ struct TBlobState {
6565
struct TDiskPart {
6666
TIntervalSet<i32> Requested;
6767
ESituation Situation = ESituation::Unknown;
68+
TString ErrorReason;
6869

6970
TString ToString() const;
7071
};
@@ -93,16 +94,19 @@ struct TBlobState {
9394
ui32 shift, TRope&& data);
9495
void AddPutOkResponse(const TBlobStorageGroupInfo &info, const TLogoBlobID &id, ui32 orderNumber);
9596
void AddNoDataResponse(const TBlobStorageGroupInfo &info, const TLogoBlobID &id, ui32 diskIdxInSubring);
96-
void AddErrorResponse(const TBlobStorageGroupInfo &info, const TLogoBlobID &id, ui32 diskIdxInSubring);
97+
void AddErrorResponse(const TBlobStorageGroupInfo &info, const TLogoBlobID &id, ui32 diskIdxInSubring,
98+
const TString& errorReason);
9799
void AddNotYetResponse(const TBlobStorageGroupInfo &info, const TLogoBlobID &id, ui32 diskIdxInSubring);
98100
ui64 GetPredictedDelayNs(const TBlobStorageGroupInfo &info, TGroupQueues &groupQueues,
99101
ui32 diskIdxInSubring, NKikimrBlobStorage::EVDiskQueueId queueId) const;
100102
void GetWorstPredictedDelaysNs(const TBlobStorageGroupInfo &info, TGroupQueues &groupQueues,
101103
NKikimrBlobStorage::EVDiskQueueId queueId, TDiskDelayPredictions *outNWorst,
102104
const TAccelerationParams& accelerationParams) const;
103105
TString ToString() const;
106+
TString ReportProblems(const TBlobStorageGroupInfo& info) const;
104107
bool HasWrittenQuorum(const TBlobStorageGroupInfo& info, const TBlobStorageGroupInfo::TGroupVDisks& expired) const;
105108
static TString SituationToString(ESituation situation);
109+
static TString SituationToShortString(ESituation situation);
106110
};
107111

108112
struct TDiskGetRequest {
@@ -199,7 +203,7 @@ struct TBlackboard {
199203
void AddResponseData(const TLogoBlobID &id, ui32 orderNumber, ui32 shift, TRope&& data);
200204
void AddPutOkResponse(const TLogoBlobID &id, ui32 orderNumber);
201205
void AddNoDataResponse(const TLogoBlobID &id, ui32 orderNumber);
202-
void AddErrorResponse(const TLogoBlobID &id, ui32 orderNumber);
206+
void AddErrorResponse(const TLogoBlobID &id, ui32 orderNumber, const TString& errorReason);
203207
void AddNotYetResponse(const TLogoBlobID &id, ui32 orderNumber);
204208

205209
EStrategyOutcome RunStrategies(TLogContext& logCtx, const TStackVec<IStrategy*, 1>& strategies,

ydb/core/blobstorage/dsproxy/dsproxy_get_impl.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ void TGetImpl::OnVPutResult(TLogContext &logCtx, TEvBlobStorage::TEvVPutResult &
365365
case NKikimrProto::ERROR:
366366
case NKikimrProto::VDISK_ERROR_STATE:
367367
case NKikimrProto::OUT_OF_SPACE:
368-
Blackboard.AddErrorResponse(blob, orderNumber);
368+
Blackboard.AddErrorResponse(blob, orderNumber, record.GetErrorReason());
369369
break;
370370
case NKikimrProto::OK:
371371
case NKikimrProto::ALREADY:

ydb/core/blobstorage/dsproxy/dsproxy_get_impl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ class TGetImpl {
239239
|| replyStatus == NKikimrProto::CORRUPTED) {
240240
DSP_LOG_DEBUG_SX(logCtx, "BPG60", "Got# " << NKikimrProto::EReplyStatus_Name(replyStatus).data()
241241
<< " orderNumber# " << orderNumber << " vDiskId# " << vdisk.ToString());
242-
Blackboard.AddErrorResponse(blobId, orderNumber);
242+
Blackboard.AddErrorResponse(blobId, orderNumber, record.GetErrorReason());
243243
AtLeastOneResponseWasNotOk = true;
244244
} else if (replyStatus == NKikimrProto::NOT_YET) {
245245
DSP_LOG_DEBUG_SX(logCtx, "BPG67", "Got# NOT_YET orderNumber# " << orderNumber

ydb/core/blobstorage/dsproxy/dsproxy_put_impl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,7 @@ class TPutImpl {
327327
case NKikimrProto::ERROR:
328328
case NKikimrProto::VDISK_ERROR_STATE:
329329
case NKikimrProto::OUT_OF_SPACE:
330-
Blackboard.AddErrorResponse(blobId, orderNumber);
330+
Blackboard.AddErrorResponse(blobId, orderNumber, record.GetErrorReason());
331331
AtLeastOneResponseWasNotOk = true;
332332
break;
333333
case NKikimrProto::OK:

ydb/core/blobstorage/dsproxy/dsproxy_strategy_base.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159,12 +159,16 @@ std::optional<EStrategyOutcome> TStrategyBase::SetAbsentForUnrecoverableAltruist
159159
}
160160

161161
std::optional<EStrategyOutcome> TStrategyBase::ProcessOptimistic(TBlobStorageGroupInfo::EBlobState altruisticState,
162-
TBlobStorageGroupInfo::EBlobState optimisticState, bool isDryRun, TBlobState &state) {
162+
TBlobStorageGroupInfo::EBlobState optimisticState, bool isDryRun, TBlobState &state,
163+
const TBlobStorageGroupInfo& info) {
163164
switch (optimisticState) {
164165
case TBlobStorageGroupInfo::EBS_DISINTEGRATED:
165166
if (!isDryRun) {
166167
return EStrategyOutcome::Error(TStringBuilder() << "TStrategyBase saw optimisticState# "
167-
<< TBlobStorageGroupInfo::BlobStateToString(optimisticState));
168+
<< TBlobStorageGroupInfo::BlobStateToString(optimisticState)
169+
<< " GroupId# " << info.GroupID
170+
<< " BlobId# " << state.Id
171+
<< " Reported ErrorReasons# " << state.ReportProblems(info));
168172
}
169173
return EStrategyOutcome::DONE;
170174
case TBlobStorageGroupInfo::EBS_UNRECOVERABLE_FRAGMENTARY:

ydb/core/blobstorage/dsproxy/dsproxy_strategy_base.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ class TStrategyBase : public IStrategy {
2828
bool IsUnrecoverableAltruistic(TBlobStorageGroupInfo::EBlobState recoveryState);
2929
std::optional<EStrategyOutcome> SetAbsentForUnrecoverableAltruistic(TBlobStorageGroupInfo::EBlobState recoveryState, TBlobState &state);
3030
std::optional<EStrategyOutcome> ProcessOptimistic(TBlobStorageGroupInfo::EBlobState altruisticState,
31-
TBlobStorageGroupInfo::EBlobState optimisticState, bool isDryRun, TBlobState &state);
31+
TBlobStorageGroupInfo::EBlobState optimisticState, bool isDryRun, TBlobState &state,
32+
const TBlobStorageGroupInfo& info);
3233
std::optional<EStrategyOutcome> ProcessPessimistic(const TBlobStorageGroupInfo &info, TBlobStorageGroupInfo::EBlobState pessimisticState,
3334
bool doVerify, TBlobState &state);
3435
void AddGetRequest(TLogContext &logCtx, TGroupDiskRequests &groupDiskRequests, TLogoBlobID &fullId, ui32 partIdx,

ydb/core/blobstorage/dsproxy/dsproxy_strategy_get_bold.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ class TBoldStrategy : public TStrategyBase {
4141
EvaluateCurrentLayout(logCtx, state, info, &pessimisticState, &optimisticState, &altruisticState, false);
4242
if (auto res = SetAbsentForUnrecoverableAltruistic(altruisticState, state)) {
4343
return *res;
44-
} else if (auto res = ProcessOptimistic(altruisticState, optimisticState, false, state)) {
44+
} else if (auto res = ProcessOptimistic(altruisticState, optimisticState, false, state, info)) {
4545
return *res;
4646
} else if (auto res = ProcessPessimistic(info, pessimisticState, true, state)) {
4747
return *res;

ydb/core/blobstorage/dsproxy/dsproxy_strategy_get_min_iops_block.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ class TMinIopsBlockStrategy : public TStrategyBase {
303303

304304
if (auto res = SetAbsentForUnrecoverableAltruistic(altruisticState, state)) {
305305
return *res;
306-
} else if (auto res = ProcessOptimistic(altruisticState, optimisticState, false, state)) {
306+
} else if (auto res = ProcessOptimistic(altruisticState, optimisticState, false, state, info)) {
307307
return *res;
308308
} else if (auto res = ProcessPessimistic(info, pessimisticState, false, state)) {
309309
return *res;
@@ -320,7 +320,7 @@ class TMinIopsBlockStrategy : public TStrategyBase {
320320
EvaluateCurrentLayout(logCtx, state, info, &fastPessimisticState, &fastOptimisticState,
321321
&fastAltruisticState, true);
322322
if (!IsUnrecoverableAltruistic(fastAltruisticState)
323-
&& !ProcessOptimistic(fastAltruisticState, fastOptimisticState, true, state)) {
323+
&& !ProcessOptimistic(fastAltruisticState, fastOptimisticState, true, state, info)) {
324324
IssueGetRequests(logCtx, state, info, true, groupDiskRequests);
325325
isDone = true;
326326
}

ydb/core/blobstorage/dsproxy/dsproxy_strategy_get_min_iops_mirror.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ class TMinIopsMirrorStrategy : public TStrategyBase {
3232

3333
if (auto res = SetAbsentForUnrecoverableAltruistic(altruisticState, state)) {
3434
return *res;
35-
} else if (auto res = ProcessOptimistic(altruisticState, optimisticState, false, state)) {
35+
} else if (auto res = ProcessOptimistic(altruisticState, optimisticState, false, state, info)) {
3636
return *res;
3737
} else if (auto res = ProcessPessimistic(info, pessimisticState, false, state)) {
3838
return *res;

ydb/core/blobstorage/dsproxy/dsproxy_strategy_restore.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,15 @@ class TRestoreStrategy : public TStrategyBase {
6565
<< " optimisticState# " << TBlobStorageGroupInfo::BlobStateToString(*optimisticState));
6666
}
6767

68-
std::optional<EStrategyOutcome> SetErrorForUnrecoverableOptimistic(TBlobStorageGroupInfo::EBlobState optimisticState) {
68+
std::optional<EStrategyOutcome> SetErrorForUnrecoverableOptimistic(TBlobStorageGroupInfo::EBlobState optimisticState,
69+
TBlobState& state, const TBlobStorageGroupInfo& info) {
6970
switch (optimisticState) {
7071
case TBlobStorageGroupInfo::EBS_DISINTEGRATED:
72+
return EStrategyOutcome::Error(TStringBuilder() << "TRestoreStrategy saw optimisticState# "
73+
<< TBlobStorageGroupInfo::BlobStateToString(optimisticState)
74+
<< " GroupId# " << info.GroupID
75+
<< " BlobId# " << state.Id
76+
<< " Reported ErrorReasons# " << state.ReportProblems(info));
7177
case TBlobStorageGroupInfo::EBS_UNRECOVERABLE_FRAGMENTARY:
7278
case TBlobStorageGroupInfo::EBS_RECOVERABLE_FRAGMENTARY:
7379
case TBlobStorageGroupInfo::EBS_RECOVERABLE_DOUBTED:
@@ -122,7 +128,7 @@ class TRestoreStrategy : public TStrategyBase {
122128
// Look at the current layout and set the status if possible
123129
TBlobStorageGroupInfo::EBlobState optimisticState = TBlobStorageGroupInfo::EBS_DISINTEGRATED;
124130
EvaluateRestoreLayout(logCtx, state, info, &optimisticState);
125-
if (auto res = SetErrorForUnrecoverableOptimistic(optimisticState)) {
131+
if (auto res = SetErrorForUnrecoverableOptimistic(optimisticState, state, info)) {
126132
return *res;
127133
}
128134

ydb/core/blobstorage/dsproxy/ut_strategy/strategy_ut.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ class TGroupModel {
4848
Ctest << "orderNumber# " << get.OrderNumber << " get Id# " << get.Id;
4949
if (disk.InErrorState) {
5050
Ctest << " ERROR";
51-
blackboard.AddErrorResponse(get.Id, get.OrderNumber);
51+
blackboard.AddErrorResponse(get.Id, get.OrderNumber, "Disk in error state");
5252
} else if (auto it = disk.Blobs.find(get.Id); it == disk.Blobs.end()) {
5353
Ctest << " NODATA";
5454
blackboard.AddNoDataResponse(get.Id, get.OrderNumber);
@@ -76,7 +76,7 @@ class TGroupModel {
7676
Ctest << "orderNumber# " << put.OrderNumber << " put Id# " << put.Id;
7777
if (disk.InErrorState) {
7878
Ctest << " ERROR";
79-
blackboard.AddErrorResponse(put.Id, put.OrderNumber);
79+
blackboard.AddErrorResponse(put.Id, put.OrderNumber, "Disk in error state");
8080
} else {
8181
Ctest << " OK";
8282
disk.Blobs[put.Id] = std::move(put.Buffer);
@@ -132,12 +132,12 @@ void RunStrategyTest(TBlobStorageGroupType type) {
132132
TBlobStorageGroupInfo::TGroupVDisks diskMask = {&info.GetTopology(), info.GetVDiskId(orderNumber)};
133133
if (sureFailedDisks & diskMask) {
134134
if (RandomNumber(5u) == 0) {
135-
blackboard.AddErrorResponse(partId, orderNumber);
135+
blackboard.AddErrorResponse(partId, orderNumber, "Bad disk");
136136
}
137137
} else {
138138
switch (RandomNumber(100u)) {
139139
case 0:
140-
blackboard.AddErrorResponse(partId, orderNumber);
140+
blackboard.AddErrorResponse(partId, orderNumber, "Random failure");
141141
break;
142142

143143
case 1:
@@ -264,7 +264,7 @@ void RunTestLevel(const TBlobStorageGroupInfo& info, TBlackboard& blackboard,
264264
[&](const TGetQuery& op) {
265265
const ui32 idxInSubgroup = info.GetTopology().GetIdxInSubgroup(info.GetVDiskId(op.OrderNumber), id.Hash());
266266
if (nonWorkingDomain && idxInSubgroup % 3 == 2) {
267-
branch.AddErrorResponse(op.Id, op.OrderNumber);
267+
branch.AddErrorResponse(op.Id, op.OrderNumber, "Non-working domain");
268268
} else if (myPresenceMask.GetDisksWithPart(op.Id.PartId() - 1) >> idxInSubgroup & 1) {
269269
const ui32 blobSize = op.Id.BlobSize();
270270
const ui32 shift = Min(op.Shift, blobSize);
@@ -277,7 +277,7 @@ void RunTestLevel(const TBlobStorageGroupInfo& info, TBlackboard& blackboard,
277277
[&](const TPutQuery& op) {
278278
const ui32 idxInSubgroup = info.GetTopology().GetIdxInSubgroup(info.GetVDiskId(op.OrderNumber), id.Hash());
279279
if (nonWorkingDomain && idxInSubgroup % 3 == 2) {
280-
branch.AddErrorResponse(op.Id, op.OrderNumber);
280+
branch.AddErrorResponse(op.Id, op.OrderNumber, "Non-working domain");
281281
} else {
282282
myPresenceMask.AddItem(idxInSubgroup, op.Id.PartId() - 1, info.Type);
283283
branch.AddPutOkResponse(op.Id, op.OrderNumber);

0 commit comments

Comments
 (0)