Skip to content

Commit 4304de0

Browse files
committed
Handle unhandled exceptions during build index SchemeShard init (#19312)
1 parent 516183f commit 4304de0

9 files changed

+254
-122
lines changed

ydb/core/tx/schemeshard/schemeshard__init.cpp

Lines changed: 70 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -4516,6 +4516,36 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
45164516

45174517
// Read index build
45184518
{
4519+
// Runs `fill(buildInfo)` and converts any std::exception it throws into a
// "broken build" marker instead of letting the exception escape.
//   buildInfo - the index build record being populated; mutated by `fill`
//               and, on failure, flagged as broken.
//   stepName  - name of the load step, used only in the log line and issue text.
//   fill      - callable invoked as fill(buildInfo).
// Only exceptions derived from std::exception are caught here; anything else
// still propagates to the caller.
auto fillBuildInfoSafe = [&](TIndexBuildInfo& buildInfo, const TString& stepName, const auto& fill) {
4520+
try {
4521+
fill(buildInfo);
4522+
} catch (const std::exception& exc) {
4523+
// Log the step, build id, exception type/message, backtrace and the
// (possibly partially filled) build info.
LOG_ERROR_S(ctx, NKikimrServices::BUILD_INDEX,
4524+
"Init " << stepName << " unhandled exception, id#" << buildInfo.Id
4525+
<< " " << TypeName(exc) << ": " << exc.what() << Endl
4526+
<< TBackTrace::FromCurrentException().PrintToString()
4527+
<< ", TIndexBuildInfo: " << buildInfo);
4528+
4529+
// in-memory volatile state:
4530+
// (nothing is written to the database in this handler)
buildInfo.IsBroken = true;
4531+
buildInfo.AddIssue(TStringBuilder() << "Init " << stepName << " unhandled exception " << exc.what());
4532+
}
4533+
};
4534+
4535+
// Looks up the build with the given `id` in Self->IndexBuilds and, if it is
// present and not already marked broken, applies `fill` to it through
// fillBuildInfoSafe.
//   id       - index build identifier read from the current row.
//   stepName - name of the load step, used in log messages.
//   fill     - callable invoked as fill(buildInfo) for the found record.
// A missing record is logged and skipped; a record that is already broken is
// skipped silently.
auto fillBuildInfoByIdSafe = [&](TIndexBuildId id, const TString& stepName, const auto& fill) {
4536+
const auto* buildInfoPtr = Self->IndexBuilds.FindPtr(id);
4537+
// NOTE(review): presumably Y_ASSERT is active only in debug builds, which is
// why the explicit null check below is still needed - confirm.
Y_ASSERT(buildInfoPtr);
4538+
if (!buildInfoPtr) {
4539+
LOG_ERROR_S(ctx, NKikimrServices::BUILD_INDEX,
4540+
"Init " << stepName << " BuildInfo not found: id#" << id);
4541+
return;
4542+
}
4543+
auto& buildInfo = *buildInfoPtr->Get();
4544+
// Do not keep filling a record that an earlier step already marked broken.
if (!buildInfo.IsBroken) {
4545+
fillBuildInfoSafe(buildInfo, stepName, fill);
4546+
}
4547+
};
4548+
45194549
// read main info
45204550
{
45214551
auto rowset = db.Table<Schema::IndexBuild>().Range().Select();
@@ -4524,17 +4554,21 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
45244554
}
45254555

45264556
while (!rowset.EndOfSet()) {
4527-
TIndexBuildInfo::TPtr indexInfo = TIndexBuildInfo::FromRow(rowset);
4528-
4529-
auto [it, emplaced] = Self->IndexBuilds.emplace(indexInfo->Id, indexInfo);
4530-
Y_ABORT_UNLESS(emplaced);
4531-
if (indexInfo->Uid) {
4532-
// TODO(mbkkt) It also should be unique, but we're not sure.
4533-
Y_ASSERT(!Self->IndexBuildsByUid.contains(indexInfo->Uid));
4534-
Self->IndexBuildsByUid[indexInfo->Uid] = indexInfo;
4557+
TIndexBuildInfo::TPtr buildInfo = new TIndexBuildInfo();
4558+
fillBuildInfoSafe(*buildInfo, "IndexBuild", [&](TIndexBuildInfo& buildInfo) {
4559+
TIndexBuildInfo::FillFromRow(rowset, &buildInfo);
4560+
});
4561+
4562+
// Note: broken builds are also added to IndexBuilds
4563+
Y_ASSERT(!Self->IndexBuilds.contains(buildInfo->Id));
4564+
Self->IndexBuilds[buildInfo->Id] = buildInfo;
4565+
4566+
if (buildInfo->Uid) {
4567+
Y_ASSERT(!Self->IndexBuildsByUid.contains(buildInfo->Uid));
4568+
Self->IndexBuildsByUid[buildInfo->Uid] = buildInfo;
45354569
}
45364570

4537-
OnComplete.ToProgress(indexInfo->Id);
4571+
OnComplete.ToProgress(buildInfo->Id);
45384572

45394573
if (!rowset.Next()) {
45404574
return false;
@@ -4556,19 +4590,18 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
45564590

45574591
while (!rowset.EndOfSet()) {
45584592
TIndexBuildId id = rowset.GetValue<Schema::KMeansTreeProgress::Id>();
4559-
const auto* buildInfoPtr = Self->IndexBuilds.FindPtr(id);
4560-
Y_VERIFY_S(buildInfoPtr, "BuildIndex not found: id# " << id);
4561-
auto& buildInfo = *buildInfoPtr->Get();
4562-
buildInfo.KMeans.Set(
4563-
rowset.GetValue<Schema::KMeansTreeProgress::Level>(),
4564-
rowset.GetValue<Schema::KMeansTreeProgress::ParentBegin>(),
4565-
rowset.GetValue<Schema::KMeansTreeProgress::Parent>(),
4566-
rowset.GetValue<Schema::KMeansTreeProgress::ChildBegin>(),
4567-
rowset.GetValue<Schema::KMeansTreeProgress::Child>(),
4568-
rowset.GetValue<Schema::KMeansTreeProgress::State>(),
4569-
rowset.GetValue<Schema::KMeansTreeProgress::TableSize>()
4570-
);
4571-
buildInfo.Sample.Rows.reserve(buildInfo.KMeans.K * 2);
4593+
fillBuildInfoByIdSafe(id, "KMeansTreeProgress", [&](TIndexBuildInfo& buildInfo) {
4594+
buildInfo.KMeans.Set(
4595+
rowset.GetValue<Schema::KMeansTreeProgress::Level>(),
4596+
rowset.GetValue<Schema::KMeansTreeProgress::ParentBegin>(),
4597+
rowset.GetValue<Schema::KMeansTreeProgress::Parent>(),
4598+
rowset.GetValue<Schema::KMeansTreeProgress::ChildBegin>(),
4599+
rowset.GetValue<Schema::KMeansTreeProgress::Child>(),
4600+
rowset.GetValue<Schema::KMeansTreeProgress::State>(),
4601+
rowset.GetValue<Schema::KMeansTreeProgress::TableSize>()
4602+
);
4603+
buildInfo.Sample.Rows.reserve(buildInfo.KMeans.K * 2);
4604+
});
45724605

45734606
if (!rowset.Next()) {
45744607
return false;
@@ -4587,13 +4620,12 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
45874620
size_t sampleCount = 0;
45884621
while (!rowset.EndOfSet()) {
45894622
TIndexBuildId id = rowset.GetValue<Schema::KMeansTreeSample::Id>();
4590-
const auto* buildInfoPtr = Self->IndexBuilds.FindPtr(id);
4591-
Y_VERIFY_S(buildInfoPtr, "BuildIndex not found: id# " << id);
4592-
auto& buildInfo = *buildInfoPtr->Get();
4593-
buildInfo.Sample.Add(
4594-
rowset.GetValue<Schema::KMeansTreeSample::Probability>(),
4595-
rowset.GetValue<Schema::KMeansTreeSample::Data>()
4596-
);
4623+
fillBuildInfoByIdSafe(id, "KMeansTreeSample", [&](TIndexBuildInfo& buildInfo) {
4624+
buildInfo.Sample.Add(
4625+
rowset.GetValue<Schema::KMeansTreeSample::Probability>(),
4626+
rowset.GetValue<Schema::KMeansTreeSample::Data>()
4627+
);
4628+
});
45974629
sampleCount++;
45984630

45994631
if (!rowset.Next()) {
@@ -4615,11 +4647,9 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
46154647

46164648
while (!rowset.EndOfSet()) {
46174649
TIndexBuildId id = rowset.GetValue<Schema::IndexBuildColumns::Id>();
4618-
const auto* buildInfoPtr = Self->IndexBuilds.FindPtr(id);
4619-
Y_VERIFY_S(buildInfoPtr, "BuildIndex not found"
4620-
<< ": id# " << id);
4621-
auto& buildInfo = *buildInfoPtr->Get();
4622-
buildInfo.AddIndexColumnInfo(rowset);
4650+
fillBuildInfoByIdSafe(id, "IndexBuildColumns", [&](TIndexBuildInfo& buildInfo) {
4651+
buildInfo.AddIndexColumnInfo(rowset);
4652+
});
46234653

46244654
if (!rowset.Next()) {
46254655
return false;
@@ -4635,11 +4665,9 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
46354665

46364666
while (!rowset.EndOfSet()) {
46374667
TIndexBuildId id = rowset.GetValue<Schema::BuildColumnOperationSettings::Id>();
4638-
const auto* buildInfoPtr = Self->IndexBuilds.FindPtr(id);
4639-
Y_VERIFY_S(buildInfoPtr, "BuildIndex not found"
4640-
<< ": id# " << id);
4641-
auto& buildInfo = *buildInfoPtr->Get();
4642-
buildInfo.AddBuildColumnInfo(rowset);
4668+
fillBuildInfoByIdSafe(id, "BuildColumnOperationSettings", [&](TIndexBuildInfo& buildInfo) {
4669+
buildInfo.AddBuildColumnInfo(rowset);
4670+
});
46434671

46444672
if (!rowset.Next()) {
46454673
return false;
@@ -4656,11 +4684,9 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
46564684

46574685
while (!rowset.EndOfSet()) {
46584686
TIndexBuildId id = rowset.GetValue<Schema::IndexBuildShardStatus::Id>();
4659-
const auto* buildInfoPtr = Self->IndexBuilds.FindPtr(id);
4660-
Y_VERIFY_S(buildInfoPtr, "BuildIndex not found"
4661-
<< ": id# " << id);
4662-
auto& buildInfo = *buildInfoPtr->Get();
4663-
buildInfo.AddShardStatus(rowset);
4687+
fillBuildInfoByIdSafe(id, "IndexBuildShardStatus", [&](TIndexBuildInfo& buildInfo) {
4688+
buildInfo.AddShardStatus(rowset);
4689+
});
46644690

46654691
if (!rowset.Next()) {
46664692
return false;

ydb/core/tx/schemeshard/schemeshard__monitoring.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -818,7 +818,8 @@ struct TSchemeShard::TTxMonitoring : public NTabletFlatExecutor::TTransactionBas
818818
<< "CancelRequested: " << (info.CancelRequested ? "YES" : "NO") << Endl
819819

820820
<< "State: " << info.State << Endl
821-
<< "Issue: " << info.Issue << Endl
821+
<< "IsBroken: " << info.IsBroken << Endl
822+
<< "Issue: " << info.GetIssue() << Endl
822823

823824
<< "Shards.size: " << info.Shards.size() << Endl
824825
<< "ToUploadShards.size: " << info.ToUploadShards.size() << Endl

ydb/core/tx/schemeshard/schemeshard_build_index.cpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ void TSchemeShard::Handle(TEvPrivate::TEvIndexBuildingMakeABill::TPtr& ev, const
5353
}
5454

5555
void TSchemeShard::PersistCreateBuildIndex(NIceDb::TNiceDb& db, const TIndexBuildInfo& info) {
56-
Y_ABORT_UNLESS(info.BuildKind != TIndexBuildInfo::EBuildKind::BuildKindUnspecified);
56+
Y_ENSURE(info.BuildKind != TIndexBuildInfo::EBuildKind::BuildKindUnspecified);
5757
auto persistedBuildIndex = db.Table<Schema::IndexBuild>().Key(info.Id);
5858
persistedBuildIndex.Update(
5959
NIceDb::TUpdate<Schema::IndexBuild::Uid>(info.Uid),
@@ -126,7 +126,7 @@ void TSchemeShard::PersistCreateBuildIndex(NIceDb::TNiceDb& db, const TIndexBuil
126126
void TSchemeShard::PersistBuildIndexState(NIceDb::TNiceDb& db, const TIndexBuildInfo& indexInfo) {
127127
db.Table<Schema::IndexBuild>().Key(indexInfo.Id).Update(
128128
NIceDb::TUpdate<Schema::IndexBuild::State>(ui32(indexInfo.State)),
129-
NIceDb::TUpdate<Schema::IndexBuild::Issue>(indexInfo.Issue),
129+
NIceDb::TUpdate<Schema::IndexBuild::Issue>(indexInfo.GetIssue()),
130130
NIceDb::TUpdate<Schema::IndexBuild::StartTime>(indexInfo.StartTime.Seconds()),
131131
NIceDb::TUpdate<Schema::IndexBuild::EndTime>(indexInfo.EndTime.Seconds())
132132
);
@@ -139,7 +139,7 @@ void TSchemeShard::PersistBuildIndexCancelRequest(NIceDb::TNiceDb& db, const TIn
139139

140140
void TSchemeShard::PersistBuildIndexIssue(NIceDb::TNiceDb& db, const TIndexBuildInfo& indexInfo) {
141141
db.Table<Schema::IndexBuild>().Key(indexInfo.Id).Update(
142-
NIceDb::TUpdate<Schema::IndexBuild::Issue>(indexInfo.Issue));
142+
NIceDb::TUpdate<Schema::IndexBuild::Issue>(indexInfo.GetIssue()));
143143
}
144144

145145
void TSchemeShard::PersistBuildIndexAlterMainTableTxId(NIceDb::TNiceDb& db, const TIndexBuildInfo& indexInfo) {
@@ -314,9 +314,12 @@ void TSchemeShard::PersistBuildIndexForget(NIceDb::TNiceDb& db, const TIndexBuil
314314

315315
void TSchemeShard::Resume(const TDeque<TIndexBuildId>& indexIds, const TActorContext& ctx) {
316316
for (const auto& id : indexIds) {
317-
if (IndexBuilds.contains(id)) {
318-
Execute(CreateTxProgress(id), ctx);
317+
const auto* buildInfoPtr = IndexBuilds.FindPtr(id);
318+
if (!buildInfoPtr || buildInfoPtr->Get()->IsBroken) {
319+
continue;
319320
}
321+
322+
Execute(CreateTxProgress(id), ctx);
320323
}
321324
}
322325

@@ -331,7 +334,7 @@ void TSchemeShard::SetupRouting(const TDeque<TIndexBuildId>& indexIds, const TAc
331334
auto handle = [&] (auto txId) {
332335
if (txId) {
333336
auto [it, emplaced] = TxIdToIndexBuilds.try_emplace(txId, buildInfo.Id);
334-
Y_ABORT_UNLESS(it->second == buildInfo.Id);
337+
Y_ENSURE(it->second == buildInfo.Id);
335338
}
336339
};
337340

ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,9 @@ class TSchemeShard::TIndexBuilder::TTxCreate: public TSchemeShard::TIndexBuilder
8282
}
8383
}
8484

85-
TIndexBuildInfo::TPtr buildInfo = new TIndexBuildInfo(BuildId, uid);
85+
TIndexBuildInfo::TPtr buildInfo = new TIndexBuildInfo();
86+
buildInfo->Id = BuildId;
87+
buildInfo->Uid = uid;
8688
buildInfo->DomainPathId = domainPath.Base()->PathId;
8789
buildInfo->TablePathId = tablePath.Base()->PathId;
8890

0 commit comments

Comments
 (0)