Skip to content

Commit e0df3b7

Browse files
committed
Fix saving shard progress on error (#19448)
1 parent c3555cf commit e0df3b7

File tree

2 files changed

+43
-23
lines changed

2 files changed

+43
-23
lines changed

ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp

Lines changed: 38 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1608,7 +1608,14 @@ struct TTxShardReply: public TSchemeShard::TIndexBuilder::TTxReply {
16081608
}
16091609

16101610
if (buildInfo.State != TIndexBuildInfo::EState::Filling) {
1611-
LOG_I("TTxReply : " << TypeName<TEvResponse>() << " superfluous event, id# " << BuildId);
1611+
LOG_N("TTxReply : " << TypeName<TEvResponse>() << " superfluous state event, id# " << BuildId
1612+
<< ", TIndexBuildInfo: " << buildInfo);
1613+
return true;
1614+
}
1615+
1616+
if (!buildInfo.InProgressShards.contains(shardIdx)) {
1617+
LOG_N("TTxReply : " << TypeName<TEvResponse>() << " superfluous shard event, id# " << BuildId
1618+
<< ", TIndexBuildInfo: " << buildInfo);
16121619
return true;
16131620
}
16141621

@@ -1637,41 +1644,50 @@ struct TTxShardReply: public TSchemeShard::TIndexBuilder::TTxReply {
16371644
shardStatus.Status = record.GetStatus();
16381645

16391646
switch (shardStatus.Status) {
1640-
case NKikimrIndexBuilder::EBuildStatus::INVALID:
1647+
case NKikimrIndexBuilder::EBuildStatus::INVALID:
16411648
Y_ENSURE(false, "Unreachable");
1642-
case NKikimrIndexBuilder::EBuildStatus::ACCEPTED: // TODO: do we need ACCEPTED?
1643-
case NKikimrIndexBuilder::EBuildStatus::IN_PROGRESS:
1649+
case NKikimrIndexBuilder::EBuildStatus::ACCEPTED: // TODO: do we need ACCEPTED?
1650+
case NKikimrIndexBuilder::EBuildStatus::IN_PROGRESS: {
16441651
HandleProgress(shardStatus, buildInfo);
16451652
Self->PersistBuildIndexUploadProgress(db, BuildId, shardIdx, shardStatus);
1653+
// no progress
1654+
// no pipe close
16461655
return true;
1647-
case NKikimrIndexBuilder::EBuildStatus::DONE:
1648-
if (buildInfo.InProgressShards.erase(shardIdx)) {
1649-
HandleDone(db, buildInfo);
1650-
buildInfo.DoneShards.emplace_back(shardIdx);
1651-
}
1652-
break;
1653-
case NKikimrIndexBuilder::EBuildStatus::ABORTED:
1656+
}
1657+
case NKikimrIndexBuilder::EBuildStatus::DONE: {
1658+
bool erased = buildInfo.InProgressShards.erase(shardIdx);
1659+
Y_ENSURE(erased);
1660+
buildInfo.DoneShards.emplace_back(shardIdx);
1661+
HandleDone(db, buildInfo);
1662+
Self->PersistBuildIndexUploadProgress(db, BuildId, shardIdx, shardStatus);
1663+
Self->IndexBuildPipes.Close(BuildId, shardId, ctx);
1664+
Progress(BuildId);
1665+
return true;
1666+
}
1667+
case NKikimrIndexBuilder::EBuildStatus::ABORTED: {
16541668
// datashard gracefully rebooted, reschedule shard
1655-
if (buildInfo.InProgressShards.erase(shardIdx)) {
1656-
buildInfo.ToUploadShards.emplace_front(shardIdx);
1657-
}
1658-
break;
1659-
case NKikimrIndexBuilder::EBuildStatus::BUILD_ERROR:
1660-
case NKikimrIndexBuilder::EBuildStatus::BAD_REQUEST:
1669+
bool erased = buildInfo.InProgressShards.erase(shardIdx);
1670+
Y_ENSURE(erased);
1671+
buildInfo.ToUploadShards.emplace_front(shardIdx);
1672+
Self->PersistBuildIndexUploadProgress(db, BuildId, shardIdx, shardStatus);
1673+
Self->IndexBuildPipes.Close(BuildId, shardId, ctx);
1674+
Progress(BuildId);
1675+
return true;
1676+
}
1677+
case NKikimrIndexBuilder::EBuildStatus::BUILD_ERROR:
1678+
case NKikimrIndexBuilder::EBuildStatus::BAD_REQUEST: {
16611679
Self->PersistBuildIndexAddIssue(db, buildInfo, TStringBuilder()
16621680
<< "One of the shards report " << shardStatus.Status << " " << shardStatus.DebugMessage
16631681
<< " at Filling stage, process has to be canceled"
16641682
<< ", shardId: " << shardId
16651683
<< ", shardIdx: " << shardIdx);
1684+
Self->PersistBuildIndexUploadProgress(db, BuildId, shardIdx, shardStatus);
1685+
Self->IndexBuildPipes.Close(BuildId, shardId, ctx);
16661686
ChangeState(buildInfo.Id, TIndexBuildInfo::EState::Rejection_Applying);
16671687
Progress(BuildId);
16681688
return true;
16691689
}
1670-
Self->PersistBuildIndexUploadProgress(db, BuildId, shardIdx, shardStatus);
1671-
Self->IndexBuildPipes.Close(BuildId, shardId, ctx);
1672-
Progress(BuildId);
1673-
1674-
return true;
1690+
}
16751691
}
16761692

16771693
virtual void HandleProgress(TIndexBuildInfo::TShardStatus& shardStatus, TIndexBuildInfo& buildInfo) {

ydb/core/tx/schemeshard/ut_index_build/ut_vector_index_build.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -652,7 +652,7 @@ Y_UNIT_TEST_SUITE (VectorIndexBuildTest) {
652652
TTestEnv env(runtime);
653653
ui64 txId = 100;
654654

655-
// runtime.SetLogPriority(NKikimrServices::TX_DATASHARD, NLog::PRI_TRACE);
655+
runtime.SetLogPriority(NKikimrServices::TX_DATASHARD, NLog::PRI_TRACE);
656656
runtime.SetLogPriority(NKikimrServices::BUILD_INDEX, NLog::PRI_TRACE);
657657

658658
TestCreateTable(runtime, ++txId, "/MyRoot", R"(
@@ -701,24 +701,28 @@ Y_UNIT_TEST_SUITE (VectorIndexBuildTest) {
701701

702702
{
703703
auto buildIndexOperation = TestGetBuildIndex(runtime, TTestTxConfig::SchemeShard, "/MyRoot", buildIndexTx);
704+
Cout << "BuildIndex 1 " << buildIndexOperation.DebugString() << Endl;
704705
UNIT_ASSERT_VALUES_EQUAL_C(
705706
buildIndexOperation.GetIndexBuild().GetState(), Ydb::Table::IndexBuildState::STATE_REJECTED,
706707
buildIndexOperation.DebugString()
707708
);
708709
UNIT_ASSERT_STRING_CONTAINS(buildIndexOperation.DebugString(), "One of the shards report BUILD_ERROR");
709710
UNIT_ASSERT_STRING_CONTAINS(buildIndexOperation.DebugString(), "Error: Datashard test fail");
711+
UNIT_ASSERT_STRING_CONTAINS(buildIndexOperation.DebugString(), "Processed: { upload rows: 0, upload bytes: 0, read rows: 0, read bytes: 0 } }");
710712
}
711713

712714
RebootTablet(runtime, TTestTxConfig::SchemeShard, runtime.AllocateEdgeActor());
713715

714716
{
715717
auto buildIndexOperation = TestGetBuildIndex(runtime, TTestTxConfig::SchemeShard, "/MyRoot", buildIndexTx);
718+
Cout << "BuildIndex 2 " << buildIndexOperation.DebugString() << Endl;
716719
UNIT_ASSERT_VALUES_EQUAL_C(
717720
buildIndexOperation.GetIndexBuild().GetState(), Ydb::Table::IndexBuildState::STATE_REJECTED,
718721
buildIndexOperation.DebugString()
719722
);
720723
UNIT_ASSERT_STRING_CONTAINS(buildIndexOperation.DebugString(), "One of the shards report BUILD_ERROR");
721724
UNIT_ASSERT_STRING_CONTAINS(buildIndexOperation.DebugString(), "Error: Datashard test fail");
725+
UNIT_ASSERT_STRING_CONTAINS(buildIndexOperation.DebugString(), "Processed: { upload rows: 0, upload bytes: 0, read rows: 0, read bytes: 0 } }");
722726
}
723727
}
724728
}

0 commit comments

Comments
 (0)