Skip to content

Commit c1cd33f

Browse files
committed
Save shard error message (#19388)
1 parent 4304de0 commit c1cd33f

File tree

4 files changed: +98 -31 lines changed

4 files changed: +98 -31 lines changed

ydb/core/tx/schemeshard/schemeshard_build_index.cpp

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -137,9 +137,11 @@ void TSchemeShard::PersistBuildIndexCancelRequest(NIceDb::TNiceDb& db, const TIn
137137
NIceDb::TUpdate<Schema::IndexBuild::CancelRequest>(indexInfo.CancelRequested));
138138
}
139139

140-
void TSchemeShard::PersistBuildIndexIssue(NIceDb::TNiceDb& db, const TIndexBuildInfo& indexInfo) {
141-
db.Table<Schema::IndexBuild>().Key(indexInfo.Id).Update(
142-
NIceDb::TUpdate<Schema::IndexBuild::Issue>(indexInfo.GetIssue()));
140+
void TSchemeShard::PersistBuildIndexAddIssue(NIceDb::TNiceDb& db, TIndexBuildInfo& indexInfo, const TString& issue) {
141+
if (indexInfo.AddIssue(issue)) {
142+
db.Table<Schema::IndexBuild>().Key(indexInfo.Id).Update(
143+
NIceDb::TUpdate<Schema::IndexBuild::Issue>(indexInfo.GetIssue()));
144+
}
143145
}
144146

145147
void TSchemeShard::PersistBuildIndexAlterMainTableTxId(NIceDb::TNiceDb& db, const TIndexBuildInfo& indexInfo) {

ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp

Lines changed: 17 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1359,9 +1359,8 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil
13591359
}
13601360

13611361
NIceDb::TNiceDb db(txc.DB);
1362-
if (buildInfo->AddIssue(TStringBuilder() << "Unhandled exception " << exc.what())) {
1363-
Self->PersistBuildIndexIssue(db, *buildInfo);
1364-
}
1362+
Self->PersistBuildIndexAddIssue(db, *buildInfo,
1363+
TStringBuilder() << "Unhandled exception " << exc.what());
13651364

13661365
if (buildInfo->State != TIndexBuildInfo::EState::Filling) {
13671366
// no idea how to gracefully stop index build otherwise
@@ -1500,9 +1499,8 @@ struct TSchemeShard::TIndexBuilder::TTxReply: public TSchemeShard::TIndexBuilder
15001499
}
15011500

15021501
NIceDb::TNiceDb db(txc.DB);
1503-
if (buildInfo->AddIssue(TStringBuilder() << "Unhandled exception " << exc.what())) {
1504-
Self->PersistBuildIndexIssue(db, *buildInfo);
1505-
}
1502+
Self->PersistBuildIndexAddIssue(db, *buildInfo,
1503+
TStringBuilder() << "Unhandled exception " << exc.what());
15061504

15071505
if (buildInfo->State != TIndexBuildInfo::EState::Filling) {
15081506
// most replies are used at Filling stage
@@ -1677,12 +1675,11 @@ struct TSchemeShard::TIndexBuilder::TTxReplySampleK: public TSchemeShard::TIndex
16771675
break;
16781676
case NKikimrIndexBuilder::EBuildStatus::BUILD_ERROR:
16791677
case NKikimrIndexBuilder::EBuildStatus::BAD_REQUEST:
1680-
buildInfo.AddIssue(TStringBuilder()
1681-
<< "One of the shards report " << shardStatus.Status
1678+
Self->PersistBuildIndexAddIssue(db, buildInfo, TStringBuilder()
1679+
<< "One of the shards report " << shardStatus.Status << " " << shardStatus.DebugMessage
16821680
<< " at Filling stage, process has to be canceled"
16831681
<< ", shardId: " << shardId
16841682
<< ", shardIdx: " << shardIdx);
1685-
Self->PersistBuildIndexIssue(db, buildInfo);
16861683
ChangeState(buildInfo.Id, TIndexBuildInfo::EState::Rejection_Applying);
16871684
Progress(BuildId);
16881685
return true;
@@ -1778,12 +1775,11 @@ struct TSchemeShard::TIndexBuilder::TTxReplyLocalKMeans: public TSchemeShard::TI
17781775
break;
17791776
case NKikimrIndexBuilder::EBuildStatus::BUILD_ERROR:
17801777
case NKikimrIndexBuilder::EBuildStatus::BAD_REQUEST:
1781-
buildInfo.AddIssue(TStringBuilder()
1782-
<< "One of the shards report " << shardStatus.Status
1778+
Self->PersistBuildIndexAddIssue(db, buildInfo, TStringBuilder()
1779+
<< "One of the shards report " << shardStatus.Status << " " << shardStatus.DebugMessage
17831780
<< " at Filling stage, process has to be canceled"
17841781
<< ", shardId: " << shardId
17851782
<< ", shardIdx: " << shardIdx);
1786-
Self->PersistBuildIndexIssue(db, buildInfo);
17871783
ChangeState(buildInfo.Id, TIndexBuildInfo::EState::Rejection_Applying);
17881784
Progress(BuildId);
17891785
return true;
@@ -1879,12 +1875,11 @@ struct TSchemeShard::TIndexBuilder::TTxReplyReshuffleKMeans: public TSchemeShard
18791875
break;
18801876
case NKikimrIndexBuilder::EBuildStatus::BUILD_ERROR:
18811877
case NKikimrIndexBuilder::EBuildStatus::BAD_REQUEST:
1882-
buildInfo.AddIssue(TStringBuilder()
1883-
<< "One of the shards report " << shardStatus.Status
1878+
Self->PersistBuildIndexAddIssue(db, buildInfo, TStringBuilder()
1879+
<< "One of the shards report " << shardStatus.Status << " " << shardStatus.DebugMessage
18841880
<< " at Filling stage, process has to be canceled"
18851881
<< ", shardId: " << shardId
18861882
<< ", shardIdx: " << shardIdx);
1887-
Self->PersistBuildIndexIssue(db, buildInfo);
18881883
ChangeState(buildInfo.Id, TIndexBuildInfo::EState::Rejection_Applying);
18891884
Progress(BuildId);
18901885
return true;
@@ -1980,12 +1975,11 @@ struct TSchemeShard::TIndexBuilder::TTxReplyPrefixKMeans: public TSchemeShard::T
19801975
break;
19811976
case NKikimrIndexBuilder::EBuildStatus::BUILD_ERROR:
19821977
case NKikimrIndexBuilder::EBuildStatus::BAD_REQUEST:
1983-
buildInfo.AddIssue(TStringBuilder()
1984-
<< "One of the shards report " << shardStatus.Status
1978+
Self->PersistBuildIndexAddIssue(db, buildInfo, TStringBuilder()
1979+
<< "One of the shards report " << shardStatus.Status << " " << shardStatus.DebugMessage
19851980
<< " at Filling stage, process has to be canceled"
19861981
<< ", shardId: " << shardId
19871982
<< ", shardIdx: " << shardIdx);
1988-
Self->PersistBuildIndexIssue(db, buildInfo);
19891983
ChangeState(buildInfo.Id, TIndexBuildInfo::EState::Rejection_Applying);
19901984
Progress(BuildId);
19911985
return true;
@@ -2053,8 +2047,7 @@ struct TSchemeShard::TIndexBuilder::TTxReplyUploadSample: public TSchemeShard::T
20532047
} else {
20542048
NYql::TIssues issues;
20552049
NYql::IssuesFromMessage(record.GetIssues(), issues);
2056-
buildInfo.AddIssue(issues.ToString());
2057-
Self->PersistBuildIndexIssue(db, buildInfo);
2050+
Self->PersistBuildIndexAddIssue(db, buildInfo, issues.ToString());
20582051
ChangeState(buildInfo.Id, TIndexBuildInfo::EState::Rejection_Applying);
20592052
Progress(BuildId);
20602053
}
@@ -2176,12 +2169,11 @@ struct TSchemeShard::TIndexBuilder::TTxReplyProgress: public TSchemeShard::TInde
21762169
break;
21772170
case NKikimrIndexBuilder::EBuildStatus::BUILD_ERROR:
21782171
case NKikimrIndexBuilder::EBuildStatus::BAD_REQUEST:
2179-
buildInfo.AddIssue(TStringBuilder()
2180-
<< "One of the shards report " << shardStatus.Status
2172+
Self->PersistBuildIndexAddIssue(db, buildInfo, TStringBuilder()
2173+
<< "One of the shards report " << shardStatus.Status << " " << shardStatus.DebugMessage
21812174
<< " at Filling stage, process has to be canceled"
21822175
<< ", shardId: " << shardId
21832176
<< ", shardIdx: " << shardIdx);
2184-
Self->PersistBuildIndexIssue(db, buildInfo);
21852177
ChangeState(buildInfo.Id, TIndexBuildInfo::EState::Rejection_Applying);
21862178
Progress(BuildId);
21872179
return true;
@@ -2354,11 +2346,10 @@ struct TSchemeShard::TIndexBuilder::TTxReplyModify: public TSchemeShard::TIndexB
23542346
auto statusCode = TranslateStatusCode(record.GetStatus());
23552347

23562348
if (statusCode != Ydb::StatusIds::SUCCESS) {
2357-
buildInfo.AddIssue(TStringBuilder()
2349+
Self->PersistBuildIndexAddIssue(db, buildInfo, TStringBuilder()
23582350
<< "At " << state << " state got unsuccess propose result"
23592351
<< ", status: " << NKikimrScheme::EStatus_Name(record.GetStatus())
23602352
<< ", reason: " << record.GetReason());
2361-
Self->PersistBuildIndexIssue(db, buildInfo);
23622353
Self->PersistBuildIndexForget(db, buildInfo);
23632354
EraseBuildInfo(buildInfo);
23642355
}
@@ -2373,11 +2364,10 @@ struct TSchemeShard::TIndexBuilder::TTxReplyModify: public TSchemeShard::TIndexB
23732364
Y_ENSURE(false, "NEED MORE TESTING");
23742365
// no op
23752366
} else {
2376-
buildInfo.AddIssue(TStringBuilder()
2367+
Self->PersistBuildIndexAddIssue(db, buildInfo, TStringBuilder()
23772368
<< "At " << state << " state got unsuccess propose result"
23782369
<< ", status: " << NKikimrScheme::EStatus_Name(record.GetStatus())
23792370
<< ", reason: " << record.GetReason());
2380-
Self->PersistBuildIndexIssue(db, buildInfo);
23812371
ChangeState(buildInfo.Id, to);
23822372
}
23832373
};

ydb/core/tx/schemeshard/schemeshard_impl.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1379,7 +1379,7 @@ class TSchemeShard
13791379

13801380
void PersistCreateBuildIndex(NIceDb::TNiceDb& db, const TIndexBuildInfo& indexInfo);
13811381
void PersistBuildIndexState(NIceDb::TNiceDb& db, const TIndexBuildInfo& indexInfo);
1382-
void PersistBuildIndexIssue(NIceDb::TNiceDb& db, const TIndexBuildInfo& indexInfo);
1382+
void PersistBuildIndexAddIssue(NIceDb::TNiceDb& db, TIndexBuildInfo& indexInfo, const TString& issue);
13831383
void PersistBuildIndexCancelRequest(NIceDb::TNiceDb& db, const TIndexBuildInfo& indexInfo);
13841384

13851385
void PersistBuildIndexAlterMainTableTxId(NIceDb::TNiceDb& db, const TIndexBuildInfo& indexInfo);

ydb/core/tx/schemeshard/ut_index_build/ut_vector_index_build.cpp

Lines changed: 75 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -646,4 +646,79 @@ Y_UNIT_TEST_SUITE (VectorIndexBuildTest) {
646646
UNIT_ASSERT_STRING_CONTAINS(buildIndexOperation.DebugString(), "Condition violated: `creationConfig.ParseFromString");
647647
}
648648
}
649+
650+
Y_UNIT_TEST(Shard_Build_Error) {
651+
TTestBasicRuntime runtime;
652+
TTestEnv env(runtime);
653+
ui64 txId = 100;
654+
655+
// runtime.SetLogPriority(NKikimrServices::TX_DATASHARD, NLog::PRI_TRACE);
656+
runtime.SetLogPriority(NKikimrServices::BUILD_INDEX, NLog::PRI_TRACE);
657+
658+
TestCreateTable(runtime, ++txId, "/MyRoot", R"(
659+
Name: "vectors"
660+
Columns { Name: "id" Type: "Uint64" }
661+
Columns { Name: "embedding" Type: "String" }
662+
KeyColumnNames: [ "id" ]
663+
)");
664+
env.TestWaitNotification(runtime, txId);
665+
666+
NYdb::NTable::TGlobalIndexSettings globalIndexSettings;
667+
668+
std::unique_ptr<NYdb::NTable::TKMeansTreeSettings> kmeansTreeSettings;
669+
{
670+
Ydb::Table::KMeansTreeSettings proto;
671+
UNIT_ASSERT(google::protobuf::TextFormat::ParseFromString(R"(
672+
settings {
673+
metric: DISTANCE_COSINE
674+
vector_type: VECTOR_TYPE_FLOAT
675+
vector_dimension: 1024
676+
}
677+
levels: 5
678+
clusters: 4
679+
)", &proto));
680+
using T = NYdb::NTable::TKMeansTreeSettings;
681+
kmeansTreeSettings = std::make_unique<T>(T::FromProto(proto));
682+
}
683+
684+
TBlockEvents<TEvDataShard::TEvLocalKMeansResponse> blocked(runtime, [&](auto& ev) {
685+
ev->Get()->Record.SetStatus(NKikimrIndexBuilder::EBuildStatus::BUILD_ERROR);
686+
auto issue = ev->Get()->Record.AddIssues();
687+
issue->set_severity(NYql::TSeverityIds::S_ERROR);
688+
issue->set_message("Datashard test fail");
689+
return true;
690+
});
691+
692+
const ui64 buildIndexTx = ++txId;
693+
const TVector<TString> dataColumns;
694+
const TVector<TString> indexColumns{"embedding"};
695+
AsyncBuildVectorIndex(runtime, buildIndexTx, TTestTxConfig::SchemeShard, "/MyRoot", "/MyRoot/vectors", "index1", "embedding");
696+
697+
runtime.WaitFor("block", [&]{ return blocked.size(); });
698+
blocked.Stop().Unblock();
699+
700+
env.TestWaitNotification(runtime, buildIndexTx);
701+
702+
{
703+
auto buildIndexOperation = TestGetBuildIndex(runtime, TTestTxConfig::SchemeShard, "/MyRoot", buildIndexTx);
704+
UNIT_ASSERT_VALUES_EQUAL_C(
705+
buildIndexOperation.GetIndexBuild().GetState(), Ydb::Table::IndexBuildState::STATE_REJECTED,
706+
buildIndexOperation.DebugString()
707+
);
708+
UNIT_ASSERT_STRING_CONTAINS(buildIndexOperation.DebugString(), "One of the shards report BUILD_ERROR");
709+
UNIT_ASSERT_STRING_CONTAINS(buildIndexOperation.DebugString(), "Error: Datashard test fail");
710+
}
711+
712+
RebootTablet(runtime, TTestTxConfig::SchemeShard, runtime.AllocateEdgeActor());
713+
714+
{
715+
auto buildIndexOperation = TestGetBuildIndex(runtime, TTestTxConfig::SchemeShard, "/MyRoot", buildIndexTx);
716+
UNIT_ASSERT_VALUES_EQUAL_C(
717+
buildIndexOperation.GetIndexBuild().GetState(), Ydb::Table::IndexBuildState::STATE_REJECTED,
718+
buildIndexOperation.DebugString()
719+
);
720+
UNIT_ASSERT_STRING_CONTAINS(buildIndexOperation.DebugString(), "One of the shards report BUILD_ERROR");
721+
UNIT_ASSERT_STRING_CONTAINS(buildIndexOperation.DebugString(), "Error: Datashard test fail");
722+
}
723+
}
649724
}

0 commit comments

Comments
 (0)