Skip to content

Commit 88c2372

Browse files
committed
add tablet issues to shared db health check (#15609)
1 parent 4b8e06e commit 88c2372

File tree

2 files changed

+139
-23
lines changed

2 files changed

+139
-23
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1403,10 +1403,13 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
14031403
FilterDomainKey[TSubDomainKey(domainInfo->DomainKey.OwnerId, domainInfo->DomainKey.LocalPathId)] = path;
14041404

14051405
TTabletId hiveId = domainInfo->Params.GetHive();
1406-
if (hiveId && NeedToAskHive(hiveId)) {
1406+
if (hiveId) {
14071407
DatabaseState[path].HiveId = hiveId;
1408-
AskHive(path, hiveId);
1408+
if (NeedToAskHive(hiveId)) {
1409+
AskHive(path, hiveId);
1410+
}
14091411
} else if (RootHiveId && NeedToAskHive(RootHiveId)) {
1412+
DatabaseState[DomainPath].HiveId = RootHiveId;
14101413
AskHive(DomainPath, RootHiveId);
14111414
}
14121415

@@ -1501,22 +1504,30 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15011504

15021505
void AggregateHiveInfo() {
15031506
TNodeTabletState::TTabletStateSettings settings;
1504-
for (const auto& [hiveId, hiveResponse] : HiveInfo) {
1507+
for (auto& [dbPath, dbState] : DatabaseState) {
1508+
const auto& hiveResponse = HiveInfo[dbState.HiveId];
15051509
if (hiveResponse.IsOk()) {
15061510
settings.AliveBarrier = TInstant::MilliSeconds(hiveResponse->Record.GetResponseTimestamp()) - TDuration::Minutes(5);
15071511
for (const NKikimrHive::TTabletInfo& hiveTablet : hiveResponse->Record.GetTablets()) {
15081512
TSubDomainKey tenantId = TSubDomainKey(hiveTablet.GetObjectDomain());
15091513
auto itDomain = FilterDomainKey.find(tenantId);
1514+
TDatabaseState* database = nullptr;
15101515
if (itDomain == FilterDomainKey.end()) {
1511-
continue;
1512-
}
1513-
auto itDatabase = DatabaseState.find(itDomain->second);
1514-
if (itDatabase == DatabaseState.end()) {
1515-
continue;
1516+
if (!FilterDatabase || FilterDatabase == dbPath) {
1517+
database = &dbState;
1518+
} else {
1519+
continue;
1520+
}
1521+
} else {
1522+
auto itDatabase = DatabaseState.find(itDomain->second);
1523+
if (itDatabase != DatabaseState.end()) {
1524+
database = &itDatabase->second;
1525+
} else {
1526+
continue;
1527+
}
15161528
}
1517-
TDatabaseState& database = itDatabase->second;
15181529
auto tabletId = std::make_pair(hiveTablet.GetTabletID(), hiveTablet.GetFollowerID());
1519-
database.MergedTabletState.emplace(tabletId, &hiveTablet);
1530+
database->MergedTabletState.emplace(tabletId, &hiveTablet);
15201531
TNodeId nodeId = hiveTablet.GetNodeID();
15211532
switch (hiveTablet.GetVolatileState()) {
15221533
case NKikimrHive::ETabletVolatileState::TABLET_VOLATILE_STATE_STARTING:
@@ -1526,7 +1537,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15261537
nodeId = 0;
15271538
break;
15281539
}
1529-
database.MergedNodeTabletState[nodeId].AddTablet(hiveTablet, settings);
1540+
database->MergedNodeTabletState[nodeId].AddTablet(hiveTablet, settings);
15301541
}
15311542
}
15321543
}

ydb/core/health_check/health_check_ut.cpp

Lines changed: 117 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,15 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
445445
UNIT_ASSERT_VALUES_EQUAL(issueVdiscCount, issueVdiscNumber);
446446
}
447447

448+
// Returns true when the self-check result contains at least one issue_log
// entry whose level() is 4 and whose type() is "TABLET"; returns false when no
// entry matches (including when issue_log is empty).
// NOTE(review): level 4 is presumably the depth at which TABLET issues sit in
// the health check issue hierarchy — confirm against the issue tree.
bool HasTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) {
449+
for (const auto& issue_log : result.issue_log()) {
450+
if (issue_log.level() == 4 && issue_log.type() == "TABLET") {
451+
return true;
452+
}
453+
}
454+
return false;
455+
}
456+
448457
void ListingTest(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false) {
449458
auto result = RequestHc(groupNumber, vdiscPerGroupNumber, isMergeRecords);
450459
CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords);
@@ -864,6 +873,15 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
864873
}
865874
}
866875

876+
// Appends one synthetic tablet entry to the Tablets list of the Hive info
// response carried by *ev. The entry gets tablet id 1, an object domain built
// from SERVERLESS_DOMAIN_KEY (OwnerId -> SchemeShard, LocalPathId -> PathId)
// and RestartsPerPeriod set to 500.
// NOTE(review): 500 restarts is presumably above the threshold at which the
// health check reports a tablet issue — confirm against the tablet state
// settings used by the health check.
void AddBadServerlessTablet(TEvHive::TEvResponseHiveInfo::TPtr* ev) {
877+
auto &record = (*ev)->Get()->Record;
878+
// Add() appends a new element to the repeated Tablets field and returns it.
auto* tablet = record.MutableTablets()->Add();
879+
tablet->SetTabletID(1);
880+
tablet->MutableObjectDomain()->SetSchemeShard(SERVERLESS_DOMAIN_KEY.OwnerId);
881+
tablet->MutableObjectDomain()->SetPathId(SERVERLESS_DOMAIN_KEY.LocalPathId);
882+
tablet->SetRestartsPerPeriod(500);
883+
}
884+
867885
Y_UNIT_TEST(SpecificServerless) {
868886
TPortManager tp;
869887
ui16 port = tp.GetPort(2134);
@@ -1163,6 +1181,102 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
11631181
UNIT_ASSERT(!databaseFoundInResult);
11641182
}
11651183

1184+
// Starts a test server with one static and one dynamic node, installs an
// observer that rewrites console, scheme cache, hive, scheme shard, BS
// controller and sys view responses on the fly, then sends a verbose
// self-check request and asserts that the result contains a tablet issue
// (see HasTabletIssue). The hive info response is augmented with a tablet
// attributed to the serverless domain via AddBadServerlessTablet.
Y_UNIT_TEST(ServerlessBadTablets) {
1185+
TPortManager tp;
1186+
ui16 port = tp.GetPort(2134);
1187+
ui16 grpcPort = tp.GetPort(2135);
1188+
auto settings = TServerSettings(port)
1189+
.SetNodeCount(1)
1190+
.SetDynamicNodeCount(1)
1191+
.SetUseRealThreads(false)
1192+
.SetDomainName("Root");
1193+
TServer server(settings);
1194+
server.EnableGRpc(grpcPort);
1195+
TClient client(settings);
1196+
TTestActorRuntime& runtime = *server.GetRuntime();
1197+
1198+
// Align the static/dynamic node id boundaries in the name service config
// with the node ids the runtime actually assigned.
auto &dynamicNameserviceConfig = runtime.GetAppData().DynamicNameserviceConfig;
1199+
dynamicNameserviceConfig->MaxStaticNodeId = runtime.GetNodeId(server.StaticNodes() - 1);
1200+
dynamicNameserviceConfig->MinDynamicNodeId = runtime.GetNodeId(server.StaticNodes());
1201+
dynamicNameserviceConfig->MaxDynamicNodeId = runtime.GetNodeId(server.StaticNodes() + server.DynamicNodes() - 1);
1202+
1203+
// Node at runtime index 1; passed to ChangeResponseHiveNodeStats below.
ui32 sharedDynNodeId = runtime.GetNodeId(1);
1204+
1205+
// Distinguishes the first tenant status response from later ones: the
// first is rewritten for "/Root/shared", every later one for
// "/Root/serverless".
bool firstConsoleResponse = true;
1206+
// Observer: patches selected responses in place, dispatching on event type.
// Every event, patched or not, is passed on for normal processing.
auto observerFunc = [&](TAutoPtr<IEventHandle>& ev) {
1207+
switch (ev->GetTypeRewrite()) {
1208+
case NConsole::TEvConsole::EvListTenantsResponse: {
1209+
auto *x = reinterpret_cast<NConsole::TEvConsole::TEvListTenantsResponse::TPtr*>(&ev);
1210+
AddPathsToListTenantsResponse(x, { "/Root/serverless", "/Root/shared" });
1211+
break;
1212+
}
1213+
case NConsole::TEvConsole::EvGetTenantStatusResponse: {
1214+
auto *x = reinterpret_cast<NConsole::TEvConsole::TEvGetTenantStatusResponse::TPtr*>(&ev);
1215+
if (!firstConsoleResponse) {
1216+
ChangeGetTenantStatusResponse(x, "/Root/serverless");
1217+
} else {
1218+
firstConsoleResponse = false;
1219+
ChangeGetTenantStatusResponse(x, "/Root/shared");
1220+
}
1221+
break;
1222+
}
1223+
case TEvTxProxySchemeCache::EvNavigateKeySetResult: {
1224+
auto *x = reinterpret_cast<TEvTxProxySchemeCache::TEvNavigateKeySetResult::TPtr*>(&ev);
1225+
ChangeNavigateKeyResultServerless(x, NKikimrSubDomains::EServerlessComputeResourcesModeShared, runtime);
1226+
break;
1227+
}
1228+
case TEvHive::EvResponseHiveNodeStats: {
1229+
auto *x = reinterpret_cast<TEvHive::TEvResponseHiveNodeStats::TPtr*>(&ev);
1230+
ChangeResponseHiveNodeStats(x, sharedDynNodeId);
1231+
break;
1232+
}
1233+
case TEvHive::EvResponseHiveInfo: {
1234+
auto *x = reinterpret_cast<TEvHive::TEvResponseHiveInfo::TPtr*>(&ev);
1235+
// Inject the tablet with 500 restarts per period that the final
// assertion expects to surface as a TABLET issue.
AddBadServerlessTablet(x);
1236+
break;
1237+
}
1238+
case TEvSchemeShard::EvDescribeSchemeResult: {
1239+
auto *x = reinterpret_cast<NSchemeShard::TEvSchemeShard::TEvDescribeSchemeResult::TPtr*>(&ev);
1240+
ChangeDescribeSchemeResultServerless(x);
1241+
break;
1242+
}
1243+
case TEvBlobStorage::EvControllerConfigResponse: {
1244+
auto *x = reinterpret_cast<TEvBlobStorage::TEvControllerConfigResponse::TPtr*>(&ev);
1245+
AddGroupVSlotInControllerConfigResponseWithStaticGroup(x, NKikimrBlobStorage::TGroupStatus::FULL, TVDisks(1));
1246+
break;
1247+
}
1248+
case NSysView::TEvSysView::EvGetVSlotsResponse: {
1249+
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetVSlotsResponse::TPtr*>(&ev);
1250+
AddVSlotsToSysViewResponse(x, 1, TVDisks(1));
1251+
break;
1252+
}
1253+
case NSysView::TEvSysView::EvGetGroupsResponse: {
1254+
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetGroupsResponse::TPtr*>(&ev);
1255+
AddGroupsToSysViewResponse(x);
1256+
break;
1257+
}
1258+
case NSysView::TEvSysView::EvGetStoragePoolsResponse: {
1259+
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetStoragePoolsResponse::TPtr*>(&ev);
1260+
AddStoragePoolsToSysViewResponse(x);
1261+
break;
1262+
}
1263+
}
1264+
1265+
return TTestActorRuntime::EEventAction::PROCESS;
1266+
};
1267+
runtime.SetObserverFunc(observerFunc);
1268+
1269+
TActorId sender = runtime.AllocateEdgeActor();
1270+
TAutoPtr<IEventHandle> handle;
1271+
1272+
// No database is set on the request, so the check is not restricted to a
// single database; verbose status is requested.
auto *request = new NHealthCheck::TEvSelfCheckRequest;
1273+
request->Request.set_return_verbose_status(true);
1274+
runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0));
1275+
const auto result = runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
1276+
Ctest << result.ShortDebugString();
1277+
// The injected tablet must show up as a TABLET issue in the result.
UNIT_ASSERT(HasTabletIssue(result));
1278+
}
1279+
11661280
Y_UNIT_TEST(DontIgnoreServerlessWithExclusiveNodesWhenNotSpecific) {
11671281
TPortManager tp;
11681282
ui16 port = tp.GetPort(2134);
@@ -1858,15 +1972,6 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
18581972
ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN);
18591973
}
18601974

1861-
bool HasDeadTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) {
1862-
for (const auto& issue_log : result.issue_log()) {
1863-
if (issue_log.level() == 4 && issue_log.type() == "TABLET") {
1864-
return true;
1865-
}
1866-
}
1867-
return false;
1868-
}
1869-
18701975
Y_UNIT_TEST(TestTabletIsDead) {
18711976
TPortManager tp;
18721977
ui16 port = tp.GetPort(2134);
@@ -1894,7 +1999,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
18941999
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
18952000
Cerr << result.ShortDebugString();
18962001

1897-
UNIT_ASSERT(HasDeadTabletIssue(result));
2002+
UNIT_ASSERT(HasTabletIssue(result));
18982003
}
18992004

19002005
Y_UNIT_TEST(TestBootingTabletIsNotDead) {
@@ -1925,7 +2030,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
19252030
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
19262031
Cerr << result.ShortDebugString();
19272032

1928-
UNIT_ASSERT(!HasDeadTabletIssue(result));
2033+
UNIT_ASSERT(!HasTabletIssue(result));
19292034
}
19302035

19312036
Y_UNIT_TEST(TestReBootingTabletIsDead) {
@@ -1959,7 +2064,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
19592064
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
19602065
Cerr << result.ShortDebugString();
19612066

1962-
UNIT_ASSERT(HasDeadTabletIssue(result));
2067+
UNIT_ASSERT(HasTabletIssue(result));
19632068
}
19642069
}
19652070
}

0 commit comments

Comments
 (0)