Skip to content

Commit 6ac053e

Browse files
authored
25-1: add tablet issues to shared db health check (#15609) (#18488)
2 parents 9a1ddb7 + 88c2372 commit 6ac053e

File tree

2 files changed

+139
-23
lines changed

2 files changed

+139
-23
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 22 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1417,10 +1417,13 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
14171417
FilterDomainKey[TSubDomainKey(domainInfo->DomainKey.OwnerId, domainInfo->DomainKey.LocalPathId)] = path;
14181418

14191419
TTabletId hiveId = domainInfo->Params.GetHive();
1420-
if (hiveId && NeedToAskHive(hiveId)) {
1420+
if (hiveId) {
14211421
DatabaseState[path].HiveId = hiveId;
1422-
AskHive(path, hiveId);
1422+
if (NeedToAskHive(hiveId)) {
1423+
AskHive(path, hiveId);
1424+
}
14231425
} else if (RootHiveId && NeedToAskHive(RootHiveId)) {
1426+
DatabaseState[DomainPath].HiveId = RootHiveId;
14241427
AskHive(DomainPath, RootHiveId);
14251428
}
14261429

@@ -1515,23 +1518,31 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15151518

15161519
void AggregateHiveInfo() {
15171520
TNodeTabletState::TTabletStateSettings settings;
1518-
for (const auto& [hiveId, hiveResponse] : HiveInfo) {
1521+
for (auto& [dbPath, dbState] : DatabaseState) {
1522+
const auto& hiveResponse = HiveInfo[dbState.HiveId];
15191523
if (hiveResponse.IsOk()) {
15201524
settings.AliveBarrier = TInstant::MilliSeconds(hiveResponse->Record.GetResponseTimestamp()) - TDuration::Minutes(5);
15211525
settings.MaxRestartsPerPeriod = HealthCheckConfig.GetThresholds().GetTabletsRestartsOrange();
15221526
for (const NKikimrHive::TTabletInfo& hiveTablet : hiveResponse->Record.GetTablets()) {
15231527
TSubDomainKey tenantId = TSubDomainKey(hiveTablet.GetObjectDomain());
15241528
auto itDomain = FilterDomainKey.find(tenantId);
1529+
TDatabaseState* database = nullptr;
15251530
if (itDomain == FilterDomainKey.end()) {
1526-
continue;
1527-
}
1528-
auto itDatabase = DatabaseState.find(itDomain->second);
1529-
if (itDatabase == DatabaseState.end()) {
1530-
continue;
1531+
if (!FilterDatabase || FilterDatabase == dbPath) {
1532+
database = &dbState;
1533+
} else {
1534+
continue;
1535+
}
1536+
} else {
1537+
auto itDatabase = DatabaseState.find(itDomain->second);
1538+
if (itDatabase != DatabaseState.end()) {
1539+
database = &itDatabase->second;
1540+
} else {
1541+
continue;
1542+
}
15311543
}
1532-
TDatabaseState& database = itDatabase->second;
15331544
auto tabletId = std::make_pair(hiveTablet.GetTabletID(), hiveTablet.GetFollowerID());
1534-
database.MergedTabletState.emplace(tabletId, &hiveTablet);
1545+
database->MergedTabletState.emplace(tabletId, &hiveTablet);
15351546
TNodeId nodeId = hiveTablet.GetNodeID();
15361547
switch (hiveTablet.GetVolatileState()) {
15371548
case NKikimrHive::ETabletVolatileState::TABLET_VOLATILE_STATE_STARTING:
@@ -1541,7 +1552,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15411552
nodeId = 0;
15421553
break;
15431554
}
1544-
database.MergedNodeTabletState[nodeId].AddTablet(hiveTablet, settings);
1555+
database->MergedNodeTabletState[nodeId].AddTablet(hiveTablet, settings);
15451556
}
15461557
}
15471558
}

ydb/core/health_check/health_check_ut.cpp

Lines changed: 117 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -446,6 +446,15 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
446446
UNIT_ASSERT_VALUES_EQUAL(issueVdiscCount, issueVdiscNumber);
447447
}
448448

449+
bool HasTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) {
450+
for (const auto& issue_log : result.issue_log()) {
451+
if (issue_log.level() == 4 && issue_log.type() == "TABLET") {
452+
return true;
453+
}
454+
}
455+
return false;
456+
}
457+
449458
void ListingTest(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false) {
450459
auto result = RequestHc(groupNumber, vdiscPerGroupNumber, isMergeRecords);
451460
CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords);
@@ -865,6 +874,15 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
865874
}
866875
}
867876

877+
void AddBadServerlessTablet(TEvHive::TEvResponseHiveInfo::TPtr* ev) {
878+
auto &record = (*ev)->Get()->Record;
879+
auto* tablet = record.MutableTablets()->Add();
880+
tablet->SetTabletID(1);
881+
tablet->MutableObjectDomain()->SetSchemeShard(SERVERLESS_DOMAIN_KEY.OwnerId);
882+
tablet->MutableObjectDomain()->SetPathId(SERVERLESS_DOMAIN_KEY.LocalPathId);
883+
tablet->SetRestartsPerPeriod(500);
884+
}
885+
868886
Y_UNIT_TEST(SpecificServerless) {
869887
TPortManager tp;
870888
ui16 port = tp.GetPort(2134);
@@ -1164,6 +1182,102 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
11641182
UNIT_ASSERT(!databaseFoundInResult);
11651183
}
11661184

1185+
Y_UNIT_TEST(ServerlessBadTablets) {
1186+
TPortManager tp;
1187+
ui16 port = tp.GetPort(2134);
1188+
ui16 grpcPort = tp.GetPort(2135);
1189+
auto settings = TServerSettings(port)
1190+
.SetNodeCount(1)
1191+
.SetDynamicNodeCount(1)
1192+
.SetUseRealThreads(false)
1193+
.SetDomainName("Root");
1194+
TServer server(settings);
1195+
server.EnableGRpc(grpcPort);
1196+
TClient client(settings);
1197+
TTestActorRuntime& runtime = *server.GetRuntime();
1198+
1199+
auto &dynamicNameserviceConfig = runtime.GetAppData().DynamicNameserviceConfig;
1200+
dynamicNameserviceConfig->MaxStaticNodeId = runtime.GetNodeId(server.StaticNodes() - 1);
1201+
dynamicNameserviceConfig->MinDynamicNodeId = runtime.GetNodeId(server.StaticNodes());
1202+
dynamicNameserviceConfig->MaxDynamicNodeId = runtime.GetNodeId(server.StaticNodes() + server.DynamicNodes() - 1);
1203+
1204+
ui32 sharedDynNodeId = runtime.GetNodeId(1);
1205+
1206+
bool firstConsoleResponse = true;
1207+
auto observerFunc = [&](TAutoPtr<IEventHandle>& ev) {
1208+
switch (ev->GetTypeRewrite()) {
1209+
case NConsole::TEvConsole::EvListTenantsResponse: {
1210+
auto *x = reinterpret_cast<NConsole::TEvConsole::TEvListTenantsResponse::TPtr*>(&ev);
1211+
AddPathsToListTenantsResponse(x, { "/Root/serverless", "/Root/shared" });
1212+
break;
1213+
}
1214+
case NConsole::TEvConsole::EvGetTenantStatusResponse: {
1215+
auto *x = reinterpret_cast<NConsole::TEvConsole::TEvGetTenantStatusResponse::TPtr*>(&ev);
1216+
if (!firstConsoleResponse) {
1217+
ChangeGetTenantStatusResponse(x, "/Root/serverless");
1218+
} else {
1219+
firstConsoleResponse = false;
1220+
ChangeGetTenantStatusResponse(x, "/Root/shared");
1221+
}
1222+
break;
1223+
}
1224+
case TEvTxProxySchemeCache::EvNavigateKeySetResult: {
1225+
auto *x = reinterpret_cast<TEvTxProxySchemeCache::TEvNavigateKeySetResult::TPtr*>(&ev);
1226+
ChangeNavigateKeyResultServerless(x, NKikimrSubDomains::EServerlessComputeResourcesModeShared, runtime);
1227+
break;
1228+
}
1229+
case TEvHive::EvResponseHiveNodeStats: {
1230+
auto *x = reinterpret_cast<TEvHive::TEvResponseHiveNodeStats::TPtr*>(&ev);
1231+
ChangeResponseHiveNodeStats(x, sharedDynNodeId);
1232+
break;
1233+
}
1234+
case TEvHive::EvResponseHiveInfo: {
1235+
auto *x = reinterpret_cast<TEvHive::TEvResponseHiveInfo::TPtr*>(&ev);
1236+
AddBadServerlessTablet(x);
1237+
break;
1238+
}
1239+
case TEvSchemeShard::EvDescribeSchemeResult: {
1240+
auto *x = reinterpret_cast<NSchemeShard::TEvSchemeShard::TEvDescribeSchemeResult::TPtr*>(&ev);
1241+
ChangeDescribeSchemeResultServerless(x);
1242+
break;
1243+
}
1244+
case TEvBlobStorage::EvControllerConfigResponse: {
1245+
auto *x = reinterpret_cast<TEvBlobStorage::TEvControllerConfigResponse::TPtr*>(&ev);
1246+
AddGroupVSlotInControllerConfigResponseWithStaticGroup(x, NKikimrBlobStorage::TGroupStatus::FULL, TVDisks(1));
1247+
break;
1248+
}
1249+
case NSysView::TEvSysView::EvGetVSlotsResponse: {
1250+
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetVSlotsResponse::TPtr*>(&ev);
1251+
AddVSlotsToSysViewResponse(x, 1, TVDisks(1));
1252+
break;
1253+
}
1254+
case NSysView::TEvSysView::EvGetGroupsResponse: {
1255+
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetGroupsResponse::TPtr*>(&ev);
1256+
AddGroupsToSysViewResponse(x);
1257+
break;
1258+
}
1259+
case NSysView::TEvSysView::EvGetStoragePoolsResponse: {
1260+
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetStoragePoolsResponse::TPtr*>(&ev);
1261+
AddStoragePoolsToSysViewResponse(x);
1262+
break;
1263+
}
1264+
}
1265+
1266+
return TTestActorRuntime::EEventAction::PROCESS;
1267+
};
1268+
runtime.SetObserverFunc(observerFunc);
1269+
1270+
TActorId sender = runtime.AllocateEdgeActor();
1271+
TAutoPtr<IEventHandle> handle;
1272+
1273+
auto *request = new NHealthCheck::TEvSelfCheckRequest;
1274+
request->Request.set_return_verbose_status(true);
1275+
runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0));
1276+
const auto result = runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
1277+
Ctest << result.ShortDebugString();
1278+
UNIT_ASSERT(HasTabletIssue(result));
1279+
}
1280+
11671281
Y_UNIT_TEST(DontIgnoreServerlessWithExclusiveNodesWhenNotSpecific) {
11681282
TPortManager tp;
11691283
ui16 port = tp.GetPort(2134);
@@ -1859,15 +1973,6 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
18591973
ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN);
18601974
}
18611975

1862-
bool HasDeadTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) {
1863-
for (const auto& issue_log : result.issue_log()) {
1864-
if (issue_log.level() == 4 && issue_log.type() == "TABLET") {
1865-
return true;
1866-
}
1867-
}
1868-
return false;
1869-
}
1870-
18711976
Y_UNIT_TEST(TestTabletIsDead) {
18721977
TPortManager tp;
18731978
ui16 port = tp.GetPort(2134);
@@ -1895,7 +2000,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
18952000
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
18962001
Cerr << result.ShortDebugString();
18972002

1898-
UNIT_ASSERT(HasDeadTabletIssue(result));
2003+
UNIT_ASSERT(HasTabletIssue(result));
18992004
}
19002005

19012006
Y_UNIT_TEST(TestBootingTabletIsNotDead) {
@@ -1926,7 +2031,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
19262031
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
19272032
Cerr << result.ShortDebugString();
19282033

1929-
UNIT_ASSERT(!HasDeadTabletIssue(result));
2034+
UNIT_ASSERT(!HasTabletIssue(result));
19302035
}
19312036

19322037
Y_UNIT_TEST(TestReBootingTabletIsDead) {
@@ -1960,7 +2065,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
19602065
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
19612066
Cerr << result.ShortDebugString();
19622067

1963-
UNIT_ASSERT(HasDeadTabletIssue(result));
2068+
UNIT_ASSERT(HasTabletIssue(result));
19642069
}
19652070

19662071
void SendHealthCheckConfigUpdate(TTestActorRuntime &runtime, const TActorId& sender, const NKikimrConfig::THealthCheckConfig &cfg) {

0 commit comments

Comments
 (0)