Skip to content

Commit cb6adbd

Browse files
authored
Do not trigger the dead-tablet issue while a large number of tablets are being created (#10235)
1 parent b1f1273 commit cb6adbd

File tree

7 files changed

+141
-16
lines changed

7 files changed

+141
-16
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -182,21 +182,34 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
182182
int Count = 1;
183183
TStackVec<TString> Identifiers;
184184

185-
TNodeTabletStateCount(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings) {
186-
Type = info.tablettype();
187-
Leader = info.followerid() == 0;
185+
static ETabletState GetState(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings) {
188186
if (info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_STOPPED) {
189-
State = ETabletState::Stopped;
190-
} else if (info.volatilestate() != NKikimrHive::TABLET_VOLATILE_STATE_RUNNING
191-
&& info.has_lastalivetimestamp()
192-
&& (info.lastalivetimestamp() != 0 && TInstant::MilliSeconds(info.lastalivetimestamp()) < settings.AliveBarrier)
193-
&& info.tabletbootmode() == NKikimrHive::TABLET_BOOT_MODE_DEFAULT) {
194-
State = ETabletState::Dead;
195-
} else if (info.restartsperperiod() >= settings.MaxRestartsPerPeriod) {
196-
State = ETabletState::RestartsTooOften;
197-
} else {
198-
State = ETabletState::Good;
187+
return ETabletState::Stopped;
188+
}
189+
ETabletState state = (info.restartsperperiod() >= settings.MaxRestartsPerPeriod) ? ETabletState::RestartsTooOften : ETabletState::Good;
190+
if (info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_RUNNING) {
191+
return state;
192+
}
193+
if (info.tabletbootmode() != NKikimrHive::TABLET_BOOT_MODE_DEFAULT) {
194+
return state;
195+
}
196+
if (info.lastalivetimestamp() != 0 && TInstant::MilliSeconds(info.lastalivetimestamp()) < settings.AliveBarrier) {
197+
// Tablet has not been alive for a long time
198+
// We should report it as dead unless it's just waiting to be created
199+
if (info.generation() == 0 && info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_BOOTING && !info.inwaitqueue()) {
200+
return state;
201+
}
202+
return ETabletState::Dead;
199203
}
204+
return state;
205+
206+
}
207+
208+
TNodeTabletStateCount(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings)
209+
: Type(info.tablettype())
210+
, State(GetState(info, settings))
211+
, Leader(info.followerid() == 0)
212+
{
200213
}
201214

202215
bool operator ==(const TNodeTabletStateCount& o) const {

ydb/core/health_check/health_check_ut.cpp

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <ydb/core/testlib/test_client.h>
44
#include <ydb/public/lib/deprecated/kicli/kicli.h>
55

6+
#include <ydb/core/mind/hive/hive_events.h>
67
#include <ydb/core/node_whiteboard/node_whiteboard.h>
78
#include <ydb/core/blobstorage/base/blobstorage_events.h>
89
#include <ydb/core/tx/schemeshard/schemeshard.h>
@@ -1837,5 +1838,109 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
18371838
Y_UNIT_TEST(ShardsNoLimit) {
18381839
ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN);
18391840
}
1841+
1842+
bool HasDeadTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) {
1843+
for (const auto& issue_log : result.issue_log()) {
1844+
if (issue_log.level() == 4 && issue_log.type() == "TABLET") {
1845+
return true;
1846+
}
1847+
}
1848+
return false;
1849+
}
1850+
1851+
Y_UNIT_TEST(TestTabletIsDead) {
1852+
TPortManager tp;
1853+
ui16 port = tp.GetPort(2134);
1854+
ui16 grpcPort = tp.GetPort(2135);
1855+
auto settings = TServerSettings(port)
1856+
.SetNodeCount(2)
1857+
.SetDynamicNodeCount(1)
1858+
.SetUseRealThreads(false)
1859+
.SetDomainName("Root");
1860+
TServer server(settings);
1861+
server.EnableGRpc(grpcPort);
1862+
1863+
TClient client(settings);
1864+
1865+
TTestActorRuntime* runtime = server.GetRuntime();
1866+
TActorId sender = runtime->AllocateEdgeActor();
1867+
1868+
server.SetupDynamicLocalService(2, "Root");
1869+
server.StartPQTablets(1);
1870+
server.DestroyDynamicLocalService(2);
1871+
runtime->AdvanceCurrentTime(TDuration::Minutes(5));
1872+
1873+
TAutoPtr<IEventHandle> handle;
1874+
runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
1875+
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
1876+
Cerr << result.ShortDebugString();
1877+
1878+
UNIT_ASSERT(HasDeadTabletIssue(result));
1879+
}
1880+
1881+
Y_UNIT_TEST(TestBootingTabletIsNotDead) {
1882+
TPortManager tp;
1883+
ui16 port = tp.GetPort(2134);
1884+
ui16 grpcPort = tp.GetPort(2135);
1885+
auto settings = TServerSettings(port)
1886+
.SetNodeCount(2)
1887+
.SetDynamicNodeCount(1)
1888+
.SetUseRealThreads(false)
1889+
.SetDomainName("Root");
1890+
TServer server(settings);
1891+
server.EnableGRpc(grpcPort);
1892+
1893+
TClient client(settings);
1894+
1895+
TTestActorRuntime* runtime = server.GetRuntime();
1896+
TActorId sender = runtime->AllocateEdgeActor();
1897+
1898+
auto blockBoot = runtime->AddObserver<NHive::TEvPrivate::TEvProcessBootQueue>([](auto&& ev) { ev.Reset(); });
1899+
1900+
server.SetupDynamicLocalService(2, "Root");
1901+
server.StartPQTablets(1, false);
1902+
runtime->AdvanceCurrentTime(TDuration::Minutes(5));
1903+
1904+
TAutoPtr<IEventHandle> handle;
1905+
runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
1906+
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
1907+
Cerr << result.ShortDebugString();
1908+
1909+
UNIT_ASSERT(!HasDeadTabletIssue(result));
1910+
}
1911+
1912+
Y_UNIT_TEST(TestReBootingTabletIsDead) {
1913+
TPortManager tp;
1914+
ui16 port = tp.GetPort(2134);
1915+
ui16 grpcPort = tp.GetPort(2135);
1916+
auto settings = TServerSettings(port)
1917+
.SetNodeCount(2)
1918+
.SetDynamicNodeCount(2)
1919+
.SetUseRealThreads(false)
1920+
.SetDomainName("Root");
1921+
TServer server(settings);
1922+
server.EnableGRpc(grpcPort);
1923+
1924+
TClient client(settings);
1925+
1926+
TTestActorRuntime* runtime = server.GetRuntime();
1927+
runtime->SetLogPriority(NKikimrServices::HIVE, NActors::NLog::PRI_TRACE);
1928+
TActorId sender = runtime->AllocateEdgeActor();
1929+
1930+
1931+
server.SetupDynamicLocalService(2, "Root");
1932+
server.StartPQTablets(1, true);
1933+
server.SetupDynamicLocalService(3, "Root");
1934+
auto blockBoot = runtime->AddObserver<NHive::TEvPrivate::TEvProcessBootQueue>([](auto&& ev) { ev.Reset(); });
1935+
server.DestroyDynamicLocalService(2);
1936+
runtime->AdvanceCurrentTime(TDuration::Minutes(5));
1937+
1938+
TAutoPtr<IEventHandle> handle;
1939+
runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
1940+
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
1941+
Cerr << result.ShortDebugString();
1942+
1943+
UNIT_ASSERT(HasDeadTabletIssue(result));
1944+
}
18401945
}
18411946
}

ydb/core/mind/hive/hive_impl.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb&, TSideEffects& sideEffects)
237237
if (tablet == nullptr) {
238238
continue;
239239
}
240+
tablet->InWaitQueue = false;
240241
if (tablet->IsAlive()) {
241242
BLOG_D("tablet " << record.TabletId << " already alive, skipping");
242243
continue;
@@ -258,6 +259,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb&, TSideEffects& sideEffects)
258259
}
259260
tablet->ActorsToNotifyOnRestart.clear();
260261
BootQueue.AddToWaitQueue(record); // waiting for new node
262+
tablet->InWaitQueue = true;
261263
continue;
262264
}
263265
}
@@ -1878,6 +1880,9 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl
18781880
if (req.GetReturnMetrics()) {
18791881
tabletInfo.MutableMetrics()->CopyFrom(info->GetResourceValues());
18801882
}
1883+
if (info->InWaitQueue) {
1884+
tabletInfo.SetInWaitQueue(true);
1885+
}
18811886
if (req.GetReturnChannelHistory()) {
18821887
for (const auto& channel : info->TabletStorageInfo->Channels) {
18831888
auto& tabletChannel = *tabletInfo.AddTabletChannels();

ydb/core/mind/hive/tablet_info.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ struct TTabletInfo {
164164
TNodeId FailedNodeId = 0; // last time we tried to start the tablet, we failed on this node
165165
TInstant BootTime;
166166
TNodeFilter NodeFilter;
167+
bool InWaitQueue = false;
167168

168169
TTabletInfo(ETabletRole role, THive& hive);
169170
TTabletInfo(const TTabletInfo&) = delete;

ydb/core/protos/hive.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,7 @@ message TTabletInfo {
495495
optional uint32 RestartsPerPeriod = 22;
496496
optional uint64 LastAliveTimestamp = 23;
497497
optional EBalancerPolicy BalancerPolicy = 24;
498+
optional bool InWaitQueue = 25;
498499
}
499500

500501
message TEvSeizeTabletsReply {

ydb/core/testlib/test_client.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,7 @@ namespace Tests {
521521
app.AddDomain(domain.Release());
522522
}
523523

524-
TVector<ui64> TServer::StartPQTablets(ui32 pqTabletsN) {
524+
TVector<ui64> TServer::StartPQTablets(ui32 pqTabletsN, bool wait) {
525525
auto getChannelBind = [](const TString& storagePool) {
526526
TChannelBind bind;
527527
bind.SetStoragePoolName(storagePool);
@@ -556,7 +556,7 @@ namespace Tests {
556556
UNIT_ASSERT_EQUAL_C(createTabletReply->Record.GetOwner(), tabletId,
557557
createTabletReply->Record.GetOwner() << " != " << tabletId);
558558
ui64 id = createTabletReply->Record.GetTabletID();
559-
while (true) {
559+
while (wait) {
560560
auto tabletCreationResult =
561561
Runtime->GrabEdgeEventRethrow<TEvHive::TEvTabletCreationResult>(handle);
562562
UNIT_ASSERT(tabletCreationResult);

ydb/core/testlib/test_client.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ namespace Tests {
314314
}
315315
}
316316
void StartDummyTablets();
317-
TVector<ui64> StartPQTablets(ui32 pqTabletsN);
317+
TVector<ui64> StartPQTablets(ui32 pqTabletsN, bool wait = true);
318318
TTestActorRuntime* GetRuntime() const;
319319
const TServerSettings& GetSettings() const;
320320
const NScheme::TTypeRegistry* GetTypeRegistry();

0 commit comments

Comments
 (0)