Skip to content

Commit b328a5f

Browse files
authored
observability for tablet starts (#6584) (#11266)
1 parent 7dd309e commit b328a5f

File tree

6 files changed

+42
-1
lines changed

6 files changed

+42
-1
lines changed

ydb/core/mind/hive/hive_impl.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1705,6 +1705,14 @@ void THive::UpdateCounterPingQueueSize() {
17051705
}
17061706
}
17071707

1708+
// Applies a signed delta to the Hive simple counter COUNTER_TABLETS_STARTING.
// Does nothing when TabletCounters is null.
// NOTE(review): the result is not clamped here - presumably every +1 is paired
// with exactly one -1 by the callers; confirm against all call sites.
void THive::UpdateCounterTabletsStarting(i64 tabletsStartingDiff) {
1709+
if (TabletCounters != nullptr) {
1710+
auto& counter = TabletCounters->Simple()[NHive::COUNTER_TABLETS_STARTING];
1711+
// read-modify-write: current value plus the (possibly negative) delta
auto newValue = counter.Get() + tabletsStartingDiff;
1712+
counter.Set(newValue);
1713+
}
1714+
}
1715+
17081716
void THive::RecordTabletMove(const TTabletMoveInfo& moveInfo) {
17091717
TabletMoveHistory.PushBack(moveInfo);
17101718
TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1);

ydb/core/mind/hive/hive_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,7 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
653653
void UpdateCounterEventQueueSize(i64 eventQueueSizeDiff);
654654
void UpdateCounterNodesConnected(i64 nodesConnectedDiff);
655655
void UpdateCounterPingQueueSize();
656+
void UpdateCounterTabletsStarting(i64 tabletsStartingDiff);
656657
void RecordTabletMove(const TTabletMoveInfo& info);
657658
bool DomainHasNodes(const TSubDomainKey &domainKey) const;
658659
void ProcessBootQueue();

ydb/core/mind/hive/tablet_info.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ struct TTabletInfo {
163163
EBalancerPolicy BalancerPolicy;
164164
TNodeId FailedNodeId = 0; // last time we tried to start the tablet, we failed on this node
165165
bool InWaitQueue = false;
166+
TInstant BootTime;
166167

167168
TTabletInfo(ETabletRole role, THive& hive);
168169
TTabletInfo(const TTabletInfo&) = delete;

ydb/core/mind/hive/tx__start_tablet.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ class TTxStartTablet : public TTransactionBase<THive> {
1010
ui64 Cookie;
1111
bool External;
1212
TSideEffects SideEffects;
13+
bool Success;
1314

1415
public:
1516
TTxStartTablet(TFullTabletId tabletId, const TActorId& local, ui64 cookie, bool external, THive *hive)
@@ -23,10 +24,12 @@ class TTxStartTablet : public TTransactionBase<THive> {
2324
TTxType GetTxType() const override { return NHive::TXTYPE_START_TABLET; }
2425

2526
bool Execute(TTransactionContext& txc, const TActorContext&) override {
27+
Success = false;
2628
SideEffects.Reset(Self->SelfId());
2729
BLOG_D("THive::TTxStartTablet::Execute Tablet " << TabletId);
2830
TTabletInfo* tablet = Self->FindTablet(TabletId);
2931
if (tablet != nullptr) {
32+
tablet->BootTime = TActivationContext::Now();
3033
// finish fast-move operation
3134
if (tablet->LastNodeId != 0 && tablet->LastNodeId != Local.NodeId()) {
3235
TNodeInfo* lastNode = Self->FindNode(tablet->LastNodeId);
@@ -65,6 +68,7 @@ class TTxStartTablet : public TTransactionBase<THive> {
6568
new TEvLocal::TEvBootTablet(*leader.TabletStorageInfo, promotableFollowerId, leader.KnownGeneration),
6669
IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession,
6770
Cookie);
71+
Success = true;
6872
return true;
6973
} else {
7074
BLOG_W("THive::TTxStartTablet::Execute, ignoring TEvBootTablet(" << leader.ToString() << ") - wrong state or node");
@@ -79,6 +83,7 @@ class TTxStartTablet : public TTransactionBase<THive> {
7983
new TEvLocal::TEvBootTablet(*follower.LeaderTablet.TabletStorageInfo, follower.Id),
8084
IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession,
8185
Cookie);
86+
Success = true;
8287
return true;
8388
} else {
8489
BLOG_W("THive::TTxStartTablet::Execute, ignoring TEvBootTablet(" << follower.ToString() << ") - wrong state or node");
@@ -108,6 +113,9 @@ class TTxStartTablet : public TTransactionBase<THive> {
108113
// Completion step of the start-tablet transaction: logs the tablet id and the
// accumulated side effects, runs those side effects, and then counts the tablet
// as "starting" if this transaction flagged Success.
void Complete(const TActorContext& ctx) override {
109114
BLOG_D("THive::TTxStartTablet::Complete Tablet " << TabletId << " SideEffects: " << SideEffects);
110115
SideEffects.Complete(ctx);
116+
// Success is assigned in Execute(); only a successful start bumps the counter.
// NOTE(review): the matching -1 appears to happen when the tablet later reports
// its status - confirm that failed starts are also accounted for.
if (Success) {
117+
Self->UpdateCounterTabletsStarting(+1);
118+
}
111119
}
112120
};
113121

ydb/core/mind/hive/tx__update_tablet_status.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,14 @@ class TTxUpdateTabletStatus : public TTransactionBase<THive> {
8080
if (Status == TEvLocal::TEvTabletStatus::StatusOk) {
8181
tablet->Statistics.AddRestartTimestamp(now.MilliSeconds());
8282
tablet->ActualizeTabletStatistics(now);
83+
if (tablet->BootTime != TInstant()) {
84+
TDuration startTime = now - tablet->BootTime;
85+
if (startTime > TDuration::Seconds(30)) {
86+
BLOG_W("Tablet " << tablet->GetFullTabletId() << " was starting for " << startTime.Seconds() << " seconds");
87+
}
88+
Self->TabletCounters->Percentile()[NHive::COUNTER_TABLETS_START_TIME].IncrementFor(startTime.MilliSeconds());
89+
Self->UpdateCounterTabletsStarting(-1);
90+
}
8391
TNodeInfo* node = Self->FindNode(Local.NodeId());
8492
if (node == nullptr) {
8593
// event from IC about disconnection of the node could overtake events from the node itself because of Pipe Server

ydb/core/protos/counters_hive.proto

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ enum ESimpleCounters {
2929
COUNTER_IMBALANCED_OBJECTS = 19 [(CounterOpts) = {Name: "ImbalancedObjects"}];
3030
COUNTER_WORST_OBJECT_VARIANCE = 20 [(CounterOpts) = {Name: "WorstObjectVariance"}];
3131
COUNTER_STORAGE_SCATTER = 21 [(CounterOpts) = {Name: "StorageScatter"}];
32-
RESERVED22 = 22;
32+
COUNTER_TABLETS_STARTING = 22 [(CounterOpts) = {Name: "TabletsStarting"}];
3333
COUNTER_PINGQUEUE_SIZE = 23 [(CounterOpts) = {Name: "PingQueueSize"}];
3434
}
3535

@@ -77,6 +77,21 @@ enum EPercentileCounters {
7777
Ranges: { Value: 95 Name: "95%" },
7878
Ranges: { Value: 100 Name: "100%" },
7979
}];
80+
81+
COUNTER_TABLETS_START_TIME = 2 [(CounterOpts) = {
82+
Name: "TabletsStartTimeMs",
83+
Ranges: { Value: 1 }
84+
Ranges: { Value: 5 }
85+
Ranges: { Value: 10 }
86+
Ranges: { Value: 50 }
87+
Ranges: { Value: 100 }
88+
Ranges: { Value: 500 }
89+
Ranges: { Value: 1000 }
90+
Ranges: { Value: 5000 }
91+
Ranges: { Value: 10000 }
92+
Ranges: { Value: 30000 }
93+
Ranges: { Value: 60000 }
94+
}];
8095
}
8196

8297
enum ETxTypes {

0 commit comments

Comments
 (0)