@@ -54,13 +54,19 @@ struct TEvPrivate {
54
54
EvCoordinatorPing = EvBegin + 20 ,
55
55
EvUpdateMetrics,
56
56
EvPrintStateToLog,
57
+ EvTryConnect,
57
58
EvEnd
58
59
};
59
60
60
61
static_assert (EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE), " expect EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE)" );
61
62
struct TEvCoordinatorPing : NActors::TEventLocal<TEvCoordinatorPing, EvCoordinatorPing> {};
62
63
struct TEvUpdateMetrics : public NActors ::TEventLocal<TEvUpdateMetrics, EvUpdateMetrics> {};
63
64
struct TEvPrintStateToLog : public NActors ::TEventLocal<TEvPrintStateToLog, EvPrintStateToLog> {};
65
+ struct TEvTryConnect : public NActors ::TEventLocal<TEvTryConnect, EvTryConnect> {
66
+ TEvTryConnect (ui32 nodeId = 0 )
67
+ : NodeId(nodeId) {}
68
+ ui32 NodeId = 0 ;
69
+ };
64
70
};
65
71
66
72
struct TQueryStat {
@@ -119,6 +125,92 @@ class TRowDispatcher : public TActorBootstrapped<TRowDispatcher> {
119
125
}
120
126
};
121
127
128
+ struct TNodesTracker {
129
+ class TRetryState {
130
+ public:
131
+ TDuration GetNextDelay () {
132
+ constexpr TDuration MaxDelay = TDuration::Seconds (10 );
133
+ constexpr TDuration MinDelay = TDuration::MilliSeconds (100 ); // from second retry
134
+ TDuration ret = Delay; // The first delay is zero
135
+ Delay = ClampVal (Delay * 2 , MinDelay, MaxDelay);
136
+ return ret ? RandomizeDelay (ret) : ret;
137
+ }
138
+ private:
139
+ static TDuration RandomizeDelay (TDuration baseDelay) {
140
+ const TDuration::TValue half = baseDelay.GetValue () / 2 ;
141
+ return TDuration::FromValue (half + RandomNumber<TDuration::TValue>(half));
142
+ }
143
+ private:
144
+ TDuration Delay; // The first time retry will be done instantly.
145
+ };
146
+
147
+ struct TNodeState {
148
+ bool Connected = false ;
149
+ bool RetryScheduled = false ;
150
+ TMaybe<TRetryState> RetryState;
151
+ };
152
+ public:
153
+ void Init (const NActors::TActorId& selfId) {
154
+ SelfId = selfId;
155
+ }
156
+
157
+ void AddNode (ui32 nodeId) {
158
+ if (Nodes.contains (nodeId)) {
159
+ return ;
160
+ }
161
+ HandleNodeDisconnected (nodeId);
162
+ }
163
+
164
+ void TryConnect (ui32 nodeId) {
165
+ auto & state = Nodes[nodeId];
166
+ state.RetryScheduled = false ;
167
+ if (state.Connected ) {
168
+ return ;
169
+ }
170
+ auto connectEvent = MakeHolder<NActors::TEvInterconnect::TEvConnectNode>();
171
+ auto proxyId = NActors::TActivationContext::InterconnectProxy (nodeId);
172
+ NActors::TActivationContext::Send (
173
+ new NActors::IEventHandle (proxyId, SelfId, connectEvent.Release (), 0 , 0 ));
174
+ }
175
+
176
+ bool GetNodeConnected (ui32 nodeId) {
177
+ return Nodes[nodeId].Connected ;
178
+ }
179
+
180
+ void HandleNodeConnected (ui32 nodeId) {
181
+ auto & state = Nodes[nodeId];
182
+ state.Connected = true ;
183
+ state.RetryState = Nothing ();
184
+ }
185
+
186
+ void HandleNodeDisconnected (ui32 nodeId) {
187
+ auto & state = Nodes[nodeId];
188
+ state.Connected = false ;
189
+ if (state.RetryScheduled ) {
190
+ return ;
191
+ }
192
+ state.RetryScheduled = true ;
193
+ if (!state.RetryState ) {
194
+ state.RetryState .ConstructInPlace ();
195
+ }
196
+ auto ev = MakeHolder<TEvPrivate::TEvTryConnect>(nodeId);
197
+ auto delay = state.RetryState ->GetNextDelay ();
198
+ NActors::TActivationContext::Schedule (delay, new NActors::IEventHandle (SelfId, SelfId, ev.Release ()));
199
+ }
200
+
201
+ void PrintInternalState (TStringStream& stream) const {
202
+ stream << " Nodes states: \n " ;
203
+ for (const auto & [nodeId, state] : Nodes) {
204
+ stream << " id " << nodeId << " connected " << state.Connected << " retry scheduled " << state.RetryScheduled << " \n " ;
205
+ }
206
+ }
207
+
208
+ private:
209
+ TMap<ui32, TNodeState> Nodes;
210
+ NActors::TActorId SelfId;
211
+ TString LogPrefix = " RowDispatcher: " ;
212
+ };
213
+
122
214
123
215
NConfig::TRowDispatcherConfig Config;
124
216
NKikimr::TYdbCredentialsProviderFactory CredentialsProviderFactory;
@@ -134,8 +226,7 @@ class TRowDispatcher : public TActorBootstrapped<TRowDispatcher> {
134
226
const ::NMonitoring::TDynamicCounterPtr Counters;
135
227
TRowDispatcherMetrics Metrics;
136
228
NYql::IPqGateway::TPtr PqGateway;
137
- THashSet<TActorId> InterconnectSessions;
138
- TMap<ui32, bool > NodeConnected;
229
+ TNodesTracker NodesTracker;
139
230
140
231
struct ConsumerCounters {
141
232
ui64 NewDataArrived = 0 ;
@@ -222,15 +313,14 @@ class TRowDispatcher : public TActorBootstrapped<TRowDispatcher> {
222
313
void Handle (NFq::TEvRowDispatcher::TEvSessionStatistic::TPtr& ev);
223
314
224
315
void Handle (NFq::TEvRowDispatcher::TEvHeartbeat::TPtr& ev);
225
- void Handle (const NYql::NDq::TEvRetryQueuePrivate::TEvRetry ::TPtr&);
316
+ void Handle (const TEvPrivate::TEvTryConnect ::TPtr&);
226
317
void Handle (const NYql::NDq::TEvRetryQueuePrivate::TEvEvHeartbeat::TPtr&);
227
318
void Handle (const NYql::NDq::TEvRetryQueuePrivate::TEvSessionClosed::TPtr&);
228
319
void Handle (NFq::TEvPrivate::TEvUpdateMetrics::TPtr&);
229
320
void Handle (NFq::TEvPrivate::TEvPrintStateToLog::TPtr&);
230
321
void Handle (const NMon::TEvHttpInfo::TPtr&);
231
322
232
323
void DeleteConsumer (const ConsumerSessionKey& key);
233
- void UpdateInterconnectSessions (const NActors::TActorId& interconnectSession);
234
324
void UpdateMetrics ();
235
325
TString GetInternalState ();
236
326
@@ -250,7 +340,7 @@ class TRowDispatcher : public TActorBootstrapped<TRowDispatcher> {
250
340
hFunc (NFq::TEvRowDispatcher::TEvSessionError, Handle);
251
341
hFunc (NFq::TEvRowDispatcher::TEvStatus, Handle);
252
342
hFunc (NFq::TEvRowDispatcher::TEvSessionStatistic, Handle);
253
- hFunc (NYql::NDq::TEvRetryQueuePrivate::TEvRetry , Handle);
343
+ hFunc (TEvPrivate::TEvTryConnect , Handle);
254
344
hFunc (NYql::NDq::TEvRetryQueuePrivate::TEvEvHeartbeat, Handle);
255
345
hFunc (NYql::NDq::TEvRetryQueuePrivate::TEvSessionClosed, Handle);
256
346
hFunc (NFq::TEvRowDispatcher::TEvHeartbeat, Handle);
@@ -300,6 +390,7 @@ void TRowDispatcher::Bootstrap() {
300
390
mon->RegisterActorPage (actorsMonPage, " row_dispatcher" , " Row Dispatcher" , false ,
301
391
TlsActivationContext->ExecutorThread .ActorSystem , SelfId ());
302
392
}
393
+ NodesTracker.Init (SelfId ());
303
394
}
304
395
305
396
void TRowDispatcher::Handle (NFq::TEvRowDispatcher::TEvCoordinatorChanged::TPtr& ev) {
@@ -317,15 +408,15 @@ void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvCoordinatorChanged::TPtr&
317
408
318
409
// Reacts to a "node connected" interconnect notification: records the new
// status in NodesTracker and forwards it to the events queue of every consumer.
void TRowDispatcher::HandleConnected(TEvInterconnect::TEvNodeConnected::TPtr& ev) {
    const auto connectedNodeId = ev->Get()->NodeId;
    LOG_ROW_DISPATCHER_DEBUG(" EvNodeConnected, node id " << connectedNodeId);

    NodesTracker.HandleNodeConnected(connectedNodeId);

    for (auto& consumerEntry : Consumers) {
        consumerEntry.second->EventsQueue.HandleNodeConnected(connectedNodeId);
    }
}
325
416
326
417
void TRowDispatcher::HandleDisconnected (TEvInterconnect::TEvNodeDisconnected::TPtr& ev) {
327
418
LOG_ROW_DISPATCHER_DEBUG (" TEvNodeDisconnected, node id " << ev->Get ()->NodeId );
328
- NodeConnected[ ev->Get ()->NodeId ] = false ;
419
+ NodesTracker. HandleNodeDisconnected ( ev->Get ()->NodeId ) ;
329
420
for (auto & [actorId, consumer] : Consumers) {
330
421
consumer->EventsQueue .HandleNodeDisconnected (ev->Get ()->NodeId );
331
422
}
@@ -353,7 +444,7 @@ void TRowDispatcher::Handle(NActors::TEvents::TEvPong::TPtr&) {
353
444
354
445
void TRowDispatcher::Handle (NFq::TEvRowDispatcher::TEvCoordinatorChangesSubscribe::TPtr& ev) {
355
446
LOG_ROW_DISPATCHER_DEBUG (" TEvCoordinatorChangesSubscribe from " << ev->Sender );
356
- UpdateInterconnectSessions (ev->InterconnectSession );
447
+ NodesTracker. AddNode (ev->Sender . NodeId () );
357
448
CoordinatorChangedSubscribers.insert (ev->Sender );
358
449
if (!CoordinatorActorId) {
359
450
return ;
@@ -387,6 +478,7 @@ void TRowDispatcher::UpdateMetrics() {
387
478
388
479
TString TRowDispatcher::GetInternalState () {
389
480
TStringStream str;
481
+ NodesTracker.PrintInternalState (str);
390
482
str << " Statistics:\n " ;
391
483
for (auto & [key, sessionsInfo] : TopicSessions) {
392
484
str << " " << key.Endpoint << " / " << key.Database << " / " << key.TopicPath << " / " << key.PartitionId ;
@@ -410,7 +502,7 @@ TString TRowDispatcher::GetInternalState() {
410
502
void TRowDispatcher::Handle (NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev) {
411
503
LOG_ROW_DISPATCHER_DEBUG (" TEvStartSession from " << ev->Sender << " , topicPath " << ev->Get ()->Record .GetSource ().GetTopicPath () <<
412
504
" partitionId " << ev->Get ()->Record .GetPartitionId ());
413
- UpdateInterconnectSessions (ev->InterconnectSession );
505
+ NodesTracker. AddNode (ev->Sender . NodeId () );
414
506
TMaybe<ui64> readOffset;
415
507
if (ev->Get ()->Record .HasOffset ()) {
416
508
readOffset = ev->Get ()->Record .GetOffset ();
@@ -430,7 +522,7 @@ void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev) {
430
522
LOG_ROW_DISPATCHER_DEBUG (" Topic session count " << topicSessionInfo.Sessions .size ());
431
523
Y_ENSURE (topicSessionInfo.Sessions .size () <= 1 );
432
524
433
- auto consumerInfo = MakeAtomicShared<ConsumerInfo>(ev->Sender , SelfId (), NextEventQueueId++, ev->Get ()->Record , TActorId (), NodeConnected[ ev->Sender .NodeId ()] );
525
+ auto consumerInfo = MakeAtomicShared<ConsumerInfo>(ev->Sender , SelfId (), NextEventQueueId++, ev->Get ()->Record , TActorId (), NodesTracker. GetNodeConnected ( ev->Sender .NodeId ()) );
434
526
Consumers[key] = consumerInfo;
435
527
ConsumersByEventQueueId[consumerInfo->EventQueueId ] = consumerInfo;
436
528
if (!consumerInfo->EventsQueue .OnEventReceived (ev)) {
@@ -575,14 +667,9 @@ void TRowDispatcher::Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvSessionClo
575
667
}
576
668
}
577
669
578
- void TRowDispatcher::Handle (const NYql::NDq::TEvRetryQueuePrivate::TEvRetry::TPtr& ev) {
579
- LOG_ROW_DISPATCHER_TRACE (" TEvRetry " << ev->Get ()->EventQueueId );
580
- auto it = ConsumersByEventQueueId.find (ev->Get ()->EventQueueId );
581
- if (it == ConsumersByEventQueueId.end ()) {
582
- LOG_ROW_DISPATCHER_WARN (" No consumer with EventQueueId = " << ev->Get ()->EventQueueId );
583
- return ;
584
- }
585
- it->second ->EventsQueue .Retry ();
670
+ void TRowDispatcher::Handle (const TEvPrivate::TEvTryConnect::TPtr& ev) {
671
+ LOG_ROW_DISPATCHER_TRACE (" TEvTryConnect to node id " << ev->Get ()->NodeId );
672
+ NodesTracker.TryConnect (ev->Get ()->NodeId );
586
673
}
587
674
588
675
void TRowDispatcher::Handle (const NYql::NDq::TEvRetryQueuePrivate::TEvEvHeartbeat::TPtr& ev) {
@@ -705,18 +792,6 @@ void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvSessionStatistic::TPtr& ev
705
792
}
706
793
}
707
794
708
- void TRowDispatcher::UpdateInterconnectSessions (const NActors::TActorId& interconnectSession) {
709
- if (!interconnectSession) {
710
- return ;
711
- }
712
- auto sessionsIt = InterconnectSessions.find (interconnectSession);
713
- if (sessionsIt != InterconnectSessions.end ()) {
714
- return ;
715
- }
716
- Send (interconnectSession, new NActors::TEvents::TEvSubscribe, IEventHandle::FlagTrackDelivery);
717
- InterconnectSessions.insert (interconnectSession);
718
- }
719
-
720
795
} // namespace
721
796
722
797
// //////////////////////////////////////////////////////////////////////////////
0 commit comments