Skip to content

Commit 048e65e

Browse files
authored
pq rd: fix use-after-free with logbroker federation (v2) (#17189)
1 parent 476779e commit 048e65e

File tree

1 file changed

+68
-12
lines changed

1 file changed

+68
-12
lines changed

ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.cpp

Lines changed: 68 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ class TDqPqRdReadActor : public NActors::TActor<TDqPqRdReadActor>, public NYql::
234234
std::vector<std::optional<ui64>> ColumnIndexes; // Output column index in schema passed into RowDispatcher
235235
const TType* InputDataType = nullptr; // Multi type (comes from Row Dispatcher)
236236
std::unique_ptr<NKikimr::NMiniKQL::TValuePackerTransport<true>> DataUnpacker;
237+
// Set on Parent
237238
ui64 CpuMicrosec = 0;
238239
// Set on both Parent (cumulative) and Childern (separate)
239240

@@ -377,10 +378,47 @@ class TDqPqRdReadActor : public NActors::TActor<TDqPqRdReadActor>, public NYql::
377378
hFunc(TEvPrivate::TEvRefreshClusters, Handle);
378379
hFunc(TEvPrivate::TEvReceivedClusters, Handle);
379380
hFunc(TEvPrivate::TEvDescribeTopicResult, Handle);
381+
})
380382

381-
cFunc(TEvents::TEvPoisonPill::EventType, PassAway);
383+
STRICT_STFUNC(IgnoreState, {
384+
// ignore all events except for retry queue
385+
hFunc(NFq::TEvRowDispatcher::TEvCoordinatorChanged, IgnoreEvent);
386+
hFunc(NFq::TEvRowDispatcher::TEvCoordinatorResult, ReplyNoSession);
387+
hFunc(NFq::TEvRowDispatcher::TEvNewDataArrived, ReplyNoSession);
388+
hFunc(NFq::TEvRowDispatcher::TEvMessageBatch, ReplyNoSession);
389+
hFunc(NFq::TEvRowDispatcher::TEvStartSessionAck, ReplyNoSession);
390+
hFunc(NFq::TEvRowDispatcher::TEvSessionError, ReplyNoSession);
391+
hFunc(NFq::TEvRowDispatcher::TEvStatistics, ReplyNoSession);
392+
hFunc(NFq::TEvRowDispatcher::TEvGetInternalStateRequest, ReplyNoSession);
393+
394+
hFunc(NActors::TEvents::TEvPong, Handle);
395+
hFunc(TEvInterconnect::TEvNodeConnected, HandleConnected);
396+
hFunc(TEvInterconnect::TEvNodeDisconnected, HandleDisconnected);
397+
hFunc(NActors::TEvents::TEvUndelivered, Handle);
398+
hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvRetry, Handle);
399+
hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvEvHeartbeat, Handle);
400+
401+
// ignore all row dispatcher events
402+
hFunc(NFq::TEvRowDispatcher::TEvHeartbeat, ReplyNoSession);
403+
hFunc(TEvPrivate::TEvPrintState, IgnoreEvent);
404+
hFunc(TEvPrivate::TEvProcessState, IgnoreEvent);
405+
hFunc(TEvPrivate::TEvNotifyCA, IgnoreEvent);
406+
hFunc(TEvPrivate::TEvRefreshClusters, IgnoreEvent);
407+
hFunc(TEvPrivate::TEvReceivedClusters, IgnoreEvent);
408+
hFunc(TEvPrivate::TEvDescribeTopicResult, IgnoreEvent);
382409
})
383410

411+
template <class TEventPtr>
412+
void IgnoreEvent(TEventPtr& ev) {
413+
SRC_LOG_D("Ignore " << typeid(TEventPtr).name() << " from " << ev->Sender);
414+
}
415+
416+
template <class TEventPtr>
417+
void ReplyNoSession(TEventPtr& ev) {
418+
SRC_LOG_D("Ignore (no session) " << typeid(TEventPtr).name() << " from " << ev->Sender);
419+
SendNoSession(ev->Sender, ev->Cookie);
420+
}
421+
384422
static constexpr char ActorName[] = "DQ_PQ_READ_ACTOR";
385423

386424
void CommitState(const NDqProto::TCheckpoint& checkpoint) override;
@@ -485,9 +523,13 @@ TDqPqRdReadActor::TDqPqRdReadActor(
485523
, CredentialsProviderFactory(std::move(credentialsProviderFactory))
486524
, MaxBufferSize(bufferSize)
487525
{
488-
if (Parent == this) {
489-
State = EState::START_CLUSTER_DISCOVERY;
526+
527+
SRC_LOG_I("Start read actor, local row dispatcher " << LocalRowDispatcherActorId.ToString() << ", metadatafields: " << JoinSeq(',', SourceParams.GetMetadataFields())
528+
<< ", partitions: " << JoinSeq(',', GetPartitionsToRead()));
529+
if (Parent != this) {
530+
return;
490531
}
532+
State = EState::START_CLUSTER_DISCOVERY;
491533
const auto programBuilder = std::make_unique<TProgramBuilder>(typeEnv, *holderFactory.GetFunctionRegistry());
492534

493535
// Parse output schema (expected struct output type)
@@ -510,8 +552,6 @@ TDqPqRdReadActor::TDqPqRdReadActor(
510552
DataUnpacker = std::make_unique<NKikimr::NMiniKQL::TValuePackerTransport<true>>(InputDataType);
511553

512554
IngressStats.Level = statsLevel;
513-
SRC_LOG_I("Start read actor, local row dispatcher " << LocalRowDispatcherActorId.ToString() << ", metadatafields: " << JoinSeq(',', SourceParams.GetMetadataFields())
514-
<< ", partitions: " << JoinSeq(',', GetPartitionsToRead()));
515555
}
516556

517557
void TDqPqRdReadActor::Init() {
@@ -663,15 +703,18 @@ void TDqPqRdReadActor::StopSession(TSession& sessionInfo) {
663703
// IActor & IDqComputeActorAsyncInput
664704
void TDqPqRdReadActor::PassAway() { // Is called from Compute Actor
665705
SRC_LOG_I("PassAway");
706+
Become(&TDqPqRdReadActor::IgnoreState);
666707
PrintInternalState();
667708
for (auto& [rowDispatcherActorId, sessionInfo] : Sessions) {
668709
StopSession(sessionInfo);
669710
}
670711
for (auto& clusterState : Clusters) {
671-
if (clusterState.Child == this) {
712+
auto child = clusterState.Child;
713+
if (child == this) {
672714
continue;
673715
}
674-
Send(clusterState.ChildId, new NActors::TEvents::TEvPoison);
716+
// all actors are on same mailbox, safe to call
717+
child->PassAway();
675718
}
676719
Clusters.clear();
677720
FederatedTopicClient.Reset();
@@ -681,6 +724,7 @@ void TDqPqRdReadActor::PassAway() { // Is called from Compute Actor
681724
}
682725

683726
i64 TDqPqRdReadActor::GetAsyncInputData(NKikimr::NMiniKQL::TUnboxedValueBatch& buffer, TMaybe<TInstant>& /*watermark*/, bool&, i64 freeSpace) {
727+
Counters.GetAsyncInputData++;
684728
SRC_LOG_T("GetAsyncInputData freeSpace = " << freeSpace);
685729
Init();
686730
Metrics.InFlyAsyncInputData->Set(0);
@@ -717,6 +761,7 @@ i64 TDqPqRdReadActor::GetAsyncInputData(NKikimr::NMiniKQL::TUnboxedValueBatch& b
717761
continue;
718762
}
719763
for (auto& [rowDispatcherActorId, sessionInfo] : child->Sessions) {
764+
// all actors are on same mailbox, safe to call
720765
child->TrySendGetNextBatch(sessionInfo);
721766
}
722767
}
@@ -769,6 +814,7 @@ void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvStatistics::TPtr& ev) {
769814
SRC_LOG_T("Received TEvStatistics from " << ev->Sender << ", seqNo " << meta.GetSeqNo() << ", ConfirmedSeqNo " << meta.GetConfirmedSeqNo() << " generation " << ev->Cookie);
770815
Counters.Statistics++;
771816
CpuMicrosec += ev->Get()->Record.GetCpuMicrosec();
817+
// all actors are on same mailbox, this method is not called after Parent stopped, safe to access directly
772818
if (Parent != this) {
773819
Parent->CpuMicrosec += ev->Get()->Record.GetCpuMicrosec();
774820
}
@@ -872,7 +918,7 @@ void TDqPqRdReadActor::Handle(const NFq::TEvRowDispatcher::TEvHeartbeat::TPtr& e
872918

873919
void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvCoordinatorChanged::TPtr& ev) {
874920
SRC_LOG_D("TEvCoordinatorChanged, new coordinator " << ev->Get()->CoordinatorActorId);
875-
Counters.GetAsyncInputData++;
921+
Counters.CoordinatorChanged++;
876922

877923
if (CoordinatorActorId
878924
&& CoordinatorActorId == ev->Get()->CoordinatorActorId) {
@@ -900,6 +946,7 @@ void TDqPqRdReadActor::ScheduleProcessState() {
900946

901947
void TDqPqRdReadActor::ReInit(const TString& reason) {
902948
SRC_LOG_I("ReInit state, reason " << reason);
949+
// all actors are on same mailbox, this method is not called after Parent stopped, safe to access directly
903950
Parent->Metrics.ReInit->Inc();
904951

905952
State = EState::WAIT_COORDINATOR_ID;
@@ -915,7 +962,7 @@ void TDqPqRdReadActor::Stop(NDqProto::StatusIds::StatusCode status, TIssues issu
915962

916963
void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvCoordinatorResult::TPtr& ev) {
917964
SRC_LOG_I("Received TEvCoordinatorResult from " << ev->Sender.ToString() << ", cookie " << ev->Cookie);
918-
Counters.CoordinatorChanged++;
965+
Counters.CoordinatorResult++;
919966
if (ev->Cookie != CoordinatorRequestCookie) {
920967
SRC_LOG_W("Ignore TEvCoordinatorResult. wrong cookie");
921968
return;
@@ -991,6 +1038,7 @@ void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvMessageBatch::TPtr& ev)
9911038
Stop(NDqProto::StatusIds::INTERNAL_ERROR, {TIssue(TStringBuilder() << LogPrefix << "No partition with id " << partitionId)});
9921039
return;
9931040
}
1041+
// all actors are on same mailbox, this method is not called after Parent stopped, safe to access directly
9941042
Parent->Metrics.InFlyGetNextBatch->Set(0);
9951043
if (ev->Get()->Record.GetMessages().empty()) {
9961044
return;
@@ -1068,14 +1116,21 @@ void TDqPqRdReadActor::PrintInternalState() {
10681116

10691117
TString TDqPqRdReadActor::GetInternalState() {
10701118
TStringStream str;
1071-
str << LogPrefix << "State: used buffer size " << Parent->ReadyBufferSizeBytes << " ready buffer event size " << Parent->ReadyBuffer.size() << " state " << static_cast<ui64>(State) << " InFlyAsyncInputData " << Parent->InFlyAsyncInputData << "\n";
1072-
str << "Counters: GetAsyncInputData " << Counters.GetAsyncInputData << " CoordinatorChanged " << Counters.CoordinatorChanged << " CoordinatorResult " << Counters.CoordinatorResult
1119+
str << LogPrefix << "State:";
1120+
str << " used buffer size " << Parent->ReadyBufferSizeBytes << " ready buffer event size " << Parent->ReadyBuffer.size()
1121+
<< " state " << static_cast<ui64>(State)
1122+
<< " InFlyAsyncInputData " << Parent->InFlyAsyncInputData
1123+
<< "\n";
1124+
str << "Counters:"
1125+
<< " CoordinatorChanged " << Counters.CoordinatorChanged << " CoordinatorResult " << Counters.CoordinatorResult
10731126
<< " MessageBatch " << Counters.MessageBatch << " StartSessionAck " << Counters.StartSessionAck << " NewDataArrived " << Counters.NewDataArrived
10741127
<< " SessionError " << Counters.SessionError << " Statistics " << Counters.Statistics << " NodeDisconnected " << Counters.NodeDisconnected
10751128
<< " NodeConnected " << Counters.NodeConnected << " Undelivered " << Counters.Undelivered << " Retry " << Counters.Retry
10761129
<< " PrivateHeartbeat " << Counters.PrivateHeartbeat << " SessionClosed " << Counters.SessionClosed << " Pong " << Counters.Pong
10771130
<< " Heartbeat " << Counters.Heartbeat << " PrintState " << Counters.PrintState << " ProcessState " << Counters.ProcessState
1078-
<< " NotifyCA " << Parent->Counters.NotifyCA << "\n";
1131+
<< " GetAsyncInputData " << Parent->Counters.GetAsyncInputData
1132+
<< " NotifyCA " << Parent->Counters.NotifyCA
1133+
<< "\n";
10791134

10801135
for (auto& [rowDispatcherActorId, sessionInfo] : Sessions) {
10811136
str << " " << rowDispatcherActorId << " status " << static_cast<ui64>(sessionInfo.Status)
@@ -1365,6 +1420,7 @@ void TDqPqRdReadActor::StartCluster(ui32 clusterIndex) {
13651420
PqGateway,
13661421
this,
13671422
TString(Clusters[clusterIndex].Info.Name));
1423+
Clusters[clusterIndex].Child = actor;
13681424
Clusters[clusterIndex].ChildId = RegisterWithSameMailbox(actor);
13691425
actor->Init();
13701426
actor->InitChild();

0 commit comments

Comments
 (0)