Skip to content

Commit 881e575

Browse files
authored
Sink metrics & trace (#10397)
1 parent c73f760 commit 881e575

File tree

6 files changed

+68
-5
lines changed

6 files changed

+68
-5
lines changed

ydb/core/kqp/counters/kqp_counters.cpp

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -814,6 +814,18 @@ TKqpCounters::TKqpCounters(const ::NMonitoring::TDynamicCounterPtr& counters, co
814814
DataShardIteratorMessages = KqpGroup->GetCounter("IteratorReads/DatashardMessages", true);
815815
IteratorDeliveryProblems = KqpGroup->GetCounter("IteratorReads/DeliveryProblems", true);
816816

817+
/* sink writes */
818+
WriteActorsShardResolve = KqpGroup->GetCounter("SinkWrites/WriteActorShardResolve", true);
819+
WriteActorsCount = KqpGroup->GetCounter("SinkWrites/WriteActorsCount", false);
820+
WriteActorImmediateWrites = KqpGroup->GetCounter("SinkWrites/WriteActorImmediateWrites", true);
821+
WriteActorImmediateWritesRetries = KqpGroup->GetCounter("SinkWrites/WriteActorImmediateWritesRetries", true);
822+
WriteActorWritesSizeHistogram =
823+
KqpGroup->GetHistogram("SinkWrites/WriteActorWritesSize", NMonitoring::ExponentialHistogram(28, 2, 1));
824+
WriteActorWritesOperationsHistogram =
825+
KqpGroup->GetHistogram("SinkWrites/WriteActorWritesOperations", NMonitoring::ExponentialHistogram(20, 2, 1));
826+
WriteActorWritesLatencyHistogram =
827+
KqpGroup->GetHistogram("SinkWrites/WriteActorWritesLatencyMs", NMonitoring::ExponentialHistogram(20, 2, 1));
828+
817829
/* sequencers */
818830

819831
SequencerActorsCount = KqpGroup->GetCounter("Sequencer/ActorCount", false);

ydb/core/kqp/counters/kqp_counters.h

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -409,6 +409,15 @@ class TKqpCounters : public TKqpCountersBase, public NYql::NDq::TSpillingCounter
409409
::NMonitoring::TDynamicCounters::TCounterPtr DataShardIteratorMessages;
410410
::NMonitoring::TDynamicCounters::TCounterPtr IteratorDeliveryProblems;
411411

412+
// Sink write counters
413+
::NMonitoring::TDynamicCounters::TCounterPtr WriteActorsShardResolve;
414+
::NMonitoring::TDynamicCounters::TCounterPtr WriteActorsCount;
415+
::NMonitoring::TDynamicCounters::TCounterPtr WriteActorImmediateWrites;
416+
::NMonitoring::TDynamicCounters::TCounterPtr WriteActorImmediateWritesRetries;
417+
NMonitoring::THistogramPtr WriteActorWritesSizeHistogram;
418+
NMonitoring::THistogramPtr WriteActorWritesOperationsHistogram;
419+
NMonitoring::THistogramPtr WriteActorWritesLatencyHistogram;
420+
412421
// Scheduler signals
413422
::NMonitoring::TDynamicCounters::TCounterPtr SchedulerThrottled;
414423
::NMonitoring::TDynamicCounters::TCounterPtr SchedulerCapacity;

ydb/core/kqp/runtime/kqp_write_actor.cpp

Lines changed: 41 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
#include <ydb/core/tx/tx.h>
2020
#include <ydb/library/actors/core/actorsystem.h>
2121
#include <ydb/library/actors/core/interconnect.h>
22+
#include <ydb/library/wilson_ids/wilson.h>
2223
#include <ydb/library/yql/dq/actors/compute/dq_compute_actor_impl.h>
2324
#include <ydb/library/yql/public/issue/yql_issue_message.h>
2425

@@ -134,10 +135,13 @@ class TKqpDirectWriteActor : public TActorBootstrapped<TKqpDirectWriteActor>, pu
134135
, InconsistentTx(
135136
Settings.GetInconsistentTx())
136137
, MemoryLimit(MessageSettings.InFlightMemoryLimitPerActorBytes)
138+
, WriteActorSpan(TWilsonKqp::WriteActor, NWilson::TTraceId(args.TraceId), "WriteActor")
137139
{
138140
YQL_ENSURE(std::holds_alternative<ui64>(TxId));
139141
YQL_ENSURE(!ImmediateTx);
140142
EgressStats.Level = args.StatsLevel;
143+
144+
Counters->WriteActorsCount->Inc();
141145
}
142146

143147
void Bootstrap() {
@@ -244,6 +248,7 @@ class TKqpDirectWriteActor : public TActorBootstrapped<TKqpDirectWriteActor>, pu
244248
}
245249

246250
void ResolveTable() {
251+
Counters->WriteActorsShardResolve->Inc();
247252
SchemeEntry.reset();
248253
SchemeRequest.reset();
249254

@@ -267,8 +272,11 @@ class TKqpDirectWriteActor : public TActorBootstrapped<TKqpDirectWriteActor>, pu
267272
entry.ShowPrivatePath = true;
268273
request->ResultSet.emplace_back(entry);
269274

270-
Send(MakeSchemeCacheID(), new TEvTxProxySchemeCache::TEvInvalidateTable(TableId, {}));
271-
Send(MakeSchemeCacheID(), new TEvTxProxySchemeCache::TEvNavigateKeySet(request));
275+
WriteActorStateSpan = NWilson::TSpan(TWilsonKqp::WriteActorTableNavigate, WriteActorSpan.GetTraceId(),
276+
"WaitForShardsResolve", NWilson::EFlags::AUTO_END);
277+
278+
Send(MakeSchemeCacheID(), new TEvTxProxySchemeCache::TEvInvalidateTable(TableId, {}), 0, 0, WriteActorSpan.GetTraceId());
279+
Send(MakeSchemeCacheID(), new TEvTxProxySchemeCache::TEvNavigateKeySet(request), 0, 0, WriteActorSpan.GetTraceId());
272280
}
273281

274282
void Handle(TEvTxProxySchemeCache::TEvNavigateKeySetResult::TPtr& ev) {
@@ -327,7 +335,7 @@ class TKqpDirectWriteActor : public TActorBootstrapped<TKqpDirectWriteActor>, pu
327335
request->ResultSet.emplace_back(std::move(keyRange));
328336

329337
TAutoPtr<TEvTxProxySchemeCache::TEvResolveKeySet> resolveReq(new TEvTxProxySchemeCache::TEvResolveKeySet(request));
330-
Send(MakeSchemeCacheID(), resolveReq.Release(), 0, 0);
338+
Send(MakeSchemeCacheID(), resolveReq.Release(), 0, 0, WriteActorSpan.GetTraceId());
331339
}
332340

333341
void Handle(TEvTxProxySchemeCache::TEvResolveKeySetResult::TPtr& ev) {
@@ -368,6 +376,8 @@ class TKqpDirectWriteActor : public TActorBootstrapped<TKqpDirectWriteActor>, pu
368376
}()
369377
<< ", Cookie=" << ev->Cookie);
370378

379+
380+
371381
switch (ev->Get()->GetStatus()) {
372382
case NKikimrDataEvents::TEvWriteResult::STATUS_UNSPECIFIED: {
373383
CA_LOG_E("Got UNSPECIFIED for table `"
@@ -557,6 +567,11 @@ class TKqpDirectWriteActor : public TActorBootstrapped<TKqpDirectWriteActor>, pu
557567
EgressStats.Chunks++;
558568
EgressStats.Splits++;
559569
EgressStats.Resume();
570+
571+
if (auto it = SendTime.find(shardId); it != std::end(SendTime)) {
572+
Counters->WriteActorWritesLatencyHistogram->Collect((TInstant::Now() - it->second).MilliSeconds());
573+
SendTime.erase(it);
574+
}
560575
}
561576
resumeNotificator.CheckMemory();
562577
}
@@ -594,7 +609,6 @@ class TKqpDirectWriteActor : public TActorBootstrapped<TKqpDirectWriteActor>, pu
594609
NYql::NDqProto::StatusIds::UNAVAILABLE);
595610
return;
596611
}
597-
598612
auto evWrite = std::make_unique<NKikimr::NEvents::TDataEvents::TEvWrite>(
599613
NKikimrDataEvents::TEvWrite::MODE_IMMEDIATE);
600614

@@ -628,6 +642,16 @@ class TKqpDirectWriteActor : public TActorBootstrapped<TKqpDirectWriteActor>, pu
628642
ShardedWriteController->GetDataFormat());
629643
}
630644

645+
if (metadata->SendAttempts == 0) {
646+
Counters->WriteActorImmediateWrites->Inc();
647+
Counters->WriteActorWritesSizeHistogram->Collect(serializationResult.TotalDataSize);
648+
Counters->WriteActorWritesOperationsHistogram->Collect(metadata->OperationsCount);
649+
650+
SendTime[shardId] = TInstant::Now();
651+
} else {
652+
Counters->WriteActorImmediateWritesRetries->Inc();
653+
}
654+
631655
CA_LOG_D("Send EvWrite to ShardID=" << shardId << ", TxId=" << evWrite->Record.GetTxId()
632656
<< ", TxMode=" << evWrite->Record.GetTxMode()
633657
<< ", LockTxId=" << evWrite->Record.GetLockTxId() << ", LockNodeId=" << evWrite->Record.GetLockNodeId()
@@ -723,6 +747,13 @@ class TKqpDirectWriteActor : public TActorBootstrapped<TKqpDirectWriteActor>, pu
723747
NYql::TIssues issues;
724748
issues.AddIssue(std::move(issue));
725749

750+
if (WriteActorStateSpan) {
751+
WriteActorStateSpan.EndError(issues.ToOneLineString());
752+
}
753+
if (WriteActorSpan) {
754+
WriteActorSpan.EndError(issues.ToOneLineString());
755+
}
756+
726757
Callbacks->OnAsyncOutputError(OutputIndex, std::move(issues), statusCode);
727758
}
728759

@@ -732,6 +763,8 @@ class TKqpDirectWriteActor : public TActorBootstrapped<TKqpDirectWriteActor>, pu
732763
}
733764

734765
void Prepare() {
766+
WriteActorStateSpan.EndOk();
767+
735768
YQL_ENSURE(SchemeEntry);
736769
ResolveAttempts = 0;
737770

@@ -803,12 +836,16 @@ class TKqpDirectWriteActor : public TActorBootstrapped<TKqpDirectWriteActor>, pu
803836
std::optional<NSchemeCache::TSchemeCacheRequest::TEntry> SchemeRequest;
804837
ui64 ResolveAttempts = 0;
805838

839+
THashMap<ui64, TInstant> SendTime;
806840
THashMap<ui64, TLockInfo> LocksInfo;
807841
bool Finished = false;
808842

809843
const i64 MemoryLimit;
810844

811845
IShardedWriteControllerPtr ShardedWriteController = nullptr;
846+
847+
NWilson::TSpan WriteActorSpan;
848+
NWilson::TSpan WriteActorStateSpan;
812849
};
813850

814851
void RegisterKqpWriteActor(NYql::NDq::TDqAsyncIoFactory& factory, TIntrusivePtr<TKqpCounters> counters) {

ydb/library/wilson_ids/wilson.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -79,6 +79,9 @@ namespace NKikimr {
7979
LookupActor = TComponentTracingLevels::TQueryProcessor::Basic,
8080
LookupActorShardsResolve = TComponentTracingLevels::TQueryProcessor::Detailed,
8181

82+
WriteActor = TComponentTracingLevels::TQueryProcessor::Basic,
83+
WriteActorTableNavigate = TComponentTracingLevels::TQueryProcessor::Detailed,
84+
8285
BulkUpsertActor = TComponentTracingLevels::TQueryProcessor::TopLevel,
8386
};
8487
};

ydb/library/yql/dq/actors/compute/dq_compute_actor_async_io.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -287,6 +287,7 @@ struct IDqAsyncIoFactory : public TThrRefBase {
287287
const NKikimr::NMiniKQL::THolderFactory& HolderFactory;
288288
std::shared_ptr<NKikimr::NMiniKQL::TScopedAlloc> Alloc;
289289
IRandomProvider *const RandomProvider;
290+
NWilson::TTraceId TraceId;
290291
};
291292

292293
struct TInputTransformArguments {

ydb/library/yql/dq/actors/compute/dq_compute_actor_impl.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1394,7 +1394,8 @@ class TDqComputeActorBase : public NActors::TActorBootstrapped<TDerived>
13941394
.TypeEnv = typeEnv,
13951395
.HolderFactory = holderFactory,
13961396
.Alloc = Alloc,
1397-
.RandomProvider = randomProvider
1397+
.RandomProvider = randomProvider,
1398+
.TraceId = ComputeActorSpan.GetTraceId(),
13981399
});
13991400
} catch (const std::exception& ex) {
14001401
throw yexception() << "Failed to create sink " << outputDesc.GetSink().GetType() << ": " << ex.what();

0 commit comments

Comments
 (0)