Skip to content

Commit b3d9c13

Browse files
authored
Collect and send backtraces for tx max/failed allocations in error message (#10419)
1 parent f3a9d80 commit b3d9c13

File tree

1 file changed

+51
-5
lines changed

1 file changed

+51
-5
lines changed

ydb/core/kqp/rm_service/kqp_rm_service.h

Lines changed: 51 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,10 @@ class TTxState : public TAtomicRefCount<TTxState> {
126126
std::atomic<ui64> TxMaxAllocation = 0;
127127
std::atomic<ui64> TxFailedAllocation = 0;
128128

129+
// TODO(ilezhankin): replace these raw owning pointers with std::atomic<std::shared_ptr<>> (C++20) once it is supported.
130+
std::atomic<TBackTrace*> TxMaxAllocationBacktrace = nullptr;
131+
std::atomic<TBackTrace*> TxFailedAllocationBacktrace = nullptr;
132+
129133
public:
130134
explicit TTxState(ui64 txId, TInstant now, TIntrusivePtr<TKqpCounters> counters, const TString& poolId, const double memoryPoolPercent,
131135
const TString& database)
@@ -137,6 +141,11 @@ class TTxState : public TAtomicRefCount<TTxState> {
137141
, Database(database)
138142
{}
139143

144+
// Releases the backtrace objects referenced by the two raw atomic pointers.
// Either pointer may still be nullptr (its initial value); deleting a null
// pointer is a no-op, so no explicit check is needed.
~TTxState() {
145+
delete TxMaxAllocationBacktrace.load();
146+
delete TxFailedAllocationBacktrace.load();
147+
}
148+
140149
// Returns the (Database, PoolId) pair for this transaction state.
std::pair<TString, TString> MakePoolId() const {
141150
return std::make_pair(Database, PoolId);
142151
}
@@ -157,7 +166,14 @@ class TTxState : public TAtomicRefCount<TTxState> {
157166
<< ", tx largest failed memory allocation: " << HumanReadableSize(TxFailedAllocation.load(), SF_BYTES)
158167
<< ", tx total execution units: " << TxExecutionUnits.load()
159168
<< ", started at: " << CreatedAt
160-
<< " }";
169+
<< " }" << Endl;
170+
171+
if (TxMaxAllocationBacktrace.load()) {
172+
res << "TxMaxAllocationBacktrace:" << Endl << TxMaxAllocationBacktrace.load()->PrintToString();
173+
}
174+
if (TxFailedAllocationBacktrace.load()) {
175+
res << "TxFailedAllocationBacktrace:" << Endl << TxFailedAllocationBacktrace.load()->PrintToString();
176+
}
161177

162178
return res;
163179
}
@@ -167,8 +183,23 @@ class TTxState : public TAtomicRefCount<TTxState> {
167183
}
168184

169185
// Records a failed memory allocation of `memory` bytes.
// TxFailedAllocation is raised to `memory` when `memory` is larger than the
// stored value; if this call performed that update, a fresh backtrace is
// captured and installed into TxFailedAllocationBacktrace.
//
// NOTE(review): the previous backtrace is deleted right after a successful
// pointer swap. Code that loaded the old pointer just before the swap (for
// example, code printing TxFailedAllocationBacktrace) could then touch freed
// memory -- confirm that readers and writers are serialized by the callers.
// NOTE(review): if the pointer swap fails, the new backtrace is discarded even
// though this call set the stored size, so the kept backtrace may belong to a
// different allocation -- confirm this is acceptable.
void AckFailedMemoryAlloc(ui64 memory) {
186+
// Snapshot the currently installed backtrace; used as the expected value of the pointer CAS below.
auto* oldBacktrace = TxFailedAllocationBacktrace.load();
170187
ui64 maxAlloc = TxFailedAllocation.load();
171-
while(maxAlloc < memory && !TxFailedAllocation.compare_exchange_weak(maxAlloc, memory));
188+
bool exchanged = false;
189+
190+
// Retry until the stored value is already >= memory or our CAS succeeds;
// on failure compare_exchange_weak refreshes maxAlloc with the current value.
while(maxAlloc < memory && !exchanged) {
191+
exchanged = TxFailedAllocation.compare_exchange_weak(maxAlloc, memory);
192+
}
193+
194+
if (exchanged) {
195+
auto* newBacktrace = new TBackTrace();
196+
newBacktrace->Capture();
197+
// Install the new backtrace only if the pointer is still the one snapshotted above.
if (TxFailedAllocationBacktrace.compare_exchange_strong(oldBacktrace, newBacktrace)) {
198+
delete oldBacktrace;
199+
} else {
200+
// The pointer changed since the snapshot: keep the installed one and drop ours.
delete newBacktrace;
201+
}
202+
}
172203
}
173204

174205
void Released(TIntrusivePtr<TTaskState>& taskState, const TKqpResourcesRequest& resources) {
@@ -186,9 +217,6 @@ class TTxState : public TAtomicRefCount<TTxState> {
186217
taskState->ScanQueryMemory -= resources.Memory;
187218
Counters->RmMemory->Sub(resources.Memory);
188219

189-
ui64 maxAlloc = TxMaxAllocation.load();
190-
while(maxAlloc < resources.Memory && !TxMaxAllocation.compare_exchange_weak(maxAlloc, resources.Memory));
191-
192220
TxExecutionUnits.fetch_sub(resources.ExecutionUnits);
193221
taskState->ExecutionUnits -= resources.ExecutionUnits;
194222
Counters->RmComputeActors->Sub(resources.ExecutionUnits);
@@ -210,6 +238,24 @@ class TTxState : public TAtomicRefCount<TTxState> {
210238
Counters->RmExtraMemAllocs->Inc();
211239
}
212240

241+
auto* oldBacktrace = TxMaxAllocationBacktrace.load();
242+
ui64 maxAlloc = TxMaxAllocation.load();
243+
bool exchanged = false;
244+
245+
while(maxAlloc < resources.Memory && !exchanged) {
246+
exchanged = TxMaxAllocation.compare_exchange_weak(maxAlloc, resources.Memory);
247+
}
248+
249+
if (exchanged) {
250+
auto* newBacktrace = new TBackTrace();
251+
newBacktrace->Capture();
252+
if (TxMaxAllocationBacktrace.compare_exchange_strong(oldBacktrace, newBacktrace)) {
253+
delete oldBacktrace;
254+
} else {
255+
delete newBacktrace;
256+
}
257+
}
258+
213259
TxExecutionUnits.fetch_add(resources.ExecutionUnits);
214260
taskState->ExecutionUnits += resources.ExecutionUnits;
215261
Counters->RmComputeActors->Add(resources.ExecutionUnits);

0 commit comments

Comments
 (0)