Skip to content

Commit ee2ec8b

Browse files
rustyrussell authored and endothermicdev committed
trace: minimal fix to avoid crash when > 128 traces active.
chanbackup with many peers can do more than 128 concurrent rpc commands. autoclean is the other plugin which can do many requests at once, so I expect a similar issue there.

I tested this by rebuilding with `MAX_ACTIVE_SPANS` 1, which autoclean tests triggered immediately.

The real fix is probably to use a hash table with a large initial size.

```
Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: common/trace.c:190: trace_span_slot: Assertion `s' failed.
Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: FATAL SIGNAL 6 (version v25.02)
Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232bac4f send_backtrace
Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:33
Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232baceb crashdump
Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:78
Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd851f ???
Mar 24 06:30:45 mlbb2 sh[28000]: ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0
Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_implementation
Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:44
Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_internal
Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:78
Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __GI___pthread_kill
Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:89
Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd8475 __GI_raise
Mar 24 06:30:45 mlbb2 sh[28000]: ../sysdeps/posix/raise.c:26
Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe7f2 __GI_abort
Mar 24 06:30:45 mlbb2 sh[28000]: ./stdlib/abort.c:79
Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe71a __assert_fail_base
Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:94
Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958ccfe95 __GI___assert_fail
Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:103
Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232ab7fa trace_span_slot
Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:190
Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232abc9f trace_span_start
Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:267
Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232a7c34 send_outreq
Mar 24 06:30:45 mlbb2 sh[28000]: plugins/libplugin.c:1112
```

Changelog-Fixed: autoclean/chanbackup: fixed tracepoint crash on large number of requests.
Fixes: #8177
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
1 parent 8dae8be commit ee2ec8b

File tree

1 file changed

+29
-2
lines changed

1 file changed

+29
-2
lines changed

common/trace.c

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,10 @@
1010
#include <common/trace.h>
1111
#include <sodium/randombytes.h>
1212
#include <stdio.h>
13+
#include <unistd.h>
1314

1415
#if HAVE_USDT
15-
#include <sys/sdt.h>
16+
#include <sys/sdt.h>
1617

1718
#define MAX_ACTIVE_SPANS 128
1819

@@ -40,6 +41,7 @@
4041
#endif
4142

4243
const char *trace_service_name = "lightningd";
44+
static bool disable_trace = false;
4345

4446
struct span_tag {
4547
char *name, *value;
@@ -176,6 +178,9 @@ static struct span *trace_span_find(size_t key)
176178
return NULL;
177179
}
178180

181+
/* FIXME: Forward declaration for minimal patch size */
182+
static void trace_span_clear(struct span *s);
183+
179184
/**
180185
* Find an empty slot for a new span.
181186
*/
@@ -187,7 +192,13 @@ static struct span *trace_span_slot(void)
187192

188193
/* Might end up here if we have more than MAX_ACTIVE_SPANS
189194
* concurrent spans. */
190-
assert(s);
195+
if (!s) {
196+
fprintf(stderr, "%u: out of spans, disabling tracing\n", getpid());
197+
for (size_t i = 0; i < MAX_ACTIVE_SPANS; i++)
198+
trace_span_clear(&active_spans[i]);
199+
disable_trace = true;
200+
return NULL;
201+
}
191202
assert(s->parent == NULL);
192203

193204
/* Be extra careful not to create cycles. If we return the
@@ -260,11 +271,15 @@ void trace_span_start(const char *name, const void *key)
260271
size_t numkey = trace_key(key);
261272
struct timeabs now = time_now();
262273

274+
if (disable_trace)
275+
return;
263276
trace_init();
264277
trace_check_tree();
265278

266279
assert(trace_span_find(numkey) == NULL);
267280
struct span *s = trace_span_slot();
281+
if (!s)
282+
return;
268283
s->key = numkey;
269284
randombytes_buf(s->id, SPAN_ID_SIZE);
270285
s->start_time = (now.ts.tv_sec * 1000000) + now.ts.tv_nsec / 1000;
@@ -293,6 +308,9 @@ void trace_span_remote(u8 trace_id[TRACE_ID_SIZE], u8 span_id[SPAN_ID_SIZE])
293308

294309
void trace_span_end(const void *key)
295310
{
311+
if (disable_trace)
312+
return;
313+
296314
size_t numkey = trace_key(key);
297315
struct span *s = trace_span_find(numkey);
298316
assert(s && "Span to end not found");
@@ -323,6 +341,9 @@ void trace_span_end(const void *key)
323341

324342
void trace_span_tag(const void *key, const char *name, const char *value)
325343
{
344+
if (disable_trace)
345+
return;
346+
326347
size_t numkey = trace_key(key);
327348
struct span *span = trace_span_find(numkey);
328349
assert(span);
@@ -341,6 +362,9 @@ void trace_span_tag(const void *key, const char *name, const char *value)
341362

342363
void trace_span_suspend_(const void *key, const char *lbl)
343364
{
365+
if (disable_trace)
366+
return;
367+
344368
size_t numkey = trace_key(key);
345369
struct span *span = trace_span_find(numkey);
346370
TRACE_DBG("Suspending span %s (%zu)\n", current->name, current->key);
@@ -351,6 +375,9 @@ void trace_span_suspend_(const void *key, const char *lbl)
351375

352376
void trace_span_resume_(const void *key, const char *lbl)
353377
{
378+
if (disable_trace)
379+
return;
380+
354381
size_t numkey = trace_key(key);
355382
current = trace_span_find(numkey);
356383
TRACE_DBG("Resuming span %s (%zu)\n", current->name, current->key);

0 commit comments

Comments
 (0)