Skip to content

Commit 26ac81c

Browse files
vtjnash authored and JeffBezanson committed
Try to avoid julia becoming unkillable after fatal errors (#40056)
- don't smash the alt-stack when already using it
- handle jl_critical_error on the original stack, leaving our signal handling thread free to handle more signals (and helping lock corruption detection in some cases)
- unblock signals when handling signals: some libc apparently like to block all signals, which can cause mild havoc, since we'd really like the user or bad data to be able to still kill the process (and not just be ignored or cause it to hang)
- reset signals to SIG_DFL earlier (so we recurse less)
- destroy some state from the Task we co-opted to run the exit handlers, so that it can't accidentally jump back into the running program after we've started tearing down the process, from an untimely ^C (previously ^C might cancel the exit) or a jlbacktrace call.
- mark functions as leaf with CFI instead of (potentially) smashing the stack, and add a bit of red-zone if we are recursing (to keep pgcstack sensible)
- support safe_restore for the mach catch_exception_raise (while we're trying to generate the backtrace)

(cherry picked from commit 107901d)
1 parent 7c45ff0 commit 26ac81c

File tree

8 files changed

+310
-139
lines changed

8 files changed

+310
-139
lines changed

src/gf.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1817,7 +1817,7 @@ static void JL_NORETURN jl_method_error_bare(jl_function_t *f, jl_value_t *args,
18171817
jl_static_show((JL_STREAM*)STDERR_FILENO,args); jl_printf((JL_STREAM*)STDERR_FILENO,"\n");
18181818
jl_ptls_t ptls = jl_get_ptls_states();
18191819
ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0);
1820-
jl_critical_error(0, NULL, ptls->bt_data, &ptls->bt_size);
1820+
jl_critical_error(0, NULL);
18211821
abort();
18221822
}
18231823
// not reached

src/julia_internal.h

Lines changed: 31 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,36 @@ void __tsan_switch_to_fiber(void *fiber, unsigned flags);
6969
# define JL_USE_IFUNC 0
7070
#endif
7171

72+
// If we've smashed the stack, (and not just normal NORETURN)
73+
// this will smash stack-unwind too
74+
#ifdef _OS_WINDOWS_
75+
#if defined(_CPU_X86_64_)
76+
// install the unhandled exception handler at the top of our stack
77+
// to call directly into our personality handler
78+
#define CFI_NORETURN \
79+
asm volatile ("\t.seh_handler __julia_personality, @except\n\t.text");
80+
#else
81+
#define CFI_NORETURN
82+
#endif
83+
#else
84+
// wipe out the call-stack unwind capability beyond this function
85+
// (we are noreturn, so it is not a total lie)
86+
#if defined(_CPU_X86_64_)
87+
// per nongnu libunwind: "x86_64 ABI specifies that end of call-chain is marked with a NULL RBP or undefined return address"
88+
// so we do all 3, to be extra certain of it
89+
#define CFI_NORETURN \
90+
asm volatile ("\t.cfi_undefined rip"); \
91+
asm volatile ("\t.cfi_undefined rbp"); \
92+
asm volatile ("\t.cfi_return_column rbp");
93+
#else
94+
// per nongnu libunwind: "DWARF spec says undefined return address location means end of stack"
95+
// we use whatever happens to be register 1 on this platform for this
96+
#define CFI_NORETURN \
97+
asm volatile ("\t.cfi_undefined 1"); \
98+
asm volatile ("\t.cfi_return_column 1");
99+
#endif
100+
#endif
101+
72102
// If this is detected in a backtrace of segfault, it means the functions
73103
// that use this value must be reworked into their async form with cb arg
74104
// provided and with JL_UV_LOCK used around the calls
@@ -904,7 +934,7 @@ size_t rec_backtrace_ctx(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t
904934
size_t rec_backtrace_ctx_dwarf(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t *ctx, jl_gcframe_t *pgcstack) JL_NOTSAFEPOINT;
905935
#endif
906936
JL_DLLEXPORT jl_value_t *jl_get_backtrace(void);
907-
void jl_critical_error(int sig, bt_context_t *context, jl_bt_element_t *bt_data, size_t *bt_size);
937+
void jl_critical_error(int sig, bt_context_t *context);
908938
JL_DLLEXPORT void jl_raise_debugger(void);
909939
int jl_getFunctionInfo(jl_frame_t **frames, uintptr_t pointer, int skipC, int noInline) JL_NOTSAFEPOINT;
910940
JL_DLLEXPORT void jl_gdblookup(void* ip) JL_NOTSAFEPOINT;

src/signal-handling.c

Lines changed: 35 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -231,15 +231,44 @@ void jl_show_sigill(void *_ctx)
231231
#endif
232232
}
233233

234-
// what to do on a critical error
235-
void jl_critical_error(int sig, bt_context_t *context, jl_bt_element_t *bt_data, size_t *bt_size)
234+
// what to do on a critical error on a thread
235+
void jl_critical_error(int sig, bt_context_t *context)
236236
{
237-
// This function is not allowed to reference any TLS variables.
238-
// We need to explicitly pass in the TLS buffer pointer when
239-
// we make `jl_filename` and `jl_lineno` thread local.
237+
238+
jl_ptls_t ptls = jl_get_ptls_states();
239+
jl_bt_element_t *bt_data = ptls->bt_data;
240+
size_t *bt_size = &ptls->bt_size;
240241
size_t i, n = *bt_size;
241-
if (sig)
242+
if (sig) {
243+
// kill this task, so that we cannot get back to it accidentally (via an untimely ^C or jlbacktrace in jl_exit)
244+
ptls->pgcstack = NULL;
245+
ptls->safe_restore = NULL;
246+
if (ptls->current_task) {
247+
ptls->current_task->eh = NULL;
248+
ptls->current_task->excstack = NULL;
249+
}
250+
#ifndef _OS_WINDOWS_
251+
sigset_t sset;
252+
sigemptyset(&sset);
253+
// n.b. In `abort()`, Apple's libSystem "helpfully" blocks all signals
254+
// on all threads but SIGABRT. But we also don't know what the thread
255+
// was doing, so unblock all critical signals so that they will crash
256+
// hard, and not just get stuck.
257+
sigaddset(&sset, SIGSEGV);
258+
sigaddset(&sset, SIGBUS);
259+
sigaddset(&sset, SIGILL);
260+
// also unblock fatal signals now, so we won't get back here twice
261+
sigaddset(&sset, SIGTERM);
262+
sigaddset(&sset, SIGABRT);
263+
sigaddset(&sset, SIGQUIT);
264+
// and the original signal is now fatal too, in case it wasn't
265+
// something already listed (?)
266+
if (sig != SIGINT)
267+
sigaddset(&sset, sig);
268+
pthread_sigmask(SIG_UNBLOCK, &sset, NULL);
269+
#endif
242270
jl_safe_printf("\nsignal (%d): %s\n", sig, strsignal(sig));
271+
}
243272
jl_safe_printf("in expression starting at %s:%d\n", jl_filename, jl_lineno);
244273
if (context) {
245274
// Must avoid extended backtrace frames here unless we're sure bt_data

src/signals-mach.c

Lines changed: 82 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -84,14 +84,16 @@ extern boolean_t exc_server(mach_msg_header_t *, mach_msg_header_t *);
8484
void *mach_segv_listener(void *arg)
8585
{
8686
(void)arg;
87+
(void)jl_get_ptls_states();
8788
while (1) {
8889
int ret = mach_msg_server(exc_server, 2048, segv_port, MACH_MSG_TIMEOUT_NONE);
8990
jl_safe_printf("mach_msg_server: %s\n", mach_error_string(ret));
9091
jl_exit(128 + SIGSEGV);
9192
}
9293
}
9394

94-
static void allocate_segv_handler()
95+
96+
static void allocate_mach_handler()
9597
{
9698
// ensure KEYMGR_GCC3_DW2_OBJ_LIST is initialized, as this requires malloc
9799
// and thus can deadlock when used without first initializing it.
@@ -122,7 +124,7 @@ static void allocate_segv_handler()
122124
jl_error("pthread_create failed");
123125
}
124126
pthread_attr_destroy(&attr);
125-
for (int16_t tid = 0;tid < jl_n_threads;tid++) {
127+
for (int16_t tid = 0; tid < jl_n_threads; tid++) {
126128
attach_exception_port(pthread_mach_thread_np(jl_all_tls_states[tid]->system_id), 0);
127129
}
128130
}
@@ -164,19 +166,31 @@ typedef arm_exception_state64_t host_exception_state_t;
164166
static void jl_call_in_state(jl_ptls_t ptls2, host_thread_state_t *state,
165167
void (*fptr)(void))
166168
{
167-
uint64_t rsp = (uint64_t)ptls2->signal_stack + sig_stack_size;
169+
#ifdef _CPU_X86_64_
170+
uintptr_t rsp = state->__rsp;
171+
#elif defined(_CPU_AARCH64_)
172+
uintptr_t rsp = state->__sp;
173+
#else
174+
#error "julia: throw-in-context not supported on this platform"
175+
#endif
176+
if (ptls2->signal_stack == NULL || is_addr_on_sigstack(ptls2, (void*)rsp)) {
177+
rsp = (rsp - 256) & ~(uintptr_t)15; // redzone and re-alignment
178+
}
179+
else {
180+
rsp = (uintptr_t)ptls2->signal_stack + sig_stack_size;
181+
}
168182
assert(rsp % 16 == 0);
169183

170-
// push (null) $RIP onto the stack
171-
rsp -= sizeof(void*);
172-
*(void**)rsp = NULL;
173-
174184
#ifdef _CPU_X86_64_
185+
rsp -= sizeof(void*);
175186
state->__rsp = rsp; // set stack pointer
176187
state->__rip = (uint64_t)fptr; // "call" the function
177-
#else
188+
#elif defined(_CPU_AARCH64_)
178189
state->__sp = rsp;
179190
state->__pc = (uint64_t)fptr;
191+
state->__lr = 0;
192+
#else
193+
#error "julia: throw-in-context not supported on this platform"
180194
#endif
181195
}
182196

@@ -194,11 +208,22 @@ static void jl_throw_in_thread(int tid, mach_port_t thread, jl_value_t *exceptio
194208
ptls2->sig_exception = exception;
195209
}
196210
jl_call_in_state(ptls2, &state, &jl_sig_throw);
197-
ret = thread_set_state(thread, THREAD_STATE,
198-
(thread_state_t)&state, count);
211+
ret = thread_set_state(thread, THREAD_STATE, (thread_state_t)&state, count);
199212
HANDLE_MACH_ERROR("thread_set_state", ret);
200213
}
201214

215+
static void segv_handler(int sig, siginfo_t *info, void *context)
216+
{
217+
jl_ptls_t ptls = jl_get_ptls_states();
218+
assert(sig == SIGSEGV || sig == SIGBUS);
219+
if (ptls->safe_restore) { // restarting jl_ or jl_unwind_stepn
220+
jl_call_in_state(ptls, (host_thread_state_t*)jl_to_bt_context(context), &jl_sig_throw);
221+
}
222+
else {
223+
sigdie_handler(sig, info, context);
224+
}
225+
}
226+
202227
//exc_server uses dlsym to find symbol
203228
JL_DLLEXPORT
204229
kern_return_t catch_exception_raise(mach_port_t exception_port,
@@ -208,18 +233,16 @@ kern_return_t catch_exception_raise(mach_port_t exception_port,
208233
exception_data_t code,
209234
mach_msg_type_number_t code_count)
210235
{
211-
unsigned int count = THREAD_STATE_COUNT;
212236
unsigned int exc_count = HOST_EXCEPTION_STATE_COUNT;
213237
host_exception_state_t exc_state;
214-
host_thread_state_t state;
215-
#ifdef LIBOSXUNWIND
238+
#ifdef LLVMLIBUNWIND
216239
if (thread == mach_profiler_thread) {
217240
return profiler_segv_handler(exception_port, thread, task, exception, code, code_count);
218241
}
219242
#endif
220243
int16_t tid;
221244
jl_ptls_t ptls2 = NULL;
222-
for (tid = 0;tid < jl_n_threads;tid++) {
245+
for (tid = 0; tid < jl_n_threads; tid++) {
223246
jl_ptls_t _ptls2 = jl_all_tls_states[tid];
224247
if (pthread_mach_thread_np(_ptls2->system_id) == thread) {
225248
ptls2 = _ptls2;
@@ -288,11 +311,8 @@ kern_return_t catch_exception_raise(mach_port_t exception_port,
288311
return KERN_SUCCESS;
289312
}
290313
else {
291-
kern_return_t ret = thread_get_state(thread, THREAD_STATE, (thread_state_t)&state, &count);
292-
HANDLE_MACH_ERROR("thread_get_state", ret);
293-
jl_critical_error(SIGSEGV, (unw_context_t*)&state,
294-
ptls2->bt_data, &ptls2->bt_size);
295-
return KERN_INVALID_ARGUMENT;
314+
jl_exit_thread0(128 + SIGSEGV, NULL, 0);
315+
return KERN_SUCCESS;
296316
}
297317
}
298318

@@ -307,24 +327,27 @@ static void attach_exception_port(thread_port_t thread, int segv_only)
307327
HANDLE_MACH_ERROR("thread_set_exception_ports", ret);
308328
}
309329

310-
static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx)
330+
static void jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx)
311331
{
312332
jl_ptls_t ptls2 = jl_all_tls_states[tid];
313-
mach_port_t tid_port = pthread_mach_thread_np(ptls2->system_id);
333+
mach_port_t thread = pthread_mach_thread_np(ptls2->system_id);
314334

315-
kern_return_t ret = thread_suspend(tid_port);
335+
kern_return_t ret = thread_suspend(thread);
316336
HANDLE_MACH_ERROR("thread_suspend", ret);
317337

318338
// Do the actual sampling
319339
unsigned int count = THREAD_STATE_COUNT;
320-
static unw_context_t state;
321-
memset(&state, 0, sizeof(unw_context_t));
340+
memset(ctx, 0, sizeof(*ctx));
322341

323342
// Get the state of the suspended thread
324-
ret = thread_get_state(tid_port, THREAD_STATE, (thread_state_t)&state, &count);
343+
ret = thread_get_state(thread, THREAD_STATE, (thread_state_t)ctx, &count);
344+
}
325345

326-
// Initialize the unwind context with the suspend thread's state
327-
*ctx = &state;
346+
static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx)
347+
{
348+
static host_thread_state_t state;
349+
jl_thread_suspend_and_get_state2(tid, &state);
350+
*ctx = (unw_context_t*)&state;
328351
}
329352

330353
static void jl_thread_resume(int tid, int sig)
@@ -366,29 +389,46 @@ static void jl_try_deliver_sigint(void)
366389
HANDLE_MACH_ERROR("thread_resume", ret);
367390
}
368391

369-
static void jl_exit_thread0(int exitstate)
392+
static void JL_NORETURN jl_exit_thread0_cb(int exitstate)
393+
{
394+
CFI_NORETURN
395+
jl_critical_error(exitstate - 128, NULL);
396+
jl_exit(exitstate);
397+
}
398+
399+
static void jl_exit_thread0(int exitstate, jl_bt_element_t *bt_data, size_t bt_size)
370400
{
371401
jl_ptls_t ptls2 = jl_all_tls_states[0];
372402
mach_port_t thread = pthread_mach_thread_np(ptls2->system_id);
373-
kern_return_t ret = thread_suspend(thread);
374-
HANDLE_MACH_ERROR("thread_suspend", ret);
403+
404+
host_thread_state_t state;
405+
jl_thread_suspend_and_get_state2(0, &state);
406+
unw_context_t *uc = (unw_context_t*)&state;
375407

376408
// This aborts `sleep` and other syscalls.
377-
ret = thread_abort(thread);
409+
kern_return_t ret = thread_abort(thread);
378410
HANDLE_MACH_ERROR("thread_abort", ret);
379411

380-
unsigned int count = THREAD_STATE_COUNT;
381-
host_thread_state_t state;
382-
ret = thread_get_state(thread, THREAD_STATE,
383-
(thread_state_t)&state, &count);
412+
if (bt_data == NULL) {
413+
// Must avoid extended backtrace frames here unless we're sure bt_data
414+
// is properly rooted.
415+
ptls2->bt_size = rec_backtrace_ctx(ptls2->bt_data, JL_MAX_BT_SIZE, uc, NULL);
416+
}
417+
else {
418+
ptls2->bt_size = bt_size; // <= JL_MAX_BT_SIZE
419+
memcpy(ptls2->bt_data, bt_data, ptls2->bt_size * sizeof(bt_data[0]));
420+
}
384421

385422
void (*exit_func)(int) = &_exit;
386423
if (thread0_exit_count <= 1) {
387-
exit_func = &jl_exit;
424+
exit_func = &jl_exit_thread0_cb;
388425
}
389426
else if (thread0_exit_count == 2) {
390427
exit_func = &exit;
391428
}
429+
else {
430+
exit_func = &_exit;
431+
}
392432

393433
#ifdef _CPU_X86_64_
394434
// First integer argument. Not portable but good enough =)
@@ -399,8 +439,8 @@ static void jl_exit_thread0(int exitstate)
399439
#error Fill in first integer argument here
400440
#endif
401441
jl_call_in_state(ptls2, &state, (void (*)(void))exit_func);
402-
ret = thread_set_state(thread, THREAD_STATE,
403-
(thread_state_t)&state, count);
442+
unsigned int count = THREAD_STATE_COUNT;
443+
ret = thread_set_state(thread, THREAD_STATE, (thread_state_t)&state, count);
404444
HANDLE_MACH_ERROR("thread_set_state", ret);
405445

406446
ret = thread_resume(thread);
@@ -498,8 +538,10 @@ void *mach_profile_listener(void *arg)
498538
break;
499539
}
500540

501-
unw_context_t *uc;
502-
jl_thread_suspend_and_get_state(i, &uc);
541+
host_thread_state_t state;
542+
jl_thread_suspend_and_get_state2(i, &state);
543+
unw_context_t *uc = (unw_context_t*)&state;
544+
503545
if (running) {
504546
#ifdef LIBOSXUNWIND
505547
/*

0 commit comments

Comments
 (0)