Skip to content

Commit 97fc237

Browse files
sig profile hanging workers before SIGKILL
1 parent b6ab10d commit 97fc237

File tree

2 files changed

+13
-4
lines changed

2 files changed

+13
-4
lines changed

src/managers.jl

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ addprocs([
113113
114114
* `exeflags`: additional flags passed to the worker processes. It can either be a `Cmd`, a `String`
115115
holding one flag, or a collection of strings, with one element per flag.
116-
E.g. `\`--threads=auto project=.\``, `"--compile-trace=stderr"` or `["--threads=auto", "--compile=all"]`.
116+
E.g. `\`--threads=auto --project=.\``, `"--compile-trace=stderr"` or `["--threads=auto", "--compile=all"]`.
117117
118118
* `topology`: Specifies how the workers connect to each other. Sending a message between
119119
unconnected workers results in an error.
@@ -740,16 +740,25 @@ function kill(manager::SSHManager, pid::Int, config::WorkerConfig)
740740
nothing
741741
end
742742

743-
function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeout = 15, term_timeout = 15)
743+
function kill(manager::LocalManager, pid::Int, config::WorkerConfig; profile_wait = 6, exit_timeout = 15, term_timeout = 15)
744+
# profile_wait = 6 allows 1s for the profile to run, plus 5s for the report to show
744745
# First, try sending `exit()` to the remote over the usual control channels
745746
remote_do(exit, pid)
746747

748+
747749
timer_task = @async begin
748750
sleep(exit_timeout)
749751

750752
# Check to see if our child exited, and if not, send an actual kill signal
751753
if !process_exited(config.process)
752-
@warn("Failed to gracefully kill worker $(pid), sending SIGQUIT")
754+
@warn "Failed to gracefully kill worker $(pid)"
755+
profile_sig = Sys.iswindows() ? nothing : Sys.isbsd() ? ("SIGINFO", 29) : ("SIGUSR1" , 10)
756+
if profile_sig !== nothing
757+
@info("Sending profile $(profile_sig[1]) to worker $(pid)")
758+
kill(config.process, profile_sig[2])
759+
sleep(profile_wait)
760+
end
761+
@warn("Sending SIGQUIT to worker $(pid)")
753762
kill(config.process, Base.SIGQUIT)
754763

755764
sleep(term_timeout)

test/distributed_exec.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1937,7 +1937,7 @@ begin
19371937

19381938
# Next, ensure we get a log message when a worker does not cleanly exit
19391939
w = only(addprocs(1))
1940-
@test_logs (:warn, r"sending SIGQUIT") begin
1940+
@test_logs (:warn, r"Sending SIGQUIT") match_mode=:any begin
19411941
remote_do(w) do
19421942
# Cause the 'exit()' message that `rmprocs()` sends to do nothing
19431943
Core.eval(Base, :(exit() = nothing))

0 commit comments

Comments
 (0)