Commit efe8e4d: Add support for worker state callbacks

1 parent d5fd837 commit efe8e4d

4 files changed: +264, -14 lines

docs/src/_changelog.md (1 addition, 0 deletions)

````diff
@@ -18,6 +18,7 @@ This documents notable changes in DistributedNext.jl. The format is based on
   incompatibilities from both libraries being used simultaneously ([#10]).
 - [`other_workers()`](@ref) and [`other_procs()`](@ref) were implemented and
   exported ([#18]).
+- Implemented callback support for workers being added/removed etc ([#17]).
 
 ## [v1.0.0] - 2024-12-02
 
````

docs/src/index.md (13 additions, 0 deletions)

````diff
@@ -52,6 +52,19 @@ DistributedNext.cluster_cookie()
 DistributedNext.cluster_cookie(::Any)
 ```
 
+## Callbacks
+
+```@docs
+DistributedNext.add_worker_starting_callback
+DistributedNext.remove_worker_starting_callback
+DistributedNext.add_worker_started_callback
+DistributedNext.remove_worker_started_callback
+DistributedNext.add_worker_exiting_callback
+DistributedNext.remove_worker_exiting_callback
+DistributedNext.add_worker_exited_callback
+DistributedNext.remove_worker_exited_callback
+```
+
 ## Cluster Manager Interface
 
 This interface provides a mechanism to launch and manage Julia workers on different cluster environments.
````

src/cluster.jl (191 additions, 14 deletions)

````diff
@@ -472,20 +472,28 @@ end
 ```
 """
 function addprocs(manager::ClusterManager; kwargs...)
+    params = merge(default_addprocs_params(manager), Dict{Symbol, Any}(kwargs))
+
     init_multi()
 
     cluster_mgmt_from_master_check()
 
-    lock(worker_lock)
-    try
-        addprocs_locked(manager::ClusterManager; kwargs...)
-    finally
-        unlock(worker_lock)
-    end
+    # Call worker-starting callbacks
+    warning_interval = params[:callback_warning_interval]
+    _run_callbacks_concurrently("worker-starting", worker_starting_callbacks,
+                                warning_interval, [(manager, params)])
+
+    # Add new workers
+    new_workers = @lock worker_lock addprocs_locked(manager::ClusterManager, params)
+
+    # Call worker-started callbacks
+    _run_callbacks_concurrently("worker-started", worker_started_callbacks,
+                                warning_interval, new_workers)
+
+    return new_workers
 end
 
-function addprocs_locked(manager::ClusterManager; kwargs...)
-    params = merge(default_addprocs_params(manager), Dict{Symbol,Any}(kwargs))
+function addprocs_locked(manager::ClusterManager, params)
     topology(Symbol(params[:topology]))
 
     if PGRP.topology !== :all_to_all
````
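A minimal sketch of the new control flow from the caller's side, assuming only what this hunk and the docstrings further down establish: worker-starting callbacks take `(manager, params)`, and `params` is the merged `default_addprocs_params` dictionary (so entries such as `:lazy` are available). The callback body here is illustrative only:

```julia
using DistributedNext

# Worker-starting callbacks receive the ClusterManager and the params Dict
# (the merge of `default_addprocs_params(manager)` and the `addprocs` kwargs).
key = DistributedNext.add_worker_starting_callback() do manager, params
    @info "Launching workers" manager lazy=params[:lazy]
end

pids = addprocs(2)  # worker-starting callbacks run first, worker-started after

DistributedNext.remove_worker_starting_callback(key)
```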
````diff
@@ -572,7 +580,8 @@ default_addprocs_params() = Dict{Symbol,Any}(
     :exeflags => ``,
     :env => [],
     :enable_threaded_blas => false,
-    :lazy => true)
+    :lazy => true,
+    :callback_warning_interval => 10)
 
 
 function setup_launched_worker(manager, wconfig, launched_q)
````
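The new `:callback_warning_interval` default of 10 seconds controls how often `addprocs` warns about still-running callbacks. Because `addprocs` now merges its kwargs over these defaults, the interval can be overridden per call, as the tests below do with `callback_warning_interval=0.05`; a sketch:

```julia
# Warn about slow worker-starting/worker-started callbacks every 30 seconds
# instead of the default 10. Any `default_addprocs_params` key can be
# passed as a keyword argument to `addprocs`.
addprocs(4; callback_warning_interval=30)
```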
````diff
@@ -870,13 +879,151 @@ const HDR_COOKIE_LEN=16
 const map_pid_wrkr = Dict{Int, Union{Worker, LocalProcess}}()
 const map_sock_wrkr = IdDict()
 const map_del_wrkr = Set{Int}()
+const worker_starting_callbacks = Dict{Any, Base.Callable}()
+const worker_started_callbacks = Dict{Any, Base.Callable}()
+const worker_exiting_callbacks = Dict{Any, Base.Callable}()
+const worker_exited_callbacks = Dict{Any, Base.Callable}()
 
 # whether process is a master or worker in a distributed setup
 myrole() = LPROCROLE[]
 function myrole!(proctype::Symbol)
     LPROCROLE[] = proctype
 end
 
+# Callbacks
+
+function _run_callbacks_concurrently(callbacks_name, callbacks_dict, warning_interval, arglist)
+    callback_tasks = Dict{Any, Task}()
+    for args in arglist
+        for (name, callback) in callbacks_dict
+            callback_tasks[name] = Threads.@spawn callback(args...)
+        end
+    end
+
+    running_callbacks = () -> ["'$(key)'" for (key, task) in callback_tasks if !istaskdone(task)]
+    while timedwait(() -> isempty(running_callbacks()), warning_interval) === :timed_out
+        callbacks_str = join(running_callbacks(), ", ")
+        @warn "Waiting for these $(callbacks_name) callbacks to finish: $(callbacks_str)"
+    end
+
+    # Wait on the tasks so that exceptions bubble up
+    wait.(values(callback_tasks))
+end
+
+function _add_callback(f, key, dict; arg_types=Tuple{Int})
+    desired_signature = "f(" * join(["::$(t)" for t in arg_types.types], ", ") * ")"
+
+    if !hasmethod(f, arg_types)
+        throw(ArgumentError("Callback function is invalid, it must be able to be called with these argument types: $(desired_signature)"))
+    elseif haskey(dict, key)
+        throw(ArgumentError("A callback function with key '$(key)' already exists"))
+    end
+
+    if isnothing(key)
+        key = Symbol(gensym(), nameof(f))
+    end
+
+    dict[key] = f
+    return key
+end
+
+_remove_callback(key, dict) = delete!(dict, key)
+
+"""
+    add_worker_starting_callback(f::Base.Callable; key=nothing)
+
+Register a callback to be called on the master process immediately before new
+workers are started. The callback `f` will be called with the `ClusterManager`
+instance that is being used and a dictionary of parameters related to adding
+workers, i.e. `f(manager, params)`. The `params` dictionary is specific to the
+`manager` type. Note that the `LocalManager` and `SSHManager` cluster managers
+in DistributedNext are not fully documented yet, see the
+[managers.jl](https://github.com/JuliaParallel/DistributedNext.jl/blob/master/src/managers.jl)
+file for their definitions.
+
+!!! warning
+    Adding workers can fail so it is not guaranteed that the workers requested
+    will exist.
+
+The worker-starting callbacks will be executed concurrently. If one throws an
+exception it will not be caught and will bubble up through [`addprocs`](@ref).
+
+Keep in mind that the callbacks will add to the time taken to launch workers; so
+try to either keep the callbacks fast to execute, or do the actual work
+asynchronously by spawning a task in the callback (beware of race conditions if
+you do this).
+"""
+add_worker_starting_callback(f::Base.Callable; key=nothing) = _add_callback(f, key, worker_starting_callbacks;
+                                                                            arg_types=Tuple{ClusterManager, Dict})
+
+remove_worker_starting_callback(key) = _remove_callback(key, worker_starting_callbacks)
+
+"""
+    add_worker_started_callback(f::Base.Callable; key=nothing)
+
+Register a callback to be called on the master process whenever a worker is
+added. The callback will be called with the added worker ID,
+e.g. `f(w::Int)`. Chooses and returns a unique key for the callback if `key` is
+not specified.
+
+The worker-started callbacks will be executed concurrently. If one throws an
+exception it will not be caught and will bubble up through [`addprocs()`](@ref).
+
+Keep in mind that the callbacks will add to the time taken to launch workers; so
+try to either keep the callbacks fast to execute, or do the actual
+initialization asynchronously by spawning a task in the callback (beware of race
+conditions if you do this).
+"""
+add_worker_started_callback(f::Base.Callable; key=nothing) = _add_callback(f, key, worker_started_callbacks)
+
+"""
+    remove_worker_started_callback(key)
+
+Remove the callback for `key` that was added with [`add_worker_started_callback()`](@ref).
+"""
+remove_worker_started_callback(key) = _remove_callback(key, worker_started_callbacks)
+
+"""
+    add_worker_exiting_callback(f::Base.Callable; key=nothing)
+
+Register a callback to be called on the master process immediately before a
+worker is removed with [`rmprocs()`](@ref). The callback will be called with the
+worker ID, e.g. `f(w::Int)`. Chooses and returns a unique key for the callback
+if `key` is not specified.
+
+All worker-exiting callbacks will be executed concurrently and if they don't
+all finish before the `callback_timeout` passed to `rmprocs()` then the process
+will be removed anyway.
+"""
+add_worker_exiting_callback(f::Base.Callable; key=nothing) = _add_callback(f, key, worker_exiting_callbacks)
+
+"""
+    remove_worker_exiting_callback(key)
+
+Remove the callback for `key` that was added with [`add_worker_exiting_callback()`](@ref).
+"""
+remove_worker_exiting_callback(key) = _remove_callback(key, worker_exiting_callbacks)
+
+"""
+    add_worker_exited_callback(f::Base.Callable; key=nothing)
+
+Register a callback to be called on the master process when a worker has exited
+for any reason (i.e. not only because of [`rmprocs()`](@ref) but also the worker
+segfaulting etc). The callback will be called with the worker ID,
+e.g. `f(w::Int)`. Chooses and returns a unique key for the callback if `key` is
+not specified.
+
+If the callback throws an exception it will be caught and printed.
+"""
+add_worker_exited_callback(f::Base.Callable; key=nothing) = _add_callback(f, key, worker_exited_callbacks)
+
+"""
+    remove_worker_exited_callback(key)
+
+Remove the callback for `key` that was added with [`add_worker_exited_callback()`](@ref).
+"""
+remove_worker_exited_callback(key) = _remove_callback(key, worker_exited_callbacks)
+
 # cluster management related API
 """
     myid()
````
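The `_add_callback`/`_remove_callback` pair gives all four registries the same key discipline: an explicit `key` must be unused, and omitting it returns a gensym-based `Symbol` that the caller must keep in order to deregister. A sketch of both styles (the callback bodies and the `:connect_logger` key are illustrative only):

```julia
using DistributedNext

# Explicit key: readable and stable, but registering the same key twice
# throws an ArgumentError, so remove it before re-registering.
DistributedNext.add_worker_started_callback(pid -> @info("Worker $pid started");
                                            key=:connect_logger)
DistributedNext.remove_worker_started_callback(:connect_logger)

# No key: `_add_callback` generates one, so keep the return value.
auto_key = DistributedNext.add_worker_exited_callback(pid -> @warn("Worker $pid exited"))
DistributedNext.remove_worker_exited_callback(auto_key)
```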
````diff
@@ -1063,7 +1210,7 @@ function cluster_mgmt_from_master_check()
 end
 
 """
-    rmprocs(pids...; waitfor=typemax(Int))
+    rmprocs(pids...; waitfor=typemax(Int), callback_timeout=10)
 
 Remove the specified workers. Note that only process 1 can add or remove
 workers.
@@ -1077,6 +1224,10 @@ Argument `waitfor` specifies how long to wait for the workers to shut down:
   returned. The user should call [`wait`](@ref) on the task before invoking any other
   parallel calls.
 
+The `callback_timeout` specifies how long to wait for any callbacks to execute
+before continuing to remove the workers (see
+[`add_worker_exiting_callback()`](@ref)).
+
 # Examples
 ```julia-repl
 \$ julia -p 5
@@ -1093,24 +1244,38 @@ julia> workers()
 6
 ```
 """
-function rmprocs(pids...; waitfor=typemax(Int))
+function rmprocs(pids...; waitfor=typemax(Int), callback_timeout=10)
     cluster_mgmt_from_master_check()
 
     pids = vcat(pids...)
     if waitfor == 0
-        t = @async _rmprocs(pids, typemax(Int))
+        t = @async _rmprocs(pids, typemax(Int), callback_timeout)
         yield()
         return t
     else
-        _rmprocs(pids, waitfor)
+        _rmprocs(pids, waitfor, callback_timeout)
         # return a dummy task object that user code can wait on.
         return @async nothing
     end
 end
 
-function _rmprocs(pids, waitfor)
+function _rmprocs(pids, waitfor, callback_timeout)
     lock(worker_lock)
     try
+        # Run the callbacks
+        callback_tasks = Dict{Any, Task}()
+        for pid in pids
+            for (name, callback) in worker_exiting_callbacks
+                callback_tasks[name] = Threads.@spawn callback(pid)
+            end
+        end
+
+        if timedwait(() -> all(istaskdone.(values(callback_tasks))), callback_timeout) === :timed_out
+            timedout_callbacks = ["'$(key)'" for (key, task) in callback_tasks if !istaskdone(task)]
+            callbacks_str = join(timedout_callbacks, ", ")
+            @warn "Some worker-exiting callbacks have not yet finished, continuing to remove workers anyway. These are the callbacks still running: $(callbacks_str)"
+        end
+
        rmprocset = Union{LocalProcess, Worker}[]
         for p in pids
             if p == 1
@@ -1256,6 +1421,18 @@ function deregister_worker(pg, pid)
             delete!(pg.refs, id)
         end
     end
+
+    # Call callbacks on the master
+    if myid() == 1
+        for (name, callback) in worker_exited_callbacks
+            try
+                callback(pid)
+            catch ex
+                @error "Error when running worker-exited callback '$(name)'" exception=(ex, catch_backtrace())
+            end
+        end
+    end
+
     return
 end
 
````
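One asymmetry between the two removal-side hooks is worth noting: worker-exiting callbacks can delay removal by at most `callback_timeout` seconds, while worker-exited callbacks run after the fact with exceptions caught and logged. A sketch of draining per-worker state before removal (`flush_worker_buffers` is a hypothetical application-level helper, not part of this commit):

```julia
# Give cleanup up to 30 seconds before rmprocs proceeds anyway.
k = DistributedNext.add_worker_exiting_callback() do pid
    flush_worker_buffers(pid)  # hypothetical application-level cleanup
end

rmprocs(workers(); callback_timeout=30)
DistributedNext.remove_worker_exiting_callback(k)
```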

test/distributed_exec.jl (59 additions, 0 deletions)

````diff
@@ -1,6 +1,7 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
 using DistributedNext, Random, Serialization, Sockets
+import DistributedNext
 import DistributedNext: launch, manage
 
 
@@ -1934,6 +1935,64 @@ include("splitrange.jl")
     end
 end
 
+@testset "Worker state callbacks" begin
+    rmprocs(other_workers())
+
+    # Adding a callback with an invalid signature should fail
+    @test_throws ArgumentError DistributedNext.add_worker_started_callback(() -> nothing)
+
+    # Smoke test to ensure that all the callbacks are executed
+    starting_managers = []
+    started_workers = Int[]
+    exiting_workers = Int[]
+    exited_workers = Int[]
+    starting_key = DistributedNext.add_worker_starting_callback((manager, kwargs) -> push!(starting_managers, manager))
+    started_key = DistributedNext.add_worker_started_callback(pid -> (push!(started_workers, pid); error("foo")))
+    exiting_key = DistributedNext.add_worker_exiting_callback(pid -> push!(exiting_workers, pid))
+    exited_key = DistributedNext.add_worker_exited_callback(pid -> push!(exited_workers, pid))
+
+    # Test that the worker-started exception bubbles up
+    @test_throws TaskFailedException addprocs(1)
+
+    pid = only(workers())
+    @test only(starting_managers) isa DistributedNext.LocalManager
+    @test started_workers == [pid]
+    rmprocs(workers())
+    @test exiting_workers == [pid]
+    @test exited_workers == [pid]
+
+    # Trying to reset an existing callback should fail
+    @test_throws ArgumentError DistributedNext.add_worker_started_callback(Returns(nothing); key=started_key)
+
+    # Remove the callbacks
+    DistributedNext.remove_worker_starting_callback(starting_key)
+    DistributedNext.remove_worker_started_callback(started_key)
+    DistributedNext.remove_worker_exiting_callback(exiting_key)
+    DistributedNext.remove_worker_exited_callback(exited_key)
+
+    # Test that the worker-exiting `callback_timeout` option works and that we
+    # get warnings about slow worker-started callbacks.
+    event = Base.Event()
+    callback_task = nothing
+    started_key = DistributedNext.add_worker_started_callback(_ -> sleep(0.5))
+    exiting_key = DistributedNext.add_worker_exiting_callback(_ -> (callback_task = current_task(); wait(event)))
+
+    @test_logs (:warn, r"Waiting for these worker-started callbacks.+") match_mode=:any addprocs(1; callback_warning_interval=0.05)
+    DistributedNext.remove_worker_started_callback(started_key)
+
+    @test_logs (:warn, r"Some worker-exiting callbacks have not yet finished.+") rmprocs(workers(); callback_timeout=0.5)
+    DistributedNext.remove_worker_exiting_callback(exiting_key)
+
+    notify(event)
+    wait(callback_task)
+
+    # Test that the initial callbacks were indeed removed
+    @test length(starting_managers) == 1
+    @test length(started_workers) == 1
+    @test length(exiting_workers) == 1
+    @test length(exited_workers) == 1
+end
+
 # Run topology tests last after removing all workers, since a given
 # cluster at any time only supports a single topology.
 if nprocs() > 1
````
