From 15f6afbb8d3696cf0dc99b618a2c07ef1fc14799 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Sat, 14 Oct 2023 10:18:56 -0700 Subject: [PATCH 1/9] Add gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index de6b1c6..6e6862d 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ docs/src/changelog.md +Manifest.toml +*.swp From 853974ae5812ab02eec96b3eb2b2928ec77379fb Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 13 Sep 2021 13:53:41 -0400 Subject: [PATCH 2/9] Make worker state variable threadsafe --- .github/workflows/ci.yml | 2 +- src/cluster.jl | 72 ++++++++++++++++++++++++++++------------ src/managers.jl | 2 +- src/messages.jl | 2 +- src/process_messages.jl | 2 +- test/distributed_exec.jl | 5 ++- test/threads.jl | 64 +++++++++++++++++++++++++++++++++++ 7 files changed, 122 insertions(+), 27 deletions(-) create mode 100644 test/threads.jl diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d4104fd..1dee688 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,7 +54,7 @@ jobs: - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 env: - JULIA_DISTRIBUTED_TESTING_STANDALONE: 1 + JULIA_NUM_THREADS: 4 - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v4 with: diff --git a/src/cluster.jl b/src/cluster.jl index 2444695..653be62 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -99,10 +99,10 @@ mutable struct Worker del_msgs::Array{Any,1} # XXX: Could del_msgs and add_msgs be Channels? add_msgs::Array{Any,1} @atomic gcflag::Bool - state::WorkerState - c_state::Condition # wait for state changes - ct_time::Float64 # creation time - conn_func::Any # used to setup connections lazily + @atomic state::WorkerState + c_state::Threads.Condition # wait for state changes, lock for state + ct_time::Float64 # creation time + conn_func::Any # used to setup connections lazily r_stream::IO w_stream::IO @@ -134,7 +134,7 @@ mutable struct Worker if haskey(map_pid_wrkr, id) return map_pid_wrkr[id] end - w=new(id, Threads.ReentrantLock(), [], [], false, W_CREATED, Condition(), time(), conn_func) + w=new(id, Threads.ReentrantLock(), [], [], false, W_CREATED, Threads.Condition(), time(), conn_func) w.initialized = Event() register_worker(w) w @@ -144,12 +144,14 @@ mutable struct Worker end function set_worker_state(w, state) - w.state = state - notify(w.c_state; all=true) + lock(w.c_state) do + @atomic w.state = state + notify(w.c_state; all=true) + end end function check_worker_state(w::Worker) - if w.state === W_CREATED + if (@atomic w.state) === W_CREATED if !isclusterlazy() if PGRP.topology === :all_to_all # Since higher pids connect with lower pids, the remote worker @@ -170,6 +172,7 @@ function check_worker_state(w::Worker) wait_for_conn(w) end end + return nothing end exec_conn_func(id::Int) = exec_conn_func(worker_from_id(id)::Worker) @@ -187,13 +190,21 @@ function exec_conn_func(w::Worker) end function wait_for_conn(w) - if w.state === W_CREATED + if (@atomic w.state) === W_CREATED timeout = worker_timeout() - (time() - w.ct_time) timeout <= 0 && error("peer $(w.id) has not connected to $(myid())") - @async (sleep(timeout); notify(w.c_state; all=true)) - wait(w.c_state) - w.state === W_CREATED && error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds") + T = Threads.@spawn begin + sleep($timeout) + lock(w.c_state) do + notify(w.c_state; all=true) + end + end + errormonitor(T) + lock(w.c_state) do + wait(w.c_state) + (@atomic 
w.state) === W_CREATED && error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds") + end end nothing end @@ -491,7 +502,10 @@ function addprocs_locked(manager::ClusterManager; kwargs...) while true if isempty(launched) istaskdone(t_launch) && break - @async (sleep(1); notify(launch_ntfy)) + @async begin + sleep(1) + notify(launch_ntfy) + end wait(launch_ntfy) end @@ -645,7 +659,12 @@ function create_worker(manager, wconfig) # require the value of config.connect_at which is set only upon connection completion for jw in PGRP.workers if (jw.id != 1) && (jw.id < w.id) - (jw.state === W_CREATED) && wait(jw.c_state) + lock(jw.c_state) do + # wait for wl to join + if (@atomic jw.state) === W_CREATED + wait(jw.c_state) + end + end push!(join_list, jw) end end @@ -668,7 +687,12 @@ function create_worker(manager, wconfig) end for wl in wlist - (wl.state === W_CREATED) && wait(wl.c_state) + lock(wl.c_state) do + if (@atomic wl.state) === W_CREATED + # wait for wl to join + wait(wl.c_state) + end + end push!(join_list, wl) end end @@ -685,7 +709,11 @@ function create_worker(manager, wconfig) @async manage(w.manager, w.id, w.config, :register) # wait for rr_ntfy_join with timeout timedout = false - @async (sleep($timeout); timedout = true; put!(rr_ntfy_join, 1)) + @async begin + sleep($timeout) + timedout = true + put!(rr_ntfy_join, 1) + end wait(rr_ntfy_join) if timedout error("worker did not connect within $timeout seconds") @@ -870,7 +898,7 @@ function nprocs() n = length(PGRP.workers) # filter out workers in the process of being setup/shutdown. for jw in PGRP.workers - if !isa(jw, LocalProcess) && (jw.state !== W_CONNECTED) + if !isa(jw, LocalProcess) && ((@atomic jw.state) !== W_CONNECTED) n = n - 1 end end @@ -921,7 +949,7 @@ julia> procs() function procs() if myid() == 1 || (PGRP.topology === :all_to_all && !isclusterlazy()) # filter out workers in the process of being setup/shutdown. - return Int[x.id for x in PGRP.workers if isa(x, LocalProcess) || (x.state === W_CONNECTED)] + return Int[x.id for x in PGRP.workers if isa(x, LocalProcess) || ((@atomic x.state) === W_CONNECTED)] else return Int[x.id for x in PGRP.workers] end @@ -930,7 +958,7 @@ end function id_in_procs(id) # faster version of `id in procs()` if myid() == 1 || (PGRP.topology === :all_to_all && !isclusterlazy()) for x in PGRP.workers - if (x.id::Int) == id && (isa(x, LocalProcess) || (x::Worker).state === W_CONNECTED) + if (x.id::Int) == id && (isa(x, LocalProcess) || (@atomic (x::Worker).state) === W_CONNECTED) return true end end @@ -952,7 +980,7 @@ Specifically all workers bound to the same ip-address as `pid` are returned. 
""" function procs(pid::Integer) if myid() == 1 - all_workers = [x for x in PGRP.workers if isa(x, LocalProcess) || (x.state === W_CONNECTED)] + all_workers = [x for x in PGRP.workers if isa(x, LocalProcess) || ((@atomic x.state) === W_CONNECTED)] if (pid == 1) || (isa(map_pid_wrkr[pid].manager, LocalManager)) Int[x.id for x in filter(w -> (w.id==1) || (isa(w.manager, LocalManager)), all_workers)] else @@ -1059,11 +1087,11 @@ function _rmprocs(pids, waitfor) start = time_ns() while (time_ns() - start) < waitfor*1e9 - all(w -> w.state === W_TERMINATED, rmprocset) && break + all(w -> (@atomic w.state) === W_TERMINATED, rmprocset) && break sleep(min(0.1, waitfor - (time_ns() - start)/1e9)) end - unremoved = [wrkr.id for wrkr in filter(w -> w.state !== W_TERMINATED, rmprocset)] + unremoved = [wrkr.id for wrkr in filter(w -> (@atomic w.state) !== W_TERMINATED, rmprocset)] if length(unremoved) > 0 estr = string("rmprocs: pids ", unremoved, " not terminated after ", waitfor, " seconds.") throw(ErrorException(estr)) diff --git a/src/managers.jl b/src/managers.jl index 129b65c..56d1f78 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -183,7 +183,7 @@ function launch(manager::SSHManager, params::Dict, launched::Array, launch_ntfy: # Wait for all launches to complete. @sync for (i, (machine, cnt)) in enumerate(manager.machines) let machine=machine, cnt=cnt - @async try + @async try launch_on_machine(manager, $machine, $cnt, params, launched, launch_ntfy) catch e print(stderr, "exception launching on machine $(machine) : $(e)\n") diff --git a/src/messages.jl b/src/messages.jl index fe3e5ab..6e895f0 100644 --- a/src/messages.jl +++ b/src/messages.jl @@ -194,7 +194,7 @@ end function flush_gc_msgs() try for w in (PGRP::ProcessGroup).workers - if isa(w,Worker) && (w.state == W_CONNECTED) && w.gcflag + if isa(w,Worker) && ((@atomic w.state) == W_CONNECTED) && w.gcflag flush_gc_msgs(w) end end diff --git a/src/process_messages.jl b/src/process_messages.jl index 3032917..a444651 100644 --- a/src/process_messages.jl +++ b/src/process_messages.jl @@ -222,7 +222,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) println(stderr, "Process($(myid())) - Unknown remote, closing connection.") elseif !(wpid in map_del_wrkr) werr = worker_from_id(wpid) - oldstate = werr.state + oldstate = @atomic werr.state set_worker_state(werr, W_TERMINATED) # If unhandleable error occurred talking to pid 1, exit diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index a5b833b..fb0caba 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -1991,5 +1991,8 @@ end # Run topology tests last after removing all workers, since a given # cluster at any time only supports a single topology. -nprocs() > 1 && rmprocs(workers()) +if nprocs() > 1 + rmprocs(workers()) +end +include("threads.jl") include("topology.jl") diff --git a/test/threads.jl b/test/threads.jl new file mode 100644 index 0000000..9d1d6d4 --- /dev/null +++ b/test/threads.jl @@ -0,0 +1,64 @@ +using Test +using DistributedNext, Base.Threads +using Base.Iterators: product + +exeflags = ("--startup-file=no", + "--check-bounds=yes", + "--depwarn=error", + "--threads=2") + +function call_on(f, wid, tid) + remotecall(wid) do + t = Task(f) + ccall(:jl_set_task_tid, Cvoid, (Any, Cint), t, tid - 1) + schedule(t) + @assert threadid(t) == tid + t + end +end + +# Run function on process holding the data to only serialize the result of f. +# This becomes useful for things that cannot be serialized (e.g. 
running tasks) +# or that would be unnecessarily big if serialized. +fetch_from_owner(f, rr) = remotecall_fetch(f ∘ fetch, rr.where, rr) + +isdone(rr) = fetch_from_owner(istaskdone, rr) +isfailed(rr) = fetch_from_owner(istaskfailed, rr) + +@testset "RemoteChannel allows put!/take! from thread other than 1" begin + ws = ts = product(1:2, 1:2) + @testset "from worker $w1 to $w2 via 1" for (w1, w2) in ws + @testset "from thread $w1.$t1 to $w2.$t2" for (t1, t2) in ts + # We want (the default) laziness, so that we wait for `Worker.c_state`! + procs_added = addprocs(2; exeflags, lazy=true) + @everywhere procs_added using Base.Threads + + p1 = procs_added[w1] + p2 = procs_added[w2] + chan_id = first(procs_added) + chan = RemoteChannel(chan_id) + send = call_on(p1, t1) do + put!(chan, nothing) + end + recv = call_on(p2, t2) do + take!(chan) + end + + # Wait on the spawned tasks on the owner. Note that we use + # timedwait() instead of @sync to avoid deadlocks. + t1 = Threads.@spawn fetch_from_owner(wait, recv) + t2 = Threads.@spawn fetch_from_owner(wait, send) + @test timedwait(() -> istaskdone(t1), 5) == :ok + @test timedwait(() -> istaskdone(t2), 5) == :ok + + # Check the tasks + @test isdone(send) + @test isdone(recv) + + @test !isfailed(send) + @test !isfailed(recv) + + rmprocs(procs_added) + end + end +end From a15baf428831ccb473bbe14475e0247aa7784d2d Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Sat, 14 Oct 2023 10:18:05 -0700 Subject: [PATCH 3/9] init_multi: Be more thread-safe --- src/cluster.jl | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index 653be62..c58a79b 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -1318,18 +1318,16 @@ end using Random: randstring -let inited = false - # do initialization that's only needed when there is more than 1 processor - global function init_multi() - if !inited - inited = true - push!(Base.package_callbacks, _require_callback) - atexit(terminate_all_workers) - init_bind_addr() - cluster_cookie(randstring(HDR_COOKIE_LEN)) - end - return nothing +# do initialization that's only needed when there is more than 1 processor +const inited = Threads.Atomic{Bool}(false) +function init_multi() + if !Threads.atomic_cas!(inited, false, true) + push!(Base.package_callbacks, _require_callback) + atexit(terminate_all_workers) + init_bind_addr() + cluster_cookie(randstring(HDR_COOKIE_LEN)) end + return nothing end function init_parallel() From 1e4b52f7b6397b61afcb03c7bd6552dd2e0908a8 Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Sun, 2 Jun 2024 19:52:32 +0200 Subject: [PATCH 4/9] Use errormonitor() in a few places --- src/cluster.jl | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index c58a79b..bb08d9b 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -706,14 +706,16 @@ function create_worker(manager, wconfig) join_message = JoinPGRPMsg(w.id, all_locs, PGRP.topology, enable_threaded_blas, isclusterlazy()) send_msg_now(w, MsgHeader(RRID(0,0), ntfy_oid), join_message) - @async manage(w.manager, w.id, w.config, :register) + errormonitor(@async manage(w.manager, w.id, w.config, :register)) # wait for rr_ntfy_join with timeout timedout = false - @async begin - sleep($timeout) - timedout = true - put!(rr_ntfy_join, 1) - end + errormonitor( + @async begin + sleep($timeout) + timedout = true + put!(rr_ntfy_join, 1) + end + ) wait(rr_ntfy_join) if timedout error("worker did not connect within $timeout 
seconds") @@ -763,17 +765,20 @@ function check_master_connect() if ccall(:jl_running_on_valgrind,Cint,()) != 0 return end - @async begin - start = time_ns() - while !haskey(map_pid_wrkr, 1) && (time_ns() - start) < timeout - sleep(1.0) - end - if !haskey(map_pid_wrkr, 1) - print(stderr, "Master process (id 1) could not connect within $(timeout/1e9) seconds.\nexiting.\n") - exit(1) + errormonitor( + @async begin + start = time_ns() + while !haskey(map_pid_wrkr, 1) && (time_ns() - start) < timeout + sleep(1.0) + end + + if !haskey(map_pid_wrkr, 1) + print(stderr, "Master process (id 1) could not connect within $(timeout/1e9) seconds.\nexiting.\n") + exit(1) + end end - end + ) end From 3c9080d3af93e33655bbb8a012fab9a650c194bc Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Tue, 29 Oct 2024 21:34:52 +0100 Subject: [PATCH 5/9] Update changelog --- docs/src/_changelog.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/src/_changelog.md b/docs/src/_changelog.md index fe80fb4..d9d6c95 100644 --- a/docs/src/_changelog.md +++ b/docs/src/_changelog.md @@ -12,6 +12,8 @@ This documents notable changes in DistributedNext.jl. The format is based on ### Fixed - Fixed behaviour of `isempty(::RemoteChannel)`, which previously had the side-effect of taking an element from the channel ([#3]). +- Improved thread-safety, such that it should be safe to start workers with + multiple threads and send messages between them ([#4]). ### Changed - Added a `project` argument to [`addprocs(::AbstractVector)`](@ref) to specify From f6892b5640c3601cfc7f1afa7dea38071b538216 Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Thu, 31 Oct 2024 23:26:22 +0100 Subject: [PATCH 6/9] Always run multi-threaded tests --- test/distributed_exec.jl | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index fb0caba..1b438a0 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -147,27 +147,6 @@ function poll_while(f::Function; timeout_seconds::Integer = 120) return true end -function _getenv_include_thread_unsafe() - environment_variable_name = "JULIA_TEST_INCLUDE_THREAD_UNSAFE" - default_value = "false" - environment_variable_value = strip(get(ENV, environment_variable_name, default_value)) - b = parse(Bool, environment_variable_value)::Bool - return b -end -const _env_include_thread_unsafe = _getenv_include_thread_unsafe() -function include_thread_unsafe_tests() - if Threads.maxthreadid() > 1 - if _env_include_thread_unsafe - return true - end - msg = "Skipping a thread-unsafe test because `Threads.maxthreadid() > 1`" - @warn msg Threads.maxthreadid() - Test.@test_broken false - return false - end - return true -end - # DistributedNext GC tests for Futures function test_futures_dgc(id) f = remotecall(myid, id) @@ -290,14 +269,10 @@ let wid1 = workers()[1], fstore = RemoteChannel(wid2) put!(fstore, rr) - if include_thread_unsafe_tests() - @test remotecall_fetch(k -> haskey(DistributedNext.PGRP.refs, k), wid1, rrid) == true - end + @test remotecall_fetch(k -> haskey(DistributedNext.PGRP.refs, k), wid1, rrid) == true finalize(rr) # finalize locally yield() # flush gc msgs - if include_thread_unsafe_tests() - @test remotecall_fetch(k -> haskey(DistributedNext.PGRP.refs, k), wid1, rrid) == true - end + @test remotecall_fetch(k -> haskey(DistributedNext.PGRP.refs, k), wid1, rrid) == true remotecall_fetch(r -> (finalize(take!(r)); yield(); nothing), wid2, fstore) # finalize remotely sleep(0.5) # to ensure that wid2 
messages have been executed on wid1 @test poll_while(() -> remotecall_fetch(k -> haskey(DistributedNext.PGRP.refs, k), wid1, rrid)) From da533c1bca1c54dcf47d5a42facb426348ff4372 Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Fri, 1 Nov 2024 14:47:28 +0100 Subject: [PATCH 7/9] Move SSH tests into a single-threaded process Necessary because LibSSH is not thread-safe. --- test/distributed_exec.jl | 94 ------------------------------------ test/runtests.jl | 10 ++-- test/sshmanager.jl | 101 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 97 deletions(-) create mode 100644 test/sshmanager.jl diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index 1b438a0..6b5ae00 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -3,8 +3,6 @@ using Test, DistributedNext, Random, Serialization, Sockets import DistributedNext: launch, manage -import LibSSH as ssh -import LibSSH.Demo: DemoServer @test cluster_cookie() isa String @@ -762,98 +760,6 @@ if DoFullTest @test all([p == remotecall_fetch(myid, p) for p in all_w]) end -# LibSSH.jl currently only works on 64bit unixes -if Sys.isunix() && Sys.WORD_SIZE == 64 - function test_n_remove_pids(new_pids) - for p in new_pids - w_in_remote = sort(remotecall_fetch(workers, p)) - try - @test intersect(new_pids, w_in_remote) == new_pids - catch - print("p : $p\n") - print("newpids : $new_pids\n") - print("w_in_remote : $w_in_remote\n") - print("intersect : $(intersect(new_pids, w_in_remote))\n\n\n") - rethrow() - end - end - - remotecall_fetch(rmprocs, 1, new_pids) - end - - println("\n\nTesting SSHManager. A minimum of 4GB of RAM is recommended.") - println("Please ensure port 9300 and 2222 are not in use.") - - DemoServer(2222; auth_methods=[ssh.AuthMethod_None], allow_auth_none=true, verbose=false, timeout=3600) do - sshflags = `-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR -p 2222 ` - #Issue #9951 - hosts=[] - localhost_aliases = ["localhost", string(getipaddr()), "127.0.0.1"] - num_workers = parse(Int,(get(ENV, "JULIA_ADDPROCS_NUM", "9"))) - - for i in 1:(num_workers/length(localhost_aliases)) - append!(hosts, localhost_aliases) - end - - # CI machines sometimes don't already have a .ssh directory - ssh_dir = joinpath(homedir(), ".ssh") - if !isdir(ssh_dir) - mkdir(ssh_dir) - end - - print("\nTesting SSH addprocs with $(length(hosts)) workers...\n") - new_pids = addprocs_with_testenv(hosts; sshflags=sshflags) - @test length(new_pids) == length(hosts) - test_n_remove_pids(new_pids) - - print("\nMixed ssh addprocs with :auto\n") - new_pids = addprocs_with_testenv(["localhost", ("127.0.0.1", :auto), "localhost"]; sshflags=sshflags) - @test length(new_pids) == (2 + Sys.CPU_THREADS) - test_n_remove_pids(new_pids) - - print("\nMixed ssh addprocs with numeric counts\n") - new_pids = addprocs_with_testenv([("localhost", 2), ("127.0.0.1", 2), "localhost"]; sshflags=sshflags) - @test length(new_pids) == 5 - test_n_remove_pids(new_pids) - - print("\nssh addprocs with tunnel\n") - new_pids = addprocs_with_testenv([("localhost", num_workers)]; tunnel=true, sshflags=sshflags) - @test length(new_pids) == num_workers - test_n_remove_pids(new_pids) - - print("\nssh addprocs with tunnel (SSH multiplexing)\n") - new_pids = addprocs_with_testenv([("localhost", num_workers)]; tunnel=true, multiplex=true, sshflags=sshflags) - @test length(new_pids) == num_workers - controlpath = joinpath(ssh_dir, "julia-$(ENV["USER"])@localhost:2222") - @test issocket(controlpath) - 
test_n_remove_pids(new_pids) - @test :ok == timedwait(()->!issocket(controlpath), 10.0; pollint=0.5) - - print("\nAll supported formats for hostname\n") - h1 = "localhost" - user = ENV["USER"] - h2 = "$user@$h1" - h3 = "$h2:2222" - h4 = "$h3 $(string(getipaddr()))" - h5 = "$h4:9300" - - new_pids = addprocs_with_testenv([h1, h2, h3, h4, h5]; sshflags=sshflags) - @test length(new_pids) == 5 - test_n_remove_pids(new_pids) - - print("\nkeyword arg exename\n") - for exename in [`$(joinpath(Sys.BINDIR, Base.julia_exename()))`, "$(joinpath(Sys.BINDIR, Base.julia_exename()))"] - for addp_func in [()->addprocs_with_testenv(["localhost"]; exename=exename, exeflags=test_exeflags, sshflags=sshflags), - ()->addprocs_with_testenv(1; exename=exename, exeflags=test_exeflags)] - - local new_pids = addp_func() - @test length(new_pids) == 1 - test_n_remove_pids(new_pids) - end - end - end -end # unix-only - let t = @task 42 schedule(t, ErrorException(""), error=true) @test_throws TaskFailedException(t) Base.wait(t) diff --git a/test/runtests.jl b/test/runtests.jl index d34d07c..d4d1d86 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,11 +3,15 @@ # Run the distributed test outside of the main driver since it needs its own # set of dedicated workers. include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl")) -disttestfile = joinpath(@__DIR__, "distributed_exec.jl") -cmd = `$test_exename $test_exeflags $disttestfile` +cmd = `$test_exename $test_exeflags` + +# Run the SSH tests with a single thread because LibSSH.jl is not thread-safe +sshtestfile = joinpath(@__DIR__, "sshmanager.jl") +run(addenv(`$cmd $sshtestfile`, "JULIA_NUM_THREADS" => "1")) -if !success(pipeline(cmd; stdout=stdout, stderr=stderr)) && ccall(:jl_running_on_valgrind,Cint,()) == 0 +disttestfile = joinpath(@__DIR__, "distributed_exec.jl") +if !success(pipeline(`$cmd $disttestfile`; stdout=stdout, stderr=stderr)) && ccall(:jl_running_on_valgrind,Cint,()) == 0 error("Distributed test failed, cmd : $cmd") end diff --git a/test/sshmanager.jl b/test/sshmanager.jl new file mode 100644 index 0000000..9bed971 --- /dev/null +++ b/test/sshmanager.jl @@ -0,0 +1,101 @@ +using Test +using DistributedNext +import Sockets: getipaddr + +import LibSSH as ssh +import LibSSH.Demo: DemoServer + + +include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl")) + +# LibSSH.jl currently only works on 64bit unixes +if Sys.isunix() && Sys.WORD_SIZE == 64 + function test_n_remove_pids(new_pids) + for p in new_pids + w_in_remote = sort(remotecall_fetch(workers, p)) + try + @test intersect(new_pids, w_in_remote) == new_pids + catch + print("p : $p\n") + print("newpids : $new_pids\n") + print("w_in_remote : $w_in_remote\n") + print("intersect : $(intersect(new_pids, w_in_remote))\n\n\n") + rethrow() + end + end + + remotecall_fetch(rmprocs, 1, new_pids) + end + + println("\n\nTesting SSHManager. 
A minimum of 4GB of RAM is recommended.") + println("Please ensure port 9300 and 2222 are not in use.") + + DemoServer(2222; auth_methods=[ssh.AuthMethod_None], allow_auth_none=true, verbose=false, timeout=3600) do + sshflags = `-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR -p 2222 ` + #Issue #9951 + hosts=[] + localhost_aliases = ["localhost", string(getipaddr()), "127.0.0.1"] + num_workers = parse(Int,(get(ENV, "JULIA_ADDPROCS_NUM", "9"))) + + for i in 1:(num_workers/length(localhost_aliases)) + append!(hosts, localhost_aliases) + end + + # CI machines sometimes don't already have a .ssh directory + ssh_dir = joinpath(homedir(), ".ssh") + if !isdir(ssh_dir) + mkdir(ssh_dir) + end + + print("\nTesting SSH addprocs with $(length(hosts)) workers...\n") + new_pids = addprocs_with_testenv(hosts; sshflags=sshflags) + @test length(new_pids) == length(hosts) + test_n_remove_pids(new_pids) + + print("\nMixed ssh addprocs with :auto\n") + new_pids = addprocs_with_testenv(["localhost", ("127.0.0.1", :auto), "localhost"]; sshflags=sshflags) + @test length(new_pids) == (2 + Sys.CPU_THREADS) + test_n_remove_pids(new_pids) + + print("\nMixed ssh addprocs with numeric counts\n") + new_pids = addprocs_with_testenv([("localhost", 2), ("127.0.0.1", 2), "localhost"]; sshflags=sshflags) + @test length(new_pids) == 5 + test_n_remove_pids(new_pids) + + print("\nssh addprocs with tunnel\n") + new_pids = addprocs_with_testenv([("localhost", num_workers)]; tunnel=true, sshflags=sshflags) + @test length(new_pids) == num_workers + test_n_remove_pids(new_pids) + + print("\nssh addprocs with tunnel (SSH multiplexing)\n") + new_pids = addprocs_with_testenv([("localhost", num_workers)]; tunnel=true, multiplex=true, sshflags=sshflags) + @test length(new_pids) == num_workers + controlpath = joinpath(ssh_dir, "julia-$(ENV["USER"])@localhost:2222") + @test issocket(controlpath) + test_n_remove_pids(new_pids) + @test :ok == timedwait(()->!issocket(controlpath), 10.0; pollint=0.5) + + print("\nAll supported formats for hostname\n") + h1 = "localhost" + user = ENV["USER"] + h2 = "$user@$h1" + h3 = "$h2:2222" + h4 = "$h3 $(string(getipaddr()))" + h5 = "$h4:9300" + + new_pids = addprocs_with_testenv([h1, h2, h3, h4, h5]; sshflags=sshflags) + @test length(new_pids) == 5 + test_n_remove_pids(new_pids) + + print("\nkeyword arg exename\n") + for exename in [`$(joinpath(Sys.BINDIR, Base.julia_exename()))`, "$(joinpath(Sys.BINDIR, Base.julia_exename()))"] + for addp_func in [()->addprocs_with_testenv(["localhost"]; exename=exename, exeflags=test_exeflags, sshflags=sshflags), + ()->addprocs_with_testenv(1; exename=exename, exeflags=test_exeflags)] + + local new_pids = addp_func() + @test length(new_pids) == 1 + test_n_remove_pids(new_pids) + end + end + end +end From b78aa9af5e743e6f7788372922a6272cf2aa5df0 Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Fri, 1 Nov 2024 22:20:08 +0100 Subject: [PATCH 8/9] Refactor the tests into @testsets This makes it much easier to see where errors/warnings are coming from. The tests have been preserved in the exact order they were written, with no changes other than the necessary ones to put them in `@testset`'s (e.g. creating modules in global scope). 
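
For illustration, the mechanical shape of the change, as a sketch only
(the test and testset names below are taken from the diff that follows):

    # before
    @test remote(myid)() == 1

    # after
    @testset "Remote invocations with no workers" begin
        @test remote(myid)() == 1
    end

A consequence of this is that tests defining modules (e.g. LocalFoo) now
do so at top level, because `module` expressions are only legal in global
scope and a `@testset` block introduces a local scope.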
--- test/distributed_exec.jl | 2966 +++++++++++++++++++------------------- test/managers.jl | 34 +- test/runtests.jl | 16 +- test/splitrange.jl | 48 +- test/sshmanager.jl | 32 +- test/topology.jl | 220 +-- 6 files changed, 1684 insertions(+), 1632 deletions(-) diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index 6b5ae00..0ee9e6b 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -11,108 +11,116 @@ include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl")) @test DistributedNext.extract_imports(:(begin; import Foo, Bar; let; using Baz; end; end)) == Any[:(import Foo, Bar), :(using Baz)] + +id_me = nothing +id_other = nothing + # Test a few "remote" invocations when no workers are present -@test remote(myid)() == 1 -@test pmap(identity, 1:100) == [1:100...] -@test 100 == @distributed (+) for i in 1:100 +@testset "Remote invocations with no workers" begin + @test remote(myid)() == 1 + @test pmap(identity, 1:100) == [1:100...] + @test 100 == @distributed (+) for i in 1:100 1 end +end -addprocs_with_testenv(4) -@test nprocs() == 5 +@testset "Distributed loading of packages" begin + addprocs_with_testenv(4) + @test nprocs() == 5 -# distributed loading of packages + global id_me = myid() + global id_other = filter(x -> x != id_me, procs())[rand(1:(nprocs()-1))] -# setup -@everywhere begin - old_act_proj = Base.ACTIVE_PROJECT[] - pushfirst!(Base.LOAD_PATH, "@") - Base.ACTIVE_PROJECT[] = joinpath(Sys.BINDIR, "..", "share", "julia", "test", "TestPkg") -end + # setup + @everywhere begin + old_act_proj = Base.ACTIVE_PROJECT[] + pushfirst!(Base.LOAD_PATH, "@") + Base.ACTIVE_PROJECT[] = joinpath(Sys.BINDIR, "..", "share", "julia", "test", "TestPkg") + end -# cause precompilation of TestPkg to avoid race condition -Base.compilecache(Base.identify_package("TestPkg")) + # cause precompilation of TestPkg to avoid race condition + Base.compilecache(Base.identify_package("TestPkg")) -@everywhere using TestPkg -@everywhere using TestPkg + @everywhere using TestPkg + @everywhere using TestPkg -@everywhere begin - Base.ACTIVE_PROJECT[] = old_act_proj - popfirst!(Base.LOAD_PATH) -end - -@everywhere using Test, Random, LinearAlgebra + @everywhere begin + Base.ACTIVE_PROJECT[] = old_act_proj + popfirst!(Base.LOAD_PATH) + end -id_me = myid() -id_other = filter(x -> x != id_me, procs())[rand(1:(nprocs()-1))] + @everywhere using Test, Random, LinearAlgebra +end -# Test role -@everywhere using DistributedNext -@test DistributedNext.myrole() === :master -for wid = workers() - wrole = remotecall_fetch(wid) do - DistributedNext.myrole() +@testset "Test role" begin + @everywhere using DistributedNext + @test DistributedNext.myrole() === :master + for wid = workers() + wrole = remotecall_fetch(wid) do + DistributedNext.myrole() + end + @test wrole === :worker end - @test wrole === :worker end -# Test remote() -let - pool = default_worker_pool() +@testset "Test remote()" begin + let + pool = default_worker_pool() - count = 0 - count_condition = Condition() + count = 0 + count_condition = Condition() - function remote_wait(c) - @async_logerr begin - count += 1 - remote(take!)(c) - count -= 1 - notify(count_condition) + function remote_wait(c) + @async_logerr begin + count += 1 + remote(take!)(c) + count -= 1 + notify(count_condition) + end + yield() end - yield() - end - testchannels = [RemoteChannel() for i in 1:nworkers()] - testcount = 0 - @test isready(pool) == true - for c in testchannels - @test count == testcount - remote_wait(c) - testcount += 1 - end - @test 
count == testcount - @test isready(pool) == false - - for c in testchannels - @test count == testcount - put!(c, "foo") - testcount -= 1 - (count == testcount) || wait(count_condition) - @test count == testcount + testchannels = [RemoteChannel() for i in 1:nworkers()] + testcount = 0 @test isready(pool) == true - end + for c in testchannels + @test count == testcount + remote_wait(c) + testcount += 1 + end + @test count == testcount + @test isready(pool) == false + + for c in testchannels + @test count == testcount + put!(c, "foo") + testcount -= 1 + (count == testcount) || wait(count_condition) + @test count == testcount + @test isready(pool) == true + end - @test count == 0 + @test count == 0 - for c in testchannels + for c in testchannels + @test count == testcount + remote_wait(c) + testcount += 1 + end @test count == testcount - remote_wait(c) - testcount += 1 - end - @test count == testcount - @test isready(pool) == false + @test isready(pool) == false + + for c in reverse(testchannels) + @test count == testcount + put!(c, "foo") + testcount -= 1 + (count == testcount) || wait(count_condition) + @test count == testcount + @test isready(pool) == true + end - for c in reverse(testchannels) - @test count == testcount - put!(c, "foo") - testcount -= 1 - (count == testcount) || wait(count_condition) - @test count == testcount - @test isready(pool) == true + @test count == 0 end - - @test count == 0 end # Test Futures @@ -130,9 +138,6 @@ function testf(id) @test fetch(f) === :OK end -testf(id_me) -testf(id_other) - function poll_while(f::Function; timeout_seconds::Integer = 120) start_time = time_ns() while f() @@ -168,567 +173,584 @@ function test_futures_dgc(id) @test poll_while(() -> remotecall_fetch(k->(yield();haskey(DistributedNext.PGRP.refs, k)), id, fid)) end -test_futures_dgc(id_me) -test_futures_dgc(id_other) +@testset "GC tests for Futures" begin + testf(id_me) + testf(id_other) -# if sent to another worker, it should not be deleted till all references are fetched. -wid1 = workers()[1] -wid2 = workers()[2] -f = remotecall(myid, wid1) -fid = remoteref_id(f) + test_futures_dgc(id_me) + test_futures_dgc(id_other) -fstore = RemoteChannel(wid2) -put!(fstore, f) + # if sent to another worker, it should not be deleted till all references are fetched. + wid1 = workers()[1] + wid2 = workers()[2] + f = remotecall(myid, wid1) + fid = remoteref_id(f) + + fstore = RemoteChannel(wid2) + put!(fstore, f) -@test fetch(f) == wid1 -@test remotecall_fetch(k->haskey(DistributedNext.PGRP.refs, k), wid1, fid) == true -remotecall_fetch(r->(fetch(fetch(r)); yield()), wid2, fstore) -sleep(0.5) # to ensure that wid2 gc messages have been executed on wid1 -@test remotecall_fetch(k->haskey(DistributedNext.PGRP.refs, k), wid1, fid) == false + @test fetch(f) == wid1 + @test remotecall_fetch(k->haskey(DistributedNext.PGRP.refs, k), wid1, fid) == true + remotecall_fetch(r->(fetch(fetch(r)); yield()), wid2, fstore) + sleep(0.5) # to ensure that wid2 gc messages have been executed on wid1 + @test remotecall_fetch(k->haskey(DistributedNext.PGRP.refs, k), wid1, fid) == false -# put! should release remote reference since it would have been cached locally -f = Future(wid1) -fid = remoteref_id(f) + # put! 
should release remote reference since it would have been cached locally + f = Future(wid1) + fid = remoteref_id(f) -# should not be created remotely till accessed -@test remotecall_fetch(k->haskey(DistributedNext.PGRP.refs, k), wid1, fid) == false -# create it remotely -isready(f) + # should not be created remotely till accessed + @test remotecall_fetch(k->haskey(DistributedNext.PGRP.refs, k), wid1, fid) == false + # create it remotely + isready(f) -@test remotecall_fetch(k->haskey(DistributedNext.PGRP.refs, k), wid1, fid) == true -put!(f, :OK) -@test remotecall_fetch(k->haskey(DistributedNext.PGRP.refs, k), wid1, fid) == false -@test fetch(f) === :OK + @test remotecall_fetch(k->haskey(DistributedNext.PGRP.refs, k), wid1, fid) == true + put!(f, :OK) + @test remotecall_fetch(k->haskey(DistributedNext.PGRP.refs, k), wid1, fid) == false + @test fetch(f) === :OK -# RemoteException should be thrown on a put! when another process has set the value -f = Future(wid1) -fid = remoteref_id(f) + # RemoteException should be thrown on a put! when another process has set the value + f = Future(wid1) + fid = remoteref_id(f) -fstore = RemoteChannel(wid2) -put!(fstore, f) # send f to wid2 -put!(f, :OK) # set value from master + fstore = RemoteChannel(wid2) + put!(fstore, f) # send f to wid2 + put!(f, :OK) # set value from master -@test remotecall_fetch(k->haskey(DistributedNext.PGRP.refs, k), wid1, fid) == true + @test remotecall_fetch(k->haskey(DistributedNext.PGRP.refs, k), wid1, fid) == true -testval = remotecall_fetch(wid2, fstore) do x - try - put!(fetch(x), :OK) - return 0 - catch e - if isa(e, RemoteException) - return 1 - else - return 2 + testval = remotecall_fetch(wid2, fstore) do x + try + put!(fetch(x), :OK) + return 0 + catch e + if isa(e, RemoteException) + return 1 + else + return 2 + end end end -end -@test testval == 1 + @test testval == 1 -# Issue number #25847 -@everywhere function f25847(ref) - fetch(ref) - return true -end + # Issue number #25847 + @everywhere function f25847(ref) + fetch(ref) + return true + end -f = remotecall_wait(identity, id_other, ones(10)) -rrid = DistributedNext.RRID(f.whence, f.id) -remotecall_fetch(f25847, id_other, f) -@test BitSet([id_me]) == remotecall_fetch(()->DistributedNext.PGRP.refs[rrid].clientset, id_other) - -remotecall_fetch(f25847, id_other, f) -@test BitSet([id_me]) == remotecall_fetch(()->DistributedNext.PGRP.refs[rrid].clientset, id_other) - -finalize(f) -yield() # flush gc msgs -@test poll_while(() -> remotecall_fetch(chk_rrid->(yield(); haskey(DistributedNext.PGRP.refs, chk_rrid)), id_other, rrid)) - -# DistributedNext GC tests for RemoteChannels -function test_remoteref_dgc(id) - rr = RemoteChannel(id) - put!(rr, :OK) - rrid = remoteref_id(rr) - - # remote value should be deleted after finalizing the ref - @test remotecall_fetch(k->(yield();haskey(DistributedNext.PGRP.refs, k)), id, rrid) == true - @test fetch(rr) === :OK - @test remotecall_fetch(k->(yield();haskey(DistributedNext.PGRP.refs, k)), id, rrid) == true - finalize(rr) - yield(); # flush gc msgs - @test poll_while(() -> remotecall_fetch(k->(yield();haskey(DistributedNext.PGRP.refs, k)), id, rrid)) -end -test_remoteref_dgc(id_me) -test_remoteref_dgc(id_other) - -# if sent to another worker, it should not be deleted till the other worker has also finalized. 
-let wid1 = workers()[1], - wid2 = workers()[2], - rr = RemoteChannel(wid1), - rrid = remoteref_id(rr), - fstore = RemoteChannel(wid2) + f = remotecall_wait(identity, id_other, ones(10)) + rrid = DistributedNext.RRID(f.whence, f.id) + remotecall_fetch(f25847, id_other, f) + @test BitSet([id_me]) == remotecall_fetch(()->DistributedNext.PGRP.refs[rrid].clientset, id_other) + + remotecall_fetch(f25847, id_other, f) + @test BitSet([id_me]) == remotecall_fetch(()->DistributedNext.PGRP.refs[rrid].clientset, id_other) - put!(fstore, rr) - @test remotecall_fetch(k -> haskey(DistributedNext.PGRP.refs, k), wid1, rrid) == true - finalize(rr) # finalize locally + finalize(f) yield() # flush gc msgs - @test remotecall_fetch(k -> haskey(DistributedNext.PGRP.refs, k), wid1, rrid) == true - remotecall_fetch(r -> (finalize(take!(r)); yield(); nothing), wid2, fstore) # finalize remotely - sleep(0.5) # to ensure that wid2 messages have been executed on wid1 - @test poll_while(() -> remotecall_fetch(k -> haskey(DistributedNext.PGRP.refs, k), wid1, rrid)) -end + @test poll_while(() -> remotecall_fetch(chk_rrid->(yield(); haskey(DistributedNext.PGRP.refs, chk_rrid)), id_other, rrid)) +end + +@testset "GC tests for RemoteChannels" begin + function test_remoteref_dgc(id) + rr = RemoteChannel(id) + put!(rr, :OK) + rrid = remoteref_id(rr) + + # remote value should be deleted after finalizing the ref + @test remotecall_fetch(k->(yield();haskey(DistributedNext.PGRP.refs, k)), id, rrid) == true + @test fetch(rr) === :OK + @test remotecall_fetch(k->(yield();haskey(DistributedNext.PGRP.refs, k)), id, rrid) == true + finalize(rr) + yield(); # flush gc msgs + @test poll_while(() -> remotecall_fetch(k->(yield();haskey(DistributedNext.PGRP.refs, k)), id, rrid)) + end + test_remoteref_dgc(id_me) + test_remoteref_dgc(id_other) + + # if sent to another worker, it should not be deleted till the other worker has also finalized. + let wid1 = workers()[1], + wid2 = workers()[2], + rr = RemoteChannel(wid1), + rrid = remoteref_id(rr), + fstore = RemoteChannel(wid2) + + put!(fstore, rr) + @test remotecall_fetch(k -> haskey(DistributedNext.PGRP.refs, k), wid1, rrid) == true + finalize(rr) # finalize locally + yield() # flush gc msgs + @test remotecall_fetch(k -> haskey(DistributedNext.PGRP.refs, k), wid1, rrid) == true + remotecall_fetch(r -> (finalize(take!(r)); yield(); nothing), wid2, fstore) # finalize remotely + sleep(0.5) # to ensure that wid2 messages have been executed on wid1 + @test poll_while(() -> remotecall_fetch(k -> haskey(DistributedNext.PGRP.refs, k), wid1, rrid)) + end +end + +@testset "issue #23109 - should not hang" begin + f = @spawnat :any rand(1, 1) + Base.Experimental.@sync begin + for _ in 1:10 + @async fetch(f) + end + end -# Tests for issue #23109 - should not hang. 
-f = @spawnat :any rand(1, 1) -Base.Experimental.@sync begin - for _ in 1:10 + wid1, wid2 = workers()[1:2] + f = @spawnat wid1 rand(1,1) + Base.Experimental.@sync begin @async fetch(f) + @async remotecall_fetch(()->fetch(f), wid2) end end -wid1, wid2 = workers()[1:2] -f = @spawnat wid1 rand(1,1) -Base.Experimental.@sync begin - @async fetch(f) - @async remotecall_fetch(()->fetch(f), wid2) -end +@testset "getindex on Futures and RemoteChannels" begin + @test fetch(@spawnat id_other myid()) == id_other + @test (@fetchfrom id_other myid()) == id_other + pids=[] + for i in 1:nworkers() + push!(pids, @fetch myid()) + end + @test sort(pids) == sort(workers()) -@test fetch(@spawnat id_other myid()) == id_other -@test (@fetchfrom id_other myid()) == id_other - -pids=[] -for i in 1:nworkers() - push!(pids, @fetch myid()) -end -@test sort(pids) == sort(workers()) + # test getindex on Futures and RemoteChannels + function test_indexing(rr) + a = rand(5,5) + put!(rr, a) + @test rr[2,3] == a[2,3] + @test rr[] == a + end -# test getindex on Futures and RemoteChannels -function test_indexing(rr) - a = rand(5,5) - put!(rr, a) - @test rr[2,3] == a[2,3] - @test rr[] == a + test_indexing(Future()) + test_indexing(Future(id_other)) + test_indexing(RemoteChannel()) + test_indexing(RemoteChannel(id_other)) end -test_indexing(Future()) -test_indexing(Future(id_other)) -test_indexing(RemoteChannel()) -test_indexing(RemoteChannel(id_other)) - -# Test ser/deser to non-ClusterSerializer objects. -function test_regular_io_ser(ref::DistributedNext.AbstractRemoteRef) - io = IOBuffer() - serialize(io, ref) - seekstart(io) - ref2 = deserialize(io) - for fld in fieldnames(typeof(ref)) - v = getfield(ref2, fld) - if isa(v, Number) - @test v === zero(typeof(v)) - elseif fld === :lock - @test v isa ReentrantLock - @test !islocked(v) - elseif v !== nothing - error(string("Add test for field ", fld)) +@testset "Ser/deser to non-ClusterSerializer objects" begin + function test_regular_io_ser(ref::DistributedNext.AbstractRemoteRef) + io = IOBuffer() + serialize(io, ref) + seekstart(io) + ref2 = deserialize(io) + for fld in fieldnames(typeof(ref)) + v = getfield(ref2, fld) + if isa(v, Number) + @test v === zero(typeof(v)) + elseif fld === :lock + @test v isa ReentrantLock + @test !islocked(v) + elseif v !== nothing + error(string("Add test for field ", fld)) + end end end + + test_regular_io_ser(Future()) + test_regular_io_ser(RemoteChannel()) end -test_regular_io_ser(Future()) -test_regular_io_ser(RemoteChannel()) - -# Test @distributed load balancing - all processors should get either M or M+1 -# iterations out of the loop range for some M. 
-ids = @distributed((a,b)->[a;b], for i=1:7; myid(); end) -workloads = Int[sum(ids .== i) for i in 2:nprocs()] -@test maximum(workloads) - minimum(workloads) <= 1 - -# @distributed reduction should work even with very short ranges -@test @distributed(+, for i=1:2; i; end) == 3 - -@test_throws ArgumentError sleep(-1) -@test_throws ArgumentError timedwait(()->false, 0.1, pollint=-0.5) - -# specify pids for pmap -@test sort(workers()[1:2]) == sort(unique(pmap(x->(sleep(0.1);myid()), WorkerPool(workers()[1:2]), 1:10))) - -# Testing buffered and unbuffered reads -# This large array should write directly to the socket -a = fill(1, 10^6) -@test a == remotecall_fetch((x)->x, id_other, a) - -# Not a bitstype, should be buffered -s = [randstring() for x in 1:10^5] -@test s == remotecall_fetch((x)->x, id_other, s) - -#large number of small requests -num_small_requests = 10000 -@test fill(id_other, num_small_requests) == [remotecall_fetch(myid, id_other) for i in 1:num_small_requests] - -# test parallel sends of large arrays from multiple tasks to the same remote worker -ntasks = 10 -rr_list = [Channel(1) for x in 1:ntasks] - -for rr in rr_list - local rr - let rr = rr - @async try - for i in 1:10 - a = rand(2*10^5) - @test a == remotecall_fetch(x->x, id_other, a) - yield() +@testset "@distributed and [un]buffered reads" begin + # Test @distributed load balancing - all processors should get either M or M+1 + # iterations out of the loop range for some M. + ids = @distributed((a,b)->[a;b], for i=1:7; myid(); end) + workloads = Int[sum(ids .== i) for i in 2:nprocs()] + @test maximum(workloads) - minimum(workloads) <= 1 + + # @distributed reduction should work even with very short ranges + @test @distributed(+, for i=1:2; i; end) == 3 + + @test_throws ArgumentError sleep(-1) + @test_throws ArgumentError timedwait(()->false, 0.1, pollint=-0.5) + + # specify pids for pmap + @test sort(workers()[1:2]) == sort(unique(pmap(x->(sleep(0.1);myid()), WorkerPool(workers()[1:2]), 1:10))) + + # Testing buffered and unbuffered reads + # This large array should write directly to the socket + a = fill(1, 10^6) + @test a == remotecall_fetch((x)->x, id_other, a) + + # Not a bitstype, should be buffered + s = [randstring() for x in 1:10^5] + @test s == remotecall_fetch((x)->x, id_other, s) + + #large number of small requests + num_small_requests = 10000 + @test fill(id_other, num_small_requests) == [remotecall_fetch(myid, id_other) for i in 1:num_small_requests] + + # test parallel sends of large arrays from multiple tasks to the same remote worker + ntasks = 10 + rr_list = [Channel(1) for x in 1:ntasks] + + for rr in rr_list + local rr + let rr = rr + @async try + for i in 1:10 + a = rand(2*10^5) + @test a == remotecall_fetch(x->x, id_other, a) + yield() + end + put!(rr, :OK) + catch + put!(rr, :ERROR) end - put!(rr, :OK) - catch - put!(rr, :ERROR) end end -end -@test [fetch(rr) for rr in rr_list] == [:OK for x in 1:ntasks] - -function test_channel(c) - @test isopen(c) == true - put!(c, 1) - put!(c, "Hello") - put!(c, 5.0) - - @test isready(c) == true - @test isopen(c) == true - @test fetch(c) == 1 - @test fetch(c) == 1 # Should not have been popped previously - @test take!(c) == 1 - @test take!(c) == "Hello" - @test fetch(c) == 5.0 - @test take!(c) == 5.0 - @test isready(c) == false - @test isopen(c) == true - close(c) - @test isopen(c) == false + @test [fetch(rr) for rr in rr_list] == [:OK for x in 1:ntasks] end -test_channel(Channel(10)) -test_channel(RemoteChannel(()->Channel(10))) +@testset "RemoteChannels" begin + 
function test_channel(c) + @test isopen(c) == true + put!(c, 1) + put!(c, "Hello") + put!(c, 5.0) -c=Channel{Int}(1) -@test_throws MethodError put!(c, "Hello") - -# test channel iterations -function test_iteration(in_c, out_c) - t=@async for v in in_c - put!(out_c, v) + @test isready(c) == true + @test isopen(c) == true + @test fetch(c) == 1 + @test fetch(c) == 1 # Should not have been popped previously + @test take!(c) == 1 + @test take!(c) == "Hello" + @test fetch(c) == 5.0 + @test take!(c) == 5.0 + @test isready(c) == false + @test isopen(c) == true + close(c) + @test isopen(c) == false end - @test isopen(in_c) == true - put!(in_c, 1) - @test take!(out_c) == 1 - put!(in_c, "Hello") - close(in_c) - @test take!(out_c) == "Hello" - @test isopen(in_c) == false - @test_throws InvalidStateException put!(in_c, :foo) - yield() - @test istaskdone(t) == true -end + test_channel(Channel(10)) + test_channel(RemoteChannel(()->Channel(10))) + + c=Channel{Int}(1) + @test_throws MethodError put!(c, "Hello") -test_iteration(Channel(10), Channel(10)) -test_iteration(RemoteChannel(() -> Channel(10)), RemoteChannel(() -> Channel(10))) + # test channel iterations + function test_iteration(in_c, out_c) + t=@async for v in in_c + put!(out_c, v) + end -@everywhere function test_iteration_take(ch) - count = 0 - for x in ch - count += 1 + @test isopen(in_c) == true + put!(in_c, 1) + @test take!(out_c) == 1 + put!(in_c, "Hello") + close(in_c) + @test take!(out_c) == "Hello" + @test isopen(in_c) == false + @test_throws InvalidStateException put!(in_c, :foo) + yield() + @test istaskdone(t) == true end - return count -end -@everywhere test_iteration_collect(ch) = length(collect(ch)) + test_iteration(Channel(10), Channel(10)) + test_iteration(RemoteChannel(() -> Channel(10)), RemoteChannel(() -> Channel(10))) -@everywhere function test_iteration_put(ch, total) - for i in 1:total - put!(ch, i) + @everywhere function test_iteration_take(ch) + count = 0 + for x in ch + count += 1 + end + return count end - close(ch) -end -let ch = RemoteChannel(() -> Channel(1)) - @async test_iteration_put(ch, 10) - @test 10 == @fetchfrom id_other test_iteration_take(ch) - ch = RemoteChannel(() -> Channel(1)) - @async test_iteration_put(ch, 10) - @test 10 == @fetchfrom id_other test_iteration_collect(ch) - # now reverse - ch = RemoteChannel(() -> Channel(1)) - @spawnat id_other test_iteration_put(ch, 10) - @test 10 == test_iteration_take(ch) - ch = RemoteChannel(() -> Channel(1)) - @spawnat id_other test_iteration_put(ch, 10) - @test 10 == test_iteration_collect(ch) -end + @everywhere test_iteration_collect(ch) = length(collect(ch)) -# Test isempty(::RemoteChannel). This should not modify the underlying -# AbstractChannel, which Base's default implementation will do. 
-let - chan = Channel(1) - push!(chan, 1) - remotechan = RemoteChannel(() -> chan) + @everywhere function test_iteration_put(ch, total) + for i in 1:total + put!(ch, i) + end + close(ch) + end - @test !isempty(remotechan) - # Calling `isempty(remotechan)` shouldn't have modified `chan` - @test !isempty(chan) -end + let ch = RemoteChannel(() -> Channel(1)) + @async test_iteration_put(ch, 10) + @test 10 == @fetchfrom id_other test_iteration_take(ch) + ch = RemoteChannel(() -> Channel(1)) + @async test_iteration_put(ch, 10) + @test 10 == @fetchfrom id_other test_iteration_collect(ch) + # now reverse + ch = RemoteChannel(() -> Channel(1)) + @spawnat id_other test_iteration_put(ch, 10) + @test 10 == test_iteration_take(ch) + ch = RemoteChannel(() -> Channel(1)) + @spawnat id_other test_iteration_put(ch, 10) + @test 10 == test_iteration_collect(ch) + end -# make sure exceptions propagate when waiting on Tasks -@test_throws CompositeException (@sync (@async error("oops"))) -try - @sync begin - for i in 1:5 - @async error(i) - end - end - error("unexpected") -catch ex - @test typeof(ex) == CompositeException - @test length(ex) == 5 - @test typeof(ex.exceptions[1]) == TaskFailedException - @test typeof(ex.exceptions[1].task.exception) == ErrorException - # test start, next, and done - for (i, i_ex) in enumerate(ex) - @test i == parse(Int, i_ex.task.exception.msg) - end - # test showerror - err_str = sprint(showerror, ex) - err_one_str = sprint(showerror, ex.exceptions[1]) - @test err_str == err_one_str * "\n\n...and 4 more exceptions.\n" + # Test isempty(::RemoteChannel). This should not modify the underlying + # AbstractChannel, which Base's default implementation will do. + let + chan = Channel(1) + push!(chan, 1) + remotechan = RemoteChannel(() -> chan) + + @test !isempty(remotechan) + # Calling `isempty(remotechan)` shouldn't have modified `chan` + @test !isempty(chan) + end end -@test sprint(showerror, CompositeException()) == "CompositeException()\n" -function test_remoteexception_thrown(expr) +@testset "Exceptions" begin + # make sure exceptions propagate when waiting on Tasks + @test_throws CompositeException (@sync (@async error("oops"))) try - expr() + @sync begin + for i in 1:5 + @async error(i) + end + end error("unexpected") catch ex - @test typeof(ex) == RemoteException - @test typeof(ex.captured) == CapturedException - @test typeof(ex.captured.ex) == ErrorException - @test ex.captured.ex.msg == "foobar" + @test typeof(ex) == CompositeException + @test length(ex) == 5 + @test typeof(ex.exceptions[1]) == TaskFailedException + @test typeof(ex.exceptions[1].task.exception) == ErrorException + # test start, next, and done + for (i, i_ex) in enumerate(ex) + @test i == parse(Int, i_ex.task.exception.msg) + end + # test showerror + err_str = sprint(showerror, ex) + err_one_str = sprint(showerror, ex.exceptions[1]) + @test err_str == err_one_str * "\n\n...and 4 more exceptions.\n" end -end + @test sprint(showerror, CompositeException()) == "CompositeException()\n" -for id in [id_other, id_me] - local id - test_remoteexception_thrown() do - remotecall_fetch(id) do - throw(ErrorException("foobar")) + function test_remoteexception_thrown(expr) + try + expr() + error("unexpected") + catch ex + @test typeof(ex) == RemoteException + @test typeof(ex.captured) == CapturedException + @test typeof(ex.captured.ex) == ErrorException + @test ex.captured.ex.msg == "foobar" end end - test_remoteexception_thrown() do - remotecall_wait(id) do - throw(ErrorException("foobar")) + + for id in [id_other, id_me] + 
local id + test_remoteexception_thrown() do + remotecall_fetch(id) do + throw(ErrorException("foobar")) + end + end + test_remoteexception_thrown() do + remotecall_wait(id) do + throw(ErrorException("foobar")) + end + end + test_remoteexception_thrown() do + wait(remotecall(id) do + throw(ErrorException("foobar")) + end) end end - test_remoteexception_thrown() do - wait(remotecall(id) do - throw(ErrorException("foobar")) - end) - end -end -# make sure the stackframe from the remote error can be serialized -let ex - try - remotecall_fetch(id_other) do - @eval module AModuleLocalToOther + # make sure the stackframe from the remote error can be serialized + let ex + try + remotecall_fetch(id_other) do + @eval module AModuleLocalToOther foo() = throw(ErrorException("A.error")) foo() + end end + catch ex end - catch ex - end - @test (ex::RemoteException).pid == id_other - @test ((ex.captured::CapturedException).ex::ErrorException).msg == "A.error" - bt = ex.captured.processed_bt::Array{Any,1} - @test length(bt) > 1 - frame, repeated = bt[1]::Tuple{Base.StackTraces.StackFrame, Int} - @test frame.func === :foo - @test frame.linfo === nothing - @test repeated == 1 -end - -# pmap tests. Needs at least 4 processors dedicated to the below tests. Which we currently have -# since the distributed tests are now spawned as a separate set. - -# Test all combinations of pmap keyword args. -pmap_args = [ - (:distributed, [:default, false]), - (:batch_size, [:default,2]), - (:on_error, [:default, e -> (e.msg == "foobar" ? true : rethrow())]), - (:retry_delays, [:default, fill(0.001, 1000)]), - (:retry_check, [:default, (s,e) -> (s,endswith(e.msg,"foobar"))]), - ] - -kwdict = Dict() -function walk_args(i) - if i > length(pmap_args) - kwargs = [] - for (k,v) in kwdict - if v !== :default - push!(kwargs, (k,v)) + @test (ex::RemoteException).pid == id_other + @test ((ex.captured::CapturedException).ex::ErrorException).msg == "A.error" + bt = ex.captured.processed_bt::Array{Any,1} + @test length(bt) > 1 + frame, repeated = bt[1]::Tuple{Base.StackTraces.StackFrame, Int} + @test frame.func === :foo + @test frame.linfo === nothing + @test repeated == 1 + end +end + +@testset "pmap()" begin + # pmap tests. Needs at least 4 processors dedicated to the below tests. Which we currently have + # since the distributed tests are now spawned as a separate set. + + # Test all combinations of pmap keyword args. + pmap_args = [ + (:distributed, [:default, false]), + (:batch_size, [:default,2]), + (:on_error, [:default, e -> (e.msg == "foobar" ? true : rethrow())]), + (:retry_delays, [:default, fill(0.001, 1000)]), + (:retry_check, [:default, (s,e) -> (s,endswith(e.msg,"foobar"))]), + ] + + kwdict = Dict() + function walk_args(i) + if i > length(pmap_args) + kwargs = [] + for (k,v) in kwdict + if v !== :default + push!(kwargs, (k,v)) + end end - end - - data = 1:100 - testw = kwdict[:distributed] === false ? [1] : workers() - - if kwdict[:retry_delays] !== :default - mapf = x -> iseven(myid()) ? error("notfoobar") : (x*2, myid()) - results_test = pmap_res -> begin - results = [x[1] for x in pmap_res] - pids = [x[2] for x in pmap_res] - @test results == [2:2:200...] - for p in testw - if isodd(p) - @test p in pids - else - @test !(p in pids) + data = 1:100 + + testw = kwdict[:distributed] === false ? [1] : workers() + + if kwdict[:retry_delays] !== :default + mapf = x -> iseven(myid()) ? 
error("notfoobar") : (x*2, myid()) + results_test = pmap_res -> begin + results = [x[1] for x in pmap_res] + pids = [x[2] for x in pmap_res] + @test results == [2:2:200...] + for p in testw + if isodd(p) + @test p in pids + else + @test !(p in pids) + end end end - end - elseif kwdict[:on_error] === :default - mapf = x -> (x*2, myid()) - results_test = pmap_res -> begin - results = [x[1] for x in pmap_res] - pids = [x[2] for x in pmap_res] - @test results == [2:2:200...] - for p in testw - @test p in pids + elseif kwdict[:on_error] === :default + mapf = x -> (x*2, myid()) + results_test = pmap_res -> begin + results = [x[1] for x in pmap_res] + pids = [x[2] for x in pmap_res] + @test results == [2:2:200...] + for p in testw + @test p in pids + end end - end - else - mapf = x -> iseven(x) ? error("foobar") : (x*2, myid()) - results_test = pmap_res -> begin - w = testw - for (idx,x) in enumerate(data) - if iseven(x) - @test pmap_res[idx] == true - else - @test pmap_res[idx][1] == x*2 - @test pmap_res[idx][2] in w + else + mapf = x -> iseven(x) ? error("foobar") : (x*2, myid()) + results_test = pmap_res -> begin + w = testw + for (idx,x) in enumerate(data) + if iseven(x) + @test pmap_res[idx] == true + else + @test pmap_res[idx][1] == x*2 + @test pmap_res[idx][2] in w + end end end end - end - try - results_test(pmap(mapf, data; kwargs...)) - catch - println("pmap executing with args : ", kwargs) - rethrow() - end + try + results_test(pmap(mapf, data; kwargs...)) + catch + println("pmap executing with args : ", kwargs) + rethrow() + end - return - end + return + end - kwdict[pmap_args[i][1]] = pmap_args[i][2][1] - walk_args(i+1) + kwdict[pmap_args[i][1]] = pmap_args[i][2][1] + walk_args(i+1) - kwdict[pmap_args[i][1]] = pmap_args[i][2][2] - walk_args(i+1) -end + kwdict[pmap_args[i][1]] = pmap_args[i][2][2] + walk_args(i+1) + end -# Start test for various kw arg combinations -walk_args(1) + # Start test for various kw arg combinations + walk_args(1) -include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "generic_map_tests.jl")) -empty_pool = WorkerPool([myid()]) -pmap_fallback = (f, c...) -> pmap(f, empty_pool, c...) -generic_map_tests(pmap_fallback) + include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "generic_map_tests.jl")) + empty_pool = WorkerPool([myid()]) + pmap_fallback = (f, c...) -> pmap(f, empty_pool, c...) + generic_map_tests(pmap_fallback) -# pmap with various types. Test for equivalence with map -run_map_equivalence_tests(pmap) -@test pmap(uppercase, "Hello World!") == map(uppercase, "Hello World!") + # pmap with various types. Test for equivalence with map + run_map_equivalence_tests(pmap) + @test pmap(uppercase, "Hello World!") == map(uppercase, "Hello World!") -# Simple test for pmap throws error -let error_thrown = false - try - pmap(x -> x == 50 ? error("foobar") : x, 1:100) - catch e - @test e.captured.ex.msg == "foobar" - error_thrown = true + # Simple test for pmap throws error + let error_thrown = false + try + pmap(x -> x == 50 ? error("foobar") : x, 1:100) + catch e + @test e.captured.ex.msg == "foobar" + error_thrown = true + end + @test error_thrown end - @test error_thrown -end -# Test pmap with a generator type iterator -@test [1:100...] 
== pmap(x->x, Base.Generator(x->(sleep(0.0001); x), 1:100)) - -# Test pgenerate -n = 10 -as = [rand(4,4) for i in 1:n] -bs = deepcopy(as) -cs = collect(DistributedNext.pgenerate(x->(sleep(rand()*0.1); svd(x)), bs)) -svdas = map(svd, as) -for i in 1:n - @test cs[i].U ≈ svdas[i].U - @test cs[i].S ≈ svdas[i].S - @test cs[i].V ≈ svdas[i].V -end + # Test pmap with a generator type iterator + @test [1:100...] == pmap(x->x, Base.Generator(x->(sleep(0.0001); x), 1:100)) -# Test that the default worker pool cycles through all workers -pmap(_->myid(), 1:nworkers()) # priming run -@test nworkers() == length(unique(pmap(_->myid(), 1:100))) - -# Test same behaviour when executed on a worker -@test nworkers() == length(unique(remotecall_fetch(()->pmap(_->myid(), 1:100), id_other))) - -# Same tests with custom worker pools. -wp = WorkerPool(workers()) -@test nworkers() == length(unique(pmap(_->myid(), wp, 1:100))) -@test nworkers() == length(unique(remotecall_fetch(wp->pmap(_->myid(), wp, 1:100), id_other, wp))) -wp = WorkerPool(2:3) -@test sort(unique(pmap(_->myid(), wp, 1:100))) == [2,3] - -# wait on worker pool -wp = WorkerPool(2:2) -w = take!(wp) - -# local call to _wait -@test !isready(wp) -t = @async wait(wp) -@test !istaskdone(t) -put!(wp, w) -status = timedwait(() -> istaskdone(t), 10) -@test status == :ok - -# remote call to _wait -take!(wp) -@test !isready(wp) -f = @spawnat w wait(wp) -@test !isready(f) -put!(wp, w) -status = timedwait(() -> isready(f), 10) -@test status == :ok - -# CachingPool tests -wp = CachingPool(workers()) -@test [1:100...] == pmap(x->x, wp, 1:100) - -clear!(wp) -@test length(wp.map_obj2ref) == 0 - -# default_worker_pool! tests -wp_default = DistributedNext.default_worker_pool() -try - local wp = CachingPool(workers()) - DistributedNext.default_worker_pool!(wp) + # Test pgenerate + n = 10 + as = [rand(4,4) for i in 1:n] + bs = deepcopy(as) + cs = collect(DistributedNext.pgenerate(x->(sleep(rand()*0.1); svd(x)), bs)) + svdas = map(svd, as) + for i in 1:n + @test cs[i].U ≈ svdas[i].U + @test cs[i].S ≈ svdas[i].S + @test cs[i].V ≈ svdas[i].V + end + + # Test that the default worker pool cycles through all workers + pmap(_->myid(), 1:nworkers()) # priming run + @test nworkers() == length(unique(pmap(_->myid(), 1:100))) + + # Test same behaviour when executed on a worker + @test nworkers() == length(unique(remotecall_fetch(()->pmap(_->myid(), 1:100), id_other))) + + # Same tests with custom worker pools. + wp = WorkerPool(workers()) + @test nworkers() == length(unique(pmap(_->myid(), wp, 1:100))) + @test nworkers() == length(unique(remotecall_fetch(wp->pmap(_->myid(), wp, 1:100), id_other, wp))) + wp = WorkerPool(2:3) + @test sort(unique(pmap(_->myid(), wp, 1:100))) == [2,3] + + # wait on worker pool + wp = WorkerPool(2:2) + w = take!(wp) + + # local call to _wait + @test !isready(wp) + t = @async wait(wp) + @test !istaskdone(t) + put!(wp, w) + status = timedwait(() -> istaskdone(t), 10) + @test status == :ok + + # remote call to _wait + take!(wp) + @test !isready(wp) + f = @spawnat w wait(wp) + @test !isready(f) + put!(wp, w) + status = timedwait(() -> isready(f), 10) + @test status == :ok + + # CachingPool tests + wp = CachingPool(workers()) @test [1:100...] == pmap(x->x, wp, 1:100) - @test !isempty(wp.map_obj2ref) + clear!(wp) - @test isempty(wp.map_obj2ref) -finally - DistributedNext.default_worker_pool!(wp_default) + @test length(wp.map_obj2ref) == 0 + + # default_worker_pool! 
tests + wp_default = DistributedNext.default_worker_pool() + try + local wp = CachingPool(workers()) + DistributedNext.default_worker_pool!(wp) + @test [1:100...] == pmap(x->x, wp, 1:100) + @test !isempty(wp.map_obj2ref) + clear!(wp) + @test isempty(wp.map_obj2ref) + finally + DistributedNext.default_worker_pool!(wp_default) + end end # The below block of tests are usually run only on local development systems, @@ -760,761 +782,779 @@ if DoFullTest @test all([p == remotecall_fetch(myid, p) for p in all_w]) end -let t = @task 42 - schedule(t, ErrorException(""), error=true) - @test_throws TaskFailedException(t) Base.wait(t) -end +@testset "Various individual issues" begin + let t = @task 42 + schedule(t, ErrorException(""), error=true) + @test_throws TaskFailedException(t) Base.wait(t) + end -# issue #8207 -let A = Any[] - @distributed (+) for i in (push!(A,1); 1:2) - i + # issue #8207 + let A = Any[] + @distributed (+) for i in (push!(A,1); 1:2) + i + end + @test length(A) == 1 end - @test length(A) == 1 -end -# issue #13168 -function f13168(n) - val = 0 - for i = 1:n - val += sum(rand(n, n)^2) + # issue #13168 + function f13168(n) + val = 0 + for i = 1:n + val += sum(rand(n, n)^2) + end + return val + end + let t = schedule(@task f13168(100)) + @test t.state === :runnable + @test t.queue !== nothing + @test_throws ErrorException schedule(t) + yield() + @test t.state === :done + @test t.queue === nothing + @test_throws ErrorException schedule(t) + @test isa(fetch(t), Float64) end - return val -end -let t = schedule(@task f13168(100)) - @test t.state === :runnable - @test t.queue !== nothing - @test_throws ErrorException schedule(t) - yield() - @test t.state === :done - @test t.queue === nothing - @test_throws ErrorException schedule(t) - @test isa(fetch(t), Float64) -end -# issue #13122 -@test remotecall_fetch(identity, workers()[1], C_NULL) === C_NULL + # issue #13122 + @test remotecall_fetch(identity, workers()[1], C_NULL) === C_NULL -# issue #11062 -function t11062() - @async v11062 = 1 - v11062 = 2 -end + # issue #11062 + function t11062() + @async v11062 = 1 + v11062 = 2 + end -@test t11062() == 2 + @test t11062() == 2 -# issue #15406 -v15406 = remotecall_wait(() -> 1, id_other) -fetch(v15406) -remotecall_wait(fetch, id_other, v15406) + # issue #15406 + v15406 = remotecall_wait(() -> 1, id_other) + fetch(v15406) + remotecall_wait(fetch, id_other, v15406) -# issue #43396 -# Covers the remote fetch where the value returned is `nothing` -# May be caused by attempting to unwrap a non-`Some` type with `something` -# `call_on_owner` ref fetches return values not wrapped in `Some` -# and have to be returned directly -@test nothing === fetch(remotecall(() -> nothing, workers()[1])) -@test 10 === fetch(remotecall(() -> 10, workers()[1])) + # issue #43396 + # Covers the remote fetch where the value returned is `nothing` + # May be caused by attempting to unwrap a non-`Some` type with `something` + # `call_on_owner` ref fetches return values not wrapped in `Some` + # and have to be returned directly + @test nothing === fetch(remotecall(() -> nothing, workers()[1])) + @test 10 === fetch(remotecall(() -> 10, workers()[1])) +end +# Helper modules for the tests +module LocalFoo +global foo=1 +end -# Test various forms of remotecall* invocations +module LocalBar +using DistributedNext +bar() = @everywhere new_bar()=myid() +end -@everywhere f_args(v1, v2=0; kw1=0, kw2=0) = v1+v2+kw1+kw2 +f16091a() = 1 +f16091b = () -> 1 -function test_f_args(result, args...; kwargs...) 
- @test fetch(remotecall(args...; kwargs...)) == result - @test fetch(remotecall_wait(args...; kwargs...)) == result - @test remotecall_fetch(args...; kwargs...) == result +@testset "remotecall*()" begin + # Test various forms of remotecall* invocations - # A visual test - remote_do should NOT print any errors - remote_do(args...; kwargs...) -end + @everywhere f_args(v1, v2=0; kw1=0, kw2=0) = v1+v2+kw1+kw2 -for tid in [id_other, id_me, default_worker_pool()] - test_f_args(1, f_args, tid, 1) - test_f_args(3, f_args, tid, 1, 2) - test_f_args(5, f_args, tid, 1; kw1=4) - test_f_args(13, f_args, tid, 1; kw1=4, kw2=8) - test_f_args(15, f_args, tid, 1, 2; kw1=4, kw2=8) -end + function test_f_args(result, args...; kwargs...) + @test fetch(remotecall(args...; kwargs...)) == result + @test fetch(remotecall_wait(args...; kwargs...)) == result + @test remotecall_fetch(args...; kwargs...) == result -# Test remote_do -f=Future(id_me) -remote_do(fut->put!(fut, myid()), id_me, f) -@test fetch(f) == id_me + # A visual test - remote_do should NOT print any errors + remote_do(args...; kwargs...) + end -f=Future(id_other) -remote_do(fut->put!(fut, myid()), id_other, f) -@test fetch(f) == id_other + for tid in [id_other, id_me, default_worker_pool()] + test_f_args(1, f_args, tid, 1) + test_f_args(3, f_args, tid, 1, 2) + test_f_args(5, f_args, tid, 1; kw1=4) + test_f_args(13, f_args, tid, 1; kw1=4, kw2=8) + test_f_args(15, f_args, tid, 1, 2; kw1=4, kw2=8) + end -# Github issue #29932 -rc_unbuffered = RemoteChannel(()->Channel{Vector{Float64}}(0)) -@test eltype(rc_unbuffered) == Vector{Float64} + # Test remote_do + f=Future(id_me) + remote_do(fut->put!(fut, myid()), id_me, f) + @test fetch(f) == id_me -@async begin - # Trigger direct write (no buffering) of largish array - array_sz = Int(Base.SZ_UNBUFFERED_IO/8) + 1 - largev = zeros(array_sz) - for i in 1:10 - largev[1] = float(i) - put!(rc_unbuffered, largev) - end -end + f=Future(id_other) + remote_do(fut->put!(fut, myid()), id_other, f) + @test fetch(f) == id_other -@test remotecall_fetch(rc -> begin - for i in 1:10 - take!(rc)[1] != float(i) && error("Failed") - end - return :OK - end, id_other, rc_unbuffered) === :OK - -# github issue 33972 -rc_unbuffered_other = RemoteChannel(()->Channel{Int}(0), id_other) -close(rc_unbuffered_other) -try; take!(rc_unbuffered_other); catch; end -@test !remotecall_fetch(rc -> islocked(DistributedNext.lookup_ref(remoteref_id(rc)).synctake), - id_other, rc_unbuffered_other) - -# github PR #14456 -n = DoFullTest ? 
6 : 5 -for i = 1:10^n - fetch(@spawnat myid() myid()) -end + # Github issue #29932 + rc_unbuffered = RemoteChannel(()->Channel{Vector{Float64}}(0)) + @test eltype(rc_unbuffered) == Vector{Float64} -# issue #15451 -@test remotecall_fetch(x->(y->2y)(x)+1, workers()[1], 3) == 7 - -# issue #16091 -mutable struct T16091 end -wid = workers()[1] -try - remotecall_fetch(()->T16091, wid) - @test "unreachable" === true -catch ex - ex = ((ex::RemoteException).captured::CapturedException).ex - @test (ex::UndefVarError).var === :T16091 -end -try - remotecall_fetch(identity, wid, T16091) - @test "unreachable" === true -catch ex - ex = ((ex::RemoteException).captured::CapturedException).ex - @test (ex::UndefVarError).var === :T16091 -end + @async begin + # Trigger direct write (no buffering) of largish array + array_sz = Int(Base.SZ_UNBUFFERED_IO/8) + 1 + largev = zeros(array_sz) + for i in 1:10 + largev[1] = float(i) + put!(rc_unbuffered, largev) + end + end -f16091a() = 1 -remotecall_fetch(()->eval(:(f16091a() = 2)), wid) -@test remotecall_fetch(f16091a, wid) === 2 -@test remotecall_fetch((myid)->remotecall_fetch(f16091a, myid), wid, myid()) === 1 + @test remotecall_fetch(rc -> begin + for i in 1:10 + take!(rc)[1] != float(i) && error("Failed") + end + return :OK + end, id_other, rc_unbuffered) === :OK -# these will only heisen-fail, since it depends on the gensym counter collisions: -f16091b = () -> 1 -remotecall_fetch(()->eval(:(f16091b = () -> 2)), wid) -@test remotecall_fetch(f16091b, 2) === 1 -# Global anonymous functions are over-written... -@test remotecall_fetch((myid)->remotecall_fetch(f16091b, myid), wid, myid()) === 1 - -# ...while local anonymous functions are by definition, local. -let - f16091c = () -> 1 - @test remotecall_fetch(f16091c, 2) === 1 - @test remotecall_fetch( - myid -> begin - let - f16091c = () -> 2 - remotecall_fetch(f16091c, myid) - end - end, wid, myid()) === 2 -end + # github issue 33972 + rc_unbuffered_other = RemoteChannel(()->Channel{Int}(0), id_other) + close(rc_unbuffered_other) + try; take!(rc_unbuffered_other); catch; end + @test !remotecall_fetch(rc -> islocked(DistributedNext.lookup_ref(remoteref_id(rc)).synctake), + id_other, rc_unbuffered_other) -# issue #16451 -rng=RandomDevice() -retval = @distributed (+) for _ in 1:10 - rand(rng) -end -@test retval > 0.0 && retval < 10.0 + # github PR #14456 + n = DoFullTest ? 
6 : 5 + for i = 1:10^n + fetch(@spawnat myid() myid()) + end -rand(rng) -retval = @distributed (+) for _ in 1:10 - rand(rng) -end -@test retval > 0.0 && retval < 10.0 + # issue #15451 + @test remotecall_fetch(x->(y->2y)(x)+1, workers()[1], 3) == 7 -# serialization tests -wrkr1 = workers()[1] -wrkr2 = workers()[end] + # issue #16091 + mutable struct T16091 end + wid = workers()[1] + try + remotecall_fetch(()->T16091, wid) + @test "unreachable" === true + catch ex + ex = ((ex::RemoteException).captured::CapturedException).ex + @test (ex::UndefVarError).var === :T16091 + end + try + remotecall_fetch(identity, wid, T16091) + @test "unreachable" === true + catch ex + ex = ((ex::RemoteException).captured::CapturedException).ex + @test (ex::UndefVarError).var === :T16091 + end + + remotecall_fetch(()->eval(:(f16091a() = 2)), wid) + @test remotecall_fetch(f16091a, wid) === 2 + @test remotecall_fetch((myid)->remotecall_fetch(f16091a, myid), wid, myid()) === 1 + + # these will only heisen-fail, since it depends on the gensym counter collisions: + remotecall_fetch(()->eval(:(f16091b = () -> 2)), wid) + @test remotecall_fetch(f16091b, 2) === 1 + # Global anonymous functions are over-written... + @test remotecall_fetch((myid)->remotecall_fetch(f16091b, myid), wid, myid()) === 1 + + # ...while local anonymous functions are by definition, local. + let + f16091c = () -> 1 + @test remotecall_fetch(f16091c, 2) === 1 + @test remotecall_fetch( + myid -> begin + let + f16091c = () -> 2 + remotecall_fetch(f16091c, myid) + end + end, wid, myid()) === 2 + end -@test remotecall_fetch(p->remotecall_fetch(myid, p), wrkr1, wrkr2) == wrkr2 + # issue #16451 + rng=RandomDevice() + retval = @distributed (+) for _ in 1:10 + rand(rng) + end + @test retval > 0.0 && retval < 10.0 -# Send f to wrkr1 and wrkr2. Then try calling f on wrkr2 from wrkr1 -f_myid = ()->myid() -@test wrkr1 == remotecall_fetch(f_myid, wrkr1) -@test wrkr2 == remotecall_fetch(f_myid, wrkr2) -@test wrkr2 == remotecall_fetch((f, p)->remotecall_fetch(f, p), wrkr1, f_myid, wrkr2) + rand(rng) + retval = @distributed (+) for _ in 1:10 + rand(rng) + end + @test retval > 0.0 && retval < 10.0 -# Deserialization error recovery test -# locally defined module, but unavailable on workers -module LocalFoo - global foo=1 -end + # serialization tests + wrkr1 = workers()[1] + wrkr2 = workers()[end] -let - @test_throws RemoteException remotecall_fetch(()->LocalFoo.foo, 2) + @test remotecall_fetch(p->remotecall_fetch(myid, p), wrkr1, wrkr2) == wrkr2 - bad_thunk = ()->NonexistentModule.f() - @test_throws RemoteException remotecall_fetch(bad_thunk, 2) + # Send f to wrkr1 and wrkr2. 
Then try calling f on wrkr2 from wrkr1 + f_myid = ()->myid() + @test wrkr1 == remotecall_fetch(f_myid, wrkr1) + @test wrkr2 == remotecall_fetch(f_myid, wrkr2) + @test wrkr2 == remotecall_fetch((f, p)->remotecall_fetch(f, p), wrkr1, f_myid, wrkr2) - # Test that the stream is still usable - @test remotecall_fetch(()->:test,2) === :test - ref = remotecall(bad_thunk, 2) - @test_throws RemoteException fetch(ref) -end + # Deserialization error recovery test + # locally defined module, but unavailable on workers + let + @test_throws RemoteException remotecall_fetch(()->LocalFoo.foo, 2) -# Test calling @everywhere from a module not defined on the workers -module LocalBar - using DistributedNext - bar() = @everywhere new_bar()=myid() -end -LocalBar.bar() -for p in procs() - @test p == remotecall_fetch(new_bar, p) -end + bad_thunk = ()->NonexistentModule.f() + @test_throws RemoteException remotecall_fetch(bad_thunk, 2) -# @everywhere (remotecall_eval) behaviors (#22589) -let (p, p2) = filter!(p -> p != myid(), procs()) - @test (myid() + 1) == @everywhere myid() (myid() + 1) - @test (p * 2) == @everywhere p (myid() * 2) - @test 1 == @everywhere p defined_on_p = 1 - @test !@isdefined defined_on_p - @test !isdefined(Main, :defined_on_p) - @test remotecall_fetch(isdefined, p, Main, :defined_on_p) - @test !remotecall_fetch(isdefined, p2, Main, :defined_on_p) - @test nothing === @everywhere [p, p] defined_on_p += 1 - @test 3 === @everywhere p defined_on_p - let ref = Ref(0) - @test nothing === - @everywhere [myid(), p, myid(), myid(), p] begin - Test.@test Main === @__MODULE__ - $ref[] += 1 - end - @test ref[] == 3 + # Test that the stream is still usable + @test remotecall_fetch(()->:test,2) === :test + ref = remotecall(bad_thunk, 2) + @test_throws RemoteException fetch(ref) end - function test_throw_on(procs, msg) - try - @everywhere procs error($msg) - error("test failed to throw") - catch excpt - if procs isa Int - ex = Any[excpt] - else - ex = (excpt::CompositeException).exceptions - end - for (p, ex) in zip(procs, ex) - local p - if procs isa Int || p != myid() - @test (ex::RemoteException).pid == p - ex = ((ex::RemoteException).captured::CapturedException).ex + + # Test calling @everywhere from a module not defined on the workers + LocalBar.bar() + for p in procs() + @test p == remotecall_fetch(new_bar, p) + end + + # @everywhere (remotecall_eval) behaviors (#22589) + let (p, p2) = filter!(p -> p != myid(), procs()) + @test (myid() + 1) == @everywhere myid() (myid() + 1) + @test (p * 2) == @everywhere p (myid() * 2) + @test 1 == @everywhere p defined_on_p = 1 + @test !@isdefined defined_on_p + @test !isdefined(Main, :defined_on_p) + @test remotecall_fetch(isdefined, p, Main, :defined_on_p) + @test !remotecall_fetch(isdefined, p2, Main, :defined_on_p) + @test nothing === @everywhere [p, p] defined_on_p += 1 + @test 3 === @everywhere p defined_on_p + let ref = Ref(0) + @test nothing === + @everywhere [myid(), p, myid(), myid(), p] begin + Test.@test Main === @__MODULE__ + $ref[] += 1 + end + @test ref[] == 3 + end + function test_throw_on(procs, msg) + try + @everywhere procs error($msg) + error("test failed to throw") + catch excpt + if procs isa Int + ex = Any[excpt] else - ex = (ex::TaskFailedException).task.exception + ex = (excpt::CompositeException).exceptions + end + for (p, ex) in zip(procs, ex) + local p + if procs isa Int || p != myid() + @test (ex::RemoteException).pid == p + ex = ((ex::RemoteException).captured::CapturedException).ex + else + ex = (ex::TaskFailedException).task.exception 
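+                        # (for myid() inside a pid vector, @everywhere runs the
+                        # local part in a task, so the failure arrives as a
+                        # TaskFailedException rather than a RemoteException)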
+ end + @test (ex::ErrorException).msg == msg end - @test (ex::ErrorException).msg == msg end end + test_throw_on(p, "everywhere on p") + test_throw_on(myid(), "everywhere on myid") + test_throw_on([p, myid()], "everywhere on myid and p") + test_throw_on([p2, p], "everywhere on p and p2") end - test_throw_on(p, "everywhere on p") - test_throw_on(myid(), "everywhere on myid") - test_throw_on([p, myid()], "everywhere on myid and p") - test_throw_on([p2, p], "everywhere on p and p2") end -# Test addprocs enable_threaded_blas parameter - -function get_remote_num_threads(processes_added) - return [remotecall_fetch(BLAS.get_num_threads, proc_id) for proc_id in processes_added] -end +@testset "addprocs enable_threaded_blas parameter" begin + function get_remote_num_threads(processes_added) + return [remotecall_fetch(BLAS.get_num_threads, proc_id) for proc_id in processes_added] + end -function test_blas_config(pid, expected) - for worker in DistributedNext.PGRP.workers - if worker.id == pid - @test worker.config.enable_threaded_blas == expected - return + function test_blas_config(pid, expected) + for worker in DistributedNext.PGRP.workers + if worker.id == pid + @test worker.config.enable_threaded_blas == expected + return + end end end -end -function test_add_procs_threaded_blas() - master_blas_thread_count = BLAS.get_num_threads() - if master_blas_thread_count === nothing - @warn "Skipping blas num threads tests due to unsupported blas version" - return - end + function test_add_procs_threaded_blas() + master_blas_thread_count = BLAS.get_num_threads() + if master_blas_thread_count === nothing + @warn "Skipping blas num threads tests due to unsupported blas version" + return + end - # Test with default enable_threaded_blas false - processes_added = addprocs_with_testenv(2) - for proc_id in processes_added - test_blas_config(proc_id, false) - end + # Test with default enable_threaded_blas false + processes_added = addprocs_with_testenv(2) + for proc_id in processes_added + test_blas_config(proc_id, false) + end - # Master thread should not have changed - @test BLAS.get_num_threads() == master_blas_thread_count + # Master thread should not have changed + @test BLAS.get_num_threads() == master_blas_thread_count - # Threading disabled in children by default - thread_counts_by_process = get_remote_num_threads(processes_added) - for thread_count in thread_counts_by_process - @test thread_count == 1 - end - rmprocs(processes_added) + # Threading disabled in children by default + thread_counts_by_process = get_remote_num_threads(processes_added) + for thread_count in thread_counts_by_process + @test thread_count == 1 + end + rmprocs(processes_added) - processes_added = addprocs_with_testenv(2, enable_threaded_blas=true) - for proc_id in processes_added - test_blas_config(proc_id, true) - end + processes_added = addprocs_with_testenv(2, enable_threaded_blas=true) + for proc_id in processes_added + test_blas_config(proc_id, true) + end - @test BLAS.get_num_threads() == master_blas_thread_count + @test BLAS.get_num_threads() == master_blas_thread_count - # BLAS.set_num_threads(`num`) doesn't cause BLAS.get_num_threads to return `num` - # depending on the machine, the BLAS version, and BLAS configuration, so - # we need a very lenient test. 
- thread_counts_by_process = get_remote_num_threads(processes_added) - for thread_count in thread_counts_by_process - @test thread_count >= 1 + # BLAS.set_num_threads(`num`) doesn't cause BLAS.get_num_threads to return `num` + # depending on the machine, the BLAS version, and BLAS configuration, so + # we need a very lenient test. + thread_counts_by_process = get_remote_num_threads(processes_added) + for thread_count in thread_counts_by_process + @test thread_count >= 1 + end + rmprocs(processes_added) end - rmprocs(processes_added) -end -test_add_procs_threaded_blas() - -#19687 -if false ### TODO: The logic that is supposed to implement this is racy - Disabled for now -# ensure no race conditions between rmprocs and addprocs -for i in 1:5 - p = addprocs_with_testenv(1)[1] - @spawnat p sleep(5) - rmprocs(p; waitfor=0) + test_add_procs_threaded_blas() end -# Test if a wait has been called on rmprocs(...;waitfor=0), further remotecalls -# don't throw errors. -for i in 1:5 - p = addprocs_with_testenv(1)[1] - np = nprocs() - @spawnat p sleep(5) - Base.wait(rmprocs(p; waitfor=0)) - for pid in procs() - @test pid == remotecall_fetch(myid, pid) - end - @test nprocs() == np - 1 -end +@testset "addprocs()/rmprocs()" begin + #19687 + if false ### TODO: The logic that is supposed to implement this is racy - Disabled for now + # ensure no race conditions between rmprocs and addprocs + for i in 1:5 + p = addprocs_with_testenv(1)[1] + @spawnat p sleep(5) + rmprocs(p; waitfor=0) + end -# Test that an exception is thrown if workers are unable to be removed within requested time. -if DoFullTest - pids=addprocs_with_testenv(4); - @test_throws ErrorException rmprocs(pids; waitfor=0.001); - # wait for workers to be removed - while any(in(procs()), pids) - sleep(0.1) + # Test if a wait has been called on rmprocs(...;waitfor=0), further remotecalls + # don't throw errors. + for i in 1:5 + p = addprocs_with_testenv(1)[1] + np = nprocs() + @spawnat p sleep(5) + Base.wait(rmprocs(p; waitfor=0)) + for pid in procs() + @test pid == remotecall_fetch(myid, pid) + end + @test nprocs() == np - 1 + end + + # Test that an exception is thrown if workers are unable to be removed within requested time. 
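+    # (only exercised under DoFullTest: it launches four extra workers and then
+    # polls until they are fully removed)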
+ if DoFullTest + pids=addprocs_with_testenv(4); + @test_throws ErrorException rmprocs(pids; waitfor=0.001); + # wait for workers to be removed + while any(in(procs()), pids) + sleep(0.1) + end + end end -end -end -# Test addprocs/rmprocs from master node only -for f in [ ()->addprocs(1; exeflags=test_exeflags), ()->rmprocs(workers()) ] - local f - try - remotecall_fetch(f, id_other) - error("Unexpected") - catch ex - @test isa(ex, RemoteException) - @test ex.captured.ex.msg == "Only process 1 can add and remove workers" + # Test addprocs/rmprocs from master node only + for f in [ ()->addprocs(1; exeflags=test_exeflags), ()->rmprocs(workers()) ] + local f + try + remotecall_fetch(f, id_other) + error("Unexpected") + catch ex + @test isa(ex, RemoteException) + @test ex.captured.ex.msg == "Only process 1 can add and remove workers" + end end -end -# Test the following addprocs error conditions -# - invalid host name - github issue #20372 -# - julia exe exiting with an error -# - timeout reading host:port from worker stdout -# - host:port not found in worker stdout in the first 1000 lines + # Test the following addprocs error conditions + # - invalid host name - github issue #20372 + # - julia exe exiting with an error + # - timeout reading host:port from worker stdout + # - host:port not found in worker stdout in the first 1000 lines -struct ErrorSimulator <: ClusterManager - mode -end + struct ErrorSimulator <: ClusterManager + mode + end -function launch(manager::ErrorSimulator, params::Dict, launched::Array, c::Condition) - exename = params[:exename] - dir = params[:dir] + function DistributedNext.launch(manager::ErrorSimulator, params::Dict, launched::Array, c::Condition) + exename = params[:exename] + dir = params[:dir] - cmd = `$(Base.julia_cmd(exename)) --startup-file=no` - if manager.mode === :timeout - cmd = `$cmd -e "sleep(10)"` - elseif manager.mode === :ntries - cmd = `$cmd -e "[println(x) for x in 1:1001]"` - elseif manager.mode === :exit - cmd = `$cmd -e "exit(-1)"` - else - error("Unknown mode") - end - io = open(detach(setenv(cmd, dir=dir))) + cmd = `$(Base.julia_cmd(exename)) --startup-file=no` + if manager.mode === :timeout + cmd = `$cmd -e "sleep(10)"` + elseif manager.mode === :ntries + cmd = `$cmd -e "[println(x) for x in 1:1001]"` + elseif manager.mode === :exit + cmd = `$cmd -e "exit(-1)"` + else + error("Unknown mode") + end + io = open(detach(setenv(cmd, dir=dir))) - wconfig = WorkerConfig() - wconfig.process = io - wconfig.io = io.out - push!(launched, wconfig) - notify(c) -end + wconfig = WorkerConfig() + wconfig.process = io + wconfig.io = io.out + push!(launched, wconfig) + notify(c) + end -testruns = Any[] + testruns = Any[] -if DoFullTest - append!(testruns, [(()->addprocs_with_testenv(["errorhost20372"]), "Unable to read host:port string from worker. Launch command exited with error?", ())]) -end + if DoFullTest + append!(testruns, [(()->addprocs_with_testenv(["errorhost20372"]), "Unable to read host:port string from worker. Launch command exited with error?", ())]) + end -append!(testruns, [ - (()->addprocs_with_testenv(ErrorSimulator(:exit)), "Unable to read host:port string from worker. Launch command exited with error?", ()), - (()->addprocs_with_testenv(ErrorSimulator(:ntries)), "Unexpected output from worker launch command. 
Host:port string not found.", ()), - (()->addprocs_with_testenv(ErrorSimulator(:timeout)), "Timed out waiting to read host:port string from worker.", ("JULIA_WORKER_TIMEOUT"=>"1",)) -]) + append!(testruns, [ + (()->addprocs_with_testenv(ErrorSimulator(:exit)), "Unable to read host:port string from worker. Launch command exited with error?", ()), + (()->addprocs_with_testenv(ErrorSimulator(:ntries)), "Unexpected output from worker launch command. Host:port string not found.", ()), + (()->addprocs_with_testenv(ErrorSimulator(:timeout)), "Timed out waiting to read host:port string from worker.", ("JULIA_WORKER_TIMEOUT"=>"1",)) + ]) -for (addp_testf, expected_errstr, env) in testruns - old_stdout = stdout - stdout_out, stdout_in = redirect_stdout() - stdout_txt = @async filter!(readlines(stdout_out)) do s + for (addp_testf, expected_errstr, env) in testruns + old_stdout = stdout + stdout_out, stdout_in = redirect_stdout() + stdout_txt = @async filter!(readlines(stdout_out)) do s return !startswith(s, "\tFrom worker startup:\t") end - try - withenv(env...) do - addp_testf() + try + withenv(env...) do + addp_testf() + end + error("Unexpected") + catch ex + redirect_stdout(old_stdout) + close(stdout_in) + @test isempty(fetch(stdout_txt)) + @test isa(ex, CompositeException) + @test ex.exceptions[1].task.exception.msg == expected_errstr end - error("Unexpected") - catch ex - redirect_stdout(old_stdout) - close(stdout_in) - @test isempty(fetch(stdout_txt)) - @test isa(ex, CompositeException) - @test ex.exceptions[1].task.exception.msg == expected_errstr end end +module FooModLocal end +const c1 = fill(1., 10) -# Auto serialization of globals from Main. -# bitstypes -global v1 = 1 -@test remotecall_fetch(()->v1, id_other) == v1 -@test remotecall_fetch(()->isdefined(Main, :v1), id_other) -for i in 2:5 - global v1 = i - @test remotecall_fetch(()->v1, id_other) == i -end +@testset "Serialization/deserialization" begin + # Auto serialization of globals from Main. 
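+    # A minimal sketch of the mechanism under test, using a hypothetical
+    # binding `v0_demo` (`id_other` is defined earlier in this file): a `Main`
+    # global referenced from a shipped closure is serialized along with it and
+    # installed under `Main` on the worker.
+    global v0_demo = 42
+    @test remotecall_fetch(() -> v0_demo, id_other) == 42
+    @test remotecall_fetch(() -> isdefined(Main, :v0_demo), id_other)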
+ # bitstypes + global v1 = 1 + @test remotecall_fetch(()->v1, id_other) == v1 + @test remotecall_fetch(()->isdefined(Main, :v1), id_other) + for i in 2:5 + global v1 = i + @test remotecall_fetch(()->v1, id_other) == i + end -# non-bitstypes -global v2 = zeros(10) -for i in 1:5 - v2[i] = i - @test remotecall_fetch(()->v2, id_other) == v2 -end + # non-bitstypes + global v2 = zeros(10) + for i in 1:5 + v2[i] = i + @test remotecall_fetch(()->v2, id_other) == v2 + end -# Different global bindings to the same object -global v3 = fill(1., 10) -global v4 = v3 -@test remotecall_fetch(()->v3, id_other) == remotecall_fetch(()->v4, id_other) -@test remotecall_fetch(()->isdefined(Main, :v3), id_other) -@test remotecall_fetch(()->isdefined(Main, :v4), id_other) + # Different global bindings to the same object + global v3 = fill(1., 10) + global v4 = v3 + @test remotecall_fetch(()->v3, id_other) == remotecall_fetch(()->v4, id_other) + @test remotecall_fetch(()->isdefined(Main, :v3), id_other) + @test remotecall_fetch(()->isdefined(Main, :v4), id_other) -# Global references to Types and Modules should work if they are locally defined -global v5 = Int -global v6 = DistributedNext -@test remotecall_fetch(()->v5, id_other) === Int -@test remotecall_fetch(()->v6, id_other) === DistributedNext + # Global references to Types and Modules should work if they are locally defined + global v5 = Int + global v6 = DistributedNext + @test remotecall_fetch(()->v5, id_other) === Int + @test remotecall_fetch(()->v6, id_other) === DistributedNext -struct FooStructLocal end -module FooModLocal end -v5 = FooStructLocal -v6 = FooModLocal -@test_throws RemoteException remotecall_fetch(()->v5, id_other) -@test_throws RemoteException remotecall_fetch(()->v6, id_other) + struct FooStructLocal end + v5 = FooStructLocal + v6 = FooModLocal + @test_throws RemoteException remotecall_fetch(()->v5, id_other) + @test_throws RemoteException remotecall_fetch(()->v6, id_other) -@everywhere struct FooStructEverywhere end -@everywhere module FooModEverywhere end -v5 = FooStructEverywhere -v6 = FooModEverywhere -@test remotecall_fetch(()->v5, id_other) === FooStructEverywhere -@test remotecall_fetch(()->v6, id_other) === FooModEverywhere + @everywhere struct FooStructEverywhere end + @everywhere module FooModEverywhere end + v5 = FooStructEverywhere + v6 = FooModEverywhere + @test remotecall_fetch(()->v5, id_other) === FooStructEverywhere + @test remotecall_fetch(()->v6, id_other) === FooModEverywhere -# hash value same but different object instance -v7 = ones(10) -oid1 = objectid(v7) -hval1 = hash(v7) -@test v7 == @fetchfrom id_other v7 -remote_oid1 = @fetchfrom id_other objectid(v7) + # hash value same but different object instance + v7 = ones(10) + oid1 = objectid(v7) + hval1 = hash(v7) + @test v7 == @fetchfrom id_other v7 + remote_oid1 = @fetchfrom id_other objectid(v7) -v7 = ones(10) -@test oid1 != objectid(v7) -@test hval1 == hash(v7) -@test remote_oid1 != @fetchfrom id_other objectid(v7) + v7 = ones(10) + @test oid1 != objectid(v7) + @test hval1 == hash(v7) + @test remote_oid1 != @fetchfrom id_other objectid(v7) -# Github issue #31252 -v31252 = :a -@test :a == @fetchfrom id_other v31252 + # Github issue #31252 + v31252 = :a + @test :a == @fetchfrom id_other v31252 -v31252 = :b -@test :b == @fetchfrom id_other v31252 + v31252 = :b + @test :b == @fetchfrom id_other v31252 -v31252 = :a -@test :a == @fetchfrom id_other v31252 + v31252 = :a + @test :a == @fetchfrom id_other v31252 -# Test that a global is not being repeatedly serialized 
when -# a) referenced multiple times in the closure -# b) hash value has not changed. + # Test that a global is not being repeatedly serialized when + # a) referenced multiple times in the closure + # b) hash value has not changed. -@everywhere begin - using Serialization - global testsercnt_d = Dict() - mutable struct TestSerCnt - v - end - import Base.hash, Base.== - hash(x::TestSerCnt, h::UInt) = hash(hash(x.v), h) - ==(x1::TestSerCnt, x2::TestSerCnt) = (x1.v == x2.v) + @everywhere begin + using Serialization + global testsercnt_d = Dict() + mutable struct TestSerCnt + v + end + import Base.hash, Base.== + hash(x::TestSerCnt, h::UInt) = hash(hash(x.v), h) + ==(x1::TestSerCnt, x2::TestSerCnt) = (x1.v == x2.v) + + function Serialization.serialize(s::AbstractSerializer, t::TestSerCnt) + Serialization.serialize_type(s, TestSerCnt) + serialize(s, t.v) + global testsercnt_d + cnt = get!(testsercnt_d, objectid(t), 0) + testsercnt_d[objectid(t)] = cnt+1 + end - function Serialization.serialize(s::AbstractSerializer, t::TestSerCnt) - Serialization.serialize_type(s, TestSerCnt) - serialize(s, t.v) - global testsercnt_d - cnt = get!(testsercnt_d, objectid(t), 0) - testsercnt_d[objectid(t)] = cnt+1 + Serialization.deserialize(s::AbstractSerializer, ::Type{TestSerCnt}) = TestSerCnt(deserialize(s)) end - Serialization.deserialize(s::AbstractSerializer, ::Type{TestSerCnt}) = TestSerCnt(deserialize(s)) -end - -# hash value of tsc is not changed -global tsc = TestSerCnt(zeros(10)) -for i in 1:5 - remotecall_fetch(()->tsc, id_other) -end -# should have been serialized only once -@test testsercnt_d[objectid(tsc)] == 1 - -# hash values are changed -n=5 -testsercnt_d[objectid(tsc)] = 0 -for i in 1:n - tsc.v[i] = i - remotecall_fetch(()->tsc, id_other) -end -# should have been serialized as many times as the loop -@test testsercnt_d[objectid(tsc)] == n + # hash value of tsc is not changed + global tsc = TestSerCnt(zeros(10)) + for i in 1:5 + remotecall_fetch(()->tsc, id_other) + end + # should have been serialized only once + @test testsercnt_d[objectid(tsc)] == 1 -# Multiple references in a closure should be serialized only once. -global mrefs = TestSerCnt(fill(1.,10)) -@test remotecall_fetch(()->(mrefs.v, 2*mrefs.v, 3*mrefs.v), id_other) == (fill(1.,10), fill(2.,10), fill(3.,10)) -@test testsercnt_d[objectid(mrefs)] == 1 + # hash values are changed + n=5 + testsercnt_d[objectid(tsc)] = 0 + for i in 1:n + tsc.v[i] = i + remotecall_fetch(()->tsc, id_other) + end + # should have been serialized as many times as the loop + @test testsercnt_d[objectid(tsc)] == n + # Multiple references in a closure should be serialized only once. 
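+    # (the closure below references `mrefs` three times, but since its hash is
+    # unchanged the counting serializer above should fire exactly once)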
+ global mrefs = TestSerCnt(fill(1.,10)) + @test remotecall_fetch(()->(mrefs.v, 2*mrefs.v, 3*mrefs.v), id_other) == (fill(1.,10), fill(2.,10), fill(3.,10)) + @test testsercnt_d[objectid(mrefs)] == 1 -# nested anon functions -global f1 = x->x -global f2 = x->f1(x) -v = rand() -@test remotecall_fetch(f2, id_other, v) == v -@test remotecall_fetch(x->f2(x), id_other, v) == v -# consts -const c1 = fill(1., 10) -@test remotecall_fetch(()->c1, id_other) == c1 -@test remotecall_fetch(()->isconst(Main, :c1), id_other) + # nested anon functions + global f1 = x->x + global f2 = x->f1(x) + v = rand() + @test remotecall_fetch(f2, id_other, v) == v + @test remotecall_fetch(x->f2(x), id_other, v) == v + + # consts + @test remotecall_fetch(()->c1, id_other) == c1 + @test remotecall_fetch(()->isconst(Main, :c1), id_other) + + # Test same calls with local vars + function wrapped_var_ser_tests() + # bitstypes + local lv1 = 1 + @test remotecall_fetch(()->lv1, id_other) == lv1 + @test !remotecall_fetch(()->isdefined(Main, :lv1), id_other) + for i in 2:5 + lv1 = i + @test remotecall_fetch(()->lv1, id_other) == i + end -# Test same calls with local vars -function wrapped_var_ser_tests() - # bitstypes - local lv1 = 1 - @test remotecall_fetch(()->lv1, id_other) == lv1 - @test !remotecall_fetch(()->isdefined(Main, :lv1), id_other) - for i in 2:5 - lv1 = i - @test remotecall_fetch(()->lv1, id_other) == i - end + # non-bitstypes + local lv2 = zeros(10) + for i in 1:5 + lv2[i] = i + @test remotecall_fetch(()->lv2, id_other) == lv2 + end - # non-bitstypes - local lv2 = zeros(10) - for i in 1:5 - lv2[i] = i - @test remotecall_fetch(()->lv2, id_other) == lv2 + # nested anon functions + local lf1 = x->x + local lf2 = x->lf1(x) + v = rand() + @test remotecall_fetch(lf2, id_other, v) == v + @test remotecall_fetch(x->lf2(x), id_other, v) == v end - # nested anon functions - local lf1 = x->x - local lf2 = x->lf1(x) - v = rand() - @test remotecall_fetch(lf2, id_other, v) == v - @test remotecall_fetch(x->lf2(x), id_other, v) == v -end - -wrapped_var_ser_tests() + wrapped_var_ser_tests() -# Test internal data structures being cleaned up upon gc. -global ids_cleanup = fill(1., 6) -global ids_func = ()->ids_cleanup + # Test internal data structures being cleaned up upon gc. 
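+    # (shipping `ids_func`, which captures `ids_cleanup`, registers the global
+    # in this connection's ClusterSerializer; the TODO below notes the
+    # still-missing check that the entry is dropped once the objects are
+    # collected)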
+ global ids_cleanup = fill(1., 6) + global ids_func = ()->ids_cleanup -clust_ser = (DistributedNext.worker_from_id(id_other)).w_serializer -@test remotecall_fetch(ids_func, id_other) == ids_cleanup + clust_ser = (DistributedNext.worker_from_id(id_other)).w_serializer + @test remotecall_fetch(ids_func, id_other) == ids_cleanup -# TODO Add test for cleanup from `clust_ser.glbs_in_tnobj` + # TODO Add test for cleanup from `clust_ser.glbs_in_tnobj` +end -# reported github issues - Mostly tests with globals and various distributed macros -#2669, #5390 v2669=10 -@test fetch(@spawnat :any (1+v2669)) == 11 -#12367 -refs = [] -if true - n = 10 - for p in procs() - push!(refs, @spawnat p begin - @sync for i in 1:n - nothing - end - end) +@testset "More various individual issues" begin + # reported github issues - Mostly tests with globals and various distributed macros + #2669, #5390 + @test fetch(@spawnat :any (1+v2669)) == 11 + + #12367 + refs = [] + if true + n = 10 + for p in procs() + push!(refs, @spawnat p begin + @sync for i in 1:n + nothing + end + end) + end end -end -foreach(wait, refs) + foreach(wait, refs) -#6760 -if true - a = 2 - x = @distributed (vcat) for k=1:2 - sin(a) + #6760 + if true + a = 2 + x = @distributed (vcat) for k=1:2 + sin(a) + end end -end -@test x == map(_->sin(2), 1:2) - -let thrown = false - try - remotecall_fetch(sqrt, 2, -1) - catch e - thrown = true - local b = IOBuffer() - showerror(b, e) - @test occursin("sqrt was called with a negative real argument", String(take!(b))) - end - @test thrown -end + @test x == map(_->sin(2), 1:2) -# issue #34333 -let - @test fetch(remotecall(Float64, id_other, 1)) == Float64(1) - @test fetch(remotecall_wait(Float64, id_other, 1)) == Float64(1) - @test remotecall_fetch(Float64, id_other, 1) == Float64(1) -end - -#19463 -function foo19463() - w1 = workers()[1] - w2 = workers()[2] - w3 = workers()[3] - - b1 = () -> 1 - b2 = () -> fetch(@spawnat w1 b1()) + 1 - b3 = () -> fetch(@spawnat w2 b2()) + 1 - b4 = () -> fetch(@spawnat w3 b3()) + 1 - b4() -end -@test foo19463() == 4 - -# Testing clear! -function setup_syms(n, pids) - syms = [] - for i in 1:n - symstr = string("clrtest", randstring()) - sym = Symbol(symstr) - eval(:(global $sym = rand())) - for p in pids - eval(:(@test $sym == remotecall_fetch(()->$sym, $p))) - eval(:(@test remotecall_fetch(isdefined, $p, Main, Symbol($symstr)))) + let thrown = false + try + remotecall_fetch(sqrt, 2, -1) + catch e + thrown = true + local b = IOBuffer() + showerror(b, e) + @test occursin("sqrt was called with a negative real argument", String(take!(b))) + end + @test thrown + end + + # issue #34333 + let + @test fetch(remotecall(Float64, id_other, 1)) == Float64(1) + @test fetch(remotecall_wait(Float64, id_other, 1)) == Float64(1) + @test remotecall_fetch(Float64, id_other, 1) == Float64(1) + end + + #19463 + function foo19463() + w1 = workers()[1] + w2 = workers()[2] + w3 = workers()[3] + + b1 = () -> 1 + b2 = () -> fetch(@spawnat w1 b1()) + 1 + b3 = () -> fetch(@spawnat w2 b2()) + 1 + b4 = () -> fetch(@spawnat w3 b3()) + 1 + b4() + end + @test foo19463() == 4 +end + +@testset "clear!()" begin + # Testing clear! 
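+    # A minimal sketch of the semantics exercised by the helpers below, using a
+    # hypothetical binding `clr_demo` (`id_other` is defined earlier in this
+    # file); `getfield` is used for the final check so the closure does not
+    # re-ship the local value.
+    global clr_demo = rand()
+    @test remotecall_fetch(() -> clr_demo, id_other) == clr_demo  # ships the binding
+    clear!(:clr_demo, id_other)
+    @test remotecall_fetch(() -> getfield(Main, :clr_demo), id_other) === nothing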
+ function setup_syms(n, pids) + syms = [] + for i in 1:n + symstr = string("clrtest", randstring()) + sym = Symbol(symstr) + eval(:(global $sym = rand())) + for p in pids + eval(:(@test $sym == remotecall_fetch(()->$sym, $p))) + eval(:(@test remotecall_fetch(isdefined, $p, Main, Symbol($symstr)))) + end + push!(syms, sym) end - push!(syms, sym) + syms end - syms -end -function test_clear(syms, pids) - for p in pids - for sym in syms - remote_val = remotecall_fetch(()->getfield(Main, sym), p) - @test remote_val === nothing - @test remote_val != getfield(Main, sym) + function test_clear(syms, pids) + for p in pids + for sym in syms + remote_val = remotecall_fetch(()->getfield(Main, sym), p) + @test remote_val === nothing + @test remote_val != getfield(Main, sym) + end end end -end -syms = setup_syms(1, [id_other]) -clear!(syms[1], id_other) -test_clear(syms, [id_other]) + syms = setup_syms(1, [id_other]) + clear!(syms[1], id_other) + test_clear(syms, [id_other]) -syms = setup_syms(1, workers()) -clear!(syms[1], workers()) -test_clear(syms, workers()) + syms = setup_syms(1, workers()) + clear!(syms[1], workers()) + test_clear(syms, workers()) -syms = setup_syms(3, [id_other]) -clear!(syms, id_other) -test_clear(syms, [id_other]) + syms = setup_syms(3, [id_other]) + clear!(syms, id_other) + test_clear(syms, [id_other]) -syms = setup_syms(3, workers()) -clear!(syms, workers()) -test_clear(syms, workers()) - -# Test partial recovery from a deserialization error in CapturedException -try - expr = quote - mutable struct DontExistOn1 - x - end - throw(BoundsError(DontExistOn1(1), 1)) - end - - remotecall_fetch(()->eval(expr), id_other) - error("unexpected") -catch ex - @test isa(ex.captured.ex.exceptions[1].ex, ErrorException) - @test occursin("BoundsError", ex.captured.ex.exceptions[1].ex.msg) - ex = ex.captured.ex.exceptions[2].ex - @test (ex::UndefVarError).var === :DontExistOn1 + syms = setup_syms(3, workers()) + clear!(syms, workers()) + test_clear(syms, workers()) end -let - # creates a new worker in a different folder and tries to include file - tmp_dir = mktempdir() - tmp_dir2 = joinpath(tmp_dir, "2") - tmp_file = joinpath(tmp_dir2, "testfile") - tmp_file2 = joinpath(tmp_dir2, "testfile2") - proc = addprocs_with_testenv(1, dir=tmp_dir) +@testset "Deserialization error recovery and include()" begin + # Test partial recovery from a deserialization error in CapturedException try - mkdir(tmp_dir2) - write(tmp_file, "23.32 + 32 + myid() + include(\"testfile2\")") - write(tmp_file2, "myid() * 2") - function test_include_fails_to_open_file(fname) - try - include(fname) - catch exc - path = joinpath(@__DIR__, fname) - @test exc isa SystemError - @test exc.prefix == "opening file $(repr(path))" + expr = quote + mutable struct DontExistOn1 + x end + throw(BoundsError(DontExistOn1(1), 1)) + end + + remotecall_fetch(()->eval(expr), id_other) + error("unexpected") + catch ex + @test isa(ex.captured.ex.exceptions[1].ex, ErrorException) + @test occursin("BoundsError", ex.captured.ex.exceptions[1].ex.msg) + ex = ex.captured.ex.exceptions[2].ex + @test (ex::UndefVarError).var === :DontExistOn1 + end + + let + # creates a new worker in a different folder and tries to include file + tmp_dir = mktempdir() + tmp_dir2 = joinpath(tmp_dir, "2") + tmp_file = joinpath(tmp_dir2, "testfile") + tmp_file2 = joinpath(tmp_dir2, "testfile2") + proc = addprocs_with_testenv(1, dir=tmp_dir) + try + mkdir(tmp_dir2) + write(tmp_file, "23.32 + 32 + myid() + include(\"testfile2\")") + write(tmp_file2, "myid() * 2") + function 
test_include_fails_to_open_file(fname) + try + include(fname) + catch exc + path = joinpath(@__DIR__, fname) + @test exc isa SystemError + @test exc.prefix == "opening file $(repr(path))" + end + end + test_include_fails_to_open_file("testfile") + test_include_fails_to_open_file("testfile2") + test_include_fails_to_open_file(joinpath("2", "testfile2")) + @test include(tmp_file) == 58.32 + @test remotecall_fetch(include, proc[1], joinpath("2", "testfile")) == 55.32 + proc[1] * 3 + finally + rmprocs(proc) + rm(tmp_file, force=true) + rm(tmp_file2, force=true) + rm(tmp_dir2, force=true) + #rm(tmp_dir, force=true) end - test_include_fails_to_open_file("testfile") - test_include_fails_to_open_file("testfile2") - test_include_fails_to_open_file(joinpath("2", "testfile2")) - @test include(tmp_file) == 58.32 - @test remotecall_fetch(include, proc[1], joinpath("2", "testfile")) == 55.32 + proc[1] * 3 - finally - rmprocs(proc) - rm(tmp_file, force=true) - rm(tmp_file2, force=true) - rm(tmp_dir2, force=true) - #rm(tmp_dir, force=true) end end + # cookie and command line option `--worker` tests. remove workers, set cookie and test struct WorkerArgTester <: ClusterManager worker_opt @@ -1542,8 +1582,8 @@ manage(::WorkerArgTester, ::Integer, ::WorkerConfig, ::Symbol) = nothing nprocs()>1 && rmprocs(workers()) -## These tests are disabled because DistributedNext has no way of supporting the -## --worker argument. +# These tests are disabled because DistributedNext has no way of supporting the +# --worker argument. # npids = addprocs_with_testenv(WorkerArgTester(`--worker`, true)) # @test remotecall_fetch(myid, npids[1]) == npids[1] # rmprocs(npids) @@ -1557,111 +1597,116 @@ nprocs()>1 && rmprocs(workers()) # npids = addprocs_with_testenv(WorkerArgTester(`--worker=foobar`, false)) # @test remotecall_fetch(myid, npids[1]) == npids[1] -# tests for start_worker options to retain stdio (issue #31035) -struct RetainStdioTester <: ClusterManager - close_stdin::Bool - stderr_to_stdout::Bool -end - -function launch(manager::RetainStdioTester, params::Dict, launched::Array, c::Condition) - dir = params[:dir] - exename = params[:exename] - exeflags = params[:exeflags] +@testset "start_worker options to retain stdio (issue #31035)" begin + struct RetainStdioTester <: ClusterManager + close_stdin::Bool + stderr_to_stdout::Bool + end - jlcmd = "using DistributedNext; start_worker(\"\"; close_stdin=$(manager.close_stdin), stderr_to_stdout=$(manager.stderr_to_stdout));" - cmd = detach(setenv(`$exename $exeflags --bind-to $(DistributedNext.LPROC.bind_addr) -e $jlcmd`, dir=dir)) - proc = open(cmd, "r+") + function DistributedNext.launch(manager::RetainStdioTester, params::Dict, launched::Array, c::Condition) + dir = params[:dir] + exename = params[:exename] + exeflags = params[:exeflags] - wconfig = WorkerConfig() - wconfig.process = proc - wconfig.io = proc.out - push!(launched, wconfig) + jlcmd = "using DistributedNext; start_worker(\"\"; close_stdin=$(manager.close_stdin), stderr_to_stdout=$(manager.stderr_to_stdout));" + cmd = detach(setenv(`$exename $exeflags --bind-to $(DistributedNext.LPROC.bind_addr) -e $jlcmd`, dir=dir)) + proc = open(cmd, "r+") - notify(c) -end -manage(::RetainStdioTester, ::Integer, ::WorkerConfig, ::Symbol) = nothing + wconfig = WorkerConfig() + wconfig.process = proc + wconfig.io = proc.out + push!(launched, wconfig) + notify(c) + end + DistributedNext.manage(::RetainStdioTester, ::Integer, ::WorkerConfig, ::Symbol) = nothing -nprocs()>1 && rmprocs(workers()) -cluster_cookie("") -for 
close_stdin in (true, false), stderr_to_stdout in (true, false) - local npids = addprocs_with_testenv(RetainStdioTester(close_stdin,stderr_to_stdout)) - @test remotecall_fetch(myid, npids[1]) == npids[1] - if close_stdin - @test remotecall_fetch(()->stdin === devnull && !isreadable(stdin), npids[1]) - else - @test remotecall_fetch(()->stdin !== devnull && isopen(stdin) && isreadable(stdin), npids[1]) - end - @test stderr_to_stdout == remotecall_fetch(()->(stderr === stdout), npids[1]) - rmprocs(npids) -end + nprocs()>1 && rmprocs(workers()) + cluster_cookie("") -# Issue # 22865 -# Must be run on a new cluster, i.e., all workers must be in the same state. -@assert nprocs() == 1 -p1,p2 = addprocs_with_testenv(2) -@everywhere f22865(p) = remotecall_fetch(x->x.*2, p, fill(1.,2)) -@test fill(2.,2) == remotecall_fetch(f22865, p1, p2) -rmprocs(p1, p2) - -function reuseport_tests() - # Run the test on all processes. - results = asyncmap(procs()) do p - remotecall_fetch(p) do - ports_lower = [] # ports of pids lower than myid() - ports_higher = [] # ports of pids higher than myid() - for w in DistributedNext.PGRP.workers - w.id == myid() && continue - port = Sockets._sockname(w.r_stream, true)[2] - if (w.id == 1) - # master connects to workers - push!(ports_higher, port) - elseif w.id < myid() - push!(ports_lower, port) - elseif w.id > myid() - push!(ports_higher, port) + for close_stdin in (true, false), stderr_to_stdout in (true, false) + local npids = addprocs_with_testenv(RetainStdioTester(close_stdin,stderr_to_stdout)) + @test remotecall_fetch(myid, npids[1]) == npids[1] + if close_stdin + @test remotecall_fetch(()->stdin === devnull && !isreadable(stdin), npids[1]) + else + @test remotecall_fetch(()->stdin !== devnull && isopen(stdin) && isreadable(stdin), npids[1]) + end + @test stderr_to_stdout == remotecall_fetch(()->(stderr === stdout), npids[1]) + rmprocs(npids) + end +end + +@testset "Issue #22865" begin + # Must be run on a new cluster, i.e., all workers must be in the same state. + @assert nprocs() == 1 + p1,p2 = addprocs_with_testenv(2) + @everywhere f22865(p) = remotecall_fetch(x->x.*2, p, fill(1.,2)) + @test fill(2.,2) == remotecall_fetch(f22865, p1, p2) + rmprocs(p1, p2) +end + +@testset "SO_REUSEPORT" begin + function reuseport_tests() + # Run the test on all processes. + results = asyncmap(procs()) do p + remotecall_fetch(p) do + ports_lower = [] # ports of pids lower than myid() + ports_higher = [] # ports of pids higher than myid() + for w in DistributedNext.PGRP.workers + w.id == myid() && continue + port = Sockets._sockname(w.r_stream, true)[2] + if (w.id == 1) + # master connects to workers + push!(ports_higher, port) + elseif w.id < myid() + push!(ports_lower, port) + elseif w.id > myid() + push!(ports_higher, port) + end end - end - @assert (length(ports_lower) + length(ports_higher)) == nworkers() - for portset in [ports_lower, ports_higher] - if (length(portset) > 0) && (length(unique(portset)) != 1) - @warn "SO_REUSEPORT TESTS FAILED. UNSUPPORTED/OLDER UNIX VERSION?" - return 0 + @assert (length(ports_lower) + length(ports_higher)) == nworkers() + for portset in [ports_lower, ports_higher] + if (length(portset) > 0) && (length(unique(portset)) != 1) + @warn "SO_REUSEPORT TESTS FAILED. UNSUPPORTED/OLDER UNIX VERSION?" 
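+                        # (0 is never a live pid, so the `all(in(results), procs())`
+                        # check below will flag this path)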
+ return 0 + end end + return myid() end - return myid() end - end - # Ensure that the code has indeed been successfully executed everywhere - @test all(in(results), procs()) -end + # Ensure that the code has indeed been successfully executed everywhere + @test all(in(results), procs()) + end -# Test that the client port is reused. SO_REUSEPORT may not be supported on -# all UNIX platforms, Linux kernels prior to 3.9 and older versions of OSX -@assert nprocs() == 1 -addprocs_with_testenv(4; lazy=false) -if ccall(:jl_has_so_reuseport, Int32, ()) == 1 - reuseport_tests() -else - @info "SO_REUSEPORT is unsupported, skipping reuseport tests" + # Test that the client port is reused. SO_REUSEPORT may not be supported on + # all UNIX platforms, Linux kernels prior to 3.9 and older versions of OSX + @assert nprocs() == 1 + addprocs_with_testenv(4; lazy=false) + if ccall(:jl_has_so_reuseport, Int32, ()) == 1 + reuseport_tests() + else + @info "SO_REUSEPORT is unsupported, skipping reuseport tests" + end end -# issue #27933 -a27933 = :_not_defined_27933 -@test remotecall_fetch(()->a27933, first(workers())) === a27933 +@testset "Even more various individual issues" begin + # issue #27933 + a27933 = :_not_defined_27933 + @test remotecall_fetch(()->a27933, first(workers())) === a27933 -# PR #28651 -for T in (UInt8, Int8, UInt16, Int16, UInt32, Int32, UInt64) - local n = @distributed (+) for i in Base.OneTo(T(10)) - i + # PR #28651 + for T in (UInt8, Int8, UInt16, Int16, UInt32, Int32, UInt64) + local n = @distributed (+) for i in Base.OneTo(T(10)) + i + end + @test n == 55 end - @test n == 55 -end -# issue #28966 -let code = """ + # issue #28966 + let code = """ import DistributedNext DistributedNext.addprocs(1) DistributedNext.@everywhere f() = myid() @@ -1669,204 +1714,207 @@ let code = """ @assert DistributedNext.remotecall_fetch(f, w) == w end """ - @test success(`$(Base.julia_cmd()) --startup-file=no -e $code`) -end - -# PR 32431: tests for internal DistributedNext.head_and_tail -let (h, t) = DistributedNext.head_and_tail(1:10, 3) - @test h == 1:3 - @test collect(t) == 4:10 -end -let (h, t) = DistributedNext.head_and_tail(1:10, 0) - @test h == [] - @test collect(t) == 1:10 -end -let (h, t) = DistributedNext.head_and_tail(1:3, 5) - @test h == 1:3 - @test collect(t) == [] -end -let (h, t) = DistributedNext.head_and_tail(1:3, 3) - @test h == 1:3 - @test collect(t) == [] -end -let (h, t) = DistributedNext.head_and_tail(Int[], 3) - @test h == [] - @test collect(t) == [] -end -let (h, t) = DistributedNext.head_and_tail(Int[], 0) - @test h == [] - @test collect(t) == [] -end - -# issue #35937 -let e = @test_throws RemoteException pmap(1) do _ - wait(@async error(42)) - end - # check that the inner TaskFailedException is correctly formed & can be printed - es = sprint(showerror, e.value) - @test contains(es, ":\nTaskFailedException\nStacktrace:\n") - @test contains(es, "\n\n nested task error:") - @test contains(es, "\n\n nested task error: 42\n") -end - -# issue #27429, propagate relative `include` path to workers -@everywhere include("includefile.jl") -for p in procs() - @test @fetchfrom(p, i27429) == 27429 -end - -# Propagation of package environments for local workers (#28781) -let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp - pkg_project = joinpath(Base.pkgdir(DistributedNext), "Project.toml") - project = mkdir(joinpath(tmp, "project")) - depots = [mkdir(joinpath(tmp, "depot1")), mkdir(joinpath(tmp, "depot2"))] - load_path = [mkdir(joinpath(tmp, "load_path")), "@stdlib", "@", 
pkg_project] - pathsep = Sys.iswindows() ? ";" : ":" - env = Dict( - "JULIA_DEPOT_PATH" => join(depots, pathsep), - "JULIA_LOAD_PATH" => join(load_path, pathsep), - # Explicitly propagate `TMPDIR`, in the event that we're running on a - # CI system where `TMPDIR` is special. - "TMPDIR" => dirname(tmp), - ) - - funcscode = """ - using Test - - @everywhere begin - depot_path() = DEPOT_PATH - load_path() = LOAD_PATH - active_project() = Base.ACTIVE_PROJECT[] + @test success(`$(Base.julia_cmd()) --startup-file=no -e $code`) end - """ - - setupcode = """ - using DistributedNext - addprocs(1) - """ * funcscode - testcode = setupcode * """ - for w in workers() - @test remotecall_fetch(depot_path, w) == DEPOT_PATH - @test remotecall_fetch(load_path, w) == LOAD_PATH - @test remotecall_fetch(Base.load_path, w) == Base.load_path() - @test remotecall_fetch(active_project, w) == Base.ACTIVE_PROJECT[] - @test remotecall_fetch(Base.active_project, w) == Base.active_project() + # PR 32431: tests for internal DistributedNext.head_and_tail + let (h, t) = DistributedNext.head_and_tail(1:10, 3) + @test h == 1:3 + @test collect(t) == 4:10 end - """ - - # No active project. This test is disabled because it won't work with - # DistributedNext since the package isn't a stdlib. - # extracode = """ - # for w in workers() - # @test remotecall_fetch(active_project, w) === Base.ACTIVE_PROJECT[] === nothing - # end - # """ - # cmd = setenv(`$(julia) -e $(testcode * extracode)`, env) - # @test success(cmd) - - # --project - extracode = """ - for w in workers() - @test remotecall_fetch(active_project, w) == Base.ACTIVE_PROJECT[] == - $(repr(project)) + let (h, t) = DistributedNext.head_and_tail(1:10, 0) + @test h == [] + @test collect(t) == 1:10 end - """ - cmd = setenv(`$(julia) --project=$(project) -e $(testcode * extracode)`, env) - @test success(cmd) - # JULIA_PROJECT - cmd = setenv(`$(julia) -e $(testcode * extracode)`, - (env["JULIA_PROJECT"] = project; env)) - @test success(cmd) - # Pkg.activate(...) - activateish = """ - Base.ACTIVE_PROJECT[] = $(repr(project)) - using DistributedNext - addprocs(1) - """ - cmd = setenv(`$(julia) -e $(activateish * testcode * extracode)`, env) - @test success(cmd) - # JULIA_(LOAD|DEPOT)_PATH - shufflecode = """ - d = reverse(DEPOT_PATH) - append!(empty!(DEPOT_PATH), d) - l = reverse(LOAD_PATH) - append!(empty!(LOAD_PATH), l) - """ - addcode = """ - using DistributedNext - addprocs(1) # after shuffling - """ - extracode = """ - for w in workers() - @test remotecall_fetch(load_path, w) == $(repr(reverse(load_path))) - @test remotecall_fetch(depot_path, w) == $(repr(reverse(depots))) + let (h, t) = DistributedNext.head_and_tail(1:3, 5) + @test h == 1:3 + @test collect(t) == [] end - """ - cmd = setenv(`$(julia) -e $(shufflecode * addcode * testcode * extracode)`, env) - @test success(cmd) - # Mismatch when shuffling after proc addition. Note that the use of - # `addcode` mimics the behaviour of -p1 as the first worker is started - # before `shufflecode` executes. 
- failcode = addcode * shufflecode * funcscode * """ - @show workers() - for w in workers() - @test remotecall_fetch(load_path, w) == reverse(LOAD_PATH) == $(repr(load_path)) - @test remotecall_fetch(depot_path, w) == reverse(DEPOT_PATH) == $(repr(depots)) + let (h, t) = DistributedNext.head_and_tail(1:3, 3) + @test h == 1:3 + @test collect(t) == [] end - """ - cmd = setenv(`$(julia) -e $(failcode)`, env) - @test success(cmd) - - # Hideous hack to double escape path separators on Windows so that it gets - # interpolated into the string (and then Cmd) correctly. - escaped_pkg_project = Sys.iswindows() ? replace(pkg_project, "\\" => "\\\\") : pkg_project - - # Passing env or exeflags to addprocs(...) to override defaults - envcode = """ - using DistributedNext - project = mktempdir() - env = Dict( - "JULIA_LOAD_PATH" => string(LOAD_PATH[1], $(repr(pathsep)), "@stdlib", $(repr(pathsep)), "$(escaped_pkg_project)"), - "JULIA_DEPOT_PATH" => DEPOT_PATH[1], - "TMPDIR" => ENV["TMPDIR"], - ) - addprocs(1; env = env, exeflags = `--project=\$(project)`) - env["JULIA_PROJECT"] = project - addprocs(1; env = env) - """ * funcscode * """ - for w in workers() - @test remotecall_fetch(depot_path, w) == [DEPOT_PATH[1]] - @test remotecall_fetch(load_path, w) == [LOAD_PATH[1], "@stdlib", "$(escaped_pkg_project)"] - @test remotecall_fetch(active_project, w) == project - @test remotecall_fetch(Base.active_project, w) == joinpath(project, "Project.toml") + let (h, t) = DistributedNext.head_and_tail(Int[], 3) + @test h == [] + @test collect(t) == [] + end + let (h, t) = DistributedNext.head_and_tail(Int[], 0) + @test h == [] + @test collect(t) == [] + end + + # issue #35937 + let e = @test_throws RemoteException pmap(1) do _ + wait(@async error(42)) + end + # check that the inner TaskFailedException is correctly formed & can be printed + es = sprint(showerror, e.value) + @test contains(es, ":\nTaskFailedException\nStacktrace:\n") + @test contains(es, "\n\n nested task error:") + @test contains(es, "\n\n nested task error: 42\n") end - """ - cmd = setenv(`$(julia) -e $(envcode)`, env) - @test success(cmd) -end end + + # issue #27429, propagate relative `include` path to workers + @everywhere include("includefile.jl") + for p in procs() + @test @fetchfrom(p, i27429) == 27429 + end +end + +@testset "Propagation of package environments for local workers (#28781)" begin + let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp + pkg_project = joinpath(Base.pkgdir(DistributedNext), "Project.toml") + project = mkdir(joinpath(tmp, "project")) + depots = [mkdir(joinpath(tmp, "depot1")), mkdir(joinpath(tmp, "depot2"))] + load_path = [mkdir(joinpath(tmp, "load_path")), "@stdlib", "@", pkg_project] + pathsep = Sys.iswindows() ? ";" : ":" + env = Dict( + "JULIA_DEPOT_PATH" => join(depots, pathsep), + "JULIA_LOAD_PATH" => join(load_path, pathsep), + # Explicitly propagate `TMPDIR`, in the event that we're running on a + # CI system where `TMPDIR` is special. 
+ "TMPDIR" => dirname(tmp), + ) + + funcscode = """ + using Test + + @everywhere begin + depot_path() = DEPOT_PATH + load_path() = LOAD_PATH + active_project() = Base.ACTIVE_PROJECT[] + end + """ + + setupcode = """ + using DistributedNext + addprocs(1) + """ * funcscode + + testcode = setupcode * """ + for w in workers() + @test remotecall_fetch(depot_path, w) == DEPOT_PATH + @test remotecall_fetch(load_path, w) == LOAD_PATH + @test remotecall_fetch(Base.load_path, w) == Base.load_path() + @test remotecall_fetch(active_project, w) == Base.ACTIVE_PROJECT[] + @test remotecall_fetch(Base.active_project, w) == Base.active_project() + end + """ + + # No active project. This test is disabled because it won't work with + # DistributedNext since the package isn't a stdlib. + # extracode = """ + # for w in workers() + # @test remotecall_fetch(active_project, w) === Base.ACTIVE_PROJECT[] === nothing + # end + # """ + # cmd = setenv(`$(julia) -e $(testcode * extracode)`, env) + # @test success(cmd) + + # --project + extracode = """ + for w in workers() + @test remotecall_fetch(active_project, w) == Base.ACTIVE_PROJECT[] == + $(repr(project)) + end + """ + cmd = setenv(`$(julia) --project=$(project) -e $(testcode * extracode)`, env) + @test success(cmd) + # JULIA_PROJECT + cmd = setenv(`$(julia) -e $(testcode * extracode)`, + (env["JULIA_PROJECT"] = project; env)) + @test success(cmd) + # Pkg.activate(...) + activateish = """ + Base.ACTIVE_PROJECT[] = $(repr(project)) + using DistributedNext + addprocs(1) + """ + cmd = setenv(`$(julia) -e $(activateish * testcode * extracode)`, env) + @test success(cmd) + # JULIA_(LOAD|DEPOT)_PATH + shufflecode = """ + d = reverse(DEPOT_PATH) + append!(empty!(DEPOT_PATH), d) + l = reverse(LOAD_PATH) + append!(empty!(LOAD_PATH), l) + """ + addcode = """ + using DistributedNext + addprocs(1) # after shuffling + """ + extracode = """ + for w in workers() + @test remotecall_fetch(load_path, w) == $(repr(reverse(load_path))) + @test remotecall_fetch(depot_path, w) == $(repr(reverse(depots))) + end + """ + cmd = setenv(`$(julia) -e $(shufflecode * addcode * testcode * extracode)`, env) + @test success(cmd) + # Mismatch when shuffling after proc addition. Note that the use of + # `addcode` mimics the behaviour of -p1 as the first worker is started + # before `shufflecode` executes. + failcode = addcode * shufflecode * funcscode * """ + @show workers() + for w in workers() + @test remotecall_fetch(load_path, w) == reverse(LOAD_PATH) == $(repr(load_path)) + @test remotecall_fetch(depot_path, w) == reverse(DEPOT_PATH) == $(repr(depots)) + end + """ + cmd = setenv(`$(julia) -e $(failcode)`, env) + @test success(cmd) + + # Hideous hack to double escape path separators on Windows so that it gets + # interpolated into the string (and then Cmd) correctly. + escaped_pkg_project = Sys.iswindows() ? replace(pkg_project, "\\" => "\\\\") : pkg_project + + # Passing env or exeflags to addprocs(...) 
to override defaults + envcode = """ + using DistributedNext + project = mktempdir() + env = Dict( + "JULIA_LOAD_PATH" => string(LOAD_PATH[1], $(repr(pathsep)), "@stdlib", $(repr(pathsep)), "$(escaped_pkg_project)"), + "JULIA_DEPOT_PATH" => DEPOT_PATH[1], + "TMPDIR" => ENV["TMPDIR"], + ) + addprocs(1; env = env, exeflags = `--project=\$(project)`) + env["JULIA_PROJECT"] = project + addprocs(1; env = env) + """ * funcscode * """ + for w in workers() + @test remotecall_fetch(depot_path, w) == [DEPOT_PATH[1]] + @test remotecall_fetch(load_path, w) == [LOAD_PATH[1], "@stdlib", "$(escaped_pkg_project)"] + @test remotecall_fetch(active_project, w) == project + @test remotecall_fetch(Base.active_project, w) == joinpath(project, "Project.toml") + end + """ + cmd = setenv(`$(julia) -e $(envcode)`, env) + @test success(cmd) + end end +end include("splitrange.jl") -# Clear all workers for timeout tests (issue #45785) -nprocs() > 1 && rmprocs(workers()) -begin - # First, assert that we get no messages when we close a cooperative worker - w = only(addprocs(1)) - @test_nowarn begin - wait(rmprocs([w])) - end - - # Next, ensure we get a log message when a worker does not cleanly exit - w = only(addprocs(1)) - @test_logs (:warn, r"sending SIGQUIT") begin - remote_do(w) do - # Cause the 'exit()' message that `rmprocs()` sends to do nothing - Core.eval(Base, :(exit() = nothing)) - # Hide the trace that `rmprocs()` will cause this worker to show - redirect_stderr(devnull) - end - wait(rmprocs([w])) +@testset "Clear all workers for timeout tests (issue #45785)" begin + nprocs() > 1 && rmprocs(workers()) + begin + # First, assert that we get no messages when we close a cooperative worker + w = only(addprocs(1)) + @test_nowarn begin + wait(rmprocs([w])) + end + + # Next, ensure we get a log message when a worker does not cleanly exit + w = only(addprocs(1)) + @test_logs (:warn, r"sending SIGQUIT") begin + remote_do(w) do + # Cause the 'exit()' message that `rmprocs()` sends to do nothing + Core.eval(Base, :(exit() = nothing)) + # Hide the trace that `rmprocs()` will cause this worker to show + redirect_stderr(devnull) + end + wait(rmprocs([w])) + end end end diff --git a/test/managers.jl b/test/managers.jl index 54ca3a5..1e0fc81 100644 --- a/test/managers.jl +++ b/test/managers.jl @@ -5,22 +5,24 @@ using DistributedNext using Sockets using DistributedNext: parse_machine, SSHManager, LocalManager -@test parse_machine("127.0.0.1") == ("127.0.0.1", nothing) -@test parse_machine("127.0.0.1:80") == ("127.0.0.1", 80) -@test parse_machine("[2001:db8::1]") == ("2001:db8::1", nothing) -@test parse_machine("[2001:db8::1]:443") == ("2001:db8::1", 443) +@testset "Managers" begin + @test parse_machine("127.0.0.1") == ("127.0.0.1", nothing) + @test parse_machine("127.0.0.1:80") == ("127.0.0.1", 80) + @test parse_machine("[2001:db8::1]") == ("2001:db8::1", nothing) + @test parse_machine("[2001:db8::1]:443") == ("2001:db8::1", 443) -@test parse_machine("127.0.0.1:90") == ("127.0.0.1", 90) -@test parse_machine("127.0.0.1:1") == ("127.0.0.1", 1) -@test parse_machine("127.0.0.1:65535") == ("127.0.0.1", 65535) + @test parse_machine("127.0.0.1:90") == ("127.0.0.1", 90) + @test parse_machine("127.0.0.1:1") == ("127.0.0.1", 1) + @test parse_machine("127.0.0.1:65535") == ("127.0.0.1", 65535) -@test_throws ArgumentError parse_machine("127.0.0.1:-1") -@test_throws ArgumentError parse_machine("127.0.0.1:0") -@test_throws ArgumentError parse_machine("127.0.0.1:65536") -@test_throws ArgumentError parse_machine("[2001:db8::1]:443:888") 
-@test_throws ArgumentError parse_machine("[2001:db8::1")
-@test_throws ArgumentError parse_machine("[2001:db8::1]:aaa")
+    @test_throws ArgumentError parse_machine("127.0.0.1:-1")
+    @test_throws ArgumentError parse_machine("127.0.0.1:0")
+    @test_throws ArgumentError parse_machine("127.0.0.1:65536")
+    @test_throws ArgumentError parse_machine("[2001:db8::1]:443:888")
+    @test_throws ArgumentError parse_machine("[2001:db8::1")
+    @test_throws ArgumentError parse_machine("[2001:db8::1]:aaa")
 
-@test occursin(r"^SSHManager\(machines=.*\)$",
-               sprint((t,x) -> show(t, "text/plain", x), SSHManager("127.0.0.1")))
-@test sprint((t,x) -> show(t, "text/plain", x), LocalManager(1, true)) == "LocalManager()"
+    @test occursin(r"^SSHManager\(machines=.*\)$",
+                   sprint((t,x) -> show(t, "text/plain", x), SSHManager("127.0.0.1")))
+    @test sprint((t,x) -> show(t, "text/plain", x), LocalManager(1, true)) == "LocalManager()"
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index d4d1d86..f5f56c7 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,13 +6,15 @@ include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl"))
 
 cmd = `$test_exename $test_exeflags`
 
-# Run the SSH tests with a single thread because LibSSH.jl is not thread-safe
-sshtestfile = joinpath(@__DIR__, "sshmanager.jl")
-run(addenv(`$cmd $sshtestfile`, "JULIA_NUM_THREADS" => "1"))
-
-disttestfile = joinpath(@__DIR__, "distributed_exec.jl")
-if !success(pipeline(`$cmd $disttestfile`; stdout=stdout, stderr=stderr)) && ccall(:jl_running_on_valgrind,Cint,()) == 0
-    error("Distributed test failed, cmd : $cmd")
+# LibSSH.jl currently only works on 64-bit Unix platforms
+if Sys.isunix() && Sys.WORD_SIZE == 64
+    # Run the SSH tests with a single thread because LibSSH.jl is not thread-safe
+    sshtestfile = joinpath(@__DIR__, "sshmanager.jl")
+    run(addenv(`$cmd $sshtestfile`, "JULIA_NUM_THREADS" => "1"))
+else
+    @warn "Skipping the SSH tests because LibSSH.jl does not support this platform"
 end
 
+include("distributed_exec.jl")
+
 include("managers.jl")
diff --git a/test/splitrange.jl b/test/splitrange.jl
index bbb8284..511e9db 100644
--- a/test/splitrange.jl
+++ b/test/splitrange.jl
@@ -1,35 +1,35 @@
 # This file is a part of Julia.
License is MIT: https://julialang.org/license -using Test -using DistributedNext using DistributedNext: splitrange -@test splitrange(1, 11, 1) == Array{UnitRange{Int64},1}([1:11]) -@test splitrange(0, 10, 1) == Array{UnitRange{Int64},1}([0:10]) -@test splitrange(-1, 9, 1) == Array{UnitRange{Int64},1}([-1:9]) +const BASE_TEST_PATH = joinpath(Sys.BINDIR, "..", "share", "julia", "test") +isdefined(Main, :OffsetArrays) || @eval Main @everywhere include(joinpath($(BASE_TEST_PATH), "testhelpers", "OffsetArrays.jl")) +using .Main.OffsetArrays -@test splitrange(1, 11, 2) == Array{UnitRange{Int64},1}([1:6,7:11]) -@test splitrange(0, 10, 2) == Array{UnitRange{Int64},1}([0:5,6:10]) -@test splitrange(-1, 9, 2) == Array{UnitRange{Int64},1}([-1:4,5:9]) +@testset "splitrange()" begin + @test splitrange(1, 11, 1) == Array{UnitRange{Int64},1}([1:11]) + @test splitrange(0, 10, 1) == Array{UnitRange{Int64},1}([0:10]) + @test splitrange(-1, 9, 1) == Array{UnitRange{Int64},1}([-1:9]) -@test splitrange(1, 11, 3) == Array{UnitRange{Int64},1}([1:4,5:8,9:11]) -@test splitrange(0, 10, 3) == Array{UnitRange{Int64},1}([0:3,4:7,8:10]) -@test splitrange(-1, 9, 3) == Array{UnitRange{Int64},1}([-1:2,3:6,7:9]) + @test splitrange(1, 11, 2) == Array{UnitRange{Int64},1}([1:6,7:11]) + @test splitrange(0, 10, 2) == Array{UnitRange{Int64},1}([0:5,6:10]) + @test splitrange(-1, 9, 2) == Array{UnitRange{Int64},1}([-1:4,5:9]) -@test splitrange(1, 3, 3) == Array{UnitRange{Int64},1}([1:1,2:2,3:3]) -@test splitrange(1, 3, 4) == Array{UnitRange{Int64},1}([1:1,2:2,3:3]) -@test splitrange(0, 2, 3) == Array{UnitRange{Int64},1}([0:0,1:1,2:2]) -@test splitrange(0, 2, 4) == Array{UnitRange{Int64},1}([0:0,1:1,2:2]) -@test splitrange(-1, 1, 3) == Array{UnitRange{Int64},1}([-1:-1,0:0,1:1]) -@test splitrange(-1, 1, 4) == Array{UnitRange{Int64},1}([-1:-1,0:0,1:1]) + @test splitrange(1, 11, 3) == Array{UnitRange{Int64},1}([1:4,5:8,9:11]) + @test splitrange(0, 10, 3) == Array{UnitRange{Int64},1}([0:3,4:7,8:10]) + @test splitrange(-1, 9, 3) == Array{UnitRange{Int64},1}([-1:2,3:6,7:9]) -const BASE_TEST_PATH = joinpath(Sys.BINDIR, "..", "share", "julia", "test") -isdefined(Main, :OffsetArrays) || @eval Main @everywhere include(joinpath($(BASE_TEST_PATH), "testhelpers", "OffsetArrays.jl")) -using .Main.OffsetArrays + @test splitrange(1, 3, 3) == Array{UnitRange{Int64},1}([1:1,2:2,3:3]) + @test splitrange(1, 3, 4) == Array{UnitRange{Int64},1}([1:1,2:2,3:3]) + @test splitrange(0, 2, 3) == Array{UnitRange{Int64},1}([0:0,1:1,2:2]) + @test splitrange(0, 2, 4) == Array{UnitRange{Int64},1}([0:0,1:1,2:2]) + @test splitrange(-1, 1, 3) == Array{UnitRange{Int64},1}([-1:-1,0:0,1:1]) + @test splitrange(-1, 1, 4) == Array{UnitRange{Int64},1}([-1:-1,0:0,1:1]) -oa = OffsetArray([123, -345], (-2,)) + oa = OffsetArray([123, -345], (-2,)) -@everywhere using Test -@sync @distributed for i in eachindex(oa) - @test i ∈ (-1, 0) + @everywhere using Test + @sync @distributed for i in eachindex(oa) + @test i ∈ (-1, 0) + end end diff --git a/test/sshmanager.jl b/test/sshmanager.jl index 9bed971..2f82637 100644 --- a/test/sshmanager.jl +++ b/test/sshmanager.jl @@ -8,28 +8,24 @@ import LibSSH.Demo: DemoServer include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl")) -# LibSSH.jl currently only works on 64bit unixes -if Sys.isunix() && Sys.WORD_SIZE == 64 - function test_n_remove_pids(new_pids) - for p in new_pids - w_in_remote = sort(remotecall_fetch(workers, p)) - try - @test intersect(new_pids, w_in_remote) == new_pids - catch - print("p : $p\n") - 
print("newpids : $new_pids\n") - print("w_in_remote : $w_in_remote\n") - print("intersect : $(intersect(new_pids, w_in_remote))\n\n\n") - rethrow() - end +function test_n_remove_pids(new_pids) + for p in new_pids + w_in_remote = sort(remotecall_fetch(workers, p)) + try + @test intersect(new_pids, w_in_remote) == new_pids + catch + print("p : $p\n") + print("newpids : $new_pids\n") + print("w_in_remote : $w_in_remote\n") + print("intersect : $(intersect(new_pids, w_in_remote))\n\n\n") + rethrow() end - - remotecall_fetch(rmprocs, 1, new_pids) end - println("\n\nTesting SSHManager. A minimum of 4GB of RAM is recommended.") - println("Please ensure port 9300 and 2222 are not in use.") + remotecall_fetch(rmprocs, 1, new_pids) +end +@testset "SSHManager" begin DemoServer(2222; auth_methods=[ssh.AuthMethod_None], allow_auth_none=true, verbose=false, timeout=3600) do sshflags = `-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR -p 2222 ` #Issue #9951 diff --git a/test/topology.jl b/test/topology.jl index 66cc78d..5426dcc 100644 --- a/test/topology.jl +++ b/test/topology.jl @@ -2,142 +2,146 @@ using Random -pids = addprocs_with_testenv(4; topology="master_worker") +@testset "Topology" begin + pids = addprocs_with_testenv(4; topology="master_worker") -let p1 = pids[1], p2 = pids[2] - @test_throws RemoteException remotecall_fetch(()->remotecall_fetch(myid, p2), p1) -end + let p1 = pids[1], p2 = pids[2] + @test_throws RemoteException remotecall_fetch(()->remotecall_fetch(myid, p2), p1) + end -function test_worker_counts() - # check if the nprocs/nworkers/workers are the same on the remaining workers - np=nprocs() - nw=nworkers() - ws=sort(workers()) + function test_worker_counts() + # check if the nprocs/nworkers/workers are the same on the remaining workers + np=nprocs() + nw=nworkers() + ws=sort(workers()) - for p in workers() - @test (true, true, true) == remotecall_fetch(p, np, nw, ws) do x,y,z - (x==nprocs(), y==nworkers(), z==sort(workers())) + for p in workers() + @test (true, true, true) == remotecall_fetch(p, np, nw, ws) do x,y,z + (x==nprocs(), y==nworkers(), z==sort(workers())) + end end end -end -function remove_workers_and_test() - while nworkers() > 0 - rmprocs(workers()[1]) - test_worker_counts() - if nworkers() == nprocs() - break + function remove_workers_and_test() + while nworkers() > 0 + rmprocs(workers()[1]) + test_worker_counts() + if nworkers() == nprocs() + break + end end end -end - -remove_workers_and_test() -# connect even pids to other even pids, odd to odd. -mutable struct TopoTestManager <: ClusterManager - np::Integer -end + remove_workers_and_test() -function launch(manager::TopoTestManager, params::Dict, launched::Array, c::Condition) - dir = params[:dir] - exename = params[:exename] - exeflags = params[:exeflags] - - cmd = `$exename $exeflags --bind-to $(DistributedNext.LPROC.bind_addr) $(DistributedNext.get_worker_arg())` - cmd = pipeline(detach(setenv(cmd, dir=dir))) - for i in 1:manager.np - io = open(cmd, "r+") - DistributedNext.write_cookie(io) - - wconfig = WorkerConfig() - wconfig.process = io - wconfig.io = io.out - wconfig.ident = i - wconfig.connect_idents = Vector(i+2:2:manager.np) - push!(launched, wconfig) + # connect even pids to other even pids, odd to odd. 
+ mutable struct TopoTestManager <: ClusterManager + np::Integer end - notify(c) -end + function DistributedNext.launch(manager::TopoTestManager, params::Dict, launched::Array, c::Condition) + dir = params[:dir] + exename = params[:exename] + exeflags = params[:exeflags] + + cmd = `$exename $exeflags --bind-to $(DistributedNext.LPROC.bind_addr) $(DistributedNext.get_worker_arg())` + cmd = pipeline(detach(setenv(cmd, dir=dir))) + for i in 1:manager.np + io = open(cmd, "r+") + DistributedNext.write_cookie(io) + + wconfig = WorkerConfig() + wconfig.process = io + wconfig.io = io.out + wconfig.ident = i + wconfig.connect_idents = Vector(i+2:2:manager.np) + push!(launched, wconfig) + end -const map_pid_ident=Dict() -function manage(manager::TopoTestManager, id::Integer, config::WorkerConfig, op::Symbol) - if op === :register - map_pid_ident[id] = config.ident - elseif op === :interrupt - kill(config.process, 2) + notify(c) end -end -addprocs_with_testenv(TopoTestManager(8); topology="custom") - -while true - if any(x->get(map_pid_ident, x, 0)==0, workers()) - yield() - else - break + map_pid_ident=Dict() + function DistributedNext.manage(manager::TopoTestManager, id::Integer, config::WorkerConfig, op::Symbol) + if op === :register + map_pid_ident[id] = config.ident + elseif op === :interrupt + kill(config.process, 2) + end end -end -let p1, p2 -for p1 in workers() - for p2 in workers() - i1 = map_pid_ident[p1] - i2 = map_pid_ident[p2] - if (iseven(i1) && iseven(i2)) || (isodd(i1) && isodd(i2)) - @test p2 == remotecall_fetch(p->remotecall_fetch(myid, p), p1, p2) + addprocs_with_testenv(TopoTestManager(8); topology="custom") + + while true + if any(x->get(map_pid_ident, x, 0)==0, workers()) + yield() else - @test_throws RemoteException remotecall_fetch(p->remotecall_fetch(myid, p), p1, p2) + break end end -end -end - -remove_workers_and_test() -# test `lazy` connection setup -function def_count_conn() - @everywhere function count_connected_workers() - count(x -> isa(x, DistributedNext.Worker) && isdefined(x, :r_stream) && isopen(x.r_stream), - DistributedNext.PGRP.workers) + let p1, p2 + for p1 in workers() + for p2 in workers() + i1 = map_pid_ident[p1] + i2 = map_pid_ident[p2] + if (iseven(i1) && iseven(i2)) || (isodd(i1) && isodd(i2)) + @test p2 == remotecall_fetch(p->remotecall_fetch(myid, p), p1, p2) + else + @test_throws RemoteException remotecall_fetch(p->remotecall_fetch(myid, p), p1, p2) + end + end + end end -end -addprocs_with_testenv(8) -def_count_conn() - -# Test for 10 random combinations -wl = workers() -combinations = [] -while length(combinations) < 10 - from = rand(wl) - to = rand(wl) - if from == to || ((from,to) in combinations) || ((to,from) in combinations) - continue - else - push!(combinations, (from,to)) + remove_workers_and_test() + + # test `lazy` connection setup + function def_count_conn() + @everywhere if !isdefined(Main, :count_connected_workers) + function count_connected_workers() + count(x -> isa(x, DistributedNext.Worker) && isdefined(x, :r_stream) && isopen(x.r_stream), + DistributedNext.PGRP.workers) + end + end end -end -# Initially only master-worker connections ought to be setup -expected_num_conns = 8 -let num_conns = sum(asyncmap(p->remotecall_fetch(count_connected_workers,p), workers())) - @test num_conns == expected_num_conns -end + addprocs_with_testenv(8) + def_count_conn() + + # Test for 10 random combinations + wl = workers() + combinations = [] + while length(combinations) < 10 + from = rand(wl) + to = rand(wl) + if from == to || ((from,to) in 
combinations) || ((to,from) in combinations) + continue + else + push!(combinations, (from,to)) + end + end -for (i, (from,to)) in enumerate(combinations) - remotecall_wait(topid->remotecall_fetch(myid, topid), from, to) - global expected_num_conns += 2 # one connection endpoint on both from and to + # Initially only master-worker connections ought to be setup + expected_num_conns = 8 let num_conns = sum(asyncmap(p->remotecall_fetch(count_connected_workers,p), workers())) @test num_conns == expected_num_conns end -end -# With lazy=false, all connections ought to be setup during `addprocs` -nprocs() > 1 && rmprocs(workers()) -addprocs_with_testenv(8; lazy=false) -def_count_conn() -@test sum(asyncmap(p->remotecall_fetch(count_connected_workers,p), workers())) == 64 + for (i, (from,to)) in enumerate(combinations) + remotecall_wait(topid->remotecall_fetch(myid, topid), from, to) + expected_num_conns += 2 # one connection endpoint on both from and to + let num_conns = sum(asyncmap(p->remotecall_fetch(count_connected_workers,p), workers())) + @test num_conns == expected_num_conns + end + end -# Cannot add more workers with a different `lazy` value -@test_throws ArgumentError addprocs_with_testenv(1; lazy=true) + # With lazy=false, all connections ought to be setup during `addprocs` + nprocs() > 1 && rmprocs(workers()) + addprocs_with_testenv(8; lazy=false) + def_count_conn() + @test sum(asyncmap(p->remotecall_fetch(count_connected_workers,p), workers())) == 64 + + # Cannot add more workers with a different `lazy` value + @test_throws ArgumentError addprocs_with_testenv(1; lazy=true) +end From c1a3be84b14662ee9e78628fade0fba04cbdea51 Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Fri, 1 Nov 2024 22:44:20 +0100 Subject: [PATCH 9/9] Increase threads test wait time to avoid timeouts --- test/threads.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/threads.jl b/test/threads.jl index 9d1d6d4..19444ae 100644 --- a/test/threads.jl +++ b/test/threads.jl @@ -48,8 +48,8 @@ isfailed(rr) = fetch_from_owner(istaskfailed, rr) # timedwait() instead of @sync to avoid deadlocks. t1 = Threads.@spawn fetch_from_owner(wait, recv) t2 = Threads.@spawn fetch_from_owner(wait, send) - @test timedwait(() -> istaskdone(t1), 5) == :ok - @test timedwait(() -> istaskdone(t2), 5) == :ok + @test timedwait(() -> istaskdone(t1), 60) == :ok + @test timedwait(() -> istaskdone(t2), 60) == :ok # Check the tasks @test isdone(send)
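
For reference, the wait-with-timeout pattern this last patch tunes can be
sketched in isolation. This is a minimal, hypothetical example, not part of
the patch: `blocking_op` stands in for the test's `fetch_from_owner` calls,
and the 60-second limit matches the patch's new timeout.

    # Run the potentially-blocking operation on another thread and poll for
    # completion with timedwait() instead of wrapping it in @sync, so a hung
    # task surfaces as a :timed_out failure rather than deadlocking the run.
    blocking_op() = sleep(0.1)   # hypothetical stand-in
    t = Threads.@spawn blocking_op()
    @assert timedwait(() -> istaskdone(t), 60) == :ok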