Commit 0c47e18

Fix clustermanager for 0.7
1 parent c7ee281 commit 0c47e18

15 files changed: +52 -51 lines changed

.travis.yml

Lines changed: 2 additions & 5 deletions
@@ -5,7 +5,6 @@ os:
   - linux
   - osx
 julia:
-  - 0.6
   - 0.7
   - nightly
 notifications:
@@ -21,8 +20,6 @@ before_install:
   # Work around OpenMPI attempting to create overly long temporary
   # file names - and erroring as a result
   - export TMPDIR=/tmp
-script:
-  - julia -e 'if VERSION < v"0.7.0-DEV.5183"; Pkg.clone(pwd()) ; else; using Pkg; Pkg.up(); end; Pkg.build("MPI"); Pkg.test("MPI"; coverage=true)'
 after_success:
-  - julia -e 'VERSION < v"0.7.0-DEV.5183" || using Pkg; cd(Pkg.dir("MPI")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
-  - julia -e 'VERSION < v"0.7.0-DEV.5183" || using Pkg; cd(Pkg.dir("MPI")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'
+  - julia -e 'using Pkg; cd(Pkg.dir("MPI")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
+  - julia -e 'using Pkg; cd(Pkg.dir("MPI")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'

Project.toml

Lines changed: 3 additions & 0 deletions
@@ -8,3 +8,6 @@ BinDeps = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
 Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

README.md

Lines changed: 2 additions & 2 deletions
@@ -118,10 +118,10 @@ The julia master process is NOT part of the MPI cluster. The main script should
 launched directly, MPIManager internally calls `mpirun` to launch julia/mpi workers.
 All the workers started via MPIManager will be part of the MPI cluster.
 
-`MPIManager(;np=Sys.CPU_CORES, mpi_cmd=false, launch_timeout=60.0)`
+`MPIManager(;np=Sys.CPU_THREADS, mpi_cmd=false, launch_timeout=60.0)`
 
 If not specified, `mpi_cmd` defaults to `mpirun -np $np`
-STDOUT from the launched workers is redirected back to the julia session calling `addprocs` via a TCP connection.
+`stdout` from the launched workers is redirected back to the julia session calling `addprocs` via a TCP connection.
 Thus the workers must be able to freely connect via TCP to the host session.
 The following lines will be typically required on the julia master process to support both julia and mpi
 
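The setup lines the README refers to are truncated in this diff view. As a rough illustration only (the worker count and the body of the `@mpi_do` block are made up here, not quoted from the README), master-process usage looks roughly like this:

    using MPI, Distributed
    manager = MPIManager(np=4)   # np=4 is an arbitrary example value
    addprocs(manager)            # launches the julia/mpi workers via mpirun
    @mpi_do manager begin
        comm = MPI.COMM_WORLD
        println("rank $(MPI.Comm_rank(comm)) of $(MPI.Comm_size(comm))")
    end

Anything printed inside the `@mpi_do` block reaches the master through the TCP redirection of `stdout` described above.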
REQUIRE

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
-julia 0.6
+julia 0.7.0-beta
 BinDeps
 Compat 0.66

appveyor.yml

Lines changed: 4 additions & 10 deletions
@@ -1,11 +1,9 @@
 environment:
   matrix:
-  - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe"
-  - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe"
   - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe"
   - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe"
-  - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.7/julia-0.7.0-alpha-win32.exe"
-  - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.7/julia-0.7.0-alpha-win64.exe"
+  - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.7/julia-0.7.0-beta2-win32.exe"
+  - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.7/julia-0.7.0-beta2-win64.exe"
 
 branches:
   only:
@@ -33,11 +31,7 @@ install:
   - set PATH=C:\Program Files\Microsoft MPI\Bin;%PATH%
 
 build_script:
-  # Need to convert from shallow to complete for Pkg.clone to work
-  - IF EXIST .git\shallow (git fetch --unshallow)
-  - C:\projects\julia\bin\julia -e "VERSION < v\"0.7.0-DEV\" || (using InteractiveUtils);
-      versioninfo(); pkg=\"MPI\";
-      if VERSION < v\"0.7.0-DEV.5183\"; Pkg.clone(pwd(), pkg); else; using Pkg; Pkg.up(); end; Pkg.build(\"MPI\")"
+  - C:\projects\julia\bin\julia -e "using Pkg; pkg\"activate .\"; pkg\"build\""
 
 test_script:
-  - C:\projects\julia\bin\julia --check-bounds=yes -e "VERSION < v\"0.7.0-DEV.5183\" || using Pkg; Pkg.test(\"MPI\")"
+  - C:\projects\julia\bin\julia --check-bounds=yes -e "using Pkg; pkg\"activate .\"; pkg\"test\""
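The new build and test steps drive Pkg through its REPL-mode string macro (`pkg"activate ."`, `pkg"build"`, `pkg"test"`). For reference, a functional-API sketch of what those steps amount to (illustrative, not part of this commit):

    using Pkg
    Pkg.activate(".")   # use the checked-out repository as the active project
    Pkg.build()         # build the active project's dependencies
    Pkg.test()          # run the active project's test suite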

examples/05-juliacman.jl

Lines changed: 2 additions & 2 deletions
@@ -11,10 +11,10 @@ println("Running 01-hello as part of a Julia cluster")
 @mpi_do manager (include("01-hello-impl.jl"); do_hello())
 
 # Interspersed julia parallel call
-nheads = @parallel (+) for i=1:10^8
+nheads = @distributed (+) for i=1:10^8
     Int(rand(Bool))
 end
-println("@parallel nheads $nheads")
+println("@distributed nheads $nheads")
 
 println("Running 02-broadcast as part of a Julia cluster")
 @mpi_do manager (include("02-broadcast-impl.jl"); do_broadcast())
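The example swaps the removed `@parallel` macro for `@distributed`, which in Julia 0.7 lives in the Distributed standard library. A standalone sketch of the renamed construct, using plain local workers rather than the MPIManager-launched cluster above (worker count chosen arbitrarily):

    using Distributed
    addprocs(2)                      # two local workers, for illustration only
    nheads = @distributed (+) for i = 1:10^6
        Int(rand(Bool))
    end
    println("@distributed nheads $nheads")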

src/MPI.jl

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ function __init__()
 
     # look up all symbols ahead of time
     for (jname, fname) in _mpi_functions
-        eval(:(const $jname = Libdl.dlsym(libmpi_handle, $fname)))
+        Core.eval(MPI, :(const $jname = Libdl.dlsym(libmpi_handle, $fname)))
     end
 end
 
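The `__init__` loop now names the target module explicitly via `Core.eval` rather than relying on the one-argument `eval`. A self-contained sketch of the same pattern with made-up names (not MPI.jl code):

    module SymStub
    const _functions = Dict(:jl_init => "C_Init", :jl_send => "C_Send")
    function __init__()
        for (jname, fname) in _functions
            # bind each looked-up value to a const, naming the module explicitly
            Core.eval(SymStub, :(const $jname = $fname))
        end
    end
    end
    # after the module loads, SymStub.jl_init == "C_Init"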
src/cman.jl

Lines changed: 26 additions & 16 deletions
@@ -3,7 +3,7 @@ export MPIManager, launch, manage, kill, procs, connect, mpiprocs, @mpi_do
 export TransportMode, MPI_ON_WORKERS, TCP_TRANSPORT_ALL, MPI_TRANSPORT_ALL
 using Compat
 using Compat.Distributed
-import Compat.Sockets: connect, listenany, accept, getipaddr, IPv4
+import Compat.Sockets: connect, listenany, accept, IPv4, getsockname
 
 
 
@@ -42,6 +42,7 @@ mutable struct MPIManager <: ClusterManager
 
     # TCP Transport
     port::UInt16
+    ip::UInt32
     stdout_ios::Array
 
     # MPI transport
@@ -54,7 +55,7 @@ mutable struct MPIManager <: ClusterManager
     sending_done::Channel{Nothing}
     receiving_done::Channel{Nothing}
 
-    function MPIManager(; np::Integer = Sys.CPU_CORES,
+    function MPIManager(; np::Integer = Sys.CPU_THREADS,
                           mpirun_cmd::Cmd = `mpiexec -n $np`,
                           launch_timeout::Real = 60.0,
                           mode::TransportMode = MPI_ON_WORKERS)
@@ -86,13 +87,15 @@ mutable struct MPIManager <: ClusterManager
         if mode != MPI_TRANSPORT_ALL
             # Start a listener for capturing stdout from the workers
             port, server = listenany(11000)
+            ip = getsockname(server)[1].host
             @async begin
                 while true
                     sock = accept(server)
                     push!(mgr.stdout_ios, sock)
                 end
             end
             mgr.port = port
+            mgr.ip = ip
             mgr.stdout_ios = IO[]
         else
             mgr.rank2streams = Dict{Int,Tuple{IO,IO}}()
@@ -133,7 +136,7 @@ function Distributed.launch(mgr::MPIManager, params::Dict,
         throw(ErrorException("Reuse of MPIManager is not allowed."))
     end
     cookie = string(":cookie_",Distributed.cluster_cookie())
-    setup_cmds = `using MPI\;MPI.setup_worker'('$(getipaddr().host),$(mgr.port),$cookie')'`
+    setup_cmds = `using MPI\;MPI.setup_worker'('$(mgr.ip),$(mgr.port),$cookie')'`
    mpi_cmd = `$(mgr.mpirun_cmd) $(params[:exename]) -e $(Base.shell_escape(setup_cmds))`
     open(detach(mpi_cmd))
     mgr.launched = true
@@ -151,7 +154,7 @@ function Distributed.launch(mgr::MPIManager, params::Dict,
     end
 
     # Traverse all worker I/O streams and receive their MPI rank
-    configs = Array{WorkerConfig}(mgr.np)
+    configs = Array{WorkerConfig}(undef, mgr.np)
     @sync begin
         for io in mgr.stdout_ios
             @async let io=io
@@ -199,12 +202,12 @@ function setup_worker(host, port, cookie)
 
     # Hand over control to Base
     if cookie == nothing
-        Base.start_worker(io)
+        Distributed.start_worker(io)
     else
         if isa(cookie, Symbol)
            cookie = string(cookie)[8:end] # strip the leading "cookie_"
        end
-        Base.start_worker(io, cookie)
+        Distributed.start_worker(io, cookie)
     end
 end
 
@@ -279,8 +282,8 @@
 # case
 function start_send_event_loop(mgr::MPIManager, rank::Int)
     try
-        r_s = BufferStream()
-        w_s = BufferStream()
+        r_s = Base.BufferStream()
+        w_s = Base.BufferStream()
         mgr.rank2streams[rank] = (r_s, w_s)
 
         # TODO: There is one task per communication partner -- this can be
@@ -292,7 +295,7 @@ function start_send_event_loop(mgr::MPIManager, rank::Int)
             reqs = MPI.Request[]
             while !isready(mgr.initiate_shutdown)
                 # When data are available, send them
-                while nb_available(w_s) > 0
+                while bytesavailable(w_s) > 0
                     data = take!(w_s.buffer)
                     push!(reqs, MPI.Isend(data, rank, 0, mgr.comm))
                 end
@@ -307,7 +310,7 @@ function start_send_event_loop(mgr::MPIManager, rank::Int)
        end
        (r_s, w_s)
    catch e
-        Base.show_backtrace(STDOUT, catch_backtrace())
+        Base.show_backtrace(stdout, catch_backtrace())
        println(e)
        rethrow(e)
    end
@@ -334,11 +337,15 @@ function start_main_loop(mode::TransportMode=TCP_TRANSPORT_ALL;
         # Create manager object
         mgr = MPIManager(np=size-1, mode=mode)
         mgr.comm = comm
+        # Needed because of Julia commit https://github.com/JuliaLang/julia/commit/299300a409c35153a1fa235a05c3929726716600
+        if isdefined(Distributed, :init_multi)
+            Distributed.init_multi()
+        end
         # Send connection information to all workers
         # TODO: Use Bcast
         for j in 1:size-1
             cookie = VERSION >= v"0.5.0-dev+4047" ? Distributed.cluster_cookie() : nothing
-            MPI.send((getipaddr().host, mgr.port, cookie), j, 0, comm)
+            MPI.send((mgr.ip, mgr.port, cookie), j, 0, comm)
         end
         # Tell Base about the workers
         addprocs(mgr)
@@ -363,6 +370,9 @@ function start_main_loop(mode::TransportMode=TCP_TRANSPORT_ALL;
 
         # Send the cookie over. Introduced in v"0.5.0-dev+4047". Irrelevant under MPI
         # transport, but need it to satisfy the changed protocol.
+        if isdefined(Distributed, :init_multi)
+            Distributed.init_multi()
+        end
         MPI.bcast(Distributed.cluster_cookie(), 0, comm)
         # Start event loop for the workers
         @async receive_event_loop(mgr)
@@ -376,7 +386,7 @@ function start_main_loop(mode::TransportMode=TCP_TRANSPORT_ALL;
         mgr.comm = comm
         # Recv the cookie
         cookie = MPI.bcast(nothing, 0, comm)
-        Base.init_worker(cookie, mgr)
+        Distributed.init_worker(cookie, mgr)
         # Start a worker event loop
         receive_event_loop(mgr)
         MPI.Finalize()
@@ -394,7 +404,7 @@ function receive_event_loop(mgr::MPIManager)
         (hasdata, stat) = MPI.Iprobe(MPI.ANY_SOURCE, 0, mgr.comm)
         if hasdata
             count = Get_count(stat, UInt8)
-            buf = Array{UInt8}(count)
+            buf = Array{UInt8}(undef, count)
             from_rank = Get_source(stat)
             MPI.Recv!(buf, from_rank, 0, mgr.comm)
 
@@ -403,7 +413,7 @@ function receive_event_loop(mgr::MPIManager)
                 # This is the first time we communicate with this rank.
                 # Set up a new connection.
                 (r_s, w_s) = start_send_event_loop(mgr, from_rank)
-                Base.process_messages(r_s, w_s)
+                Distributed.process_messages(r_s, w_s)
                 num_send_loops += 1
             else
                 (r_s, w_s) = streams
@@ -459,7 +469,7 @@
 function mpi_do(mgr::MPIManager, expr)
     !mgr.initialized && wait(mgr.cond_initialized)
     jpids = keys(mgr.j2mpi)
-    refs = Array{Any}(length(jpids))
+    refs = Array{Any}(undef, length(jpids))
     for (i,p) in enumerate(Iterators.filter(x -> x != myid(), jpids))
         refs[i] = remotecall(expr, p)
     end
@@ -490,7 +500,7 @@
 macro mpi_do(mgr, expr)
     quote
         # Evaluate expression in Main module
-        thunk = () -> (eval(Main, $(Expr(:quote, expr))); nothing)
+        thunk = () -> (Core.eval(Main, $(Expr(:quote, expr))); nothing)
         mpi_do($(esc(mgr)), thunk)
     end
 end
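Most of the remaining cman.jl changes are mechanical 0.6 to 0.7 renames. For quick reference, an illustrative snippet (not repository code) collecting the replacements used above:

    io = Base.BufferStream()                   # was written as bare BufferStream()
    write(io, UInt8[0x01, 0x02, 0x03])
    n = bytesavailable(io)                     # was nb_available(io)
    buf = Array{UInt8}(undef, n)               # was Array{UInt8}(n)
    nprocs_hint = Sys.CPU_THREADS              # was Sys.CPU_CORES
    Base.show_backtrace(stdout, backtrace())   # STDOUT is now stdout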

test/runtests.jl

Lines changed: 1 addition & 6 deletions
@@ -30,13 +30,8 @@ if Compat.Sys.iswindows()
     end
 end
 
-if VERSION > v"0.7.0-DEV.2005"
-    push!(excludedfiles, "test_cman_julia.jl")
-    push!(excludedfiles, "test_cman_mpi.jl")
-    push!(excludedfiles, "test_cman_tcp.jl")
-end
 function runtests()
-    nprocs = clamp(Sys.CPU_CORES, 2, 4)
+    nprocs = clamp(Sys.CPU_THREADS, 2, 4)
     exename = joinpath(BINDIR, Base.julia_exename())
     testdir = dirname(@__FILE__)
     istest(f) = endswith(f, ".jl") && startswith(f, "test_")

test/test_bcast.jl

Lines changed: 3 additions & 3 deletions
@@ -1,5 +1,5 @@
-using Compat.Test
-using Compat.Random
+using Test
+using Random
 using MPI
 
 MPI.Init()
@@ -19,7 +19,7 @@ end
 
 root = 0
 
-srand(17)
+Random.seed!(17)
 
 matsize = (17,17)
 for typ in Base.uniontypes(MPI.MPIDatatype)
