@@ -99,10 +99,10 @@ mutable struct Worker
99
99
del_msgs:: Array{Any,1} # XXX : Could del_msgs and add_msgs be Channels?
100
100
add_msgs:: Array{Any,1}
101
101
@atomic gcflag:: Bool
102
- state:: WorkerState
103
- c_state:: Condition # wait for state changes
104
- ct_time:: Float64 # creation time
105
- conn_func:: Any # used to setup connections lazily
102
+ @atomic state:: WorkerState
103
+ c_state:: Threads. Condition # wait for state changes, lock for state
104
+ ct_time:: Float64 # creation time
105
+ conn_func:: Any # used to setup connections lazily
106
106
107
107
r_stream:: IO
108
108
w_stream:: IO
@@ -134,7 +134,7 @@ mutable struct Worker
134
134
if haskey (map_pid_wrkr, id)
135
135
return map_pid_wrkr[id]
136
136
end
137
- w= new (id, Threads. ReentrantLock (), [], [], false , W_CREATED, Condition (), time (), conn_func)
137
+ w= new (id, Threads. ReentrantLock (), [], [], false , W_CREATED, Threads . Condition (), time (), conn_func)
138
138
w. initialized = Event ()
139
139
register_worker (w)
140
140
w
@@ -144,12 +144,14 @@ mutable struct Worker
144
144
end
145
145
146
146
# Transition worker `w` to `state` and wake every task blocked in
# `wait(w.c_state)`.
#
# The `state` field is mutated (atomically) only while holding the
# `w.c_state` lock, so a waiter that checks the state under the same lock
# cannot miss the notification between its check and its `wait`.
function set_worker_state(w, state)
    lock(w.c_state) do
        @atomic w.state = state
        notify(w.c_state; all=true)
    end
end
150
152
151
153
function check_worker_state (w:: Worker )
152
- if w. state === W_CREATED
154
+ if ( @atomic w. state) === W_CREATED
153
155
if ! isclusterlazy ()
154
156
if PGRP. topology === :all_to_all
155
157
# Since higher pids connect with lower pids, the remote worker
@@ -170,6 +172,7 @@ function check_worker_state(w::Worker)
170
172
wait_for_conn (w)
171
173
end
172
174
end
175
+ return nothing
173
176
end
174
177
175
178
# Convenience method: resolve a pid to its `Worker` and forward to the
# `Worker`-typed `exec_conn_func` method.
exec_conn_func (id:: Int ) = exec_conn_func (worker_from_id (id):: Worker )
@@ -187,13 +190,21 @@ function exec_conn_func(w::Worker)
187
190
end
188
191
189
192
# Block until worker `w` leaves the `W_CREATED` state (i.e. its connection is
# established), or raise an error once the remaining per-worker timeout budget
# (worker_timeout() measured from the worker's creation time `w.ct_time`)
# is exhausted. Returns `nothing`.
function wait_for_conn(w)
    if (@atomic w.state) === W_CREATED
        timeout = worker_timeout() - (time() - w.ct_time)
        timeout <= 0 && error("peer $(w.id) has not connected to $(myid())")

        # Watchdog task: wake the waiter below after `timeout` seconds even if
        # no state change ever arrives; the state re-check after `wait`
        # distinguishes a real connection from a timeout wake-up.
        # `notify` must be performed under the condition's lock.
        T = Threads.@spawn begin
            sleep($timeout)
            lock(w.c_state) do
                notify(w.c_state; all=true)
            end
        end
        errormonitor(T)
        lock(w.c_state) do
            wait(w.c_state)
            (@atomic w.state) === W_CREATED &&
                error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds")
        end
    end
    nothing
end
@@ -491,7 +502,10 @@ function addprocs_locked(manager::ClusterManager; kwargs...)
491
502
while true
492
503
if isempty (launched)
493
504
istaskdone (t_launch) && break
494
- @async (sleep (1 ); notify (launch_ntfy))
505
+ @async begin
506
+ sleep (1 )
507
+ notify (launch_ntfy)
508
+ end
495
509
wait (launch_ntfy)
496
510
end
497
511
@@ -645,7 +659,12 @@ function create_worker(manager, wconfig)
645
659
# require the value of config.connect_at which is set only upon connection completion
646
660
for jw in PGRP. workers
647
661
if (jw. id != 1 ) && (jw. id < w. id)
648
- (jw. state === W_CREATED) && wait (jw. c_state)
662
+ lock (jw. c_state) do
663
+ # wait for wl to join
664
+ if (@atomic jw. state) === W_CREATED
665
+ wait (jw. c_state)
666
+ end
667
+ end
649
668
push! (join_list, jw)
650
669
end
651
670
end
@@ -668,7 +687,12 @@ function create_worker(manager, wconfig)
668
687
end
669
688
670
689
for wl in wlist
671
- (wl. state === W_CREATED) && wait (wl. c_state)
690
+ lock (wl. c_state) do
691
+ if (@atomic wl. state) === W_CREATED
692
+ # wait for wl to join
693
+ wait (wl. c_state)
694
+ end
695
+ end
672
696
push! (join_list, wl)
673
697
end
674
698
end
@@ -682,10 +706,16 @@ function create_worker(manager, wconfig)
682
706
join_message = JoinPGRPMsg (w. id, all_locs, PGRP. topology, enable_threaded_blas, isclusterlazy ())
683
707
send_msg_now (w, MsgHeader (RRID (0 ,0 ), ntfy_oid), join_message)
684
708
685
- @async manage (w. manager, w. id, w. config, :register )
709
+ errormonitor ( @async manage (w. manager, w. id, w. config, :register ) )
686
710
# wait for rr_ntfy_join with timeout
687
711
timedout = false
688
- @async (sleep ($ timeout); timedout = true ; put! (rr_ntfy_join, 1 ))
712
+ errormonitor (
713
+ @async begin
714
+ sleep ($ timeout)
715
+ timedout = true
716
+ put! (rr_ntfy_join, 1 )
717
+ end
718
+ )
689
719
wait (rr_ntfy_join)
690
720
if timedout
691
721
error (" worker did not connect within $timeout seconds" )
@@ -735,17 +765,20 @@ function check_master_connect()
735
765
if ccall (:jl_running_on_valgrind ,Cint,()) != 0
736
766
return
737
767
end
738
- @async begin
739
- start = time_ns ()
740
- while ! haskey (map_pid_wrkr, 1 ) && (time_ns () - start) < timeout
741
- sleep (1.0 )
742
- end
743
768
744
- if ! haskey (map_pid_wrkr, 1 )
745
- print (stderr , " Master process (id 1) could not connect within $(timeout/ 1e9 ) seconds.\n exiting.\n " )
746
- exit (1 )
769
+ errormonitor (
770
+ @async begin
771
+ start = time_ns ()
772
+ while ! haskey (map_pid_wrkr, 1 ) && (time_ns () - start) < timeout
773
+ sleep (1.0 )
774
+ end
775
+
776
+ if ! haskey (map_pid_wrkr, 1 )
777
+ print (stderr , " Master process (id 1) could not connect within $(timeout/ 1e9 ) seconds.\n exiting.\n " )
778
+ exit (1 )
779
+ end
747
780
end
748
- end
781
+ )
749
782
end
750
783
751
784
@@ -870,7 +903,7 @@ function nprocs()
870
903
n = length (PGRP. workers)
871
904
# filter out workers in the process of being setup/shutdown.
872
905
for jw in PGRP. workers
873
- if ! isa (jw, LocalProcess) && (jw. state != = W_CONNECTED)
906
+ if ! isa (jw, LocalProcess) && (( @atomic jw. state) != = W_CONNECTED)
874
907
n = n - 1
875
908
end
876
909
end
@@ -921,7 +954,7 @@ julia> procs()
921
954
function procs ()
922
955
if myid () == 1 || (PGRP. topology === :all_to_all && ! isclusterlazy ())
923
956
# filter out workers in the process of being setup/shutdown.
924
- return Int[x. id for x in PGRP. workers if isa (x, LocalProcess) || (x. state === W_CONNECTED)]
957
+ return Int[x. id for x in PGRP. workers if isa (x, LocalProcess) || (( @atomic x. state) === W_CONNECTED)]
925
958
else
926
959
return Int[x. id for x in PGRP. workers]
927
960
end
930
963
function id_in_procs (id) # faster version of `id in procs()`
931
964
if myid () == 1 || (PGRP. topology === :all_to_all && ! isclusterlazy ())
932
965
for x in PGRP. workers
933
- if (x. id:: Int ) == id && (isa (x, LocalProcess) || (x:: Worker ). state === W_CONNECTED)
966
+ if (x. id:: Int ) == id && (isa (x, LocalProcess) || (@atomic ( x:: Worker ). state) === W_CONNECTED)
934
967
return true
935
968
end
936
969
end
@@ -952,7 +985,7 @@ Specifically all workers bound to the same ip-address as `pid` are returned.
952
985
"""
953
986
function procs (pid:: Integer )
954
987
if myid () == 1
955
- all_workers = [x for x in PGRP. workers if isa (x, LocalProcess) || (x. state === W_CONNECTED)]
988
+ all_workers = [x for x in PGRP. workers if isa (x, LocalProcess) || (( @atomic x. state) === W_CONNECTED)]
956
989
if (pid == 1 ) || (isa (map_pid_wrkr[pid]. manager, LocalManager))
957
990
Int[x. id for x in filter (w -> (w. id== 1 ) || (isa (w. manager, LocalManager)), all_workers)]
958
991
else
@@ -1059,11 +1092,11 @@ function _rmprocs(pids, waitfor)
1059
1092
1060
1093
start = time_ns ()
1061
1094
while (time_ns () - start) < waitfor* 1e9
1062
- all (w -> w. state === W_TERMINATED, rmprocset) && break
1095
+ all (w -> ( @atomic w. state) === W_TERMINATED, rmprocset) && break
1063
1096
sleep (min (0.1 , waitfor - (time_ns () - start)/ 1e9 ))
1064
1097
end
1065
1098
1066
- unremoved = [wrkr. id for wrkr in filter (w -> w. state != = W_TERMINATED, rmprocset)]
1099
+ unremoved = [wrkr. id for wrkr in filter (w -> ( @atomic w. state) != = W_TERMINATED, rmprocset)]
1067
1100
if length (unremoved) > 0
1068
1101
estr = string (" rmprocs: pids " , unremoved, " not terminated after " , waitfor, " seconds." )
1069
1102
throw (ErrorException (estr))
@@ -1290,18 +1323,16 @@ end
1290
1323
1291
1324
using Random: randstring
1292
1325
1293
# One-time initialization that is only needed once there is more than one
# process. `inited` flips false -> true exactly once; `Threads.atomic_cas!`
# returns the *old* value, so the branch body runs only for the first caller,
# making the initialization thread-safe without a lock.
const inited = Threads.Atomic{Bool}(false)
function init_multi()
    if !Threads.atomic_cas!(inited, false, true)
        push!(Base.package_callbacks, _require_callback)
        atexit(terminate_all_workers)
        init_bind_addr()
        # A fresh random cookie authenticates workers joining this cluster.
        cluster_cookie(randstring(HDR_COOKIE_LEN))
    end
    return nothing
end
1306
1337
1307
1338
function init_parallel ()
0 commit comments