@@ -99,10 +99,10 @@ mutable struct Worker
99
99
del_msgs:: Array{Any,1} # XXX : Could del_msgs and add_msgs be Channels?
100
100
add_msgs:: Array{Any,1}
101
101
@atomic gcflag:: Bool
102
- state:: WorkerState
103
- c_state:: Condition # wait for state changes
104
- ct_time:: Float64 # creation time
105
- conn_func:: Any # used to setup connections lazily
102
+ @atomic state:: WorkerState
103
+ c_state:: Threads. Condition # wait for state changes, lock for state
104
+ ct_time:: Float64 # creation time
105
+ conn_func:: Any # used to setup connections lazily
106
106
107
107
r_stream:: IO
108
108
w_stream:: IO
@@ -134,7 +134,7 @@ mutable struct Worker
134
134
if haskey (map_pid_wrkr, id)
135
135
return map_pid_wrkr[id]
136
136
end
137
- w= new (id, Threads. ReentrantLock (), [], [], false , W_CREATED, Condition (), time (), conn_func)
137
+ w= new (id, Threads. ReentrantLock (), [], [], false , W_CREATED, Threads . Condition (), time (), conn_func)
138
138
w. initialized = Event ()
139
139
register_worker (w)
140
140
w
@@ -144,12 +144,14 @@ mutable struct Worker
144
144
end
145
145
146
146
function set_worker_state (w, state)
147
- w. state = state
148
- notify (w. c_state; all= true )
147
+ lock (w. c_state) do
148
+ @atomic w. state = state
149
+ notify (w. c_state; all= true )
150
+ end
149
151
end
150
152
151
153
function check_worker_state (w:: Worker )
152
- if w. state === W_CREATED
154
+ if ( @atomic w. state) === W_CREATED
153
155
if ! isclusterlazy ()
154
156
if PGRP. topology === :all_to_all
155
157
# Since higher pids connect with lower pids, the remote worker
@@ -170,6 +172,7 @@ function check_worker_state(w::Worker)
170
172
wait_for_conn (w)
171
173
end
172
174
end
175
+ return nothing
173
176
end
174
177
175
178
exec_conn_func (id:: Int ) = exec_conn_func (worker_from_id (id):: Worker )
@@ -187,13 +190,21 @@ function exec_conn_func(w::Worker)
187
190
end
188
191
189
192
function wait_for_conn (w)
190
- if w. state === W_CREATED
193
+ if ( @atomic w. state) === W_CREATED
191
194
timeout = worker_timeout () - (time () - w. ct_time)
192
195
timeout <= 0 && error (" peer $(w. id) has not connected to $(myid ()) " )
193
196
194
- @async (sleep (timeout); notify (w. c_state; all= true ))
195
- wait (w. c_state)
196
- w. state === W_CREATED && error (" peer $(w. id) didn't connect to $(myid ()) within $timeout seconds" )
197
+ T = Threads. @spawn begin
198
+ sleep ($ timeout)
199
+ lock (w. c_state) do
200
+ notify (w. c_state; all= true )
201
+ end
202
+ end
203
+ errormonitor (T)
204
+ lock (w. c_state) do
205
+ wait (w. c_state)
206
+ (@atomic w. state) === W_CREATED && error (" peer $(w. id) didn't connect to $(myid ()) within $timeout seconds" )
207
+ end
197
208
end
198
209
nothing
199
210
end
@@ -491,7 +502,10 @@ function addprocs_locked(manager::ClusterManager; kwargs...)
491
502
while true
492
503
if isempty (launched)
493
504
istaskdone (t_launch) && break
494
- @async (sleep (1 ); notify (launch_ntfy))
505
+ @async begin
506
+ sleep (1 )
507
+ notify (launch_ntfy)
508
+ end
495
509
wait (launch_ntfy)
496
510
end
497
511
@@ -645,7 +659,12 @@ function create_worker(manager, wconfig)
645
659
# require the value of config.connect_at which is set only upon connection completion
646
660
for jw in PGRP. workers
647
661
if (jw. id != 1 ) && (jw. id < w. id)
648
- (jw. state === W_CREATED) && wait (jw. c_state)
662
+ lock (jw. c_state) do
663
+ # wait for wl to join
664
+ if (@atomic jw. state) === W_CREATED
665
+ wait (jw. c_state)
666
+ end
667
+ end
649
668
push! (join_list, jw)
650
669
end
651
670
end
@@ -668,7 +687,12 @@ function create_worker(manager, wconfig)
668
687
end
669
688
670
689
for wl in wlist
671
- (wl. state === W_CREATED) && wait (wl. c_state)
690
+ lock (wl. c_state) do
691
+ if (@atomic wl. state) === W_CREATED
692
+ # wait for wl to join
693
+ wait (wl. c_state)
694
+ end
695
+ end
672
696
push! (join_list, wl)
673
697
end
674
698
end
@@ -685,7 +709,11 @@ function create_worker(manager, wconfig)
685
709
@async manage (w. manager, w. id, w. config, :register )
686
710
# wait for rr_ntfy_join with timeout
687
711
timedout = false
688
- @async (sleep ($ timeout); timedout = true ; put! (rr_ntfy_join, 1 ))
712
+ @async begin
713
+ sleep ($ timeout)
714
+ timedout = true
715
+ put! (rr_ntfy_join, 1 )
716
+ end
689
717
wait (rr_ntfy_join)
690
718
if timedout
691
719
error (" worker did not connect within $timeout seconds" )
@@ -870,7 +898,7 @@ function nprocs()
870
898
n = length (PGRP. workers)
871
899
# filter out workers in the process of being setup/shutdown.
872
900
for jw in PGRP. workers
873
- if ! isa (jw, LocalProcess) && (jw. state != = W_CONNECTED)
901
+ if ! isa (jw, LocalProcess) && (( @atomic jw. state) != = W_CONNECTED)
874
902
n = n - 1
875
903
end
876
904
end
@@ -921,7 +949,7 @@ julia> procs()
921
949
function procs ()
922
950
if myid () == 1 || (PGRP. topology === :all_to_all && ! isclusterlazy ())
923
951
# filter out workers in the process of being setup/shutdown.
924
- return Int[x. id for x in PGRP. workers if isa (x, LocalProcess) || (x. state === W_CONNECTED)]
952
+ return Int[x. id for x in PGRP. workers if isa (x, LocalProcess) || (( @atomic x. state) === W_CONNECTED)]
925
953
else
926
954
return Int[x. id for x in PGRP. workers]
927
955
end
930
958
function id_in_procs (id) # faster version of `id in procs()`
931
959
if myid () == 1 || (PGRP. topology === :all_to_all && ! isclusterlazy ())
932
960
for x in PGRP. workers
933
- if (x. id:: Int ) == id && (isa (x, LocalProcess) || (x:: Worker ). state === W_CONNECTED)
961
+ if (x. id:: Int ) == id && (isa (x, LocalProcess) || (@atomic ( x:: Worker ). state) === W_CONNECTED)
934
962
return true
935
963
end
936
964
end
@@ -952,7 +980,7 @@ Specifically all workers bound to the same ip-address as `pid` are returned.
952
980
"""
953
981
function procs (pid:: Integer )
954
982
if myid () == 1
955
- all_workers = [x for x in PGRP. workers if isa (x, LocalProcess) || (x. state === W_CONNECTED)]
983
+ all_workers = [x for x in PGRP. workers if isa (x, LocalProcess) || (( @atomic x. state) === W_CONNECTED)]
956
984
if (pid == 1 ) || (isa (map_pid_wrkr[pid]. manager, LocalManager))
957
985
Int[x. id for x in filter (w -> (w. id== 1 ) || (isa (w. manager, LocalManager)), all_workers)]
958
986
else
@@ -1059,11 +1087,11 @@ function _rmprocs(pids, waitfor)
1059
1087
1060
1088
start = time_ns ()
1061
1089
while (time_ns () - start) < waitfor* 1e9
1062
- all (w -> w. state === W_TERMINATED, rmprocset) && break
1090
+ all (w -> ( @atomic w. state) === W_TERMINATED, rmprocset) && break
1063
1091
sleep (min (0.1 , waitfor - (time_ns () - start)/ 1e9 ))
1064
1092
end
1065
1093
1066
- unremoved = [wrkr. id for wrkr in filter (w -> w. state != = W_TERMINATED, rmprocset)]
1094
+ unremoved = [wrkr. id for wrkr in filter (w -> ( @atomic w. state) != = W_TERMINATED, rmprocset)]
1067
1095
if length (unremoved) > 0
1068
1096
estr = string (" rmprocs: pids " , unremoved, " not terminated after " , waitfor, " seconds." )
1069
1097
throw (ErrorException (estr))
0 commit comments