@@ -100,9 +100,9 @@ mutable struct Worker
100
100
add_msgs:: Array{Any,1}
101
101
@atomic gcflag:: Bool
102
102
state:: WorkerState
103
- c_state:: Condition # wait for state changes
104
- ct_time:: Float64 # creation time
105
- conn_func:: Any # used to setup connections lazily
103
+ c_state:: Threads. Condition # wait for state changes, lock for state
104
+ ct_time:: Float64 # creation time
105
+ conn_func:: Any # used to setup connections lazily
106
106
107
107
r_stream:: IO
108
108
w_stream:: IO
@@ -134,7 +134,7 @@ mutable struct Worker
134
134
if haskey (map_pid_wrkr, id)
135
135
return map_pid_wrkr[id]
136
136
end
137
- w= new (id, Threads. ReentrantLock (), [], [], false , W_CREATED, Condition (), time (), conn_func)
137
+ w= new (id, Threads. ReentrantLock (), [], [], false , W_CREATED, Threads . Condition (), time (), conn_func)
138
138
w. initialized = Event ()
139
139
register_worker (w)
140
140
w
@@ -144,12 +144,16 @@ mutable struct Worker
144
144
end
145
145
146
146
function set_worker_state (w, state)
147
- w. state = state
148
- notify (w. c_state; all= true )
147
+ lock (w. c_state) do
148
+ w. state = state
149
+ notify (w. c_state; all= true )
150
+ end
149
151
end
150
152
151
153
function check_worker_state (w:: Worker )
154
+ lock (w. c_state)
152
155
if w. state === W_CREATED
156
+ unlock (w. c_state)
153
157
if ! isclusterlazy ()
154
158
if PGRP. topology === :all_to_all
155
159
# Since higher pids connect with lower pids, the remote worker
@@ -169,6 +173,8 @@ function check_worker_state(w::Worker)
169
173
errormonitor (t)
170
174
wait_for_conn (w)
171
175
end
176
+ else
177
+ unlock (w. c_state)
172
178
end
173
179
end
174
180
@@ -187,13 +193,25 @@ function exec_conn_func(w::Worker)
187
193
end
188
194
189
195
function wait_for_conn (w)
196
+ lock (w. c_state)
190
197
if w. state === W_CREATED
198
+ unlock (w. c_state)
191
199
timeout = worker_timeout () - (time () - w. ct_time)
192
200
timeout <= 0 && error (" peer $(w. id) has not connected to $(myid ()) " )
193
201
194
- @async (sleep (timeout); notify (w. c_state; all= true ))
195
- wait (w. c_state)
196
- w. state === W_CREATED && error (" peer $(w. id) didn't connect to $(myid ()) within $timeout seconds" )
202
+ T = Threads. @spawn begin
203
+ sleep ($ timeout)
204
+ lock (w. c_state) do
205
+ notify (w. c_state; all= true )
206
+ end
207
+ end
208
+ errormonitor (T)
209
+ lock (w. c_state) do
210
+ wait (w. c_state)
211
+ w. state === W_CREATED && error (" peer $(w. id) didn't connect to $(myid ()) within $timeout seconds" )
212
+ end
213
+ else
214
+ unlock (w. c_state)
197
215
end
198
216
nothing
199
217
end
@@ -491,7 +509,10 @@ function addprocs_locked(manager::ClusterManager; kwargs...)
491
509
while true
492
510
if isempty (launched)
493
511
istaskdone (t_launch) && break
494
- @async (sleep (1 ); notify (launch_ntfy))
512
+ @async begin
513
+ sleep (1 )
514
+ notify (launch_ntfy)
515
+ end
495
516
wait (launch_ntfy)
496
517
end
497
518
@@ -645,7 +666,12 @@ function create_worker(manager, wconfig)
645
666
# require the value of config.connect_at which is set only upon connection completion
646
667
for jw in PGRP. workers
647
668
if (jw. id != 1 ) && (jw. id < w. id)
648
- (jw. state === W_CREATED) && wait (jw. c_state)
669
+ # wait for wl to join
670
+ lock (jw. c_state) do
671
+ if jw. state === W_CREATED
672
+ wait (jw. c_state)
673
+ end
674
+ end
649
675
push! (join_list, jw)
650
676
end
651
677
end
@@ -668,7 +694,12 @@ function create_worker(manager, wconfig)
668
694
end
669
695
670
696
for wl in wlist
671
- (wl. state === W_CREATED) && wait (wl. c_state)
697
+ lock (wl. c_state) do
698
+ if wl. state === W_CREATED
699
+ # wait for wl to join
700
+ wait (wl. c_state)
701
+ end
702
+ end
672
703
push! (join_list, wl)
673
704
end
674
705
end
@@ -685,7 +716,11 @@ function create_worker(manager, wconfig)
685
716
@async manage (w. manager, w. id, w. config, :register )
686
717
# wait for rr_ntfy_join with timeout
687
718
timedout = false
688
- @async (sleep ($ timeout); timedout = true ; put! (rr_ntfy_join, 1 ))
719
+ @async begin
720
+ sleep ($ timeout)
721
+ timedout = true
722
+ put! (rr_ntfy_join, 1 )
723
+ end
689
724
wait (rr_ntfy_join)
690
725
if timedout
691
726
error (" worker did not connect within $timeout seconds" )
0 commit comments