@@ -71,6 +71,8 @@ Keyword arguments:
71
71
* `tunnel`: if `true` then SSH tunneling will be used to connect to the worker from the
72
72
master process. Default is `false`.
73
73
74
+ * `multiplex`: if `true` then SSH multiplexing is used for SSH tunneling. Default is `false`.
75
+
74
76
* `sshflags`: specifies additional ssh options, e.g. ```sshflags=\` -i /home/foo/bar.pem\` ```
75
77
76
78
* `max_parallel`: specifies the maximum number of workers connected to in parallel at a
@@ -113,9 +115,9 @@ This timeout can be controlled via environment variable `JULIA_WORKER_TIMEOUT`.
113
115
The value of `JULIA_WORKER_TIMEOUT` on the master process specifies the number of seconds a
114
116
newly launched worker waits for connection establishment.
115
117
"""
116
function addprocs(machines::AbstractVector;
                  tunnel=false, multiplex=false, sshflags=``, max_parallel=10, kwargs...)
    # Only the extra keyword arguments are handed to `check_addprocs_args`;
    # the SSH-specific keywords are named explicitly in the signature.
    check_addprocs_args(kwargs)

    # Wrap the machine list in an `SSHManager` and delegate to the
    # cluster-manager method, forwarding every option unchanged.
    return addprocs(
        SSHManager(machines);
        tunnel=tunnel,
        multiplex=multiplex,
        sshflags=sshflags,
        max_parallel=max_parallel,
        kwargs...
    )
end
120
122
121
123
@@ -149,6 +151,8 @@ function launch_on_machine(manager::SSHManager, machine, cnt, params, launched,
149
151
dir = params[:dir ]
150
152
exename = params[:exename ]
151
153
exeflags = params[:exeflags ]
154
+ tunnel = params[:tunnel ]
155
+ multiplex = params[:multiplex ]
152
156
153
157
# machine could be of the format [user@]host[:port] bind_addr[:bind_port]
154
158
# machine format string is split on whitespace
@@ -178,6 +182,20 @@ function launch_on_machine(manager::SSHManager, machine, cnt, params, launched,
178
182
end
179
183
sshflags = ` $(params[:sshflags ]) $portopt `
180
184
185
+ if tunnel
186
+ # First, check whether SSH multiplexing is already enabled and its master process is running.
187
+ # If it's already running, later ssh sessions also use the same ssh multiplexing session even if
188
+ # `multiplex` is not explicitly specified; otherwise the tunneling session launched later won't
189
+ # go to background and hang. This is because of OpenSSH implementation.
190
+ if success (` ssh $sshflags -O check $host ` )
191
+ multiplex = true
192
+ elseif multiplex
193
+ # automatically create an SSH multiplexing session at the next SSH connection
194
+ controlpath = " ~/.ssh/julia-%r@%h:%p"
195
+ sshflags = ` $sshflags -o ControlMaster=auto -o ControlPath=$controlpath -o ControlPersist=no`
196
+ end
197
+ end
198
+
181
199
# Build up the ssh command
182
200
183
201
# the default worker timeout
@@ -211,7 +229,8 @@ function launch_on_machine(manager::SSHManager, machine, cnt, params, launched,
211
229
wconfig = WorkerConfig ()
212
230
wconfig. io = io. out
213
231
wconfig. host = host
214
- wconfig. tunnel = params[:tunnel ]
232
+ wconfig. tunnel = tunnel
233
+ wconfig. multiplex = multiplex
215
234
wconfig. sshflags = sshflags
216
235
wconfig. exeflags = exeflags
217
236
wconfig. exename = exename
@@ -256,25 +275,32 @@ end
256
275
257
276
258
277
"""
259
- ssh_tunnel(user, host, bind_addr, port, sshflags) -> localport
278
+ ssh_tunnel(user, host, bind_addr, port, sshflags, multiplex ) -> localport
260
279
261
280
Establish an SSH tunnel to a remote worker.
262
281
Return a port number `localport` such that `localhost:localport` connects to `host:port`.
263
282
"""
264
- function ssh_tunnel (user, host, bind_addr, port, sshflags)
283
+ function ssh_tunnel (user, host, bind_addr, port, sshflags, multiplex )
265
284
port = Int (port)
266
285
cnt = ntries = 100
267
- # if we cannot do port forwarding, bail immediately
286
+
268
287
# the connection is forwarded to `port` on the remote server over the local port `localport`
269
- # the -f option backgrounds the ssh session
270
- # `sleep 60` command specifies that an alloted time of 60 seconds is allowed to start the
271
- # remote julia process and establish the network connections specified by the process topology.
272
- # If no connections are made within 60 seconds, ssh will exit and an error will be printed on the
273
- # process that launched the remote process.
274
- ssh = ` ssh -T -a -x -o ExitOnForwardFailure=yes`
275
288
while cnt > 0
276
289
localport = next_tunnel_port ()
277
- if success (detach (` $ssh -f $sshflags $user @$host -L $localport :$bind_addr :$port sleep 60` ))
290
+ if multiplex
291
+ # This assumes that an ssh multiplexing session has already been started by the remote worker.
292
+ cmd = ` ssh $sshflags -O forward -L $localport :$bind_addr :$port $user @$host `
293
+ else
294
+ # if we cannot do port forwarding, fail immediately
295
+ # the -f option backgrounds the ssh session
296
+ # `sleep 60` command specifies that an allotted time of 60 seconds is allowed to start the
297
+ # remote julia process and establish the network connections specified by the process topology.
298
+ # If no connections are made within 60 seconds, ssh will exit and an error will be printed on the
299
+ # process that launched the remote process.
300
+ ssh = ` ssh -T -a -x -o ExitOnForwardFailure=yes`
301
+ cmd = detach (` $ssh -f $sshflags $user @$host -L $localport :$bind_addr :$port sleep 60` )
302
+ end
303
+ if success (cmd)
278
304
return localport
279
305
end
280
306
cnt -= 1
@@ -427,9 +453,11 @@ function connect(manager::ClusterManager, pid::Int, config::WorkerConfig)
427
453
sem = tunnel_hosts_map[pubhost]
428
454
429
455
sshflags = notnothing (config. sshflags)
456
+ multiplex = something (config. multiplex, false )
430
457
acquire (sem)
431
458
try
432
- (s, bind_addr) = connect_to_worker (pubhost, bind_addr, port, user, sshflags)
459
+ (s, bind_addr, forward) = connect_to_worker_with_tunnel (pubhost, bind_addr, port, user, sshflags, multiplex)
460
+ config. forward = forward
433
461
finally
434
462
release (sem)
435
463
end
@@ -515,9 +543,23 @@ function connect_to_worker(host::AbstractString, port::Integer)
515
543
end
516
544
517
545
518
"""
    connect_to_worker_with_tunnel(host, bind_addr, port, tunnel_user, sshflags, multiplex)
        -> (socket, bind_addr, forward)

Set up an SSH tunnel to `bind_addr:port` on `host` via `ssh_tunnel`, then connect
to the local end of that tunnel on `localhost`.

The returned `forward` is the string `"localport:bind_addr:port"` describing the
forwarding that was requested; `bind_addr` is passed back unchanged.
"""
function connect_to_worker_with_tunnel(host::AbstractString, bind_addr::AbstractString,
                                       port::Integer, tunnel_user::AbstractString,
                                       sshflags, multiplex)
    # The remote port is narrowed to `UInt16` before it is handed to `ssh_tunnel`.
    remote_port = UInt16(port)
    localport = ssh_tunnel(tunnel_user, host, bind_addr, remote_port, sshflags, multiplex)
    sock = connect("localhost", localport)
    return (sock, bind_addr, "$localport:$bind_addr:$port")
end
552
+
553
+
554
"""
    cancel_ssh_tunnel(config::WorkerConfig)

Run `ssh -O cancel -L <forward> <host>` for the forwarding stored in
`config.forward`, using `config.sshflags`.

The command is only issued when both `config.tunnel` and `config.multiplex` are
`true` (an unset value counts as `false`); otherwise `nothing` is returned.
"""
function cancel_ssh_tunnel(config::WorkerConfig)
    # `host` and `sshflags` are unwrapped up front, before the tunnel/multiplex
    # check, so they are read from `config` on every call.
    host = notnothing(config.host)
    sshflags = notnothing(config.sshflags)

    uses_multiplexed_tunnel =
        something(config.tunnel, false) && something(config.multiplex, false)
    uses_multiplexed_tunnel || return nothing

    forward = notnothing(config.forward)
    return run(`ssh $sshflags -O cancel -L $forward $host`)
end
522
564
523
565
@@ -531,7 +573,12 @@ It should cause the remote worker specified by `pid` to exit.
531
573
on `pid`.
532
574
"""
533
575
function kill(manager::ClusterManager, pid::Int, config::WorkerConfig)
    # Ask worker `pid` to call `exit`. For TCP based transports this will result
    # in a close of the socket at our end, which will result in a cleanup of the
    # worker.
    remote_do(exit, pid)
    return nothing
end
579
+
580
# `SSHManager` method of `kill`: ask worker `pid` to exit, then hand its
# `config` to `cancel_ssh_tunnel`, which issues an `ssh -O cancel` only when
# the worker was connected through a multiplexed tunnel.
function kill(manager::SSHManager, pid::Int, config::WorkerConfig)
    remote_do(exit, pid)
    cancel_ssh_tunnel(config)
    return nothing
end
0 commit comments