Skip to content

Commit 850cc89

Browse files
nico202JeffBezanson
authored andcommitted
Fix the use of time() in measuring performances (replace with time_ns()) (JuliaLang/julia#34181)
time() is a wrapper for gettimeofday, which is affected by discontinuous jumps in the system time (e.g., if the system administrator manually changes the system time); see `man gettimeofday`. On the other hand, time_ns() is a wrapper for libuv's uv_hrtime, which is not subject to clock drift and whose manual page says "The primary use is for measuring performance between intervals.".
1 parent 6a10312 commit 850cc89

File tree

1 file changed

+10
-11
lines changed

1 file changed

+10
-11
lines changed

src/cluster.jl

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -293,12 +293,11 @@ Base.showerror(io::IO, e::LaunchWorkerError) = print(io, e.msg)
293293
# The master process uses this to connect to the worker and subsequently
294294
# set up an all-to-all network.
295295
function read_worker_host_port(io::IO)
296-
t0 = time()
296+
t0 = time_ns()
297297

298298
# Wait at most for JULIA_WORKER_TIMEOUT seconds to read host:port
299299
# info from the worker
300-
timeout = worker_timeout()
301-
300+
timeout = worker_timeout() * 1e9
302301
# We expect the first line to contain the host:port string. However, as
303302
# the worker may be launched via ssh or a cluster manager like SLURM,
304303
# ignore any informational / warning lines printed by the launch command.
@@ -311,7 +310,7 @@ function read_worker_host_port(io::IO)
311310
while ntries > 0
312311
readtask = @async readline(io)
313312
yield()
314-
while !istaskdone(readtask) && ((time() - t0) < timeout)
313+
while !istaskdone(readtask) && ((time_ns() - t0) < timeout)
315314
sleep(0.05)
316315
end
317316
!istaskdone(readtask) && break
@@ -707,20 +706,20 @@ function redirect_output_from_additional_worker(pid, port)
707706
end
708707

709708
function check_master_connect()
710-
timeout = worker_timeout()
709+
timeout = worker_timeout() * 1e9
711710
# If we do not have at least process 1 connect to us within timeout
712711
# we log an error and exit, unless we're running on valgrind
713712
if ccall(:jl_running_on_valgrind,Cint,()) != 0
714713
return
715714
end
716715
@async begin
717-
start = time()
718-
while !haskey(map_pid_wrkr, 1) && (time() - start) < timeout
716+
start = time_ns()
717+
while !haskey(map_pid_wrkr, 1) && (time_ns() - start) < timeout
719718
sleep(1.0)
720719
end
721720

722721
if !haskey(map_pid_wrkr, 1)
723-
print(stderr, "Master process (id 1) could not connect within $timeout seconds.\nexiting.\n")
722+
print(stderr, "Master process (id 1) could not connect within $(timeout/1e9) seconds.\nexiting.\n")
724723
exit(1)
725724
end
726725
end
@@ -1035,10 +1034,10 @@ function _rmprocs(pids, waitfor)
10351034
end
10361035
end
10371036

1038-
start = time()
1039-
while (time() - start) < waitfor
1037+
start = time_ns()
1038+
while (time_ns() - start) < waitfor*1e9
10401039
all(w -> w.state == W_TERMINATED, rmprocset) && break
1041-
sleep(min(0.1, waitfor - (time() - start)))
1040+
sleep(min(0.1, waitfor - (time_ns() - start)/1e9))
10421041
end
10431042

10441043
unremoved = [wrkr.id for wrkr in filter(w -> w.state != W_TERMINATED, rmprocset)]

0 commit comments

Comments
 (0)