diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f2c9c0e..7322190 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -16,7 +16,6 @@ jobs:
     timeout-minutes: 10
     needs:
       - unit-tests
-      - test-slurm
     # Important: the next line MUST be `if: always()`.
     # Do not change that line.
     # That line is necessary to make sure that this job runs even if tests fail.
@@ -25,13 +24,11 @@ jobs:
     steps:
       - run: |
           echo unit-tests: ${{ needs.unit-tests.result }}
-          echo test-slurm: ${{ needs.test-slurm.result }}
       - run: exit 1
         # The last line must NOT end with ||
         # All other lines MUST end with ||
         if: |
-          (needs.unit-tests.result != 'success') ||
-          (needs.test-slurm.result != 'success')
+          (needs.unit-tests.result != 'success')
   unit-tests:
     runs-on: ubuntu-latest
     timeout-minutes: 20
@@ -62,64 +59,3 @@ jobs:
           # If this PR is NOT from a fork, then DO fail CI if the Codecov upload errors.
           # If this is not a PR, then DO fail CI if the Codecov upload errors.
           fail_ci_if_error: ${{ github.event_name != 'pull_request' || github.repository == github.event.pull_request.head.repo.full_name }}
-  test-slurm:
-    runs-on: ubuntu-latest
-    timeout-minutes: 20
-    strategy:
-      fail-fast: false
-      matrix:
-        version:
-          # Please note: You must specify the full Julia version number (major.minor.patch).
-          # This is because the value here will be directly interpolated into a download URL.
-          # - '1.2.0' # minimum Julia version supported in Project.toml
-          - '1.6.7' # previous LTS
-          - '1.10.7' # current LTS
-          - '1.11.2' # currently the latest stable release
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          persist-credentials: false
-      - name: Print Docker version
-        run: |
-          docker --version
-          docker version
-      # This next bit of code is taken from:
-      # https://github.com/kleinhenz/SlurmClusterManager.jl
-      # Original author: Joseph Kleinhenz
-      # License: MIT
-      - name: Setup Slurm inside Docker
-        run: |
-          docker version
-          docker compose version
-          docker build --build-arg "JULIA_VERSION=${MATRIX_JULIA_VERSION:?}" -t slurm-cluster-julia -f ci/Dockerfile .
-          docker compose -f ci/docker-compose.yml up -d
-          docker ps
-        env:
-          MATRIX_JULIA_VERSION: ${{matrix.version}}
-      - name: Print some information for debugging purposes
-        run: |
-          docker exec -t slurmctld pwd
-          docker exec -t slurmctld ls -la
-          docker exec -t slurmctld ls -la ClusterManagers
-      - name: Instantiate package
-        run: docker exec -t slurmctld julia --project=ClusterManagers -e 'import Pkg; @show Base.active_project(); Pkg.instantiate(); Pkg.status()'
-      - name: Run tests without a Slurm allocation
-        run: docker exec -t slurmctld julia --project=ClusterManagers -e 'import Pkg; Pkg.test(; test_args=["slurm"])'
-      - name: Run tests inside salloc
-        run: docker exec -t slurmctld salloc -t 00:10:00 -n 2 julia --project=ClusterManagers -e 'import Pkg; Pkg.test(; test_args=["slurm"], coverage=true)'
-      - name: Run tests inside sbatch
-        run: docker exec -t slurmctld ClusterManagers/ci/run_my_sbatch.sh
-      - run: find . -type f -name '*.cov'
-      - name: Copy .cov files out of the Docker container
-        run: docker exec slurmctld /bin/bash -c 'cd /home/docker/ClusterManagers && tar -cf - src/*.cov' | tar -xvf -
-      - run: find . -type f -name '*.cov'
-      # - run: find . -type f -name '*.cov' -exec cat {} \;
-      - uses: julia-actions/julia-processcoverage@v1
-      - uses: codecov/codecov-action@v5
-        with:
-          files: lcov.info
-          token: ${{ secrets.CODECOV_TOKEN }}
-          # If this PR is from a fork, then do NOT fail CI if the Codecov upload errors.
-          # If this PR is NOT from a fork, then DO fail CI if the Codecov upload errors.
-          # If this is not a PR, then DO fail CI if the Codecov upload errors.
-          fail_ci_if_error: ${{ github.event_name != 'pull_request' || github.repository == github.event.pull_request.head.repo.full_name }}
diff --git a/Project.toml b/Project.toml
index 1f488c7..9a681fb 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "ClusterManagers"
 uuid = "34f1f09b-3a8b-5176-ab39-66d58a4d544e"
-version = "2.0.0"
+version = "3.0.0-DEV"
 
 [deps]
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
diff --git a/ci/Dockerfile b/ci/Dockerfile
deleted file mode 100644
index 4f7cc33..0000000
--- a/ci/Dockerfile
+++ /dev/null
@@ -1,21 +0,0 @@
-# This file is taken from:
-# https://github.com/kleinhenz/SlurmClusterManager.jl
-# Original author: Joseph Kleinhenz
-# License: MIT
-
-FROM jkleinh/slurm-cluster@sha256:afd20dafc831b0fa781460dc871232579ccf1b54955e434531394c331ce388e4 as base
-MAINTAINER Joseph Kleinhenz
-
-ARG JULIA_VERSION=1.6.0
-
-RUN mkdir -p /home/docker/.local/opt/julia \
-    && cd /home/docker/.local/opt/julia \
-    && folder="$(echo ${JULIA_VERSION} | cut -d. -f1-2)" \
-    && curl -L https://julialang-s3.julialang.org/bin/linux/x64/${folder}/julia-${JULIA_VERSION}-linux-x86_64.tar.gz | tar xz --strip 1 \
-    && /home/docker/.local/opt/julia/bin/julia --version
-
-ENV PATH="/home/docker/.local/opt/julia/bin:${PATH}"
-
-COPY --chown=docker . ClusterManagers
-
-CMD /bin/bash -l
diff --git a/ci/docker-compose.yml b/ci/docker-compose.yml
deleted file mode 100644
index 86b1df3..0000000
--- a/ci/docker-compose.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-# This file is taken from:
-# https://github.com/kleinhenz/SlurmClusterManager.jl
-# Original author: Joseph Kleinhenz
-# License: MIT
-
-version: "3.3"
-
-services:
-  slurmctld:
-    image: slurm-cluster-julia
-    command: ["slurmctld"]
-    container_name: slurmctld
-    hostname: slurmctld
-    volumes:
-      - slurm_jobdir:/home/docker
-      - var_log_slurm:/var/log/slurm
-    expose:
-      - "6817"
-
-  c1:
-    image: slurm-cluster-julia
-    command: ["slurmd"]
-    hostname: c1
-    container_name: c1
-    volumes:
-      - slurm_jobdir:/home/docker
-      - var_log_slurm:/var/log/slurm
-    expose:
-      - "6818"
-    depends_on:
-      - "slurmctld"
-
-  c2:
-    image: slurm-cluster-julia
-    command: ["slurmd"]
-    hostname: c2
-    container_name: c2
-    volumes:
-      - slurm_jobdir:/home/docker
-      - var_log_slurm:/var/log/slurm
-    expose:
-      - "6818"
-    depends_on:
-      - "slurmctld"
-
-volumes:
-  slurm_jobdir:
-  var_log_slurm:
diff --git a/ci/my_sbatch.sh b/ci/my_sbatch.sh
deleted file mode 100644
index 33d98a8..0000000
--- a/ci/my_sbatch.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-# Slurm options:
-#SBATCH --ntasks=2
-#SBATCH --time=00:10:00
-
-# Important note:
-# There should be no non-comment non-whitespace lines above this line.
-
-set -euf -o pipefail
-
-set -x
-
-julia --project=ClusterManagers -e 'import Pkg; Pkg.test(; test_args=["slurm"])'
diff --git a/ci/run_my_sbatch.sh b/ci/run_my_sbatch.sh
deleted file mode 100755
index 509a18d..0000000
--- a/ci/run_my_sbatch.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-set -euf -o pipefail
-
-set -x
-
-rm -fv "${HOME:?}/my_stdout.txt"
-rm -fv "${HOME:?}/my_stderr.txt"
-
-sbatch --wait --output="${HOME:?}/my_stdout.txt" --error="${HOME:?}/my_stderr.txt" ./ClusterManagers/ci/my_sbatch.sh
-
-sleep 5
-cat "${HOME:?}/my_stdout.txt"
-cat "${HOME:?}/my_stderr.txt"
diff --git a/slurm_test.jl b/slurm_test.jl
deleted file mode 100644
index ca8bc6b..0000000
--- a/slurm_test.jl
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx512/Compiler/gcc7.3/julia/1.1.0/bin/julia
-#SBATCH --time=00:05:00 # Running time of hours
-#SBATCH --ntasks=4
-#SBATCH --account=def-whitem
-
-
-using Logging, Distributed
-
-include("/home/mkschleg/mkschleg/ClusterManagers.jl/src/ClusterManagers.jl")
-
-ClusterManagers.addprocs_slurm(4; exeflags=["--project=.", "--color=yes"], job_file_loc="test_loc")
-
-@sync begin
-    @async @sync for job_id in collect(1:100) @spawn begin
-        println("Hello World $(job_id)")
-    end
-    end
-end
diff --git a/src/ClusterManagers.jl b/src/ClusterManagers.jl
index b03f3da..50d8bb3 100755
--- a/src/ClusterManagers.jl
+++ b/src/ClusterManagers.jl
@@ -16,7 +16,6 @@ worker_arg() = `--worker=$(worker_cookie())`
 include("qsub.jl")
 include("scyld.jl")
 include("condor.jl")
-include("slurm.jl")
 include("affinity.jl")
 
 end
diff --git a/src/slurm.jl b/src/slurm.jl
deleted file mode 100644
index 009448b..0000000
--- a/src/slurm.jl
+++ /dev/null
@@ -1,180 +0,0 @@
-# ClusterManager for Slurm
-
-export SlurmManager, addprocs_slurm
-
-import Logging.@warn
-
-struct SlurmManager <: ClusterManager
-    np::Integer
-    retry_delays
-end
-
-struct SlurmException <: Exception
-    msg
-end
-
-function launch(manager::SlurmManager, params::Dict, instances_arr::Array,
-                c::Condition)
-    let
-        msg = "The Slurm functionality in the `ClusterManagers.jl` package is deprecated " *
-              "(including `ClusterManagers.addprocs_slurm` and `ClusterManagers.SlurmManager`). " *
-              "It will be removed from ClusterManagers.jl in a future release. " *
-              "We recommend migrating to the " *
-              "[https://github.com/JuliaParallel/SlurmClusterManager.jl](https://github.com/JuliaParallel/SlurmClusterManager.jl) " *
-              "package instead."
-        Base.depwarn(msg, :SlurmManager; force = true)
-    end
-    try
-        exehome = params[:dir]
-        exename = params[:exename]
-        exeflags = params[:exeflags]
-
-        stdkeys = keys(Distributed.default_addprocs_params())
-
-        p = filter(x->(!(x[1] in stdkeys) && x[1] != :job_file_loc), params)
-
-        srunargs = []
-        for k in keys(p)
-            if length(string(k)) == 1
-                push!(srunargs, "-$k")
-                val = p[k]
-                if length(val) > 0
-                    push!(srunargs, "$(p[k])")
-                end
-            else
-                k2 = replace(string(k), "_"=>"-")
-                val = p[k]
-                if length(val) > 0
-                    push!(srunargs, "--$(k2)=$(p[k])")
-                else
-                    push!(srunargs, "--$(k2)")
-                end
-            end
-        end
-
-        # Get job file location from parameter dictionary.
-        job_file_loc = joinpath(exehome, get(params, :job_file_loc, "."))
-
-        # Make directory if not already made.
-        if !isdir(job_file_loc)
-            mkdir(job_file_loc)
-        end
-
-        # Check for given output file name
-        jobname = "julia-$(getpid())"
-        has_output_name = ("-o" in srunargs) | ("--output" in srunargs)
-        if has_output_name
-            loc = findfirst(x-> x == "-o" || x == "--output", srunargs)
-            job_output_name = srunargs[loc+1]
-            job_output_template = joinpath(job_file_loc, job_output_name)
-            srunargs[loc+1] = job_output_template
-        else
-            job_output_name = "$(jobname)-$(trunc(Int, Base.time() * 10))"
-            make_job_output_path(task_num) = joinpath(job_file_loc, "$(job_output_name)-$(task_num).out")
-            job_output_template = make_job_output_path("%4t")
-            push!(srunargs, "-o", job_output_template)
-        end
-
-        np = manager.np
-        srun_cmd = `srun -J $jobname -n $np -D $exehome $(srunargs) $exename $exeflags $(worker_arg())`
-
-        @info "Starting SLURM job $jobname: $srun_cmd"
-        srun_proc = open(srun_cmd)
-
-        slurm_spec_regex = r"([\w]+):([\d]+)#(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})"
-        could_not_connect_regex = r"could not connect"
-        exiting_regex = r"exiting."
-        retry_delays = manager.retry_delays
-
-        t_start = time()
-        t_waited = round(Int, time() - t_start)
-        for i = 0:np - 1
-            slurm_spec_match::Union{RegexMatch,Nothing} = nothing
-            worker_errors = String[]
-            if has_output_name
-                fn = job_output_template
-            else
-                fn = make_job_output_path(lpad(i, 4, "0"))
-            end
-            for retry_delay in push!(collect(retry_delays), 0)
-                t_waited = round(Int, time() - t_start)
-
-                # Wait for output log to be created and populated, then parse
-
-                if isfile(fn)
-                    if filesize(fn) > 0
-                        open(fn) do f
-                            # Due to error and warning messages, the specification
-                            # may not appear on the file's first line
-                            for line in eachline(f)
-                                re_match = match(slurm_spec_regex, line)
-                                if !isnothing(re_match)
-                                    slurm_spec_match = re_match
-                                end
-                                for expr in [could_not_connect_regex, exiting_regex]
-                                    if !isnothing(match(expr, line))
-                                        slurm_spec_match = nothing
-                                        push!(worker_errors, line)
-                                    end
-                                end
-                            end
-                        end
-                    end
-                    if !isempty(worker_errors) || !isnothing(slurm_spec_match)
-                        break # break if error or specification found
-                    else
-                        @info "Worker $i (after $t_waited s): Output file found, but no connection details yet"
-                    end
-                else
-                    @info "Worker $i (after $t_waited s): No output file \"$fn\" yet"
-                end
-
-                # Sleep for some time to limit resource usage while waiting for the job to start
-                sleep(retry_delay)
-            end
-
-            if !isempty(worker_errors)
-                throw(SlurmException("Worker $i failed after $t_waited s: $(join(worker_errors, " "))"))
-            elseif isnothing(slurm_spec_match)
-                throw(SlurmException("Timeout after $t_waited s while waiting for worker $i to get ready."))
-            end
-
-            config = WorkerConfig()
-            config.port = parse(Int, slurm_spec_match[2])
-            config.host = strip(slurm_spec_match[3])
-            @info "Worker $i ready after $t_waited s on host $(config.host), port $(config.port)"
-            # Keep a reference to the proc, so it's properly closed once
-            # the last worker exits.
-            config.userdata = srun_proc
-            push!(instances_arr, config)
-            notify(c)
-        end
-    catch e
-        @error "Error launching Slurm job"
-        rethrow(e)
-    end
-end
-
-function manage(manager::SlurmManager, id::Integer, config::WorkerConfig,
-                op::Symbol)
-    # This function needs to exist, but so far we don't do anything
-end
-
-SlurmManager(np::Integer) = SlurmManager(np, ExponentialBackOff(n=10, first_delay=1,
-                                                                max_delay=512, factor=2))
-
-"""
-Launch `np` workers on a cluster managed by slurm. `retry_delays` is a vector of
-numbers specifying in seconds how long to repeatedly wait for a worker to start.
-Defaults to an exponential backoff.
-
-# Examples
-
-```
-addprocs_slurm(100; retry_delays=Iterators.repeated(0.1))
-```
-"""
-addprocs_slurm(np::Integer;
-               retry_delays=ExponentialBackOff(n=10, first_delay=1,
-                                               max_delay=512, factor=2),
-               kwargs...) = addprocs(SlurmManager(np, retry_delays); kwargs...)
diff --git a/test/runtests.jl b/test/runtests.jl
index 334b825..4ccba9c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -9,8 +9,7 @@ using Distributed: workers, nworkers
 using Distributed: procs, nprocs
 using Distributed: remotecall_fetch, @spawnat
 using Test: @testset, @test, @test_skip
-# Slurm:
-using ClusterManagers: addprocs_slurm, SlurmManager
+
 # SGE:
 using ClusterManagers: addprocs_sge, SGEManager
 
@@ -18,23 +17,9 @@ const test_args = lowercase.(strip.(ARGS))
 
 @info "" test_args
 
-slurm_is_installed() = !isnothing(Sys.which("sbatch"))
 qsub_is_installed() = !isnothing(Sys.which("qsub"))
 
 @testset "ClusterManagers.jl" begin
-    if slurm_is_installed()
-        @info "Running the Slurm tests..." Sys.which("sbatch")
-        include("slurm.jl")
-    else
-        if "slurm" in test_args
-            @error "ERROR: The Slurm tests were explicitly requested in ARGS, but sbatch was not found, so the Slurm tests cannot be run" Sys.which("sbatch") test_args
-            @test false
-        else
-            @warn "sbatch was not found - Slurm tests will be skipped" Sys.which("sbatch")
-            @test_skip false
-        end
-    end
-
     if qsub_is_installed()
         @info "Running the SGE (via qsub) tests..." Sys.which("qsub")
         include("sge_qsub.jl")
diff --git a/test/slurm.jl b/test/slurm.jl
deleted file mode 100644
index 6e5928d..0000000
--- a/test/slurm.jl
+++ /dev/null
@@ -1,29 +0,0 @@
-@testset "Slurm" begin
-    mktempdir() do tmpdir
-        cd(tmpdir) do
-            outfile = joinpath(tmpdir, "my_slurm_job.out")
-            p = addprocs_slurm(1; o=outfile)
-            @test nprocs() == 2
-            @test workers() == p
-            @test fetch(@spawnat :any myid()) == p[1]
-            @test remotecall_fetch(+,p[1],1,1) == 2
-            rmprocs(p)
-            @test nprocs() == 1
-            @test workers() == [1]
-
-            # Check that `outfile` exists:
-            @test isfile(outfile)
-            # Check that `outfile` is not empty:
-            outfile_contents = read(outfile, String)
-            @test length(strip(outfile_contents)) > 5
-
-            println(Base.stderr, "# BEGIN: contents of my_slurm_job.out")
-            println(Base.stderr, outfile_contents)
-            println(Base.stderr, "# END: contents of my_slurm_job.out")
-
-            # No need to manually delete the `outfile` file.
-            # The entire `tmpdir` will automatically be removed when the `mktempdir() do ...` block ends.
-            # rm(outfile)
-        end
-    end
-end
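
Migration note (not part of the diff above): the deprecation message in the removed `src/slurm.jl` points users at SlurmClusterManager.jl. The following is a minimal, hypothetical sketch of the replacement usage, assuming SlurmClusterManager.jl is installed and the script runs inside an existing Slurm allocation (salloc/sbatch); unlike the removed `addprocs_slurm(np)`, its `SlurmManager()` takes the worker count from the Slurm environment rather than an `np` argument.

```julia
# Hypothetical migration sketch, not part of the diff above.
# Assumes SlurmClusterManager.jl is installed and this script is launched
# inside an existing Slurm allocation (e.g. via salloc or sbatch).
using Distributed
using SlurmClusterManager

# Removed API:  ClusterManagers.addprocs_slurm(4; exeflags=["--project=."])
# Replacement:  SlurmManager() reads the task count from the allocation,
#               so no worker count is passed; standard addprocs kwargs still apply.
addprocs(SlurmManager(); exeflags=["--project=.", "--color=yes"])

# Quick sanity check that all workers came up.
@everywhere println("Hello from worker $(myid()) on $(gethostname())")
```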