From bbba8a9ddb5cab343f8c5cc5b4ed16ae7a00780e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 22 Aug 2024 16:28:27 +0200 Subject: [PATCH 01/11] Try to change strategy: don't gather processor info ourselves, but make sure it gets autodetected _for every job we run_ --- reframe_config_bot.py.tmpl | 15 ++++++++------- test_suite.sh | 16 ++++++++++------ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/reframe_config_bot.py.tmpl b/reframe_config_bot.py.tmpl index 607373767a..dc94d97063 100644 --- a/reframe_config_bot.py.tmpl +++ b/reframe_config_bot.py.tmpl @@ -15,19 +15,20 @@ site_configuration = { 'modules_system': 'lmod', 'partitions': [ { - 'name': 'default', + 'name': '__JOBID__', 'scheduler': 'local', 'launcher': 'mpirun', 'environs': ['default'], 'features': [ FEATURES[CPU] ] + list(SCALES.keys()), - 'processor': { - 'num_cpus': __NUM_CPUS__, - 'num_sockets': __NUM_SOCKETS__, - 'num_cpus_per_core': __NUM_CPUS_PER_CORE__, - 'num_cpus_per_socket': __NUM_CPUS_PER_SOCKET__, - }, + # 'processor': { + # 'num_cpus': __NUM_CPUS__, + # 'num_sockets': __NUM_SOCKETS__, + # 'num_cpus_per_core': __NUM_CPUS_PER_CORE__, + # 'num_cpus_per_socket': __NUM_CPUS_PER_SOCKET__, + # 'num_cores_per_numa_node': __NUM_CORES_PER_NUMA_NODE__, + # }, 'resources': [ { 'name': 'memory', diff --git a/test_suite.sh b/test_suite.sh index c5a75c2d25..1901f4af43 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -201,13 +201,17 @@ else fi echo "Detected available memory: ${cgroup_mem_mib} MiB" -echo "Replacing detected system information in template ReFrame config file..." +# echo "Replacing detected system information in template ReFrame config file..." 
+# cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} +# sed -i "s/__NUM_CPUS__/${cpu_count}/g" $RFM_CONFIG_FILES +# sed -i "s/__NUM_SOCKETS__/${socket_count}/g" $RFM_CONFIG_FILES +# sed -i "s/__NUM_CPUS_PER_CORE__/${threads_per_core}/g" $RFM_CONFIG_FILES +# sed -i "s/__NUM_CPUS_PER_SOCKET__/${cores_per_socket}/g" $RFM_CONFIG_FILES +# sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES +echo "Replacing partition name in the template ReFrame config file, to trigger CPU autodetection for this job" cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} -sed -i "s/__NUM_CPUS__/${cpu_count}/g" $RFM_CONFIG_FILES -sed -i "s/__NUM_SOCKETS__/${socket_count}/g" $RFM_CONFIG_FILES -sed -i "s/__NUM_CPUS_PER_CORE__/${threads_per_core}/g" $RFM_CONFIG_FILES -sed -i "s/__NUM_CPUS_PER_SOCKET__/${cores_per_socket}/g" $RFM_CONFIG_FILES -sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES +sed -i "s/__JOBID__/${SLURM_JOB_ID}/g" $RFM_CONFIG_FILES + # Make debugging easier by printing the final config file: echo "Final config file (after replacements):" cat "${RFM_CONFIG_FILES}" From 784cfa4e5dfd0029d856888c6aa7a94fd37b8f94 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 22 Aug 2024 16:34:57 +0200 Subject: [PATCH 02/11] Make sure CPU topology file gets cleaned up --- reframe_config_bot.py.tmpl | 2 +- test_suite.sh | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/reframe_config_bot.py.tmpl b/reframe_config_bot.py.tmpl index dc94d97063..ff9d627c24 100644 --- a/reframe_config_bot.py.tmpl +++ b/reframe_config_bot.py.tmpl @@ -15,7 +15,7 @@ site_configuration = { 'modules_system': 'lmod', 'partitions': [ { - 'name': '__JOBID__', + 'name': '__RFM_PARTITION__', 'scheduler': 'local', 'launcher': 'mpirun', 'environs': ['default'], diff --git a/test_suite.sh b/test_suite.sh index 1901f4af43..7c8438d30e 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -210,7 +210,8 @@ echo "Detected available memory: ${cgroup_mem_mib} MiB" # sed -i 
"s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES echo "Replacing partition name in the template ReFrame config file, to trigger CPU autodetection for this job" cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} -sed -i "s/__JOBID__/${SLURM_JOB_ID}/g" $RFM_CONFIG_FILES +RFM_PARTITION="$SLURM_JOB_ID" +sed -i "s/__RFM_PARTITION__/${RFM_PARTITION}/g" $RFM_CONFIG_FILES # Make debugging easier by printing the final config file: echo "Final config file (after replacements):" @@ -250,4 +251,8 @@ fi echo ">> Cleaning up ${TMPDIR}..." rm -r ${TMPDIR} +RFM_SYSTEM=$(python3 -c "import ${RFM_CONFIG_FILES}; site_configuration['systems'][0]['name']") +RFM_TOPOLOGY_FILE="${HOME}/.reframe/topology/${RFM_SYSTEM}-${RFM_PARTITION}/processor.json" +echo ">> Cleaning up ReFrame CPU topology file" + exit ${reframe_exit_code} From ba21ef5324628b9350ad64ed5159722d46008ed8 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 22 Aug 2024 16:40:01 +0200 Subject: [PATCH 03/11] Fix actual cleanup... --- test_suite.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test_suite.sh b/test_suite.sh index 7c8438d30e..f0ba64eda2 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -252,7 +252,8 @@ echo ">> Cleaning up ${TMPDIR}..." 
rm -r ${TMPDIR} RFM_SYSTEM=$(python3 -c "import ${RFM_CONFIG_FILES}; site_configuration['systems'][0]['name']") -RFM_TOPOLOGY_FILE="${HOME}/.reframe/topology/${RFM_SYSTEM}-${RFM_PARTITION}/processor.json" +RFM_TOPOLOGY_DIR="${HOME}/.reframe/topology/${RFM_SYSTEM}-${RFM_PARTITION}" echo ">> Cleaning up ReFrame CPU topology file" +rm -rf ${RFM_TOPOLOGY_DIR} exit ${reframe_exit_code} From 011391a29c5a0e4a6b9aa40e742243d3bae2d247 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 22 Aug 2024 16:50:04 +0200 Subject: [PATCH 04/11] Still needed to replace memory limits, doing that now --- test_suite.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_suite.sh b/test_suite.sh index f0ba64eda2..ffbfa1887a 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -202,14 +202,14 @@ fi echo "Detected available memory: ${cgroup_mem_mib} MiB" # echo "Replacing detected system information in template ReFrame config file..." -# cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} +cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} # sed -i "s/__NUM_CPUS__/${cpu_count}/g" $RFM_CONFIG_FILES # sed -i "s/__NUM_SOCKETS__/${socket_count}/g" $RFM_CONFIG_FILES # sed -i "s/__NUM_CPUS_PER_CORE__/${threads_per_core}/g" $RFM_CONFIG_FILES # sed -i "s/__NUM_CPUS_PER_SOCKET__/${cores_per_socket}/g" $RFM_CONFIG_FILES -# sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES +sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES echo "Replacing partition name in the template ReFrame config file, to trigger CPU autodetection for this job" -cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} +# cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} RFM_PARTITION="$SLURM_JOB_ID" sed -i "s/__RFM_PARTITION__/${RFM_PARTITION}/g" $RFM_CONFIG_FILES From 4463d73b49e450ce687063043983e253cba12b48 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 22 Aug 2024 17:03:56 +0200 Subject: [PATCH 05/11] ReFrame forces me to start the partition name with an a-z 
character --- test_suite.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_suite.sh b/test_suite.sh index ffbfa1887a..1e7d0ba9a9 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -210,7 +210,7 @@ cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES echo "Replacing partition name in the template ReFrame config file, to trigger CPU autodetection for this job" # cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} -RFM_PARTITION="$SLURM_JOB_ID" +RFM_PARTITION="job-$SLURM_JOB_ID" sed -i "s/__RFM_PARTITION__/${RFM_PARTITION}/g" $RFM_CONFIG_FILES # Make debugging easier by printing the final config file: From 2f11592d1bb00aa21bd0b58f5aef306201ea2736 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 22 Aug 2024 17:08:30 +0200 Subject: [PATCH 06/11] Get the partition name from the environment. This way, we only need to auto-detect once per partition. Then, the file is cached and available to be used in the next run! --- test_suite.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/test_suite.sh b/test_suite.sh index 1e7d0ba9a9..6a30a4318e 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -210,7 +210,7 @@ cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES echo "Replacing partition name in the template ReFrame config file, to trigger CPU autodetection for this job" # cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} -RFM_PARTITION="job-$SLURM_JOB_ID" +RFM_PARTITION="${SLURM_JOB_PARTITION}" sed -i "s/__RFM_PARTITION__/${RFM_PARTITION}/g" $RFM_CONFIG_FILES # Make debugging easier by printing the final config file: @@ -251,9 +251,4 @@ fi echo ">> Cleaning up ${TMPDIR}..." 
rm -r ${TMPDIR} -RFM_SYSTEM=$(python3 -c "import ${RFM_CONFIG_FILES}; site_configuration['systems'][0]['name']") -RFM_TOPOLOGY_DIR="${HOME}/.reframe/topology/${RFM_SYSTEM}-${RFM_PARTITION}" -echo ">> Cleaning up ReFrame CPU topology file" -rm -rf ${RFM_TOPOLOGY_DIR} - exit ${reframe_exit_code} From 7eacbf494c5e1e4ea6c9108d0e726fadfde3a8c6 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 22 Aug 2024 17:10:33 +0200 Subject: [PATCH 07/11] Enable remote detect. I'm not sure if this is correct, since we are using local scheduler. But right now, it doesn't seem to detect anything --- reframe_config_bot.py.tmpl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/reframe_config_bot.py.tmpl b/reframe_config_bot.py.tmpl index ff9d627c24..8f3c74134b 100644 --- a/reframe_config_bot.py.tmpl +++ b/reframe_config_bot.py.tmpl @@ -57,8 +57,7 @@ site_configuration = { { 'purge_environment': True, 'resolve_module_conflicts': False, # avoid loading the module before submitting the job - # disable automatic detection of CPU architecture (since we're using local scheduler) - 'remote_detect': False, + 'remote_detect': True, } ], 'logging': common_logging_config(), From c6056abf2c207d1c7d8543218a97a0718299f42e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 22 Aug 2024 17:17:11 +0200 Subject: [PATCH 08/11] Cleanup of code that is no longer needed --- test_suite.sh | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test_suite.sh b/test_suite.sh index 6a30a4318e..de1c64d977 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -203,14 +203,10 @@ echo "Detected available memory: ${cgroup_mem_mib} MiB" # echo "Replacing detected system information in template ReFrame config file..."
cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} -# sed -i "s/__NUM_CPUS__/${cpu_count}/g" $RFM_CONFIG_FILES -# sed -i "s/__NUM_SOCKETS__/${socket_count}/g" $RFM_CONFIG_FILES -# sed -i "s/__NUM_CPUS_PER_CORE__/${threads_per_core}/g" $RFM_CONFIG_FILES -# sed -i "s/__NUM_CPUS_PER_SOCKET__/${cores_per_socket}/g" $RFM_CONFIG_FILES +echo "Replacing memory limit in the ReFrame config file with the detected CGROUP memory limit: ${cgroup_mem_mib} MiB" sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES -echo "Replacing partition name in the template ReFrame config file, to trigger CPU autodetection for this job" -# cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} RFM_PARTITION="${SLURM_JOB_PARTITION}" +echo "Replacing partition name in the template ReFrame config file: ${RFM_PARTITION}" sed -i "s/__RFM_PARTITION__/${RFM_PARTITION}/g" $RFM_CONFIG_FILES # Make debugging easier by printing the final config file: From bbc3566bb6427b2f04f1e4917d01317e92fcb3d4 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 22 Aug 2024 17:19:32 +0200 Subject: [PATCH 09/11] Cleanup comments --- reframe_config_bot.py.tmpl | 7 ------- 1 file changed, 7 deletions(-) diff --git a/reframe_config_bot.py.tmpl b/reframe_config_bot.py.tmpl index 8f3c74134b..323aafd5ec 100644 --- a/reframe_config_bot.py.tmpl +++ b/reframe_config_bot.py.tmpl @@ -22,13 +22,6 @@ site_configuration = { 'features': [ FEATURES[CPU] ] + list(SCALES.keys()), - # 'processor': { - # 'num_cpus': __NUM_CPUS__, - # 'num_sockets': __NUM_SOCKETS__, - # 'num_cpus_per_core': __NUM_CPUS_PER_CORE__, - # 'num_cpus_per_socket': __NUM_CPUS_PER_SOCKET__, - # 'num_cores_per_numa_node': __NUM_CORES_PER_NUMA_NODE__, - # }, 'resources': [ { 'name': 'memory', From 238b4074cdb3c8fe6f28681e900f217b25ef0bfe Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 3 Sep 2024 18:10:00 +0200 Subject: [PATCH 10/11] Cleanup of code that is now no longer needed since we use ReFrame's CPU autodetection --- test_suite.sh | 28 
---------------------------- 1 file changed, 28 deletions(-) diff --git a/test_suite.sh b/test_suite.sh index de1c64d977..f42c164872 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -141,34 +141,6 @@ export RFM_PREFIX=$PWD/reframe_runs echo "Configured reframe with the following environment variables:" env | grep "RFM_" -# Inject correct CPU/memory properties into the ReFrame config file -echo "Collecting system-specific input for the ReFrame configuration file" -cpuinfo=$(lscpu) -if [[ "${cpuinfo}" =~ CPU\(s\):[^0-9]*([0-9]+) ]]; then - cpu_count=${BASH_REMATCH[1]} - echo "Detected CPU count: ${cpu_count}" -else - fatal_error "Failed to get the number of CPUs for the current test hardware with lscpu." -fi -if [[ "${cpuinfo}" =~ Socket\(s\):[^0-9]*([0-9]+) ]]; then - socket_count=${BASH_REMATCH[1]} - echo "Detected socket count: ${socket_count}" -else - fatal_error "Failed to get the number of sockets for the current test hardware with lscpu." -fi -if [[ "${cpuinfo}" =~ (Thread\(s\) per core:[^0-9]*([0-9]+)) ]]; then - threads_per_core=${BASH_REMATCH[2]} - echo "Detected threads per core: ${threads_per_core}" -else - fatal_error "Failed to get the number of threads per core for the current test hardware with lscpu." -fi -if [[ "${cpuinfo}" =~ (Core\(s\) per socket:[^0-9]*([0-9]+)) ]]; then - cores_per_socket=${BASH_REMATCH[2]} - echo "Detected cores per socket: ${cores_per_socket}" -else - fatal_error "Failed to get the number of cores per socket for the current test hardware with lscpu." -fi - # The /sys inside the container is not the same as the /sys of the host # We want to extract the memory limit from the cgroup on the host (which is typically set by SLURM). 
# Thus, bot/test.sh bind-mounts the host's /sys/fs/cgroup into /hostsys/fs/cgroup From c6e0cc29a7badc4d32bc9eca45e9976bd7fe55fc Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 4 Sep 2024 10:52:17 +0200 Subject: [PATCH 11/11] Remove old comment --- test_suite.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/test_suite.sh b/test_suite.sh index f42c164872..31f85f60fd 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -173,7 +173,6 @@ else fi echo "Detected available memory: ${cgroup_mem_mib} MiB" -# echo "Replacing detected system information in template ReFrame config file..." cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} echo "Replacing memory limit in the ReFrame config file with the detected CGROUP memory limit: ${cgroup_mem_mib} MiB" sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES