Skip to content

Allow for CUDA compute capability fallbacks when initialising EESSI #1115

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions .github/workflows/scripts/verify_eessi_environment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import os

import os
import sys

class EnvVarError(Exception):
    """Custom exception for environment variable comparison errors.

    The message supplied by the caller is prefixed with
    ``ENV VALIDATION ERROR: `` before being handed to ``Exception``.
    """

    def __init__(self, message: str) -> None:
        """Build the exception from the caller's description of the failure.

        Args:
            message: Human-readable description of the failed check.
        """
        super().__init__(f"ENV VALIDATION ERROR: {message}")

def get_env_vars(var1: str, var2: str):
    """Return the values of two environment variables.

    Args:
        var1: Name of the first environment variable.
        var2: Name of the second environment variable.

    Returns:
        A ``(val1, val2)`` tuple holding the values of ``var1`` and ``var2``.

    Raises:
        EnvVarError: If either variable is not set. When both are missing,
            ``var1`` is the one reported.
    """
    values = []
    # Checked in argument order so the first missing name is the one reported.
    for name in (var1, var2):
        value = os.environ.get(name)
        if value is None:
            raise EnvVarError(f"Missing environment variable: '{name}'")
        values.append(value)
    return tuple(values)

def check_env_equals(var1: str, var2: str) -> None:
    """Verify that two environment variables hold the same value.

    Args:
        var1: Name of the first environment variable.
        var2: Name of the environment variable it must be equal to.

    Raises:
        EnvVarError: If either variable is unset (raised by
            ``get_env_vars``) or if the two values differ.
    """
    val1, val2 = get_env_vars(var1, var2)
    if val1 != val2:
        raise EnvVarError(f"'{var1}' must equal '{var2}':\n{var1}='{val1}'\n{var2}='{val2}'")

def check_env_contains(var1: str, var2: str) -> None:
    """Verify that the value of ``var1`` contains the value of ``var2``.

    Args:
        var1: Name of the environment variable whose value is searched.
        var2: Name of the environment variable whose value must appear as a
            substring of ``var1``'s value.

    Raises:
        EnvVarError: If either variable is unset (raised by
            ``get_env_vars``), if ``var2`` is empty, or if ``var2``'s value
            does not occur in ``var1``'s value.
    """
    val1, val2 = get_env_vars(var1, var2)
    # An empty string is a substring of every string, so without this guard
    # an empty var2 would make the check pass without verifying anything.
    if not val2:
        raise EnvVarError(f"'{var2}' is empty, cannot check that '{var1}' contains it:\n{var1}='{val1}'")
    if val2 not in val1:
        raise EnvVarError(f"'{var1}' must contain '{var2}':\n{var1}='{val1}'\n{var2}='{val2}'")

def check_env_endswith(var1: str, var2: str) -> None:
    """Verify that the value of ``var1`` ends with the value of ``var2``.

    Args:
        var1: Name of the environment variable whose value is inspected.
        var2: Name of the environment variable whose value must be the
            suffix of ``var1``'s value.

    Raises:
        EnvVarError: If either variable is unset (raised by
            ``get_env_vars``), if ``var2`` is empty, or if ``var1``'s value
            does not end with ``var2``'s value.
    """
    val1, val2 = get_env_vars(var1, var2)
    # str.endswith("") is always True, so without this guard an empty var2
    # would make the check pass without verifying anything.
    if not val2:
        raise EnvVarError(f"'{var2}' is empty, cannot check that '{var1}' ends with it:\n{var1}='{val1}'")
    if not val1.endswith(val2):
        raise EnvVarError(f"'{var1}' must end with '{var2}':\n{var1}='{val1}'\n{var2}='{val2}'")

def main() -> int:
    """Check that the EESSI environment variables are mutually consistent.

    Reads ``EESSI_SOFTWARE_SUBDIR_OVERRIDE`` and
    ``EESSI_ACCELERATOR_TARGET_OVERRIDE`` as the starting point and compares
    them against the other ``EESSI_*`` variables and ``MODULEPATH``. The
    accelerator checks only run when ``EESSI_ACCELERATOR_TARGET_OVERRIDE`` is
    set to a non-empty value.

    Returns:
        0 when every check passes, 1 when a check fails (the failure message
        is printed to stderr). Checking stops at the first failure.
    """
    try:
        # accelerator stuff is not guaranteed to exist
        expected_eessi_accel_arch = os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE", default=None)

        # Verify the software and accelerator targets are set correctly
        if os.getenv("EESSI_SOFTWARE_SUBDIR_OVERRIDE", default=None):
            check_env_equals("EESSI_SOFTWARE_SUBDIR_OVERRIDE", "EESSI_SOFTWARE_SUBDIR")
        if expected_eessi_accel_arch:
            # EESSI_ACCEL_SUBDIR is what is detected by archdetect (or respects EESSI_ACCELERATOR_TARGET_OVERRIDE)
            check_env_equals("EESSI_ACCELERATOR_TARGET_OVERRIDE", "EESSI_ACCEL_SUBDIR")
            # special case is where EESSI_ACCELERATOR_TARGET_OVERRIDE may not match the final
            # accelerator architecture chosen (in CI we deliberately choose a non-existent CUDA
            # compute capability for one case)
            # The expected target is the override with its last character replaced by "0";
            # it is stored in the environment because the check helpers compare
            # environment variables by name.
            os.environ["EESSI_FINAL_CC"] = expected_eessi_accel_arch[:-1] + "0"
            check_env_equals("EESSI_ACCELERATOR_TARGET", "EESSI_FINAL_CC")

        # verify the software paths that should exist
        check_env_endswith("EESSI_SOFTWARE_PATH", "EESSI_SOFTWARE_SUBDIR")
        check_env_endswith("EESSI_SITE_SOFTWARE_PATH", "EESSI_SOFTWARE_SUBDIR")

        # verify the module paths that should exist
        check_env_contains("EESSI_MODULEPATH", "EESSI_SOFTWARE_SUBDIR")
        check_env_contains("EESSI_SITE_MODULEPATH", "EESSI_SOFTWARE_SUBDIR")
        if expected_eessi_accel_arch:
            check_env_contains("EESSI_MODULEPATH_ACCEL", "EESSI_SOFTWARE_SUBDIR")
            check_env_contains("EESSI_SITE_MODULEPATH_ACCEL", "EESSI_SOFTWARE_SUBDIR")
            check_env_contains("EESSI_MODULEPATH_ACCEL", "EESSI_ACCELERATOR_TARGET")
            check_env_contains("EESSI_SITE_MODULEPATH_ACCEL", "EESSI_ACCELERATOR_TARGET")

        # Finally, verify that all the expected module paths are included
        check_env_contains("MODULEPATH", "EESSI_MODULEPATH")
        check_env_contains("MODULEPATH", "EESSI_SITE_MODULEPATH")
        if expected_eessi_accel_arch:
            check_env_contains("MODULEPATH", "EESSI_MODULEPATH_ACCEL")
            check_env_contains("MODULEPATH", "EESSI_SITE_MODULEPATH_ACCEL")

        # We are done
        print("Environment variable check passed.")
    except EnvVarError as e:
        print(str(e), file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
29 changes: 22 additions & 7 deletions .github/workflows/tests_eessi_module.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,9 @@ jobs:
- x86_64/amd/zen3
- x86_64/amd/zen4
EESSI_ACCELERATOR_TARGET_OVERRIDE:
- accel/nvidia/cc80
- accel/nvidia/cc80
# This should fall back to cc70
- accel/nvidia/cc77
steps:
- name: Check out software-layer repository
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
Expand Down Expand Up @@ -102,21 +104,26 @@ jobs:
# Turn on debug output in case we want to take a look
export EESSI_DEBUG_INIT=true
CPU_ARCH=$(./init/eessi_archdetect.sh -a cpupath)
export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="dummy/cpu:${CPU_ARCH}:dummy1/cpu1"
module load EESSI/${{matrix.EESSI_VERSION}}
# EESSI_ARCHDETECT_OPTIONS_OVERRIDE/EESSI_DEBUG_INIT only relevant for Lmod init
unset EESSI_ARCHDETECT_OPTIONS_OVERRIDE
# EESSI_DEBUG_INIT/EESSI_ARCHDETECT_OPTIONS only relevant for Lmod init
unset EESSI_DEBUG_INIT
# Store all relevant environment variables
env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH)' | sort > "${moduleoutfile}"
env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH|^MODULEPATH)' | grep -v EESSI_ARCHDETECT_OPTIONS | sort > "${moduleoutfile}"
module unload EESSI/${{matrix.EESSI_VERSION}}

# We should only have two EESSI_* variables defined (which set the overrides)
if [ "$(env | grep -c '^EESSI')" -ne 2 ]; then
echo "Expected 2 EESSI-related environment variables, but found a different number."
env | grep '^EESSI'
exit 1
fi

# Now do the init script initialisation
source ./init/bash
# source script version sets environment variables to force archdetect, ignore these
unset EESSI_USE_ARCHSPEC
unset EESSI_USE_ARCHDETECT
env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH)' | sort > "${sourceoutfile}"
env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH|^MODULEPATH)' | sort > "${sourceoutfile}"

# Now compare the two results
echo ""
Expand Down Expand Up @@ -149,6 +156,8 @@ jobs:
EESSI_ACCELERATOR_TARGET_OVERRIDE:
- none
- accel/nvidia/cc80
# This should fall back to cc70
- accel/nvidia/cc77
steps:
- name: Check out software-layer repository
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
Expand Down Expand Up @@ -181,7 +190,7 @@ jobs:
initial_env_file="initial_env.txt"
module_cycled_file="load_unload_cycle.txt"

# prepare Lmod, resetting it in a roundabout given we don't want defaults set
# prepare Lmod, resetting it in a roundabout way given we don't want defaults set
export MODULEPATH=init/modules:.github/workflows/modules
module load fake_module
module purge
Expand All @@ -205,3 +214,9 @@ jobs:
diff --unified=0 "${initial_env_file}" "${module_cycled_file}"
exit 1
fi

module load EESSI/${{matrix.EESSI_VERSION}}
# Make sure our CPU and GPU architectures are what we expect
# (script uses EESSI_SOFTWARE_SUBDIR_OVERRIDE and EESSI_ACCELERATOR_TARGET_OVERRIDE
# as the starting point for the comparison)
python .github/workflows/scripts/verify_eessi_environment.py
10 changes: 8 additions & 2 deletions init/bash
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,18 @@ if [ $? -eq 0 ]; then
# prepend location of modules for EESSI software stack to $MODULEPATH
show_msg "Prepending $EESSI_MODULEPATH to \$MODULEPATH..."
module use $EESSI_MODULEPATH

if [ ! -z ${EESSI_MODULEPATH_ACCEL} ]; then
show_msg "Prepending $EESSI_MODULEPATH_ACCEL to \$MODULEPATH..."
module use $EESSI_MODULEPATH_ACCEL
fi

show_msg "Prepending site path $EESSI_SITE_MODULEPATH to \$MODULEPATH..."
module use $EESSI_SITE_MODULEPATH

if [ ! -z ${EESSI_MODULEPATH_ACCEL} ]; then
show_msg "Prepending $EESSI_MODULEPATH_ACCEL to \$MODULEPATH..."
module use $EESSI_MODULEPATH_ACCEL
show_msg "Prepending $EESSI_SITE_MODULEPATH_ACCEL to \$MODULEPATH..."
module use $EESSI_SITE_MODULEPATH_ACCEL
fi

#show_msg ""
Expand Down
23 changes: 18 additions & 5 deletions init/eessi_environment_variables
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,18 @@ if [ -d $EESSI_PREFIX ]; then
EESSI_ACCEL_SOFTWARE_SUBDIR=${EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE:-$EESSI_SOFTWARE_SUBDIR}
# path to where accel/* subdirectory is located
EESSI_ACCEL_SOFTWARE_PATH=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_ACCEL_SOFTWARE_SUBDIR}
if [ -d $EESSI_ACCEL_SOFTWARE_PATH/${EESSI_ACCEL_SUBDIR} ]; then
show_msg "archdetect found supported accelerator for CPU target ${EESSI_ACCEL_SOFTWARE_SUBDIR}: ${EESSI_ACCEL_SUBDIR}"
if [ ! -d $EESSI_ACCEL_SOFTWARE_PATH/${EESSI_ACCEL_SUBDIR} ]; then
# We should try to use the fallback compute capability
EESSI_ACCELERATOR_TARGET="${EESSI_ACCEL_SUBDIR::-1}0"
show_msg "archdetect found no supported accelerator ${EESSI_ACCEL_SUBDIR}, falling back to ${EESSI_ACCELERATOR_TARGET}"
else
show_msg "No matching path found in ${EESSI_ACCEL_SOFTWARE_SUBDIR} for accelerator detected by archdetect (${EESSI_ACCEL_SUBDIR})"
EESSI_ACCELERATOR_TARGET="${EESSI_ACCEL_SUBDIR}"
fi
if [ -d $EESSI_ACCEL_SOFTWARE_PATH/${EESSI_ACCELERATOR_TARGET} ]; then
show_msg "archdetect found supported accelerator for CPU target ${EESSI_ACCEL_SOFTWARE_SUBDIR}: ${EESSI_ACCELERATOR_TARGET}"
export EESSI_ACCELERATOR_TARGET
else
show_msg "No matching path found in ${EESSI_ACCEL_SOFTWARE_SUBDIR} for accelerator detected by archdetect (${EESSI_ACCELERATOR_TARGET})"
fi
fi
else
Expand All @@ -95,6 +103,7 @@ if [ -d $EESSI_PREFIX ]; then
lmod_rc_file="$LMOD_CONFIG_DIR/lmodrc.lua"
if [ -f $lmod_rc_file ]; then
show_msg "Found Lmod configuration file at $lmod_rc_file"
export LMOD_RC="$lmod_rc_file"
else
error "Lmod configuration file not found at $lmod_rc_file"
fi
Expand All @@ -112,6 +121,8 @@ if [ -d $EESSI_PREFIX ]; then
elif [ -d $EESSI_SOFTWARE_PATH ]; then
export EESSI_SITE_SOFTWARE_PATH=${EESSI_SOFTWARE_PATH/versions/host_injections}
show_msg "Using ${EESSI_SITE_SOFTWARE_PATH} as the site extension directory for installations."
EESSI_SITE_ACCEL_SOFTWARE_PATH=${EESSI_ACCEL_SOFTWARE_PATH/versions/host_injections}
show_msg "Using ${EESSI_SITE_ACCEL_SOFTWARE_PATH} as the site extension directory for accelerated installations."
# Allow for use of alternative module tree shipped with EESSI
if [ -z ${EESSI_MODULE_SUBDIR+x} ]; then
# EESSI_MODULE_SUBDIR not set
Expand All @@ -137,9 +148,11 @@ if [ -d $EESSI_PREFIX ]; then
false
fi

if [ -d ${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCEL_SUBDIR}/${EESSI_MODULE_SUBDIR} ]; then
export EESSI_MODULEPATH_ACCEL=${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCEL_SUBDIR}/${EESSI_MODULE_SUBDIR}
if [ -d ${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCELERATOR_TARGET}/${EESSI_MODULE_SUBDIR} ]; then
export EESSI_MODULEPATH_ACCEL=${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCELERATOR_TARGET}/${EESSI_MODULE_SUBDIR}
show_msg "Using ${EESSI_MODULEPATH_ACCEL} as additional directory (for accelerators) to be added to MODULEPATH."
export EESSI_SITE_MODULEPATH_ACCEL=${EESSI_SITE_ACCEL_SOFTWARE_PATH}/${EESSI_ACCELERATOR_TARGET}/${EESSI_MODULE_SUBDIR}
show_msg "Using ${EESSI_SITE_MODULEPATH_ACCEL} as additional site extension directory (for accelerators) to be added to MODULEPATH."
fi

# Fix wrong path for RHEL >=8 libcurl
Expand Down
26 changes: 23 additions & 3 deletions init/modules/EESSI/2023.06.lua
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,14 @@ function archdetect_accel()
local script = pathJoin(eessi_prefix, 'init', 'lmod_eessi_archdetect_wrapper_accel.sh')
-- for unload mode, we need to grab the value before it is unset
local archdetect_accel = os.getenv("EESSI_ACCEL_SUBDIR") or (os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE") or "")
if not os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE ") then
if not os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE") then
if convertToCanonical(LmodVersion()) < convertToCanonical("8.6") then
LmodError("Loading this modulefile requires using Lmod version >= 8.6, but you can export EESSI_ACCELERATOR_TARGET_OVERRIDE to the available accelerator architecture in the form of: accel/nvidia/cc80")
end
-- this script sets EESSI_ACCEL_SUBDIR
source_sh("bash", script)
else
setenv("EESSI_ACCEL_SUBDIR", os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE"))
end
archdetect_accel = os.getenv("EESSI_ACCEL_SUBDIR") or archdetect_accel
eessiDebug("Got archdetect accel option: " .. archdetect_accel)
Expand Down Expand Up @@ -140,16 +143,33 @@ if not (archdetect_accel == nil or archdetect_accel == '') then
-- /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3/accel/nvidia/cc80/modules/all
eessi_module_path_accel = pathJoin(eessi_accel_software_path, archdetect_accel, eessi_modules_subdir)
eessiDebug("Checking if " .. eessi_module_path_accel .. " exists")
if not isDir(eessi_module_path_accel) then
-- fall back to major version GPU arch if the exact one is not an option (i.e, 7.5 -> 7.0)
local original_archdetect_accel = archdetect_accel
archdetect_accel = archdetect_accel:sub(1,-2) .. "0"
eessiDebug("No directory for " .. original_archdetect_accel .. ", trying " .. archdetect_accel)
eessi_module_path_accel = pathJoin(eessi_accel_software_path, archdetect_accel, eessi_modules_subdir)
end
if isDir(eessi_module_path_accel) then
-- set the accelerator target based on what actually exists
setenv("EESSI_ACCELERATOR_TARGET", archdetect_accel)
setenv("EESSI_MODULEPATH_ACCEL", eessi_module_path_accel)
prepend_path("MODULEPATH", eessi_module_path_accel)
eessiDebug("Using acclerator modules at: " .. eessi_module_path_accel)
if ( mode() ~= "spider" ) then
prepend_path("MODULEPATH", eessi_module_path_accel)
eessiDebug("Using accelerator modules at: " .. eessi_module_path_accel)
end
end
end

-- prepend the site module path last so it has priority
prepend_path("MODULEPATH", eessi_site_module_path)
eessiDebug("Adding " .. eessi_site_module_path .. " to MODULEPATH")
if isDir(eessi_module_path_accel) then
eessi_module_path_site_accel = string.gsub(eessi_module_path_accel, "versions", "host_injections")
setenv("EESSI_SITE_MODULEPATH_ACCEL", eessi_module_path_site_accel)
prepend_path("MODULEPATH", eessi_module_path_site_accel)
eessiDebug("Using site accelerator modules at: " .. eessi_module_path_site_accel)
end
if mode() == "load" then
LmodMessage("EESSI/" .. eessi_version .. " loaded successfully")
end