# NOTE(review): this patch was re-flowed from a whitespace-collapsed copy. Indentation and the
# position of blank context lines were reconstructed from syntax and the hunk counts; confirm
# them against the target files (or apply with `patch -l`). The blob ids on the `index` lines
# predate the review fixes listed here and are stale.
# Review fixes: dropped the duplicate `import os` and added docstrings in the new Python script
# (hunk is now 85 lines), fixed comment typos, and made init/bash guard the site accelerator
# module path on EESSI_SITE_MODULEPATH_ACCEL (the variable it actually uses).
diff --git a/.github/workflows/scripts/verify_eessi_environment.py b/.github/workflows/scripts/verify_eessi_environment.py
new file mode 100644
index 0000000000..51990c013f
--- /dev/null
+++ b/.github/workflows/scripts/verify_eessi_environment.py
@@ -0,0 +1,85 @@
+"""Check that EESSI-related environment variables are consistent with each other."""
+import os
+import sys
+
+
+class EnvVarError(Exception):
+    """Custom exception for environment variable comparison errors."""
+
+    def __init__(self, message):
+        super().__init__(f"ENV VALIDATION ERROR: {message}")
+
+
+def get_env_vars(var1, var2):
+    """Return the values of both variables; raise EnvVarError if either is unset."""
+    val1 = os.environ.get(var1)
+    val2 = os.environ.get(var2)
+
+    if val1 is None:
+        raise EnvVarError(f"Missing environment variable: '{var1}'")
+    if val2 is None:
+        raise EnvVarError(f"Missing environment variable: '{var2}'")
+
+    return val1, val2
+
+
+def check_env_equals(var1, var2):
+    """Raise EnvVarError unless the values of var1 and var2 are identical."""
+    val1, val2 = get_env_vars(var1, var2)
+    if val1 != val2:
+        raise EnvVarError(f"'{var1}' must equal '{var2}':\n{var1}='{val1}'\n{var2}='{val2}'")
+
+
+def check_env_contains(var1, var2):
+    """Raise EnvVarError unless the value of var2 occurs within the value of var1."""
+    val1, val2 = get_env_vars(var1, var2)
+    if val2 not in val1:
+        raise EnvVarError(f"'{var1}' must contain '{var2}':\n{var1}='{val1}'\n{var2}='{val2}'")
+
+
+def check_env_endswith(var1, var2):
+    """Raise EnvVarError unless the value of var1 ends with the value of var2."""
+    val1, val2 = get_env_vars(var1, var2)
+    if not val1.endswith(val2):
+        raise EnvVarError(f"'{var1}' must end with '{var2}':\n{var1}='{val1}'\n{var2}='{val2}'")
+
+
+if __name__ == "__main__":
+    try:
+        # accelerator stuff is not guaranteed to exist
+        expected_eessi_accel_arch = os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE", default=None)
+
+        # Verify the software and accelerator targets are set correctly
+        if os.getenv("EESSI_SOFTWARE_SUBDIR_OVERRIDE", default=None):
+            check_env_equals("EESSI_SOFTWARE_SUBDIR_OVERRIDE", "EESSI_SOFTWARE_SUBDIR")
+        if expected_eessi_accel_arch:
+            # EESSI_ACCEL_SUBDIR is what is detected by archdetect (or respects EESSI_ACCELERATOR_TARGET_OVERRIDE)
+            check_env_equals("EESSI_ACCELERATOR_TARGET_OVERRIDE", "EESSI_ACCEL_SUBDIR")
+            # special case is where EESSI_ACCELERATOR_TARGET_OVERRIDE may not match the final
+            # accelerator architecture chosen (in CI we deliberately choose a non-existent CUDA
+            # compute capability for one case)
+            os.environ["EESSI_FINAL_CC"] = expected_eessi_accel_arch[:-1] + "0"
+            check_env_equals("EESSI_ACCELERATOR_TARGET", "EESSI_FINAL_CC")
+        # verify the software paths that should exist
+        check_env_endswith("EESSI_SOFTWARE_PATH", "EESSI_SOFTWARE_SUBDIR")
+        check_env_endswith("EESSI_SITE_SOFTWARE_PATH", "EESSI_SOFTWARE_SUBDIR")
+        # verify the module paths that should exist
+        check_env_contains("EESSI_MODULEPATH", "EESSI_SOFTWARE_SUBDIR")
+        check_env_contains("EESSI_SITE_MODULEPATH", "EESSI_SOFTWARE_SUBDIR")
+        if expected_eessi_accel_arch:
+            check_env_contains("EESSI_MODULEPATH_ACCEL", "EESSI_SOFTWARE_SUBDIR")
+            check_env_contains("EESSI_SITE_MODULEPATH_ACCEL", "EESSI_SOFTWARE_SUBDIR")
+            check_env_contains("EESSI_MODULEPATH_ACCEL", "EESSI_ACCELERATOR_TARGET")
+            check_env_contains("EESSI_SITE_MODULEPATH_ACCEL", "EESSI_ACCELERATOR_TARGET")
+        # Finally, verify that all the expected module paths are included
+        check_env_contains("MODULEPATH", "EESSI_MODULEPATH")
+        check_env_contains("MODULEPATH", "EESSI_SITE_MODULEPATH")
+        if expected_eessi_accel_arch:
+            check_env_contains("MODULEPATH", "EESSI_MODULEPATH_ACCEL")
+            check_env_contains("MODULEPATH", "EESSI_SITE_MODULEPATH_ACCEL")
+
+        # We are done
+        print("Environment variable check passed.")
+    except EnvVarError as e:
+        print(str(e), file=sys.stderr)
+        sys.exit(1)
diff --git a/.github/workflows/tests_eessi_module.yml b/.github/workflows/tests_eessi_module.yml
index a7b38e2205..9867255679 100644
--- a/.github/workflows/tests_eessi_module.yml
+++ b/.github/workflows/tests_eessi_module.yml
@@ -73,7 +73,9 @@ jobs:
           - x86_64/amd/zen3
           - x86_64/amd/zen4
         EESSI_ACCELERATOR_TARGET_OVERRIDE:
-         - accel/nvidia/cc80
+          - accel/nvidia/cc80
+          # This should fall back to cc70
+          - accel/nvidia/cc77
     steps:
       - name: Check out software-layer repository
         uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -102,21 +104,26 @@ jobs:
           # Turn on debug output in case we want to take a look
           export EESSI_DEBUG_INIT=true
           CPU_ARCH=$(./init/eessi_archdetect.sh -a cpupath)
-          export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="dummy/cpu:${CPU_ARCH}:dummy1/cpu1"
           module load EESSI/${{matrix.EESSI_VERSION}}
-          # EESSI_ARCHDETECT_OPTIONS_OVERRIDE/EESSI_DEBUG_INIT only relevant for Lmod init
-          unset EESSI_ARCHDETECT_OPTIONS_OVERRIDE
+          # EESSI_DEBUG_INIT/EESSI_ARCHDETECT_OPTIONS only relevant for Lmod init
           unset EESSI_DEBUG_INIT
           # Store all relevant environment variables
-          env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH)' | sort > "${moduleoutfile}"
+          env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH|^MODULEPATH)' | grep -v EESSI_ARCHDETECT_OPTIONS | sort > "${moduleoutfile}"
           module unload EESSI/${{matrix.EESSI_VERSION}}
 
+          # We should only have two EESSI_* variables defined (which set the overrides)
+          if [ "$(env | grep -c '^EESSI')" -ne 2 ]; then
+            echo "Expected 2 EESSI-related environment variables, but found a different number."
+            env | grep '^EESSI'
+            exit 1
+          fi
+
           # Now do the init script initialisation
           source ./init/bash
           # source script version sets environment variables to force archdetect, ignore these
           unset EESSI_USE_ARCHSPEC
           unset EESSI_USE_ARCHDETECT
-          env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH)' | sort > "${sourceoutfile}"
+          env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH|^MODULEPATH)' | sort > "${sourceoutfile}"
 
           # Now compare the two results
           echo ""
@@ -149,6 +156,8 @@ jobs:
         EESSI_ACCELERATOR_TARGET_OVERRIDE:
           - none
           - accel/nvidia/cc80
+          # This should fall back to cc70
+          - accel/nvidia/cc77
     steps:
       - name: Check out software-layer repository
         uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -181,7 +190,7 @@ jobs:
           initial_env_file="initial_env.txt"
           module_cycled_file="load_unload_cycle.txt"
 
-          # prepare Lmod, resetting it in a roundabout given we don't want defaults set
+          # prepare Lmod, resetting it in a roundabout way given we don't want defaults set
           export MODULEPATH=init/modules:.github/workflows/modules
           module load fake_module
           module purge
@@ -205,3 +214,9 @@ jobs:
             diff --unified=0 "${initial_env_file}" "${module_cycled_file}"
             exit 1
           fi
+
+          module load EESSI/${{matrix.EESSI_VERSION}}
+          # Make sure our CPU and GPU architectures are what we expect
+          # (script uses EESSI_SOFTWARE_SUBDIR_OVERRIDE and EESSI_ACCELERATOR_TARGET_OVERRIDE
+          # as the starting point for the comparison)
+          python .github/workflows/scripts/verify_eessi_environment.py
diff --git a/init/bash b/init/bash
index 928ac6efdf..548c5f0f14 100644
--- a/init/bash
+++ b/init/bash
@@ -26,12 +26,18 @@ if [ $? -eq 0 ]; then
   # prepend location of modules for EESSI software stack to $MODULEPATH
   show_msg "Prepending $EESSI_MODULEPATH to \$MODULEPATH..."
   module use $EESSI_MODULEPATH
+
+  if [ ! -z ${EESSI_MODULEPATH_ACCEL} ]; then
+    show_msg "Prepending $EESSI_MODULEPATH_ACCEL to \$MODULEPATH..."
+    module use $EESSI_MODULEPATH_ACCEL
+  fi
+
   show_msg "Prepending site path $EESSI_SITE_MODULEPATH to \$MODULEPATH..."
   module use $EESSI_SITE_MODULEPATH
 
-  if [ ! -z ${EESSI_MODULEPATH_ACCEL} ]; then
-    show_msg "Prepending $EESSI_MODULEPATH_ACCEL to \$MODULEPATH..."
-    module use $EESSI_MODULEPATH_ACCEL
+  if [ ! -z ${EESSI_SITE_MODULEPATH_ACCEL} ]; then
+    show_msg "Prepending $EESSI_SITE_MODULEPATH_ACCEL to \$MODULEPATH..."
+    module use $EESSI_SITE_MODULEPATH_ACCEL
   fi
 
   #show_msg ""
diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables
index 0f13b1493a..49f3670c9c 100644
--- a/init/eessi_environment_variables
+++ b/init/eessi_environment_variables
@@ -67,10 +67,18 @@ if [ -d $EESSI_PREFIX ]; then
         EESSI_ACCEL_SOFTWARE_SUBDIR=${EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE:-$EESSI_SOFTWARE_SUBDIR}
         # path to where accel/* subdirectory is located
         EESSI_ACCEL_SOFTWARE_PATH=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_ACCEL_SOFTWARE_SUBDIR}
-        if [ -d $EESSI_ACCEL_SOFTWARE_PATH/${EESSI_ACCEL_SUBDIR} ]; then
-          show_msg "archdetect found supported accelerator for CPU target ${EESSI_ACCEL_SOFTWARE_SUBDIR}: ${EESSI_ACCEL_SUBDIR}"
+        if [ ! -d $EESSI_ACCEL_SOFTWARE_PATH/${EESSI_ACCEL_SUBDIR} ]; then
+          # We should try to use the fallback compute capability
+          EESSI_ACCELERATOR_TARGET="${EESSI_ACCEL_SUBDIR::-1}0"
+          show_msg "archdetect found no supported accelerator ${EESSI_ACCEL_SUBDIR}, falling back to ${EESSI_ACCELERATOR_TARGET}"
         else
-          show_msg "No matching path found in ${EESSI_ACCEL_SOFTWARE_SUBDIR} for accelerator detected by archdetect (${EESSI_ACCEL_SUBDIR})"
+          EESSI_ACCELERATOR_TARGET="${EESSI_ACCEL_SUBDIR}"
+        fi
+        if [ -d $EESSI_ACCEL_SOFTWARE_PATH/${EESSI_ACCELERATOR_TARGET} ]; then
+          show_msg "archdetect found supported accelerator for CPU target ${EESSI_ACCEL_SOFTWARE_SUBDIR}: ${EESSI_ACCELERATOR_TARGET}"
+          export EESSI_ACCELERATOR_TARGET
+        else
+          show_msg "No matching path found in ${EESSI_ACCEL_SOFTWARE_SUBDIR} for accelerator detected by archdetect (${EESSI_ACCELERATOR_TARGET})"
         fi
       fi
     else
@@ -95,6 +103,7 @@ if [ -d $EESSI_PREFIX ]; then
       lmod_rc_file="$LMOD_CONFIG_DIR/lmodrc.lua"
       if [ -f $lmod_rc_file ]; then
         show_msg "Found Lmod configuration file at $lmod_rc_file"
+        export LMOD_RC="$lmod_rc_file"
       else
         error "Lmod configuration file not found at $lmod_rc_file"
       fi
@@ -112,6 +121,8 @@ if [ -d $EESSI_PREFIX ]; then
     elif [ -d $EESSI_SOFTWARE_PATH ]; then
       export EESSI_SITE_SOFTWARE_PATH=${EESSI_SOFTWARE_PATH/versions/host_injections}
       show_msg "Using ${EESSI_SITE_SOFTWARE_PATH} as the site extension directory for installations."
+      EESSI_SITE_ACCEL_SOFTWARE_PATH=${EESSI_ACCEL_SOFTWARE_PATH/versions/host_injections}
+      show_msg "Using ${EESSI_SITE_ACCEL_SOFTWARE_PATH} as the site extension directory for accelerated installations."
       # Allow for use of alternative module tree shipped with EESSI
       if [ -z ${EESSI_MODULE_SUBDIR+x} ]; then
         # EESSI_MODULE_SUBDIR not set
@@ -137,9 +148,11 @@ if [ -d $EESSI_PREFIX ]; then
         false
       fi
 
-      if [ -d ${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCEL_SUBDIR}/${EESSI_MODULE_SUBDIR} ]; then
-        export EESSI_MODULEPATH_ACCEL=${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCEL_SUBDIR}/${EESSI_MODULE_SUBDIR}
+      if [ -d ${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCELERATOR_TARGET}/${EESSI_MODULE_SUBDIR} ]; then
+        export EESSI_MODULEPATH_ACCEL=${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCELERATOR_TARGET}/${EESSI_MODULE_SUBDIR}
         show_msg "Using ${EESSI_MODULEPATH_ACCEL} as additional directory (for accelerators) to be added to MODULEPATH."
+        export EESSI_SITE_MODULEPATH_ACCEL=${EESSI_SITE_ACCEL_SOFTWARE_PATH}/${EESSI_ACCELERATOR_TARGET}/${EESSI_MODULE_SUBDIR}
+        show_msg "Using ${EESSI_SITE_MODULEPATH_ACCEL} as additional site extension directory (for accelerators) to be added to MODULEPATH."
       fi
 
       # Fix wrong path for RHEL >=8 libcurl
diff --git a/init/modules/EESSI/2023.06.lua b/init/modules/EESSI/2023.06.lua
index d5105e89fc..b21509c1c5 100644
--- a/init/modules/EESSI/2023.06.lua
+++ b/init/modules/EESSI/2023.06.lua
@@ -63,11 +63,14 @@ function archdetect_accel()
     local script = pathJoin(eessi_prefix, 'init', 'lmod_eessi_archdetect_wrapper_accel.sh')
     -- for unload mode, we need to grab the value before it is unset
     local archdetect_accel = os.getenv("EESSI_ACCEL_SUBDIR") or (os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE") or "")
-    if not os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE ") then
+    if not os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE") then
         if convertToCanonical(LmodVersion()) < convertToCanonical("8.6") then
             LmodError("Loading this modulefile requires using Lmod version >= 8.6, but you can export EESSI_ACCELERATOR_TARGET_OVERRIDE to the available accelerator architecture in the form of: accel/nvidia/cc80")
         end
+        -- this script sets EESSI_ACCEL_SUBDIR
         source_sh("bash", script)
+    else
+        setenv("EESSI_ACCEL_SUBDIR", os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE"))
     end
     archdetect_accel = os.getenv("EESSI_ACCEL_SUBDIR") or archdetect_accel
     eessiDebug("Got archdetect accel option: " .. archdetect_accel)
@@ -140,16 +143,33 @@ if not (archdetect_accel == nil or archdetect_accel == '') then
     -- /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3/accel/nvidia/cc80/modules/all
     eessi_module_path_accel = pathJoin(eessi_accel_software_path, archdetect_accel, eessi_modules_subdir)
     eessiDebug("Checking if " .. eessi_module_path_accel .. " exists")
+    if not isDir(eessi_module_path_accel) then
+        -- fall back to major version GPU arch if the exact one is not an option (e.g., 7.5 -> 7.0)
+        local original_archdetect_accel = archdetect_accel
+        archdetect_accel = archdetect_accel:sub(1,-2) .. "0"
+        eessiDebug("No directory for " .. original_archdetect_accel .. ", trying " .. archdetect_accel)
+        eessi_module_path_accel = pathJoin(eessi_accel_software_path, archdetect_accel, eessi_modules_subdir)
+    end
     if isDir(eessi_module_path_accel) then
+        -- set the accelerator target based on what actually exists
+        setenv("EESSI_ACCELERATOR_TARGET", archdetect_accel)
         setenv("EESSI_MODULEPATH_ACCEL", eessi_module_path_accel)
-        prepend_path("MODULEPATH", eessi_module_path_accel)
-        eessiDebug("Using acclerator modules at: " .. eessi_module_path_accel)
+        if ( mode() ~= "spider" ) then
+            prepend_path("MODULEPATH", eessi_module_path_accel)
+            eessiDebug("Using accelerator modules at: " .. eessi_module_path_accel)
+        end
     end
 end
 
 -- prepend the site module path last so it has priority
 prepend_path("MODULEPATH", eessi_site_module_path)
 eessiDebug("Adding " .. eessi_site_module_path .. " to MODULEPATH")
+if isDir(eessi_module_path_accel) then
+    eessi_module_path_site_accel = string.gsub(eessi_module_path_accel, "versions", "host_injections")
+    setenv("EESSI_SITE_MODULEPATH_ACCEL", eessi_module_path_site_accel)
+    prepend_path("MODULEPATH", eessi_module_path_site_accel)
+    eessiDebug("Using site accelerator modules at: " .. eessi_module_path_site_accel)
+end
 if mode() == "load" then
     LmodMessage("EESSI/" .. eessi_version .. " loaded successfully")
 end