Skip to content

Commit aa362c4

Browse files
authored
Merge pull request #5 from ocaisa/cuda_fallbacks
Allow for CUDA compute capability fallbacks when initialising EESSI
2 parents 3ceef12 + 04a8053 commit aa362c4

File tree

5 files changed

+151
-17
lines changed

5 files changed

+151
-17
lines changed
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import os
2+
import sys
3+
4+
class EnvVarError(Exception):
    """Signal that an environment variable consistency check failed.

    The text handed to the constructor is prefixed with
    ``ENV VALIDATION ERROR: `` so that every failure reported by this
    script is easy to recognise in its output.
    """

    def __init__(self, message):
        # Build the final text first, then hand it to Exception unchanged.
        prefixed_message = f"ENV VALIDATION ERROR: {message}"
        super().__init__(prefixed_message)
8+
9+
def get_env_vars(var1, var2):
    """Return the values of two environment variables as a ``(val1, val2)`` tuple.

    Args:
        var1: Name of the first environment variable.
        var2: Name of the second environment variable.

    Returns:
        A 2-tuple with the value of ``var1`` followed by the value of ``var2``.
        An empty string counts as a value; only an unset variable is an error.

    Raises:
        EnvVarError: If either variable is unset. When both are missing,
            ``var1`` is the one reported.
    """
    names = (var1, var2)
    # Look both variables up before validating anything.
    values = tuple(os.environ.get(name) for name in names)

    for name, value in zip(names, values):
        if value is None:
            raise EnvVarError(f"Missing environment variable: '{name}'")

    return values
19+
20+
def check_env_equals(var1, var2):
    """Require that two environment variables hold exactly the same value.

    Args:
        var1: Name of the first environment variable.
        var2: Name of the environment variable it must be equal to.

    Raises:
        EnvVarError: If either variable is unset (raised by ``get_env_vars``)
            or if the two values differ.
    """
    left, right = get_env_vars(var1, var2)
    if left == right:
        return
    raise EnvVarError(
        f"'{var1}' must equal '{var2}':\n{var1}='{left}'\n{var2}='{right}'"
    )
24+
25+
def check_env_contains(var1, var2):
    """Require that the value of ``var2`` occurs inside the value of ``var1``.

    This is a plain substring test on the two values.

    Args:
        var1: Name of the environment variable that is searched.
        var2: Name of the environment variable whose value must be found.

    Raises:
        EnvVarError: If either variable is unset (raised by ``get_env_vars``)
            or if the value of ``var2`` is not a substring of ``var1``'s value.
    """
    haystack, needle = get_env_vars(var1, var2)
    if needle in haystack:
        return
    raise EnvVarError(
        f"'{var1}' must contain '{var2}':\n{var1}='{haystack}'\n{var2}='{needle}'"
    )
29+
30+
def check_env_endswith(var1, var2):
    """Require that the value of ``var1`` ends with the value of ``var2``.

    Args:
        var1: Name of the environment variable whose value is inspected.
        var2: Name of the environment variable holding the expected suffix.

    Raises:
        EnvVarError: If either variable is unset (raised by ``get_env_vars``)
            or if ``var1``'s value does not end with ``var2``'s value.
    """
    full_value, suffix = get_env_vars(var1, var2)
    if full_value.endswith(suffix):
        return
    raise EnvVarError(
        f"'{var1}' must end with '{var2}':\n{var1}='{full_value}'\n{var2}='{suffix}'"
    )
34+
35+
if __name__ == "__main__":
    # Run every consistency check in a fixed order. The first failing check is
    # printed to stderr and the script exits with status 1.
    try:
        # Accelerator settings are not guaranteed to exist: an unset or empty
        # override disables all accelerator-specific checks below.
        accel_override = os.environ.get("EESSI_ACCELERATOR_TARGET_OVERRIDE")

        # Verify the software and accelerator targets are set correctly
        if os.environ.get("EESSI_SOFTWARE_SUBDIR_OVERRIDE"):
            check_env_equals("EESSI_SOFTWARE_SUBDIR_OVERRIDE", "EESSI_SOFTWARE_SUBDIR")
        if accel_override:
            # EESSI_ACCEL_SUBDIR is what is detected by archdetect
            # (or respects EESSI_ACCELERATOR_TARGET_OVERRIDE)
            check_env_equals("EESSI_ACCELERATOR_TARGET_OVERRIDE", "EESSI_ACCEL_SUBDIR")
            # Special case: EESSI_ACCELERATOR_TARGET_OVERRIDE may not match the
            # accelerator architecture that is finally chosen. In CI we set
            # FINAL_ACCELERATOR_PATH_EXPECTED so we can compare against an
            # expected value.
            check_env_equals("EESSI_ACCELERATOR_TARGET", "FINAL_ACCELERATOR_PATH_EXPECTED")

        # Verify the software paths that should exist
        for path_var in ("EESSI_SOFTWARE_PATH", "EESSI_SITE_SOFTWARE_PATH"):
            check_env_endswith(path_var, "EESSI_SOFTWARE_SUBDIR")

        # Verify the module paths that should exist
        for path_var in ("EESSI_MODULEPATH", "EESSI_SITE_MODULEPATH"):
            check_env_contains(path_var, "EESSI_SOFTWARE_SUBDIR")
        if accel_override:
            # Both accelerator module paths must mention the CPU subdirectory
            # and, after that, the accelerator target.
            for expected_part in ("EESSI_SOFTWARE_SUBDIR", "EESSI_ACCELERATOR_TARGET"):
                for path_var in ("EESSI_MODULEPATH_ACCEL", "EESSI_SITE_MODULEPATH_ACCEL"):
                    check_env_contains(path_var, expected_part)

        # Finally, verify that all the expected module paths are included
        required_in_modulepath = ["EESSI_MODULEPATH", "EESSI_SITE_MODULEPATH"]
        if accel_override:
            required_in_modulepath += ["EESSI_MODULEPATH_ACCEL", "EESSI_SITE_MODULEPATH_ACCEL"]
        for path_var in required_in_modulepath:
            check_env_contains("MODULEPATH", path_var)

        # We are done
        print("Environment variable check passed.")
    except EnvVarError as err:
        print(str(err), file=sys.stderr)
        sys.exit(1)

.github/workflows/tests_eessi_module.yml

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,10 @@ jobs:
7373
- x86_64/amd/zen3
7474
- x86_64/amd/zen4
7575
EESSI_ACCELERATOR_TARGET_OVERRIDE:
76-
- accel/nvidia/cc80
76+
- accel/nvidia/cc80
77+
# This should fall back to cc70 but that is checked later (in this step we just check for consistency)
78+
- accel/nvidia/cc77
79+
7780
steps:
7881
- name: Check out software-layer repository
7982
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -106,21 +109,26 @@ jobs:
106109
# Turn on debug output in case we want to take a look
107110
export EESSI_DEBUG_INIT=true
108111
CPU_ARCH=$(./init/eessi_archdetect.sh -a cpupath)
109-
export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="dummy/cpu:${CPU_ARCH}:dummy1/cpu1"
110112
module load EESSI/${{matrix.EESSI_VERSION}}
111-
# EESSI_ARCHDETECT_OPTIONS_OVERRIDE/EESSI_DEBUG_INIT only relevant for Lmod init
112-
unset EESSI_ARCHDETECT_OPTIONS_OVERRIDE
113+
# EESSI_DEBUG_INIT/EESSI_ARCHDETECT_OPTIONS only relevant for Lmod init
113114
unset EESSI_DEBUG_INIT
114115
# Store all relevant environment variables
115-
env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH)' | sort > "${moduleoutfile}"
116+
env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH|^MODULEPATH)' | grep -v EESSI_ARCHDETECT_OPTIONS | sort > "${moduleoutfile}"
116117
module unload EESSI/${{matrix.EESSI_VERSION}}
117118
119+
# We should only have two EESSI_* variables defined (which set the overrides)
120+
if [ "$(env | grep -c '^EESSI')" -ne 2 ]; then
121+
echo "Expected 2 EESSI-related environment variables, but found a different number."
122+
env | grep '^EESSI'
123+
exit 1
124+
fi
125+
118126
# Now do the init script initialisation
119127
source ./init/bash
120128
# source script version sets environment variables to force archdetect, ignore these
121129
unset EESSI_USE_ARCHSPEC
122130
unset EESSI_USE_ARCHDETECT
123-
env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH)' | sort > "${sourceoutfile}"
131+
env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH|^MODULEPATH)' | sort > "${sourceoutfile}"
124132
125133
# Now compare the two results
126134
echo ""
@@ -153,6 +161,14 @@ jobs:
153161
EESSI_ACCELERATOR_TARGET_OVERRIDE:
154162
- none
155163
- accel/nvidia/cc80
164+
- accel/nvidia/cc77
165+
include:
166+
# For each override we expect a specific final path (which may differ from the override itself when a fallback is applied)
167+
- EESSI_ACCELERATOR_TARGET_OVERRIDE: accel/nvidia/cc80
168+
FINAL_ACCELERATOR_PATH_EXPECTED: accel/nvidia/cc80
169+
- EESSI_ACCELERATOR_TARGET_OVERRIDE: accel/nvidia/cc77 # deliberately chose a non-existent CUDA capability
170+
FINAL_ACCELERATOR_PATH_EXPECTED: accel/nvidia/cc70 # this reverts to the fallback case (which does exist)
171+
156172
steps:
157173
- name: Check out software-layer repository
158174
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -177,6 +193,7 @@ jobs:
177193
# Set our accelerator path overrides according to our matrix
178194
if [[ "${{matrix.EESSI_ACCELERATOR_TARGET_OVERRIDE}}" != "none" ]]; then
179195
export EESSI_ACCELERATOR_TARGET_OVERRIDE=${{matrix.EESSI_ACCELERATOR_TARGET_OVERRIDE}}
196+
export FINAL_ACCELERATOR_PATH_EXPECTED=${{matrix.FINAL_ACCELERATOR_PATH_EXPECTED}}
180197
fi
181198
182199
# Turn on debug output in case we want to take a look
@@ -185,7 +202,7 @@ jobs:
185202
initial_env_file="initial_env.txt"
186203
module_cycled_file="load_unload_cycle.txt"
187204
188-
# prepare Lmod, resetting it in a roundabout given we don't want defaults set
205+
# prepare Lmod, resetting it in a roundabout way given we don't want defaults set
189206
export MODULEPATH=init/modules:.github/workflows/modules
190207
module load fake_module
191208
module purge
@@ -209,3 +226,9 @@ jobs:
209226
diff --unified=0 "${initial_env_file}" "${module_cycled_file}"
210227
exit 1
211228
fi
229+
230+
module load EESSI/${{matrix.EESSI_VERSION}}
231+
# Make sure our CPU and GPU architectures are what we expect
232+
# (script uses EESSI_SOFTWARE_SUBDIR_OVERRIDE and EESSI_ACCELERATOR_TARGET_OVERRIDE
233+
# as the starting point for the comparison)
234+
python .github/workflows/scripts/verify_eessi_environment.py

init/bash

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,20 @@ if [ $? -eq 0 ]; then
2626
# prepend location of modules for EESSI software stack to $MODULEPATH
2727
show_msg "Prepending $EESSI_MODULEPATH to \$MODULEPATH..."
2828
module use $EESSI_MODULEPATH
29-
show_msg "Prepending site path $EESSI_SITE_MODULEPATH to \$MODULEPATH..."
30-
module use $EESSI_SITE_MODULEPATH
3129

3230
if [ ! -z ${EESSI_MODULEPATH_ACCEL} ]; then
3331
show_msg "Prepending $EESSI_MODULEPATH_ACCEL to \$MODULEPATH..."
3432
module use $EESSI_MODULEPATH_ACCEL
3533
fi
3634

35+
show_msg "Prepending site path $EESSI_SITE_MODULEPATH to \$MODULEPATH..."
36+
module use $EESSI_SITE_MODULEPATH
37+
38+
# Add the site accelerator module path only when that variable itself is set.
# The guard tests the same variable that is passed to `module use`, so an
# empty path is never handed to Lmod; quoting keeps the test well-formed
# when the variable is unset.
if [ -n "${EESSI_SITE_MODULEPATH_ACCEL}" ]; then
    show_msg "Prepending $EESSI_SITE_MODULEPATH_ACCEL to \$MODULEPATH..."
    module use "$EESSI_SITE_MODULEPATH_ACCEL"
fi
42+
3743
#show_msg ""
3844
#show_msg "*** Known problems in the ${EESSI_VERSION} software stack ***"
3945
#show_msg ""

init/eessi_environment_variables

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,18 @@ if [ -d $EESSI_PREFIX ]; then
6767
EESSI_ACCEL_SOFTWARE_SUBDIR=${EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE:-$EESSI_SOFTWARE_SUBDIR}
6868
# path to where accel/* subdirectory is located
6969
EESSI_ACCEL_SOFTWARE_PATH=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_ACCEL_SOFTWARE_SUBDIR}
70-
if [ -d $EESSI_ACCEL_SOFTWARE_PATH/${EESSI_ACCEL_SUBDIR} ]; then
71-
show_msg "archdetect found supported accelerator for CPU target ${EESSI_ACCEL_SOFTWARE_SUBDIR}: ${EESSI_ACCEL_SUBDIR}"
70+
if [ ! -d $EESSI_ACCEL_SOFTWARE_PATH/${EESSI_ACCEL_SUBDIR} ]; then
71+
# We should try to use the fallback compute capability
72+
EESSI_ACCELERATOR_TARGET="${EESSI_ACCEL_SUBDIR::-1}0"
73+
show_msg "archdetect found no supported accelerator ${EESSI_ACCEL_SUBDIR}, falling back to ${EESSI_ACCELERATOR_TARGET}"
7274
else
73-
show_msg "No matching path found in ${EESSI_ACCEL_SOFTWARE_SUBDIR} for accelerator detected by archdetect (${EESSI_ACCEL_SUBDIR})"
75+
EESSI_ACCELERATOR_TARGET="${EESSI_ACCEL_SUBDIR}"
76+
fi
77+
if [ -d $EESSI_ACCEL_SOFTWARE_PATH/${EESSI_ACCELERATOR_TARGET} ]; then
78+
show_msg "archdetect found supported accelerator for CPU target ${EESSI_ACCEL_SOFTWARE_SUBDIR}: ${EESSI_ACCELERATOR_TARGET}"
79+
export EESSI_ACCELERATOR_TARGET
80+
else
81+
show_msg "No matching path found in ${EESSI_ACCEL_SOFTWARE_SUBDIR} for accelerator detected by archdetect (${EESSI_ACCELERATOR_TARGET})"
7482
fi
7583
fi
7684
else
@@ -95,6 +103,7 @@ if [ -d $EESSI_PREFIX ]; then
95103
lmod_rc_file="$LMOD_CONFIG_DIR/lmodrc.lua"
96104
if [ -f $lmod_rc_file ]; then
97105
show_msg "Found Lmod configuration file at $lmod_rc_file"
106+
export LMOD_RC="$lmod_rc_file"
98107
else
99108
error "Lmod configuration file not found at $lmod_rc_file"
100109
fi
@@ -112,6 +121,8 @@ if [ -d $EESSI_PREFIX ]; then
112121
elif [ -d $EESSI_SOFTWARE_PATH ]; then
113122
export EESSI_SITE_SOFTWARE_PATH=${EESSI_SOFTWARE_PATH/versions/host_injections}
114123
show_msg "Using ${EESSI_SITE_SOFTWARE_PATH} as the site extension directory for installations."
124+
EESSI_SITE_ACCEL_SOFTWARE_PATH=${EESSI_ACCEL_SOFTWARE_PATH/versions/host_injections}
125+
show_msg "Using ${EESSI_SITE_ACCEL_SOFTWARE_PATH} as the site extension directory for accelerated installations."
115126
# Allow for use of alternative module tree shipped with EESSI
116127
if [ -z ${EESSI_MODULE_SUBDIR+x} ]; then
117128
# EESSI_MODULE_SUBDIR not set
@@ -137,9 +148,11 @@ if [ -d $EESSI_PREFIX ]; then
137148
false
138149
fi
139150

140-
if [ -d ${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCEL_SUBDIR}/${EESSI_MODULE_SUBDIR} ]; then
141-
export EESSI_MODULEPATH_ACCEL=${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCEL_SUBDIR}/${EESSI_MODULE_SUBDIR}
151+
if [ -d ${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCELERATOR_TARGET}/${EESSI_MODULE_SUBDIR} ]; then
152+
export EESSI_MODULEPATH_ACCEL=${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCELERATOR_TARGET}/${EESSI_MODULE_SUBDIR}
142153
show_msg "Using ${EESSI_MODULEPATH_ACCEL} as additional directory (for accelerators) to be added to MODULEPATH."
154+
export EESSI_SITE_MODULEPATH_ACCEL=${EESSI_SITE_ACCEL_SOFTWARE_PATH}/${EESSI_ACCELERATOR_TARGET}/${EESSI_MODULE_SUBDIR}
155+
show_msg "Using ${EESSI_SITE_MODULEPATH_ACCEL} as additional site extension directory (for accelerators) to be added to MODULEPATH."
143156
fi
144157

145158
# Fix wrong path for RHEL >=8 libcurl

init/modules/EESSI/2023.06.lua

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,14 @@ function archdetect_accel()
6464
local script = pathJoin(eessi_prefix, 'init', 'lmod_eessi_archdetect_wrapper_accel.sh')
6565
-- for unload mode, we need to grab the value before it is unset
6666
local archdetect_accel = os.getenv("EESSI_ACCEL_SUBDIR") or (os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE") or "")
67-
if not os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE ") then
67+
if not os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE") then
6868
if convertToCanonical(LmodVersion()) < convertToCanonical("8.6") then
6969
LmodError("Loading this modulefile requires using Lmod version >= 8.6, but you can export EESSI_ACCELERATOR_TARGET_OVERRIDE to the available accelerator architecture in the form of: accel/nvidia/cc80")
7070
end
71+
-- this script sets EESSI_ACCEL_SUBDIR
7172
source_sh("bash", script)
73+
else
74+
setenv("EESSI_ACCEL_SUBDIR", os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE"))
7275
end
7376
archdetect_accel = os.getenv("EESSI_ACCEL_SUBDIR") or archdetect_accel
7477
eessiDebug("Got archdetect accel option: " .. archdetect_accel)
@@ -141,16 +144,33 @@ if not (archdetect_accel == nil or archdetect_accel == '') then
141144
-- /cvmfs/software.eessi.io/versions/<EESSI_VERSION>/software/linux/x86_64/amd/zen3/accel/nvidia/cc80/modules/all
142145
eessi_module_path_accel = pathJoin(eessi_accel_software_path, archdetect_accel, eessi_modules_subdir)
143146
eessiDebug("Checking if " .. eessi_module_path_accel .. " exists")
147+
if not isDir(eessi_module_path_accel) then
148+
-- fall back to major version GPU arch if the exact one is not an option (e.g., 7.5 -> 7.0)
149+
local original_archdetect_accel = archdetect_accel
150+
archdetect_accel = archdetect_accel:sub(1,-2) .. "0"
151+
eessiDebug("No directory for " .. original_archdetect_accel .. ", trying " .. archdetect_accel)
152+
eessi_module_path_accel = pathJoin(eessi_accel_software_path, archdetect_accel, eessi_modules_subdir)
153+
end
144154
if isDir(eessi_module_path_accel) then
155+
-- set the accelerator target based on what actually exists
156+
setenv("EESSI_ACCELERATOR_TARGET", archdetect_accel)
145157
setenv("EESSI_MODULEPATH_ACCEL", eessi_module_path_accel)
146-
prepend_path("MODULEPATH", eessi_module_path_accel)
147-
eessiDebug("Using acclerator modules at: " .. eessi_module_path_accel)
158+
if ( mode() ~= "spider" ) then
159+
prepend_path("MODULEPATH", eessi_module_path_accel)
160+
eessiDebug("Using accelerator modules at: " .. eessi_module_path_accel)
161+
end
148162
end
149163
end
150164

151165
-- prepend the site module path last so it has priority
152166
prepend_path("MODULEPATH", eessi_site_module_path)
153167
eessiDebug("Adding " .. eessi_site_module_path .. " to MODULEPATH")
168+
if isDir(eessi_module_path_accel) then
169+
eessi_module_path_site_accel = string.gsub(eessi_module_path_accel, "versions", "host_injections")
170+
setenv("EESSI_SITE_MODULEPATH_ACCEL", eessi_module_path_site_accel)
171+
prepend_path("MODULEPATH", eessi_module_path_site_accel)
172+
eessiDebug("Using site accelerator modules at: " .. eessi_module_path_site_accel)
173+
end
154174
if mode() == "load" then
155175
LmodMessage("EESSI/" .. eessi_version .. " loaded successfully")
156176
end

0 commit comments

Comments
 (0)