diff --git a/.github/workflows/scripts/compare_to_generic.sh b/.github/workflows/scripts/compare_to_generic.sh index 59a1397ec5..8f046ec9d4 100755 --- a/.github/workflows/scripts/compare_to_generic.sh +++ b/.github/workflows/scripts/compare_to_generic.sh @@ -23,4 +23,32 @@ esac source_of_truth_modules="$base_dir/$source_of_truth/$modules_subdir" arch_modules="$base_dir/$target_arch/$modules_subdir" echo "Comparing $arch_modules to $source_of_truth_modules" -python3 $script_dir/compare_stacks.py $source_of_truth_modules $arch_modules + +if ! python3 $script_dir/compare_stacks.py $source_of_truth_modules $arch_modules; then + echo "Warning: Comparison failed for CPU stacks" >&2 + exit 1 +fi + +# Also compare NVIDIA GPU software stacks +if [[ -n "$CUDA_COMPUTE_CAPABILITIES" ]]; then + read -ra compute_capabilities <<< "$CUDA_COMPUTE_CAPABILITIES" + echo "Also comparing CUDA-enabled software stacks (for compute capabilities: ${compute_capabilities[@]})" + # Initialize a variable to track failures + any_failure=0 + # Loop over the array + for cc in "${compute_capabilities[@]}"; do + source_of_truth_modules="$base_dir/$source_of_truth/accel/nvidia/cc80/$modules_subdir" + arch_modules="$base_dir/$target_arch/accel/nvidia/$cc/$modules_subdir" + echo "Comparing $arch_modules to $source_of_truth_modules" + if ! python3 $script_dir/compare_stacks.py $source_of_truth_modules $arch_modules; then + echo "Warning: Comparison failed for compute capability $cc" >&2 + any_failure=1 + fi + done + if [[ $any_failure -ne 0 ]]; then + echo "One or more CUDA software stack comparisons failed." >&2 + exit 1 + fi +else + echo "CUDA_COMPUTE_CAPABILITIES is not set or is empty, not checking NVIDIA software stacks" +fi diff --git a/.github/workflows/test_compare_stacks.yml b/.github/workflows/test_compare_stacks.yml index f57a70aa6c..bcaa0db869 100644 --- a/.github/workflows/test_compare_stacks.yml +++ b/.github/workflows/test_compare_stacks.yml @@ -8,11 +8,17 @@ on: permissions: contents: read # to fetch code (actions/checkout) env: - EESSI_ACCELERATOR_TARGETS: | + CUDA_COMPUTE_CAPABILITIES_YAML: | + # Provide a default set of compute capabilities + default: + - cc70 + - cc80 + - cc90 + # and then allow for special cases for specific architectures x86_64/amd/zen2: - - nvidia/cc80 - x86_64/amd/zen3: - - nvidia/cc80 + - cc70 + - cc80 + - cc90 jobs: compare_stacks: runs-on: ubuntu-24.04 @@ -48,4 +54,11 @@ jobs: # Compare the requested architecture to the generic stack # (assumes the general structure /cvmfs/software.eessi.io/versions/2023.06/software/linux/$COMPARISON_ARCH/modules/all) + # and include a check for CUDA-enabled software using the environment variable CUDA_COMPUTE_CAPABILITIES + # (which assumes the structure /cvmfs/software.eessi.io/versions/2023.06/software/linux/$COMPARISON_ARCH/accel/nvidia/$cc/modules/all) + + # Parse the yaml that makes the compute capabilities arch-dependent + CUDA_COMPUTE_CAPABILITIES=$(echo "${CUDA_COMPUTE_CAPABILITIES_YAML}" | yq ".\"${{matrix.COMPARISON_ARCH}}\" // .default | .[]" | tr '\n' ' ') + export CUDA_COMPUTE_CAPABILITIES=${CUDA_COMPUTE_CAPABILITIES%% } # trim trailing space + .github/workflows/scripts/compare_to_generic.sh ${EESSI_PREFIX}/software/${EESSI_OS_TYPE} ${{matrix.COMPARISON_ARCH}}