From f38d2f624b6641f1f2919aee080d5db05686c58b Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Tue, 20 May 2025 15:38:57 +0200 Subject: [PATCH 1/9] Add CUDA software check to stack comparison CI --- .github/workflows/scripts/compare_to_generic.sh | 15 +++++++++++++++ .github/workflows/test_compare_stacks.yml | 12 ++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/.github/workflows/scripts/compare_to_generic.sh b/.github/workflows/scripts/compare_to_generic.sh index 59a1397ec5..de188f4226 100755 --- a/.github/workflows/scripts/compare_to_generic.sh +++ b/.github/workflows/scripts/compare_to_generic.sh @@ -24,3 +24,18 @@ source_of_truth_modules="$base_dir/$source_of_truth/$modules_subdir" arch_modules="$base_dir/$target_arch/$modules_subdir" echo "Comparing $arch_modules to $source_of_truth_modules" python3 $script_dir/compare_stacks.py $source_of_truth_modules $arch_modules + +# Also compare NVIDIA GPU software stacks +if [[ -n "$CUDA_COMPUTE_CAPABILITIES" ]]; then + read -ra compute_capabilities <<< "$CUDA_COMPUTE_CAPABILITIES" + echo "Also comparing CUDA-enabled software stacks (for compute capabilities: ${compute_capabilities[@]})" + # Loop over the array + for cc in "${compute_capabilities[@]}"; do + source_of_truth_modules="$base_dir/$source_of_truth/accel/nvidia/cc80/$modules_subdir" + arch_modules="$base_dir/$target_arch/accel/nvidia/$cc/$modules_subdir" + echo "Comparing $arch_modules to $source_of_truth_modules" + python3 $script_dir/compare_stacks.py $source_of_truth_modules $arch_modules + done +else + echo "CUDA_COMPUTE_CAPABILITIES is not set or is empty, not checking NVIDIA software stacks" +fi diff --git a/.github/workflows/test_compare_stacks.yml b/.github/workflows/test_compare_stacks.yml index f57a70aa6c..e8b4ec4615 100644 --- a/.github/workflows/test_compare_stacks.yml +++ b/.github/workflows/test_compare_stacks.yml @@ -8,11 +8,13 @@ on: permissions: contents: read # to fetch code (actions/checkout) env: - EESSI_ACCELERATOR_TARGETS: | + CUDA_COMPUTE_CAPABILITIES: | + # provide them as something that can be parsed as an array by bash + default: + - "cc70 cc80 cc90" + # Can have exceptional values for specific architectures x86_64/amd/zen2: - - nvidia/cc80 - x86_64/amd/zen3: - - nvidia/cc80 + - "cc70 cc80 cc90" jobs: compare_stacks: runs-on: ubuntu-24.04 @@ -48,4 +50,6 @@ jobs: # Compare the requested architecture to the generic stack # (assumes the general structure /cvmfs/software.eessi.io/versions/2023.06/software/linux/$COMPARISON_ARCH/modules/all) + # and include a check for CUDA-enabled software using the environment variable CUDA_COMPUTE_CAPABILITIES + # (which assumes the structure /cvmfs/software.eessi.io/versions/2023.06/software/linux/$COMPARISON_ARCH/accel/nvidia/$cc/modules/all) .github/workflows/scripts/compare_to_generic.sh ${EESSI_PREFIX}/software/${EESSI_OS_TYPE} ${{matrix.COMPARISON_ARCH}} From 576c3f7cf26cfb2a0298264ca00fc9e1e4480c35 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Tue, 20 May 2025 15:42:49 +0200 Subject: [PATCH 2/9] Comments have to be in a specific place --- .github/workflows/test_compare_stacks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_compare_stacks.yml b/.github/workflows/test_compare_stacks.yml index e8b4ec4615..7bbc0fa134 100644 --- a/.github/workflows/test_compare_stacks.yml +++ b/.github/workflows/test_compare_stacks.yml @@ -9,11 +9,11 @@ permissions: contents: read # to fetch code (actions/checkout) env: CUDA_COMPUTE_CAPABILITIES: | - # provide them as something that can be parsed as an array by bash default: + # provide them as something that can be parsed as an array by bash - "cc70 cc80 cc90" - # Can have exceptional values for specific architectures x86_64/amd/zen2: + # Can have exceptional values for specific architectures - "cc70 cc80 cc90" jobs: compare_stacks: From 3c39008fea3cb47eeaaba6b0e03478bac9d1b6cd Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Tue, 20 May 2025 15:58:04 +0200 Subject: [PATCH 3/9] Need to manually parse the yaml of the environment variable --- .github/workflows/test_compare_stacks.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_compare_stacks.yml b/.github/workflows/test_compare_stacks.yml index 7bbc0fa134..276a891908 100644 --- a/.github/workflows/test_compare_stacks.yml +++ b/.github/workflows/test_compare_stacks.yml @@ -8,12 +8,10 @@ on: permissions: contents: read # to fetch code (actions/checkout) env: - CUDA_COMPUTE_CAPABILITIES: | + CUDA_COMPUTE_CAPABILITIES_YAML: | default: - # provide them as something that can be parsed as an array by bash - "cc70 cc80 cc90" x86_64/amd/zen2: - # Can have exceptional values for specific architectures - "cc70 cc80 cc90" jobs: compare_stacks: @@ -52,4 +50,5 @@ jobs: # (assumes the general structure /cvmfs/software.eessi.io/versions/2023.06/software/linux/$COMPARISON_ARCH/modules/all) # and include a check for CUDA-enabled software using the environment variable CUDA_COMPUTE_CAPABILITIES # (which assumes the structure /cvmfs/software.eessi.io/versions/2023.06/software/linux/$COMPARISON_ARCH/accel/nvidia/$cc/modules/all) + CUDA_COMPUTE_CAPABILITIES=$(echo "${CUDA_COMPUTE_CAPABILITIES_YAML}" | yq ".${COMPARISON_ARCH} // .default | .[]") .github/workflows/scripts/compare_to_generic.sh ${EESSI_PREFIX}/software/${EESSI_OS_TYPE} ${{matrix.COMPARISON_ARCH}} From 3cde01dfefd4ebe2ec5ec51cc91a3142b1af68bd Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Tue, 20 May 2025 16:13:49 +0200 Subject: [PATCH 4/9] Need to parse the yaml --- .github/workflows/test_compare_stacks.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_compare_stacks.yml b/.github/workflows/test_compare_stacks.yml index 276a891908..b39e68878b 100644 --- a/.github/workflows/test_compare_stacks.yml +++ b/.github/workflows/test_compare_stacks.yml @@ -10,9 +10,13 @@ permissions: env: CUDA_COMPUTE_CAPABILITIES_YAML: | default: - - "cc70 cc80 cc90" + - cc70 + - cc80 + - cc90 x86_64/amd/zen2: - - "cc70 cc80 cc90" + - cc70 + - cc80 + - cc90 jobs: compare_stacks: runs-on: ubuntu-24.04 @@ -50,5 +54,8 @@ jobs: # (assumes the general structure /cvmfs/software.eessi.io/versions/2023.06/software/linux/$COMPARISON_ARCH/modules/all) # and include a check for CUDA-enabled software using the environment variable CUDA_COMPUTE_CAPABILITIES # (which assumes the structure /cvmfs/software.eessi.io/versions/2023.06/software/linux/$COMPARISON_ARCH/accel/nvidia/$cc/modules/all) - CUDA_COMPUTE_CAPABILITIES=$(echo "${CUDA_COMPUTE_CAPABILITIES_YAML}" | yq ".${COMPARISON_ARCH} // .default | .[]") + + # Parse the yaml we use to make the compute capabilities arch-dependent + CUDA_COMPUTE_CAPABILITIES=$(echo "${CUDA_COMPUTE_CAPABILITIES_YAML}" | yq ".\"${COMPARISON_ARCH}\" // .default | .[]" | tr '\n' ' ') + CUDA_COMPUTE_CAPABILITIES=${CUDA_COMPUTE_CAPABILITIES%% } # trim trailing space .github/workflows/scripts/compare_to_generic.sh ${EESSI_PREFIX}/software/${EESSI_OS_TYPE} ${{matrix.COMPARISON_ARCH}} From ef74ca92da6b4f175d4d24d4a97566ae9a0b9d2f Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Tue, 20 May 2025 16:19:35 +0200 Subject: [PATCH 5/9] Make sure to export the variable --- .github/workflows/test_compare_stacks.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_compare_stacks.yml b/.github/workflows/test_compare_stacks.yml index b39e68878b..0d8352d599 100644 --- a/.github/workflows/test_compare_stacks.yml +++ b/.github/workflows/test_compare_stacks.yml @@ -56,6 +56,7 @@ jobs: # (which assumes the structure /cvmfs/software.eessi.io/versions/2023.06/software/linux/$COMPARISON_ARCH/accel/nvidia/$cc/modules/all) # Parse the yaml we use to make the compute capabilities arch-dependent - CUDA_COMPUTE_CAPABILITIES=$(echo "${CUDA_COMPUTE_CAPABILITIES_YAML}" | yq ".\"${COMPARISON_ARCH}\" // .default | .[]" | tr '\n' ' ') - CUDA_COMPUTE_CAPABILITIES=${CUDA_COMPUTE_CAPABILITIES%% } # trim trailing space + CUDA_COMPUTE_CAPABILITIES=$(echo "${CUDA_COMPUTE_CAPABILITIES_YAML}" | yq ".\"${{matrix.COMPARISON_ARCH}}\" // .default | .[]" | tr '\n' ' ') + export CUDA_COMPUTE_CAPABILITIES=${CUDA_COMPUTE_CAPABILITIES%% } # trim trailing space + .github/workflows/scripts/compare_to_generic.sh ${EESSI_PREFIX}/software/${EESSI_OS_TYPE} ${{matrix.COMPARISON_ARCH}} From 06992b090c5d092104e825c0c3229871dcc87411 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Tue, 20 May 2025 16:29:44 +0200 Subject: [PATCH 6/9] Make sure to report errors --- .github/workflows/scripts/compare_to_generic.sh | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scripts/compare_to_generic.sh b/.github/workflows/scripts/compare_to_generic.sh index de188f4226..8f046ec9d4 100755 --- a/.github/workflows/scripts/compare_to_generic.sh +++ b/.github/workflows/scripts/compare_to_generic.sh @@ -23,19 +23,32 @@ esac source_of_truth_modules="$base_dir/$source_of_truth/$modules_subdir" arch_modules="$base_dir/$target_arch/$modules_subdir" echo "Comparing $arch_modules to $source_of_truth_modules" -python3 $script_dir/compare_stacks.py $source_of_truth_modules $arch_modules + +if ! python3 $script_dir/compare_stacks.py $source_of_truth_modules $arch_modules; then + echo "Warning: Comparison failed for CPU stacks" >&2 + exit 1 +fi # Also compare NVIDIA GPU software stacks if [[ -n "$CUDA_COMPUTE_CAPABILITIES" ]]; then read -ra compute_capabilities <<< "$CUDA_COMPUTE_CAPABILITIES" echo "Also comparing CUDA-enabled software stacks (for compute capabilities: ${compute_capabilities[@]})" + # Initialize a variable to track failures + any_failure=0 # Loop over the array for cc in "${compute_capabilities[@]}"; do source_of_truth_modules="$base_dir/$source_of_truth/accel/nvidia/cc80/$modules_subdir" arch_modules="$base_dir/$target_arch/accel/nvidia/$cc/$modules_subdir" echo "Comparing $arch_modules to $source_of_truth_modules" - python3 $script_dir/compare_stacks.py $source_of_truth_modules $arch_modules + if ! python3 $script_dir/compare_stacks.py $source_of_truth_modules $arch_modules; then + echo "Warning: Comparison failed for compute capability $cc" >&2 + any_failure=1 + fi done + if [[ $any_failure -ne 0 ]]; then + echo "One or more CUDA software stack comparisons failed." >&2 + exit 1 + fi else echo "CUDA_COMPUTE_CAPABILITIES is not set or is empty, not checking NVIDIA software stacks" fi From 2e021b4542dc1be8cc469eb4a2ec771db85abb3c Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Tue, 20 May 2025 16:46:23 +0200 Subject: [PATCH 7/9] Test that special cases work --- .github/workflows/test_compare_stacks.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test_compare_stacks.yml b/.github/workflows/test_compare_stacks.yml index 0d8352d599..8022e8832e 100644 --- a/.github/workflows/test_compare_stacks.yml +++ b/.github/workflows/test_compare_stacks.yml @@ -17,6 +17,7 @@ env: - cc70 - cc80 - cc90 + - ccfake jobs: compare_stacks: runs-on: ubuntu-24.04 From de2e370cdfa664e5238d96e032a7e7f5ad18bcea Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Tue, 20 May 2025 16:52:17 +0200 Subject: [PATCH 8/9] That works, revert --- .github/workflows/test_compare_stacks.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/test_compare_stacks.yml b/.github/workflows/test_compare_stacks.yml index 8022e8832e..c8bcfd33e1 100644 --- a/.github/workflows/test_compare_stacks.yml +++ b/.github/workflows/test_compare_stacks.yml @@ -17,7 +17,6 @@ env: - cc70 - cc80 - cc90 - - ccfake jobs: compare_stacks: runs-on: ubuntu-24.04 @@ -56,7 +55,7 @@ jobs: # and include a check for CUDA-enabled software using the environment variable CUDA_COMPUTE_CAPABILITIES # (which assumes the structure /cvmfs/software.eessi.io/versions/2023.06/software/linux/$COMPARISON_ARCH/accel/nvidia/$cc/modules/all) - # Parse the yaml we use to make the compute capabilities arch-dependent + # Parse the yaml that makes the compute capabilities arch-dependent CUDA_COMPUTE_CAPABILITIES=$(echo "${CUDA_COMPUTE_CAPABILITIES_YAML}" | yq ".\"${{matrix.COMPARISON_ARCH}}\" // .default | .[]" | tr '\n' ' ') export CUDA_COMPUTE_CAPABILITIES=${CUDA_COMPUTE_CAPABILITIES%% } # trim trailing space From 0ffc38cd85f2ab6ef870f76b037014bc20a786f9 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Tue, 20 May 2025 16:56:31 +0200 Subject: [PATCH 9/9] Add some comments to new yaml envvar --- .github/workflows/test_compare_stacks.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test_compare_stacks.yml b/.github/workflows/test_compare_stacks.yml index c8bcfd33e1..bcaa0db869 100644 --- a/.github/workflows/test_compare_stacks.yml +++ b/.github/workflows/test_compare_stacks.yml @@ -9,10 +9,12 @@ permissions: contents: read # to fetch code (actions/checkout) env: CUDA_COMPUTE_CAPABILITIES_YAML: | + # Provide a default set of compute capabilities default: - cc70 - cc80 - cc90 + # and then allow for special cases for specific architectures x86_64/amd/zen2: - cc70 - cc80