Skip to content

Commit 1361e3b

Browse files
authored
Merge branch 'main' into bandwidth-test
2 parents 4663791 + e4deac6 commit 1361e3b

File tree

45 files changed

+798
-243
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+798
-243
lines changed

.ansible-lint.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@ skip_list:
66
- jinja[spacing]
77
- galaxy[no-changelog]
88
- meta-runtime[unsupported-version]
9-
10-
warn_list:
119
- name[missing]
1210
- name[play]
1311
- var-naming

.github/workflows/fatimage.yml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,10 @@ jobs:
3636
build:
3737
- image_name: openhpc-RL8
3838
source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.raw
39-
inventory_groups: control,compute,login,update
39+
inventory_groups: fatimage
4040
- image_name: openhpc-RL9
4141
source_image_name: Rocky-9-GenericCloud-Base-9.6-20250531.0.x86_64.qcow2
42-
inventory_groups: control,compute,login,update
42+
inventory_groups: fatimage
4343
env:
4444
ANSIBLE_FORCE_COLOR: True
4545
OS_CLOUD: openstack
@@ -118,6 +118,11 @@ jobs:
118118
. venv/bin/activate
119119
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" || true
120120
121+
- name: Set image properties
122+
run: |
123+
. venv/bin/activate
124+
. dev/image-set-properties.sh "${{ steps.manifest.outputs.image-id }}"
125+
121126
- name: Upload manifest artifact
122127
uses: actions/upload-artifact@v4
123128
with:

.github/workflows/main.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,6 @@ jobs:
143143
name: Trivy scan image for vulnerabilities
144144
needs: files_changed
145145
if: |
146-
github.event_name == 'pull_request' &&
147146
needs.files_changed.outputs.trivyscan == 'true'
148147
uses: ./.github/workflows/trivyscan.yml
149148
secrets: inherit

.github/workflows/nightly-cleanup.yml

Lines changed: 10 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -46,53 +46,20 @@ jobs:
4646
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
4747
shell: bash
4848

49-
- name: Find CI clusters
49+
- name: Delete all CI clusters
5050
run: |
5151
. venv/bin/activate
52-
CI_CLUSTERS=$(openstack server list | grep --only-matching 'slurmci-RL.-[0-9]\+' | sort | uniq || true)
53-
echo "DEBUG: Raw CI clusters: $CI_CLUSTERS"
54-
55-
if [[ -z "$CI_CLUSTERS" ]]; then
56-
echo "No matching CI clusters found."
57-
else
58-
# Flatten multiline value so can be passed as env var
59-
CI_CLUSTERS_FORMATTED=$(echo "$CI_CLUSTERS" | tr '\n' ' ' | sed 's/ $//')
60-
echo "DEBUG: Formatted CI clusters: $CI_CLUSTERS_FORMATTED"
61-
echo "ci_clusters=$CI_CLUSTERS_FORMATTED" >> "$GITHUB_ENV"
62-
fi
52+
./dev/delete-cluster.py slurmci-RL --force
6353
shell: bash
64-
65-
- name: Delete CI clusters
54+
55+
- name: Delete all CI extra build VMs and volumes
6656
run: |
6757
. venv/bin/activate
68-
if [[ -z ${ci_clusters} ]]; then
69-
echo "No clusters to delete."
70-
exit 0
71-
fi
72-
73-
for cluster_prefix in ${ci_clusters}
74-
do
75-
echo "Processing cluster: $cluster_prefix"
76-
77-
# Get all servers with the matching name for control node
78-
CONTROL_SERVERS=$(openstack server list --name "${cluster_prefix}-control" --format json)
79-
80-
# Get unique server names to avoid duplicate cleanup
81-
UNIQUE_NAMES=$(echo "$CONTROL_SERVERS" | jq -r '.[].Name' | sort | uniq)
82-
for name in $UNIQUE_NAMES; do
83-
echo "Deleting cluster with control node: $name"
84-
85-
# Get the first matching server ID by name
86-
server=$(echo "$CONTROL_SERVERS" | jq -r '.[] | select(.Name=="'"$name"'") | .ID' | head -n1)
87-
88-
# Make sure server still exists (wasn't deleted earlier)
89-
if ! openstack server show "$server" &>/dev/null; then
90-
echo "Server $server no longer exists, skipping $name."
91-
continue
92-
fi
58+
./dev/delete-cluster.py openhpc-extra-RL --force
59+
shell: bash
9360

94-
echo "Deleting cluster $cluster_prefix (server $server)..."
95-
./dev/delete-cluster.py "$cluster_prefix" --force
96-
done
97-
done
61+
- name: Delete all fatimage build VMs and volumes
62+
run: |
63+
. venv/bin/activate
64+
./dev/delete-cluster.py openhpc-RL --force
9865
shell: bash

.github/workflows/s3-image-sync.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,11 @@ jobs:
168168
. venv/bin/activate
169169
bash .github/bin/get-s3-image.sh ${{ env.TARGET_IMAGE }} ${{ env.S3_BUCKET }}
170170
171+
- name: Set Glance image properties correctly for Slurm images
172+
run: |
173+
. venv/bin/activate
174+
. dev/image-set-properties.sh "${{ env.TARGET_IMAGE }}"
175+
171176
- name: Cleanup OpenStack Image (on error or cancellation)
172177
if: cancelled() || failure()
173178
run: |

.github/workflows/stackhpc.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,12 @@ jobs:
107107
. venv/bin/activate
108108
. environments/.stackhpc/activate
109109
cd "$STACKHPC_TF_DIR"
110-
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
110+
max_retries=3
111+
delay=30
112+
for i in $(seq 1 $max_retries); do
113+
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" && break
114+
[ "$i" -lt "$max_retries" ] && sleep $delay || exit 1
115+
done
111116
112117
- name: Delete infrastructure if provisioning failed
113118
run: |

.github/workflows/trivyscan.yml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ jobs:
102102
run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'
103103

104104
- name: Run Trivy vulnerability scanner
105-
uses: aquasecurity/trivy-action@0.24.0
105+
uses: aquasecurity/trivy-action@v0.33.1
106106
with:
107107
scan-type: fs
108108
scan-ref: "${{ steps.manifest.outputs.image-name }}"
@@ -116,13 +116,13 @@ jobs:
116116
TRIVY_DB_REPOSITORY: ghcr.io/azimuth-cloud/trivy-db:2
117117

118118
- name: Upload Trivy scan results to GitHub Security tab
119-
uses: github/codeql-action/upload-sarif@v3
119+
uses: github/codeql-action/upload-sarif@v4
120120
with:
121121
sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif"
122122
category: "${{ matrix.build }}"
123123

124124
- name: Fail if scan has CRITICAL vulnerabilities
125-
uses: aquasecurity/trivy-action@0.24.0
125+
uses: aquasecurity/trivy-action@v0.33.1
126126
with:
127127
scan-type: fs
128128
scan-ref: "${{ steps.manifest.outputs.image-name }}"
@@ -132,6 +132,8 @@ jobs:
132132
severity: 'CRITICAL'
133133
ignore-unfixed: true
134134
timeout: 15m
135+
# On a subsequent call to the action we know trivy is already installed so can skip this
136+
skip-setup-trivy: true
135137
env:
136138
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
137139
TRIVY_DB_REPOSITORY: ghcr.io/azimuth-cloud/trivy-db:2

ansible/adhoc/sync-pulp.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,5 @@
55
name: pulp_site
66
tasks_from: sync.yml
77
vars:
8-
pulp_site_target_arch: "x86_64"
9-
pulp_site_target_distribution: "rocky"
108
# default distribution to *latest* specified for baseos repo:
119
pulp_site_target_distribution_version: "{{ dnf_repos_repos['baseos'].keys() | map('float') | sort | last }}"

ansible/fatimage.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@
117117
- name: Install OpenHPC
118118
ansible.builtin.import_role:
119119
name: stackhpc.openhpc
120-
tasks_from: install.yml
120+
tasks_from: install-ohpc.yml
121121
when: "'openhpc' in group_names"
122122

123123
# - import_playbook: portal.yml
@@ -206,6 +206,7 @@
206206
ansible.builtin.include_role:
207207
name: hpctests
208208
tasks_from: source-hpl.yml
209+
when: "'hpctests' in group_names"
209210

210211
- hosts: prometheus
211212
become: true

ansible/roles/cuda/defaults/main.yml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,17 @@
22
# yamllint disable-line rule:line-length
33
cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo"
44
cuda_nvidia_driver_stream: '580-open'
5-
cuda_nvidia_driver_pkg: "nvidia-open-3:580.82.07-1.el{{ ansible_distribution_major_version }}"
5+
cuda_nvidia_driver_version: '580.82.07-1'
6+
cuda_nvidia_driver_pkg: "nvidia-open-3:{{ cuda_nvidia_driver_version }}.el{{ ansible_distribution_major_version }}"
67
cuda_package_version: '13.0.1-1'
78
cuda_version_short: "{{ (cuda_package_version | split('.'))[0:2] | join('.') }}" # major.minor
8-
cuda_packages:
9+
cuda_packages_default:
910
- "cuda-toolkit-{{ cuda_package_version }}"
1011
- nvidia-gds
1112
- cmake
13+
cuda_packages_fabricmanager:
14+
- "nvidia-fabricmanager-{{ cuda_nvidia_driver_version }}"
15+
cuda_packages: "{{ cuda_packages_default + ( cuda_packages_fabricmanager if cuda_install_nvidiafabricmanger | bool else [] ) }}"
1216
cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"
1317
cuda_samples_path: "/var/lib/{{ ansible_user }}/cuda_samples"
1418
cuda_samples_programs:

0 commit comments

Comments
 (0)