Skip to content

Commit beebd0a

Browse files
authored
Merge branch 'develop' into wip/xuanqi-develop-314
2 parents aa590b9 + 1fca286 commit beebd0a

File tree

5 files changed

+135
-92
lines changed

5 files changed

+135
-92
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,12 @@ CHANGELOG
1919
3.13.2
2020
------
2121

22+
**BUG FIXES**
23+
- Fix build image failures occurring on non-latest versions of Rocky Linux 9.
24+
25+
3.13.1
26+
------
27+
2228
**CHANGES**
2329
- Upgrade Slurm to version 24.05.8.
2430
- Upgrade EFA installer to 1.41.0 (from 1.38.1).

cli/src/pcluster/resources/imagebuilder/parallelcluster.yaml

Lines changed: 55 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ phases:
182182
fi
183183
fi
184184
185-
- name: PinKernelVersion
185+
- name: PinVersion
186186
action: ExecuteBash
187187
inputs:
188188
commands:
@@ -191,7 +191,38 @@ phases:
191191
OS='{{ build.OperatingSystemName.outputs.stdout }}'
192192
PLATFORM='{{ build.PlatformName.outputs.stdout }}'
193193
KERNEL_VERSION=$(uname -a)
194+
RELEASE_VERSION='{{ build.OperatingSystemVersion.outputs.stdout }}'
194195
if [[ ${!PLATFORM} == RHEL ]]; then
196+
if [[ ${!OS} == rhel9 ]] || [[ ${!OS} == rocky9 ]]; then
197+
if [[ ! -f /etc/yum/vars/releasever ]]; then
198+
echo "yes" > /opt/parallelcluster/pin_releasesever
199+
echo ${!RELEASE_VERSION} > /etc/yum/vars/releasever
200+
yum clean all
201+
fi
202+
fi
203+
PACKAGE_LIST="kernel-headers-$(uname -r) kernel-devel-$(uname -r)"
204+
if [[ ${!OS} != "rocky8" ]] && [[ ${!OS} != "rhel8" ]]; then
205+
PACKAGE_LIST+=" kernel-devel-matched-$(uname -r)"
206+
fi
207+
208+
if [[ ${!OS} == "rocky8" ]] || [[ ${!OS} == "rocky9" ]] ; then
209+
for PACKAGE in ${!PACKAGE_LIST}
210+
do
211+
yum install -y ${!PACKAGE}
212+
if [ $? -ne 0 ]; then
213+
# Enable vault repository
214+
sed -i 's|^#baseurl=http://dl.rockylinux.org/$contentdir|baseurl=http://dl.rockylinux.org/vault/rocky|g' /etc/yum.repos.d/*.repo
215+
sed -i 's|^#baseurl=https://dl.rockylinux.org/$contentdir|baseurl=https://dl.rockylinux.org/vault/rocky|g' /etc/yum.repos.d/*.repo
216+
yum install -y ${!PACKAGE}
217+
fi
218+
done
219+
else
220+
for PACKAGE in ${!PACKAGE_LIST}
221+
do
222+
yum -y install ${!PACKAGE}
223+
done
224+
fi
225+
195226
yum install -y yum-plugin-versionlock
196227
# listing all the packages because wildcard does not work as expected
197228
yum versionlock kernel kernel-core kernel-modules
@@ -202,9 +233,24 @@ phases:
202233
yum versionlock redhat-release
203234
fi
204235
else
236+
apt-get -y install linux-headers-$(uname -r)
205237
apt-mark hold linux-aws* linux-base* linux-headers* linux-image*
206238
fi
207-
echo "Kernel version is ${!KERNEL_VERSION}"
239+
echo "Kernel version is ${!KERNEL_VERSION}"
240+
241+
- name: DisableNouveau
242+
action: ExecuteBash
243+
inputs:
244+
commands:
245+
- |
246+
set -v
247+
PLATFORM='{{ build.PlatformName.outputs.stdout }}'
248+
/bin/sed -r -i -e 's/GRUB_CMDLINE_LINUX="(.*)"/GRUB_CMDLINE_LINUX="\1 rd.driver.blacklist=nouveau nouveau.modeset=0"/' /etc/default/grub
249+
if [[ ${!PLATFORM} == RHEL ]]; then
250+
grub2-mkconfig -o /boot/grub2/grub.cfg
251+
elif [[ ${!PLATFORM} == DEBIAN ]]; then
252+
update-grub
253+
fi
208254
209255
# Install prerequisite OS packages
210256
- name: InstallPrerequisite
@@ -215,16 +261,8 @@ phases:
215261
set -v
216262
OS='{{ build.OperatingSystemName.outputs.stdout }}'
217263
PLATFORM='{{ build.PlatformName.outputs.stdout }}'
218-
VERSION='{{ build.OperatingSystemVersion.outputs.stdout }}'
219264
220265
if [[ ${!PLATFORM} == RHEL ]]; then
221-
if [[ ${!OS} == rhel9 ]] || [[ ${!OS} == rocky9 ]]; then
222-
if [[ ! -f /etc/yum/vars/releasever ]]; then
223-
echo "yes" > /opt/parallelcluster/pin_releasesever
224-
echo ${!VERSION} > /etc/yum/vars/releasever
225-
yum clean all
226-
fi
227-
fi
228266
yum -y update krb5-libs
229267
yum -y groupinstall development && sudo yum -y install wget jq
230268
if [[ ${!OS} != alinux2023 ]]; then
@@ -244,6 +282,13 @@ phases:
244282
apt-get -y install build-essential curl wget jq
245283
fi
246284
285+
- name: RebootStep
286+
action: Reboot
287+
onFailure: Abort
288+
maxAttempts: 2
289+
inputs:
290+
delaySeconds: 10
291+
247292
# Install Cinc
248293
- name: InstallCinc
249294
action: ExecuteBash

cli/src/pcluster/resources/imagebuilder/update_and_reboot.yaml

Lines changed: 0 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -97,20 +97,6 @@ phases:
9797
fi
9898
fi
9999
100-
- name: DisableNouveau
101-
action: ExecuteBash
102-
inputs:
103-
commands:
104-
- |
105-
set -v
106-
PLATFORM='{{ build.PlatformName.outputs.stdout }}'
107-
/bin/sed -r -i -e 's/GRUB_CMDLINE_LINUX="(.*)"/GRUB_CMDLINE_LINUX="\1 rd.driver.blacklist=nouveau nouveau.modeset=0"/' /etc/default/grub
108-
if [[ ${!PLATFORM} == RHEL ]]; then
109-
grub2-mkconfig -o /boot/grub2/grub.cfg
110-
elif [[ ${!PLATFORM} == DEBIAN ]]; then
111-
update-grub
112-
fi
113-
114100
- name: DisableUnattendedUpgrades
115101
action: ExecuteBash
116102
inputs:
@@ -129,20 +115,6 @@ phases:
129115
# update package index
130116
DEBIAN_FRONTEND=noninteractive apt-get -y update
131117
fi
132-
133-
- name: InstallEfiBootManager
134-
action: ExecuteBash
135-
inputs:
136-
commands:
137-
- |
138-
set -v
139-
PLATFORM='{{ build.PlatformName.outputs.stdout }}'
140-
ARCH=$(uname -m)
141-
if [[ `echo ${!ARCH}` == 'aarch64' ]] && [[ ${!PLATFORM} == DEBIAN ]]; then
142-
# temporary workaround to solve https://bugs.launchpad.net/ubuntu/+source/grub2-signed/+bug/1936857
143-
apt-get -y install efibootmgr
144-
fi
145-
146118
- name: InstallPrerequisites
147119
action: ExecuteBash
148120
inputs:
@@ -244,42 +216,6 @@ phases:
244216
maxAttempts: 2
245217
inputs:
246218
delaySeconds: 10
247-
- name: InstallAdditionalKernelPackages
248-
action: ExecuteBash
249-
inputs:
250-
commands:
251-
- |
252-
set -v
253-
OS='{{ build.OperatingSystemName.outputs.stdout }}'
254-
PLATFORM='{{ build.PlatformName.outputs.stdout }}'
255-
DISABLE_KERNEL_UPDATE='{{ build.DisableKernelUpdate.outputs.stdout }}'
256-
257-
if [[ ${!PLATFORM} == RHEL ]]; then
258-
# Install kernel-devel during OS update, so that headers are aligned with new kernel.
259-
# The same is done for Debian through `apt-get -y install linux-aws`
260-
if [[ ${!OS} == "rocky8" ]] ; then
261-
PACKAGE="kernel-devel-$(uname -r)"
262-
RELEASE_VERSION=$(source /etc/os-release && echo ${!VERSION_ID})
263-
264-
# try to install kernel source for a specific release version
265-
yum install -y ${!PACKAGE} --releasever ${!RELEASE_VERSION}
266-
if [ $? -ne 0 ]; then
267-
yum install -y wget
268-
# Previous releases are moved into a vault area once a new minor release version is available for at least a week.
269-
# https://wiki.rockylinux.org/rocky/repo/#notes-on-devel
270-
wget https://dl.rockylinux.org/vault/rocky/${!RELEASE_VERSION}/BaseOS/$(uname -m)/os/Packages/k/${!PACKAGE}.rpm
271-
yum install -y ./${!PACKAGE}.rpm
272-
fi
273-
else
274-
yum -y install kernel-headers-$(uname -r)
275-
yum -y install kernel-devel-$(uname -r)
276-
fi
277-
278-
elif [[ ${!PLATFORM} == DEBIAN ]]; then
279-
if [[ ${!DISABLE_KERNEL_UPDATE} != true ]]; then
280-
apt-get -y install linux-aws linux-headers-aws linux-image-aws
281-
fi
282-
fi
283219

284220
- name: RemoveKernelPin
285221
action: ExecuteBash

tests/integration-tests/README.md

Lines changed: 73 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -38,21 +38,22 @@ that lists all the available options:
3838
python -m test_runner --help
3939
usage: test_runner.py [-h] --key-name KEY_NAME --key-path KEY_PATH [-n PARALLELISM] [--sequential] [--credential CREDENTIAL] [--use-default-iam-credentials] [--retry-on-failures] [--tests-root-dir TESTS_ROOT_DIR] [--global-build-number GLOBAL_BUILD_NUMBER] [-c TESTS_CONFIG] [-i [INSTANCES ...]]
4040
[-o [OSS ...]] [-s [SCHEDULERS ...]] [-r [REGIONS ...]] [-f FEATURES [FEATURES ...]] [--show-output] [--reports {html,junitxml,json,cw} [{html,junitxml,json,cw} ...]] [--cw-region CW_REGION] [--cw-namespace CW_NAMESPACE] [--cw-timestamp-day-start] [--output-dir OUTPUT_DIR]
41-
[--custom-node-url CUSTOM_NODE_URL] [--custom-cookbook-url CUSTOM_COOKBOOK_URL] [--createami-custom-cookbook-url CREATEAMI_CUSTOM_COOKBOOK_URL] [--createami-custom-node-url CREATEAMI_CUSTOM_NODE_URL] [--custom-awsbatchcli-url CUSTOM_AWSBATCHCLI_URL] [--pre-install PRE_INSTALL]
42-
[--post-install POST_INSTALL] [--instance-types-data INSTANCE_TYPES_DATA] [--custom-ami CUSTOM_AMI] [--pcluster-git-ref PCLUSTER_GIT_REF] [--cookbook-git-ref COOKBOOK_GIT_REF] [--node-git-ref NODE_GIT_REF] [--ami-owner AMI_OWNER] [--available-amis-oss-x86 [AVAILABLE_AMIS_OSS_X86 ...]]
43-
[--available-amis-oss-arm [AVAILABLE_AMIS_OSS_ARM ...]] [--benchmarks] [--benchmarks-target-capacity BENCHMARKS_TARGET_CAPACITY] [--benchmarks-max-time BENCHMARKS_MAX_TIME] [--scaling-test-config SCALING_TEST_CONFIG]
44-
[--cluster-custom-resource-service-token CLUSTER_CUSTOM_RESOURCE_SERVICE_TOKEN] [--resource-bucket RESOURCE_BUCKET] [--lambda-layer-source LAMBDA_LAYER_SOURCE] [--api-definition-s3-uri API_DEFINITION_S3_URI] [--api-infrastructure-s3-uri API_INFRASTRUCTURE_S3_URI] [--api-uri API_URI]
45-
[--policies-uri POLICIES_URI] [--vpc-stack VPC_STACK] [--cluster CLUSTER] [--no-delete] [--delete-logs-on-success] [--stackname-suffix STACKNAME_SUFFIX] [--dry-run] [--iam-user-role-stack-name IAM_USER_ROLE_STACK_NAME] [--directory-stack-name DIRECTORY_STACK_NAME]
46-
[--slurm-database-stack-name SLURM_DATABASE_STACK_NAME] [--slurm-dbd-stack-name SLURM_DBD_STACK_NAME] [--munge-key-secret-arn MUNGE_KEY_SECRET_ARN] [--external-shared-storage-stack-name EXTERNAL_SHARED_STORAGE_STACK_NAME] [--bucket-name BUCKET_NAME]
47-
[--custom-security-groups-stack-name CUSTOM_SECURITY_GROUPS_STACK_NAME] [--force-run-instances] [--force-elastic-ip] [--retain-ad-stack] [--proxy-stack PROXY_STACK] [--build-image-roles-stack BUILD_IMAGE_ROLES_STACK] [--api-stack API_STACK]
41+
[--custom-node-url CUSTOM_NODE_URL] [--custom-cookbook-url CUSTOM_COOKBOOK_URL] [--createami-custom-cookbook-url CREATEAMI_CUSTOM_COOKBOOK_URL] [--createami-custom-node-url CREATEAMI_CUSTOM_NODE_URL] [--custom-awsbatchcli-url CUSTOM_AWSBATCHCLI_URL]
42+
[--pcluster-installer-path PCLUSTER_INSTALLER_PATH] [--pre-install PRE_INSTALL] [--post-install POST_INSTALL] [--instance-types-data INSTANCE_TYPES_DATA] [--custom-ami CUSTOM_AMI] [--pcluster-git-ref PCLUSTER_GIT_REF] [--cookbook-git-ref COOKBOOK_GIT_REF]
43+
[--node-git-ref NODE_GIT_REF] [--ami-owner AMI_OWNER] [--available-amis-oss-x86 [AVAILABLE_AMIS_OSS_X86 ...]] [--available-amis-oss-arm [AVAILABLE_AMIS_OSS_ARM ...]] [--benchmarks] [--benchmarks-target-capacity BENCHMARKS_TARGET_CAPACITY] [--benchmarks-max-time BENCHMARKS_MAX_TIME]
44+
[--scaling-test-config SCALING_TEST_CONFIG] [--cluster-custom-resource-service-token CLUSTER_CUSTOM_RESOURCE_SERVICE_TOKEN] [--resource-bucket RESOURCE_BUCKET] [--lambda-layer-source LAMBDA_LAYER_SOURCE] [--api-definition-s3-uri API_DEFINITION_S3_URI]
45+
[--api-infrastructure-s3-uri API_INFRASTRUCTURE_S3_URI] [--api-uri API_URI] [--policies-uri POLICIES_URI] [--vpc-stack VPC_STACK] [--cluster CLUSTER] [--no-delete] [--delete-logs-on-success] [--stackname-suffix STACKNAME_SUFFIX] [--dry-run]
46+
[--iam-user-role-stack-name IAM_USER_ROLE_STACK_NAME] [--directory-stack-name DIRECTORY_STACK_NAME] [--slurm-database-stack-name SLURM_DATABASE_STACK_NAME] [--slurm-dbd-stack-name SLURM_DBD_STACK_NAME] [--munge-key-secret-arn MUNGE_KEY_SECRET_ARN]
47+
[--external-shared-storage-stack-name EXTERNAL_SHARED_STORAGE_STACK_NAME] [--bucket-name BUCKET_NAME] [--custom-security-groups-stack-name CUSTOM_SECURITY_GROUPS_STACK_NAME] [--force-run-instances] [--force-elastic-ip] [--retain-ad-stack] [--proxy-stack PROXY_STACK]
48+
[--build-image-roles-stack BUILD_IMAGE_ROLES_STACK] [--api-stack API_STACK]
4849
4950
Run integration tests suite.
5051
5152
options:
5253
-h, --help show this help message and exit
5354
--key-name KEY_NAME Key to use for EC2 instances (default: None)
5455
--key-path KEY_PATH Path to the key to use for SSH connections (default: None)
55-
-n PARALLELISM, --parallelism PARALLELISM
56+
-n, --parallelism PARALLELISM
5657
Tests parallelism for every region. (default: None)
5758
--sequential Run tests in a single process. When not specified tests will spawn a process for each region under test. (default: False)
5859
--credential CREDENTIAL
@@ -67,18 +68,16 @@ options:
6768
The build number passed from the testing pipelines (default: 0)
6869
6970
Test dimensions:
70-
-c TESTS_CONFIG, --tests-config TESTS_CONFIG
71-
Config file that specifies the tests to run and the dimensions to enable for each test. Note that when a config file is used the following flags are ignored: instances, regions, oss, schedulers. Refer to the docs for further details on the config format: https://github.com/aws/aws-
72-
parallelcluster/blob/develop/tests/integration-tests/README.md (default: None)
73-
-i [INSTANCES ...], --instances [INSTANCES ...]
71+
-c, --tests-config TESTS_CONFIG
72+
Config file that specifies the tests to run and the dimensions to enable for each test. Note that when a config file is used the following flags are ignored: instances, regions, oss, schedulers. Refer to the docs for further details on the config format: https://github.com/aws/aws-parallelcluster/blob/develop/tests/integration-tests/README.md (default: None)
73+
-i, --instances [INSTANCES ...]
7474
AWS instances under test. Ignored when tests-config is used. (default: [])
75-
-o [OSS ...], --oss [OSS ...]
76-
OSs under test. Ignored when tests-config is used. (default: [])
77-
-s [SCHEDULERS ...], --schedulers [SCHEDULERS ...]
75+
-o, --oss [OSS ...] OSs under test. Ignored when tests-config is used. (default: [])
76+
-s, --schedulers [SCHEDULERS ...]
7877
Schedulers under test. Ignored when tests-config is used. (default: [])
79-
-r [REGIONS ...], --regions [REGIONS ...]
78+
-r, --regions [REGIONS ...]
8079
AWS regions where tests are executed. Ignored when tests-config is used. (default: [])
81-
-f FEATURES [FEATURES ...], --features FEATURES [FEATURES ...]
80+
-f, --features FEATURES [FEATURES ...]
8281
Run only tests for the listed features. Prepending the not keyword to the feature name causes the feature to be excluded. (default: )
8382
8483
Test reports:
@@ -105,6 +104,8 @@ Custom packages and templates:
105104
URL to a custom node package for the createami command. (default: None)
106105
--custom-awsbatchcli-url CUSTOM_AWSBATCHCLI_URL
107106
URL to a custom awsbatch cli package. (default: None)
107+
--pcluster-installer-path PCLUSTER_INSTALLER_PATH
108+
Path to ParallelCluster installer. (default: None)
108109
--pre-install PRE_INSTALL
109110
URL to a pre install script (default: None)
110111
--post-install POST_INSTALL
@@ -312,6 +313,54 @@ the test will be executed in
312313
* `eu-west-1` using the AZ with ZoneId `euw1-az1` (ZoneId is consistent across accounts)
313314
* `eu-central-1` using a random AZ available in the region
314315

316+
#### OS Rotation
317+
The framework includes automatic OS rotation to ensure that all supported operating systems are tested regularly:
318+
319+
- The framework calculates a rotation index daily and assigns values to the Jinja variables when the tests run
320+
- Tests can be constrained to specific available AMIs using the `--available-amis-oss-x86` and `--available-amis-oss-arm` parameters
321+
- It can be customized to filter OS choices based on feature compatibility (e.g., DCV, Lustre, Batch scheduler support)
322+
323+
Use Jinja variables in test configs with the pattern:
324+
```
325+
dimensions:
326+
- oss: [ {{ OS_X86_3 }} ]
327+
```
328+
329+
#### Instance Type Rotation
330+
331+
Similar to OS rotation, the framework also rotates instance types for dynamic testing:
332+
333+
- Instance types are automatically discovered and rotated for major regions (us-east-1, us-west-2, eu-west-1)
334+
- Excludes legacy instance types (m1, m2, t1, etc.) for better test reliability
335+
336+
Use Jinja variables in test configs with the pattern:
337+
```
338+
dimensions:
339+
- instances: [{{ US_EAST_1_INSTANCE_TYPE_0 }}.xlarge]
340+
```
341+
342+
#### Capacity Reservation Management
343+
344+
The framework includes automatic capacity reservation management for tests that require specific instance types with guaranteed capacity. This feature:
345+
346+
- Automatically detects capacity reservation requirements in test configuration files using Jinja variables
347+
- Creates or modifies existing EC2 capacity reservations as needed
348+
- Supports placement groups and time-limited reservations
349+
- Falls back to default availability zones if reservations cannot be created
350+
351+
Use Jinja variables in test configs with the pattern:
352+
```
353+
dimensions:
354+
- regions: [{{ INSTANCE_TYPE_CAPACITY_RESERVATION_COUNT_INSTANCES_HOURS_HOURS_[YESPG|NOPG]_[OS] }}]
355+
```
356+
357+
Example:
358+
```
359+
# Reserve 2 c5.xlarge instances for 2 hours with placement group
360+
dimensions:
361+
- regions: [{{ c5_xlarge_CAPACITY_RESERVATION_2_INSTANCES_2_HOURS_YESPG_alinux2023 }}]
362+
```
363+
315364
#### Using CLI options
316365

317366
The following options can be used to control the parametrization of test cases:
@@ -390,6 +439,13 @@ metrics. You can use the options `--cw-region` (default `us-east-1`) and `--cw-n
390439
(default `ParallelCluster/IntegrationTests`) to specify what region and what metric namespace
391440
you want to use for the published metrics.
392441

442+
### Test Metadata Management
443+
444+
The framework automatically manages test metadata using DynamoDB tables for tracking test execution and results across regions. This system:
445+
446+
- Creates metadata tables automatically in the appropriate reporting region
447+
- Tracks test execution metadata for analysis and reporting
448+
393449
### Parallelize Tests Execution
394450
The following options can be used to control tests parallelism:
395451
* `--sequential`: by default the tests orchestrator executes a separate parallel process for each region under test.

tests/integration-tests/framework/fixture_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def release(self):
126126
return
127127

128128
timeout = time.time() + 4 * 60 * 60 # 4 hours from now
129-
while data.counter > 1:
129+
while min(data.counter, len(data.currently_using_processes)) > 1:
130130
logging.info(
131131
"Waiting for all processes to release shared fixture %s, currently in use by %d processes (%s)",
132132
self.name,

0 commit comments

Comments
 (0)