Skip to content

Commit 5b6afd9

Browse files
[Build] Further mitigate build image failures in RHEL/Rocky caused by out-of-sync repositories
Generalizes the approach introduced in #3060. Introduces a `robust_package` resource that wraps the Chef `package` resource with a metadata-refresh retry strategy on RHEL/Rocky to mitigate transient build failures from out-of-sync RHUI mirrors. On Amazon Linux and Debian-based platforms it falls back to the stock `package` resource with retries, so behavior is unchanged. In addition, this commit refactors the affected resources to reduce code duplication.
1 parent 3f722af commit 5b6afd9

20 files changed

Lines changed: 134 additions & 128 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
88

99
**ENHANCEMENTS**
1010
- Improve resilience of EBS volume attachment during cluster creation by retrying on transient IMDS connectivity failures.
11+
- Further reduce transient build-image failures on RHEL and Rocky caused by out-of-sync repo mirrors by resetting metadata upon retry.
1112

1213
**BUG FIXES**
1314
- Fix cluster creation failure caused by Slurm accounting bootstrap failing when ClusterName is overridden.

cookbooks/aws-parallelcluster-environment/resources/efa/partial/_common.rb

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,8 @@
4545
package_repos 'update package repos' do
4646
action :update
4747
end
48-
package prerequisites do
49-
retries 3
50-
retry_delay 5
48+
robust_package 'install efa prerequisites' do
49+
packages prerequisites
5150
end
5251

5352
action_download_and_install

cookbooks/aws-parallelcluster-environment/resources/efs/partial/_install_from_tar.rb

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,8 @@
2323
action :update
2424
end
2525

26-
package prerequisites do
27-
retries 3
28-
retry_delay 5
26+
robust_package 'install efs-utils prerequisites' do
27+
packages prerequisites
2928
end
3029

3130
directory node['cluster']['sources_dir'] do

cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def self.configure(chef_run)
5656
if %(redhat rocky).include?(platform) || platform == 'amazon' && version == '2023'
5757
%w(environment-modules libibverbs-utils librdmacm-utils rdma-core-devel)
5858
else
59-
"environment-modules"
59+
%w(environment-modules)
6060
end
6161
end
6262
let(:chef_run) do
@@ -98,7 +98,8 @@ def self.configure(chef_run)
9898
is_expected.not_to write_log('efa installed')
9999
is_expected.not_to remove_package(%w(openmpi-devel openmpi))
100100
is_expected.to update_package_repos('update package repos')
101-
is_expected.to install_package(prerequisites)
101+
is_expected.to install_robust_package('install efa prerequisites')
102+
.with(packages: prerequisites)
102103
is_expected.to create_if_missing_remote_file("#{source_dir}/aws-efa-installer.tar.gz")
103104
is_expected.not_to run_bash('install efa')
104105
end
@@ -124,7 +125,8 @@ def self.configure(chef_run)
124125
is_expected.not_to write_log('efa installed')
125126
is_expected.to remove_package(platform == 'ubuntu' ? ['libopenmpi-dev'] : %w(openmpi-devel openmpi))
126127
is_expected.to update_package_repos('update package repos')
127-
is_expected.to install_package(prerequisites)
128+
is_expected.to install_robust_package('install efa prerequisites')
129+
.with(packages: prerequisites)
128130
is_expected.to create_if_missing_remote_file("#{source_dir}/aws-efa-installer.tar.gz")
129131
.with(source: "https://efa-installer.amazonaws.com/aws-efa-installer-#{efa_version}.tar.gz")
130132
.with(mode: '0644')

cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,9 +155,8 @@ def mock_already_installed(package, expected_version, installed)
155155
end
156156

157157
it 'installs prerequisites' do
158-
is_expected.to install_package(required_packages[platform])
159-
.with(retries: 3)
160-
.with(retry_delay: 5)
158+
is_expected.to install_robust_package('install efs-utils prerequisites')
159+
.with(packages: required_packages[platform])
161160
end
162161

163162
it 'downloads tarball' do

cookbooks/aws-parallelcluster-platform/resources/enroot/partial/_enroot_rhel.rb

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,8 @@
1515
action :install_package do
1616
return unless nvidia_enabled? || nvidia_installed?
1717

18-
package prerequisites do
19-
retries 3
20-
retry_delay 5
18+
robust_package 'install enroot prerequisites' do
19+
packages prerequisites
2120
end
2221

2322
bash "Install enroot" do

cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,8 @@ def gdrcopy_checksum
5151
action :update
5252
end
5353

54-
package gdrcopy_build_dependencies do
55-
retries 3
56-
retry_delay 5
54+
robust_package 'install gdrcopy build dependencies' do
55+
packages gdrcopy_build_dependencies
5756
end
5857

5958
bash 'Install NVIDIA GDRCopy' do

cookbooks/aws-parallelcluster-platform/resources/install_packages/install_packages_alinux2023.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
end
1818

1919
use 'partial/_install_packages_common.rb'
20-
use 'partial/_install_packages_rhel_amazon.rb'
2120

2221
action :install do
2322
package new_resource.packages do

cookbooks/aws-parallelcluster-platform/resources/install_packages/install_packages_redhat8.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
end
1818

1919
use 'partial/_install_packages_common.rb'
20-
use 'partial/_install_packages_rhel_rocky.rb'
2120

2221
def default_packages
2322
# environment-modules required by EFA, Intel MPI and ARM PL

cookbooks/aws-parallelcluster-platform/resources/install_packages/install_packages_rocky8.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
end
1818

1919
use 'partial/_install_packages_common.rb'
20-
use 'partial/_install_packages_rhel_rocky.rb'
2120

2221
def default_packages
2322
# environment-modules required by EFA, Intel MPI and ARM PL

0 commit comments

Comments
 (0)