Skip to content

Commit bfef93c

Browse files
committed
[UpdateWorkflow] Make login nodes use head-node-driven orchestration for the update workflow.
With this change, login nodes no longer depend on cfn-hup/cfn-init and instead mirror the update mechanism already adopted for compute nodes, which we expect to make the update workflow more resilient.
1 parent 5b6afd9 commit bfef93c

26 files changed

Lines changed: 329 additions & 131 deletions

File tree

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
99
**ENHANCEMENTS**
1010
- Improve resilience of EBS volume attachment during cluster creation by retrying on transient IMDS connectivity failures.
1111
- Further reduce transient build-image failures on RHEL and Rocky caused by out-of-sync repo mirrors by resetting metadata upon retry.
12+
- Improve cluster update resiliency on login nodes by reusing the head-node-driven orchestration already in place on compute nodes,
13+
removing the dependency on cfn-hup and cfn-init.
14+
1215

1316
**BUG FIXES**
1417
- Fix cluster creation failure caused by Slurm accounting bootstrap failing when ClusterName is overridden

cookbooks/aws-parallelcluster-entrypoints/libraries/update_failure_handler.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,14 +74,14 @@ def cleanup_dna_files
7474
else
7575
# No marker — this is an update failure, clean up DNA files and write marker
7676
Chef::Log.info("#{log_prefix} Update failure detected (no marker at #{marker}), cleaning up DNA files")
77-
command = "#{cookbook_virtualenv_path}/bin/python #{cluster_attributes['scripts_dir']}/share_compute_fleet_dna.py --region #{cluster_attributes['region']} --cleanup"
77+
command = "#{cookbook_virtualenv_path}/bin/python #{cluster_attributes['scripts_dir']}/share_dna.py --region #{cluster_attributes['region']} --cleanup"
7878
command_runner.run_with_retries(command, description: "cleanup DNA files")
7979
::File.write(marker, '')
8080
end
8181
rescue => e
8282
# If marker I/O fails, fall back to deleting DNA files
8383
Chef::Log.warn("#{log_prefix} Error during marker check (#{e.message}), falling back to cleaning up DNA files")
84-
command = "#{cookbook_virtualenv_path}/bin/python #{cluster_attributes['scripts_dir']}/share_compute_fleet_dna.py --region #{cluster_attributes['region']} --cleanup"
84+
command = "#{cookbook_virtualenv_path}/bin/python #{cluster_attributes['scripts_dir']}/share_dna.py --region #{cluster_attributes['region']} --cleanup"
8585
command_runner.run_with_retries(command, description: "cleanup DNA files")
8686
end
8787
end

cookbooks/aws-parallelcluster-entrypoints/spec/unit/libraries/update_failure_handler_spec.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@
144144
end
145145

146146
it 'runs the cleanup command' do
147-
expected_command = "#{virtualenv_path}/bin/python #{scripts_dir}/share_compute_fleet_dna.py --region #{region} --cleanup"
147+
expected_command = "#{virtualenv_path}/bin/python #{scripts_dir}/share_dna.py --region #{region} --cleanup"
148148
expect(command_runner).to receive(:run_with_retries).with(expected_command, description: "cleanup DNA files")
149149
handler.cleanup_dna_files
150150
end
@@ -188,7 +188,7 @@
188188
end
189189

190190
it 'falls back to cleaning up DNA files' do
191-
expected_command = "#{virtualenv_path}/bin/python #{scripts_dir}/share_compute_fleet_dna.py --region #{region} --cleanup"
191+
expected_command = "#{virtualenv_path}/bin/python #{scripts_dir}/share_dna.py --region #{region} --cleanup"
192192
expect(command_runner).to receive(:run_with_retries).with(expected_command, description: "cleanup DNA files")
193193
handler.cleanup_dna_files
194194
end

cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/share_compute_fleet_dna.py renamed to cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/share_dna.py

Lines changed: 127 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -24,17 +24,19 @@
2424
from botocore.config import Config
2525
from retrying import retry
2626

27-
COMPUTE_FLEET_SHARED_LOCATION = "/opt/parallelcluster/shared/"
27+
SHARED_LOCATION = "/opt/parallelcluster/shared/"
2828

29-
COMPUTE_FLEET_SHARED_DNA_LOCATION = COMPUTE_FLEET_SHARED_LOCATION + "dna/"
29+
SHARED_DNA_LOCATION = SHARED_LOCATION + "dna/"
3030

31-
COMPUTE_FLEET_LAUNCH_TEMPLATE_CONFIG = COMPUTE_FLEET_SHARED_LOCATION + "launch-templates-config.json"
31+
LAUNCH_TEMPLATE_CONFIG = SHARED_LOCATION + "launch-templates-config.json"
32+
33+
CLUSTER_CONFIG_PATH = SHARED_LOCATION + "cluster-config.yaml"
3234

3335
logger = logging.getLogger(__name__)
3436
logging.basicConfig(level=logging.INFO)
3537

3638

37-
def get_compute_launch_template_ids(lt_config_file_name):
39+
def get_launch_template_ids(lt_config_file_name):
3840
"""
3941
Load launch-templates-config.json.
4042
@@ -53,25 +55,13 @@ def get_compute_launch_template_ids(lt_config_file_name):
5355
}
5456
}
5557
}
56-
},
57-
"queue2": {
58-
"ComputeResources": {
59-
"queue2-i1": {
60-
"LaunchTemplate": {
61-
"Version": "1",
62-
"LogicalId": "LaunchTemplate012345678901234",
63-
"Id": "lt-01234567890123456"
64-
}
65-
}
66-
}
6758
}
6859
}
6960
}
70-
7161
"""
7262
lt_config = None
7363
try:
74-
logger.info("Getting LaunchTemplate ID and versions from %s", lt_config_file_name)
64+
logger.info("Getting LaunchTemplate IDs and versions from %s", lt_config_file_name)
7565
with open(lt_config_file_name, "r", encoding="utf-8") as file:
7666
lt_config = json.loads(file.read())
7767
except Exception as err:
@@ -80,15 +70,78 @@ def get_compute_launch_template_ids(lt_config_file_name):
8070
return lt_config
8171

8272

83-
def share_compute_fleet_dna(args):
84-
"""Create dna.json for each queue in cluster."""
85-
lt_config = get_compute_launch_template_ids(COMPUTE_FLEET_LAUNCH_TEMPLATE_CONFIG)
73+
def get_login_pool_names(cluster_config_path):
74+
"""
75+
Read login pool names from cluster-config.yaml.
76+
77+
Login launch templates are not listed in launch-templates-config.json because doing so
78+
would introduce a circular CloudFormation dependency between the head node launch
79+
template and the LoginNodes nested stack. Pool names are read from cluster-config.yaml
80+
and the corresponding launch templates are resolved at runtime by name.
81+
"""
82+
pool_names = []
83+
try:
84+
logger.info("Reading login pool names from %s", cluster_config_path)
85+
with open(cluster_config_path, "r", encoding="utf-8") as file:
86+
cluster_config = yaml.safe_load(file)
87+
login_nodes = (cluster_config or {}).get("LoginNodes") or {}
88+
for pool in login_nodes.get("Pools") or []:
89+
name = pool.get("Name")
90+
if name:
91+
pool_names.append(name)
92+
except Exception as err:
93+
logger.warning("Unable to read login pool names from %s due to %s", cluster_config_path, err)
94+
return pool_names
95+
96+
97+
def share_dna_files(args):
98+
"""Create dna.json for each compute resource and login pool in the cluster."""
99+
lt_config = get_launch_template_ids(LAUNCH_TEMPLATE_CONFIG)
86100
if lt_config:
101+
# Compute fleet
87102
all_queues = lt_config.get("Queues")
88103
for _, queues in all_queues.items():
89104
compute_resources = queues.get("ComputeResources")
90105
for _, compute_res in compute_resources.items():
91-
get_latest_dna_data(compute_res, COMPUTE_FLEET_SHARED_DNA_LOCATION, args)
106+
get_latest_dna_data(compute_res, SHARED_DNA_LOCATION, args)
107+
108+
# Login nodes: launch templates are resolved at runtime by name
109+
# ({stack_name}-{pool_name}) to avoid a circular CloudFormation dependency between the
110+
# head node launch template and the LoginNodes nested stack.
111+
if args.stack_name:
112+
for pool_name in get_login_pool_names(args.cluster_config or CLUSTER_CONFIG_PATH):
113+
share_login_pool_dna(args.stack_name, pool_name, SHARED_DNA_LOCATION, args)
114+
115+
116+
def share_login_pool_dna(stack_name, pool_name, output_location, args):
117+
"""Fetch the latest UserData for a login pool launch template and write its dna.json to shared storage."""
118+
lt_name = f"{stack_name}-{pool_name}"
119+
user_data = get_user_data_by_name(lt_name, args.region)
120+
if not user_data:
121+
return
122+
123+
write_directives = get_write_directives_section(user_data)
124+
if not write_directives:
125+
return
126+
127+
# The LogicalId used in the dna.json filename must match the launch_template_id baked into
128+
# the login node's own dna.json (read from UserData), since the login node looks up the
129+
# file by that name. Extract it from the UserData itself.
130+
logical_id = None
131+
for entry in write_directives:
132+
if entry.get("path") in ["/tmp/dna.json"]: # nosec B108
133+
try:
134+
dna = json.loads(entry["content"])
135+
logical_id = dna.get("cluster", {}).get("launch_template_id")
136+
except Exception as err: # noqa: BLE001
137+
logger.warning("Unable to extract launch_template_id from dna.json for %s: %s", lt_name, err)
138+
break
139+
140+
if not logical_id:
141+
logger.warning("Skipping login pool %s: launch_template_id missing from UserData dna.json", pool_name)
142+
return
143+
144+
write_dna_files(write_directives, output_location + logical_id)
92145

93146

94147
# FIXME: Fix Code Duplication
@@ -136,6 +189,30 @@ def get_user_data(lt_id, lt_version, region_name):
136189
return decoded_data
137190

138191

192+
@retry(stop_max_attempt_number=5, wait_fixed=3000)
193+
def get_user_data_by_name(lt_name, region_name):
194+
"""Get UserData for the latest version of a Launch Template, looked up by name."""
195+
decoded_data = None
196+
try:
197+
proxy_config = parse_proxy_config()
198+
ec2_client = boto3.client("ec2", region_name=region_name, config=proxy_config)
199+
logger.info("Running EC2 DescribeLaunchTemplateVersions API for %s ($Latest)", lt_name)
200+
response = ec2_client.describe_launch_template_versions(
201+
LaunchTemplateName=lt_name,
202+
Versions=["$Latest"],
203+
).get("LaunchTemplateVersions")
204+
if response:
205+
decoded_data = base64.b64decode(response[0]["LaunchTemplateData"]["UserData"], validate=True).decode(
206+
"utf-8"
207+
)
208+
except Exception as err:
209+
if hasattr(err, "message"):
210+
err = err.message
211+
logger.error("Unable to get UserData for launch template %s.\nException: %s", lt_name, err)
212+
213+
return decoded_data
214+
215+
139216
def get_write_directives_section(user_data):
140217
"""Get write_files section from cloud-config section of MIME formatted UserData."""
141218
write_directives_section = None
@@ -203,7 +280,9 @@ def cleanup(directory_loc):
203280
def _parse_cli_args():
204281
"""Parse command line args."""
205282
parser = argparse.ArgumentParser(
206-
description="Get latest User Data from ComputeFleet Launch Templates.", exit_on_error=False
283+
description="Get latest UserData from ComputeFleet and LoginNodes Launch Templates and "
284+
"share dna.json for each in shared storage.",
285+
exit_on_error=False,
207286
)
208287

209288
parser.add_argument(
@@ -223,6 +302,29 @@ def _parse_cli_args():
223302
help="Cleanup DNA files created",
224303
)
225304

305+
parser.add_argument(
306+
"-s",
307+
"--stack-name",
308+
required=False,
309+
type=str,
310+
default=None,
311+
help=(
312+
"CloudFormation stack name of the cluster. Required to resolve LoginNodes pool launch "
313+
"templates at runtime via DescribeLaunchTemplateVersions."
314+
),
315+
)
316+
317+
parser.add_argument(
318+
"--cluster-config",
319+
required=False,
320+
type=str,
321+
default=None,
322+
help=(
323+
"Path to cluster-config.yaml. Used to read LoginNodes pool names. "
324+
f"Defaults to {CLUSTER_CONFIG_PATH}."
325+
),
326+
)
327+
226328
args = parser.parse_args()
227329

228330
return args
@@ -232,14 +334,14 @@ def main():
232334
try:
233335
args = _parse_cli_args()
234336
if args.cleanup:
235-
cleanup(COMPUTE_FLEET_SHARED_DNA_LOCATION)
337+
cleanup(SHARED_DNA_LOCATION)
236338
else:
237-
share_compute_fleet_dna(args)
339+
share_dna_files(args)
238340
except Exception as err:
239341
if hasattr(err, "message"):
240342
err = err.message
241343
logger.exception(
242-
"Encountered exception when fetching latest dna.json for ComputeFleet, exiting gracefully: %s", err
344+
"Encountered exception when fetching latest dna.json, exiting gracefully: %s", err
243345
)
244346
raise SystemExit(0)
245347

cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@
7272
"platforms": {{ default_platforms | tojson}},
7373
"node_roles": [
7474
"HeadNode",
75-
"LoginNode",
7675
"ExternalSlurmDbd"
7776
],
7877
"feature_conditions": []
@@ -541,7 +540,8 @@
541540
],
542541
"platforms": {{ default_platforms | tojson}},
543542
"node_roles": [
544-
"ComputeFleet"
543+
"ComputeFleet",
544+
"LoginNode"
545545
],
546546
"feature_conditions": []
547547
}

cookbooks/aws-parallelcluster-environment/recipes/finalize/finalize_check_update_systemd_service.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18-
return unless node['cluster']['node_type'] == 'ComputeFleet'
18+
return unless %w(ComputeFleet LoginNode).include?(node['cluster']['node_type'])
1919

2020
service 'pcluster-check-update.timer' do
2121
action [:enable, :start]

cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_efs.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,6 @@
8787
node['cluster']['internal_shared_dirs'].each do |dir|
8888
# Don't mount the login nodes shared dir to compute nodes
8989
next if node['cluster']['node_type'] == 'ComputeFleet' && dir == node['cluster']['shared_dir_login_nodes']
90-
next if node['cluster']['node_type'] == 'LoginNode' && dir == node['cluster']['shared_dir_compute']
9190
internal_shared_dir_array.push(dir)
9291
internal_efs_fs_id_array.push(initial_efs_fs_id_array[0])
9392
internal_efs_encryption_array.push(initial_efs_encryption_array[0])

cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,8 @@
7676
action :extra_configuration do
7777
case node['cluster']['node_type']
7878
when 'HeadNode'
79-
cookbook_file "#{node['cluster']['scripts_dir']}/share_compute_fleet_dna.py" do
80-
source 'cfn_hup_configuration/share_compute_fleet_dna.py'
79+
cookbook_file "#{node['cluster']['scripts_dir']}/share_dna.py" do
80+
source 'cfn_hup_configuration/share_dna.py'
8181
owner 'root'
8282
group 'root'
8383
mode '0700'
@@ -86,9 +86,9 @@
8686

8787
directory "#{node['cluster']['shared_dir']}/dna"
8888

89-
when 'ComputeFleet'
89+
when 'ComputeFleet', 'LoginNode'
9090
template "#{node['cluster']['scripts_dir']}/cfn-hup-update-action.sh" do
91-
source "cfn_hup_configuration/#{node['cluster']['node_type']}/cfn-hup-update-action.sh.erb"
91+
source 'cfn_hup_configuration/ComputeFleet/cfn-hup-update-action.sh.erb'
9292
owner 'root'
9393
group 'root'
9494
mode '0700'

cookbooks/aws-parallelcluster-environment/spec/unit/recipes/finalize_check_update_systemd_service_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
runner.converge(described_recipe)
2626
end
2727

28-
if node_type == 'ComputeFleet'
28+
if %w(ComputeFleet LoginNode).include?(node_type)
2929
it 'enables and starts the pcluster-check-update.timer service' do
3030
is_expected.to enable_service('pcluster-check-update.timer')
3131
is_expected.to start_service('pcluster-check-update.timer')

cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,10 @@ def self.configure(chef_run)
8787
})
8888
end
8989

90-
if %(ComputeFleet).include?(node_type)
90+
if %w(ComputeFleet LoginNode).include?(node_type)
9191
it "creates the file #{SCRIPT_DIR}/cfn-hup-update-action.sh" do
9292
is_expected.to create_template("#{SCRIPT_DIR}/cfn-hup-update-action.sh")
93-
.with(source: "cfn_hup_configuration/#{node_type}/cfn-hup-update-action.sh.erb")
93+
.with(source: 'cfn_hup_configuration/ComputeFleet/cfn-hup-update-action.sh.erb')
9494
.with(user: "root")
9595
.with(group: "root")
9696
.with(mode: "0700")
@@ -100,9 +100,9 @@ def self.configure(chef_run)
100100
})
101101
end
102102
elsif node_type == 'HeadNode'
103-
it "creates #{SCRIPT_DIR}/share_compute_fleet_dna.py" do
104-
is_expected.to create_if_missing_cookbook_file("#{SCRIPT_DIR}/share_compute_fleet_dna.py")
105-
.with(source: 'cfn_hup_configuration/share_compute_fleet_dna.py')
103+
it "creates #{SCRIPT_DIR}/share_dna.py" do
104+
is_expected.to create_if_missing_cookbook_file("#{SCRIPT_DIR}/share_dna.py")
105+
.with(source: 'cfn_hup_configuration/share_dna.py')
106106
.with(user: 'root')
107107
.with(group: 'root')
108108
.with(mode: '0700')

0 commit comments

Comments
 (0)