2424from botocore .config import Config
2525from retrying import retry
2626
# Root directory for the files this script reads and writes. The trailing slash is
# required: the paths below (and the per-launch-template dna output path) are built
# by plain string concatenation.
SHARED_LOCATION = "/opt/parallelcluster/shared/"

# Directory that receives the generated dna files; also the directory removed by --cleanup.
SHARED_DNA_LOCATION = SHARED_LOCATION + "dna/"

# JSON file listing the compute fleet launch templates (Queues -> ComputeResources).
LAUNCH_TEMPLATE_CONFIG = SHARED_LOCATION + "launch-templates-config.json"

# Default cluster configuration file used to discover LoginNodes pool names when
# --cluster-config is not passed on the command line.
CLUSTER_CONFIG_PATH = SHARED_LOCATION + "cluster-config.yaml"
3234
3335logger = logging .getLogger (__name__ )
3436logging .basicConfig (level = logging .INFO )
3537
3638
37- def get_compute_launch_template_ids (lt_config_file_name ):
39+ def get_launch_template_ids (lt_config_file_name ):
3840 """
3941 Load launch-templates-config.json.
4042
@@ -53,25 +55,13 @@ def get_compute_launch_template_ids(lt_config_file_name):
5355 }
5456 }
5557 }
56- },
57- "queue2": {
58- "ComputeResources": {
59- "queue2-i1": {
60- "LaunchTemplate": {
61- "Version": "1",
62- "LogicalId": "LaunchTemplate012345678901234",
63- "Id": "lt-01234567890123456"
64- }
65- }
66- }
6758 }
6859 }
6960 }
70-
7161 """
7262 lt_config = None
7363 try :
74- logger .info ("Getting LaunchTemplate ID and versions from %s" , lt_config_file_name )
64+ logger .info ("Getting LaunchTemplate IDs and versions from %s" , lt_config_file_name )
7565 with open (lt_config_file_name , "r" , encoding = "utf-8" ) as file :
7666 lt_config = json .loads (file .read ())
7767 except Exception as err :
@@ -80,15 +70,78 @@ def get_compute_launch_template_ids(lt_config_file_name):
8070 return lt_config
8171
8272
83- def share_compute_fleet_dna (args ):
84- """Create dna.json for each queue in cluster."""
85- lt_config = get_compute_launch_template_ids (COMPUTE_FLEET_LAUNCH_TEMPLATE_CONFIG )
def get_login_pool_names(cluster_config_path):
    """
    Read login pool names from cluster-config.yaml.

    Login launch templates are not listed in launch-templates-config.json because doing so
    would introduce a circular CloudFormation dependency between the head node launch
    template and the LoginNodes nested stack. Pool names are read from cluster-config.yaml
    and the corresponding launch templates are resolved at runtime by name.

    This is best effort: any failure to open or parse the file is logged as a warning and
    an empty list is returned. Malformed entries under LoginNodes/Pools are skipped
    individually so that one bad entry does not hide the remaining pools.

    :param cluster_config_path: path of the cluster configuration YAML file
    :return: list of the non-empty "Name" values found under LoginNodes -> Pools
    """
    pool_names = []

    # Only the file access and YAML parsing can fail for environmental reasons; keep the
    # catch-all limited to them so structural problems are handled explicitly below.
    try:
        logger.info("Reading login pool names from %s", cluster_config_path)
        with open(cluster_config_path, "r", encoding="utf-8") as file:
            cluster_config = yaml.safe_load(file)
    except Exception as err:  # noqa: BLE001 - best effort, the caller only needs a list
        logger.warning("Unable to read login pool names from %s due to %s", cluster_config_path, err)
        return pool_names

    if not cluster_config:
        # Empty file: nothing to report.
        return pool_names
    if not isinstance(cluster_config, dict):
        logger.warning("Unexpected content in %s: top-level mapping expected", cluster_config_path)
        return pool_names

    login_nodes = cluster_config.get("LoginNodes") or {}
    pools = (login_nodes.get("Pools") if isinstance(login_nodes, dict) else None) or []
    if not isinstance(pools, (list, tuple)):
        logger.warning("Unexpected LoginNodes/Pools section in %s: list expected", cluster_config_path)
        return pool_names

    for pool in pools:
        if not isinstance(pool, dict):
            logger.warning("Ignoring malformed login pool entry in %s: %s", cluster_config_path, pool)
            continue
        name = pool.get("Name")
        if name:
            pool_names.append(name)
    return pool_names
95+
96+
def share_dna_files(args):
    """
    Create dna.json for each compute resource and login pool in the cluster.

    Compute resources are taken from the "Queues" -> "ComputeResources" sections of the
    launch template config file and handed to get_latest_dna_data one by one. Login pools
    are processed only when args.stack_name is set; their names come from the cluster
    configuration file (args.cluster_config, or CLUSTER_CONFIG_PATH when not provided).

    :param args: parsed command line arguments; stack_name and cluster_config are read
                 here, and the whole object is forwarded to the per-resource helpers
    """
    lt_config = get_launch_template_ids(LAUNCH_TEMPLATE_CONFIG)
    if lt_config:
        # Compute fleet. A missing or null "Queues"/"ComputeResources" section is treated
        # as empty instead of raising, so that login pools below are still processed.
        for queue in (lt_config.get("Queues") or {}).values():
            for compute_res in (queue.get("ComputeResources") or {}).values():
                get_latest_dna_data(compute_res, SHARED_DNA_LOCATION, args)

    # Login nodes: launch templates are resolved at runtime by name
    # ({stack_name}-{pool_name}) to avoid a circular CloudFormation dependency between the
    # head node launch template and the LoginNodes nested stack.
    # NOTE(review): this section is assumed to be independent of lt_config (login launch
    # templates are not listed in that file) - confirm the intended nesting.
    if args.stack_name:
        for pool_name in get_login_pool_names(args.cluster_config or CLUSTER_CONFIG_PATH):
            share_login_pool_dna(args.stack_name, pool_name, SHARED_DNA_LOCATION, args)
114+
115+
def share_login_pool_dna(stack_name, pool_name, output_location, args):
    """
    Fetch the latest UserData for a login pool launch template and write its dna.json to shared storage.

    The launch template is looked up by the name "{stack_name}-{pool_name}". The pool is
    skipped (with a warning) when the UserData, its write directives, or the
    launch_template_id inside its dna.json cannot be obtained.

    :param stack_name: CloudFormation stack name of the cluster
    :param pool_name: login pool name as found in the cluster configuration
    :param output_location: path prefix; the launch_template_id is appended to it as-is
    :param args: parsed command line arguments; only args.region is read here
    """
    lt_name = f"{stack_name}-{pool_name}"

    user_data = get_user_data_by_name(lt_name, args.region)
    if not user_data:
        logger.warning("Skipping login pool %s: no UserData available for launch template %s", pool_name, lt_name)
        return

    write_directives = get_write_directives_section(user_data)
    if not write_directives:
        logger.warning("Skipping login pool %s: no write directives found in UserData of %s", pool_name, lt_name)
        return

    # The LogicalId used in the dna.json filename must match the launch_template_id baked into
    # the login node's own dna.json (read from UserData), since the login node looks up the
    # file by that name. Extract it from the UserData itself.
    logical_id = _extract_launch_template_id(write_directives, lt_name)
    if not logical_id:
        logger.warning("Skipping login pool %s: launch_template_id missing from UserData dna.json", pool_name)
        return

    write_dna_files(write_directives, output_location + logical_id)


def _extract_launch_template_id(write_directives, lt_name):
    """
    Return cluster.launch_template_id from the first /tmp/dna.json write directive.

    Returns None when there is no such directive, when its content cannot be parsed, or
    when the value is not a string. The value is concatenated to a path by the caller,
    so a non-string would otherwise raise and abort the remaining pools.
    """
    for entry in write_directives:
        if entry.get("path") != "/tmp/dna.json":  # nosec B108
            continue
        # Only the first matching directive is considered.
        try:
            dna = json.loads(entry["content"])
            logical_id = dna.get("cluster", {}).get("launch_template_id")
        except Exception as err:  # noqa: BLE001
            logger.warning("Unable to extract launch_template_id from dna.json for %s: %s", lt_name, err)
            return None
        return logical_id if isinstance(logical_id, str) else None
    return None
92145
93146
94147# FIXME: Fix Code Duplication
@@ -136,6 +189,30 @@ def get_user_data(lt_id, lt_version, region_name):
136189 return decoded_data
137190
138191
def get_user_data_by_name(lt_name, region_name):
    """
    Get UserData for the latest version of a Launch Template, looked up by name.

    The EC2 call is retried (see _describe_latest_launch_template_version). Any error that
    remains after the retries, or that happens while decoding, is logged and None is
    returned; this function does not raise.

    :param lt_name: launch template name
    :param region_name: AWS region used to build the EC2 client
    :return: the UserData decoded from base64 as a UTF-8 string, or None when the launch
             template has no versions, has no UserData, or an error occurred
    """
    decoded_data = None
    try:
        proxy_config = parse_proxy_config()
        ec2_client = boto3.client("ec2", region_name=region_name, config=proxy_config)
        versions = _describe_latest_launch_template_version(ec2_client, lt_name)
        if versions:
            encoded_data = (versions[0].get("LaunchTemplateData") or {}).get("UserData")
            if encoded_data:
                decoded_data = base64.b64decode(encoded_data, validate=True).decode("utf-8")
            else:
                logger.warning("Launch template %s has no UserData in its latest version", lt_name)
    except Exception as err:
        if hasattr(err, "message"):
            err = err.message
        logger.error("Unable to get UserData for launch template %s.\nException: %s", lt_name, err)

    return decoded_data


# The retry lives on this helper rather than on get_user_data_by_name: that function
# catches every exception, so a retry decorator placed on it would never see a failure
# and would never retry.
@retry(stop_max_attempt_number=5, wait_fixed=3000)
def _describe_latest_launch_template_version(ec2_client, lt_name):
    """Call DescribeLaunchTemplateVersions for $Latest; up to 5 attempts, 3 seconds apart."""
    logger.info("Running EC2 DescribeLaunchTemplateVersions API for %s ($Latest)", lt_name)
    return ec2_client.describe_launch_template_versions(
        LaunchTemplateName=lt_name,
        Versions=["$Latest"],
    ).get("LaunchTemplateVersions")
214+
215+
139216def get_write_directives_section (user_data ):
140217 """Get write_files section from cloud-config section of MIME formatted UserData."""
141218 write_directives_section = None
@@ -203,7 +280,9 @@ def cleanup(directory_loc):
203280def _parse_cli_args ():
204281 """Parse command line args."""
205282 parser = argparse .ArgumentParser (
206- description = "Get latest User Data from ComputeFleet Launch Templates." , exit_on_error = False
283+ description = "Get latest UserData from ComputeFleet and LoginNodes Launch Templates and "
284+ "share dna.json for each in shared storage." ,
285+ exit_on_error = False ,
207286 )
208287
209288 parser .add_argument (
@@ -223,6 +302,29 @@ def _parse_cli_args():
223302 help = "Cleanup DNA files created" ,
224303 )
225304
305+ parser .add_argument (
306+ "-s" ,
307+ "--stack-name" ,
308+ required = False ,
309+ type = str ,
310+ default = None ,
311+ help = (
312+ "CloudFormation stack name of the cluster. Required to resolve LoginNodes pool launch "
313+ "templates at runtime via DescribeLaunchTemplateVersions."
314+ ),
315+ )
316+
317+ parser .add_argument (
318+ "--cluster-config" ,
319+ required = False ,
320+ type = str ,
321+ default = None ,
322+ help = (
323+ "Path to cluster-config.yaml. Used to read LoginNodes pool names. "
324+ f"Defaults to { CLUSTER_CONFIG_PATH } ."
325+ ),
326+ )
327+
226328 args = parser .parse_args ()
227329
228330 return args
@@ -232,14 +334,14 @@ def main():
232334 try :
233335 args = _parse_cli_args ()
234336 if args .cleanup :
235- cleanup (COMPUTE_FLEET_SHARED_DNA_LOCATION )
337+ cleanup (SHARED_DNA_LOCATION )
236338 else :
237- share_compute_fleet_dna (args )
339+ share_dna_files (args )
238340 except Exception as err :
239341 if hasattr (err , "message" ):
240342 err = err .message
241343 logger .exception (
242- "Encountered exception when fetching latest dna.json for ComputeFleet , exiting gracefully: %s" , err
344+ "Encountered exception when fetching latest dna.json, exiting gracefully: %s" , err
243345 )
244346 raise SystemExit (0 )
245347
0 commit comments