77import copy
88import datetime
99import os
10+ import random
1011import time
1112from types import SimpleNamespace
1213from typing import Any , Dict , List , Optional , Tuple , TypeVar , Union
@@ -4543,9 +4544,12 @@ def _setup_azure_monitor_logs(self, mc: ManagedCluster) -> None:
45434544 if not workspace_resource_id :
45444545 ensure_workspace_func = (
45454546 self .context .external_functions .ensure_default_log_analytics_workspace_for_monitoring )
4546- # Retry with backoff to handle 409 Conflict when workspace is still provisioning
4547- # (common when parallel tests or commands target the same default workspace)
4548- for attempt in range (3 ):
4547+ # Retry with exponential backoff + jitter to handle 409 Conflict when
4548+ # workspace is still provisioning (common when parallel commands target
4549+ # the same default workspace). Base delays 5 s, 10 s, 20 s, each plus up to
4550+ # 50% jitter (worst-case total wait ~52.5 s; 5-7.5 s if the first retry succeeds).
4551+ max_attempts = 4
4552+ for attempt in range (max_attempts ):
45494553 try :
45504554 workspace_resource_id = ensure_workspace_func (
45514555 self .cmd ,
@@ -4554,8 +4558,10 @@ def _setup_azure_monitor_logs(self, mc: ManagedCluster) -> None:
45544558 )
45554559 break
45564560 except (HttpResponseError , ResourceExistsError ):
4557- if attempt < 2 :
4558- time .sleep (30 )
4561+ if attempt < max_attempts - 1 :
4562+ base_delay = 5 * (2 ** attempt ) # 5 s, 10 s, 20 s
4563+ jitter = random .uniform (0 , base_delay * 0.5 )
4564+ time .sleep (base_delay + jitter )
45594565 else :
45604566 raise
45614567
@@ -4582,26 +4588,9 @@ def _setup_azure_monitor_logs(self, mc: ManagedCluster) -> None:
45824588 }
45834589 mc .addon_profiles [CONST_MONITORING_ADDON_NAME ] = addon_profile
45844590
4585- # Create DCR before the cluster is created (matching base class build_monitoring_addon_profile pattern).
4586- # The DCRA will be created later in postprocessing_after_mc_created.
4587- self .context .external_functions .ensure_container_insights_for_monitoring (
4588- self .cmd ,
4589- addon_profile ,
4590- self .context .get_subscription_id (),
4591- self .context .get_resource_group_name (),
4592- self .context .get_name (),
4593- self .context .get_location (),
4594- remove_monitoring = False ,
4595- aad_route = self .context .get_enable_msi_auth_for_monitoring (),
4596- create_dcr = True ,
4597- create_dcra = False ,
4598- enable_syslog = self .context .get_enable_syslog (),
4599- data_collection_settings = self .context .get_data_collection_settings (),
4600- is_private_cluster = self .context .get_enable_private_cluster (),
4601- ampls_resource_id = self .context .get_ampls_resource_id (),
4602- enable_high_log_scale_mode = self .context .get_enable_high_log_scale_mode (),
4603- )
4604-
4591+ # DCR and DCRA creation is deferred to postprocessing_after_mc_created
4592+ # (_postprocess_monitoring_enable) so that all flags are finalized and
4593+ # the cluster exists. Only MSI clusters need a DCR.
46054594 self .context .set_intermediate ("monitoring_addon_enabled" , True , overwrite_exists = True )
46064595
46074596 def _setup_opentelemetry_metrics (self , mc : ManagedCluster ) -> None :
@@ -5465,7 +5454,7 @@ def _postprocess_monitoring_enable(self, cluster: ManagedCluster) -> None:
54655454 self .context .get_location (),
54665455 remove_monitoring = False ,
54675456 aad_route = self .context .get_enable_msi_auth_for_monitoring (),
5468- create_dcr = self . _is_cnl_or_hlsm_changing () ,
5457+ create_dcr = True ,
54695458 create_dcra = True ,
54705459 enable_syslog = self .context .get_enable_syslog (),
54715460 data_collection_settings = self .context .get_data_collection_settings (),
@@ -5776,17 +5765,9 @@ def update_monitoring_profile_flow_logs(self, mc: ManagedCluster) -> ManagedClus
57765765 if container_network_logs_enabled is not None :
57775766 if mc .addon_profiles :
57785767 addon_consts = self .context .get_addon_consts ()
5779- CONST_MONITORING_ADDON_NAME = addon_consts .get ("CONST_MONITORING_ADDON_NAME" )
5780- # Handle both "omsagent" and "omsAgent" key variants
5781- monitoring_addon_profile = (
5782- mc .addon_profiles .get (CONST_MONITORING_ADDON_NAME ) or
5783- mc .addon_profiles .get (CONST_MONITORING_ADDON_NAME_CAMELCASE )
5784- )
5768+ monitoring_addon_key = _get_monitoring_addon_key (mc , addon_consts )
5769+ monitoring_addon_profile = mc .addon_profiles .get (monitoring_addon_key )
57855770 if monitoring_addon_profile :
5786- monitoring_addon_key = (
5787- CONST_MONITORING_ADDON_NAME if CONST_MONITORING_ADDON_NAME in mc .addon_profiles
5788- else CONST_MONITORING_ADDON_NAME_CAMELCASE
5789- )
57905771 config = monitoring_addon_profile .config or {}
57915772 config ["enableRetinaNetworkFlags" ] = str (container_network_logs_enabled )
57925773 mc .addon_profiles [monitoring_addon_key ].config = config
@@ -7692,9 +7673,12 @@ def _setup_azure_monitor_logs(self, mc: ManagedCluster) -> None:
76927673 if not workspace_resource_id :
76937674 ensure_workspace_func = (
76947675 self .context .external_functions .ensure_default_log_analytics_workspace_for_monitoring )
7695- # Retry with backoff to handle 409 Conflict when workspace is still provisioning
7696- # (common when parallel tests or commands target the same default workspace)
7697- for attempt in range (3 ):
7676+ # Retry with exponential backoff + jitter to handle 409 Conflict when
7677+ # workspace is still provisioning (common when parallel commands target
7678+ # the same default workspace). Base delays 5 s, 10 s, 20 s, each plus up to
7679+ # 50% jitter (worst-case total wait ~52.5 s; 5-7.5 s if the first retry succeeds).
7680+ max_attempts = 4
7681+ for attempt in range (max_attempts ):
76987682 try :
76997683 workspace_resource_id = ensure_workspace_func (
77007684 self .cmd ,
@@ -7703,8 +7687,10 @@ def _setup_azure_monitor_logs(self, mc: ManagedCluster) -> None:
77037687 )
77047688 break
77057689 except (HttpResponseError , ResourceExistsError ):
7706- if attempt < 2 :
7707- time .sleep (30 )
7690+ if attempt < max_attempts - 1 :
7691+ base_delay = 5 * (2 ** attempt ) # 5 s, 10 s, 20 s
7692+ jitter = random .uniform (0 , base_delay * 0.5 )
7693+ time .sleep (base_delay + jitter )
77087694 else :
77097695 raise
77107696
0 commit comments