Skip to content

Commit 3e818f4

Browse files
committed
PR comments + tests
1 parent 05523c1 commit 3e818f4

3 files changed

Lines changed: 642 additions & 51 deletions

File tree

src/aks-preview/azext_aks_preview/managed_cluster_decorator.py

Lines changed: 27 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import copy
88
import datetime
99
import os
10+
import random
1011
import time
1112
from types import SimpleNamespace
1213
from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
@@ -4543,9 +4544,12 @@ def _setup_azure_monitor_logs(self, mc: ManagedCluster) -> None:
45434544
if not workspace_resource_id:
45444545
ensure_workspace_func = (
45454546
self.context.external_functions.ensure_default_log_analytics_workspace_for_monitoring)
4546-
# Retry with backoff to handle 409 Conflict when workspace is still provisioning
4547-
# (common when parallel tests or commands target the same default workspace)
4548-
for attempt in range(3):
4547+
# Retry with exponential backoff + jitter to handle 409 Conflict when
4548+
# workspace is still provisioning (common when parallel commands target
4549+
# the same default workspace). Base delays: 5 s, 10 s, 20 s, each plus up
4550+
# to 50% jitter (worst-case total ~52.5 s; 5-7.5 s if the first retry succeeds).
4551+
max_attempts = 4
4552+
for attempt in range(max_attempts):
45494553
try:
45504554
workspace_resource_id = ensure_workspace_func(
45514555
self.cmd,
@@ -4554,8 +4558,10 @@ def _setup_azure_monitor_logs(self, mc: ManagedCluster) -> None:
45544558
)
45554559
break
45564560
except (HttpResponseError, ResourceExistsError):
4557-
if attempt < 2:
4558-
time.sleep(30)
4561+
if attempt < max_attempts - 1:
4562+
base_delay = 5 * (2 ** attempt) # 5 s, 10 s, 20 s
4563+
jitter = random.uniform(0, base_delay * 0.5)
4564+
time.sleep(base_delay + jitter)
45594565
else:
45604566
raise
45614567

@@ -4582,26 +4588,9 @@ def _setup_azure_monitor_logs(self, mc: ManagedCluster) -> None:
45824588
}
45834589
mc.addon_profiles[CONST_MONITORING_ADDON_NAME] = addon_profile
45844590

4585-
# Create DCR before the cluster is created (matching base class build_monitoring_addon_profile pattern).
4586-
# The DCRA will be created later in postprocessing_after_mc_created.
4587-
self.context.external_functions.ensure_container_insights_for_monitoring(
4588-
self.cmd,
4589-
addon_profile,
4590-
self.context.get_subscription_id(),
4591-
self.context.get_resource_group_name(),
4592-
self.context.get_name(),
4593-
self.context.get_location(),
4594-
remove_monitoring=False,
4595-
aad_route=self.context.get_enable_msi_auth_for_monitoring(),
4596-
create_dcr=True,
4597-
create_dcra=False,
4598-
enable_syslog=self.context.get_enable_syslog(),
4599-
data_collection_settings=self.context.get_data_collection_settings(),
4600-
is_private_cluster=self.context.get_enable_private_cluster(),
4601-
ampls_resource_id=self.context.get_ampls_resource_id(),
4602-
enable_high_log_scale_mode=self.context.get_enable_high_log_scale_mode(),
4603-
)
4604-
4591+
# DCR and DCRA creation is deferred to postprocessing_after_mc_created
4592+
# (_postprocess_monitoring_enable) so that all flags are finalized and
4593+
# the cluster exists. Only MSI clusters need a DCR.
46054594
self.context.set_intermediate("monitoring_addon_enabled", True, overwrite_exists=True)
46064595

46074596
def _setup_opentelemetry_metrics(self, mc: ManagedCluster) -> None:
@@ -5465,7 +5454,7 @@ def _postprocess_monitoring_enable(self, cluster: ManagedCluster) -> None:
54655454
self.context.get_location(),
54665455
remove_monitoring=False,
54675456
aad_route=self.context.get_enable_msi_auth_for_monitoring(),
5468-
create_dcr=self._is_cnl_or_hlsm_changing(),
5457+
create_dcr=True,
54695458
create_dcra=True,
54705459
enable_syslog=self.context.get_enable_syslog(),
54715460
data_collection_settings=self.context.get_data_collection_settings(),
@@ -5776,17 +5765,9 @@ def update_monitoring_profile_flow_logs(self, mc: ManagedCluster) -> ManagedClus
57765765
if container_network_logs_enabled is not None:
57775766
if mc.addon_profiles:
57785767
addon_consts = self.context.get_addon_consts()
5779-
CONST_MONITORING_ADDON_NAME = addon_consts.get("CONST_MONITORING_ADDON_NAME")
5780-
# Handle both "omsagent" and "omsAgent" key variants
5781-
monitoring_addon_profile = (
5782-
mc.addon_profiles.get(CONST_MONITORING_ADDON_NAME) or
5783-
mc.addon_profiles.get(CONST_MONITORING_ADDON_NAME_CAMELCASE)
5784-
)
5768+
monitoring_addon_key = _get_monitoring_addon_key(mc, addon_consts)
5769+
monitoring_addon_profile = mc.addon_profiles.get(monitoring_addon_key)
57855770
if monitoring_addon_profile:
5786-
monitoring_addon_key = (
5787-
CONST_MONITORING_ADDON_NAME if CONST_MONITORING_ADDON_NAME in mc.addon_profiles
5788-
else CONST_MONITORING_ADDON_NAME_CAMELCASE
5789-
)
57905771
config = monitoring_addon_profile.config or {}
57915772
config["enableRetinaNetworkFlags"] = str(container_network_logs_enabled)
57925773
mc.addon_profiles[monitoring_addon_key].config = config
@@ -7692,9 +7673,12 @@ def _setup_azure_monitor_logs(self, mc: ManagedCluster) -> None:
76927673
if not workspace_resource_id:
76937674
ensure_workspace_func = (
76947675
self.context.external_functions.ensure_default_log_analytics_workspace_for_monitoring)
7695-
# Retry with backoff to handle 409 Conflict when workspace is still provisioning
7696-
# (common when parallel tests or commands target the same default workspace)
7697-
for attempt in range(3):
7676+
# Retry with exponential backoff + jitter to handle 409 Conflict when
7677+
# workspace is still provisioning (common when parallel commands target
7678+
# the same default workspace). Base delays: 5 s, 10 s, 20 s, each plus up
7679+
# to 50% jitter (worst-case total ~52.5 s; 5-7.5 s if the first retry succeeds).
7680+
max_attempts = 4
7681+
for attempt in range(max_attempts):
76987682
try:
76997683
workspace_resource_id = ensure_workspace_func(
77007684
self.cmd,
@@ -7703,8 +7687,10 @@ def _setup_azure_monitor_logs(self, mc: ManagedCluster) -> None:
77037687
)
77047688
break
77057689
except (HttpResponseError, ResourceExistsError):
7706-
if attempt < 2:
7707-
time.sleep(30)
7690+
if attempt < max_attempts - 1:
7691+
base_delay = 5 * (2 ** attempt) # 5 s, 10 s, 20 s
7692+
jitter = random.uniform(0, base_delay * 0.5)
7693+
time.sleep(base_delay + jitter)
77087694
else:
77097695
raise
77107696

src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19141,6 +19141,53 @@ def test_aks_update_standalone_enable_high_log_scale_mode(
1914119141
checks=[self.is_empty()],
1914219142
)
1914319143

19144+
@AllowLargeResponse()
19145+
@AKSCustomResourceGroupPreparer(
19146+
random_name_length=17,
19147+
name_prefix="clitest",
19148+
location="westus2",
19149+
)
19150+
def test_aks_update_disable_hlsm_error_when_cnl_enabled(
19151+
self, resource_group, resource_group_location
19152+
):
19153+
"""Test that disabling --enable-high-log-scale-mode raises an error
19154+
when container network logs are already enabled on the cluster."""
19155+
self.test_resources_count = 0
19156+
aks_name = self.create_random_name("cliakstest", 16)
19157+
self.kwargs.update(
19158+
{
19159+
"resource_group": resource_group,
19160+
"name": aks_name,
19161+
"ssh_key_value": self.generate_ssh_keys(),
19162+
"location": resource_group_location,
19163+
}
19164+
)
19165+
19166+
# Create cluster with monitoring + CNL (no explicit HLSM flag is passed; presumably CNL turns HLSM on implicitly - TODO confirm)
19167+
create_cmd = (
19168+
"aks create --resource-group={resource_group} --name={name} --location={location} "
19169+
"--ssh-key-value={ssh_key_value} --node-count=1 "
19170+
"--enable-azure-monitor-logs --enable-managed-identity "
19171+
"--enable-acns --enable-container-network-logs --output=json"
19172+
)
19173+
self.cmd(create_cmd, checks=[
19174+
self.check("provisioningState", "Succeeded"),
19175+
])
19176+
19177+
# Attempt to disable HLSM while CNL is still enabled — should fail
19178+
disable_cmd = (
19179+
"aks update --resource-group={resource_group} --name={name} --yes "
19180+
"--enable-high-log-scale-mode false --output=json"
19181+
)
19182+
with self.assertRaisesRegex(Exception, "container network logs"):
19183+
self.cmd(disable_cmd)
19184+
19185+
# delete
19186+
self.cmd(
19187+
"aks delete -g {resource_group} -n {name} --yes --no-wait",
19188+
checks=[self.is_empty()],
19189+
)
19190+
1914419191
@AllowLargeResponse()
1914519192
@AKSCustomResourceGroupPreparer(
1914619193
random_name_length=17,

0 commit comments

Comments
 (0)