Skip to content

Commit d1764bc

Browse files
committed
{AKS} Add provisioningState retry logic with two-phase check validation
Add opt-in retry logic to the Azure CLI test SDK for handling provisioningState race conditions caused by external modifications (e.g., Azure Policy) during live tests. When enabled via AZURE_CLI_TEST_RETRY_PROVISIONING_CHECK=true, the test framework's cmd() method uses a two-phase check approach: Phase 1: Validate provisioningState == Succeeded. If not, poll the resource via 'az resource show --ids' with exponential backoff until it reaches a terminal state (Succeeded, Failed, or Canceled). Phase 2: Validate all remaining checks against the original command result, ensuring the operation itself produced correct output. Features: - Opt-in via environment variable (zero risk to existing tests) - Only activates in live mode (playback tests unaffected) - Exponential backoff with jitter (configurable via env vars) - ETag change detection for external modification warnings - Terminal state short-circuit (Failed/Canceled stop polling) - TimeoutError after configurable max retries
1 parent 81a15e1 commit d1764bc

File tree

2 files changed

+534
-0
lines changed

2 files changed

+534
-0
lines changed

src/azure-cli-testsdk/azure/cli/testsdk/base.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import inspect
1111
import unittest
1212
import tempfile
13+
import time
14+
import random
1315

1416
from .scenario_tests import (IntegrationTestBase, ReplayableTest, SubscriptionRecordingProcessor,
1517
LargeRequestBodyProcessor,
@@ -78,6 +80,127 @@ def is_empty(self): # pylint: disable=no-self-use
7880
from azure.cli.testsdk.checkers import NoneCheck
7981
return NoneCheck()
8082

83+
@staticmethod
84+
def _is_provisioning_state_check(check):
85+
"""Return True if *check* is a JMESPathCheck asserting provisioningState == 'Succeeded'."""
86+
from azure.cli.testsdk.checkers import JMESPathCheck
87+
if not isinstance(check, JMESPathCheck):
88+
return False
89+
query = getattr(check, '_query', '')
90+
if not isinstance(query, str):
91+
return False
92+
query = query.lower()
93+
is_provisioning_state = query == 'provisioningstate' or query.endswith('.provisioningstate')
94+
return is_provisioning_state and getattr(check, '_expected_result', '') == 'Succeeded'
95+
96+
def _should_retry_for_provisioning_state(self, checks):
97+
"""Check if any JMESPathCheck asserts provisioningState == 'Succeeded'."""
98+
env_val = os.environ.get('AZURE_CLI_TEST_RETRY_PROVISIONING_CHECK', 'false').lower()
99+
if not checks or env_val != 'true':
100+
return False
101+
102+
checks_list = checks if isinstance(checks, list) else [checks]
103+
104+
for check in checks_list:
105+
if self._is_provisioning_state_check(check):
106+
return True
107+
return False
108+
109+
def _cmd_with_retry(self, command, checks, cli_ctx):
110+
"""Execute command with two-phase check validation.
111+
112+
Phase 1: Validate provisioningState == Succeeded, retrying with poll if needed.
113+
Phase 2: Validate all remaining checks on the original result.
114+
115+
Uses etag to detect external modifications (e.g. Azure Policy).
116+
"""
117+
import jmespath
118+
from azure.cli.testsdk.exceptions import JMESPathCheckAssertionError
119+
120+
max_retries = int(os.environ.get('AZURE_CLI_TEST_MAX_RETRIES', '10'))
121+
base_delay = int(os.environ.get('AZURE_CLI_TEST_BASE_DELAY', '2'))
122+
max_delay = int(os.environ.get('AZURE_CLI_TEST_MAX_DELAY', '60'))
123+
124+
# Split checks into Phase 1 (provisioningState) and Phase 2 (everything else)
125+
checks_list = checks if isinstance(checks, list) else [checks]
126+
ps_checks = []
127+
other_checks = []
128+
for c in checks_list:
129+
if self._is_provisioning_state_check(c):
130+
ps_checks.append(c)
131+
else:
132+
other_checks.append(c)
133+
134+
# Execute the original command once
135+
result = execute(cli_ctx, command, expect_failure=False)
136+
137+
# Phase 1: Is the resource ready?
138+
try:
139+
result.assert_with_checks(ps_checks)
140+
except JMESPathCheckAssertionError:
141+
142+
# Extract resource id and etag for polling
143+
try:
144+
json_value = result.get_output_in_json()
145+
resource_id = jmespath.search('id', json_value)
146+
original_etag = jmespath.search('etag', json_value) or \
147+
jmespath.search('properties.etag', json_value)
148+
except (KeyError, TypeError, ValueError, AttributeError):
149+
resource_id = None
150+
original_etag = None
151+
152+
if not resource_id:
153+
raise
154+
155+
# Poll with generic ARM GET until provisioningState is terminal
156+
poll_command = 'resource show --ids {}'.format(resource_id)
157+
actual_state = None
158+
current_etag = None
159+
160+
logger.warning(
161+
"provisioningState was not 'Succeeded' for resource '%s'. "
162+
"Polling with '%s' (max %d retries)...",
163+
resource_id, poll_command, max_retries)
164+
165+
for attempt in range(max_retries):
166+
delay = min(base_delay * (2 ** attempt) + random.uniform(0, 1), max_delay)
167+
time.sleep(delay)
168+
169+
poll_result = execute(cli_ctx, poll_command, expect_failure=False)
170+
171+
try:
172+
poll_json = poll_result.get_output_in_json()
173+
actual_state = jmespath.search('properties.provisioningState', poll_json)
174+
current_etag = jmespath.search('etag', poll_json)
175+
except (KeyError, TypeError, ValueError, AttributeError):
176+
actual_state = None
177+
current_etag = None
178+
179+
if original_etag and current_etag and current_etag != original_etag:
180+
logger.warning(
181+
"ETag changed ('%s' -> '%s'): resource modified externally "
182+
"(likely Azure Policy). Waiting for it to complete...",
183+
original_etag[:16], current_etag[:16])
184+
185+
if actual_state == 'Succeeded':
186+
break
187+
188+
if actual_state in ('Failed', 'Canceled'):
189+
raise AssertionError(
190+
"Resource '{}' reached terminal state '{}' after external modification.".format(
191+
resource_id, actual_state))
192+
else:
193+
raise TimeoutError(
194+
"Resource '{}' did not reach 'Succeeded' after {} retries. "
195+
"Last state: '{}'. Original ETag: '{}', Current ETag: '{}'.".format(
196+
resource_id, max_retries, actual_state, original_etag, current_etag))
197+
198+
# Phase 2: Validate the operation result
199+
if other_checks:
200+
result.assert_with_checks(other_checks)
201+
202+
return result
203+
81204

82205
class ScenarioTest(ReplayableTest, CheckerMixin, unittest.TestCase):
83206
def __init__(self, method_name, config_file=None, recording_name=None,
@@ -174,6 +297,9 @@ def create_guid(self):
174297

175298
def cmd(self, command, checks=None, expect_failure=False):
176299
command = self._apply_kwargs(command)
300+
# Only retry in live mode — playback recordings have a fixed HTTP sequence
301+
if self.is_live and not expect_failure and self._should_retry_for_provisioning_state(checks):
302+
return self._cmd_with_retry(command, checks, self.cli_ctx)
177303
return execute(self.cli_ctx, command, expect_failure=expect_failure).assert_with_checks(checks)
178304

179305
def get_subscription_id(self):
@@ -230,6 +356,8 @@ def setUp(self):
230356

231357
def cmd(self, command, checks=None, expect_failure=False):
232358
command = self._apply_kwargs(command)
359+
if not expect_failure and self._should_retry_for_provisioning_state(checks):
360+
return self._cmd_with_retry(command, checks, self.cli_ctx)
233361
return execute(self.cli_ctx, command, expect_failure=expect_failure).assert_with_checks(checks)
234362

235363
def get_subscription_id(self):

0 commit comments

Comments
 (0)