Skip to content

Commit 1d58841

Browse files
saifaldin14, Saif Al-Din Ali, and Copilot
authored
[MIGRATE] Fix edge case bugs with az migrate local replication init & new commands (#9675)
* Create extension * Update src/migrate/azext_migrate/__init__.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Fix import issues * Update src/migrate/setup.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Small * Small lint * Small * disable lint for this check * Add json * Fix license issue * fix small * Small * Get rid of unused variables * Add service name and code owner * New version * Style * Small * Update * Follow standard * Add suggestions * Small * Not preview * Add flag to become experimental * Update history * Fix * small * Create get job and remove replication commands * Revert "Create get job and remove replication commands" This reverts commit 0f7acb7. * Update version * Sync with other branch * Fix error in init command * Remove duplicate files * Fix local issues * Select correct fabric * Process project name --------- Co-authored-by: Saif Al-Din Ali <saifaldinali@microsoft.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 2286135 commit 1d58841

File tree

8 files changed

+286
-59
lines changed

8 files changed

+286
-59
lines changed

src/migrate/HISTORY.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
33
Release History
44
===============
5+
3.0.0b4
6+
+++++++++++++++
7+
* Fix edge case bugs in az migrate local replication init & new commands.
58

69
3.0.0b3
710
+++++++++++++++

src/migrate/azext_migrate/custom.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def new_local_server_replication(cmd,
172172
subscription_id = get_subscription_id(cmd.cli_ctx)
173173
print(f"Selected Subscription Id: '{subscription_id}'")
174174

175-
rg_uri, machine_id, subscription_id = validate_server_parameters(
175+
rg_uri, machine_id, subscription_id, project_name = validate_server_parameters(
176176
cmd,
177177
machine_id,
178178
machine_index,
@@ -239,7 +239,7 @@ def new_local_server_replication(cmd,
239239
"Please verify your appliance setup and provided "
240240
"-machine_id.")
241241

242-
amh_solution, migrate_project, machine_props = process_amh_solution(
242+
amh_solution, migrate_project, machine_props, project_name = process_amh_solution(
243243
cmd,
244244
machine,
245245
site_object,

src/migrate/azext_migrate/helpers/replication/init/_setup_extension.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,8 @@ def build_extension_body(instance_type, source_fabric_id,
260260

261261

262262
def _wait_for_extension_creation(cmd, extension_uri):
263-
"""Wait for extension creation to complete."""
263+
"""Wait for extension creation to complete. Returns final state."""
264+
ext_state = None
264265
for i in range(20):
265266
time.sleep(30)
266267
try:
@@ -277,6 +278,7 @@ def _wait_for_extension_creation(cmd, extension_uri):
277278
break
278279
except CLIError:
279280
print(f"Waiting for extension... ({i + 1}/20)")
281+
return ext_state
280282

281283

282284
def _handle_extension_creation_error(cmd, extension_uri, create_error):
@@ -315,7 +317,20 @@ def create_replication_extension(cmd, extension_uri, extension_body):
315317
print("Extension creation initiated successfully")
316318
# Wait for the extension to be created
317319
print("Waiting for extension creation to complete...")
318-
_wait_for_extension_creation(cmd, extension_uri)
320+
ext_state = _wait_for_extension_creation(cmd, extension_uri)
321+
if ext_state == ProvisioningState.Failed.value:
322+
raise CLIError(
323+
"Replication extension creation failed. "
324+
"Check the extension resource in the Azure portal "
325+
"for detailed error information.")
326+
if ext_state == ProvisioningState.Canceled.value:
327+
raise CLIError(
328+
"Replication extension creation was canceled.")
329+
if ext_state is None:
330+
raise CLIError(
331+
"Replication extension creation timed out after "
332+
"10 minutes. Check the extension status in the "
333+
"Azure portal.")
319334
except CLIError as create_error:
320335
_handle_extension_creation_error(cmd, extension_uri, create_error)
321336

src/migrate/azext_migrate/helpers/replication/init/_setup_policy.py

Lines changed: 64 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -101,13 +101,22 @@ def find_fabric(all_fabrics, appliance_name, fabric_instance_type,
101101
})
102102

103103
if is_succeeded and is_correct_instance and name_matches:
104-
# If solution doesn't match, log warning but still consider it
105-
if not is_correct_solution:
106-
logger.warning(
107-
"Fabric '%s' matches name and type but has "
108-
"different solution ID", fabric_name)
109-
fabric = candidate
110-
break
104+
if is_correct_solution:
105+
# Perfect match - use it immediately
106+
fabric = candidate
107+
break
108+
# Name/type match but wrong solution ID - keep as fallback
109+
if not fabric:
110+
fabric = candidate
111+
112+
if fabric:
113+
fabric_props = fabric.get('properties', {}).get('customProperties', {})
114+
fabric_sol_id = fabric_props.get('migrationSolutionId', '').rstrip('/')
115+
expected_sol_id = amh_solution.get('id', '').rstrip('/')
116+
if fabric_sol_id.lower() != expected_sol_id.lower():
117+
logger.warning(
118+
"Fabric '%s' matches name and type but has "
119+
"different solution ID", fabric.get('name'))
111120

112121
if not fabric:
113122
appliance_type_label = "source" if is_source else "target"
@@ -170,6 +179,7 @@ def find_fabric(all_fabrics, appliance_name, fabric_instance_type,
170179
def get_fabric_agent(cmd, replication_fabrics_uri, fabric, appliance_name,
171180
fabric_instance_type):
172181
"""Get and validate fabric agent (DRA) for the given fabric."""
182+
logger = get_logger(__name__)
173183
fabric_name = fabric.get('name')
174184
dras_uri = (
175185
f"{replication_fabrics_uri}/{fabric_name}"
@@ -180,18 +190,59 @@ def get_fabric_agent(cmd, replication_fabrics_uri, fabric, appliance_name,
180190
dras = dras_response.json().get('value', [])
181191

182192
dra = None
193+
found_but_not_responsive = None
183194
for candidate in dras:
184195
props = candidate.get('properties', {})
185196
custom_props = props.get('customProperties', {})
186-
if (props.get('machineName') == appliance_name and
187-
custom_props.get('instanceType') == fabric_instance_type and
188-
bool(props.get('isResponsive'))):
189-
dra = candidate
190-
break
197+
machine_name = props.get('machineName', '')
198+
if (machine_name.lower() == appliance_name.lower() and
199+
custom_props.get('instanceType') == fabric_instance_type):
200+
if bool(props.get('isResponsive')):
201+
dra = candidate
202+
break
203+
found_but_not_responsive = candidate
204+
205+
# Accept a non-responsive DRA if it's the only match and is provisioned
206+
if not dra and found_but_not_responsive:
207+
nr_props = found_but_not_responsive.get('properties', {})
208+
last_heartbeat = nr_props.get('lastHeartbeat', 'unknown')
209+
if (nr_props.get('provisioningState') ==
210+
ProvisioningState.Succeeded.value):
211+
logger.warning(
212+
"The appliance '%s' DRA is not responsive "
213+
"(last heartbeat: %s). Proceeding since provisioning "
214+
"state is 'Succeeded'.",
215+
appliance_name, last_heartbeat)
216+
dra = found_but_not_responsive
217+
else:
218+
raise CLIError(
219+
f"The appliance '{appliance_name}' is in a "
220+
f"disconnected state (last heartbeat: {last_heartbeat}, "
221+
f"provisioningState: "
222+
f"{nr_props.get('provisioningState')})."
223+
)
191224

192225
if not dra:
226+
# Log available DRAs for diagnostics
227+
if dras:
228+
logger.warning(
229+
"No matching fabric agent found for appliance '%s' "
230+
"(expected instanceType '%s'). Available agents:",
231+
appliance_name, fabric_instance_type)
232+
for candidate in dras:
233+
props = candidate.get('properties', {})
234+
custom_props = props.get('customProperties', {})
235+
logger.warning(
236+
" - machineName: '%s', instanceType: '%s', "
237+
"isResponsive: %s",
238+
props.get('machineName'),
239+
custom_props.get('instanceType'),
240+
props.get('isResponsive'))
241+
193242
raise CLIError(
194-
f"The appliance '{appliance_name}' is in a disconnected state."
243+
f"No fabric agent found for appliance '{appliance_name}' "
244+
f"on fabric '{fabric_name}'. Verify that the appliance is "
245+
f"properly registered and connected."
195246
)
196247

197248
return dra

src/migrate/azext_migrate/helpers/replication/new/_process_inputs.py

Lines changed: 96 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ def process_amh_solution(cmd,
228228
f"'{resource_group_name}' and project '{project_name}'. "
229229
"Please verify your appliance setup."
230230
)
231-
return amh_solution, migrate_project, machine_props
231+
return amh_solution, migrate_project, machine_props, project_name
232232

233233

234234
def process_replication_vault(cmd,
@@ -482,15 +482,23 @@ def _process_source_fabrics(all_fabrics,
482482
})
483483

484484
if is_succeeded and is_correct_instance and name_matches:
485-
# If solution doesn't match, log warning but still consider it
486-
if not is_correct_solution:
487-
logger.warning(
488-
"Fabric '%s' matches name and type but has different "
489-
"solution ID",
490-
fabric_name
491-
)
492-
source_fabric = fabric
493-
break
485+
if is_correct_solution:
486+
source_fabric = fabric
487+
break
488+
if not source_fabric:
489+
source_fabric = fabric
490+
491+
if source_fabric:
492+
sf_props = source_fabric.get('properties', {}).get(
493+
'customProperties', {})
494+
sf_sol_id = sf_props.get('migrationSolutionId', '').rstrip('/')
495+
exp_sol_id = amh_solution.get('id', '').rstrip('/')
496+
if sf_sol_id.lower() != exp_sol_id.lower():
497+
logger.warning(
498+
"Fabric '%s' matches name and type but has different "
499+
"solution ID",
500+
source_fabric.get('name'))
501+
494502
return source_fabric, source_fabric_candidates
495503

496504

@@ -679,12 +687,22 @@ def _process_target_fabrics(all_fabrics,
679687
})
680688

681689
if is_succeeded and is_correct_instance and name_matches:
682-
if not is_correct_solution:
683-
logger.warning(
684-
"Fabric '%s' matches name and type but has different "
685-
"solution ID", fabric_name)
686-
target_fabric = fabric
687-
break
690+
if is_correct_solution:
691+
target_fabric = fabric
692+
break
693+
if not target_fabric:
694+
target_fabric = fabric
695+
696+
if target_fabric:
697+
tf_props = target_fabric.get('properties', {}).get(
698+
'customProperties', {})
699+
tf_sol_id = tf_props.get('migrationSolutionId', '').rstrip('/')
700+
exp_sol_id = amh_solution.get('id', '').rstrip('/')
701+
if tf_sol_id.lower() != exp_sol_id.lower():
702+
logger.warning(
703+
"Fabric '%s' matches name and type but has different "
704+
"solution ID", target_fabric.get('name'))
705+
688706
return target_fabric, target_fabric_candidates, \
689707
target_fabric_instance_type
690708

@@ -731,28 +749,48 @@ def process_target_fabric(cmd,
731749
amh_solution):
732750
# Get source fabric agent (DRA)
733751
source_fabric_name = source_fabric.get('name')
734-
dras_uri = (
752+
source_dras = send_get_request(
753+
cmd,
735754
f"{rg_uri}/providers/Microsoft.DataReplication"
736755
f"/replicationFabrics/{source_fabric_name}/fabricAgents"
737756
f"?api-version={APIVersion.Microsoft_DataReplication.value}"
738-
)
739-
source_dras_response = send_get_request(cmd, dras_uri)
740-
source_dras = source_dras_response.json().get('value', [])
757+
).json().get('value', [])
741758

742759
source_dra = None
760+
source_found_not_responsive = None
743761
for dra in source_dras:
744762
props = dra.get('properties', {})
745763
custom_props = props.get('customProperties', {})
746-
if (props.get('machineName') == source_appliance_name and
747-
custom_props.get('instanceType') == fabric_instance_type and
748-
bool(props.get('isResponsive'))):
749-
source_dra = dra
750-
break
764+
machine_name = props.get('machineName', '')
765+
if (machine_name.lower() == source_appliance_name.lower() and
766+
custom_props.get('instanceType') == fabric_instance_type):
767+
if bool(props.get('isResponsive')):
768+
source_dra = dra
769+
break
770+
source_found_not_responsive = dra
771+
772+
if not source_dra and source_found_not_responsive:
773+
nr_props = source_found_not_responsive.get('properties', {})
774+
last_hb = nr_props.get('lastHeartbeat', 'unknown')
775+
if (nr_props.get('provisioningState') ==
776+
ProvisioningState.Succeeded.value):
777+
logger.warning(
778+
"The source appliance '%s' DRA is not responsive "
779+
"(last heartbeat: %s). Proceeding since provisioning "
780+
"state is 'Succeeded'.",
781+
source_appliance_name, last_hb)
782+
source_dra = source_found_not_responsive
783+
else:
784+
raise CLIError(
785+
f"The source appliance '{source_appliance_name}' is in a "
786+
f"disconnected state (last heartbeat: {last_hb}).")
751787

752788
if not source_dra:
753789
raise CLIError(
754-
f"The source appliance '{source_appliance_name}' is in a "
755-
f"disconnected state.")
790+
f"No fabric agent found for source appliance "
791+
f"'{source_appliance_name}' on fabric "
792+
f"'{source_fabric_name}'. Verify that the appliance is "
793+
f"properly registered and connected.")
756794

757795
target_fabric, target_fabric_candidates, \
758796
target_fabric_instance_type = _process_target_fabrics(
@@ -769,28 +807,48 @@ def process_target_fabric(cmd,
769807

770808
# Get target fabric agent (DRA)
771809
target_fabric_name = target_fabric.get('name')
772-
target_dras_uri = (
810+
target_dras = send_get_request(
811+
cmd,
773812
f"{rg_uri}/providers/Microsoft.DataReplication"
774813
f"/replicationFabrics/{target_fabric_name}/fabricAgents"
775814
f"?api-version={APIVersion.Microsoft_DataReplication.value}"
776-
)
777-
target_dras_response = send_get_request(cmd, target_dras_uri)
778-
target_dras = target_dras_response.json().get('value', [])
815+
).json().get('value', [])
779816

780817
target_dra = None
818+
target_found_not_responsive = None
781819
for dra in target_dras:
782820
props = dra.get('properties', {})
783821
custom_props = props.get('customProperties', {})
784-
if (props.get('machineName') == target_appliance_name and
822+
machine_name = props.get('machineName', '')
823+
if (machine_name.lower() == target_appliance_name.lower() and
785824
custom_props.get('instanceType') ==
786-
target_fabric_instance_type and
787-
bool(props.get('isResponsive'))):
788-
target_dra = dra
789-
break
825+
target_fabric_instance_type):
826+
if bool(props.get('isResponsive')):
827+
target_dra = dra
828+
break
829+
target_found_not_responsive = dra
830+
831+
if not target_dra and target_found_not_responsive:
832+
nr_props = target_found_not_responsive.get('properties', {})
833+
last_hb = nr_props.get('lastHeartbeat', 'unknown')
834+
if (nr_props.get('provisioningState') ==
835+
ProvisioningState.Succeeded.value):
836+
logger.warning(
837+
"The target appliance '%s' DRA is not responsive "
838+
"(last heartbeat: %s). Proceeding since provisioning "
839+
"state is 'Succeeded'.",
840+
target_appliance_name, last_hb)
841+
target_dra = target_found_not_responsive
842+
else:
843+
raise CLIError(
844+
f"The target appliance '{target_appliance_name}' is in a "
845+
f"disconnected state (last heartbeat: {last_hb}).")
790846

791847
if not target_dra:
792848
raise CLIError(
793-
f"The target appliance '{target_appliance_name}' is in a "
794-
f"disconnected state.")
849+
f"No fabric agent found for target appliance "
850+
f"'{target_appliance_name}' on fabric "
851+
f"'{target_fabric_name}'. Verify that the appliance is "
852+
f"properly registered and connected.")
795853

796854
return target_fabric, source_dra, target_dra

src/migrate/azext_migrate/helpers/replication/new/_validate.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,11 @@ def validate_server_parameters( # pylint: disable=too-many-locals,too-many-bran
206206
# machine_id was provided directly
207207
# Check if it's in Microsoft.Migrate format and needs to be resolved
208208
if "/Microsoft.Migrate/MigrateProjects/" in machine_id or "/Microsoft.Migrate/migrateprojects/" in machine_id:
209+
# Extract project_name from the Microsoft.Migrate machine ID
210+
migrate_id_parts = machine_id.split("/")
211+
if len(migrate_id_parts) >= 9 and not project_name:
212+
project_name = migrate_id_parts[8]
213+
209214
# This is a Migrate Project machine ID, need to resolve to OffAzure machine ID
210215
migrate_machine = get_resource_by_id(
211216
cmd, machine_id, APIVersion.Microsoft_Migrate.value)
@@ -257,7 +262,7 @@ def validate_server_parameters( # pylint: disable=too-many-locals,too-many-bran
257262
f"/subscriptions/{subscription_id}/"
258263
f"resourceGroups/{resource_group_name}")
259264

260-
return rg_uri, machine_id, subscription_id
265+
return rg_uri, machine_id, subscription_id, project_name
261266

262267

263268
def validate_required_parameters(machine_id,

0 commit comments

Comments
 (0)