diff --git a/tests/network/migration/test_migration.py b/tests/network/migration/test_migration.py index f0cda02d8c..d440a8b55e 100644 --- a/tests/network/migration/test_migration.py +++ b/tests/network/migration/test_migration.py @@ -358,13 +358,14 @@ def test_connectivity_after_migration_and_restart( ], ) def test_migration_with_masquerade( + admin_client, running_vma, running_vmb, ip_family, ): http_port_accessible( vm=running_vma, - server_ip=running_vmb.custom_service.service_ip(ip_family=ip_family), + server_ip=running_vmb.custom_service.service_ip(admin_client=admin_client, ip_family=ip_family), server_port=running_vmb.custom_service.service_port, ) diff --git a/tests/scale/test_scale_benchmark.py b/tests/scale/test_scale_benchmark.py index 2dfd6a2ff8..814727b217 100644 --- a/tests/scale/test_scale_benchmark.py +++ b/tests/scale/test_scale_benchmark.py @@ -455,7 +455,6 @@ def test_mass_vm_live_migration( for batch in scale_vms: for vm in batch: wait_for_migration_finished( - namespace=vm.namespace, migration=vm_migration_info[vm.name][MIGRATION_INSTANCE_STR], ) verify_vm_migrated( diff --git a/tests/utils.py b/tests/utils.py index 1ab4742c0e..000b5ffb6f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -177,9 +177,7 @@ def hotplug_instance_type_vm_and_verify(vm, client, instance_type): def verify_hotplug(vm, client, sockets=None, memory_guest=None): vmim = get_created_migration_job(vm=vm, client=client) - wait_for_migration_finished( - namespace=vm.namespace, migration=vmim, timeout=TIMEOUT_30MIN if "windows" in vm.name else TIMEOUT_10MIN - ) + wait_for_migration_finished(migration=vmim, timeout=TIMEOUT_30MIN if "windows" in vm.name else TIMEOUT_10MIN) wait_for_ssh_connectivity(vm=vm) vmi_spec_domain = vm.vmi.instance.spec.domain if sockets: diff --git a/tests/virt/node/descheduler/conftest.py b/tests/virt/node/descheduler/conftest.py index ade5c543f2..8a7a5d4ed8 100644 --- a/tests/virt/node/descheduler/conftest.py +++ b/tests/virt/node/descheduler/conftest.py @@ -122,7 +122,7 @@ def deployed_vms_for_descheduler_test( def all_existing_migrations_completed(admin_client, namespace): # Descheduler may trigger multiple migrations, need to wait when all succeeded for migration in VirtualMachineInstanceMigration.get(client=admin_client, namespace=namespace): - wait_for_migration_finished(namespace=namespace.name, migration=migration, timeout=TIMEOUT_5MIN) + wait_for_migration_finished(migration=migration, timeout=TIMEOUT_5MIN) @pytest.fixture(scope="class") diff --git a/tests/virt/utils.py b/tests/virt/utils.py index 9e1d8113cb..2f845c73a8 100644 --- a/tests/virt/utils.py +++ b/tests/virt/utils.py @@ -139,7 +139,7 @@ def migrate_and_verify_multi_vms(vm_list): for vm in vm_list: migration = vms_dict[vm.name]["vm_mig"] - wait_for_migration_finished(namespace=vm.namespace, migration=migration) + wait_for_migration_finished(migration=migration) migration.clean_up() for vm in vm_list: diff --git a/utilities/exceptions.py b/utilities/exceptions.py index f31a594ce1..92f946a10e 100644 --- a/utilities/exceptions.py +++ b/utilities/exceptions.py @@ -128,6 +128,16 @@ class UnsupportedCPUArchitectureError(Exception): """Exception raised when a CPU architecture is not supported.""" +class MigrationStuckSchedulingError(Exception): + """Exception raised when a migration is stuck in Scheduling state.""" + + def __init__(self, migration_name: str) -> None: + self.migration_name = migration_name + + def __str__(self) -> str: + return f"Migration {self.migration_name} is stuck in Scheduling state." + + def raise_multiple_exceptions(exceptions): """Raising multiple exceptions diff --git a/utilities/unittests/test_exceptions.py b/utilities/unittests/test_exceptions.py index cc92be44d9..7282c4d262 100644 --- a/utilities/unittests/test_exceptions.py +++ b/utilities/unittests/test_exceptions.py @@ -9,6 +9,7 @@ from utilities.exceptions import ( ClusterSanityError, DataVolumeConditionMessageNotFoundError, + MigrationStuckSchedulingError, MissingEnvironmentVariableError, MissingResourceException, OsDictNotFoundError, @@ -314,3 +315,20 @@ def test_unsupported_cpu_architecture_error(self): """Test UnsupportedCPUArchitectureError can be raised""" with pytest.raises(UnsupportedCPUArchitectureError): raise UnsupportedCPUArchitectureError("Test error") + + +class TestMigrationStuckSchedulingError: + """Test cases for MigrationStuckSchedulingError exception""" + + def test_migration_stuck_scheduling_error_init(self): + """Test MigrationStuckSchedulingError initialization""" + migration_name = "test-migration" + error = MigrationStuckSchedulingError(migration_name) + assert error.migration_name == migration_name + + def test_migration_stuck_scheduling_error_str(self): + """Test MigrationStuckSchedulingError string representation""" + migration_name = "test-migration" + error = MigrationStuckSchedulingError(migration_name) + expected = "Migration test-migration is stuck in Scheduling state." + assert str(error) == expected diff --git a/utilities/virt.py b/utilities/virt.py index 9fbc66267a..2028b8b57d 100644 --- a/utilities/virt.py +++ b/utilities/virt.py @@ -94,6 +94,7 @@ Images, ) from utilities.data_collector import collect_vnc_screenshot_for_vms +from utilities.exceptions import MigrationStuckSchedulingError, ResourceValueError from utilities.hco import get_hco_namespace, wait_for_hco_conditions from utilities.network import ( cloud_init_network_data, @@ -1598,7 +1599,20 @@ def to_dict(self): if self.ip_families: self.res["spec"]["ipFamilies"] = self.ip_families - def service_ip(self, ip_family=None): + def service_ip(self, admin_client: DynamicClient, ip_family: str | None = None) -> str: + """ + Get the IP address of the service. + + Args: + admin_client (DynamicClient): admin client to be used for the service. + ip_family (str | None): IP family to be used for the service. + + Returns: + str: IP address of the service. + + Raises: + ResourceValueError: If the service IP cannot be retrieved. + """ if self.service_type == Service.Type.CLUSTER_IP: if ip_family: cluster_ips = [ @@ -1611,11 +1625,8 @@ def service_ip(self, ip_family=None): return self.instance.spec.clusterIP - vm_node = Node( - client=get_client(), - name=self.vmi.instance.status.nodeName, - ) if self.service_type == Service.Type.NODE_PORT: + vm_node = self.vm.vmi.get_node(privileged_client=admin_client) if ip_family: internal_ips = [ internal_ip @@ -1627,6 +1638,10 @@ def service_ip(self, ip_family=None): return self.target_ip or vm_node.internal_ip + raise ResourceValueError( + f"Could not get service IP for service {self.vm.custom_service.name} with type {self.service_type}" + ) + @property def service_port(self): if self.service_type == Service.Type.CLUSTER_IP: @@ -1928,7 +1943,7 @@ def migrate_vm_and_verify( ) as migration: if not wait_for_migration_success: return migration - wait_for_migration_finished(namespace=vm.namespace, migration=migration, timeout=timeout) + wait_for_migration_finished(migration=migration, timeout=timeout) verify_vm_migrated( vm=vm, @@ -1939,7 +1954,20 @@ def migrate_vm_and_verify( return None -def wait_for_migration_finished(namespace, migration, timeout=TIMEOUT_12MIN): +def wait_for_migration_finished(migration: VirtualMachineInstanceMigration, timeout: int = TIMEOUT_12MIN) -> None: + """ + Wait for migration to finish. + If migration is stuck in Scheduling state, abort the migration and collect data. + + Args: + migration (VirtualMachineInstanceMigration): Migration object. + timeout (int): Maximum time to wait for the migration to finish. + + Raises: + MigrationStuckSchedulingError: If the migration is stuck in Scheduling state. + TimeoutExpiredError: If the migration does not finish within the timeout. + """ + sleep = TIMEOUT_10SEC samples = TimeoutSampler(wait_timeout=timeout, sleep=sleep, func=lambda: migration.instance.status.phase) counter = 0 @@ -1948,36 +1976,43 @@ def wait_for_migration_finished(namespace, migration, timeout=TIMEOUT_12MIN): for sample in samples: if sample == migration.Status.SUCCEEDED: break - elif sample == "Scheduling": + if sample == VirtualMachineInstanceMigration.Status.SCHEDULING: counter += 1 # If migration stuck in Scheduling state for more than 4 minutes - most likely it will be failed # Need to collect data before 5 min timeout reached and target POD is removed if counter >= TIMEOUT_4MIN / sleep: - # Get status/events for PODs in non-running or failed state - for pod in utilities.infra.get_pod_by_name_prefix( - client=get_client(), - pod_prefix=VIRT_LAUNCHER, - namespace=namespace, - get_all=True, - ): - if pod.status not in (Pod.Status.RUNNING, Pod.Status.COMPLETED, Pod.Status.SUCCEEDED): - pod_events = [ - event["raw_object"]["message"] - for event in pod.events(timeout=TIMEOUT_5SEC, field_selector="type==Warning") - ] - LOGGER.error( - f"POD Conditions:\n {pod.instance.status.conditions[0]}\n" - f"POD Events:\n {', '.join(pod_events)}" - ) - raise TimeoutExpiredError( - f"VMIM {migration.name} stuck in Scheduling state and probably will be failed" - ) + log_failed_pod_events(migration=migration) + raise MigrationStuckSchedulingError(migration_name=migration.name) except TimeoutExpiredError: if sample: LOGGER.error(f"Status of VMIM {migration.name} is {sample}") raise +def log_failed_pod_events(migration: VirtualMachineInstanceMigration) -> None: + """ + Log failed pod events for a migration. + + Args: + migration (VirtualMachineInstanceMigration): Migration object. + """ + + for pod in utilities.infra.get_pod_by_name_prefix( + client=migration.client, pod_prefix=VIRT_LAUNCHER, namespace=migration.namespace, get_all=True + ): + # Get status/events for PODs in non-running or failed state + if pod.status not in {Pod.Status.RUNNING, Pod.Status.COMPLETED, Pod.Status.SUCCEEDED}: + pod_events = [ + event["raw_object"]["message"] + for event in pod.events(timeout=TIMEOUT_5SEC, field_selector="type==Warning") + ] + LOGGER.error( + f"POD Name: {pod.name}\n" + f"POD Conditions:\n {pod.instance.status.conditions[0]}\n" + f"POD Events:\n {', '.join(pod_events)}" + ) + + def verify_vm_migrated( vm, node_before, @@ -2331,9 +2366,7 @@ def check_migration_process_after_node_drain(client, vm, admin_client): LOGGER.info(f"The VMI was running on {source_node.name}") wait_for_node_schedulable_status(node=source_node, status=False) vmim = get_created_migration_job(vm=vm, client=client, timeout=TIMEOUT_5MIN) - wait_for_migration_finished( - namespace=vm.namespace, migration=vmim, timeout=TIMEOUT_30MIN if "windows" in vm.name else TIMEOUT_10MIN - ) + wait_for_migration_finished(migration=vmim, timeout=TIMEOUT_30MIN if "windows" in vm.name else TIMEOUT_10MIN) target_pod = vm.vmi.get_virt_launcher_pod(privileged_client=admin_client) target_pod.wait_for_status(status=Pod.Status.RUNNING, timeout=TIMEOUT_3MIN)