Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion tests/network/migration/test_migration.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,13 +358,14 @@ def test_connectivity_after_migration_and_restart(
],
)
def test_migration_with_masquerade(
admin_client,
Comment thread
vsibirsk marked this conversation as resolved.
Comment thread
vsibirsk marked this conversation as resolved.
Comment thread
vsibirsk marked this conversation as resolved.
running_vma,
running_vmb,
ip_family,
):
http_port_accessible(
vm=running_vma,
server_ip=running_vmb.custom_service.service_ip(ip_family=ip_family),
server_ip=running_vmb.custom_service.service_ip(admin_client=admin_client, ip_family=ip_family),
server_port=running_vmb.custom_service.service_port,
)

Expand Down
1 change: 0 additions & 1 deletion tests/scale/test_scale_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,6 @@ def test_mass_vm_live_migration(
for batch in scale_vms:
for vm in batch:
wait_for_migration_finished(
namespace=vm.namespace,
migration=vm_migration_info[vm.name][MIGRATION_INSTANCE_STR],
)
verify_vm_migrated(
Expand Down
4 changes: 1 addition & 3 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,9 +177,7 @@ def hotplug_instance_type_vm_and_verify(vm, client, instance_type):

def verify_hotplug(vm, client, sockets=None, memory_guest=None):
vmim = get_created_migration_job(vm=vm, client=client)
wait_for_migration_finished(
namespace=vm.namespace, migration=vmim, timeout=TIMEOUT_30MIN if "windows" in vm.name else TIMEOUT_10MIN
)
wait_for_migration_finished(migration=vmim, timeout=TIMEOUT_30MIN if "windows" in vm.name else TIMEOUT_10MIN)
wait_for_ssh_connectivity(vm=vm)
vmi_spec_domain = vm.vmi.instance.spec.domain
if sockets:
Expand Down
2 changes: 1 addition & 1 deletion tests/virt/node/descheduler/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def deployed_vms_for_descheduler_test(
def all_existing_migrations_completed(admin_client, namespace):
# Descheduler may trigger multiple migrations, need to wait when all succeeded
for migration in VirtualMachineInstanceMigration.get(client=admin_client, namespace=namespace):
wait_for_migration_finished(namespace=namespace.name, migration=migration, timeout=TIMEOUT_5MIN)
wait_for_migration_finished(migration=migration, timeout=TIMEOUT_5MIN)


@pytest.fixture(scope="class")
Expand Down
2 changes: 1 addition & 1 deletion tests/virt/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def migrate_and_verify_multi_vms(vm_list):

for vm in vm_list:
migration = vms_dict[vm.name]["vm_mig"]
wait_for_migration_finished(namespace=vm.namespace, migration=migration)
wait_for_migration_finished(migration=migration)
migration.clean_up()

for vm in vm_list:
Expand Down
10 changes: 10 additions & 0 deletions utilities/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,16 @@ class UnsupportedCPUArchitectureError(Exception):
"""Exception raised when a CPU architecture is not supported."""


class MigrationStuckSchedulingError(Exception):
"""Exception raised when a migration is stuck in Scheduling state."""

def __init__(self, migration_name: str) -> None:
self.migration_name = migration_name

def __str__(self) -> str:
return f"Migration {self.migration_name} is stuck in Scheduling state."
Comment thread
vsibirsk marked this conversation as resolved.


def raise_multiple_exceptions(exceptions):
"""Raising multiple exceptions

Expand Down
18 changes: 18 additions & 0 deletions utilities/unittests/test_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from utilities.exceptions import (
ClusterSanityError,
DataVolumeConditionMessageNotFoundError,
MigrationStuckSchedulingError,
MissingEnvironmentVariableError,
MissingResourceException,
OsDictNotFoundError,
Expand Down Expand Up @@ -314,3 +315,20 @@ def test_unsupported_cpu_architecture_error(self):
"""Test UnsupportedCPUArchitectureError can be raised"""
with pytest.raises(UnsupportedCPUArchitectureError):
raise UnsupportedCPUArchitectureError("Test error")


class TestMigrationStuckSchedulingError:
"""Test cases for MigrationStuckSchedulingError exception"""

def test_migration_stuck_scheduling_error_init(self):
"""Test MigrationStuckSchedulingError initialization"""
migration_name = "test-migration"
error = MigrationStuckSchedulingError(migration_name)
assert error.migration_name == migration_name

def test_migration_stuck_scheduling_error_str(self):
"""Test MigrationStuckSchedulingError string representation"""
migration_name = "test-migration"
error = MigrationStuckSchedulingError(migration_name)
expected = "Migration test-migration is stuck in Scheduling state."
assert str(error) == expected
Comment thread
vsibirsk marked this conversation as resolved.
93 changes: 63 additions & 30 deletions utilities/virt.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@
Images,
)
from utilities.data_collector import collect_vnc_screenshot_for_vms
from utilities.exceptions import MigrationStuckSchedulingError, ResourceValueError
from utilities.hco import get_hco_namespace, wait_for_hco_conditions
from utilities.network import (
cloud_init_network_data,
Expand Down Expand Up @@ -1598,7 +1599,20 @@ def to_dict(self):
if self.ip_families:
self.res["spec"]["ipFamilies"] = self.ip_families

def service_ip(self, ip_family=None):
def service_ip(self, admin_client: DynamicClient, ip_family: str | None = None) -> str:
"""
Get the IP address of the service.

Args:
admin_client (DynamicClient): admin client to be used for the service.
ip_family (str | None): IP family to be used for the service.

Returns:
str: IP address of the service.

Raises:
ResourceValueError: If the service IP cannot be retrieved.
"""
if self.service_type == Service.Type.CLUSTER_IP:
if ip_family:
cluster_ips = [
Expand All @@ -1611,11 +1625,8 @@ def service_ip(self, ip_family=None):

return self.instance.spec.clusterIP

vm_node = Node(
client=get_client(),
name=self.vmi.instance.status.nodeName,
)
if self.service_type == Service.Type.NODE_PORT:
vm_node = self.vm.vmi.get_node(privileged_client=admin_client)
if ip_family:
internal_ips = [
internal_ip
Expand All @@ -1627,6 +1638,10 @@ def service_ip(self, ip_family=None):

return self.target_ip or vm_node.internal_ip

raise ResourceValueError(
f"Could not get service IP for service {self.vm.custom_service.name} with type {self.service_type}"
)

@property
def service_port(self):
if self.service_type == Service.Type.CLUSTER_IP:
Expand Down Expand Up @@ -1928,7 +1943,7 @@ def migrate_vm_and_verify(
) as migration:
if not wait_for_migration_success:
return migration
wait_for_migration_finished(namespace=vm.namespace, migration=migration, timeout=timeout)
wait_for_migration_finished(migration=migration, timeout=timeout)

verify_vm_migrated(
vm=vm,
Expand All @@ -1939,7 +1954,20 @@ def migrate_vm_and_verify(
return None


def wait_for_migration_finished(namespace, migration, timeout=TIMEOUT_12MIN):
def wait_for_migration_finished(migration: VirtualMachineInstanceMigration, timeout: int = TIMEOUT_12MIN) -> None:
"""
Wait for migration to finish.
If migration is stuck in Scheduling state, abort the migration and collect data.

Args:
migration (VirtualMachineInstanceMigration): Migration object.
timeout (int): Maximum time to wait for the migration to finish.

Raises:
MigrationStuckSchedulingError: If the migration is stuck in Scheduling state.
TimeoutExpiredError: If the migration does not finish within the timeout.
"""

sleep = TIMEOUT_10SEC
samples = TimeoutSampler(wait_timeout=timeout, sleep=sleep, func=lambda: migration.instance.status.phase)
counter = 0
Expand All @@ -1948,36 +1976,43 @@ def wait_for_migration_finished(namespace, migration, timeout=TIMEOUT_12MIN):
for sample in samples:
if sample == migration.Status.SUCCEEDED:
break
elif sample == "Scheduling":
if sample == VirtualMachineInstanceMigration.Status.SCHEDULING:
counter += 1
# If migration stuck in Scheduling state for more than 4 minutes - most likely it will be failed
# Need to collect data before 5 min timeout reached and target POD is removed
if counter >= TIMEOUT_4MIN / sleep:
# Get status/events for PODs in non-running or failed state
for pod in utilities.infra.get_pod_by_name_prefix(
client=get_client(),
pod_prefix=VIRT_LAUNCHER,
namespace=namespace,
get_all=True,
):
if pod.status not in (Pod.Status.RUNNING, Pod.Status.COMPLETED, Pod.Status.SUCCEEDED):
pod_events = [
event["raw_object"]["message"]
for event in pod.events(timeout=TIMEOUT_5SEC, field_selector="type==Warning")
]
LOGGER.error(
f"POD Conditions:\n {pod.instance.status.conditions[0]}\n"
f"POD Events:\n {', '.join(pod_events)}"
)
raise TimeoutExpiredError(
f"VMIM {migration.name} stuck in Scheduling state and probably will be failed"
)
log_failed_pod_events(migration=migration)
raise MigrationStuckSchedulingError(migration_name=migration.name)
except TimeoutExpiredError:
if sample:
LOGGER.error(f"Status of VMIM {migration.name} is {sample}")
raise


def log_failed_pod_events(migration: VirtualMachineInstanceMigration) -> None:
Comment thread
vsibirsk marked this conversation as resolved.
"""
Log failed pod events for a migration.

Args:
migration (VirtualMachineInstanceMigration): Migration object.
"""

for pod in utilities.infra.get_pod_by_name_prefix(
client=migration.client, pod_prefix=VIRT_LAUNCHER, namespace=migration.namespace, get_all=True
):
# Get status/events for PODs in non-running or failed state
if pod.status not in {Pod.Status.RUNNING, Pod.Status.COMPLETED, Pod.Status.SUCCEEDED}:
pod_events = [
event["raw_object"]["message"]
for event in pod.events(timeout=TIMEOUT_5SEC, field_selector="type==Warning")
]
LOGGER.error(
f"POD Name: {pod.name}\n"
f"POD Conditions:\n {pod.instance.status.conditions[0]}\n"
f"POD Events:\n {', '.join(pod_events)}"
)


def verify_vm_migrated(
vm,
node_before,
Expand Down Expand Up @@ -2331,9 +2366,7 @@ def check_migration_process_after_node_drain(client, vm, admin_client):
LOGGER.info(f"The VMI was running on {source_node.name}")
wait_for_node_schedulable_status(node=source_node, status=False)
vmim = get_created_migration_job(vm=vm, client=client, timeout=TIMEOUT_5MIN)
wait_for_migration_finished(
namespace=vm.namespace, migration=vmim, timeout=TIMEOUT_30MIN if "windows" in vm.name else TIMEOUT_10MIN
)
wait_for_migration_finished(migration=vmim, timeout=TIMEOUT_30MIN if "windows" in vm.name else TIMEOUT_10MIN)

target_pod = vm.vmi.get_virt_launcher_pod(privileged_client=admin_client)
target_pod.wait_for_status(status=Pod.Status.RUNNING, timeout=TIMEOUT_3MIN)
Expand Down
Loading