Skip to content

Commit 56a1e16

Browse files
committed
net, tests, stuntime: Wait for VMI affinity before migration
virt-controller and VMIM controller reconcile independently — if migration is triggered before the VM controller propagates the template affinity to the VMI, the target pod is created with stale scheduling rules.

Wait for the VMI to reflect the updated affinity before migrating, and assert post-migration node placement matches the expected affinity.

Assisted-by: Claude <noreply@anthropic.com>
Signed-off-by: Anat Wax <awax@redhat.com>
1 parent ad04052 commit 56a1e16

4 files changed

Lines changed: 85 additions & 1 deletion

File tree

libs/vm/vm.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from ocp_resources.virtual_machine import VirtualMachine
1111
from ocp_resources.virtual_machine_instance import VirtualMachineInstance
1212
from pytest_testconfig import config as py_config
13+
from timeout_sampler import TimeoutSampler
1314

1415
from libs.net.vmspec import VMInterfaceSpecNotFoundError
1516
from libs.vm.spec import (
@@ -143,6 +144,24 @@ def set_template_affinity(self, affinity: Affinity | None) -> None:
143144
patches = {self: {"spec": {"template": {"spec": {"affinity": template_affinity}}}}}
144145
ResourceEditor(patches=patches).update()
145146

147+
def wait_for_vmi_affinity(self, timeout: int = 10) -> None:
    """Block until the running VMI carries the same affinity as the VM template.

    The template affinity is serialised with ``asdict`` (using the class's
    ``_filter_out_none_values`` dict factory) and compared against the
    ``spec.affinity`` entry of the VMI, which is re-read once per second.

    Args:
        timeout: Maximum seconds to wait for reconciliation.
    """
    affinity_spec = self._spec.template.spec.affinity
    if affinity_spec:
        wanted = asdict(obj=affinity_spec, dict_factory=self._filter_out_none_values)
    else:
        # No affinity on the template: the VMI is expected to have none either.
        wanted = None

    def _current_vmi_affinity():
        # Fetch a fresh view of the VMI on every poll.
        return self.vmi.instance.to_dict()["spec"].get("affinity")

    # NOTE(review): the sampler is presumably what raises once `timeout`
    # elapses without a match — confirm against timeout_sampler.
    sampler = TimeoutSampler(wait_timeout=timeout, sleep=1, func=_current_vmi_affinity)
    for observed in sampler:
        if observed == wanted:
            return
164+
146165
@property
147166
def template_spec(self) -> VMISpec:
148167
return self._spec.template.spec

tests/network/l2_bridge/migration_stuntime/test_migration_stuntime.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,16 @@
1919
import pytest
2020

2121
from libs.vm.affinity import new_pod_affinity, new_pod_anti_affinity
22-
from tests.network.libs.stuntime import CLIENT_VM_LABEL, SERVER_VM_LABEL, STUNTIME_THRESHOLD_SECONDS, measure_stuntime
22+
from tests.network.libs.stuntime import (
23+
CLIENT_VM_LABEL,
24+
POD_AFFINITY_TYPE,
25+
POD_ANTI_AFFINITY_TYPE,
26+
SERVER_VM_LABEL,
27+
STUNTIME_THRESHOLD_SECONDS,
28+
assert_affinity_after_migration,
29+
measure_stuntime,
30+
)
31+
from utilities.jira import is_jira_open
2332
from utilities.virt import migrate_vm_and_verify
2433

2534
pytestmark = [pytest.mark.tier3]
@@ -71,7 +80,10 @@ def test_client_migrates_off_server_node(
7180
- Measured stuntime does not exceed the global threshold.
7281
"""
7382
stuntime_client_vm.set_template_affinity(affinity=new_pod_anti_affinity(label=SERVER_VM_LABEL))
83+
stuntime_client_vm.wait_for_vmi_affinity()
7484
migrate_vm_and_verify(vm=stuntime_client_vm, client=admin_client)
85+
if is_jira_open(jira_id="CNV-90576"):
86+
assert_affinity_after_migration(vm=stuntime_client_vm, expected_type=POD_ANTI_AFFINITY_TYPE)
7587
measured_stuntime = measure_stuntime(active_ping=l2_bridge_active_ping)
7688
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
7789
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"
@@ -130,7 +142,10 @@ def test_client_migrates_to_server_node(
130142
- Measured stuntime does not exceed the global threshold.
131143
"""
132144
stuntime_client_vm.set_template_affinity(affinity=new_pod_affinity(label=SERVER_VM_LABEL))
145+
stuntime_client_vm.wait_for_vmi_affinity()
133146
migrate_vm_and_verify(vm=stuntime_client_vm, client=admin_client)
147+
if is_jira_open(jira_id="CNV-90576"):
148+
assert_affinity_after_migration(vm=stuntime_client_vm, expected_type=POD_AFFINITY_TYPE)
134149
measured_stuntime = measure_stuntime(active_ping=l2_bridge_active_ping)
135150
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
136151
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"
@@ -160,7 +175,10 @@ def test_server_migrates_off_client_node(
160175
- Measured stuntime does not exceed the global threshold.
161176
"""
162177
stuntime_server_vm.set_template_affinity(affinity=new_pod_anti_affinity(label=CLIENT_VM_LABEL))
178+
stuntime_server_vm.wait_for_vmi_affinity()
163179
migrate_vm_and_verify(vm=stuntime_server_vm, client=admin_client)
180+
if is_jira_open(jira_id="CNV-90576"):
181+
assert_affinity_after_migration(vm=stuntime_server_vm, expected_type=POD_ANTI_AFFINITY_TYPE)
164182
measured_stuntime = measure_stuntime(active_ping=l2_bridge_active_ping)
165183
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
166184
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"
@@ -219,7 +237,10 @@ def test_server_migrates_to_client_node(
219237
- Measured stuntime does not exceed the global threshold.
220238
"""
221239
stuntime_server_vm.set_template_affinity(affinity=new_pod_affinity(label=CLIENT_VM_LABEL))
240+
stuntime_server_vm.wait_for_vmi_affinity()
222241
migrate_vm_and_verify(vm=stuntime_server_vm, client=admin_client)
242+
if is_jira_open(jira_id="CNV-90576"):
243+
assert_affinity_after_migration(vm=stuntime_server_vm, expected_type=POD_AFFINITY_TYPE)
223244
measured_stuntime = measure_stuntime(active_ping=l2_bridge_active_ping)
224245
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
225246
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"

tests/network/libs/stuntime.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
SERVER_VM_LABEL: Final[tuple[str, str]] = (STUNTIME_LABEL_KEY, "server")
1515
CLIENT_VM_LABEL: Final[tuple[str, str]] = (STUNTIME_LABEL_KEY, "client")
1616
STUNTIME_THRESHOLD_SECONDS: Final[float] = 5.0
17+
POD_AFFINITY_TYPE: Final[str] = "podAffinity"
18+
POD_ANTI_AFFINITY_TYPE: Final[str] = "podAntiAffinity"
1719
STUNTIME_PING_LOG_PATH: Final[str] = "/tmp/stuntime-ping.log"
1820
PING_INTERVAL_SECONDS: Final[float] = 0.01
1921
DEFAULT_COMMAND_TIMEOUT_SECONDS: Final[int] = 10
@@ -141,3 +143,29 @@ def _compute_stuntime(lost_packets: int) -> float:
141143
stuntime = 0.0 if lost_packets == 0 else (lost_packets + 1) * PING_INTERVAL_SECONDS
142144
LOGGER.info(f"Stuntime: {stuntime:.2f}s (from {lost_packets} lost packets)")
143145
return stuntime
146+
147+
148+
def assert_affinity_after_migration(vm: BaseVirtualMachine, expected_type: str) -> None:
    """Verify the migration target pod has the correct affinity after migration.

    Detects the virt-controller race (CNV-90576) where the migration target pod is
    created from a stale VMI snapshot, resulting in incorrect affinity on the pod.

    Two things are checked on the VMI's virt-launcher pod:
      * the expected affinity type is present in the pod's ``spec.affinity``;
      * the opposite affinity type, if present, carries no required rule whose
        label-selector key starts with ``"stuntime."``.

    Args:
        vm: The VM that was migrated.
        expected_type: The affinity type expected on the pod ("podAffinity" or "podAntiAffinity").

    Raises:
        AssertionError: If the expected affinity is missing (including when the pod
            has no affinity at all) or a stale stuntime rule of the opposite type remains.
    """
    # Resolve the launcher pod once so every check and message refers to the same pod.
    launcher_pod = vm.vmi.virt_launcher_pod
    # A pod scheduled without any affinity has no "affinity" entry; normalise to an
    # empty mapping so the check below fails with the descriptive assertion message
    # rather than a TypeError from `in None`.
    pod_affinity = launcher_pod.instance.to_dict()["spec"].get("affinity") or {}
    stale_type = POD_ANTI_AFFINITY_TYPE if expected_type == POD_AFFINITY_TYPE else POD_AFFINITY_TYPE

    assert expected_type in pod_affinity, f"POD ({launcher_pod.name}) missing {expected_type}: {pod_affinity}"

    stale_affinity = pod_affinity.get(stale_type)
    if not stale_affinity:
        # Nothing of the opposite type on the pod, so nothing can be stale.
        return

    # The opposite type may legitimately exist for unrelated rules; only rules keyed
    # on stuntime labels indicate that the pod was built from the old affinity.
    has_stuntime_rules = any(
        expr.get("key", "").startswith("stuntime.")
        for rule in stale_affinity.get("requiredDuringSchedulingIgnoredDuringExecution", [])
        for expr in rule.get("labelSelector", {}).get("matchExpressions", [])
    )
    assert not has_stuntime_rules, f"POD ({launcher_pod.name}) has stale stuntime {stale_type}: {stale_affinity}"

tests/network/localnet/migration_stuntime/test_migration_stuntime.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,14 @@
2121
from libs.vm.affinity import new_pod_affinity, new_pod_anti_affinity
2222
from tests.network.libs.stuntime import (
2323
CLIENT_VM_LABEL,
24+
POD_AFFINITY_TYPE,
25+
POD_ANTI_AFFINITY_TYPE,
2426
SERVER_VM_LABEL,
2527
STUNTIME_THRESHOLD_SECONDS,
28+
assert_affinity_after_migration,
2629
measure_stuntime,
2730
)
31+
from utilities.jira import is_jira_open
2832
from utilities.virt import migrate_vm_and_verify
2933

3034
pytestmark = [pytest.mark.tier3]
@@ -76,7 +80,10 @@ def test_client_migrates_off_server_node(self, admin_client, ip_family, localnet
7680
- Measured stuntime does not exceed the global threshold.
7781
"""
7882
localnet_stuntime_client_vm.set_template_affinity(affinity=new_pod_anti_affinity(label=SERVER_VM_LABEL))
83+
localnet_stuntime_client_vm.wait_for_vmi_affinity()
7984
migrate_vm_and_verify(vm=localnet_stuntime_client_vm, client=admin_client)
85+
if is_jira_open(jira_id="CNV-90576"):
86+
assert_affinity_after_migration(vm=localnet_stuntime_client_vm, expected_type=POD_ANTI_AFFINITY_TYPE)
8087
measured_stuntime = measure_stuntime(active_ping=active_ping)
8188
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
8289
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"
@@ -133,7 +140,10 @@ def test_client_migrates_to_server_node(self, admin_client, ip_family, localnet_
133140
- Measured stuntime does not exceed the global threshold.
134141
"""
135142
localnet_stuntime_client_vm.set_template_affinity(affinity=new_pod_affinity(label=SERVER_VM_LABEL))
143+
localnet_stuntime_client_vm.wait_for_vmi_affinity()
136144
migrate_vm_and_verify(vm=localnet_stuntime_client_vm, client=admin_client)
145+
if is_jira_open(jira_id="CNV-90576"):
146+
assert_affinity_after_migration(vm=localnet_stuntime_client_vm, expected_type=POD_AFFINITY_TYPE)
137147
measured_stuntime = measure_stuntime(active_ping=active_ping)
138148
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
139149
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"
@@ -161,7 +171,10 @@ def test_server_migrates_off_client_node(self, admin_client, ip_family, localnet
161171
- Measured stuntime does not exceed the global threshold.
162172
"""
163173
localnet_stuntime_server_vm.set_template_affinity(affinity=new_pod_anti_affinity(label=CLIENT_VM_LABEL))
174+
localnet_stuntime_server_vm.wait_for_vmi_affinity()
164175
migrate_vm_and_verify(vm=localnet_stuntime_server_vm, client=admin_client)
176+
if is_jira_open(jira_id="CNV-90576"):
177+
assert_affinity_after_migration(vm=localnet_stuntime_server_vm, expected_type=POD_ANTI_AFFINITY_TYPE)
165178
measured_stuntime = measure_stuntime(active_ping=active_ping)
166179
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
167180
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"
@@ -218,7 +231,10 @@ def test_server_migrates_to_client_node(self, admin_client, ip_family, localnet_
218231
- Measured stuntime does not exceed the global threshold.
219232
"""
220233
localnet_stuntime_server_vm.set_template_affinity(affinity=new_pod_affinity(label=CLIENT_VM_LABEL))
234+
localnet_stuntime_server_vm.wait_for_vmi_affinity()
221235
migrate_vm_and_verify(vm=localnet_stuntime_server_vm, client=admin_client)
236+
if is_jira_open(jira_id="CNV-90576"):
237+
assert_affinity_after_migration(vm=localnet_stuntime_server_vm, expected_type=POD_AFFINITY_TYPE)
222238
measured_stuntime = measure_stuntime(active_ping=active_ping)
223239
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
224240
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"

0 commit comments

Comments
 (0)