Skip to content

Commit e3605cc

Browse files
committed
Add rollback for Hyper-V DDA assignment failures
Track each stage of Hyper-V DDA assignment so a later failure can unwind earlier host-side changes. The assignment path now records disabled PnP devices, successfully dismounted devices, and devices already attached to the VM. On failure, rollback removes assigned devices from the VM, mounts dismounted devices back to the host, re-enables the PnP devices that were disabled, and reports rollback errors alongside the original assignment error. Add a focused selftest that simulates a second-device dismount failure and verifies rollback removes, mounts, and re-enables devices in the expected order.
1 parent c41a69d commit e3605cc

5 files changed

Lines changed: 266 additions & 42 deletions

File tree

lisa/microsoft/testsuites/device_passthrough/functional_tests.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,19 +71,31 @@ def verify_device_passthrough_on_guest(
7171
) -> None:
7272
lspci = node.tools[Lspci]
7373
platform = environment.platform
74+
if platform is None:
75+
raise SkippedException(
76+
"Device passthrough validation requires a LISA platform context. "
77+
"Verify the runbook uses cloud-hypervisor or hyperv."
78+
)
7479
platform_name = platform.type_name()
7580

7681
if platform_name == CLOUD_HYPERVISOR:
7782
# Import at runtime to avoid libvirt dependency on other platforms.
78-
from lisa.sut_orchestrator.libvirt.context import get_node_context
83+
from lisa.sut_orchestrator.libvirt.context import (
84+
get_node_context as get_libvirt_node_context,
85+
)
86+
87+
node_context = get_libvirt_node_context(node)
7988
elif platform_name == HYPERV:
80-
from lisa.sut_orchestrator.hyperv.context import get_node_context
89+
from lisa.sut_orchestrator.hyperv.context import (
90+
get_node_context as get_hyperv_node_context,
91+
)
92+
93+
node_context = get_hyperv_node_context(node)
8194
else:
8295
raise SkippedException(
8396
f"Device passthrough validation is not supported on '{platform_name}'"
8497
)
8598

86-
node_context = get_node_context(node)
8799
if not node_context.passthrough_devices:
88100
raise SkippedException("No passthrough devices are assigned to node")
89101

lisa/microsoft/testsuites/performance/networkperf_passthrough.py

Lines changed: 29 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -715,13 +715,17 @@ def _configure_passthrough_nic_for_node(
715715

716716
def _get_passthrough_node_context(self, node: Node) -> Any:
717717
try:
718-
from lisa.sut_orchestrator.libvirt.context import get_node_context
718+
from lisa.sut_orchestrator.libvirt.context import (
719+
get_node_context as get_libvirt_node_context,
720+
)
719721

720-
return get_node_context(node)
722+
return get_libvirt_node_context(node)
721723
except AssertionError:
722-
from lisa.sut_orchestrator.hyperv.context import get_node_context
724+
from lisa.sut_orchestrator.hyperv.context import (
725+
get_node_context as get_hyperv_node_context,
726+
)
723727

724-
return get_node_context(node)
728+
return get_hyperv_node_context(node)
725729

726730
def _get_device_bdf(self, device_addr_obj: Any) -> str:
727731
bus = getattr(device_addr_obj, "bus", "")
@@ -1184,27 +1188,30 @@ def _perf_ntttcp_with_windows_server(
11841188
f"Stderr: {receiver_result.stderr[:2000]}"
11851189
) from parse_error
11861190
if udp_mode:
1187-
perf_message = client_ntttcp.create_ntttcp_udp_performance_message(
1188-
parsed_server_result,
1189-
parsed_client_result,
1190-
str(test_thread),
1191-
buffer_size,
1192-
test_case_name,
1193-
test_result,
1194-
client_mtu,
1191+
notifier.notify(
1192+
client_ntttcp.create_ntttcp_udp_performance_message(
1193+
parsed_server_result,
1194+
parsed_client_result,
1195+
str(test_thread),
1196+
buffer_size,
1197+
test_case_name,
1198+
test_result,
1199+
client_mtu,
1200+
)
11951201
)
11961202
else:
1197-
perf_message = client_ntttcp.create_ntttcp_tcp_performance_message(
1198-
parsed_server_result,
1199-
parsed_client_result,
1200-
Decimal(0),
1201-
str(test_thread),
1202-
buffer_size,
1203-
test_case_name,
1204-
test_result,
1205-
client_mtu,
1203+
notifier.notify(
1204+
client_ntttcp.create_ntttcp_tcp_performance_message(
1205+
parsed_server_result,
1206+
parsed_client_result,
1207+
Decimal(0),
1208+
str(test_thread),
1209+
buffer_size,
1210+
test_case_name,
1211+
test_result,
1212+
client_mtu,
1213+
)
12061214
)
1207-
notifier.notify(perf_message)
12081215

12091216
def _get_host_nic_name(self, node: RemoteNode) -> str:
12101217
ip = node.connection_info[constants.ENVIRONMENTS_NODES_REMOTE_ADDRESS]

lisa/sut_orchestrator/hyperv/hyperv_device_pool.py

Lines changed: 135 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -408,29 +408,149 @@ def _assign_devices_to_vm(
408408
) -> None:
409409
# Assign the devices to the VM
410410
escaped_vm_name = vm_name.replace("'", "''")
411-
config_commands: List[str] = []
412-
for device in devices:
413-
escaped_instance_id = device.instance_id.replace("'", "''")
411+
disabled_devices: List[DeviceAddressSchema] = []
412+
dismounted_devices: List[DeviceAddressSchema] = []
413+
assigned_devices: List[DeviceAddressSchema] = []
414+
powershell = self._server.tools[PowerShell]
415+
416+
try:
417+
for device in devices:
418+
escaped_instance_id = device.instance_id.replace("'", "''")
419+
escaped_location_path = device.location_path.replace("'", "''")
420+
powershell.run_cmdlet(
421+
cmdlet=(
422+
f"Disable-PnpDevice -InstanceId '{escaped_instance_id}' "
423+
"-Confirm:$false"
424+
),
425+
force_run=True,
426+
)
427+
disabled_devices.append(device)
428+
429+
powershell.run_cmdlet(
430+
cmdlet=(
431+
f"Dismount-VMHostAssignableDevice -Force "
432+
f"-LocationPath '{escaped_location_path}'"
433+
),
434+
force_run=True,
435+
)
436+
dismounted_devices.append(device)
437+
438+
powershell.run_cmdlet(
439+
cmdlet=(
440+
f"Add-VMAssignableDevice "
441+
f"-LocationPath '{escaped_location_path}' "
442+
f"-VMName '{escaped_vm_name}'"
443+
),
444+
force_run=True,
445+
)
446+
assigned_devices.append(device)
447+
except LisaException as err:
448+
rollback_errors = self._rollback_dda_assignment(
449+
vm_name=vm_name,
450+
disabled_devices=disabled_devices,
451+
dismounted_devices=dismounted_devices,
452+
assigned_devices=assigned_devices,
453+
)
454+
if rollback_errors:
455+
raise LisaException(
456+
"Failed to assign Hyper-V DDA device(s) to VM "
457+
f"'{vm_name}': {err}. Rollback also failed: "
458+
f"{'; '.join(rollback_errors)}"
459+
) from err
460+
raise
461+
462+
def _rollback_dda_assignment(
463+
self,
464+
vm_name: str,
465+
disabled_devices: List[DeviceAddressSchema],
466+
dismounted_devices: List[DeviceAddressSchema],
467+
assigned_devices: List[DeviceAddressSchema],
468+
) -> List[str]:
469+
powershell = self._server.tools[PowerShell]
470+
escaped_vm_name = vm_name.replace("'", "''")
471+
rollback_errors: List[str] = []
472+
473+
self.log.info(f"Rolling back Hyper-V DDA assignment for VM '{vm_name}'")
474+
475+
for device in reversed(assigned_devices):
414476
escaped_location_path = device.location_path.replace("'", "''")
415-
config_commands.append(
416-
f"Disable-PnpDevice -InstanceId '{escaped_instance_id}' "
417-
"-Confirm:$false"
477+
error = self._run_dda_rollback_cmdlet(
478+
powershell=powershell,
479+
description=(
480+
f"Remove DDA device '{device.location_path}' from VM '{vm_name}'"
481+
),
482+
cmdlet=(
483+
f"Remove-VMAssignableDevice "
484+
f"-LocationPath '{escaped_location_path}' "
485+
f"-VMName '{escaped_vm_name}'"
486+
),
418487
)
419-
config_commands.append(
420-
f"Dismount-VMHostAssignableDevice -Force "
421-
f"-LocationPath '{escaped_location_path}'"
488+
if error:
489+
rollback_errors.append(error)
490+
491+
for device in reversed(dismounted_devices):
492+
escaped_location_path = device.location_path.replace("'", "''")
493+
error = self._run_dda_rollback_cmdlet(
494+
powershell=powershell,
495+
description=f"Mount DDA device '{device.location_path}' on host",
496+
cmdlet=(
497+
f"Mount-VMHostAssignableDevice "
498+
f"-LocationPath '{escaped_location_path}'"
499+
),
422500
)
423-
config_commands.append(
424-
f"Add-VMAssignableDevice -LocationPath '{escaped_location_path}' "
425-
f"-VMName '{escaped_vm_name}'"
501+
if error:
502+
rollback_errors.append(error)
503+
504+
for device in reversed(disabled_devices):
505+
escaped_instance_id = device.instance_id.replace("'", "''")
506+
error = self._run_dda_rollback_cmdlet(
507+
powershell=powershell,
508+
description=(
509+
f"Enable PnP device '{device.instance_id}' for location path "
510+
f"'{device.location_path}'"
511+
),
512+
cmdlet=(
513+
f"Enable-PnpDevice -InstanceId '{escaped_instance_id}' "
514+
"-Confirm:$false"
515+
),
426516
)
517+
if error:
518+
rollback_errors.append(error)
519+
continue
427520

428-
powershell = self._server.tools[PowerShell]
429-
for cmd in config_commands:
521+
try:
522+
self._wait_for_pnp_device_enabled(
523+
device.instance_id,
524+
device.location_path,
525+
)
526+
except LisaException as err:
527+
rollback_errors.append(
528+
f"PnP device '{device.instance_id}' for location path "
529+
f"'{device.location_path}' was not enabled during rollback: "
530+
f"{err}"
531+
)
532+
533+
for error in rollback_errors:
534+
self.log.warning(error)
535+
536+
return rollback_errors
537+
538+
def _run_dda_rollback_cmdlet(
539+
self,
540+
powershell: PowerShell,
541+
description: str,
542+
cmdlet: str,
543+
) -> Optional[str]:
544+
self.log.info(description)
545+
try:
430546
powershell.run_cmdlet(
431-
cmdlet=cmd,
547+
cmdlet=cmdlet,
432548
force_run=True,
433549
)
550+
except LisaException as err:
551+
return f"{description} failed: {err}"
552+
553+
return None
434554

435555
def _set_device_passthrough_node_context(
436556
self,

lisa/tools/ntttcp.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,7 +1009,7 @@ def create_ntttcp_result(
10091009

10101010
class WindowsNtttcp(Ntttcp):
10111011
_download_url = (
1012-
"https://github.com/microsoft/ntttcp/releases/latest/download/" "ntttcp.exe"
1012+
"https://github.com/microsoft/ntttcp/releases/latest/download/ntttcp.exe"
10131013
)
10141014
_total_mbps_pattern = re.compile(
10151015
r"(?im)^\s*TOTAL\s+(?P<throughput>[0-9]+(?:\.[0-9]+)?)\s*$"
@@ -1033,7 +1033,7 @@ def dependencies(self) -> List[Type[Tool]]:
10331033

10341034
def setup_system(self, udp_mode: bool = False, set_task_max: bool = True) -> None:
10351035
self.node.tools[PowerShell].run_cmdlet(
1036-
"Set-NetFirewallProfile -Profile Domain,Public,Private " "-Enabled False",
1036+
"Set-NetFirewallProfile -Profile Domain,Public,Private -Enabled False",
10371037
fail_on_error=False,
10381038
)
10391039

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
from types import SimpleNamespace
5+
from typing import Any, List, cast
6+
from unittest import TestCase
7+
from unittest.mock import MagicMock, patch
8+
9+
from lisa.sut_orchestrator.hyperv.hyperv_device_pool import HyperVDevicePool
10+
from lisa.sut_orchestrator.hyperv.schema import (
11+
DeviceAddressSchema,
12+
HypervPlatformSchema,
13+
)
14+
from lisa.tools import PowerShell
15+
from lisa.util import LisaException
16+
17+
18+
class HyperVDevicePoolTestCase(TestCase):
19+
def test_assign_devices_rolls_back_on_dismount_failure(self) -> None:
20+
first_device = DeviceAddressSchema(
21+
instance_id="PCI\\FIRST",
22+
location_path="PCIROOT(1)#PCI(0000)",
23+
)
24+
second_device = DeviceAddressSchema(
25+
instance_id="PCI\\SECOND",
26+
location_path="PCIROOT(1)#PCI(0001)",
27+
)
28+
commands: List[str] = []
29+
30+
def run_cmdlet(cmdlet: str, **_: Any) -> str:
31+
commands.append(cmdlet)
32+
if (
33+
"Dismount-VMHostAssignableDevice" in cmdlet
34+
and second_device.location_path in cmdlet
35+
):
36+
raise LisaException("pcip failed")
37+
return ""
38+
39+
powershell = SimpleNamespace(run_cmdlet=MagicMock(side_effect=run_cmdlet))
40+
node = SimpleNamespace(tools={PowerShell: powershell})
41+
pool = HyperVDevicePool(
42+
node=cast(Any, node),
43+
runbook=HypervPlatformSchema(),
44+
log=MagicMock(),
45+
)
46+
47+
with patch.object(pool, "_wait_for_pnp_device_enabled") as wait_enabled:
48+
with self.assertRaises(LisaException):
49+
pool._assign_devices_to_vm(
50+
vm_name="vm1",
51+
devices=[first_device, second_device],
52+
)
53+
54+
remove_index = next(
55+
index
56+
for index, command in enumerate(commands)
57+
if "Remove-VMAssignableDevice" in command
58+
and first_device.location_path in command
59+
)
60+
mount_index = next(
61+
index
62+
for index, command in enumerate(commands)
63+
if "Mount-VMHostAssignableDevice" in command
64+
and first_device.location_path in command
65+
)
66+
enable_second_index = next(
67+
index
68+
for index, command in enumerate(commands)
69+
if "Enable-PnpDevice" in command and second_device.instance_id in command
70+
)
71+
enable_first_index = next(
72+
index
73+
for index, command in enumerate(commands)
74+
if "Enable-PnpDevice" in command and first_device.instance_id in command
75+
)
76+
77+
self.assertLess(remove_index, mount_index)
78+
self.assertLess(mount_index, enable_second_index)
79+
self.assertLess(enable_second_index, enable_first_index)
80+
wait_enabled.assert_any_call(
81+
second_device.instance_id, second_device.location_path
82+
)
83+
wait_enabled.assert_any_call(
84+
first_device.instance_id, first_device.location_path
85+
)

0 commit comments

Comments
 (0)