From efb7ca124e406c2027a03e993242920292c27970 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Thu, 7 May 2026 08:42:24 +0000 Subject: [PATCH] fix(tests): avoid soft lockup in test_serial_active_tx_snapshot The test was hanging because the unbounded `cat /dev/zero > /dev/ttyS0` transmission causes a TX interrupt storm in the guest, which leads to a soft lockup on the CPU that handles the serial IRQ. While that CPU is locked up, the serial console becomes unusable, so the test could no longer regain a shell after restore. Fix by: - Using SSH as the control channel instead of serial (which is unusable during the flood). - Using 4 vCPUs with explicit IRQ affinity pinning: all IRQs to CPU0 (keeping SSH/virtio-net responsive), the serial IRQ to CPU1 (containing the soft lockup), and the writer pinned to CPU3 so it runs on a different CPU than the serial IRQ handler. Keeping the writer and the IRQ handler on separate CPUs is also what makes it likely that the guest still has TX interrupts enabled at the moment we take the snapshot, which is the scenario this test is meant to exercise. - After snapshot/restore, killing the writer via SSH and then verifying the serial console is functional again. Signed-off-by: Jack Thomson --- .../functional/test_serial_io.py | 39 ++++++++++++++++--- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/tests/integration_tests/functional/test_serial_io.py b/tests/integration_tests/functional/test_serial_io.py index 3d2ce03f89a..dc4ed311753 100644 --- a/tests/integration_tests/functional/test_serial_io.py +++ b/tests/integration_tests/functional/test_serial_io.py @@ -66,9 +66,10 @@ def test_serial_active_tx_snapshot(uvm_plain, microvm_factory): microvm.help.enable_console() microvm.spawn(serial_out_path=None) microvm.basic_config( - vcpu_count=2, + vcpu_count=4, mem_size_mib=256, ) + microvm.add_net_iface() serial = Serial(microvm) serial.open() microvm.start() @@ -80,11 +81,32 @@ def test_serial_active_tx_snapshot(uvm_plain, microvm_factory): # there will be an active transmission at the point of pausing the VM to # take the snapshot. This will saturate the TX buffer of the UART and it # might make the guest driver enable TX interrupts. - serial.tx("cat /dev/zero") + # + # On a multi-vCPU guest this unbounded transmission causes a TX interrupt + # storm that leads to a soft lockup on the CPU handling the serial IRQ. + # We work around this with two pieces of pinning: + # + # 1. Pin all IRQs to CPU0, then move the serial IRQ to CPU1. This keeps + # SSH (virtio-net) on CPU0 reachable while the lockup, if it occurs, + # is contained on CPU1. We need SSH because the serial console is + # unusable during the flood and we still need a way to stop `cat` + # after restore. + # 2. Pin the writer (`cat`) to CPU3 so it runs on a different CPU than + # the serial IRQ handler. Having the writer and the IRQ handler on + # separate CPUs makes it likely that the guest driver still has TX + # interrupts enabled at the moment we take the snapshot, which is + # the scenario this test is meant to exercise. + microvm.ssh.check_output( + "for irq in $(ls /proc/irq/ | grep -E '^[0-9]+$'); do" + " echo 1 > /proc/irq/$irq/smp_affinity 2>/dev/null || true;" + " done;" + " SER_IRQ=$(awk '/ttyS0/{print $1}' /proc/interrupts | tr -d :);" + " echo 2 > /proc/irq/$SER_IRQ/smp_affinity;" + " nohup taskset -c 3 cat /dev/zero > /dev/ttyS0 2>/dev/null &" + ) # Give the guest time to start the transmission time.sleep(1) - # Create snapshot. snapshot = microvm.snapshot_full() # Kill base microVM. microvm.kill() @@ -94,11 +116,16 @@ def test_serial_active_tx_snapshot(uvm_plain, microvm_factory): vm.help.enable_console() vm.spawn(serial_out_path=None) vm.restore_from_snapshot(snapshot, resume=True) + + # The restored VM resumes the cat flood. Kill it via SSH so the serial + # console becomes usable again. + vm.ssh.check_output("pkill -9 cat || true") + serial = Serial(vm) serial.open() - - # Send Ctrl-C to the guest to stop the ongoing transmission and regain the shell - serial.tx("\x03", end="") + # We need to send a newline to signal the serial to flush + # the login content. + serial.tx("") # looking for the # prompt at the end serial.rx(vm.distro.shell_prompt) serial.tx("pwd")