|
| 1 | +# Copyright 2026 Amazon.com, Inc. or its affiliates. All Rights Reserved. |
| 2 | +# SPDX-License-Identifier: Apache-2.0 |
| 3 | +"""GDB debugging of a microVM *restored from a snapshot* (the e2b resume path). |
| 4 | +
|
| 5 | +Upstream Firecracker only wires GDB into the fresh-boot path; this exercises the |
| 6 | +restore-path wiring added to `build_microvm_from_snapshot`. It boots a multi-vCPU |
| 7 | +VM on the production kernel built with DWARF (KASLR *on*, as in prod), snapshots |
| 8 | +it, restores into a new VM (file/UFFD-backed, 4K/2M hugetlb), recovers the KASLR |
| 9 | +image slide *from the snapshot itself*, attaches GDB, and checks that we can set a |
| 10 | +breakpoint and print kernel structures/memory across multiple vCPUs. |
| 11 | +
|
| 12 | +KASLR slide recovery: the kernel image is slid by a single offset, so |
| 13 | +`slide = MSR_LSTAR - &entry_SYSCALL_64`, where `MSR_LSTAR` is read from the |
| 14 | +snapshot's saved vcpu MSRs (via `snapshot-editor info-vmstate vcpu-states`) and |
| 15 | +`&entry_SYSCALL_64` is the link-time address from the vmlinux symbols. Applied with |
| 16 | +`add-symbol-file <vmlinux> -o <slide>`. This mirrors how resume-build recovers the |
| 17 | +slide in prod. |
| 18 | +""" |
| 19 | + |
| 20 | +import base64 |
| 21 | +import platform |
| 22 | +import re |
| 23 | +import subprocess |
| 24 | +import tempfile |
| 25 | +import time |
| 26 | +from pathlib import Path |
| 27 | + |
| 28 | +import pytest |
| 29 | + |
| 30 | +import host_tools.cargo_build |
| 31 | +from framework.microvm import HugePagesConfig, MicroVMFactory |
| 32 | + |
# Production kernel (6.1.158) built with DWARF, KASLR on — same config as prod,
# only debug info added. Placed here by the test setup.
KERNEL = Path(__file__).parents[3] / "build/img/x86_64/vmlinux-6.1.158-dwarf"

# Seconds test_gdb_restore waits for the batch-mode gdb process to exit before
# killing it and failing the test.
GDB_TIMEOUT = 40
| 38 | + |
| 39 | + |
def _recover_slide(snapshot_editor, vmstate_path, vmlinux):
    """Recover the KASLR image slide from the snapshot.

    The slide is MSR_LSTAR (the syscall entry, i.e. entry_SYSCALL_64 — kernel
    text, slid with the image) minus the link-time address of entry_SYSCALL_64.
    (IDTR/GDTR are mapped in the fixed cpu_entry_area, not slid with the image,
    so they can't be used.)

    Args:
        snapshot_editor: path to the `snapshot-editor` binary.
        vmstate_path: path to the snapshot's vmstate file.
        vmlinux: path to the vmlinux ELF whose symbol table is consulted.

    Returns:
        The slide as an int (`lstar - link_time_address`).

    Raises:
        subprocess.CalledProcessError: if `snapshot-editor` or `readelf` exits
            non-zero (both are run without a shell, so a failure is reported
            as such instead of being masked by a pipeline).
        AssertionError: if MSR_LSTAR is missing from the vmstate dump or the
            symbol is missing from vmlinux.
    """
    vmstate_dump = subprocess.check_output(
        [
            str(snapshot_editor),
            "info-vmstate",
            "vcpu-states",
            "--vmstate-path",
            str(vmstate_path),
        ],
        text=True,
    )
    # 0xc0000082 is MSR_LSTAR; the first match in the dump is used.
    m = re.search(r"msr index=0xc0000082 data=0x([0-9a-fA-F]+)", vmstate_dump)
    assert m, f"MSR_LSTAR not found in vmstate dump:\n{vmstate_dump[-2000:]}"
    lstar = int(m.group(1), 16)

    # argv list, no shell: paths with spaces/metacharacters are safe, and a
    # readelf failure raises instead of yielding an empty "symbol not found".
    symtab = subprocess.check_output(["readelf", "-sW", str(vmlinux)], text=True)
    # In `readelf -sW` output the symbol name is the last column and the value
    # (link-time address, hex) is the second; take the first matching row.
    link = next(
        (
            fields[1]
            for fields in map(str.split, symtab.splitlines())
            if len(fields) > 1 and fields[-1] == "entry_SYSCALL_64"
        ),
        "",
    )
    assert link, "entry_SYSCALL_64 symbol not found in vmlinux"
    return lstar - int(link, 16)
| 66 | + |
| 67 | + |
def _spawn_gdb(gdb_socket, out_path, commands):
    """Drive gdb in batch mode against FC's gdbstub, writing all output to
    `out_path`.

    No symbol file on the command line — symbols are loaded in-script with the
    recovered slide. A small shell wrapper polls for the socket (created inside
    FC's restore path, which then blocks for the connection) and then `exec`s
    gdb, so once the socket exists the returned handle refers to gdb itself.

    Args:
        gdb_socket: path of the unix socket FC will create for the gdbstub.
        out_path: file that receives gdb's combined stdout/stderr.
        commands: gdb script text, written to a temporary `.gdb` file.

    Returns:
        The `subprocess.Popen` handle of the wrapper/gdb process. The caller
        is responsible for waiting on it or killing it; the wrapper polls
        forever if the socket never appears.
    """
    # Function-scope import so this helper stays self-contained.
    import shlex

    # delete=False: gdb reads the script after this function has returned.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".gdb", delete=False, prefix="fc_gdb_restore_"
    ) as script:
        script.write(commands)

    # Quote every path that ends up in the shell command line so whitespace or
    # shell metacharacters in them cannot break the wait loop or redirection.
    quoted_socket = shlex.quote(str(gdb_socket))
    quoted_script = shlex.quote(script.name)
    quoted_out = shlex.quote(str(out_path))

    return subprocess.Popen(
        f"""
        until [ -S {quoted_socket} ]; do sleep 0.2; done;
        exec gdb -q -batch -x {quoted_script} > {quoted_out} 2>&1
        """,
        shell=True,
    )
| 86 | + |
| 87 | + |
def _prelude(slide, gdb_socket):
    """Return the gdb script header: disable pagination and confirmation, load
    the kernel symbols shifted by `slide`, and attach to `gdb_socket`."""
    pad = " " * 4
    header = (
        "set pagination off",
        "set confirm off",
        f"add-symbol-file {KERNEL} -o {slide}",
        f"target remote {gdb_socket}",
    )
    # Leading newline and trailing pad keep the same layout as a triple-quoted
    # block, so callers can append further indented commands directly.
    return "\n" + "".join(f"{pad}{cmd}\n" for cmd in header) + pad
| 96 | + |
| 97 | + |
# Hugetlbfs guest memory is anonymous MAP_HUGETLB, which the File restore backend
# can't mmap — so the 2M case uses UFFD (also the production backing).
@pytest.mark.parametrize(
    "use_uffd,huge_pages",
    [
        (False, HugePagesConfig.NONE),
        (True, HugePagesConfig.NONE),
        (True, HugePagesConfig.HUGETLBFS_2MB),
    ],
    ids=["file-4k", "uffd-4k", "uffd-2M"],
)
@pytest.mark.skipif(
    platform.machine() != "x86_64", reason="restore-path GDB wiring is x86_64-only"
)
def test_gdb_restore(use_uffd, huge_pages, rootfs):
    """Restore a snapshot under GDB and debug the (KASLR-on) guest kernel.

    Boots a 2-vCPU VM, snapshots it, recovers the KASLR slide from the
    snapshot, restores into a fresh VM with the gdb socket enabled and runs a
    batch gdb script that prints kernel structures/memory, lists threads,
    backtraces thread 2 and hits a breakpoint on `do_idle`.

    The VMs are built from a locally constructed factory, so both VMs and the
    gdb process are torn down in `finally` blocks; otherwise a failing
    assertion would leave them running.
    """
    bin_dir = host_tools.cargo_build.build_gdb()
    if use_uffd:
        host_tools.cargo_build.cargo(
            "build",
            f"--example uffd_on_demand_handler --features gdb "
            f"--target {host_tools.cargo_build.DEFAULT_TARGET}",
            env={"CARGO_TARGET_DIR": str(bin_dir.parents[1])},
        )
    vmfcty = MicroVMFactory(bin_dir)

    base = vmfcty.build(KERNEL, rootfs)
    base.memory_monitor = None
    base.spawn()
    try:
        base.basic_config(vcpu_count=2, mem_size_mib=512, huge_pages=huge_pages)
        base.add_net_iface()
        base.start()
        base.wait_for_ssh_up()
        snapshot = base.snapshot_full()
        slide = _recover_slide(bin_dir / "snapshot-editor", snapshot.vmstate, KERNEL)
    finally:
        # Kill the source VM even if boot/snapshot/slide recovery failed.
        base.kill()

    uvm = vmfcty.build()
    uvm.memory_monitor = None
    uvm.spawn(validate_api=False)
    gdb_proc = None
    try:
        gdb_socket = Path(uvm.jailer.chroot_path(), "gdb.socket")
        gdb_out = Path(uvm.path) / "gdb_out.txt"

        gdb_commands = (
            _prelude(slide, gdb_socket)
            + """
    echo \\n=== STRUCT ===\\n
    print sizeof(struct task_struct)
    print init_task.pid
    print init_task.comm
    echo \\n=== MEMORY ===\\n
    x/2xg &init_task
    echo \\n=== THREADS ===\\n
    info threads
    echo \\n=== THREAD2-BT ===\\n
    thread 2
    bt
    echo \\n=== BREAKPOINT ===\\n
    thread 1
    break do_idle
    continue
    bt
    echo \\n=== DONE ===\\n
    kill
    """
        )
        # gdb is started first: it polls for the socket that the restore call
        # below makes FC create.
        gdb_proc = _spawn_gdb(gdb_socket, gdb_out, gdb_commands)

        uffd_handler_name = "on_demand" if use_uffd else None
        uvm.restore_from_snapshot(
            snapshot,
            resume=True,
            uffd_handler_name=uffd_handler_name,
            gdb_socket_path="gdb.socket",
        )

        timed_out = False
        try:
            gdb_proc.wait(timeout=GDB_TIMEOUT)
        except subprocess.TimeoutExpired:
            timed_out = True
            gdb_proc.kill()

        out = gdb_out.read_text() if gdb_out.exists() else "(no gdb output captured)"
        diag = f"\nslide={slide:#x} timed_out={timed_out}\n--- gdb output ---\n{out}"

        assert not timed_out, f"gdb did not finish in {GDB_TIMEOUT}s:{diag}"
        assert "=== DONE ===" in out, f"gdb script did not run to completion:{diag}"
        assert "swapper" in out, f"init_task.comm (swapper) not read:{diag}"
        assert "$1 = " in out, f"sizeof(struct task_struct) not resolved:{diag}"
        assert (
            "Breakpoint 1, " in out and "do_idle" in out
        ), f"breakpoint on do_idle not hit:{diag}"
        assert out.count("Vcpu ID:") >= 2, f"both vCPUs not enumerated by gdb:{diag}"
        # Only look at the output between the THREAD2-BT and BREAKPOINT markers;
        # otherwise the later breakpoint backtrace would satisfy this check even
        # if the thread-2 backtrace produced no frames.
        thread2_bt = out.partition("=== THREAD2-BT ===")[2].partition(
            "=== BREAKPOINT ==="
        )[0]
        assert "#0 " in thread2_bt, f"per-vCPU backtrace of vCPU 1 not resolved:{diag}"
    finally:
        # Reap gdb (or the shell still polling for the socket) and kill the
        # restored VM regardless of how the test body ended.
        if gdb_proc is not None:
            if gdb_proc.poll() is None:
                gdb_proc.kill()
            gdb_proc.wait()
        uvm.kill()
| 197 | + |
| 198 | + |
# A guest workload that continuously page-faults: repeatedly mmap an anonymous
# 4 MiB region and write every page, attributed to comm "python3". Only the
# most recent mappings are kept (older ones are dropped from the list once more
# than 4 are held), and each iteration sleeps 50 ms — throttled so it faults
# steadily without starving sshd. The script is copied into the guest
# base64-encoded and started with python3 by
# test_gdb_restore_fault_attribution.
_FAULTER_PY = b"""import mmap, time
ms = []
while True:
    m = mmap.mmap(-1, 4 * 1024 * 1024)
    m.write(b"x" * (4 * 1024 * 1024))
    ms.append(m)
    if len(ms) > 4:
        ms.pop(0)
    time.sleep(0.05)
"""
| 212 | + |
| 213 | + |
@pytest.mark.parametrize(
    "huge_pages",
    [HugePagesConfig.NONE, HugePagesConfig.HUGETLBFS_2MB],
    ids=["4k", "2M"],
)
@pytest.mark.skipif(
    platform.machine() != "x86_64", reason="restore-path GDB wiring is x86_64-only"
)
def test_gdb_restore_fault_attribution(huge_pages, rootfs):
    """Useful application: attribute guest page faults during restore to the
    responsible process and VMA — invisible to host/UFFD telemetry. Breaks
    handle_mm_fault on the restored (KASLR-on) VM and reads, per fault, the
    faulting process (vma->vm_mm->owner) + VMA + address from the SysV args.

    Both VMs and the gdb process are torn down in `finally` blocks: the VMs
    come from a locally constructed factory, so a failing assertion would
    otherwise leave them running.
    """
    # Upper bound, in seconds, for the 40-breakpoint gdb script to finish.
    fault_gdb_timeout = 120

    bin_dir = host_tools.cargo_build.build_gdb()
    host_tools.cargo_build.cargo(
        "build",
        f"--example uffd_on_demand_handler --features gdb "
        f"--target {host_tools.cargo_build.DEFAULT_TARGET}",
        env={"CARGO_TARGET_DIR": str(bin_dir.parents[1])},
    )
    vmfcty = MicroVMFactory(bin_dir)

    # Two vCPUs on purpose: both hammer handle_mm_fault, so the gdb event loop has to
    # coalesce concurrent breakpoint hits and drain the stale debug events of the
    # force-paused siblings on each resume. This is the regression test for that drain
    # — without it the pause/resume handshake desyncs under the fault storm and the
    # connection drops.
    base = vmfcty.build(KERNEL, rootfs)
    base.memory_monitor = None
    base.spawn()
    try:
        base.basic_config(vcpu_count=2, mem_size_mib=512, huge_pages=huge_pages)
        base.add_net_iface()
        base.start()
        base.wait_for_ssh_up()

        # Ship the workload base64-encoded so no shell quoting of the script
        # body is needed, then start it detached from the ssh session.
        b64 = base64.b64encode(_FAULTER_PY).decode()
        base.ssh.check_output(f"echo {b64} | base64 -d > /tmp/faulter.py")
        base.ssh.check_output(
            "nohup python3 /tmp/faulter.py >/dev/null 2>&1 </dev/null &"
        )
        # Give the workload time to be running before the snapshot is taken.
        time.sleep(3)
        snapshot = base.snapshot_full()
        slide = _recover_slide(bin_dir / "snapshot-editor", snapshot.vmstate, KERNEL)
    finally:
        # Kill the source VM even if boot/snapshot/slide recovery failed.
        base.kill()

    uvm = vmfcty.build()
    uvm.memory_monitor = None
    uvm.spawn(validate_api=False)
    gdb_proc = None
    try:
        gdb_socket = Path(uvm.jailer.chroot_path(), "gdb.socket")
        gdb_out = Path(uvm.path) / "gdb_fault_out.txt"

        # $rdi / $rsi are the first two SysV arguments at the function's entry
        # address (hence `break *handle_mm_fault`, before any prologue).
        gdb_commands = (
            _prelude(slide, gdb_socket)
            + """
    break *handle_mm_fault
    set $i = 0
    while $i < 40
        continue
        set $vma = (struct vm_area_struct *)$rdi
        set $mm = $vma->vm_mm
        if $mm != 0
            set $task = $mm->owner
            if $task != 0
                printf "FAULT comm=%s pid=%d addr=0x%lx vma=0x%lx-0x%lx flags=0x%lx\\n", $task->comm, $task->pid, $rsi, $vma->vm_start, $vma->vm_end, $vma->vm_flags
            end
        end
        set $i = $i + 1
    end
    echo \\n=== DONE ===\\n
    kill
    """
        )
        # gdb is started first: it polls for the socket that the restore call
        # below makes FC create.
        gdb_proc = _spawn_gdb(gdb_socket, gdb_out, gdb_commands)
        uvm.restore_from_snapshot(
            snapshot,
            resume=True,
            uffd_handler_name="on_demand",
            gdb_socket_path="gdb.socket",
        )

        timed_out = False
        try:
            gdb_proc.wait(timeout=fault_gdb_timeout)
        except subprocess.TimeoutExpired:
            timed_out = True
            gdb_proc.kill()

        out = gdb_out.read_text() if gdb_out.exists() else "(no gdb output captured)"
        diag = f"\nslide={slide:#x} timed_out={timed_out}\n--- gdb output ---\n{out}"

        assert not timed_out, f"gdb did not finish in {fault_gdb_timeout}s:{diag}"
        assert "=== DONE ===" in out, f"gdb script did not run to completion:{diag}"

        faults = [ln for ln in out.splitlines() if ln.startswith("FAULT comm=")]
        print("\nGuest faults attributed during restore (sample):")
        print("\n".join(faults[:8]))
        assert len(faults) >= 10, f"too few faults captured ({len(faults)}):{diag}"
        assert any(
            "comm=python3" in ln for ln in faults
        ), f"workload process not attributed:{diag}"
        vmas = re.findall(r"vma=0x([0-9a-f]+)-0x([0-9a-f]+)", out)
        assert vmas and all(
            int(s, 16) < int(e, 16) for s, e in vmas
        ), f"no valid VMA ranges captured:{diag}"
    finally:
        # Reap gdb (or the shell still polling for the socket) and kill the
        # restored VM regardless of how the test body ended.
        if gdb_proc is not None:
            if gdb_proc.poll() is None:
                gdb_proc.kill()
            gdb_proc.wait()
        uvm.kill()
0 commit comments