Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 42 additions & 10 deletions .buildkite/pipeline_cross.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,13 @@
"m6i.metal",
"m7i.metal-24xl",
"m7i.metal-48xl",
"m8i.metal-48xl",
"m6a.metal",
"m7a.metal-48xl",
]
instances_aarch64 = ["m7g.metal"]
instances_aarch64 = ["m6g.metal", "m7g.metal", "m8g.metal-24xl"]
restore_only_platforms = [("al2023", "linux_6.18")]
x86_64_platforms = DEFAULT_PLATFORMS + restore_only_platforms
commands = [
"./tools/devtool -y test --no-build --no-archive -- -m nonci -n4 integration_tests/functional/test_snapshot_phase1.py",
# punch holes in mem snapshot tiles and tar them so they are preserved in S3
Expand All @@ -35,31 +38,56 @@
"mkdir -pv snapshots",
"tar cSvf snapshots/{instance}_{kv}.tar snapshot_artifacts",
]
pipeline.build_group(

def create_step_key(instance, kv):
    """Buildkite key for a snapshot-create step.

    Keys may only contain [A-Za-z0-9_\\-:], so every character outside
    that set -- today, the dots in instance names (m5n.metal) and kernel
    versions (linux_5.10) -- is sanitized to an underscore. Tarball paths
    stay unchanged.

    :param instance: EC2 instance type, e.g. "m6i.metal"
    :param kv: host kernel tag, e.g. "linux_5.10"
    :return: sanitized, deterministic Buildkite step key
    """
    # local import keeps this helper self-contained within the pipeline script
    import re

    # Substitute the whole disallowed class (not just "."), so a future
    # instance/kernel naming scheme cannot silently produce an invalid key.
    return re.sub(r"[^A-Za-z0-9_\-:]", "_", f"snap-create-{instance}-{kv}")

# Key each snapshot-create step so restore steps can depend on the
# specific source snapshot they need, rather than waiting for every
# snapshot-create step to finish. `build_group` doesn't sanitize
# substituted key values, so we set the final key after it fans out.
x86_create = pipeline.build_group(
"snapshot-create",
commands,
timeout=30,
artifact_paths="snapshots/**/*",
instances=instances_x86_64,
platforms=DEFAULT_PLATFORMS,
)
pipeline.add_step("wait")

# allow-list of what instances can be restores on what other instances (in
# addition to itself)
# https://github.com/firecracker-microvm/firecracker/blob/main/docs/snapshotting/snapshot-support.md#where-can-i-resume-my-snapshots
aarch64_platforms = [("al2023", "linux_6.1")]
aarch64_create = pipeline.build_group(
"snapshot-create-aarch64",
commands,
timeout=30,
artifact_paths="snapshots/**/*",
instances=instances_aarch64,
platforms=aarch64_platforms,
)
for grp in (x86_create, aarch64_create):
for s in grp["steps"]:
s["key"] = create_step_key(s["agents"]["instance"], s["agents"]["kv"])

# allow-list of what instances can be restored on what other instances (in
# addition to itself). aarch64 is restricted to same-instance restores.
supported = {
"m5n.metal": ["m6i.metal"],
"m6i.metal": ["m5n.metal"],
}

# https://github.com/firecracker-microvm/firecracker/blob/main/docs/kernel-policy.md#experimental-snapshot-compatibility-across-kernel-versions
aarch64_platforms = [("al2023", "linux_6.1")]
aarch64_all_platforms = aarch64_platforms + restore_only_platforms
perms_aarch64 = itertools.product(
instances_aarch64, aarch64_platforms, instances_aarch64, aarch64_platforms
instances_aarch64, aarch64_platforms, instances_aarch64, aarch64_all_platforms
)

perms_x86_64 = itertools.product(
instances_x86_64, DEFAULT_PLATFORMS, instances_x86_64, DEFAULT_PLATFORMS
instances_x86_64, DEFAULT_PLATFORMS, instances_x86_64, x86_64_platforms
)
steps = []
for (
Expand All @@ -74,6 +102,9 @@
# newer -> older is not supported, and does not work
if src_kv > dst_kv:
continue
# only test cross-kernel restore between adjacent kernel versions
if src_kv == "linux_5.10" and dst_kv == "linux_6.18":
continue
if src_instance != dst_instance and dst_instance not in supported.get(
src_instance, []
):
Expand All @@ -96,6 +127,7 @@
"label": f"snapshot-restore-src-{src_instance}-{src_kv}-dst-{dst_instance}-{dst_kv}",
"timeout": 30,
"agents": {"instance": dst_instance, "kv": dst_kv, "os": dst_os},
"depends_on": [create_step_key(src_instance, src_kv)],
**per_instance,
}
steps.append(step)
Expand Down
31 changes: 24 additions & 7 deletions tests/framework/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
# SPDX-License-Identifier: Apache-2.0
"""Generic utility functions that are used in the framework."""

import base64
import errno
import hashlib
import json
import logging
import os
Expand Down Expand Up @@ -555,13 +557,15 @@ def start_screen_process(screen_log, session_name, binary_path, binary_params):


def guest_run_fio_iteration(ssh_connection, iteration):
    """Run a FIO workload on a microVM and verify IO completed successfully.

    Runs a synchronous, time-based (10s) random-read fio job against
    /dev/vda in read-only mode (the root device is never written), then
    parses fio's JSON output and asserts that at least one byte was
    actually read through the block device.

    :param ssh_connection: SSH connection to the guest microVM
    :param iteration: iteration number, used only in the failure message
    """
    fio = (
        "fio --filename=/dev/vda --direct=1 --rw=randread --bs=4k "
        "--ioengine=libaio --iodepth=16 --runtime=10 --numjobs=4 --time_based "
        "--group_reporting --name=iops-test-job --readonly --output-format=json"
    )
    # check_output raises on non-zero exit, so only the happy path reaches
    # the JSON parse below.
    _, stdout, _ = ssh_connection.check_output(fio)
    total_read = json.loads(stdout)["jobs"][0]["read"]["io_bytes"]
    assert total_read > 0, f"fio iteration {iteration}: no bytes read from block device"


def check_filesystem(ssh_connection, disk_fmt, disk):
Expand All @@ -576,6 +580,19 @@ def check_entropy(ssh_connection):
ssh_connection.check_output("dd if=/dev/hwrng of=/dev/null bs=4096 count=1")


def check_network_data_integrity(ssh_connection, size_bytes=64 * 1024):
    """Verify end-to-end data integrity over the guest network.

    Sends random bytes to the guest through SSH and compares the sha256
    computed inside the guest against the hash computed on the host,
    exercising the virtio-net RX path end-to-end.

    :param ssh_connection: SSH connection to the guest microVM
    :param size_bytes: size of the random payload to transfer
    """
    blob = os.urandom(size_bytes)
    host_hash = hashlib.sha256(blob).hexdigest()
    b64 = base64.b64encode(blob).decode("ascii")
    # The guest decodes the base64 payload and hashes the raw bytes.
    _, stdout, _ = ssh_connection.check_output(f"echo {b64} | base64 -d | sha256sum")
    # sha256sum prints "<hash>  <name>"; keep only the hash field.
    guest_hash = stdout.split()[0]
    assert (
        guest_hash == host_hash
    ), f"Guest hash {guest_hash} does not match host hash {host_hash}"


@retry(wait=wait_fixed(0.5), stop=stop_after_attempt(5), reraise=True)
def wait_process_running(process):
"""Wait for a process to run.
Expand Down
14 changes: 10 additions & 4 deletions tests/integration_tests/functional/test_snapshot_phase1.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
"""

import json
import platform
import re

import pytest
Expand All @@ -18,9 +17,6 @@
)
from framework.utils_cpu_templates import get_cpu_template_name

if platform.machine() != "x86_64":
pytestmark = pytest.mark.skip("only x86_64 architecture supported")

# Default IPv4 address to route MMDS requests.
IPV4_ADDRESS = "169.254.169.254"
NET_IFACE_FOR_MMDS = "eth3"
Expand Down Expand Up @@ -58,6 +54,8 @@ def test_snapshot_phase1(
configure_mmds(vm, ["eth3"], version="V2")
# Add a memory balloon.
vm.api.balloon.put(amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=1)
# Add an entropy device.
vm.api.entropy.put()

vm.start()

Expand Down Expand Up @@ -95,6 +93,14 @@ def test_snapshot_phase1(
_, stdout, _ = vm.ssh.run(cmd)
assert json.loads(stdout) == data_store

# Record guest CLOCK_MONOTONIC just before snapshotting. The cross-kernel
# restore test reads this back and asserts the clock didn't jump forward
# by the pipeline-elapsed time, which would indicate a kvm-clock regression
# (see a1fd537f9 "fix(kvm-clock): do not jump monotonic clock on restore").
vm.ssh.check_output(
"python3 -c 'import time; print(time.monotonic())' > /tmp/monotonic-before"
)

# Copy snapshot files to be published to S3 for the 2nd part of the test
# Create snapshot artifacts directory specific for the kernel version used.
snapshot = vm.snapshot_full()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@

import json
import logging
import platform
from pathlib import Path

import pytest

from framework.defs import FC_WORKSPACE_DIR
from framework.utils import (
check_entropy,
check_network_data_integrity,
generate_mmds_get_request,
generate_mmds_session_token,
guest_run_fio_iteration,
Expand All @@ -27,6 +28,23 @@
pytestmark = pytest.mark.nonci


def _check_guest_monotonic_did_not_jump(ssh_connection, max_delta_sec=10):
# Phase1 recorded CLOCK_MONOTONIC to /tmp/monotonic-before just before
# snapshotting. Firecracker is supposed to resume MONOTONIC from capture
# time, so the delta here should be near zero regardless of how long
# phase1 and restore are apart in the pipeline. A large delta indicates
# MONOTONIC jumped forward across the snapshot - a kvm-clock regression
# that could surface only on some host-kernel combinations.
_, before_str, _ = ssh_connection.check_output("cat /tmp/monotonic-before")
_, after_str, _ = ssh_connection.check_output(
"python3 -c 'import time; print(time.monotonic())'"
)
delta = float(after_str.strip()) - float(before_str.strip())
assert (
0 <= delta <= max_delta_sec
), f"Guest MONOTONIC jumped {delta:.3f}s across snapshot (max {max_delta_sec}s)"


def _test_balloon(microvm):
# Check memory usage.
first_reading = get_stable_rss_mem(microvm)
Expand Down Expand Up @@ -71,12 +89,11 @@ def get_snapshot_dirs():
"""Get all the snapshot directories"""
snapshot_root_name = "snapshot_artifacts"
snapshot_root_dir = Path(FC_WORKSPACE_DIR) / snapshot_root_name
cpu_templates = []
if platform.machine() == "x86_64":
cpu_templates = ["None"]
cpu_templates += get_supported_cpu_templates()
cpu_templates = ["None"] + get_supported_cpu_templates()
for cpu_template in cpu_templates:
for snapshot_dir in snapshot_root_dir.glob(f"*_{cpu_template}_guest_snapshot"):
for snapshot_dir in snapshot_root_dir.glob(
f"**/*_{cpu_template}_guest_snapshot"
):
assert snapshot_dir.is_dir()
yield pytest.param(snapshot_dir, id=snapshot_dir.name)

Expand All @@ -100,7 +117,11 @@ def test_snap_restore_from_artifacts(
# in the snapshot root dir.
logger.info("Working with snapshot artifacts in %s.", snapshot_dir)

vm = microvm_factory.build()
# Skip memory monitor: the balloon inflation below fragments the guest
# VMA via discard_range's MAP_FIXED anonymous mmap workaround (used only
# for private file-backed mappings from snapshot restore), defeating
# MemoryMonitor.is_guest_mem. Cross-kernel test, not overhead.
vm = microvm_factory.build(monitor_memory=False)
vm.time_api_requests = False
vm.spawn()
logger.info("Loading microVM from snapshot...")
Expand All @@ -115,6 +136,15 @@ def test_snap_restore_from_artifacts(
logger.info("Testing net device %s...", iface["iface"].dev_name)
vm.ssh_iface(idx).check_output("true")

# Check MONOTONIC before any other post-restore activity, so the delta
# is bounded by the few seconds of post-resume setup rather than the
# full test runtime.
logger.info("Testing guest MONOTONIC did not jump across snapshot...")
_check_guest_monotonic_did_not_jump(vm.ssh)

logger.info("Testing network data integrity...")
check_network_data_integrity(vm.ssh)

logger.info("Testing data store behavior...")
_test_mmds(vm, vm.iface["eth3"]["iface"])

Expand All @@ -124,9 +154,10 @@ def test_snap_restore_from_artifacts(
logger.info("Testing vsock device...")
check_vsock_device(vm, bin_vsock_path, test_fc_session_root_path, vm.ssh)

# Run fio on the guest.
# TODO: check the result of FIO or use fsck to check that the root device is
# not corrupted. No obvious errors will be returned here.
logger.info("Testing block device via fio...")
guest_run_fio_iteration(vm.ssh, 0)

logger.info("Testing entropy...")
check_entropy(vm.ssh)

vm.kill()
Loading