From 02d76d4a21b175edc1ffda874bd3e5372a66176a Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Thu, 23 Apr 2026 13:50:31 +0000 Subject: [PATCH 01/10] fix(test): fix snapshot artifact discovery on aarch64 Two bugs were preventing cross-kernel restore tests from running: 1. The glob pattern only searched one level deep under snapshot_artifacts/, but Phase 1 artifacts are nested under an additional test-name directory. Use recursive glob (**/) to find snapshot directories regardless of nesting depth. 2. The "None" CPU template was only added to the search list on x86_64, so on aarch64 instances where get_supported_cpu_templates() returns an empty list (e.g. Neoverse N1), the loop yielded zero pytest parameters and the test was silently skipped. Always include "None" in the search list. Signed-off-by: Jack Thomson --- .../functional/test_snapshot_restore_cross_kernel.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py b/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py index 253502a2d1f..80596a8166e 100644 --- a/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py +++ b/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py @@ -5,7 +5,6 @@ import json import logging -import platform from pathlib import Path import pytest @@ -71,12 +70,9 @@ def get_snapshot_dirs(): """Get all the snapshot directories""" snapshot_root_name = "snapshot_artifacts" snapshot_root_dir = Path(FC_WORKSPACE_DIR) / snapshot_root_name - cpu_templates = [] - if platform.machine() == "x86_64": - cpu_templates = ["None"] - cpu_templates += get_supported_cpu_templates() + cpu_templates = ["None"] + get_supported_cpu_templates() for cpu_template in cpu_templates: - for snapshot_dir in snapshot_root_dir.glob(f"*_{cpu_template}_guest_snapshot"): + for snapshot_dir in snapshot_root_dir.glob(f"**/*_{cpu_template}_guest_snapshot"): assert 
snapshot_dir.is_dir() yield pytest.param(snapshot_dir, id=snapshot_dir.name) From 3a53ff5ba74c74ef2f6e5f041f2acf1c3a7878f2 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Thu, 23 Apr 2026 13:53:30 +0000 Subject: [PATCH 02/10] feat(test): add linux 6.1 to 6.18 cross-kernel snapshot restore testing Add AL2023/linux_6.18 as a restore-only platform in the cross-snapshot pipeline for both x86_64 and aarch64. Snapshots created on 6.1 hosts are restored on 6.18 hosts to validate cross-kernel compatibility. The 6.18 platform is scoped to pipeline_cross.py only since 6.18 agents exist exclusively in the private Buildkite queue. Signed-off-by: Jack Thomson --- .buildkite/pipeline_cross.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.buildkite/pipeline_cross.py b/.buildkite/pipeline_cross.py index bbe3a98b7cf..1228c27249c 100755 --- a/.buildkite/pipeline_cross.py +++ b/.buildkite/pipeline_cross.py @@ -27,6 +27,8 @@ "m7a.metal-48xl", ] instances_aarch64 = ["m7g.metal"] + restore_only_platforms = [("al2023", "linux_6.18")] + x86_64_platforms = DEFAULT_PLATFORMS + restore_only_platforms commands = [ "./tools/devtool -y test --no-build --no-archive -- -m nonci -n4 integration_tests/functional/test_snapshot_phase1.py", # punch holes in mem snapshot tiles and tar them so they are preserved in S3 @@ -54,12 +56,13 @@ # https://github.com/firecracker-microvm/firecracker/blob/main/docs/kernel-policy.md#experimental-snapshot-compatibility-across-kernel-versions aarch64_platforms = [("al2023", "linux_6.1")] + aarch64_all_platforms = aarch64_platforms + restore_only_platforms perms_aarch64 = itertools.product( - instances_aarch64, aarch64_platforms, instances_aarch64, aarch64_platforms + instances_aarch64, aarch64_platforms, instances_aarch64, aarch64_all_platforms ) perms_x86_64 = itertools.product( - instances_x86_64, DEFAULT_PLATFORMS, instances_x86_64, DEFAULT_PLATFORMS + instances_x86_64, DEFAULT_PLATFORMS, instances_x86_64, x86_64_platforms ) steps 
= [] for ( @@ -74,6 +77,9 @@ # newer -> older is not supported, and does not work if src_kv > dst_kv: continue + # only test cross-kernel restore between adjacent kernel versions + if src_kv == "linux_5.10" and dst_kv == "linux_6.18": + continue if src_instance != dst_instance and dst_instance not in supported.get( src_instance, [] ): From 565d4b69cb2d2d921ce52b29230e51887153f721 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Thu, 23 Apr 2026 14:04:45 +0000 Subject: [PATCH 03/10] fix(test): verify block device IO after cross-kernel snapshot restore guest_run_fio_iteration ran fio in the background and only checked that the process launched, not that IO actually succeeded. Run fio in the foreground with JSON output and assert that bytes were read from the block device. This addresses the TODO about verifying the root device is not corrupted after snapshot restore. Signed-off-by: Jack Thomson --- tests/framework/utils.py | 16 +++++++++------- .../test_snapshot_restore_cross_kernel.py | 8 ++++---- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/tests/framework/utils.py b/tests/framework/utils.py index 0f09bac5a47..727e8e125ce 100644 --- a/tests/framework/utils.py +++ b/tests/framework/utils.py @@ -555,13 +555,15 @@ def start_screen_process(screen_log, session_name, binary_path, binary_params): def guest_run_fio_iteration(ssh_connection, iteration): - """Start FIO workload into a microVM.""" - fio = """fio --filename=/dev/vda --direct=1 --rw=randread --bs=4k \ - --ioengine=libaio --iodepth=16 --runtime=10 --numjobs=4 --time_based \ - --group_reporting --name=iops-test-job --eta-newline=1 --readonly \ - --output /tmp/fio{} > /dev/null &""".format(iteration) - exit_code, _, stderr = ssh_connection.run(fio) - assert exit_code == 0, stderr + """Run FIO workload on a microVM and verify IO completed successfully.""" + fio = ( + "fio --filename=/dev/vda --direct=1 --rw=randread --bs=4k " + "--ioengine=libaio --iodepth=16 --runtime=10 --numjobs=4 
--time_based " + "--group_reporting --name=iops-test-job --readonly --output-format=json" + ) + _, stdout, _ = ssh_connection.check_output(fio) + total_read = json.loads(stdout)["jobs"][0]["read"]["io_bytes"] + assert total_read > 0, f"fio iteration {iteration}: no bytes read from block device" def check_filesystem(ssh_connection, disk_fmt, disk): diff --git a/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py b/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py index 80596a8166e..c183c6ba97f 100644 --- a/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py +++ b/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py @@ -72,7 +72,9 @@ def get_snapshot_dirs(): snapshot_root_dir = Path(FC_WORKSPACE_DIR) / snapshot_root_name cpu_templates = ["None"] + get_supported_cpu_templates() for cpu_template in cpu_templates: - for snapshot_dir in snapshot_root_dir.glob(f"**/*_{cpu_template}_guest_snapshot"): + for snapshot_dir in snapshot_root_dir.glob( + f"**/*_{cpu_template}_guest_snapshot" + ): assert snapshot_dir.is_dir() yield pytest.param(snapshot_dir, id=snapshot_dir.name) @@ -120,9 +122,7 @@ def test_snap_restore_from_artifacts( logger.info("Testing vsock device...") check_vsock_device(vm, bin_vsock_path, test_fc_session_root_path, vm.ssh) - # Run fio on the guest. - # TODO: check the result of FIO or use fsck to check that the root device is - # not corrupted. No obvious errors will be returned here. + logger.info("Testing block device via fio...") guest_run_fio_iteration(vm.ssh, 0) vm.kill() From cdc4bd37a7c55aa850d26a7e7a73172d54c895fd Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Thu, 23 Apr 2026 14:08:26 +0000 Subject: [PATCH 04/10] test(snapshot): verify entropy source after cross-kernel restore Check that /dev/hwrng is functional after restoring a snapshot on a different host kernel version. 
Signed-off-by: Jack Thomson --- tests/integration_tests/functional/test_snapshot_phase1.py | 2 ++ .../functional/test_snapshot_restore_cross_kernel.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/tests/integration_tests/functional/test_snapshot_phase1.py b/tests/integration_tests/functional/test_snapshot_phase1.py index 9bdfc9d0ce4..01c8fc28583 100644 --- a/tests/integration_tests/functional/test_snapshot_phase1.py +++ b/tests/integration_tests/functional/test_snapshot_phase1.py @@ -58,6 +58,8 @@ def test_snapshot_phase1( configure_mmds(vm, ["eth3"], version="V2") # Add a memory balloon. vm.api.balloon.put(amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=1) + # Add an entropy device. + vm.api.entropy.put() vm.start() diff --git a/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py b/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py index c183c6ba97f..7adebcada70 100644 --- a/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py +++ b/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py @@ -11,6 +11,7 @@ from framework.defs import FC_WORKSPACE_DIR from framework.utils import ( + check_entropy, generate_mmds_get_request, generate_mmds_session_token, guest_run_fio_iteration, @@ -125,4 +126,7 @@ def test_snap_restore_from_artifacts( logger.info("Testing block device via fio...") guest_run_fio_iteration(vm.ssh, 0) + logger.info("Testing entropy...") + check_entropy(vm.ssh) + vm.kill() From 18a07f72324e518355fcb624069f9dbc19a5c34a Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Thu, 23 Apr 2026 14:08:57 +0000 Subject: [PATCH 05/10] test(snapshot): verify MONOTONIC survives cross-kernel restore Record guest CLOCK_MONOTONIC in phase1 just before snapshotting, then read it back after cross-kernel restore and assert the delta is small. 
Firecracker is supposed to resume MONOTONIC from capture time (see a1fd537f9 "fix(kvm-clock): do not jump monotonic clock on restore"), so the delta should be near zero regardless of how long phase1 and restore are apart in the pipeline. A large delta indicates MONOTONIC jumped forward - a kvm-clock regression that could surface only on some host-kernel combinations. Signed-off-by: Jack Thomson --- .../functional/test_snapshot_phase1.py | 8 +++++++ .../test_snapshot_restore_cross_kernel.py | 23 +++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/tests/integration_tests/functional/test_snapshot_phase1.py b/tests/integration_tests/functional/test_snapshot_phase1.py index 01c8fc28583..825febddc24 100644 --- a/tests/integration_tests/functional/test_snapshot_phase1.py +++ b/tests/integration_tests/functional/test_snapshot_phase1.py @@ -97,6 +97,14 @@ def test_snapshot_phase1( _, stdout, _ = vm.ssh.run(cmd) assert json.loads(stdout) == data_store + # Record guest CLOCK_MONOTONIC just before snapshotting. The cross-kernel + # restore test reads this back and asserts the clock didn't jump forward + # by the pipeline-elapsed time, which would indicate a kvm-clock regression + # (see a1fd537f9 "fix(kvm-clock): do not jump monotonic clock on restore"). + vm.ssh.check_output( + "python3 -c 'import time; print(time.monotonic())' > /tmp/monotonic-before" + ) + # Copy snapshot files to be published to S3 for the 2nd part of the test # Create snapshot artifacts directory specific for the kernel version used. 
snapshot = vm.snapshot_full() diff --git a/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py b/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py index 7adebcada70..8cf9f45edb7 100644 --- a/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py +++ b/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py @@ -27,6 +27,23 @@ pytestmark = pytest.mark.nonci +def _check_guest_monotonic_did_not_jump(ssh_connection, max_delta_sec=10): + # Phase1 recorded CLOCK_MONOTONIC to /tmp/monotonic-before just before + # snapshotting. Firecracker is supposed to resume MONOTONIC from capture + # time, so the delta here should be near zero regardless of how long + # phase1 and restore are apart in the pipeline. A large delta indicates + # MONOTONIC jumped forward across the snapshot - a kvm-clock regression + # that could surface only on some host-kernel combinations. + _, before_str, _ = ssh_connection.check_output("cat /tmp/monotonic-before") + _, after_str, _ = ssh_connection.check_output( + "python3 -c 'import time; print(time.monotonic())'" + ) + delta = float(after_str.strip()) - float(before_str.strip()) + assert ( + 0 <= delta <= max_delta_sec + ), f"Guest MONOTONIC jumped {delta:.3f}s across snapshot (max {max_delta_sec}s)" + + def _test_balloon(microvm): # Check memory usage. first_reading = get_stable_rss_mem(microvm) @@ -114,6 +131,12 @@ def test_snap_restore_from_artifacts( logger.info("Testing net device %s...", iface["iface"].dev_name) vm.ssh_iface(idx).check_output("true") + # Check MONOTONIC before any other post-restore activity, so the delta + # is bounded by the few seconds of post-resume setup rather than the + # full test runtime. 
+ logger.info("Testing guest MONOTONIC did not jump across snapshot...") + _check_guest_monotonic_did_not_jump(vm.ssh) + logger.info("Testing data store behavior...") _test_mmds(vm, vm.iface["eth3"]["iface"]) From 3a0a1926e2a8b67fc870ab1f655d13d9c511e084 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Thu, 23 Apr 2026 14:09:21 +0000 Subject: [PATCH 06/10] test(snapshot): verify network data integrity after cross-kernel restore Add check_network_data_integrity helper that generates random bytes on the host, pushes them to the guest via SSH command-line (base64-encoded to survive argv), has the guest decode and sha256 them, and asserts the guest-side hash matches the host-side hash. This exercises the full virtio-net RX path end-to-end beyond simple connectivity checks. Signed-off-by: Jack Thomson --- tests/framework/utils.py | 15 +++++++++++++++ .../test_snapshot_restore_cross_kernel.py | 4 ++++ 2 files changed, 19 insertions(+) diff --git a/tests/framework/utils.py b/tests/framework/utils.py index 727e8e125ce..bf5c1b3d715 100644 --- a/tests/framework/utils.py +++ b/tests/framework/utils.py @@ -2,7 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 """Generic utility functions that are used in the framework.""" +import base64 import errno +import hashlib import json import logging import os @@ -578,6 +580,19 @@ def check_entropy(ssh_connection): ssh_connection.check_output("dd if=/dev/hwrng of=/dev/null bs=4096 count=1") +def check_network_data_integrity(ssh_connection, size_bytes=64 * 1024): + """Push random bytes to the guest over SSH and verify the guest-side sha256 + matches the host-side hash. 
Exercises the virtio-net RX path end-to-end.""" + payload = os.urandom(size_bytes) + host_hash = hashlib.sha256(payload).hexdigest() + b64 = base64.b64encode(payload).decode("ascii") + _, stdout, _ = ssh_connection.check_output(f"echo {b64} | base64 -d | sha256sum") + guest_hash = stdout.strip().split()[0] + assert ( + guest_hash == host_hash + ), f"Guest hash {guest_hash} does not match host hash {host_hash}" + + @retry(wait=wait_fixed(0.5), stop=stop_after_attempt(5), reraise=True) def wait_process_running(process): """Wait for a process to run. diff --git a/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py b/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py index 8cf9f45edb7..a06905244e6 100644 --- a/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py +++ b/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py @@ -12,6 +12,7 @@ from framework.defs import FC_WORKSPACE_DIR from framework.utils import ( check_entropy, + check_network_data_integrity, generate_mmds_get_request, generate_mmds_session_token, guest_run_fio_iteration, @@ -137,6 +138,9 @@ def test_snap_restore_from_artifacts( logger.info("Testing guest MONOTONIC did not jump across snapshot...") _check_guest_monotonic_did_not_jump(vm.ssh) + logger.info("Testing network data integrity...") + check_network_data_integrity(vm.ssh) + logger.info("Testing data store behavior...") _test_mmds(vm, vm.iface["eth3"]["iface"]) From 2c105b6be3b2c618f52976e9378055e0ac6796c8 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Thu, 23 Apr 2026 14:48:58 +0000 Subject: [PATCH 07/10] fix(test): disable memory monitor in cross-kernel snapshot restore test MemoryMonitor's is_guest_mem heuristic matches a single guest-sized VMA, but _test_balloon inflates the balloon after restore, and GuestRegionMmapExt::discard_range overlays MAP_FIXED anonymous mmaps on the reclaimed ranges (a workaround specific to private file-backed mappings from snapshot 
restore). This fragments the 512 MiB guest VMA into ~190 smaller ones, none of which match the heuristic, and their RSS (~336 MiB) is counted as VMM overhead. This is the only cross-kernel test that inflates the balloon post-restore, and its purpose is validating cross-kernel compatibility, not VMM memory overhead, so the monitor is skipped here as it already is in test_snapshot_phase1. Signed-off-by: Jack Thomson --- .../functional/test_snapshot_restore_cross_kernel.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py b/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py index a06905244e6..80e93c2931b 100644 --- a/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py +++ b/tests/integration_tests/functional/test_snapshot_restore_cross_kernel.py @@ -117,7 +117,11 @@ def test_snap_restore_from_artifacts( # in the snapshot root dir. logger.info("Working with snapshot artifacts in %s.", snapshot_dir) - vm = microvm_factory.build() + # Skip memory monitor: the balloon inflation below fragments the guest + # VMA via discard_range's MAP_FIXED anonymous mmap workaround (used only + # for private file-backed mappings from snapshot restore), defeating + # MemoryMonitor.is_guest_mem. Cross-kernel test, not overhead. + vm = microvm_factory.build(monitor_memory=False) vm.time_api_requests = False vm.spawn() logger.info("Loading microVM from snapshot...") From cf2a74ef67d8c6d77d9753199f8a49b74d40d06d Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Thu, 23 Apr 2026 15:49:16 +0000 Subject: [PATCH 08/10] ci(snapshot): create aarch64 snapshots in cross-restore pipeline The perms_aarch64 loop expects aarch64 phase1 snapshots to exist for restore steps to consume, but the snapshot-create group was x86-only, so every aarch64 restore step failed at artifact download. Add an aarch64 snapshot-create group and enable test_snapshot_phase1 on arm.
Signed-off-by: Jack Thomson --- .buildkite/pipeline_cross.py | 14 +++++++++++--- .../functional/test_snapshot_phase1.py | 4 ---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.buildkite/pipeline_cross.py b/.buildkite/pipeline_cross.py index 1228c27249c..b8ea0bbd2ca 100755 --- a/.buildkite/pipeline_cross.py +++ b/.buildkite/pipeline_cross.py @@ -45,6 +45,17 @@ instances=instances_x86_64, platforms=DEFAULT_PLATFORMS, ) + + # https://github.com/firecracker-microvm/firecracker/blob/main/docs/snapshotting/snapshot-support.md#where-can-i-resume-my-snapshots + aarch64_platforms = [("al2023", "linux_6.1")] + pipeline.build_group( + "snapshot-create-aarch64", + commands, + timeout=30, + artifact_paths="snapshots/**/*", + instances=instances_aarch64, + platforms=aarch64_platforms, + ) pipeline.add_step("wait") # allow-list of what instances can be restores on what other instances (in @@ -53,9 +64,6 @@ "m5n.metal": ["m6i.metal"], "m6i.metal": ["m5n.metal"], } - - # https://github.com/firecracker-microvm/firecracker/blob/main/docs/kernel-policy.md#experimental-snapshot-compatibility-across-kernel-versions - aarch64_platforms = [("al2023", "linux_6.1")] aarch64_all_platforms = aarch64_platforms + restore_only_platforms perms_aarch64 = itertools.product( instances_aarch64, aarch64_platforms, instances_aarch64, aarch64_all_platforms diff --git a/tests/integration_tests/functional/test_snapshot_phase1.py b/tests/integration_tests/functional/test_snapshot_phase1.py index 825febddc24..89bed92ee8d 100644 --- a/tests/integration_tests/functional/test_snapshot_phase1.py +++ b/tests/integration_tests/functional/test_snapshot_phase1.py @@ -6,7 +6,6 @@ """ import json -import platform import re import pytest @@ -18,9 +17,6 @@ ) from framework.utils_cpu_templates import get_cpu_template_name -if platform.machine() != "x86_64": - pytestmark = pytest.mark.skip("only x86_64 architecture supported") - # Default IPv4 address to route MMDS requests. 
IPV4_ADDRESS = "169.254.169.254" NET_IFACE_FOR_MMDS = "eth3" From 659e127b711ee092d0bcddb7ffd2949744b0250d Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Thu, 23 Apr 2026 16:08:19 +0000 Subject: [PATCH 09/10] ci(snapshot): extend cross-restore coverage to more instance types Add m8i.metal-48xl (Intel Granite Rapids), m6g.metal (Graviton2) and m8g.metal-24xl (Graviton4) to the cross-restore pipeline. These pick up same-instance cross-kernel coverage only; cross-instance restore permutations are unchanged. Signed-off-by: Jack Thomson --- .buildkite/pipeline_cross.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.buildkite/pipeline_cross.py b/.buildkite/pipeline_cross.py index b8ea0bbd2ca..3b5c13917c3 100755 --- a/.buildkite/pipeline_cross.py +++ b/.buildkite/pipeline_cross.py @@ -23,10 +23,11 @@ "m6i.metal", "m7i.metal-24xl", "m7i.metal-48xl", + "m8i.metal-48xl", "m6a.metal", "m7a.metal-48xl", ] - instances_aarch64 = ["m7g.metal"] + instances_aarch64 = ["m6g.metal", "m7g.metal", "m8g.metal-24xl"] restore_only_platforms = [("al2023", "linux_6.18")] x86_64_platforms = DEFAULT_PLATFORMS + restore_only_platforms commands = [ @@ -58,8 +59,8 @@ ) pipeline.add_step("wait") - # allow-list of what instances can be restores on what other instances (in - # addition to itself) + # allow-list of what instances can be restored on what other instances (in + # addition to itself). aarch64 is restricted to same-instance restores. supported = { "m5n.metal": ["m6i.metal"], "m6i.metal": ["m5n.metal"], From b19c381780f500b8f1ccba411a251d7a691b7b87 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Thu, 23 Apr 2026 17:20:38 +0000 Subject: [PATCH 10/10] ci(snapshot): restore per-source depends_on instead of global wait Previously every restore step waited for the entire snapshot-create group to finish via a pipeline-wide wait step. 
Each restore only needs its own source snapshot, so key each create step by instance/kv and have each restore depends_on the specific source it consumes. Restores now start as soon as their source snapshot is ready. Signed-off-by: Jack Thomson --- .buildkite/pipeline_cross.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/.buildkite/pipeline_cross.py b/.buildkite/pipeline_cross.py index 3b5c13917c3..4563736e312 100755 --- a/.buildkite/pipeline_cross.py +++ b/.buildkite/pipeline_cross.py @@ -38,7 +38,21 @@ "mkdir -pv snapshots", "tar cSvf snapshots/{instance}_{kv}.tar snapshot_artifacts", ] - pipeline.build_group( + + def create_step_key(instance, kv): + """Buildkite key for a snapshot-create step. + + Keys may only contain [A-Za-z0-9_\\-:], so dots in instance names + (m5n.metal) and kernel versions (linux_5.10) are sanitized to + underscores. Tarball paths stay unchanged. + """ + return f"snap-create-{instance}-{kv}".replace(".", "_") + + # Key each snapshot-create step so restore steps can depend on the + # specific source snapshot they need, rather than waiting for every + # snapshot-create step to finish. `build_group` doesn't sanitize + # substituted key values, so we set the final key after it fans out. 
+ x86_create = pipeline.build_group( "snapshot-create", commands, timeout=30, @@ -49,7 +63,7 @@ # https://github.com/firecracker-microvm/firecracker/blob/main/docs/snapshotting/snapshot-support.md#where-can-i-resume-my-snapshots aarch64_platforms = [("al2023", "linux_6.1")] - pipeline.build_group( + aarch64_create = pipeline.build_group( "snapshot-create-aarch64", commands, timeout=30, @@ -57,7 +71,9 @@ instances=instances_aarch64, platforms=aarch64_platforms, ) - pipeline.add_step("wait") + for grp in (x86_create, aarch64_create): + for s in grp["steps"]: + s["key"] = create_step_key(s["agents"]["instance"], s["agents"]["kv"]) # allow-list of what instances can be restored on what other instances (in # addition to itself). aarch64 is restricted to same-instance restores. @@ -111,6 +127,7 @@ "label": f"snapshot-restore-src-{src_instance}-{src_kv}-dst-{dst_instance}-{dst_kv}", "timeout": 30, "agents": {"instance": dst_instance, "kv": dst_kv, "os": dst_os}, + "depends_on": [create_step_key(src_instance, src_kv)], **per_instance, } steps.append(step)