Skip to content

Commit 37d2b74

Browse files
committed
ci(snapshot): restore per-source depends_on instead of global wait
Previously every restore step waited for the entire snapshot-create group to finish via a pipeline-wide wait step. Each restore only needs its own source snapshot, so key each create step by instance/kv and make each restore step declare `depends_on` for the specific source snapshot it consumes. Restores now start as soon as their source snapshot is ready. Signed-off-by: Jack Thomson <jackabt@amazon.com>
1 parent 7690e9f commit 37d2b74

1 file changed

Lines changed: 20 additions & 3 deletions

File tree

.buildkite/pipeline_cross.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,21 @@
3838
"mkdir -pv snapshots",
3939
"tar cSvf snapshots/{instance}_{kv}.tar snapshot_artifacts",
4040
]
41-
pipeline.build_group(
41+
42+
def create_step_key(instance, kv):
    """Return the Buildkite step key for a snapshot-create step.

    Buildkite keys may only contain alphanumerics, underscores, dashes and
    colons, so every other character -- notably the dots in instance names
    (m5n.metal) and kernel versions (linux_5.10) -- is replaced with an
    underscore. Tarball paths stay unchanged; only the key is sanitized.

    Args:
        instance: Instance type name, e.g. "m5n.metal".
        kv: Kernel version label, e.g. "linux_5.10".

    Returns:
        A key of the form "snap-create-<instance>-<kv>" in which every
        character outside [A-Za-z0-9_:-] has been replaced by "_".
    """
    # Imported locally so this helper does not depend on the module's
    # import block.
    import re

    # Sanitize against the full allowed set rather than dots only, so an
    # unexpected character in either input cannot yield an invalid key.
    return re.sub(r"[^A-Za-z0-9_:-]", "_", f"snap-create-{instance}-{kv}")
50+
51+
# Key each snapshot-create step so restore steps can depend on the
52+
# specific source snapshot they need, rather than waiting for every
53+
# snapshot-create step to finish. `build_group` doesn't sanitize
54+
# substituted key values, so we set the final key after it fans out.
55+
x86_create = pipeline.build_group(
4256
"snapshot-create",
4357
commands,
4458
timeout=30,
@@ -49,15 +63,17 @@
4963

5064
# https://github.com/firecracker-microvm/firecracker/blob/main/docs/snapshotting/snapshot-support.md#where-can-i-resume-my-snapshots
5165
aarch64_platforms = [("al2023", "linux_6.1")]
52-
pipeline.build_group(
66+
aarch64_create = pipeline.build_group(
5367
"snapshot-create-aarch64",
5468
commands,
5569
timeout=30,
5670
artifact_paths="snapshots/**/*",
5771
instances=instances_aarch64,
5872
platforms=aarch64_platforms,
5973
)
60-
pipeline.add_step("wait")
74+
for grp in (x86_create, aarch64_create):
75+
for s in grp["steps"]:
76+
s["key"] = create_step_key(s["agents"]["instance"], s["agents"]["kv"])
6177

6278
# allow-list of what instances can be restored on what other instances (in
6379
# addition to itself). aarch64 is restricted to same-instance restores.
@@ -111,6 +127,7 @@
111127
"label": f"snapshot-restore-src-{src_instance}-{src_kv}-dst-{dst_instance}-{dst_kv}",
112128
"timeout": 30,
113129
"agents": {"instance": dst_instance, "kv": dst_kv, "os": dst_os},
130+
"depends_on": [create_step_key(src_instance, src_kv)],
114131
**per_instance,
115132
}
116133
steps.append(step)

0 commit comments

Comments
 (0)