Skip to content

Commit b5bbe61

Browse files
authored
Merge pull request #598 from posit-dev/fix/publish-await-available
fix(ci): absorb GHCR eventual consistency in publish path
2 parents f75a1b5 + 5deba97 commit b5bbe61

9 files changed

Lines changed: 874 additions & 15 deletions

File tree

posit-bakery/posit_bakery/cli/ci.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,10 +294,12 @@ def publish(
294294
"""
295295
# Imports kept local to mirror existing patterns and to avoid bloating
296296
# module load time when this command isn't invoked.
297+
from posit_bakery.error import BakeryToolRuntimeError
297298
from posit_bakery.plugins.builtin.oras.oras import (
298299
OrasIndexCopyWorkflow,
299300
OrasIndexCreateWorkflow,
300301
OrasIndexVerifyWorkflow,
302+
OrasWaitForSourcesWorkflow,
301303
find_oras_bin,
302304
)
303305
from posit_bakery.plugins.registry import get_plugin
@@ -355,6 +357,32 @@ def publish(
355357
key=lambda t: t.push_sort_key,
356358
)
357359

360+
# Pre-flight: wait for every per-platform source digest to be readable
361+
# before we touch them. Those manifests are pushed by digest from separate
362+
# build runners, and eventually consistent registries (no read-after-write
363+
# guarantee) — notably GHCR — can briefly 404 them. Polling
364+
# here turns propagation lag into condition-based waiting and logs exactly
365+
# which digest lagged, rather than failing a downstream phase opaquely.
366+
all_sources = sorted({s for t in targets for s in t.get_merge_sources()})
367+
if all_sources:
368+
log.info(f"Waiting for {len(all_sources)} source digest(s) to be readable before publishing.")
369+
try:
370+
wait = OrasWaitForSourcesWorkflow(
371+
oras_bin=oras_bin,
372+
sources=all_sources,
373+
).run(dry_run=dry_run)
374+
except BakeryToolRuntimeError as e:
375+
# A non-transient registry error (auth, bad reference, ...) while
376+
# probing sources is fatal and won't self-heal — surface it cleanly
377+
# rather than letting it escape as an unhandled traceback.
378+
log.error(f"Failed while waiting for source digests: {e.dump_stderr() or e}")
379+
raise typer.Exit(code=1)
380+
if not wait.success:
381+
log.error(f"Source digests not available: {wait.error}")
382+
raise typer.Exit(code=1)
383+
if wait.ready:
384+
log.info(f"All {len(wait.ready)} source digest(s) readable after {wait.waited_seconds:.0f}s.")
385+
358386
# Phase 1: index create. Failures abort.
359387
temp_refs: dict[str, str] = {}
360388
for t in targets:

posit-bakery/posit_bakery/plugins/builtin/oras/oras.py

Lines changed: 131 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,16 @@
22
import itertools
33
import logging
44
import subprocess
5+
import time
56
from abc import ABC, abstractmethod
67
from pathlib import Path
7-
from typing import Annotated, Self
8+
from typing import Annotated, Callable, Self
89

910
from pydantic import BaseModel, ConfigDict, Field, model_validator
1011

1112
from posit_bakery.error import BakeryToolRuntimeError
1213
from posit_bakery.image.image_target import ImageTarget, Tag
14+
from posit_bakery.retry import RetryPolicy, is_transient_error, retry_on_transient
1315
from posit_bakery.util import find_bin
1416

1517
log = logging.getLogger(__name__)
@@ -200,6 +202,7 @@ class OrasIndexCreateWorkflow(BaseModel):
200202
image_target: Annotated[ImageTarget, Field(description="Target this index represents.")]
201203
annotations: Annotated[dict[str, str], Field(default_factory=dict)]
202204
plain_http: Annotated[bool, Field(default=False)]
205+
retry_policy: Annotated[RetryPolicy, Field(default_factory=RetryPolicy)]
203206

204207
@property
205208
def temp_index_tag(self) -> str:
@@ -209,14 +212,22 @@ def temp_index_tag(self) -> str:
209212
)
210213

211214
def run(self, dry_run: bool = False) -> OrasIndexCreateResult:
215+
# Retry transient registry errors: the per-platform source manifests
216+
# are pushed by digest from separate runners and may not yet be
217+
# readable here due to registry eventual consistency.
218+
cmd = OrasManifestIndexCreate(
219+
oras_bin=self.oras_bin,
220+
sources=self.image_target.get_merge_sources(),
221+
destination=self.temp_index_tag,
222+
annotations=self.annotations,
223+
plain_http=self.plain_http,
224+
)
212225
try:
213-
OrasManifestIndexCreate(
214-
oras_bin=self.oras_bin,
215-
sources=self.image_target.get_merge_sources(),
216-
destination=self.temp_index_tag,
217-
annotations=self.annotations,
218-
plain_http=self.plain_http,
219-
).run(dry_run=dry_run)
226+
retry_on_transient(
227+
lambda: cmd.run(dry_run=dry_run),
228+
policy=self.retry_policy,
229+
description=f"index-create for '{self.image_target.uid}'",
230+
)
220231
return OrasIndexCreateResult(success=True, temp_ref=self.temp_index_tag)
221232
except BakeryToolRuntimeError as e:
222233
log.error(f"oras index-create failed: {e}")
@@ -239,18 +250,26 @@ class OrasIndexCopyWorkflow(BaseModel):
239250
oras_bin: Annotated[str, Field(description="Path to the oras binary.")]
240251
image_target: Annotated[ImageTarget, Field(description="Target whose tags to fan out to.")]
241252
plain_http: Annotated[bool, Field(default=False)]
253+
retry_policy: Annotated[RetryPolicy, Field(default_factory=RetryPolicy)]
242254

243255
def run(self, source: str, dry_run: bool = False) -> OrasIndexCopyResult:
244256
try:
245257
destinations = []
246258
for destination, tags in itertools.groupby(self.image_target.tags, lambda x: x.destination):
247259
combined = destination + ":" + ",".join(t.suffix for t in tags)
248-
OrasCopy(
260+
copy = OrasCopy(
249261
oras_bin=self.oras_bin,
250262
source=source,
251263
destination=combined,
252264
plain_http=self.plain_http,
253-
).run(dry_run=dry_run)
265+
)
266+
# Retry transient registry errors: the temp-registry source
267+
# index may still be propagating when the copy first reads it.
268+
retry_on_transient(
269+
lambda c=copy: c.run(dry_run=dry_run),
270+
policy=self.retry_policy,
271+
description=f"index-copy for '{self.image_target.uid}' -> {combined}",
272+
)
254273
destinations.append(combined)
255274
return OrasIndexCopyResult(success=True, destinations=destinations)
256275
except BakeryToolRuntimeError as e:
@@ -303,6 +322,108 @@ def run(self, dry_run: bool = False) -> OrasIndexVerifyResult:
303322
return OrasIndexVerifyResult(success=False, verified=verified, error=str(e))
304323

305324

325+
class OrasSourcesReadyResult(BaseModel):
    """Outcome of the pre-flight wait for source digests to become readable.

    ``success`` reports whether the wait completed with nothing left pending;
    ``ready`` and ``missing`` split the awaited refs into those that resolved
    and those that did not; ``waited_seconds`` and ``error`` carry the timing
    and the diagnostic text produced when the wait gives up.
    """

    success: Annotated[
        bool,
        Field(description="Whether every source digest became readable before the timeout."),
    ]
    ready: Annotated[
        list[str],
        Field(default_factory=list, description="Source refs confirmed readable, in resolution order."),
    ]
    missing: Annotated[
        list[str],
        Field(default_factory=list, description="Source refs still unreadable when the wait gave up."),
    ]
    waited_seconds: Annotated[
        float,
        Field(default=0.0, description="Wall-clock seconds spent waiting."),
    ]
    error: Annotated[
        str | None,
        Field(default=None, description="Diagnostic message on timeout."),
    ]
337+
338+
339+
class OrasWaitForSourcesWorkflow(BaseModel):
    """Poll source digests until they are all readable from the registry.

    Per-platform manifests are pushed *by digest* from separate build runners,
    and eventually consistent registries (ones without a read-after-write
    guarantee) — notably GHCR — may briefly 404 those digests when the publish
    runner first asks for them. This pre-flight turns "hope it has propagated"
    into condition-based waiting: each source is probed with ``oras manifest
    fetch --descriptor`` (a lightweight existence check) and removed from the
    pending set once it resolves. The wait succeeds as soon as every source
    resolves, and fails (reporting exactly which digests are still missing)
    once ``timeout`` elapses.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    oras_bin: Annotated[str, Field(description="Path to the oras binary.")]
    sources: Annotated[list[str], Field(description="Source refs (registry refs, typically by-digest) to await.")]
    timeout: Annotated[float, Field(default=600.0, description="Maximum seconds to wait for all sources (10 min).")]
    poll_interval: Annotated[float, Field(default=5.0, description="Seconds between polling sweeps.")]
    plain_http: Annotated[bool, Field(default=False)]

    def _is_available(self, ref: str) -> bool:
        """Return True if ``ref`` resolves, False if the probe failed transiently.

        :param ref: Registry reference to probe.
        :raises BakeryToolRuntimeError: When the probe fails with an error that
            ``is_transient_error`` does not classify as transient.
        """
        try:
            OrasManifestFetch(
                oras_bin=self.oras_bin,
                reference=ref,
                descriptor=True,
                plain_http=self.plain_http,
            ).run(dry_run=False)
            return True
        except BakeryToolRuntimeError as e:
            if is_transient_error(e):
                # Name the lagging ref so a wait that eventually succeeds still
                # leaves a record of which digest was slow to propagate.
                log.debug(f"Source digest not yet readable: {ref}")
                return False
            raise

    def run(
        self,
        dry_run: bool = False,
        *,
        sleep: Callable[[float], None] = time.sleep,
        now: Callable[[], float] = time.monotonic,
    ) -> OrasSourcesReadyResult:
        """Probe each source until all resolve or ``timeout`` elapses.

        Sources are de-duplicated (first occurrence wins) and probed in sweeps.
        A source that resolves is never probed again. The pause between sweeps
        is capped at the time remaining, so the final sweep happens at the
        deadline rather than up to one ``poll_interval`` past it.

        :param dry_run: When True, report success without contacting the
            registry (nothing has been pushed to wait on).
        :param sleep: Sleep function, injectable for testing.
        :param now: Monotonic clock, injectable for testing.
        :return: A result with ``success=True`` once every source resolved, or
            ``success=False`` listing the still-missing refs after the timeout.
        :raises BakeryToolRuntimeError: When a probe fails non-transiently.
        """
        unique_sources = list(dict.fromkeys(self.sources))
        if dry_run or not unique_sources:
            return OrasSourcesReadyResult(success=True, ready=unique_sources)

        start = now()
        ready: list[str] = []
        pending = list(unique_sources)
        while True:
            still_pending: list[str] = []
            for ref in pending:
                if self._is_available(ref):
                    ready.append(ref)
                else:
                    still_pending.append(ref)
            pending = still_pending

            # One clock read per sweep, shared by the success and timeout paths.
            elapsed = now() - start
            if not pending:
                return OrasSourcesReadyResult(success=True, ready=ready, waited_seconds=elapsed)

            if elapsed >= self.timeout:
                return OrasSourcesReadyResult(
                    success=False,
                    ready=ready,
                    missing=pending,
                    waited_seconds=elapsed,
                    error=(
                        f"{len(pending)} source digest(s) still unreadable after {elapsed:.0f}s "
                        f"(timeout {self.timeout:.0f}s): {', '.join(pending)}"
                    ),
                )

            # Never sleep past the deadline: ``timeout`` is documented as the
            # maximum wait, so the last pause is shortened to what remains.
            delay = min(self.poll_interval, self.timeout - elapsed)
            log.info(
                f"Waiting on {len(pending)} source digest(s) to become readable "
                f"({elapsed:.0f}s/{self.timeout:.0f}s elapsed); retrying in {delay:.0f}s."
            )
            sleep(delay)
425+
426+
306427
class OrasMergeWorkflowResult(BaseModel):
307428
"""Result of an ORAS merge workflow execution."""
308429

posit-bakery/posit_bakery/plugins/builtin/soci/soci.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from posit_bakery.image.image_target import ImageTarget
1616
from posit_bakery.plugins.builtin.oras.oras import OrasCopy
1717
from posit_bakery.plugins.builtin.soci.options import SociOptions
18+
from posit_bakery.retry import RetryPolicy, retry_on_transient
1819
from posit_bakery.util import find_bin
1920

2021
log = logging.getLogger(__name__)
@@ -134,6 +135,7 @@ class SociConvertWorkflow(BaseModel):
134135
image_target: Annotated[ImageTarget, Field(description="The image target.")]
135136
options: Annotated[SociOptions, Field(description="Per-target SOCI configuration.")]
136137
source_ref: Annotated[str, Field(description="Temp-registry ref to convert from.")]
138+
retry_policy: Annotated[RetryPolicy, Field(default_factory=RetryPolicy)]
137139

138140
@property
139141
def destination_ref(self) -> str:
@@ -183,13 +185,20 @@ def run(self, dry_run: bool = False) -> SociConvertWorkflowResult:
183185
out_layout = scratch / "out"
184186
try:
185187
# 1. registry -> OCI layout. The layout tag is arbitrary; soci
186-
# reads the whole layout.
187-
OrasCopy(
188+
# reads the whole layout. Retry transient registry errors: the
189+
# temp-registry index/children may still be propagating when
190+
# this pull first reads them (registry eventual consistency).
191+
pull = OrasCopy(
188192
oras_bin=self.oras_bin,
189193
source=self.source_ref,
190194
destination=f"{src_layout}:image",
191195
to_oci_layout=True,
192-
).run(dry_run=dry_run)
196+
)
197+
retry_on_transient(
198+
lambda: pull.run(dry_run=dry_run),
199+
policy=self.retry_policy,
200+
description=f"soci pull for '{self.image_target.uid}'",
201+
)
193202

194203
# 2. convert local layout -> local layout (directory so we can read
195204
# the resulting index.json for its digest).

0 commit comments

Comments
 (0)