Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions .builders/inputs_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
"""Content-hash the inputs that determine each builder container image.

The `Resolve Dependencies and Build Wheels` workflow uses these hashes to
decide whether to rebuild a builder image from scratch, or pull the existing
one by digest from .deps/image_digests.json. The pinned hashes live in
.deps/builder_inputs.toml and are rewritten by .builders/upload.py whenever
dependency resolution publishes new artifacts.

A "target" is one of the builder image names we maintain — one per
(OS, CPU architecture) pair that the Agent ships Python wheels for.
The names match the subdirectories of .builders/images/, for example:
`linux-x86_64`, `linux-aarch64`, `windows-x86_64`.
"""
from __future__ import annotations

import argparse
import sys
import tomllib
from collections.abc import Iterator
from hashlib import sha256
from pathlib import Path

# Directory containing this script; every hashed path is taken relative to it.
HERE = Path(__file__).parent
# TOML file, one level above HERE, that holds the pinned per-target hashes.
PINNED_FILE = HERE.parent / '.deps' / 'builder_inputs.toml'

# Files and directories whose contents determine a builder image. A change to
# any of these should force a rebuild. Paths are relative to .builders/ and
# are shared across all targets; per-target inputs live under images/<target>/.
# If you add a new input under .builders/ that affects image contents, add it here.
COMMON_INPUTS = [
    'build.py',
    'deps/build_dependencies.txt',
    'scripts',
    'patches',
    'images/helpers.ps1',
    'images/install-from-source.sh',
    'images/runner_dependencies.txt',
]


def _iter_files(root: Path) -> Iterator[Path]:
    """Yield the files that `root` contributes to a hash.

    A `root` that is a regular file is yielded as-is, without consulting the
    ignore rules. A `root` that is a directory is walked recursively and every
    file is yielded unless one of its path components (relative to `root`) is
    ignored. Anything else (e.g. a missing path) yields nothing.
    """
    if root.is_file():
        yield root
        return
    if not root.is_dir():
        return

    for candidate in root.rglob('*'):
        if not candidate.is_file():
            continue
        # Check every component below `root`, so files nested inside an
        # ignored directory are skipped as well.
        if any(map(_is_ignored, candidate.relative_to(root).parts)):
            continue
        yield candidate


def _is_ignored(name: str) -> bool:
    """Return True if a path component named `name` is excluded from hashing.

    Dot-prefixed names and `__pycache__` are excluded; `.gitkeep` is the one
    dot-prefixed name that is still kept.
    """
    return name != '.gitkeep' and (name == '__pycache__' or name.startswith('.'))


def compute(target: str) -> str:
    """Return the hex sha256 of the working-tree inputs for `target`.

    The input set is every file matched by COMMON_INPUTS plus every file under
    images/<target>/. For each file, in sorted order, the digest is fed the
    relative POSIX path, a NUL byte, the file's bytes, and another NUL byte.

    Raises:
        FileNotFoundError: if images/<target>/ is not a directory.
    """
    target_dir = HERE / 'images' / target
    if not target_dir.is_dir():
        raise FileNotFoundError(f'Unknown builder target: {target} (expected {target_dir})')

    inputs: set[Path] = set()
    for entry in COMMON_INPUTS:
        matched = set(_iter_files(HERE / entry))
        if not matched:
            print(f'warning: {entry} matched no files under {HERE}', file=sys.stderr)
        inputs |= matched
    inputs |= set(_iter_files(target_dir))

    # Key each file by its relative POSIX path string and order by that string,
    # not by Path objects: WindowsPath sorting is case-insensitive and uses
    # backslashes, which would give a different order (and so a different
    # hash) than on POSIX systems for the same input set.
    by_name = {item.relative_to(HERE).as_posix(): item for item in inputs}

    hasher = sha256()
    for name in sorted(by_name):
        hasher.update(name.encode('utf-8') + b'\0')
        hasher.update(by_name[name].read_bytes())
        hasher.update(b'\0')
    return hasher.hexdigest()


def pinned(target: str) -> str:
"""Return the hash pinned for `target` in builder_inputs.toml, or empty if absent."""
if not PINNED_FILE.is_file():
print(f'{PINNED_FILE} not found; treating as unpinned', file=sys.stderr)
return ''
try:
with PINNED_FILE.open('rb') as f:
data = tomllib.load(f)
except tomllib.TOMLDecodeError as e:
raise RuntimeError(f'{PINNED_FILE} is malformed (it should not be edited by hand): {e}') from e
inputs = data.get('inputs', {})
if target not in inputs:
print(f'{PINNED_FILE}: no entry for {target}; treating as unpinned', file=sys.stderr)
return inputs.get(target, '')


def main() -> None:
    """Parse the command line and write the requested hash to stdout.

    Two subcommands are registered, `compute` and `pinned`; each takes a single
    positional `target` and writes the result of the matching function to
    stdout with no trailing newline.
    """
    raw_formatter = argparse.RawDescriptionHelpFormatter

    parser = argparse.ArgumentParser(
        description=(
            'Gate rebuilds of the builder container images. The resolve-build-deps '
            'workflow compares the working-tree hash against the pinned hash to '
            'decide whether to rebuild from scratch or pull the existing image.'
        ),
        epilog=(
            'A "target" is a builder image name matching a subdirectory of '
            '.builders/images/ (e.g. linux-x86_64, linux-aarch64, windows-x86_64).\n\n'
            'Examples:\n'
            '  python .builders/inputs_hash.py compute linux-x86_64\n'
            '  python .builders/inputs_hash.py pinned linux-x86_64\n'
            '  diff <(… compute linux-x86_64) <(… pinned linux-x86_64)'
        ),
        formatter_class=raw_formatter,
    )
    subparsers = parser.add_subparsers(dest='command', required=True)

    # One row per subcommand: (name, handler, help, description, epilog).
    # Registration order matches the order shown in --help.
    commands = (
        (
            'compute',
            compute,
            'Hash the working-tree inputs for a target.',
            (
                'Answers "would the current tree produce a different image than '
                'the one we have pinned?". Run this against the checked-out tree '
                'and compare to `pinned` — a mismatch means the image needs a rebuild.'
            ),
            (
                'Examples:\n'
                '  python .builders/inputs_hash.py compute linux-x86_64\n'
                '  python .builders/inputs_hash.py compute windows-x86_64'
            ),
        ),
        (
            'pinned',
            pinned,
            'Print the hash pinned for a target in .deps/builder_inputs.toml.',
            (
                'Answers "which inputs produced the image we are pulling today?". '
                'Returns an empty string when the file or entry is missing, so a '
                'naive string compare with `compute` correctly flags first-run and '
                'never-built targets as needing a rebuild.'
            ),
            (
                'Examples:\n'
                '  python .builders/inputs_hash.py pinned linux-x86_64\n'
                '  python .builders/inputs_hash.py pinned windows-x86_64'
            ),
        ),
    )

    handlers = {}
    for name, handler, help_text, description, epilog in commands:
        subparser = subparsers.add_parser(
            name,
            help=help_text,
            description=description,
            epilog=epilog,
            formatter_class=raw_formatter,
        )
        subparser.add_argument('target', help='Builder image name (e.g. linux-x86_64).')
        handlers[name] = handler

    args = parser.parse_args()
    # The subcommand is required, so args.command is always a registered name.
    sys.stdout.write(handlers[args.command](args.target))


if __name__ == '__main__':
    main()
34 changes: 32 additions & 2 deletions .builders/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,21 +288,51 @@ def generate_lockfiles(targets_dir, lockfiles):
f.write(f'{contents}\n')

image_digests = {}
builder_inputs = {}
for target_name, lockfile_lines in lockfiles.items():
# The lockfiles contain the major.minor Python version
# so that the Agent can transition safely
lock_file = LOCK_FILE_DIR / f'{target_name}_{CURRENT_PYTHON_VERSION}.txt'
lock_file.write_text('\n'.join(lockfile_lines), encoding='utf-8')

# these `image_digest` files are generated in the 'Save new image digest'
# step of the github workflow
# The `image_digest` and `inputs_sha256` files are written by the
# 'Save new image digest' / 'Persist current image digest' steps of
# the github workflow; macOS targets don't produce them.
if (image_digest_file := targets_dir / target_name / 'image_digest').is_file():
image_digests[target_name] = image_digest_file.read_text(encoding='utf-8').strip()
if (inputs_hash_file := targets_dir / target_name / 'inputs_sha256').is_file():
builder_inputs[target_name] = inputs_hash_file.read_text(encoding='utf-8').strip()

with RESOLUTION_DIR.joinpath('image_digests.json').open('w', encoding='utf-8') as f:
contents = json.dumps(image_digests, indent=2, sort_keys=True)
f.write(f'{contents}\n')

_write_builder_inputs(RESOLUTION_DIR / 'builder_inputs.toml', builder_inputs)


_BUILDER_INPUTS_HEADER = """\
# Content hashes of the inputs that determine each builder image.
#
# The `Resolve Dependencies and Build Wheels` workflow compares these
# hashes against hashes computed from the working tree (via
# .builders/inputs_hash.py) to decide whether to rebuild a builder image
# from scratch or pull the existing one by digest from image_digests.json.
#
# This file is rewritten by .builders/upload.py whenever dependency
# resolution publishes new artifacts and should not be edited by hand.
# The set of files covered by each hash is defined by COMMON_INPUTS in
# .builders/inputs_hash.py plus everything under .builders/images/<target>/.

[inputs]
"""

# Characters TOML permits in an unquoted ("bare") key.
_TOML_BARE_KEY_CHARS = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-')


def _toml_basic_string(text: str) -> str:
    """Return `text` as a double-quoted TOML basic string with escapes applied."""
    pieces = []
    for char in text:
        if char in '"\\':
            pieces.append('\\' + char)
        elif ord(char) < 0x20 or ord(char) == 0x7F:
            # Control characters may not appear raw in a basic string.
            pieces.append(f'\\u{ord(char):04X}')
        else:
            pieces.append(char)
    return '"' + ''.join(pieces) + '"'


def _toml_key(name: str) -> str:
    """Return `name` as a TOML key, quoting it only when it is not a valid bare key."""
    if name and all(char in _TOML_BARE_KEY_CHARS for char in name):
        return name
    return _toml_basic_string(name)


def _write_builder_inputs(path: Path, hashes: dict[str, str]) -> None:
    """Write `hashes` to `path` as the `[inputs]` table of builder_inputs.toml.

    The file starts with the explanatory header and lists one
    `target = "hash"` line per entry, sorted by target name. Keys that are not
    valid bare TOML keys (for example, names containing a dot) are quoted, and
    values are escaped, so each target round-trips as a single flat key.

    Args:
        path: Destination file; overwritten if it exists.
        hashes: Mapping of target name to its inputs hash.
    """
    lines = [_BUILDER_INPUTS_HEADER.rstrip('\n')]
    lines.extend(f'{_toml_key(target)} = {_toml_basic_string(hashes[target])}' for target in sorted(hashes))
    path.write_text('\n'.join(lines) + '\n', encoding='utf-8')


def upload(targets_dir: Path, bucket: Bucket | None = None) -> dict[str, list[str]]:
bucket = bucket or Bucket(BUCKET_NAME)
Expand Down
16 changes: 16 additions & 0 deletions .deps/builder_inputs.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Content hashes of the inputs that determine each builder image.
#
# The `Resolve Dependencies and Build Wheels` workflow compares these
# hashes against hashes computed from the working tree (via
# .builders/inputs_hash.py) to decide whether to rebuild a builder image
# from scratch or pull the existing one by digest from image_digests.json.
#
# This file is rewritten by .builders/upload.py whenever dependency
# resolution publishes new artifacts and should not be edited by hand.
# The set of files covered by each hash is defined by COMMON_INPUTS in
# .builders/inputs_hash.py plus everything under .builders/images/<target>/.

[inputs]
linux-aarch64 = "e48bf769667a4f30addf317d2b889aea045e025b981276a5cbfdf5b53ae86ca8"
linux-x86_64 = "5e769b5c5678a0c578bbaf2289a8359ade8f3420de2a7f7ed3c09217866014c7"
windows-x86_64 = "beba3774e0929bc4f9377cabcde2f797a83309d88be329154b3e099961c95884"
2 changes: 1 addition & 1 deletion .deps/image_digests.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"linux-aarch64": "sha256:223fa103a2f1b67c98d2c15a250a9eec39b6b1768011096260f6e12cef1870c4",
"linux-x86_64": "sha256:7dca4137ffe24807e2e5ffafb4d50dbd9a201534c1c5e072618075d378e9b975",
"windows-x86_64": "sha256:ad7d49d4cef858824a652547291553fa99d220992024421e0618a59e87f96718"
"windows-x86_64": "sha256:ee9772fc71d78c232f6315e9609cf78a78213200ca24bcc55040f18d8e63313f"
}
Loading
Loading