Skip to content

Commit c3bf4b6

Browse files
authored
Merge pull request #1582 from PolicyEngine/codex/data-build-fingerprint
Add data build metadata helpers
2 parents 352af38 + fa12880 commit c3bf4b6

3 files changed

Lines changed: 126 additions & 0 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Expose build metadata helpers for UK data artifacts, including a stable data-build fingerprint and build provenance metadata.

policyengine_uk/build_metadata.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
from __future__ import annotations
2+
3+
from functools import lru_cache
4+
import hashlib
5+
from importlib import metadata
6+
from pathlib import Path
7+
import subprocess
8+
9+
# Distribution name passed to importlib.metadata for the version lookup.
PACKAGE_NAME = "policyengine-uk"

# Directory containing this module; surface entries are relative to it.
PACKAGE_ROOT = Path(__file__).resolve().parents[0]

# Surface entries without a file suffix (presumably directories, which
# _iter_surface_files walks recursively).
_SURFACE_TREES = ("data", "parameters", "variables")

# Surface entries that name a single file under the package root.
_SURFACE_FILES = (
    "entities.py",
    "microsimulation.py",
    "simulation.py",
    "system.py",
    "tax_benefit_system.py",
    "programs.yaml",
)

# Package-relative paths whose contents feed the data-build fingerprint.
# The order matters: files are hashed in this sequence.
DATA_BUILD_SURFACE = _SURFACE_TREES + _SURFACE_FILES
22+
23+
24+
def _iter_surface_files() -> list[Path]:
    """Collect every file that belongs to the data-build surface.

    Each entry of ``DATA_BUILD_SURFACE`` is resolved against
    ``PACKAGE_ROOT``. A regular file is taken as is; a directory is walked
    recursively, skipping anything below a ``__pycache__`` directory and any
    ``.pyc``/``.pyo`` file. Entries that are neither a file nor a directory
    are ignored.

    Returns:
        Paths in surface order. Files discovered under a directory are
        ordered by their case-sensitive path components.
    """
    bytecode_suffixes = {".pyc", ".pyo"}
    files: list[Path] = []
    for relative_path in DATA_BUILD_SURFACE:
        path = PACKAGE_ROOT / relative_path
        if path.is_file():
            files.append(path)
        elif path.is_dir():
            # Sort on the raw path components rather than on the Path
            # objects: Path comparison is case-insensitive on Windows, so
            # the same tree would be enumerated (and hashed) in a different
            # order there than on POSIX. On POSIX this key yields the same
            # order as comparing the paths directly.
            files.extend(
                sorted(
                    (
                        child
                        for child in path.rglob("*")
                        if child.is_file()
                        and "__pycache__" not in child.parts
                        and child.suffix not in bytecode_suffixes
                    ),
                    key=lambda child: child.parts,
                )
            )
    return files
40+
41+
42+
def _get_package_version() -> str | None:
43+
try:
44+
return metadata.version(PACKAGE_NAME)
45+
except metadata.PackageNotFoundError:
46+
return None
47+
48+
49+
def _get_git_sha() -> str | None:
50+
for candidate in (PACKAGE_ROOT, *PACKAGE_ROOT.parents):
51+
if not (candidate / ".git").exists():
52+
continue
53+
try:
54+
return subprocess.check_output(
55+
["git", "-C", str(candidate), "rev-parse", "HEAD"],
56+
stderr=subprocess.DEVNULL,
57+
text=True,
58+
).strip()
59+
except Exception:
60+
return None
61+
return None
62+
63+
64+
@lru_cache(maxsize=1)
def get_data_build_fingerprint() -> str:
    """Hash the data-build surface into a single SHA-256 fingerprint.

    For every file returned by ``_iter_surface_files()`` the digest is fed
    the POSIX-style path relative to ``PACKAGE_ROOT`` (UTF-8 encoded), a NUL
    byte, the file contents and another NUL byte, in that order.

    The result is memoised with ``lru_cache``; call
    ``get_data_build_fingerprint.cache_clear()`` to force a recomputation.

    Returns:
        The hex digest prefixed with ``"sha256:"``.
    """
    chunk_size = 1 << 20  # 1 MiB per read keeps memory use bounded.
    digest = hashlib.sha256()
    for file_path in _iter_surface_files():
        relative_path = file_path.relative_to(PACKAGE_ROOT).as_posix()
        digest.update(relative_path.encode("utf-8"))
        digest.update(b"\0")
        # Stream the contents instead of loading the whole file at once.
        # Feeding the hash chunk by chunk gives the same digest as a single
        # update with the full contents.
        with file_path.open("rb") as stream:
            while chunk := stream.read(chunk_size):
                digest.update(chunk)
        digest.update(b"\0")
    return f"sha256:{digest.hexdigest()}"
74+
75+
76+
def get_data_build_metadata() -> dict[str, str | None]:
77+
return {
78+
"name": PACKAGE_NAME,
79+
"version": _get_package_version(),
80+
"git_sha": _get_git_sha(),
81+
"data_build_fingerprint": get_data_build_fingerprint(),
82+
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from unittest.mock import patch
2+
3+
from policyengine_uk.build_metadata import (
4+
get_data_build_fingerprint,
5+
get_data_build_metadata,
6+
)
7+
8+
9+
def test_data_build_fingerprint_is_stable_within_process():
    """Two consecutive calls return the same ``sha256:``-prefixed value."""
    # Start from a cold cache so the first call really computes the hash.
    get_data_build_fingerprint.cache_clear()

    fingerprints = [get_data_build_fingerprint() for _ in range(2)]

    assert fingerprints[0].startswith("sha256:")
    assert fingerprints[0] == fingerprints[1]
17+
18+
19+
def test_get_data_build_metadata_includes_version_git_sha_and_fingerprint():
    """The metadata dict carries the name plus the three helper results."""
    get_data_build_fingerprint.cache_clear()

    expected = {
        "name": "policyengine-uk",
        "version": "2.74.0",
        "git_sha": "deadbeef",
        "data_build_fingerprint": "sha256:fingerprint",
    }

    # Replace each helper on the module so the result is deterministic.
    with patch(
        "policyengine_uk.build_metadata._get_package_version",
        return_value="2.74.0",
    ):
        with patch(
            "policyengine_uk.build_metadata._get_git_sha",
            return_value="deadbeef",
        ):
            with patch(
                "policyengine_uk.build_metadata.get_data_build_fingerprint",
                return_value="sha256:fingerprint",
            ):
                observed = get_data_build_metadata()

    assert observed == expected

0 commit comments

Comments
 (0)