Skip to content

Commit fb3b987

Browse files
authored
Add HF destination guard constants and xfail AST detection test (#356)
* Add HF destination guard constants and xfail AST detection test * Apply ruff format
1 parent f06bc50 commit fb3b987

3 files changed

Lines changed: 234 additions & 0 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add `policyengine_uk_data.utils.hf_destinations` with `PRIVATE_REPO` / `PUBLIC_REPO` constants and an AST-based xfail test (`tests/test_hf_destinations.py`) that flags every `upload(...)`, `upload_file(...)`, `upload_files_to_hf(...)`, and `upload_data_files(...)` call site that still bypasses the shared constants.
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
"""AST-based guard that every HF upload call routes through the shared
2+
`PRIVATE_REPO` / `PUBLIC_REPO` constants in
3+
:mod:`policyengine_uk_data.utils.hf_destinations`.
4+
5+
Motivation (bug-hunt finding S1):
6+
7+
- ``storage/upload_private_prerequisites.py`` uploads UKDS-licensed FRS/LCFS/
8+
WAS/ETB/SPI zips with a literal ``repo="policyengine/policyengine-uk-data"``
9+
argument — i.e. the PUBLIC repo.
10+
- ``utils/data_upload.py::upload_data_files`` defaults ``hf_repo_name`` to the
11+
PUBLIC repo, while the sibling ``upload_files_to_hf`` defaults to the
12+
PRIVATE repo.
13+
- Mixed literals across the codebase mean one typo in a future script could
14+
silently leak microdata.
15+
16+
Approach:
17+
18+
- Parse every ``.py`` file in this package with :mod:`ast`.
19+
- For every call to one of the upload entry points (``upload``,
20+
``upload_file``, ``upload_files_to_hf``, ``upload_data_files``), look for
21+
the ``repo=`` / ``hf_repo_name=`` / ``repo_id=`` keyword argument.
22+
- If the argument is a raw string literal, or any other expression that is
23+
not a reference to ``hf_destinations.PRIVATE_REPO`` / ``PUBLIC_REPO``
24+
(the module-level ``ALLOWED_REPOS`` set is not itself accepted), record it as a violation.
25+
26+
The test is marked ``xfail`` until every call site is migrated. It will
27+
begin failing (i.e. "pass unexpectedly") as a signal that the clean-up is
28+
complete, at which point the ``xfail`` decorator should be removed.
29+
30+
**Do NOT silence this test by changing the destinations in place.** The
31+
existing destinations are preserved by repo policy (see CLAUDE.md rule 1
32+
and the policyengine-uk-data private/public split). Resolving the naming
33+
inconsistency requires a data-controller decision — either rename the HF
34+
repos or migrate each script individually with sign-off — not a blanket
35+
string swap in this PR.
36+
"""
37+
38+
from __future__ import annotations
39+
40+
import ast
41+
from pathlib import Path
42+
43+
import pytest
44+
45+
46+
# Unqualified names of the calls that the scan treats as HuggingFace upload
# entry points.
UPLOAD_CALL_NAMES: set[str] = {
    "upload_data_files",
    "upload_files_to_hf",
    "upload_file",
    "upload",
}
52+
53+
# Keyword-argument names inspected on each upload call for the destination
# repo.
KEYWORD_ARGS: set[str] = {"repo_id", "hf_repo_name", "repo"}
54+
55+
# File names (matched on the base name only) that the scan skips:
# - hf_destinations.py: the constants module, which defines the literals.
# - test_hf_destinations.py: the test module that validates those literals
#   and hosts the xfail guard.
IGNORED_FILENAMES: set[str] = {
    "test_hf_destinations.py",
    "hf_destinations.py",
}
63+
64+
65+
def _iter_py_files() -> list[Path]:
    """Return the ``.py`` files to scan, in sorted (deterministic) order.

    The search root is the parent of the directory that contains this file.
    Files whose base name is in ``IGNORED_FILENAMES`` and anything located
    under a ``__pycache__`` directory are excluded.

    Returns:
        A sorted list of paths, so that the violation report is stable across
        runs and platforms (``Path.rglob`` yields entries in
        filesystem-dependent order).
    """
    root = Path(__file__).resolve().parent.parent
    return sorted(
        path
        for path in root.rglob("*.py")
        if path.name not in IGNORED_FILENAMES and "__pycache__" not in path.parts
    )
75+
76+
77+
def _call_name(call: ast.Call) -> str | None:
    """Extract the unqualified name of the function being called.

    A bare call such as ``upload(...)`` yields ``"upload"``; an attribute
    call such as ``hf.upload(...)`` yields the final attribute name. Any
    other call target (for example the result of another call) yields
    ``None``.
    """
    target = call.func
    if isinstance(target, ast.Attribute):
        return target.attr
    return target.id if isinstance(target, ast.Name) else None
88+
89+
90+
def _kwarg_value(call: ast.Call, name: str) -> ast.AST | None:
    """Return the value node of keyword argument ``name``, or ``None``.

    Only explicitly named keywords are matched; ``None`` is returned when the
    call has no keyword with that name.
    """
    matches = (keyword.value for keyword in call.keywords if keyword.arg == name)
    return next(matches, None)
95+
96+
97+
def _is_allowed_reference(node: ast.AST) -> bool:
    """Tell whether ``node`` names one of the shared destination constants.

    Accepts both an attribute access ending in ``PRIVATE_REPO`` /
    ``PUBLIC_REPO`` (e.g. ``hf_destinations.PRIVATE_REPO``) and a bare name
    (e.g. after ``from ...hf_destinations import PRIVATE_REPO``). Every other
    kind of expression is rejected.
    """
    if isinstance(node, ast.Attribute):
        identifier = node.attr
    elif isinstance(node, ast.Name):
        identifier = node.id
    else:
        return False
    return identifier in {"PRIVATE_REPO", "PUBLIC_REPO"}
108+
109+
110+
def _collect_violations() -> list[str]:
    """Scan the package for upload calls that bypass the shared constants.

    Every file returned by ``_iter_py_files`` is parsed with :mod:`ast`. For
    each call whose simple name is in ``UPLOAD_CALL_NAMES``, every keyword
    listed in ``KEYWORD_ARGS`` is inspected:

    - a raw string literal is always reported, together with its value;
    - any other expression is reported unless ``_is_allowed_reference``
      accepts it.

    Only keyword arguments are inspected: a destination passed positionally,
    or left to the callee's default, is not seen by this scan.

    Returns:
        One human-readable ``path:line call(...)`` entry per violation.
    """
    violations: list[str] = []
    for path in _iter_py_files():
        try:
            tree = ast.parse(path.read_text(encoding="utf-8"), filename=str(path))
        except (SyntaxError, ValueError):
            # Don't let a single unparsable file abort the scan. ValueError
            # also covers UnicodeDecodeError (file is not valid UTF-8) and the
            # "null bytes" error raised by older ast.parse implementations.
            continue
        for node in ast.walk(tree):
            if not isinstance(node, ast.Call):
                continue
            name = _call_name(node)
            if name not in UPLOAD_CALL_NAMES:
                continue
            # Sorted so that a call carrying several destination keywords is
            # always reported in the same order (set iteration order is not
            # stable across interpreter runs).
            for kwarg in sorted(KEYWORD_ARGS):
                value = _kwarg_value(node, kwarg)
                if value is None:
                    continue
                if isinstance(value, ast.Constant) and isinstance(value.value, str):
                    # A raw string literal — flag it.
                    violations.append(
                        f"{path}:{node.lineno} "
                        f"{name}(..., {kwarg}={value.value!r}, ...)"
                    )
                elif not _is_allowed_reference(value):
                    # Any other expression (variable, call, f-string, etc.)
                    # that isn't demonstrably the shared constant.
                    violations.append(
                        f"{path}:{node.lineno} {name}(..., {kwarg}=<expr>)"
                    )
    return violations
141+
142+
143+
@pytest.mark.xfail(
    reason=(
        "Known naming inconsistency; existing destinations intentionally "
        "preserved per repo policy. Resolve by renaming the HF repo or "
        "migrating scripts — NOT by changing code in place without owner "
        "approval."
    ),
    # strict=True turns an unexpected pass into a test failure, which is the
    # signal that every call site has been migrated and this decorator should
    # be removed. With strict=False an unexpected pass is only reported as
    # XPASS and never fails the run.
    strict=True,
)
def test_every_hf_upload_routes_through_guard_constants() -> None:
    """Fail with a formatted report if any upload call bypasses the constants.

    While violations are still present the failure is expected (xfail). Once
    the scan comes back empty the test passes, which the strict xfail marker
    reports as a failure.
    """
    violations = _collect_violations()
    if violations:
        formatted = "\n ".join(violations)
        pytest.fail(
            "The following upload-site call arguments bypass the shared "
            "PRIVATE_REPO / PUBLIC_REPO constants in "
            "policyengine_uk_data.utils.hf_destinations:\n " + formatted
        )
161+
162+
163+
def test_hf_destinations_constants_are_distinct_and_well_formed() -> None:
    """Check the destination constants hold the expected, distinct repo ids."""
    from policyengine_uk_data.utils import hf_destinations as destinations

    private_repo = destinations.PRIVATE_REPO
    public_repo = destinations.PUBLIC_REPO
    allowed_repos = destinations.ALLOWED_REPOS

    assert private_repo != public_repo
    assert private_repo == "policyengine/policyengine-uk-data-private"
    assert public_repo == "policyengine/policyengine-uk-data"
    assert allowed_repos == {private_repo, public_repo}
    # Every allowed destination must live under the policyengine organisation.
    for repo in allowed_repos:
        assert repo.startswith("policyengine/"), repo
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""HuggingFace destination constants for policyengine-uk-data.
2+
3+
The repo uploads data to two distinct HuggingFace model repos:
4+
5+
- ``PRIVATE_REPO = "policyengine/policyengine-uk-data-private"`` holds every
6+
artefact that is derived from UKDS-licensed microdata — raw FRS/LCFS/WAS/
7+
ETB/SPI zips, the enhanced FRS h5 files built on top of them, and any
8+
weights produced by calibrating against those datasets. Access is
9+
restricted to PolicyEngine collaborators who have accepted the UK Data
10+
Service End User Licence.
11+
- ``PUBLIC_REPO = "policyengine/policyengine-uk-data"`` is a separate,
12+
publicly readable repo that is populated via a different process and is
13+
**NOT** a place to push FRS-derived microdata. If in doubt about whether
14+
an artefact may go here, check with the data controller (currently
15+
Nikhil Woodruff).
16+
17+
This module exposes the destinations as module-level constants so callers
18+
can reference them by name instead of duplicating string literals across
19+
the codebase. It intentionally does NOT change any existing upload
20+
destinations — the PR that introduces this module only adds detection
21+
scaffolding. Existing destinations are preserved per repo policy and
22+
CLAUDE.md rule 1 ("NEVER upload data to any public location"), and any
23+
resolution of the naming inconsistency should happen explicitly in a
24+
separate PR signed off by the data controller.
25+
"""
26+
27+
from __future__ import annotations
28+
29+
from typing import Final
30+
31+
32+
PRIVATE_REPO: Final[str] = "policyengine/policyengine-uk-data-private"
"""Destination HuggingFace repo for UKDS-licensed, FRS-derived artefacts.

All FRS, LCFS, WAS, ETB, SPI and enhanced FRS uploads — together with any
weights or manifests derived from them — MUST be sent to this repo.
"""
38+
39+
PUBLIC_REPO: Final[str] = "policyengine/policyengine-uk-data"
"""HuggingFace repo holding the separately-maintained public mirror.

This repo is publicly readable and is filled by a distinct process; it is
not a valid destination for FRS-derived microdata. It is named here so that
intentional public reads (e.g. loading a non-UKDS sample dataset) can be told
apart from accidental public writes.
"""
47+
48+
49+
ALLOWED_REPOS: Final[frozenset[str]] = frozenset((PRIVATE_REPO, PUBLIC_REPO))
"""The complete set of HF repo names this package's code may reference.

`tests/test_hf_destinations.py` relies on it when AST-scanning every
`upload(...)`, `upload_file(...)`, `upload_files_to_hf(...)` and
`upload_data_files(...)` call site. A destination outside this set is a code
error; a destination inside it that is the wrong choice for the data at hand
is a policy decision that must be reviewed by the data controller.
"""

0 commit comments

Comments
 (0)