|
| 1 | +"""AST-based guard that every HF upload call routes through the shared |
| 2 | +`PRIVATE_REPO` / `PUBLIC_REPO` constants in |
| 3 | +:mod:`policyengine_uk_data.utils.hf_destinations`. |
| 4 | +
|
| 5 | +Motivation (bug-hunt finding S1): |
| 6 | +
|
| 7 | +- ``storage/upload_private_prerequisites.py`` uploads UKDS-licensed FRS/LCFS/ |
| 8 | + WAS/ETB/SPI zips with a literal ``repo="policyengine/policyengine-uk-data"`` |
| 9 | + argument — i.e. the PUBLIC repo. |
| 10 | +- ``utils/data_upload.py::upload_data_files`` defaults ``hf_repo_name`` to the |
| 11 | + PUBLIC repo, while the sibling ``upload_files_to_hf`` defaults to the |
| 12 | + PRIVATE repo. |
| 13 | +- Mixed literals across the codebase mean one typo in a future script could |
| 14 | + silently leak microdata. |
| 15 | +
|
| 16 | +Approach: |
| 17 | +
|
| 18 | +- Parse every ``.py`` file in this package with :mod:`ast`. |
| 19 | +- For every call to one of the upload entry points (``upload``, |
| 20 | + ``upload_file``, ``upload_files_to_hf``, ``upload_data_files``), look for |
| 21 | + the ``repo=`` / ``hf_repo_name=`` keyword argument. |
| 22 | +- If the argument is a string literal that isn't accessed via |
| 23 | + ``hf_destinations.PRIVATE_REPO`` / ``PUBLIC_REPO`` (or the module-level |
| 24 | + ``ALLOWED_REPOS`` set), record it as a violation. |
| 25 | +
|
| 26 | +The test is marked ``xfail`` until every call site is migrated. It will |
| 27 | +begin failing (i.e. "pass unexpectedly") as a signal that the clean-up is |
| 28 | +complete, at which point the ``xfail`` decorator should be removed. |
| 29 | +
|
| 30 | +**Do NOT silence this test by changing the destinations in place.** The |
| 31 | +existing destinations are preserved by repo policy (see CLAUDE.md rule 1 |
| 32 | +and the policyengine-uk-data private/public split). Resolving the naming |
| 33 | +inconsistency requires a data-controller decision — either rename the HF |
| 34 | +repos or migrate each script individually with sign-off — not a blanket |
| 35 | +string swap in this PR. |
| 36 | +""" |
| 37 | + |
| 38 | +from __future__ import annotations |
| 39 | + |
| 40 | +import ast |
| 41 | +from pathlib import Path |
| 42 | + |
| 43 | +import pytest |
| 44 | + |
| 45 | + |
| 46 | +UPLOAD_CALL_NAMES: set[str] = { |
| 47 | + "upload", |
| 48 | + "upload_file", |
| 49 | + "upload_files_to_hf", |
| 50 | + "upload_data_files", |
| 51 | +} |
| 52 | + |
| 53 | +KEYWORD_ARGS: set[str] = {"repo", "hf_repo_name", "repo_id"} |
| 54 | + |
| 55 | +# Files that this test intentionally does NOT scan for violations: |
| 56 | +# - The constants module itself (defines the literals). |
| 57 | +# - The test file that validates those literals. |
| 58 | +# - The xfail guard below. |
| 59 | +IGNORED_FILENAMES: set[str] = { |
| 60 | + "hf_destinations.py", |
| 61 | + "test_hf_destinations.py", |
| 62 | +} |
| 63 | + |
| 64 | + |
| 65 | +def _iter_py_files() -> list[Path]: |
| 66 | + root = Path(__file__).resolve().parent.parent |
| 67 | + files: list[Path] = [] |
| 68 | + for path in root.rglob("*.py"): |
| 69 | + if path.name in IGNORED_FILENAMES: |
| 70 | + continue |
| 71 | + if "__pycache__" in path.parts: |
| 72 | + continue |
| 73 | + files.append(path) |
| 74 | + return files |
| 75 | + |
| 76 | + |
| 77 | +def _call_name(call: ast.Call) -> str | None: |
| 78 | + """Return the simple name of a call target, if recognisable. |
| 79 | +
|
| 80 | + Handles both ``upload(...)`` and ``hf.upload(...)`` styles. |
| 81 | + """ |
| 82 | + func = call.func |
| 83 | + if isinstance(func, ast.Name): |
| 84 | + return func.id |
| 85 | + if isinstance(func, ast.Attribute): |
| 86 | + return func.attr |
| 87 | + return None |
| 88 | + |
| 89 | + |
| 90 | +def _kwarg_value(call: ast.Call, name: str) -> ast.AST | None: |
| 91 | + for kw in call.keywords: |
| 92 | + if kw.arg == name: |
| 93 | + return kw.value |
| 94 | + return None |
| 95 | + |
| 96 | + |
| 97 | +def _is_allowed_reference(node: ast.AST) -> bool: |
| 98 | + """Check whether a kwarg value routes through the shared constants.""" |
| 99 | + if isinstance(node, ast.Attribute): |
| 100 | + # hf_destinations.PRIVATE_REPO / .PUBLIC_REPO |
| 101 | + if node.attr in {"PRIVATE_REPO", "PUBLIC_REPO"}: |
| 102 | + return True |
| 103 | + if isinstance(node, ast.Name): |
| 104 | + # Imported as `from ...hf_destinations import PRIVATE_REPO`. |
| 105 | + if node.id in {"PRIVATE_REPO", "PUBLIC_REPO"}: |
| 106 | + return True |
| 107 | + return False |
| 108 | + |
| 109 | + |
| 110 | +def _collect_violations() -> list[str]: |
| 111 | + violations: list[str] = [] |
| 112 | + for path in _iter_py_files(): |
| 113 | + try: |
| 114 | + tree = ast.parse(path.read_text(encoding="utf-8"), filename=str(path)) |
| 115 | + except SyntaxError: |
| 116 | + # Don't let a single syntax-invalid file abort the scan. |
| 117 | + continue |
| 118 | + for node in ast.walk(tree): |
| 119 | + if not isinstance(node, ast.Call): |
| 120 | + continue |
| 121 | + name = _call_name(node) |
| 122 | + if name not in UPLOAD_CALL_NAMES: |
| 123 | + continue |
| 124 | + for kwarg in KEYWORD_ARGS: |
| 125 | + value = _kwarg_value(node, kwarg) |
| 126 | + if value is None: |
| 127 | + continue |
| 128 | + if isinstance(value, ast.Constant) and isinstance(value.value, str): |
| 129 | + # A raw string literal — flag it. |
| 130 | + violations.append( |
| 131 | + f"{path}:{node.lineno} " |
| 132 | + f"{name}(..., {kwarg}={value.value!r}, ...)" |
| 133 | + ) |
| 134 | + elif not _is_allowed_reference(value): |
| 135 | + # Any other expression (variable, call, f-string, etc.) |
| 136 | + # that isn't demonstrably the shared constant. |
| 137 | + violations.append( |
| 138 | + f"{path}:{node.lineno} {name}(..., {kwarg}=<expr>)" |
| 139 | + ) |
| 140 | + return violations |
| 141 | + |
| 142 | + |
| 143 | +@pytest.mark.xfail( |
| 144 | + reason=( |
| 145 | + "Known naming inconsistency; existing destinations intentionally " |
| 146 | + "preserved per repo policy. Resolve by renaming the HF repo or " |
| 147 | + "migrating scripts — NOT by changing code in place without owner " |
| 148 | + "approval." |
| 149 | + ), |
| 150 | + strict=False, |
| 151 | +) |
| 152 | +def test_every_hf_upload_routes_through_guard_constants() -> None: |
| 153 | + violations = _collect_violations() |
| 154 | + if violations: |
| 155 | + formatted = "\n ".join(violations) |
| 156 | + pytest.fail( |
| 157 | + "The following upload-site call arguments bypass the shared " |
| 158 | + "PRIVATE_REPO / PUBLIC_REPO constants in " |
| 159 | + "policyengine_uk_data.utils.hf_destinations:\n " + formatted |
| 160 | + ) |
| 161 | + |
| 162 | + |
| 163 | +def test_hf_destinations_constants_are_distinct_and_well_formed() -> None: |
| 164 | + """Sanity: the two constants are different and look like HF repo ids.""" |
| 165 | + from policyengine_uk_data.utils.hf_destinations import ( |
| 166 | + ALLOWED_REPOS, |
| 167 | + PRIVATE_REPO, |
| 168 | + PUBLIC_REPO, |
| 169 | + ) |
| 170 | + |
| 171 | + assert PRIVATE_REPO != PUBLIC_REPO |
| 172 | + assert PRIVATE_REPO == "policyengine/policyengine-uk-data-private" |
| 173 | + assert PUBLIC_REPO == "policyengine/policyengine-uk-data" |
| 174 | + assert ALLOWED_REPOS == {PRIVATE_REPO, PUBLIC_REPO} |
| 175 | + for repo in ALLOWED_REPOS: |
| 176 | + assert repo.startswith("policyengine/"), repo |
0 commit comments