Harden long-run production provenance (#1002)

MaxGhenis · web-flow · commit a9bc1a6fd368 · 2026-05-18T02:20:53.000-04:00
diff --git a/changelog.d/1002.fixed.md b/changelog.d/1002.fixed.md
@@ -0,0 +1 @@
+Harden long-run production provenance checks for Modal source packaging and output artifacts.
diff --git a/modal_app/long_term_projection.py b/modal_app/long_term_projection.py
@@ -69,6 +69,44 @@ def _local_git_sha() -> str:
     return result.stdout.strip()
 
 
+def _local_git_dirty() -> bool:  # True when `git status --porcelain` prints anything, or when git cannot be run
+    try:
+        result = subprocess.run(
+            ["git", "status", "--porcelain"],
+            cwd=_local,  # _local is defined elsewhere in this module; presumably the local checkout root -- confirm
+            check=True,  # a non-zero git exit raises CalledProcessError, handled below
+            capture_output=True,
+            text=True,
+        )
+    except (OSError, subprocess.CalledProcessError):
+        return True  # fail closed: a git failure is reported as a dirty checkout
+    return bool(result.stdout.strip())  # whitespace-only output counts as clean
+
+
+def _validate_local_source(source_sha: str, *, allow_dirty_source: bool) -> None:  # raises ValueError unless the checkout is clean and at source_sha
+    if allow_dirty_source:
+        return  # explicit opt-out: skips every check below, including the SHA comparison
+    if _local_git_dirty():
+        raise ValueError(
+            "The local policyengine-us-data checkout has uncommitted changes. "
+            "Commit and pass its SHA with --source-sha before running Modal, or "
+            "rerun with --allow-dirty-source for an explicitly non-publishable "
+            "experiment."
+        )
+    local_sha = _local_git_sha()
+    if not local_sha:  # _local_git_sha() produced an empty value
+        raise ValueError(
+            "Could not resolve the local policyengine-us-data git SHA; pass "
+            "--allow-dirty-source only for an explicitly non-publishable experiment."
+        )
+    if local_sha != source_sha:  # exact string match; NOTE(review): an abbreviated source_sha may never equal the local SHA -- confirm
+        raise ValueError(
+            "The requested source_sha does not match the local checkout that Modal "
+            f"will package: {source_sha} != {local_sha}. Check out the exact "
+            "source SHA before running production."
+        )
+
+
 def _append_optional_value(
     command: list[str],
     flag: str,
@@ -359,11 +397,16 @@ def main(
     support_augmentation_sanitize_worker_non_target_income: bool = False,
     support_augmentation_sanitize_clone_non_target_income: bool = False,
     spawn: bool = False,
+    allow_dirty_source: bool = False,
 ) -> None:
     if not source_sha:
         source_sha = os.environ.get("GITHUB_SHA", "") or _local_git_sha()
     if not source_sha:
         raise ValueError("source_sha is required; pass --source-sha.")
+    _validate_local_source(
+        source_sha,
+        allow_dirty_source=allow_dirty_source,
+    )
     run_id = sanitize_run_id(run_id)
     kwargs = {
         "years": years,
diff --git a/policyengine_us_data/datasets/cps/long_term/run_long_term_production.py b/policyengine_us_data/datasets/cps/long_term/run_long_term_production.py
@@ -8,7 +8,7 @@
 import subprocess
 import sys
 from datetime import UTC, datetime
-from importlib import metadata
+from importlib import import_module, metadata
 from pathlib import Path
 
 from policyengine_us_data.datasets.cps.long_term.run_household_projection_parallel import (
@@ -27,6 +27,10 @@
 DEFAULT_HF_REPO = "policyengine/policyengine-us-data"
 DEFAULT_ARTIFACT_PREFIX = "long_term"
 DEFAULT_TAX_ASSUMPTION = "trustees-2025-core-thresholds-v1"
+PACKAGE_VERSION_MODULES = {  # package name -> module whose __version__ _package_version reads when metadata lookup fails
+    "policyengine-us-data": "policyengine_us_data.__version__",
+    "policyengine_us_data": "policyengine_us_data.__version__",  # underscore spelling resolves to the same module
+}
 
 
 def _git_sha() -> str:
@@ -47,7 +51,73 @@ def _package_version(package_name: str) -> str | None:
     try:
         return metadata.version(package_name)
     except metadata.PackageNotFoundError:
-        return None
+        version_module = PACKAGE_VERSION_MODULES.get(package_name)
+        if version_module is None:
+            return None
+        try:
+            module = import_module(version_module)
+        except ImportError:
+            return None
+        return getattr(module, "__version__", None)
+
+
+def _write_json(path: Path, payload: dict) -> None:  # overwrite path with payload as indented, key-sorted JSON
+    path.write_text(
+        json.dumps(payload, indent=2, sort_keys=True) + "\n",  # a trailing newline follows the JSON text
+        encoding="utf-8",
+    )
+
+
+def stamp_projection_provenance(
+    *,
+    output_dir: Path,  # directory searched for *.h5.metadata.json sidecars and calibration_manifest.json
+    source_sha: str,  # stored under "source_sha"; an empty value leaves existing fields untouched
+    run_id: str,  # stored under "run_id"; an empty value leaves existing fields untouched
+) -> None:
+    """Stamp run provenance into artifacts created by the year runner."""
+    metadata_paths = sorted(output_dir.glob("*.h5.metadata.json"))  # non-recursive: direct children of output_dir only
+    if not metadata_paths:
+        raise FileNotFoundError(f"No year metadata sidecars found in {output_dir}.")
+
+    for metadata_path in metadata_paths:
+        payload = json.loads(metadata_path.read_text(encoding="utf-8"))
+        if source_sha:
+            payload["source_sha"] = source_sha
+        if run_id:
+            payload["run_id"] = run_id
+        _write_json(metadata_path, payload)  # NOTE(review): sidecar is rewritten before the H5 existence check below
+
+        h5_path = Path(str(metadata_path).removesuffix(".metadata.json"))  # e.g. 2075.h5.metadata.json -> 2075.h5
+        if not h5_path.exists():
+            raise FileNotFoundError(f"Missing H5 artifact for {metadata_path}.")  # earlier sidecars/H5 files stay stamped
+        with _open_h5_append(h5_path) as h5_file:
+            if source_sha:
+                h5_file.attrs["source_sha"] = source_sha  # same keys as the JSON sidecar, set on the file's attrs
+            if run_id:
+                h5_file.attrs["run_id"] = run_id
+
+    calibration_manifest_path = output_dir / "calibration_manifest.json"
+    if not calibration_manifest_path.exists():  # checked only after every sidecar/H5 pair above has been stamped
+        raise FileNotFoundError(
+            f"Missing calibration manifest: {calibration_manifest_path}"
+        )
+    manifest = json.loads(calibration_manifest_path.read_text(encoding="utf-8"))
+    if source_sha:
+        manifest["source_sha"] = source_sha
+    if run_id:
+        manifest["run_id"] = run_id
+    for dataset in manifest.get("datasets", {}).values():  # no "datasets" key: only the top-level fields are stamped
+        if source_sha:
+            dataset["source_sha"] = source_sha
+        if run_id:
+            dataset["run_id"] = run_id
+    _write_json(calibration_manifest_path, manifest)
+
+
+def _open_h5_append(path: Path):  # returns an h5py.File opened in "a" mode; the caller uses it in a `with` block
+    import h5py  # function-scope import: h5py is loaded only when this helper runs
+
+    return h5py.File(path, "a")  # "a" would create a missing file; stamp_projection_provenance checks existence first
 
 
 def _add_optional_value(
@@ -198,7 +268,7 @@ def write_manifest(
             "run_url": os.environ.get("US_DATA_GITHUB_RUN_URL", ""),
         },
         "package_versions": {
-            "policyengine-us-data": _package_version("policyengine_us_data"),
+            "policyengine-us-data": _package_version("policyengine-us-data"),
             "policyengine-us": _package_version("policyengine-us"),
             "policyengine-core": _package_version("policyengine-core"),
         },
@@ -339,6 +409,11 @@ def main() -> int:
     print("Running long-run projection command:")
     print(" ".join(command))
     subprocess.run(command, check=True)
+    stamp_projection_provenance(
+        output_dir=output_dir,
+        source_sha=source_sha,
+        run_id=run_id,
+    )
 
     artifacts = collect_artifacts(output_dir, args.artifact_prefix)
     manifest_path = write_manifest(
diff --git a/tests/unit/test_long_term_calibration_contract.py b/tests/unit/test_long_term_calibration_contract.py
@@ -3,7 +3,10 @@
 import hashlib
 import json
 import subprocess
+import sys
 from argparse import Namespace
+from importlib import metadata
+from pathlib import Path
 from types import SimpleNamespace
 import numpy as np
 import pytest
@@ -16,6 +19,7 @@
 from policyengine_us_data.datasets.cps.long_term import (
     prototype_synthetic_2100_support as synthetic_support_module,
 )
+from policyengine_us_data.datasets.cps.long_term import run_long_term_production
 from policyengine_us_data.datasets.cps.long_term.calibration import (
     assess_nonnegative_feasibility,
     build_calibration_audit,
@@ -88,7 +92,9 @@
     year_output_dir,
 )
 from policyengine_us_data.datasets.cps.long_term.run_long_term_production import (
+    _package_version,
     build_projection_command,
+    stamp_projection_provenance,
 )
 
 
@@ -2118,6 +2124,170 @@ def test_long_term_production_command_carries_2100_contract(tmp_path):
     assert "--allow-validation-failures" in command
 
 
+def test_long_term_production_stamps_source_sha_into_projection_artifacts(tmp_path):  # sidecar, H5 attrs and manifest all get both fields
+    import h5py
+
+    h5_path = tmp_path / "2075.h5"
+    with h5py.File(h5_path, "w") as h5_file:
+        h5_file.create_dataset("household_weight/2075", data=[1.0])  # minimal content so a real H5 file exists
+    metadata_path = tmp_path / "2075.h5.metadata.json"
+    metadata_path.write_text(
+        json.dumps(
+            {
+                "year": 2075,
+                "calibration_audit": {"calibration_quality": "exact"},
+            }
+        ),
+        encoding="utf-8",
+    )
+    manifest_path = tmp_path / "calibration_manifest.json"
+    manifest_path.write_text(
+        json.dumps({"datasets": {"2075": {"h5": "2075.h5"}}}),
+        encoding="utf-8",
+    )
+
+    stamp_projection_provenance(
+        output_dir=tmp_path,
+        source_sha="abc123",
+        run_id="run-123",
+    )
+
+    metadata = json.loads(metadata_path.read_text(encoding="utf-8"))  # local name; hides the module-level `metadata` import in this test only
+    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+    assert metadata["source_sha"] == "abc123"
+    assert metadata["run_id"] == "run-123"
+    with h5py.File(h5_path) as h5_file:
+        assert h5_file.attrs["source_sha"] == "abc123"
+        assert h5_file.attrs["run_id"] == "run-123"
+    assert manifest["source_sha"] == "abc123"
+    assert manifest["run_id"] == "run-123"
+    assert manifest["datasets"]["2075"]["source_sha"] == "abc123"  # per-dataset entries are stamped as well
+    assert manifest["datasets"]["2075"]["run_id"] == "run-123"
+
+
+def test_long_term_production_main_uploads_stamped_artifacts(
+    tmp_path,
+    monkeypatch,
+):  # drives main() with stubs and checks artifacts are stamped by the time upload_artifacts is called
+    import h5py
+
+    output_dir = tmp_path / "out"
+    captured_command = []
+
+    def fake_run(command, check):  # stand-in for subprocess.run: writes the files the year runner would produce
+        del check
+        captured_command.extend(command)
+        command_output_dir = Path(command[command.index("--output-dir") + 1])  # value following the --output-dir flag
+        command_output_dir.mkdir(parents=True, exist_ok=True)
+        with h5py.File(command_output_dir / "2075.h5", "w") as h5_file:
+            h5_file.create_dataset("household_weight/2075", data=[1.0])
+        (command_output_dir / "2075.h5.metadata.json").write_text(
+            json.dumps(
+                {
+                    "year": 2075,
+                    "base_dataset_path": "hf://example/base.h5",
+                    "profile": {"name": "ss-payroll-tob"},
+                    "calibration_audit": {"calibration_quality": "exact"},
+                }
+            ),
+            encoding="utf-8",
+        )
+        (command_output_dir / "calibration_manifest.json").write_text(
+            json.dumps({"datasets": {"2075": {"h5": "2075.h5"}}}),
+            encoding="utf-8",
+        )
+
+    uploaded = []
+
+    def fake_upload(  # stand-in for upload_artifacts: inspects on-disk state at upload time
+        *,
+        artifacts,
+        output_dir,
+        args,
+        run_id,
+        source_sha,
+    ):
+        uploaded.extend(path.name for path in artifacts if path.suffix == ".json")  # record JSON artifact names only
+        metadata_payload = json.loads(
+            (output_dir / "2075.h5.metadata.json").read_text(encoding="utf-8")
+        )
+        manifest_payload = json.loads(
+            (output_dir / "calibration_manifest.json").read_text(encoding="utf-8")
+        )
+        assert metadata_payload["source_sha"] == "abc123"  # stamping must already have happened
+        assert manifest_payload["source_sha"] == "abc123"
+        assert run_id == "run-123"
+        assert source_sha == "abc123"
+        assert args.upload_to_hf_staging is True
+        return len(artifacts)
+
+    build_info = PolicyEngineUSBuildInfo(
+        version="1.693.4",
+        locked_version="1.693.4",
+        package_file_sha256="file-sha",
+        package_tree_sha256="tree-sha",
+    )
+    monkeypatch.setattr(
+        run_long_term_production.subprocess,
+        "run",
+        fake_run,
+    )
+    monkeypatch.setattr(
+        run_long_term_production,
+        "assert_locked_policyengine_us_version",
+        lambda: build_info,  # returns the canned build info instead of running the real check
+    )
+    monkeypatch.setattr(
+        run_long_term_production,
+        "upload_artifacts",
+        fake_upload,
+    )
+    monkeypatch.setattr(run_long_term_production, "_git_sha", lambda: "abc123")
+    monkeypatch.setenv("HUGGING_FACE_TOKEN", "token")  # presumably required by the upload path -- confirm
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "run_long_term_production.py",
+            "--years",
+            "2075",
+            "--jobs",
+            "1",
+            "--output-dir",
+            str(output_dir),
+            "--run-id",
+            "run-123",
+            "--source-sha",
+            "abc123",
+            "--upload-to-hf-staging",
+        ],
+    )
+
+    assert run_long_term_production.main() == 0
+
+    assert captured_command  # the stubbed subprocess.run was invoked
+    assert "2075.h5.metadata.json" in uploaded
+    assert "calibration_manifest.json" in uploaded
+    with h5py.File(output_dir / "2075.h5") as h5_file:
+        assert h5_file.attrs["source_sha"] == "abc123"
+        assert h5_file.attrs["run_id"] == "run-123"
+
+
+def test_long_term_production_reads_source_tree_data_package_version(monkeypatch):  # fallback returns the source-tree __version__
+    from policyengine_us_data.__version__ import __version__
+
+    def fail_metadata_version(package_name):  # stand-in for metadata.version that always reports the package as missing
+        raise metadata.PackageNotFoundError(package_name)
+
+    monkeypatch.setattr(
+        "policyengine_us_data.datasets.cps.long_term."
+        "run_long_term_production.metadata.version",
+        fail_metadata_version,
+    )
+
+    assert _package_version("policyengine-us-data") == __version__  # hyphenated name is a PACKAGE_VERSION_MODULES key
+
+
 def test_parallel_projection_validate_forwarded_args_rejects_wrapper_flags():
     with pytest.raises(ValueError, match="--output-dir"):
         validate_forwarded_args(["--output-dir", "/tmp/out"])
diff --git a/tests/unit/test_long_term_modal_projection.py b/tests/unit/test_long_term_modal_projection.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Harden long-run production provenance checks for Modal source packaging and output artifacts.`