22
33from datetime import datetime , timezone
44import hashlib
5- from importlib import metadata as importlib_metadata
65import json
76from pathlib import Path
8- import subprocess
97from typing import Any
108
9+ from policyengine_us_data .utils .policyengine import (
10+ PolicyEngineUSBuildInfo ,
11+ get_policyengine_us_build_info ,
12+ )
13+
1114try :
1215 from .calibration_profiles import (
1316 classify_calibration_quality ,
2730SUPPORT_AUGMENTATION_REPORT_FILENAME = "support_augmentation_report.json"
2831
2932
def _coerce_policyengine_us_metadata(
    policyengine_us: PolicyEngineUSBuildInfo | dict[str, Any] | None,
) -> dict[str, Any]:
    """Normalize a policyengine-us build description into a metadata dict.

    Args:
        policyengine_us: ``None`` to look the build info up through
            ``get_policyengine_us_build_info()``, a
            ``PolicyEngineUSBuildInfo`` instance, or an already-serialized
            dict.

    Returns:
        The metadata dict. Build-info objects are converted with
        ``to_metadata_dict()``. A dict input is copied through a JSON
        round trip, so the caller's dict is never mutated; when the copy
        carries a non-null ``git_head``, the keys ``git_commit``,
        ``commit_id`` and ``direct_url`` are filled in from it unless they
        are already present.
    """
    if policyengine_us is None:
        current_build = get_policyengine_us_build_info()
        return current_build.to_metadata_dict()
    if isinstance(policyengine_us, PolicyEngineUSBuildInfo):
        return policyengine_us.to_metadata_dict()

    # JSON round trip: detaches the result from the caller's dict and
    # rejects values that could not be written to the metadata file anyway.
    normalized = json.loads(json.dumps(policyengine_us))
    head = normalized.get("git_head")
    if head is None:
        return normalized

    # Keys derived from git_head; existing values always win (setdefault).
    derived_defaults = {
        "git_commit": head,
        "commit_id": head,
        "direct_url": {"vcs_info": {"commit_id": head, "vcs": "git"}},
    }
    for key, default in derived_defaults.items():
        normalized.setdefault(key, default)
    return normalized
50+
51+
52+ def _metadata_policyengine_us (
53+ metadata_path : str | Path ,
54+ ) -> dict [str , Any ] | None :
55+ path = Path (metadata_path )
56+ if not path .exists ():
57+ return None
58+ metadata = json .loads (path .read_text (encoding = "utf-8" ))
59+ policyengine_us = metadata .get ("policyengine_us" )
60+ if policyengine_us is None :
61+ return None
62+ return json .loads (json .dumps (policyengine_us ))
63+
64+
3065def metadata_path_for (h5_path : str | Path ) -> Path :
3166 return Path (f"{ Path (h5_path )} .metadata.json" )
3267
@@ -39,55 +74,8 @@ def _sha256_file(path: Path) -> str:
3974 return hashlib .sha256 (path .read_bytes ()).hexdigest ()
4075
4176
42- def _find_git_repo_root (path : Path ) -> Path | None :
43- current = path if path .is_dir () else path .parent
44- for candidate in (current , * current .parents ):
45- if (candidate / ".git" ).exists ():
46- return candidate
47- return None
48-
49-
def capture_policyengine_us_provenance() -> dict[str, Any]:
    """Return the policyengine-us build info as a metadata dict.

    Delegates to ``get_policyengine_us_build_info()`` and returns the
    result of its ``to_metadata_dict()`` method.
    """
    build_info = get_policyengine_us_build_info()
    return build_info.to_metadata_dict()
9179
9280
9381def _resolve_base_dataset_path (base_dataset_path : str ) -> Path | None :
@@ -237,7 +225,7 @@ def write_year_metadata(
237225 target_source : dict [str , Any ] | None = None ,
238226 tax_assumption : dict [str , Any ] | None = None ,
239227 support_augmentation : dict [str , Any ] | None = None ,
240- policyengine_us : dict [str , Any ] | None = None ,
228+ policyengine_us : PolicyEngineUSBuildInfo | dict [str , Any ] | None = None ,
241229 base_dataset_snapshot : dict [str , Any ] | None = None ,
242230) -> Path :
243231 metadata = {
@@ -246,15 +234,14 @@ def write_year_metadata(
246234 "base_dataset_path" : base_dataset_path ,
247235 "profile" : profile ,
248236 "calibration_audit" : calibration_audit ,
237+ "policyengine_us" : _coerce_policyengine_us_metadata (policyengine_us ),
249238 }
250239 if target_source is not None :
251240 metadata ["target_source" ] = target_source
252241 if tax_assumption is not None :
253242 metadata ["tax_assumption" ] = tax_assumption
254243 if support_augmentation is not None :
255244 metadata ["support_augmentation" ] = support_augmentation
256- if policyengine_us is not None :
257- metadata ["policyengine_us" ] = policyengine_us
258245 if base_dataset_snapshot is not None :
259246 metadata ["base_dataset_snapshot" ] = base_dataset_snapshot
260247 metadata = normalize_metadata (metadata )
@@ -294,7 +281,7 @@ def update_dataset_manifest(
294281 target_source : dict [str , Any ] | None = None ,
295282 tax_assumption : dict [str , Any ] | None = None ,
296283 support_augmentation : dict [str , Any ] | None = None ,
297- policyengine_us : dict [str , Any ] | None = None ,
284+ policyengine_us : PolicyEngineUSBuildInfo | dict [str , Any ] | None = None ,
298285 base_dataset_snapshot : dict [str , Any ] | None = None ,
299286) -> Path :
300287 output_dir = Path (output_dir )
@@ -303,7 +290,13 @@ def update_dataset_manifest(
303290 target_source = _json_clone (target_source )
304291 tax_assumption = _json_clone (tax_assumption )
305292 support_augmentation = _json_clone (support_augmentation )
306- policyengine_us = _json_clone (policyengine_us )
293+ policyengine_us = (
294+ _coerce_policyengine_us_metadata (policyengine_us )
295+ if policyengine_us is not None
296+ else _metadata_policyengine_us (metadata_path )
297+ )
298+ if policyengine_us is None :
299+ policyengine_us = get_policyengine_us_build_info ().to_metadata_dict ()
307300 base_dataset_snapshot = _json_clone (base_dataset_snapshot )
308301
309302 if manifest_path .exists ():
@@ -402,6 +395,7 @@ def update_dataset_manifest(
402395 "negative_weight_household_pct" : calibration_audit .get (
403396 "negative_weight_household_pct"
404397 ),
398+ "policyengine_us_version" : policyengine_us .get ("version" ),
405399 "validation_passed" : calibration_audit .get ("validation_passed" ),
406400 "validation_issue_count" : len (calibration_audit .get ("validation_issues" , [])),
407401 }
0 commit comments