Skip to content

Commit 9c553d1

Browse files
committed
Use repo-local raw PUF fallback
1 parent 960ac2f commit 9c553d1

2 files changed

Lines changed: 84 additions & 4 deletions

File tree

src/microplex_us/data_sources/puf.py

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,31 @@ def download_puf(cache_dir: Path | None = None) -> Path:
401401
return Path(puf_path), Path(demo_path)
402402

403403

404+
def _resolve_policyengine_repo_local_puf_paths(
405+
policyengine_us_data_repo: str | Path | None,
406+
) -> tuple[Path, Path | None] | None:
407+
"""Resolve raw PUF CSVs from a local policyengine-us-data checkout."""
408+
409+
if policyengine_us_data_repo is None:
410+
return None
411+
try:
412+
repo_root = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo)
413+
except FileNotFoundError:
414+
return None
415+
candidate_dirs = (
416+
repo_root / "policyengine_us_data" / "storage",
417+
repo_root / "data" / "raw",
418+
)
419+
for candidate_dir in candidate_dirs:
420+
puf_path = candidate_dir / "puf_2015.csv"
421+
demographics_path = candidate_dir / "demographics_2015.csv"
422+
if puf_path.exists():
423+
return puf_path, (
424+
demographics_path if demographics_path.exists() else None
425+
)
426+
return None
427+
428+
404429
def load_puf_raw(puf_path: Path, demographics_path: Path | None = None) -> pd.DataFrame:
405430
"""Load raw PUF data from CSV."""
406431
print(f"Loading PUF from {puf_path}...")
@@ -1772,8 +1797,16 @@ def load_puf(
17721797
Returns:
17731798
DataFrame with common variable names, ready for stacking with CPS
17741799
"""
1775-
# Download if needed
1776-
puf_path, demo_path = download_puf(cache_dir)
1800+
# Prefer a repo-local raw PUF copy when available to avoid remote auth/cache
1801+
# requirements during rebuild runs.
1802+
local_repo_paths = _resolve_policyengine_repo_local_puf_paths(
1803+
policyengine_us_data_repo
1804+
)
1805+
if local_repo_paths is not None:
1806+
puf_path, demo_path = local_repo_paths
1807+
print(f"Using repo-local PUF from {puf_path}...")
1808+
else:
1809+
puf_path, demo_path = download_puf(cache_dir)
17771810

17781811
# Load raw data
17791812
raw = load_puf_raw(puf_path, demo_path)
@@ -2195,8 +2228,15 @@ def load_frame(self, query: SourceQuery | None = None) -> ObservationFrame:
21952228
)
21962229
)
21972230
if puf_path is None:
2198-
loader = self.loader or download_puf
2199-
loaded_puf_path, loaded_demographics_path = loader(self.cache_dir)
2231+
local_repo_paths = _resolve_policyengine_repo_local_puf_paths(
2232+
policyengine_us_data_repo
2233+
)
2234+
if local_repo_paths is not None:
2235+
loaded_puf_path, loaded_demographics_path = local_repo_paths
2236+
print(f"Using repo-local PUF from {loaded_puf_path}...")
2237+
else:
2238+
loader = self.loader or download_puf
2239+
loaded_puf_path, loaded_demographics_path = loader(self.cache_dir)
22002240
puf_path = loaded_puf_path
22012241
if demographics_path is None:
22022242
demographics_path = loaded_demographics_path

tests/test_puf_source_provider.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1058,6 +1058,46 @@ def fail_download(*args, **kwargs):
10581058
assert resolved_demo_path == demographics_path
10591059

10601060

1061+
def test_puf_source_provider_prefers_policyengine_repo_local_raw_files(
    tmp_path, monkeypatch
):
    """Repo-local raw PUF CSVs are used and the injected loader is never called."""
    checkout = tmp_path / "policyengine-us-data"
    raw_dir = checkout / "policyengine_us_data" / "storage"
    raw_dir.mkdir(parents=True)

    # A single-record PUF extract plus its matching demographics file.
    puf_record = {
        "RECID": [101],
        "MARS": [1],
        "XTOT": [1],
        "S006": [100.0],
        "E00200": [50_000.0],
        "E02400": [0.0],
        "E01400": [0.0],
        "AGE_HEAD": [45],
        "GENDER": [1],
    }
    pd.DataFrame(puf_record).to_csv(raw_dir / "puf_2015.csv", index=False)
    pd.DataFrame({"RECID": [101]}).to_csv(
        raw_dir / "demographics_2015.csv", index=False
    )

    def fail_loader(*args, **kwargs):
        # Any call here means the provider ignored the repo-local files.
        raise AssertionError("remote/cache loader should not run when repo-local PUF exists")

    provider = PUFSourceProvider(
        target_year=2015,
        policyengine_us_data_repo=checkout,
        loader=fail_loader,
        social_security_share_model_loader=_mock_social_security_share_model_loader,
    )

    loaded = provider.load_frame(SourceQuery(period=2015))
    person_table = loaded.tables[EntityType.PERSON]

    assert len(person_table) == 1
    assert person_table["employment_income"].sum() == 50_000.0
1099+
1100+
10611101
def test_map_puf_variables_seed_controls_age_imputation():
10621102
puf = pd.DataFrame(
10631103
{

0 commit comments

Comments
 (0)