Skip to content

Commit d698369

Browse files
committed
remove H5 dependency for make database, pass year argument instead
1 parent d658aa7 commit d698369

4 files changed

Lines changed: 41 additions & 65 deletions

File tree

Makefile

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
SOI_SOURCE_YEAR ?= 2021
44
SOI_TARGET_YEAR ?= 2023
55

6+
YEAR ?= 2024
7+
68
GPU ?= T4
79
EPOCHS ?= 1000
810
NATIONAL_GPU ?= T4
@@ -75,14 +77,14 @@ documentation-dev:
7577
database:
7678
rm -f policyengine_us_data/storage/calibration/policy_data.db
7779
python policyengine_us_data/db/create_database_tables.py
78-
python policyengine_us_data/db/create_initial_strata.py
79-
python policyengine_us_data/db/etl_national_targets.py
80-
python policyengine_us_data/db/etl_age.py
81-
python policyengine_us_data/db/etl_medicaid.py
82-
python policyengine_us_data/db/etl_snap.py
83-
python policyengine_us_data/db/etl_state_income_tax.py
84-
python policyengine_us_data/db/etl_irs_soi.py
85-
python policyengine_us_data/db/etl_pregnancy.py
80+
python policyengine_us_data/db/create_initial_strata.py --year $(YEAR)
81+
python policyengine_us_data/db/etl_national_targets.py --year $(YEAR)
82+
python policyengine_us_data/db/etl_age.py --year $(YEAR)
83+
python policyengine_us_data/db/etl_medicaid.py --year $(YEAR)
84+
python policyengine_us_data/db/etl_snap.py --year $(YEAR)
85+
python policyengine_us_data/db/etl_state_income_tax.py --year $(YEAR)
86+
python policyengine_us_data/db/etl_irs_soi.py --year $(YEAR)
87+
python policyengine_us_data/db/etl_pregnancy.py --year $(YEAR)
8688
python policyengine_us_data/db/validate_database.py
8789

8890
database-refresh:

policyengine_us_data/db/etl_national_targets.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,20 +13,19 @@
1313
RETIREMENT_CONTRIBUTION_TARGETS,
1414
)
1515
from policyengine_us_data.utils.db import (
16-
DEFAULT_DATASET,
16+
DEFAULT_YEAR,
1717
etl_argparser,
1818
)
1919

2020

21-
def extract_national_targets(dataset: str = DEFAULT_DATASET):
21+
def extract_national_targets(year: int = DEFAULT_YEAR):
2222
"""
2323
Extract national calibration targets from various sources.
2424
2525
Parameters
2626
----------
27-
dataset : str
28-
Path to the calibration dataset (local path or HuggingFace URL).
29-
The time period is derived from the dataset's default_calculation_period.
27+
year : int
28+
Target year for calibration data.
3029
3130
Returns
3231
-------
@@ -38,15 +37,14 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET):
3837
- conditional_count_targets: Enrollment counts requiring constraints
3938
- cbo_targets: List of CBO projection targets
4039
- treasury_targets: List of Treasury/JCT targets
41-
- time_period: The year derived from the dataset
40+
- time_period: The target year
4241
"""
43-
from policyengine_us import Microsimulation
42+
from policyengine_us import CountryTaxBenefitSystem
4443

45-
print(f"Loading dataset: {dataset}")
46-
sim = Microsimulation(dataset=dataset)
44+
time_period = year
45+
print(f"Using time_period: {time_period}")
4746

48-
time_period = int(sim.default_calculation_period)
49-
print(f"Derived time_period from dataset: {time_period}")
47+
tax_benefit_system = CountryTaxBenefitSystem()
5048

5149
# Hardcoded dollar targets are specific to 2024 and should be
5250
# labeled as such. Only CBO/Treasury parameter lookups use the
@@ -400,7 +398,7 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET):
400398
for variable_name in cbo_vars:
401399
param_name = cbo_param_name_map.get(variable_name, variable_name)
402400
try:
403-
value = sim.tax_benefit_system.parameters(
401+
value = tax_benefit_system.parameters(
404402
time_period
405403
).calibration.gov.cbo._children[param_name]
406404
cbo_targets.append(
@@ -420,7 +418,7 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET):
420418

421419
# Treasury/JCT targets (EITC) - use time_period taken from the year argument
422420
try:
423-
eitc_value = sim.tax_benefit_system.parameters.calibration.gov.treasury.tax_expenditures.eitc(
421+
eitc_value = tax_benefit_system.parameters.calibration.gov.treasury.tax_expenditures.eitc(
424422
time_period
425423
)
426424
treasury_targets = [
@@ -883,11 +881,11 @@ def load_national_targets(
883881

884882
def main():
885883
"""Main ETL pipeline for national targets."""
886-
args, _ = etl_argparser("ETL for national calibration targets")
884+
_, year = etl_argparser("ETL for national calibration targets")
887885

888886
# Extract
889887
print("Extracting national targets...")
890-
raw_targets = extract_national_targets(dataset=args.dataset)
888+
raw_targets = extract_national_targets(year=year)
891889
time_period = raw_targets["time_period"]
892890
print(f"Using time_period={time_period} for CBO/Treasury targets")
893891

policyengine_us_data/utils/db.py

Lines changed: 9 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import argparse
2-
from pathlib import Path
32
from typing import Dict, List, Optional, Tuple
43

54
from sqlmodel import Session, select
@@ -9,54 +8,37 @@
98
Stratum,
109
StratumConstraint,
1110
)
12-
from policyengine_us_data.storage import STORAGE_FOLDER
1311

14-
DEFAULT_DATASET = str(STORAGE_FOLDER / "source_imputed_stratified_extended_cps_2024.h5")
12+
DEFAULT_YEAR = 2024
1513

1614

1715
def etl_argparser(
1816
description: str,
1917
extra_args_fn=None,
2018
) -> Tuple[argparse.Namespace, int]:
21-
"""Shared argument parsing and dataset-year derivation for ETL scripts.
19+
"""Shared argument parsing for ETL scripts.
2220
2321
Args:
2422
description: Description for the argparse help text.
2523
extra_args_fn: Optional callable that receives the parser to add
2624
extra arguments before parsing.
2725
2826
Returns:
29-
(args, year) where *year* is derived from the dataset's
30-
``default_calculation_period``.
27+
(args, year) where *year* is the integer given via ``--year`` (default ``DEFAULT_YEAR``).
3128
"""
3229
parser = argparse.ArgumentParser(description=description)
3330
parser.add_argument(
34-
"--dataset",
35-
default=DEFAULT_DATASET,
36-
help=(
37-
"Source dataset (local path or HuggingFace URL). "
38-
"The year is derived from the dataset's "
39-
"default_calculation_period. Default: %(default)s"
40-
),
31+
"--year",
32+
type=int,
33+
default=DEFAULT_YEAR,
34+
help="Target year for calibration data. Default: %(default)s",
4135
)
4236
if extra_args_fn is not None:
4337
extra_args_fn(parser)
4438

4539
args = parser.parse_args()
46-
47-
if not args.dataset.startswith("hf://") and not Path(args.dataset).exists():
48-
raise FileNotFoundError(
49-
f"Dataset not found: {args.dataset}\n"
50-
f"Either build it locally (`make data`) or pass a "
51-
f"HuggingFace URL via --dataset hf://policyengine/..."
52-
)
53-
54-
from policyengine_us import Microsimulation
55-
56-
print(f"Loading dataset: {args.dataset}")
57-
sim = Microsimulation(dataset=args.dataset)
58-
year = int(sim.default_calculation_period)
59-
print(f"Derived year from dataset: {year}")
40+
year = args.year
41+
print(f"Using year: {year}")
6042

6143
return args, year
6244

tests/integration/test_database_build.py

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import sqlite3
1111
import subprocess
1212
import sys
13-
from pathlib import Path
1413

1514
import pytest
1615

@@ -20,23 +19,18 @@
2019
DB_DIR = STORAGE_FOLDER / "calibration"
2120
DB_PATH = DB_DIR / "policy_data.db"
2221

23-
# HuggingFace URL for the stratified CPS dataset.
24-
# ETL scripts use this only to derive the time period (2024).
25-
HF_DATASET = (
26-
"hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5"
27-
)
28-
2922
# Scripts run in the same order as `make database` in the Makefile.
30-
# create_database_tables.py does not use etl_argparser.
23+
# create_database_tables.py and validate_database.py do not use etl_argparser.
3124
PIPELINE_SCRIPTS = [
3225
("db/create_database_tables.py", []),
33-
("db/create_initial_strata.py", ["--dataset", HF_DATASET]),
34-
("db/etl_national_targets.py", ["--dataset", HF_DATASET]),
35-
("db/etl_age.py", ["--dataset", HF_DATASET]),
36-
("db/etl_medicaid.py", ["--dataset", HF_DATASET]),
37-
("db/etl_snap.py", ["--dataset", HF_DATASET]),
38-
("db/etl_state_income_tax.py", ["--dataset", HF_DATASET]),
39-
("db/etl_irs_soi.py", ["--dataset", HF_DATASET]),
26+
("db/create_initial_strata.py", ["--year", "2024"]),
27+
("db/etl_national_targets.py", ["--year", "2024"]),
28+
("db/etl_age.py", ["--year", "2024"]),
29+
("db/etl_medicaid.py", ["--year", "2024"]),
30+
("db/etl_snap.py", ["--year", "2024"]),
31+
("db/etl_state_income_tax.py", ["--year", "2024"]),
32+
("db/etl_irs_soi.py", ["--year", "2024"]),
33+
("db/etl_pregnancy.py", ["--year", "2024"]),
4034
("db/validate_database.py", []),
4135
]
4236

0 commit comments

Comments
 (0)