|
1 | | -import argparse |
2 | 1 | import logging |
3 | 2 | from typing import Optional |
4 | 3 |
|
|
8 | 7 | from sqlmodel import Session, create_engine, select |
9 | 8 |
|
10 | 9 | from policyengine_us_data.storage import STORAGE_FOLDER |
11 | | - |
12 | | -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" |
13 | | - |
14 | | -# IRS SOI data is typically available ~2 years after the tax year |
15 | | -IRS_SOI_LAG_YEARS = 2 |
16 | | -from policyengine_us_data.utils.raw_cache import ( |
17 | | - is_cached, |
18 | | - cache_path, |
19 | | - save_bytes, |
20 | | -) |
21 | | - |
22 | | -logger = logging.getLogger(__name__) |
23 | | - |
24 | 10 | from policyengine_us_data.db.create_database_tables import ( |
25 | 11 | Stratum, |
26 | 12 | StratumConstraint, |
|
33 | 19 | get_stratum_parent, |
34 | 20 | parse_ucgid, |
35 | 21 | get_geographic_strata, |
| 22 | + etl_argparser, |
36 | 23 | ) |
37 | 24 | from policyengine_us_data.utils.db_metadata import ( |
38 | 25 | get_or_create_source, |
|
43 | 30 | from policyengine_us_data.storage.calibration_targets.make_district_mapping import ( |
44 | 31 | get_district_mapping, |
45 | 32 | ) |
46 | | -from policyengine_us_data.utils.constraint_validation import ( |
47 | | - Constraint, |
48 | | - ensure_consistent_constraint_set, |
| 33 | +from policyengine_us_data.utils.raw_cache import ( |
| 34 | + is_cached, |
| 35 | + cache_path, |
| 36 | + save_bytes, |
49 | 37 | ) |
50 | 38 |
|
| 39 | +logger = logging.getLogger(__name__) |
| 40 | + |
| 41 | + |
| 42 | +# IRS SOI data is typically available ~2 years after the tax year |
| 43 | +IRS_SOI_LAG_YEARS = 2 |
| 44 | + |
51 | 45 | """See the 22incddocguide.docx manual from the IRS SOI""" |
52 | 46 | # Language in the doc: '$10,000 under $25,000' means >= $10,000 and < $25,000 |
53 | 47 | AGI_STUB_TO_INCOME_RANGE = { |
@@ -1316,40 +1310,23 @@ def load_soi_data(long_dfs, year): |
1316 | 1310 |
|
1317 | 1311 |
|
1318 | 1312 | def main(): |
1319 | | - parser = argparse.ArgumentParser( |
1320 | | - description="ETL for IRS SOI calibration targets" |
1321 | | - ) |
1322 | | - parser.add_argument( |
1323 | | - "--dataset", |
1324 | | - default=DEFAULT_DATASET, |
1325 | | - help=( |
1326 | | - "Source dataset (local path or HuggingFace URL). " |
1327 | | - "The year for IRS SOI data is derived from the dataset's " |
1328 | | - "default_calculation_period minus IRS_SOI_LAG_YEARS. " |
1329 | | - "Default: %(default)s" |
1330 | | - ), |
1331 | | - ) |
1332 | | - parser.add_argument( |
1333 | | - "--lag", |
1334 | | - type=int, |
1335 | | - default=IRS_SOI_LAG_YEARS, |
1336 | | - help=( |
1337 | | - "Years to subtract from dataset year for IRS SOI data " |
1338 | | - "(default: %(default)s, since IRS data is ~2 years behind)" |
1339 | | - ), |
1340 | | - ) |
1341 | | - args = parser.parse_args() |
1342 | | - |
1343 | | - # Derive year from dataset with lag applied |
1344 | | - from policyengine_us import Microsimulation |
| 1313 | + def add_lag_arg(parser): |
| 1314 | + parser.add_argument( |
| 1315 | + "--lag", |
| 1316 | + type=int, |
| 1317 | + default=IRS_SOI_LAG_YEARS, |
| 1318 | + help=( |
| 1319 | + "Years to subtract from dataset year for IRS SOI data " |
| 1320 | + "(default: %(default)s, since IRS data is ~2 years behind)" |
| 1321 | + ), |
| 1322 | + ) |
1345 | 1323 |
|
1346 | | - print(f"Loading dataset: {args.dataset}") |
1347 | | - sim = Microsimulation(dataset=args.dataset) |
1348 | | - dataset_year = int(sim.default_calculation_period) |
1349 | | - year = dataset_year - args.lag |
1350 | | - print( |
1351 | | - f"Dataset year: {dataset_year}, IRS SOI year: {year} (lag={args.lag})" |
| 1324 | + args, dataset_year = etl_argparser( |
| 1325 | + "ETL for IRS SOI calibration targets", |
| 1326 | + extra_args_fn=add_lag_arg, |
1352 | 1327 | ) |
| 1328 | + year = dataset_year - args.lag |
| 1329 | + print(f"IRS SOI year: {year} (lag={args.lag})") |
1353 | 1330 |
|
1354 | 1331 | # Extract ----------------------- |
1355 | 1332 | raw_df = extract_soi_data() |
|
0 commit comments