|
1 | | -import argparse |
2 | 1 | import logging |
3 | 2 | from typing import Optional |
4 | 3 |
|
|
8 | 7 | from sqlmodel import Session, create_engine, select |
9 | 8 |
|
10 | 9 | from policyengine_us_data.storage import STORAGE_FOLDER |
11 | | - |
12 | | -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" |
13 | | - |
14 | | -# IRS SOI data is typically available ~2 years after the tax year |
15 | | -IRS_SOI_LAG_YEARS = 2 |
16 | | -from policyengine_us_data.utils.raw_cache import ( |
17 | | - is_cached, |
18 | | - cache_path, |
19 | | - save_bytes, |
20 | | -) |
21 | | - |
22 | | -logger = logging.getLogger(__name__) |
23 | | - |
24 | 10 | from policyengine_us_data.db.create_database_tables import ( |
25 | 11 | Stratum, |
26 | 12 | StratumConstraint, |
|
34 | 20 | get_stratum_parent, |
35 | 21 | parse_ucgid, |
36 | 22 | get_geographic_strata, |
| 23 | + etl_argparser, |
37 | 24 | ) |
38 | 25 | from policyengine_us_data.utils.db_metadata import ( |
39 | 26 | get_or_create_source, |
|
44 | 31 | from policyengine_us_data.storage.calibration_targets.make_district_mapping import ( |
45 | 32 | get_district_mapping, |
46 | 33 | ) |
| 34 | +from policyengine_us_data.utils.raw_cache import ( |
| 35 | + is_cached, |
| 36 | + cache_path, |
| 37 | + save_bytes, |
| 38 | +) |
| 39 | + |
| 40 | +logger = logging.getLogger(__name__) |
| 41 | + |
| 42 | + |
| 43 | +# IRS SOI data is typically available ~2 years after the tax year |
| 44 | +IRS_SOI_LAG_YEARS = 2 |
47 | 45 |
|
48 | 46 | """See the 22incddocguide.docx manual from the IRS SOI""" |
49 | 47 | # Language in the doc: '$10,000 under $25,000' means >= $10,000 and < $25,000 |
@@ -1236,40 +1234,23 @@ def load_soi_data(long_dfs, year): |
1236 | 1234 |
|
1237 | 1235 |
|
1238 | 1236 | def main(): |
1239 | | - parser = argparse.ArgumentParser( |
1240 | | - description="ETL for IRS SOI calibration targets" |
1241 | | - ) |
1242 | | - parser.add_argument( |
1243 | | - "--dataset", |
1244 | | - default=DEFAULT_DATASET, |
1245 | | - help=( |
1246 | | - "Source dataset (local path or HuggingFace URL). " |
1247 | | - "The year for IRS SOI data is derived from the dataset's " |
1248 | | - "default_calculation_period minus IRS_SOI_LAG_YEARS. " |
1249 | | - "Default: %(default)s" |
1250 | | - ), |
1251 | | - ) |
1252 | | - parser.add_argument( |
1253 | | - "--lag", |
1254 | | - type=int, |
1255 | | - default=IRS_SOI_LAG_YEARS, |
1256 | | - help=( |
1257 | | - "Years to subtract from dataset year for IRS SOI data " |
1258 | | - "(default: %(default)s, since IRS data is ~2 years behind)" |
1259 | | - ), |
1260 | | - ) |
1261 | | - args = parser.parse_args() |
1262 | | - |
1263 | | - # Derive year from dataset with lag applied |
1264 | | - from policyengine_us import Microsimulation |
| 1237 | + def add_lag_arg(parser): |
| 1238 | + parser.add_argument( |
| 1239 | + "--lag", |
| 1240 | + type=int, |
| 1241 | + default=IRS_SOI_LAG_YEARS, |
| 1242 | + help=( |
| 1243 | + "Years to subtract from dataset year for IRS SOI data " |
| 1244 | + "(default: %(default)s, since IRS data is ~2 years behind)" |
| 1245 | + ), |
| 1246 | + ) |
1265 | 1247 |
|
1266 | | - print(f"Loading dataset: {args.dataset}") |
1267 | | - sim = Microsimulation(dataset=args.dataset) |
1268 | | - dataset_year = int(sim.default_calculation_period) |
1269 | | - year = dataset_year - args.lag |
1270 | | - print( |
1271 | | - f"Dataset year: {dataset_year}, IRS SOI year: {year} (lag={args.lag})" |
| 1248 | + args, dataset_year = etl_argparser( |
| 1249 | + "ETL for IRS SOI calibration targets", |
| 1250 | + extra_args_fn=add_lag_arg, |
1272 | 1251 | ) |
| 1252 | + year = dataset_year - args.lag |
| 1253 | + print(f"IRS SOI year: {year} (lag={args.lag})") |
1273 | 1254 |
|
1274 | 1255 | # Extract ----------------------- |
1275 | 1256 | raw_df = extract_soi_data() |
|
0 commit comments