|
31 | 31 | save_bytes, |
32 | 32 | ) |
33 | 33 | from policyengine_us_data.utils.soi import get_tracked_soi_row |
| 34 | +from policyengine_us_data.storage.calibration_targets.pull_soi_targets import ( |
| 35 | + STATE_ABBR_TO_FIPS, |
| 36 | +) |
| 37 | +from policyengine_us_data.storage.calibration_targets.refresh_soi_table_targets import ( |
| 38 | + _load_workbook, |
| 39 | + _scaled_cell, |
| 40 | +) |
34 | 41 |
|
35 | 42 | logger = logging.getLogger(__name__) |
36 | 43 |
|
|
57 | 64 | 9: (500_000, np.inf), # $500,000 or more |
58 | 65 | } |
59 | 66 |
|
# AGI stubs of the state-level SOI `in55cmcsv` file that split the top-coded
# $500,000+ tail into two finer bins.
# Maps AGI_STUB code -> (lower bound, upper bound) in dollars; consumers in
# this module apply the lower bound with ">=" and the upper bound with "<".
STATE_FINE_AGI_STUBS = {
    9: (500_000, 1_000_000),  # $500,000 under $1,000,000
    10: (1_000_000, np.inf),  # $1,000,000 or more
}
| 71 | + |
# National fine AGI brackets covering the $500,000+ tail.
# Maps the "Table 1.4" worksheet row number -> (lower bound, upper bound) in
# dollars; consumers in this module apply the lower bound with ">=" and the
# upper bound with "<".
# NOTE(review): the row numbers are hard-coded; confirm they line up with the
# workbook layout for every year that is loaded.
NATIONAL_FINE_AGI_BRACKETS = {
    23: (500_000, 1_000_000),  # Table 1.4 row 23
    24: (1_000_000, 1_500_000),  # row 24
    25: (1_500_000, 2_000_000),  # row 25
    26: (2_000_000, 5_000_000),  # row 26
    27: (5_000_000, 10_000_000),  # row 27
    28: (10_000_000, np.inf),  # row 28
}
| 80 | + |
| 81 | + |
def _skip_coarse_state_agi_person_count_target(geo_type: str, agi_stub: int) -> bool:
    """Return True when the coarse state 500k+ count target should be skipped.

    The standard geography-file SOI feed only has a top-coded state AGI stub 9
    (500k+). We separately load `in55cmcsv`, which splits that state tail into
    500k-1m and 1m+. Keeping the coarse state count target alongside the fine
    rows would double-constrain the same top-tail population in calibration.

    Args:
        geo_type: Geography kind of the target row (e.g. ``"state"``).
        agi_stub: AGI stub code of the target row.

    Returns:
        ``True`` only for a state-level row in AGI stub 9, ``False`` otherwise.
    """
    # bool() keeps the declared return type even when ``agi_stub`` arrives as a
    # NumPy integer, whose comparison would otherwise yield ``numpy.bool_``.
    return bool(geo_type == "state" and agi_stub == 9)
| 92 | + |
| 93 | + |
60 | 94 | # These variables map cleanly from Publication 1304 aggregate tables to the |
61 | 95 | # existing national IRS-SOI domain strata. We intentionally leave `aca_ptc` |
62 | 96 | # and `refundable_ctc` on the geography-file path for now because the |
@@ -396,6 +430,179 @@ def load_national_workbook_soi_targets( |
396 | 430 | ) |
397 | 431 |
|
398 | 432 |
|
def extract_state_fine_agi_data(year: int) -> pd.DataFrame:
    """Load the state-level SOI file (in55cmcsv), keeping AGI stubs 9 and 10.

    The CSV is read from the local cache when present; otherwise it is
    downloaded from irs.gov, written to the cache, and then read from there.

    Args:
        year: Year whose file is requested; turned into the file-name prefix
            by ``_year_prefix``.

    Returns:
        The rows whose ``AGI_STUB`` is a key of ``STATE_FINE_AGI_STUBS`` and
        whose ``STATE`` is a key of ``STATE_ABBR_TO_FIPS``.

    Raises:
        requests.HTTPError: If the download answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    year_prefix = _year_prefix(year)
    cache_file = f"irs_soi_{year_prefix}in55cmcsv.csv"
    if is_cached(cache_file):
        logger.info("Using cached %s", cache_file)
    else:
        import requests

        url = f"https://www.irs.gov/pub/irs-soi/{year_prefix}in55cmcsv.csv"
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        save_bytes(cache_file, response.content)

    # Both branches end with the file in the cache, so read it exactly once.
    df = pd.read_csv(cache_path(cache_file), thousands=",")

    wanted_stub = df["AGI_STUB"].isin(list(STATE_FINE_AGI_STUBS))
    known_state = df["STATE"].isin(list(STATE_ABBR_TO_FIPS))
    return df[wanted_stub & known_state]
| 452 | + |
| 453 | + |
def _state_fine_agi_constraints(fips_int: int, lower, upper) -> list:
    """Build the filer, state and AGI-range constraints for one state stratum."""
    return [
        StratumConstraint(
            constraint_variable="tax_unit_is_filer",
            operation="==",
            value="1",
        ),
        StratumConstraint(
            constraint_variable="state_fips",
            operation="==",
            value=str(fips_int),
        ),
        StratumConstraint(
            constraint_variable="adjusted_gross_income",
            operation=">=",
            value=str(lower),
        ),
        StratumConstraint(
            constraint_variable="adjusted_gross_income",
            operation="<",
            value=str(upper),
        ),
    ]


def load_state_fine_agi_targets(
    session: Session, filer_strata: dict, year: int
) -> None:
    """Create strata and targets for state-level fine AGI brackets (stubs 9/10).

    For every row returned by ``extract_state_fine_agi_data`` this looks up
    (or creates) a stratum under the state's filer stratum and upserts a
    ``person_count`` and an ``adjusted_gross_income`` target on it.

    Args:
        session: Database session used to query, add and flush strata.
        filer_strata: Mapping whose ``"state"`` entry maps an integer state
            FIPS code to the id of that state's filer stratum.
        year: Year of the SOI file to load; also used as the target period.

    Raises:
        KeyError: If a state in the file has no entry in
            ``filer_strata["state"]``.
    """
    df = extract_state_fine_agi_data(year)

    for _, row in df.iterrows():
        stub = int(row["AGI_STUB"])
        fips_int = int(STATE_ABBR_TO_FIPS[row["STATE"]])
        lower, upper = STATE_FINE_AGI_STUBS[stub]

        parent_stratum_id = filer_strata["state"][fips_int]
        # The note doubles as the lookup key for an already-created stratum.
        note = f"State FIPS {fips_int} filers, AGI >= {lower}, AGI < {upper}"

        stratum = (
            session.query(Stratum)
            .filter(
                Stratum.parent_stratum_id == parent_stratum_id,
                Stratum.notes == note,
            )
            .first()
        )
        if stratum is None:
            stratum = Stratum(
                parent_stratum_id=parent_stratum_id,
                notes=note,
            )
            stratum.constraints_rel.extend(
                _state_fine_agi_constraints(fips_int, lower, upper)
            )
            session.add(stratum)
            # Flush before ``stratum.stratum_id`` is read below.
            session.flush()

        target_note = f"State fine AGI stub {stub} from in55cmcsv"
        # NOTE(review): A00100 is multiplied by 1000, presumably because the
        # file reports amounts in thousands of dollars - confirm against the
        # SOI documentation. N2 is stored unscaled as the person count.
        targets = (
            ("person_count", float(row["N2"])),
            ("adjusted_gross_income", float(row["A00100"]) * 1000),
        )
        for variable, value in targets:
            _upsert_target(
                session,
                stratum_id=stratum.stratum_id,
                variable=variable,
                period=year,
                value=value,
                source="IRS SOI",
                notes=target_note,
            )
| 534 | + |
| 535 | + |
def _national_fine_agi_constraints(lower, upper) -> list:
    """Build the filer and AGI-range constraints for one national stratum."""
    return [
        StratumConstraint(
            constraint_variable="tax_unit_is_filer",
            operation="==",
            value="1",
        ),
        StratumConstraint(
            constraint_variable="adjusted_gross_income",
            operation=">=",
            value=str(lower),
        ),
        StratumConstraint(
            constraint_variable="adjusted_gross_income",
            operation="<",
            value=str(upper),
        ),
    ]


def load_national_fine_agi_targets(
    session: Session, national_filer_stratum_id: int, target_year: int
) -> None:
    """Create strata and targets for national fine AGI brackets from Table 1.4.

    For every entry of ``NATIONAL_FINE_AGI_BRACKETS`` this looks up (or
    creates) a stratum under the national filer stratum and upserts a
    ``tax_unit_count`` and an ``adjusted_gross_income`` target on it.

    Args:
        session: Database session used to query, add and flush strata.
        national_filer_stratum_id: Id of the parent (national filer) stratum.
        target_year: Year passed to ``_load_workbook``; also used as the
            target period.
    """
    workbook = _load_workbook("Table 1.4", target_year)

    for excel_row, (lower, upper) in NATIONAL_FINE_AGI_BRACKETS.items():
        # The note doubles as the lookup key for an already-created stratum.
        note = f"National filers, AGI >= {lower}, AGI < {upper}"

        stratum = (
            session.query(Stratum)
            .filter(
                Stratum.parent_stratum_id == national_filer_stratum_id,
                Stratum.notes == note,
            )
            .first()
        )
        if stratum is None:
            stratum = Stratum(
                parent_stratum_id=national_filer_stratum_id,
                notes=note,
            )
            stratum.constraints_rel.extend(
                _national_fine_agi_constraints(lower, upper)
            )
            session.add(stratum)
            # Flush before ``stratum.stratum_id`` is read below.
            session.flush()

        target_note = f"Table 1.4 row {excel_row} fine AGI bracket"
        # Column "B" is read as a count and column "C" as an amount; both cells
        # are read before the first target is written.
        targets = (
            (
                "tax_unit_count",
                _scaled_cell(workbook, excel_row, "B", is_count=True),
            ),
            (
                "adjusted_gross_income",
                _scaled_cell(workbook, excel_row, "C", is_count=False),
            ),
        )
        for variable, value in targets:
            _upsert_target(
                session,
                stratum_id=stratum.stratum_id,
                variable=variable,
                period=target_year,
                value=value,
                source="IRS SOI",
                notes=target_note,
            )
| 604 | + |
| 605 | + |
399 | 606 | def transform_soi_data(raw_df): |
400 | 607 |
|
401 | 608 | TARGETS = [ |
@@ -645,7 +852,9 @@ def load_soi_data(long_dfs, year, national_year: Optional[int] = None): |
645 | 852 | filer_strata["national"], |
646 | 853 | national_year, |
647 | 854 | ) |
| 855 | + load_national_fine_agi_targets(session, filer_strata["national"], national_year) |
648 | 856 |
|
| 857 | + load_state_fine_agi_targets(session, filer_strata, year) |
649 | 858 | session.commit() |
650 | 859 |
|
651 | 860 | # Load EITC data -------------------------------------------------------- |
@@ -1048,6 +1257,9 @@ def load_soi_data(long_dfs, year, national_year: Optional[int] = None): |
1048 | 1257 | geo_info = parse_ucgid(ucgid_i) |
1049 | 1258 | person_count = agi_df.iloc[i][["target_value"]].values[0] |
1050 | 1259 |
|
| 1260 | + if _skip_coarse_state_agi_person_count_target(geo_info["type"], agi_stub): |
| 1261 | + continue |
| 1262 | + |
1051 | 1263 | if geo_info["type"] == "state": |
1052 | 1264 | parent_stratum_id = filer_strata["state"][geo_info["state_fips"]] |
1053 | 1265 | note = f"State FIPS {geo_info['state_fips']} filers, AGI >= {agi_income_lower}, AGI < {agi_income_upper}" |
|
0 commit comments