Skip to content

Commit 320dfc0

Browse files
baogorek and claude committed
Add tax_unit_itemizes constraint for itemized deduction targets
SOI targets for SALT, real estate taxes, and medical expense deduction are reported only for the ~10% of filers who itemize, but the existing `variable > 0` constraint captures everyone with economic exposure (~80-90% of filers). This mismatch causes massive count and dollar overestimates. Adding `tax_unit_itemizes == 1` fixes the population alignment.

Changes:

- etl_irs_soi.py: For salt, real_estate_taxes, and medical_expense_deduction, append a `tax_unit_itemizes == 1` constraint to child strata in the generic target loop.
- etl_national_targets.py: Split JCT itemized deduction targets (salt_deduction, medical_expense_deduction, charitable_deduction, interest_deduction) into a separate itemizer_targets list loaded into a new "United States - Itemizing Tax Filers" stratum with both filer and itemizer constraints. QBI deduction remains in the plain filer stratum (above-the-line).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4991b1e commit 320dfc0

2 files changed

Lines changed: 110 additions & 14 deletions

File tree

policyengine_us_data/db/etl_irs_soi.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
logger = logging.getLogger(__name__)
3131

32+
ITEMIZED_DEDUCTION_VARIABLES = {"salt", "real_estate_taxes", "medical_expense_deduction"}
3233

3334
# IRS SOI data is typically available ~2 years after the tax year
3435
IRS_SOI_LAG_YEARS = 2
@@ -661,7 +662,11 @@ def load_soi_data(long_dfs, year):
661662

662663
# Create child stratum with constraint for this IRS variable
663664
# Note: This stratum will have the constraint that amount_variable > 0
664-
note = f"{geo_description} filers with {amount_variable_name} > 0"
665+
is_itemized = amount_variable_name in ITEMIZED_DEDUCTION_VARIABLES
666+
if is_itemized:
667+
note = f"{geo_description} itemizing filers with {amount_variable_name} > 0"
668+
else:
669+
note = f"{geo_description} filers with {amount_variable_name} > 0"
665670

666671
# Check if child stratum already exists
667672
existing_stratum = (
@@ -698,6 +703,15 @@ def load_soi_data(long_dfs, year):
698703
]
699704
)
700705

706+
if is_itemized:
707+
child_stratum.constraints_rel.append(
708+
StratumConstraint(
709+
constraint_variable="tax_unit_itemizes",
710+
operation="==",
711+
value="1",
712+
)
713+
)
714+
701715
# Add geographic constraints if applicable
702716
if geo_info["type"] == "state":
703717
child_stratum.constraints_rel.append(

policyengine_us_data/db/etl_national_targets.py

Lines changed: 95 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,17 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET):
5757

5858
# Separate tax-related targets that need filer constraint
5959
tax_filer_targets = [
60+
{
61+
"variable": "qualified_business_income_deduction",
62+
"value": 63.1e9,
63+
"source": "Joint Committee on Taxation",
64+
"notes": "QBI deduction tax expenditure",
65+
"year": HARDCODED_YEAR,
66+
},
67+
]
68+
69+
# Itemized deduction targets need both filer and itemizer constraints
70+
itemizer_targets = [
6071
{
6172
"variable": "salt_deduction",
6273
"value": 21.247e9,
@@ -85,13 +96,6 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET):
8596
"notes": "Mortgage interest deduction tax expenditure",
8697
"year": HARDCODED_YEAR,
8798
},
88-
{
89-
"variable": "qualified_business_income_deduction",
90-
"value": 63.1e9,
91-
"source": "Joint Committee on Taxation",
92-
"notes": "QBI deduction tax expenditure",
93-
"year": HARDCODED_YEAR,
94-
},
9599
]
96100

97101
direct_sum_targets = [
@@ -394,6 +398,7 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET):
394398
return {
395399
"direct_sum_targets": direct_sum_targets,
396400
"tax_filer_targets": tax_filer_targets,
401+
"itemizer_targets": itemizer_targets,
397402
"conditional_count_targets": conditional_count_targets,
398403
"cbo_targets": cbo_targets,
399404
"treasury_targets": treasury_targets,
@@ -413,9 +418,10 @@ def transform_national_targets(raw_targets):
413418
Returns
414419
-------
415420
tuple
416-
(direct_targets_df, tax_filer_df, conditional_targets)
421+
(direct_targets_df, tax_filer_df, itemizer_df, conditional_targets)
417422
- direct_targets_df: DataFrame with direct sum targets
418423
- tax_filer_df: DataFrame with tax-related targets needing filer constraint
424+
- itemizer_df: DataFrame with itemized deduction targets needing filer + itemizer constraints
419425
- conditional_targets: List of conditional count targets
420426
"""
421427

@@ -444,14 +450,19 @@ def transform_national_targets(raw_targets):
444450
tax_filer_df = (
445451
pd.DataFrame(all_tax_filer_targets) if all_tax_filer_targets else pd.DataFrame()
446452
)
453+
itemizer_df = (
454+
pd.DataFrame(raw_targets["itemizer_targets"])
455+
if raw_targets["itemizer_targets"]
456+
else pd.DataFrame()
457+
)
447458

448459
# Conditional targets stay as list for special processing
449460
conditional_targets = raw_targets["conditional_count_targets"]
450461

451-
return direct_df, tax_filer_df, conditional_targets
462+
return direct_df, tax_filer_df, itemizer_df, conditional_targets
452463

453464

454-
def load_national_targets(direct_targets_df, tax_filer_df, conditional_targets):
465+
def load_national_targets(direct_targets_df, tax_filer_df, itemizer_df, conditional_targets):
455466
"""
456467
Load national targets into the database.
457468
@@ -461,6 +472,8 @@ def load_national_targets(direct_targets_df, tax_filer_df, conditional_targets):
461472
DataFrame with direct sum target data
462473
tax_filer_df : pd.DataFrame
463474
DataFrame with tax-related targets needing filer constraint
475+
itemizer_df : pd.DataFrame
476+
DataFrame with itemized deduction targets needing filer + itemizer constraints
464477
conditional_targets : list
465478
List of conditional count targets requiring strata
466479
"""
@@ -590,6 +603,74 @@ def load_national_targets(direct_targets_df, tax_filer_df, conditional_targets):
590603
session.add(target)
591604
print(f"Added filer target: {target_data['variable']}")
592605

606+
# Process itemized deduction targets that need filer + itemizer constraints
607+
if not itemizer_df.empty:
608+
national_itemizer_stratum = (
609+
session.query(Stratum)
610+
.filter(
611+
Stratum.parent_stratum_id == us_stratum.stratum_id,
612+
Stratum.notes == "United States - Itemizing Tax Filers",
613+
)
614+
.first()
615+
)
616+
617+
if not national_itemizer_stratum:
618+
national_itemizer_stratum = Stratum(
619+
parent_stratum_id=us_stratum.stratum_id,
620+
notes="United States - Itemizing Tax Filers",
621+
)
622+
national_itemizer_stratum.constraints_rel = [
623+
StratumConstraint(
624+
constraint_variable="tax_unit_is_filer",
625+
operation="==",
626+
value="1",
627+
),
628+
StratumConstraint(
629+
constraint_variable="tax_unit_itemizes",
630+
operation="==",
631+
value="1",
632+
),
633+
]
634+
session.add(national_itemizer_stratum)
635+
session.flush()
636+
print("Created national itemizer stratum")
637+
638+
for _, target_data in itemizer_df.iterrows():
639+
target_year = target_data["year"]
640+
existing_target = (
641+
session.query(Target)
642+
.filter(
643+
Target.stratum_id == national_itemizer_stratum.stratum_id,
644+
Target.variable == target_data["variable"],
645+
Target.period == target_year,
646+
)
647+
.first()
648+
)
649+
650+
notes_parts = []
651+
if pd.notna(target_data.get("notes")):
652+
notes_parts.append(target_data["notes"])
653+
notes_parts.append(f"Source: {target_data.get('source', 'Unknown')}")
654+
combined_notes = " | ".join(notes_parts)
655+
656+
if existing_target:
657+
existing_target.value = target_data["value"]
658+
existing_target.notes = combined_notes
659+
existing_target.source = "PolicyEngine"
660+
print(f"Updated itemizer target: {target_data['variable']}")
661+
else:
662+
target = Target(
663+
stratum_id=national_itemizer_stratum.stratum_id,
664+
variable=target_data["variable"],
665+
period=target_year,
666+
value=target_data["value"],
667+
active=True,
668+
source="PolicyEngine",
669+
notes=combined_notes,
670+
)
671+
session.add(target)
672+
print(f"Added itemizer target: {target_data['variable']}")
673+
593674
# Process conditional count targets (enrollment counts)
594675
for cond_target in conditional_targets:
595676
constraint_var = cond_target["constraint_variable"]
@@ -686,11 +767,12 @@ def load_national_targets(direct_targets_df, tax_filer_df, conditional_targets):
686767
session.commit()
687768

688769
total_targets = (
689-
len(direct_targets_df) + len(tax_filer_df) + len(conditional_targets)
770+
len(direct_targets_df) + len(tax_filer_df) + len(itemizer_df) + len(conditional_targets)
690771
)
691772
print(f"\nSuccessfully loaded {total_targets} national targets")
692773
print(f" - {len(direct_targets_df)} direct sum targets")
693774
print(f" - {len(tax_filer_df)} tax filer targets")
775+
print(f" - {len(itemizer_df)} itemizer targets")
694776
print(f" - {len(conditional_targets)} enrollment count targets (as strata)")
695777

696778

@@ -706,13 +788,13 @@ def main():
706788

707789
# Transform
708790
print("Transforming targets...")
709-
direct_targets_df, tax_filer_df, conditional_targets = transform_national_targets(
791+
direct_targets_df, tax_filer_df, itemizer_df, conditional_targets = transform_national_targets(
710792
raw_targets
711793
)
712794

713795
# Load
714796
print("Loading targets into database...")
715-
load_national_targets(direct_targets_df, tax_filer_df, conditional_targets)
797+
load_national_targets(direct_targets_df, tax_filer_df, itemizer_df, conditional_targets)
716798

717799
print("\nETL pipeline complete!")
718800

0 commit comments

Comments
 (0)