Skip to content
This repository was archived by the owner on Jun 14, 2026. It is now read-only.

Commit 14c114a

Browse files
authored
Coalesce sparse PE input aliases by row
Coalesce sparse PolicyEngine (PE) income/input aliases row by row: when a canonical export column is zero for a given row but a source alias carries a value, the alias value is used for that row. Adds regression tests for PE input augmentation and dividend normalization.
1 parent c14f8d4 commit 14c114a

4 files changed

Lines changed: 117 additions & 18 deletions

File tree

src/microplex_us/pipelines/us.py

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10151,6 +10151,27 @@ def first_present(*columns: str) -> pd.Series:
1015110151
)
1015210152
return zero.copy()
1015310153

10154+
def first_nonzero_or_present(*columns: str) -> pd.Series:
    """Coalesce alias columns row by row, preferring the earliest non-zero value.

    Every name in ``columns`` that exists in ``result`` is coerced to float,
    with unparseable or missing entries turned into 0.0. The first such
    column seeds the output; each later column only fills rows that are
    still exactly 0.0. Negative values count as non-zero and are kept.

    Returns a copy of ``zero`` when none of the named columns is present.
    """
    available = [name for name in columns if name in result.columns]
    if not available:
        return zero.copy()

    coalesced = None
    for name in available:
        numeric = (
            pd.to_numeric(result[name], errors="coerce")
            .fillna(0.0)
            .astype(float)
        )
        if coalesced is None:
            # First present column: take it wholesale as the starting point.
            coalesced = numeric.copy()
        else:
            # Keep rows already non-zero; fill the rest from this alias.
            coalesced = coalesced.where(coalesced.ne(0.0), numeric)
    return coalesced
10174+
1015410175
def has_any(*columns: str) -> bool:
    """Report whether ``result`` contains at least one of the named columns."""
    for name in columns:
        if name in result.columns:
            return True
    return False
1015610177

@@ -10272,14 +10293,17 @@ def has_any(*columns: str) -> bool:
1027210293
result["takes_up_ssi_if_eligible"] = first_present("ssi").gt(0.0)
1027310294

1027410295
known_nonemployment = (
10275-
first_present("self_employment_income")
10276-
+ first_present("taxable_interest_income", "interest_income")
10277-
+ first_present("ordinary_dividend_income", "dividend_income")
10296+
first_nonzero_or_present(
10297+
"self_employment_income_before_lsr",
10298+
"self_employment_income",
10299+
)
10300+
+ first_nonzero_or_present("taxable_interest_income", "interest_income")
10301+
+ first_nonzero_or_present("ordinary_dividend_income", "dividend_income")
1027810302
+ first_present("rental_income")
1027910303
+ first_present("gross_social_security", "social_security")
1028010304
+ first_present("ssi")
1028110305
+ first_present("public_assistance")
10282-
+ first_present("taxable_pension_income", "pension_income")
10306+
+ first_nonzero_or_present("taxable_pension_income", "pension_income")
1028310307
+ first_present("unemployment_compensation")
1028410308
)
1028510309
fallback_employment_income = (
@@ -10290,19 +10314,19 @@ def has_any(*columns: str) -> bool:
1029010314
).clip(lower=0.0)
1029110315

1029210316
result["employment_income_before_lsr"] = (
10293-
first_present(
10317+
first_nonzero_or_present(
1029410318
"employment_income_before_lsr", "employment_income", "wage_income"
1029510319
)
1029610320
if has_any(
1029710321
"employment_income_before_lsr", "employment_income", "wage_income"
1029810322
)
1029910323
else fallback_employment_income
1030010324
)
10301-
result["self_employment_income_before_lsr"] = first_present(
10325+
result["self_employment_income_before_lsr"] = first_nonzero_or_present(
1030210326
"self_employment_income_before_lsr",
1030310327
"self_employment_income",
1030410328
)
10305-
result["taxable_interest_income"] = first_present(
10329+
result["taxable_interest_income"] = first_nonzero_or_present(
1030610330
"taxable_interest_income",
1030710331
"interest_income",
1030810332
)
@@ -10315,17 +10339,21 @@ def has_any(*columns: str) -> bool:
1031510339
result["non_qualified_dividend_income"] = first_present(
1031610340
"non_qualified_dividend_income",
1031710341
).clip(lower=0.0)
10318-
result["ordinary_dividend_income"] = first_present(
10342+
dividend_alias = first_nonzero_or_present(
1031910343
"ordinary_dividend_income",
1032010344
"dividend_income",
1032110345
).clip(lower=0.0)
10346+
result["ordinary_dividend_income"] = dividend_alias
1032210347
if has_any("qualified_dividend_income", "non_qualified_dividend_income"):
1032310348
dividend_total = (
1032410349
result["qualified_dividend_income"]
1032510350
+ result["non_qualified_dividend_income"]
1032610351
).clip(lower=0.0)
10327-
result["ordinary_dividend_income"] = dividend_total
10328-
result["dividend_income"] = dividend_total
10352+
result["ordinary_dividend_income"] = dividend_total.where(
10353+
dividend_total.ne(0.0),
10354+
dividend_alias,
10355+
)
10356+
result["dividend_income"] = result["ordinary_dividend_income"]
1032910357
else:
1033010358
result = normalize_dividend_columns(result)
1033110359

@@ -10335,15 +10363,17 @@ def has_any(*columns: str) -> bool:
1033510363
"capital_gains_distributions",
1033610364
)
1033710365
result["long_term_capital_gains_before_response"] = (
10338-
first_present(
10366+
first_nonzero_or_present(
1033910367
"long_term_capital_gains_before_response",
1034010368
"long_term_capital_gains",
10369+
"capital_gains",
1034110370
)
1034210371
if has_any(
1034310372
"long_term_capital_gains_before_response",
1034410373
"long_term_capital_gains",
10374+
"capital_gains",
1034510375
)
10346-
else first_present("capital_gains")
10376+
else zero.copy()
1034710377
)
1034810378
result["partnership_s_corp_income"] = first_present("partnership_s_corp_income")
1034910379
result["partnership_se_income"] = first_present("partnership_se_income")

src/microplex_us/variables.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -666,17 +666,22 @@ def normalize_dividend_columns(frame: pd.DataFrame) -> pd.DataFrame:
666666
result = frame.copy()
667667
qualified = _nonnegative_series(result, "qualified_dividend_income")
668668
non_qualified = _nonnegative_series(result, "non_qualified_dividend_income")
669-
total = (
670-
_nonnegative_series(result, "ordinary_dividend_income")
671-
if "ordinary_dividend_income" in result.columns
672-
else _nonnegative_series(result, "dividend_income")
673-
)
669+
ordinary_total = _nonnegative_series(result, "ordinary_dividend_income")
670+
dividend_total = _nonnegative_series(result, "dividend_income")
671+
if "ordinary_dividend_income" in result.columns:
672+
total = ordinary_total.where(ordinary_total.ne(0.0), dividend_total)
673+
else:
674+
total = dividend_total
674675

675676
has_qualified = "qualified_dividend_income" in result.columns
676677
has_non_qualified = "non_qualified_dividend_income" in result.columns
677678

678679
if has_qualified and has_non_qualified:
679-
normalized_total = qualified + non_qualified
680+
component_total = qualified + non_qualified
681+
total_only = component_total.eq(0.0) & total.gt(0.0)
682+
non_qualified = non_qualified.where(~total_only, total)
683+
component_total = qualified + non_qualified
684+
normalized_total = component_total.where(component_total.ne(0.0), total)
680685
elif has_qualified:
681686
normalized_total = np.maximum(total.to_numpy(dtype=float), qualified.to_numpy(dtype=float))
682687
non_qualified = pd.Series(

tests/pipelines/test_us.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4919,6 +4919,52 @@ def test_augment_policyengine_person_inputs_materializes_agi_parity_inputs(self)
49194919
assert augmented["self_employed_health_insurance_ald"].tolist() == [15.0]
49204920
assert augmented["self_employed_pension_contribution_ald"].tolist() == [10.0]
49214921

4922+
def test_augment_policyengine_person_inputs_coalesces_sparse_source_aliases_by_row(
    self,
):
    """Check that sparse canonical inputs are filled from aliases per row.

    Row 0 has zero canonical values with non-zero aliases, so the aliases
    are expected to win. Row 1 has non-zero canonical values, which are
    expected to be kept over the 999.0 sentinel aliases. Row 2 carries
    negative canonical values, which are expected to survive unchanged.
    """
    pipeline = USMicroplexPipeline(USMicroplexBuildConfig())
    source_columns = {
        "age": [45, 50, 55],
        "sex": [1, 2, 1],
        "income": [60_000.0, 75_000.0, 0.0],
        "employment_income_before_lsr": [0.0, 70_000.0, 0.0],
        "wage_income": [50_000.0, 80_000.0, 0.0],
        "self_employment_income_before_lsr": [0.0, 200.0, -300.0],
        "self_employment_income": [500.0, 999.0, 50.0],
        "taxable_interest_income": [0.0, 20.0, 0.0],
        "interest_income": [100.0, 999.0, 0.0],
        "ordinary_dividend_income": [0.0, 30.0, 0.0],
        "dividend_income": [80.0, 999.0, 0.0],
        "qualified_dividend_income": [0.0, 5.0, 0.0],
        "non_qualified_dividend_income": [0.0, 25.0, 0.0],
        "long_term_capital_gains_before_response": [0.0, 60.0, -10.0],
        "long_term_capital_gains": [40.0, 999.0, 0.0],
        "capital_gains": [999.0, 999.0, 25.0],
    }
    persons = pd.DataFrame(source_columns)

    augmented = pipeline._augment_policyengine_person_inputs(persons)

    # Expected output per column, checked in the same order as before.
    expected_by_column = {
        "employment_income_before_lsr": [50_000.0, 70_000.0, 0.0],
        "self_employment_income_before_lsr": [500.0, 200.0, -300.0],
        "taxable_interest_income": [100.0, 20.0, 0.0],
        "ordinary_dividend_income": [80.0, 30.0, 0.0],
        "dividend_income": [80.0, 30.0, 0.0],
        "long_term_capital_gains_before_response": [40.0, 60.0, -10.0],
    }
    for column, expected in expected_by_column.items():
        assert augmented[column].tolist() == expected
4967+
49224968
def test_augment_policyengine_person_inputs_derives_marital_status_flags_from_cps_codes(
49234969
self,
49244970
):

tests/test_variables.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,24 @@ def test_normalize_dividend_columns_prefers_atomic_components_over_totals():
4545
assert normalized["dividend_income"].tolist() == [42.0]
4646

4747

48+
def test_normalize_dividend_columns_coalesces_sparse_total_aliases_by_row():
    """Check row-wise coalescing of dividend totals and components.

    Row 0 only has a ``dividend_income`` total (80.0); it is expected to
    become both the ordinary total and the non-qualified component. Row 1
    has components summing to 30.0, which are expected to take precedence
    over the 999.0 alias. Row 2 is all zeros and is expected to stay so.
    """
    inputs = {
        "ordinary_dividend_income": [0.0, 30.0, 0.0],
        "dividend_income": [80.0, 999.0, 0.0],
        "qualified_dividend_income": [0.0, 5.0, 0.0],
        "non_qualified_dividend_income": [0.0, 25.0, 0.0],
    }

    normalized = normalize_dividend_columns(pd.DataFrame(inputs))

    expected_by_column = {
        "qualified_dividend_income": [0.0, 5.0, 0.0],
        "non_qualified_dividend_income": [80.0, 25.0, 0.0],
        "ordinary_dividend_income": [80.0, 30.0, 0.0],
        "dividend_income": [80.0, 30.0, 0.0],
    }
    for column, expected in expected_by_column.items():
        assert normalized[column].tolist() == expected
64+
65+
4866
def test_normalize_social_security_columns_tracks_unclassified_residual():
4967
frame = pd.DataFrame(
5068
{

0 commit comments

Comments
 (0)