Skip to content
This repository was archived by the owner on Jun 14, 2026. It is now read-only.

Commit c14f8d4

Browse files
authored
Remove PUF support clone top-tail cap (#170)
1 parent 940016a commit c14f8d4

2 files changed

Lines changed: 45 additions & 547 deletions

File tree

src/microplex_us/pipelines/us.py

Lines changed: 3 additions & 297 deletions
Original file line number | Diff line number | Diff line change
@@ -217,6 +217,9 @@
217217
}
218218
)
219219

220+
# Refresh categorical/status fields against the PUF income surface, but never
221+
# overwrite amount fields here. PUF and CPS income amounts must come from donor
222+
# imputation/calibration, not from post-hoc bucket or nearest-neighbor surgery.
220223
PUF_SUPPORT_CLONE_CPS_REFRESH_VARIABLES: tuple[str, ...] = (
221224
"is_male",
222225
"cps_race",
@@ -231,105 +234,10 @@
231234
"difficulty_doing_errands",
232235
"difficulty_remembering_or_making_decisions",
233236
"meets_ssi_disability_criteria",
234-
"social_security_retirement",
235-
"social_security_disability",
236-
"social_security_survivors",
237-
"social_security_dependents",
238-
"disability_benefits",
239-
"workers_compensation",
240-
"unemployment_compensation",
241-
"child_support_received",
242-
"veterans_benefits",
243-
"educational_assistance",
244-
"financial_assistance",
245-
"survivor_benefits",
246-
"strike_benefits",
247237
"receives_wic",
248238
"receives_housing_assistance",
249-
"spm_unit_energy_subsidy",
250-
"spm_unit_pre_subsidy_childcare_expenses",
251-
"employer_sponsored_insurance_premiums",
252-
"health_insurance_premiums_without_medicare_part_b",
253-
"other_health_insurance_premiums",
254-
"over_the_counter_health_expenses",
255-
"other_medical_expenses",
256-
"child_support_expense",
257-
"weekly_hours_worked",
258-
"hours_worked",
259-
"hours_worked_last_week",
260-
"weekly_hours_worked_before_lsr",
261-
"weeks_worked",
262-
"hourly_wage",
263239
"is_paid_hourly",
264240
"is_union_member_or_covered",
265-
"employment_income_last_year",
266-
"self_employment_income_last_year",
267-
"taxable_401k_distributions",
268-
"tax_exempt_401k_distributions",
269-
"taxable_403b_distributions",
270-
"tax_exempt_403b_distributions",
271-
"keogh_distributions",
272-
"taxable_sep_distributions",
273-
"tax_exempt_sep_distributions",
274-
"traditional_401k_contributions_desired",
275-
"roth_401k_contributions_desired",
276-
"traditional_ira_contributions_desired",
277-
"roth_ira_contributions_desired",
278-
"self_employed_pension_contributions_desired",
279-
)
280-
281-
PUF_SUPPORT_CLONE_TOP_TAIL_ROUGH_AGI_CAP = 78_999_999.0
282-
PUF_SUPPORT_CLONE_TOP_TAIL_ROUGH_AGI_VARIABLES: tuple[str, ...] = (
283-
"employment_income",
284-
"employment_income_before_lsr",
285-
"tip_income",
286-
"fsla_overtime_premium",
287-
"self_employment_income",
288-
"self_employment_income_before_lsr",
289-
"taxable_interest_income",
290-
"tax_exempt_interest_income",
291-
"capital_gains",
292-
"long_term_capital_gains_before_response",
293-
"long_term_capital_gains",
294-
"short_term_capital_gains",
295-
"non_sch_d_capital_gains",
296-
"dividend_income",
297-
"ordinary_dividend_income",
298-
"qualified_dividend_income",
299-
"non_qualified_dividend_income",
300-
"partnership_s_corp_income",
301-
"rental_income",
302-
"farm_income",
303-
"farm_operations_income",
304-
"farm_rent_income",
305-
"ira_distributions",
306-
"taxable_pension_income",
307-
"taxable_private_pension_income",
308-
"taxable_ira_distributions",
309-
"taxable_401k_distributions",
310-
"taxable_403b_distributions",
311-
"taxable_sep_distributions",
312-
"total_pension_income",
313-
"taxable_social_security",
314-
"social_security",
315-
"social_security_retirement",
316-
"social_security_disability",
317-
"social_security_survivors",
318-
"social_security_dependents",
319-
)
320-
PUF_SUPPORT_CLONE_TOP_TAIL_SCALE_VARIABLES: tuple[str, ...] = (
321-
"capital_gains",
322-
"long_term_capital_gains_before_response",
323-
"long_term_capital_gains",
324-
"short_term_capital_gains",
325-
"non_sch_d_capital_gains",
326-
"partnership_s_corp_income",
327-
"dividend_income",
328-
"qualified_dividend_income",
329-
"non_qualified_dividend_income",
330-
"ordinary_dividend_income",
331-
"taxable_interest_income",
332-
"tax_exempt_interest_income",
333241
)
334242

335243
DEFAULT_ACA_TAKEUP_RATE = 0.672
@@ -2143,15 +2051,6 @@ class USMicroplexBuildConfig:
21432051
puf_support_clone_cps_refresh_condition_variables: tuple[str, ...] = (
21442052
PUF_SUPPORT_CLONE_CPS_REFRESH_CONDITION_VARIABLES
21452053
)
2146-
puf_support_clone_top_tail_rough_agi_cap: float | None = (
2147-
PUF_SUPPORT_CLONE_TOP_TAIL_ROUGH_AGI_CAP
2148-
)
2149-
puf_support_clone_top_tail_rough_agi_variables: tuple[str, ...] = (
2150-
PUF_SUPPORT_CLONE_TOP_TAIL_ROUGH_AGI_VARIABLES
2151-
)
2152-
puf_support_clone_top_tail_scale_variables: tuple[str, ...] = (
2153-
PUF_SUPPORT_CLONE_TOP_TAIL_SCALE_VARIABLES
2154-
)
21552054
dependent_tax_leaf_soft_cap_multiplier: float | None = None
21562055
dependent_tax_leaf_soft_cap_base_variables: tuple[str, ...] = (
21572056
"employment_income",
@@ -5791,194 +5690,6 @@ def _reconcile_puf_support_clone_social_security(
57915690
] = total.loc[fallback_mask & age.lt(62)]
57925691
return subcomponents
57935692

5794-
def _puf_support_clone_top_tail_rough_agi(
5795-
self,
5796-
clone: pd.DataFrame,
5797-
) -> tuple[pd.Series, list[str]]:
5798-
"""Compute a nonredundant rough AGI proxy for PUF clone top-tail checks."""
5799-
5800-
configured = set(self.config.puf_support_clone_top_tail_rough_agi_variables)
5801-
5802-
def numeric(variable: str) -> pd.Series:
5803-
return (
5804-
pd.to_numeric(clone[variable], errors="coerce")
5805-
.replace([np.inf, -np.inf], np.nan)
5806-
.fillna(0.0)
5807-
)
5808-
5809-
components: list[pd.Series] = []
5810-
variables: list[str] = []
5811-
5812-
def add(variable: str) -> bool:
5813-
if variable not in configured or variable not in clone.columns:
5814-
return False
5815-
components.append(numeric(variable))
5816-
variables.append(variable)
5817-
return True
5818-
5819-
def add_first(*variables: str) -> bool:
5820-
return any(add(variable) for variable in variables)
5821-
5822-
def add_all(*variables: str) -> bool:
5823-
added = False
5824-
for variable in variables:
5825-
added = add(variable) or added
5826-
return added
5827-
5828-
add_first("employment_income", "employment_income_before_lsr")
5829-
if "employment_income" not in variables:
5830-
add_all("tip_income", "fsla_overtime_premium")
5831-
5832-
add_first("self_employment_income", "self_employment_income_before_lsr")
5833-
5834-
for variable in (
5835-
"taxable_interest_income",
5836-
"tax_exempt_interest_income",
5837-
"partnership_s_corp_income",
5838-
"rental_income",
5839-
):
5840-
add(variable)
5841-
5842-
if not add("farm_income"):
5843-
add_all("farm_operations_income", "farm_rent_income")
5844-
5845-
added_capital_gain_components = False
5846-
if add("long_term_capital_gains_before_response"):
5847-
added_capital_gain_components = True
5848-
elif add("long_term_capital_gains"):
5849-
added_capital_gain_components = True
5850-
if add("short_term_capital_gains"):
5851-
added_capital_gain_components = True
5852-
if not added_capital_gain_components:
5853-
add("capital_gains")
5854-
add("non_sch_d_capital_gains")
5855-
5856-
if not add("dividend_income") and not add("ordinary_dividend_income"):
5857-
add("qualified_dividend_income")
5858-
add("non_qualified_dividend_income")
5859-
5860-
if not add("taxable_pension_income") and not add("total_pension_income"):
5861-
add_all(
5862-
"ira_distributions",
5863-
"taxable_private_pension_income",
5864-
"taxable_ira_distributions",
5865-
"taxable_401k_distributions",
5866-
"taxable_403b_distributions",
5867-
"taxable_sep_distributions",
5868-
)
5869-
if not add("taxable_social_security") and not add("social_security"):
5870-
add_all(
5871-
"social_security_retirement",
5872-
"social_security_disability",
5873-
"social_security_survivors",
5874-
"social_security_dependents",
5875-
)
5876-
5877-
if not components:
5878-
return pd.Series(0.0, index=clone.index, dtype=float), []
5879-
return sum(components), variables
5880-
5881-
def _apply_puf_support_clone_top_tail_guard(
5882-
self,
5883-
clone: pd.DataFrame,
5884-
*,
5885-
integrated_variables: Iterable[str],
5886-
) -> tuple[pd.DataFrame, dict[str, Any]]:
5887-
"""Avoid arbitrary state placement of unsupported PUF top-tail clones.
5888-
5889-
PUF has no state geography, so the CPS support clone inherits state from
5890-
its scaffold row. Until the top tail gets state-aware support records,
5891-
do not let a single imputed clone enter the open-ended SOI AGI count bin
5892-
and then receive a large calibrated state weight.
5893-
"""
5894-
5895-
cap = self.config.puf_support_clone_top_tail_rough_agi_cap
5896-
summary: dict[str, Any] = {
5897-
"enabled": cap is not None,
5898-
"cap": float(cap) if cap is not None else None,
5899-
"affected_rows": 0,
5900-
"rough_agi_variables": [],
5901-
"scaled_variables": [],
5902-
"scale_basis_variables": [],
5903-
"max_rough_agi_before": None,
5904-
"max_rough_agi_after": None,
5905-
}
5906-
if cap is None or cap <= 0.0 or clone.empty:
5907-
return clone, summary
5908-
5909-
rough_agi, rough_agi_variables = self._puf_support_clone_top_tail_rough_agi(
5910-
clone
5911-
)
5912-
if not rough_agi_variables:
5913-
return clone, summary
5914-
5915-
summary["rough_agi_variables"] = rough_agi_variables
5916-
summary["max_rough_agi_before"] = float(rough_agi.max())
5917-
5918-
over_cap = rough_agi > float(cap)
5919-
if not bool(over_cap.any()):
5920-
summary["max_rough_agi_after"] = summary["max_rough_agi_before"]
5921-
return clone, summary
5922-
5923-
integrated_set = set(integrated_variables)
5924-
5925-
def is_integrated_or_export_alias(variable: str) -> bool:
5926-
if variable in integrated_set:
5927-
return True
5928-
return (
5929-
variable == "long_term_capital_gains_before_response"
5930-
and "long_term_capital_gains" in integrated_set
5931-
)
5932-
5933-
scale_variables = [
5934-
variable
5935-
for variable in self.config.puf_support_clone_top_tail_scale_variables
5936-
if variable in clone.columns and is_integrated_or_export_alias(variable)
5937-
]
5938-
if not scale_variables:
5939-
summary["max_rough_agi_after"] = summary["max_rough_agi_before"]
5940-
return clone, summary
5941-
scale_basis_variables = [
5942-
variable for variable in scale_variables if variable in rough_agi_variables
5943-
]
5944-
if not scale_basis_variables:
5945-
summary["max_rough_agi_after"] = summary["max_rough_agi_before"]
5946-
return clone, summary
5947-
5948-
scale_frame = pd.DataFrame(
5949-
{
5950-
variable: pd.to_numeric(clone[variable], errors="coerce")
5951-
.replace([np.inf, -np.inf], np.nan)
5952-
.fillna(0.0)
5953-
.clip(lower=0.0)
5954-
for variable in scale_basis_variables
5955-
},
5956-
index=clone.index,
5957-
)
5958-
scalable = scale_frame.sum(axis=1)
5959-
nonscalable = rough_agi - scalable
5960-
desired_scalable = (float(cap) - nonscalable).clip(lower=0.0)
5961-
eligible = over_cap & scalable.gt(0.0)
5962-
if not bool(eligible.any()):
5963-
summary["max_rough_agi_after"] = summary["max_rough_agi_before"]
5964-
return clone, summary
5965-
5966-
scale = (desired_scalable[eligible] / scalable[eligible]).clip(
5967-
lower=0.0,
5968-
upper=1.0,
5969-
)
5970-
guarded = clone.copy()
5971-
for variable in scale_variables:
5972-
values = pd.to_numeric(guarded.loc[eligible, variable], errors="coerce")
5973-
guarded.loc[eligible, variable] = values.fillna(0.0).clip(lower=0.0) * scale
5974-
5975-
guarded_rough_agi, _ = self._puf_support_clone_top_tail_rough_agi(guarded)
5976-
summary["affected_rows"] = int(eligible.sum())
5977-
summary["scaled_variables"] = scale_variables
5978-
summary["scale_basis_variables"] = scale_basis_variables
5979-
summary["max_rough_agi_after"] = float(guarded_rough_agi.max())
5980-
return guarded, summary
5981-
59825693
def _finalize_puf_support_clone_frame(
59835694
self,
59845695
*,
@@ -6013,10 +5724,6 @@ def _finalize_puf_support_clone_frame(
60135724
integrated_variables=integrated_variables,
60145725
preclone_columns=preclone_columns,
60155726
)
6016-
clone, top_tail_guard_summary = self._apply_puf_support_clone_top_tail_guard(
6017-
clone,
6018-
integrated_variables=integrated_variables,
6019-
)
60205727

60215728
generated_entity_id_columns = sorted(
60225729
set(ENTITY_ID_COLUMNS.values()) & (set(clone.columns) - preclone_columns)
@@ -6075,7 +5782,6 @@ def _finalize_puf_support_clone_frame(
60755782
"donor_only_variables": donor_only_variables,
60765783
"both_halves_override_variables": sorted(both_halves_override),
60775784
"cps_only_refresh": cps_refresh_summary,
6078-
"top_tail_guard": top_tail_guard_summary,
60795785
"dropped_generated_entity_id_columns": generated_entity_id_columns,
60805786
"variable_surface": {
60815787
"ecps_imputed_variables": list(PUF_SUPPORT_CLONE_IMPUTED_VARIABLES),

0 commit comments

Comments
 (0)