Skip to content

Commit e163d24

Browse files
authored
Fix long-run support for household-derived weights
Fix long-run support generation and H5 materialization after policyengine-core began deriving all entity weights from household_weight. Rehydrates identity columns omitted from input exports, drops legacy person_weight outputs, updates policyengine-us to 1.696.0, and adds regression coverage.
1 parent 40f8c44 commit e163d24

6 files changed

Lines changed: 365 additions & 90 deletions

File tree

changelog.d/1032.fixed.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix long-run support augmentation and H5 materialization under household-derived microsimulation weights.

policyengine_us_data/datasets/cps/long_term/projection_utils.py

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,62 @@
88
from policyengine_core.data.dataset import Dataset
99

1010

11+
# Identifier variables that must be present as ``<name>__<period>`` columns on
# every row of the exported input dataframe. Any of these missing from the
# export is recomputed by ``ensure_person_level_identity_inputs``.
PERSON_LEVEL_IDENTITY_INPUTS = (
    "person_id",
    "household_id",
    "person_household_id",
    "family_id",
    "person_family_id",
    "tax_unit_id",
    "person_tax_unit_id",
    "spm_unit_id",
    "person_spm_unit_id",
    "marital_unit_id",
    "person_marital_unit_id",
)
24+
25+
26+
def _row_values(series):
    """Return unweighted row values from a MicroSeries-like object.

    The ``array`` attribute is preferred, then ``values``; an object exposing
    neither is passed to ``np.asarray`` directly.
    """
    for attribute in ("array", "values"):
        if hasattr(series, attribute):
            return np.asarray(getattr(series, attribute))
    return np.asarray(series)
33+
34+
35+
def _person_level_values(sim, variable, *, period):
    """Compute ``variable`` for ``period`` and return its raw row values.

    The calculation is first requested with ``map_to="person"``. If that call
    raises, the variable is recalculated without any mapping and those values
    are returned instead.
    """
    try:
        calculated = sim.calculate(variable, period=period, map_to="person")
    except Exception:
        # Deliberate best-effort fallback: retry without the person mapping.
        calculated = sim.calculate(variable, period=period)
    return _row_values(calculated)
41+
42+
43+
def ensure_person_level_identity_inputs(df, sim, *, base_period):
    """
    Rehydrate identity columns omitted by newer policyengine-core input exports.

    policyengine-core derives relationships and weights from household-level
    inputs, but the H5 materialization path still needs explicit person-row
    identity columns so ``Dataset.from_dataframe`` can rebuild a runtime
    dataset.

    Works on a copy of ``df``. For each name in
    ``PERSON_LEVEL_IDENTITY_INPUTS`` whose ``<name>__<base_period>`` column is
    absent, the values are computed from ``sim`` and added; columns that
    already exist are left untouched.

    Raises:
        ValueError: if a computed variable does not produce exactly one value
            per row of ``df``.
    """
    result = df.copy()
    expected_rows = len(result)
    for name in PERSON_LEVEL_IDENTITY_INPUTS:
        target = f"{name}__{base_period}"
        if target not in result.columns:
            computed = _person_level_values(sim, name, period=base_period)
            actual_rows = len(computed)
            if actual_rows != expected_rows:
                raise ValueError(
                    f"Expected {name} to map to {expected_rows} person rows; "
                    f"got {actual_rows}."
                )
            result[target] = computed
    return result
65+
66+
1167
def validate_projected_social_security_cap(
1268
parameter_accessor,
1369
year: int,
@@ -256,6 +312,7 @@ def create_household_year_h5(
256312
base_period = int(sim.default_calculation_period)
257313

258314
df = sim.to_input_dataframe()
315+
df = ensure_person_level_identity_inputs(df, sim, base_period=base_period)
259316

260317
# Remove pseudo-input variables (aggregates of calculated values)
261318
pseudo_inputs = get_pseudo_input_variables(sim)
@@ -271,9 +328,9 @@ def create_household_year_h5(
271328
person_household_id = df[f"person_household_id__{base_period}"]
272329

273330
hh_to_weight = dict(zip(household_ids, household_weights))
274-
person_weights = person_household_id.map(hh_to_weight)
331+
person_level_household_weights = person_household_id.map(hh_to_weight)
275332

276-
df[f"household_weight__{year}"] = person_weights
333+
df[f"household_weight__{year}"] = person_level_household_weights
277334
df.drop(
278335
columns=[
279336
f"household_weight__{base_period}",
@@ -370,6 +427,8 @@ def calculate_year_statistics(
370427
income_tax_values = income_tax_hh.values
371428

372429
household_microseries = sim.calculate("household_id", map_to="household")
430+
# Explicit weight access is reserved for the household-level calibration
431+
# decision vector; ordinary aggregates should use MicroSeries methods.
373432
baseline_weights_actual = household_microseries.weights.values
374433

375434
ss_values = None

policyengine_us_data/datasets/cps/long_term/prototype_synthetic_2100_support.py

Lines changed: 134 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,15 @@ def _period_column(name: str, base_year: int) -> str:
470470
return f"{name}__{base_year}"
471471

472472

473+
def _row_values(series: object) -> np.ndarray:
    """Return unweighted row values from a MicroSeries-like object.

    Looks for an ``array`` attribute first and a ``values`` attribute second;
    when neither exists the object itself is converted with ``np.asarray``.
    """
    absent = object()
    for attribute in ("array", "values"):
        payload = getattr(series, attribute, absent)
        if payload is not absent:
            return np.asarray(payload)
    return np.asarray(series)
480+
481+
473482
def classify_archetype(
474483
*,
475484
head_age: float,
@@ -520,37 +529,49 @@ def build_tax_unit_summary(
520529
reform: object | None = None,
521530
) -> pd.DataFrame:
522531
sim = Microsimulation(dataset=dataset, reform=reform)
523-
input_df = sim.to_input_dataframe()
532+
# policyengine-core derives all microsimulation weights from household_weight.
533+
# Build the person-row donor summary from that source of truth.
534+
household_weight_at_person_level = _row_values(
535+
sim.calculate("household_weight", period=period, map_to="person")
536+
).astype(float)
524537

525538
person_df = pd.DataFrame(
526539
{
527-
"tax_unit_id": sim.calculate("person_tax_unit_id", period=period).values,
528-
"household_id": sim.calculate("person_household_id", period=period).values,
529-
"age": sim.calculate("age", period=period).values,
530-
"is_head": sim.calculate("is_tax_unit_head", period=period).values,
531-
"is_spouse": sim.calculate("is_tax_unit_spouse", period=period).values,
532-
"is_dependent": sim.calculate(
533-
"is_tax_unit_dependent", period=period
534-
).values,
535-
"social_security": sim.calculate("social_security", period=period).values,
540+
"tax_unit_id": _row_values(
541+
sim.calculate("person_tax_unit_id", period=period)
542+
),
543+
"household_id": _row_values(
544+
sim.calculate("person_household_id", period=period)
545+
),
546+
"age": _row_values(sim.calculate("age", period=period)),
547+
"is_head": _row_values(sim.calculate("is_tax_unit_head", period=period)),
548+
"is_spouse": _row_values(
549+
sim.calculate("is_tax_unit_spouse", period=period)
550+
),
551+
"is_dependent": _row_values(
552+
sim.calculate("is_tax_unit_dependent", period=period)
553+
),
554+
"social_security": _row_values(
555+
sim.calculate("social_security", period=period)
556+
),
536557
"payroll": (
537-
sim.calculate(
538-
"taxable_earnings_for_social_security", period=period
539-
).values
540-
+ sim.calculate(
541-
"social_security_taxable_self_employment_income", period=period
542-
).values
558+
_row_values(
559+
sim.calculate("taxable_earnings_for_social_security", period=period)
560+
)
561+
+ _row_values(
562+
sim.calculate(
563+
"social_security_taxable_self_employment_income",
564+
period=period,
565+
)
566+
)
567+
),
568+
"dividend_income": _row_values(
569+
sim.calculate("qualified_dividend_income", period=period)
543570
),
544-
"dividend_income": sim.calculate(
545-
"qualified_dividend_income", period=period
546-
).values,
547-
"pension_income": sim.calculate(
548-
"taxable_pension_income", period=period
549-
).values,
550-
"person_weight": input_df[f"person_weight__{period}"].astype(float).values,
551-
"household_weight": input_df[f"household_weight__{period}"]
552-
.astype(float)
553-
.values,
571+
"pension_income": _row_values(
572+
sim.calculate("taxable_pension_income", period=period)
573+
),
574+
"household_weight": household_weight_at_person_level,
554575
}
555576
)
556577

@@ -594,7 +615,6 @@ def build_tax_unit_summary(
594615
"dividend_income": float(group["dividend_income"].sum()),
595616
"pension_income": float(group["pension_income"].sum()),
596617
"support_count_weight": 1.0,
597-
"person_weight_proxy": float(group["person_weight"].max()),
598618
"household_weight_proxy": float(group["household_weight"].max()),
599619
}
600620
row["archetype"] = classify_archetype(
@@ -646,11 +666,11 @@ def attach_person_uprating_factors(
646666
else np.zeros(len(df), dtype=float)
647667
)
648668
uprated_payroll = sum(
649-
sim.calculate(component, period=target_year).values.astype(float)
669+
_row_values(sim.calculate(component, period=target_year)).astype(float)
650670
for component in PAYROLL_COMPONENTS
651671
)
652672
uprated_ss = sum(
653-
sim.calculate(component, period=target_year).values.astype(float)
673+
_row_values(sim.calculate(component, period=target_year)).astype(float)
654674
for component in SS_COMPONENTS
655675
)
656676
df[PAYROLL_UPRATING_FACTOR_COLUMN] = np.where(
@@ -666,32 +686,80 @@ def attach_person_uprating_factors(
666686
return df
667687

668688

689+
def _person_level_values(
    sim: Microsimulation,
    variable: str,
    *,
    period: int,
) -> np.ndarray:
    """Compute ``variable`` for ``period`` and return its raw row values.

    The calculation is attempted with ``map_to="person"`` first; if that call
    raises, the variable is recalculated without a mapping and that result is
    used instead.
    """
    try:
        calculated = sim.calculate(variable, period=period, map_to="person")
    except Exception:
        # Deliberate best-effort fallback: retry without the person mapping.
        calculated = sim.calculate(variable, period=period)
    return _row_values(calculated)
700+
701+
702+
def ensure_person_level_core_inputs(
    input_df: pd.DataFrame,
    sim: Microsimulation,
    *,
    base_year: int,
) -> pd.DataFrame:
    """Fill aliases that newer policyengine-core omits from input exports.

    policyengine-core#497 made household_weight the source of truth for all
    microsimulation weights and stopped relying on redundant stored person
    weights. The support augmentation code still needs person-row IDs so it can
    clone donors and assign fresh entity identifiers.

    Operates on a copy of ``input_df``. The person ID column, every column
    listed in ``ENTITY_ID_COLUMNS``, and ``household_weight`` are computed from
    ``sim`` when their ``base_year`` column is missing. Any ``person_weight``
    column for ``base_year`` is removed from the result.

    Raises:
        ValueError: if a computed variable does not produce exactly one value
            per row of ``input_df``.
    """
    augmented = input_df.copy()
    expected_rows = len(augmented)

    needed = [PERSON_ID_COLUMN]
    for entity_columns in ENTITY_ID_COLUMNS.values():
        needed.extend(entity_columns)
    needed.append("household_weight")

    for name in needed:
        period_column = _period_column(name, base_year)
        if period_column not in augmented.columns:
            computed = _person_level_values(sim, name, period=base_year)
            actual_rows = len(computed)
            if actual_rows != expected_rows:
                raise ValueError(
                    f"Expected {name} to map to {expected_rows} person rows; "
                    f"got {actual_rows}."
                )
            augmented[period_column] = computed

    # Legacy stored person weights are no longer carried forward.
    return augmented.drop(
        columns=[_period_column("person_weight", base_year)],
        errors="ignore",
    )
739+
740+
669741
def load_base_aggregates(
670742
base_dataset: str,
671743
*,
672744
reform: object | None = None,
673745
) -> dict[str, float]:
674746
sim = Microsimulation(dataset=base_dataset, reform=reform)
675-
household_series = sim.calculate(
676-
"household_id", period=BASE_YEAR, map_to="household"
677-
)
678-
weights = household_series.weights.values.astype(float)
679-
ss = sim.calculate("social_security", period=BASE_YEAR, map_to="household").values
680-
payroll = (
681-
sim.calculate(
682-
"taxable_earnings_for_social_security",
683-
period=BASE_YEAR,
684-
map_to="household",
685-
).values
686-
+ sim.calculate(
687-
"social_security_taxable_self_employment_income",
688-
period=BASE_YEAR,
689-
map_to="household",
690-
).values
747+
ss = sim.calculate("social_security", period=BASE_YEAR, map_to="household")
748+
taxable_wages = sim.calculate(
749+
"taxable_earnings_for_social_security",
750+
period=BASE_YEAR,
751+
map_to="household",
752+
)
753+
taxable_self_employment = sim.calculate(
754+
"social_security_taxable_self_employment_income",
755+
period=BASE_YEAR,
756+
map_to="household",
691757
)
692758
return {
693-
"weighted_ss_total": float(np.sum(ss * weights)),
694-
"weighted_payroll_total": float(np.sum(payroll * weights)),
759+
"weighted_ss_total": float(ss.sum()),
760+
"weighted_payroll_total": float(
761+
taxable_wages.sum() + taxable_self_employment.sum()
762+
),
695763
}
696764

697765

@@ -1961,7 +2029,6 @@ def _clone_tax_unit_rows_to_target(
19612029
) -> tuple[pd.DataFrame, dict[str, int]] | tuple[None, dict[str, int]]:
19622030
age_col = _period_column("age", base_year)
19632031
household_weight_col = _period_column("household_weight", base_year)
1964-
person_weight_col = _period_column("person_weight", base_year)
19652032
person_id_col = _period_column(PERSON_ID_COLUMN, base_year)
19662033

19672034
adults = donor_rows[donor_rows[age_col] >= 18].sort_values(age_col, ascending=False)
@@ -1975,7 +2042,10 @@ def _clone_tax_unit_rows_to_target(
19752042
):
19762043
return None, id_counters
19772044

1978-
cloned = donor_rows.copy()
2045+
cloned = donor_rows.drop(
2046+
columns=[_period_column("person_weight", base_year)],
2047+
errors="ignore",
2048+
).copy()
19792049
household_id = id_counters["household"]
19802050
id_counters["household"] += 1
19812051
for entity_name, columns in ENTITY_ID_COLUMNS.items():
@@ -2003,13 +2073,6 @@ def _clone_tax_unit_rows_to_target(
20032073
* clone_weight_scale
20042074
/ max(clone_weight_divisor, 1)
20052075
)
2006-
if person_weight_col in cloned.columns:
2007-
cloned[person_weight_col] = (
2008-
cloned[person_weight_col].astype(float)
2009-
* clone_weight_scale
2010-
/ max(clone_weight_divisor, 1)
2011-
)
2012-
20132076
adult_indices = adults.index.tolist()
20142077
head_idx = adult_indices[0]
20152078
spouse_idx = adult_indices[1] if target_has_spouse else None
@@ -2115,7 +2178,6 @@ def _compose_role_donor_rows_to_target(
21152178
) -> tuple[pd.DataFrame, dict[str, int]] | tuple[None, dict[str, int]]:
21162179
age_col = _period_column("age", base_year)
21172180
household_weight_col = _period_column("household_weight", base_year)
2118-
person_weight_col = _period_column("person_weight", base_year)
21192181
person_id_col = _period_column(PERSON_ID_COLUMN, base_year)
21202182

21212183
def _adult_rows(df: pd.DataFrame | None) -> pd.DataFrame:
@@ -2248,6 +2310,11 @@ def _dependent_rows(df: pd.DataFrame | None) -> pd.DataFrame:
22482310
# Reset duplicate donor indices so later row-specific retargeting only touches
22492311
# the intended clone row.
22502312
cloned = pd.DataFrame(selected_rows).reset_index(drop=True).copy()
2313+
cloned.drop(
2314+
columns=[_period_column("person_weight", base_year)],
2315+
inplace=True,
2316+
errors="ignore",
2317+
)
22512318
cloned_sources = pd.Series(selected_sources, index=cloned.index)
22522319
household_id = id_counters["household"]
22532320
id_counters["household"] += 1
@@ -2276,13 +2343,6 @@ def _dependent_rows(df: pd.DataFrame | None) -> pd.DataFrame:
22762343
* clone_weight_scale
22772344
/ max(clone_weight_divisor, 1)
22782345
)
2279-
if person_weight_col in cloned.columns:
2280-
cloned[person_weight_col] = (
2281-
cloned[person_weight_col].astype(float)
2282-
* clone_weight_scale
2283-
/ max(clone_weight_divisor, 1)
2284-
)
2285-
22862346
head_idx = cloned.index[0]
22872347
spouse_idx = cloned.index[1] if target_candidate.spouse_age is not None else None
22882348
dependent_indices = (
@@ -2401,6 +2461,11 @@ def build_donor_backed_augmented_input_dataframe(
24012461
base_year=base_year,
24022462
target_year=target_year,
24032463
)
2464+
input_df = ensure_person_level_core_inputs(
2465+
input_df,
2466+
sim,
2467+
base_year=base_year,
2468+
)
24042469
actual_summary = build_actual_tax_unit_summary(base_dataset, reform=reform)
24052470
base_aggregates = load_base_aggregates(base_dataset, reform=reform)
24062471
ss_scale = load_ssa_benefit_projections(target_year) / max(
@@ -2552,6 +2617,11 @@ def build_role_composite_augmented_input_dataframe(
25522617
base_year=base_year,
25532618
target_year=target_year,
25542619
)
2620+
input_df = ensure_person_level_core_inputs(
2621+
input_df,
2622+
sim,
2623+
base_year=base_year,
2624+
)
25552625
actual_summary = build_actual_tax_unit_summary(base_dataset, reform=reform)
25562626
base_aggregates = load_base_aggregates(base_dataset, reform=reform)
25572627
ss_scale = load_ssa_benefit_projections(target_year) / max(

0 commit comments

Comments
 (0)