PolicyEngine
diff --git a/changelog.d/1032.fixed.md
Lines changed: 1 addition & 0 deletions b/changelog.d/1032.fixed.md
Lines changed: 1 addition & 0 deletions
diff --git a/policyengine_us_data/datasets/cps/long_term/projection_utils.py
Lines changed: 61 additions & 2 deletions b/policyengine_us_data/datasets/cps/long_term/projection_utils.py
Lines changed: 61 additions & 2 deletions
diff --git a/policyengine_us_data/datasets/cps/long_term/prototype_synthetic_2100_support.py
Lines changed: 134 additions & 64 deletions b/policyengine_us_data/datasets/cps/long_term/prototype_synthetic_2100_support.py
Lines changed: 134 additions & 64 deletions
@@ -0,0 +1 @@
+Fix long-run support augmentation and H5 materialization under household-derived microsimulation weights.
@@ -8,6 +8,62 @@
 from policyengine_core.data.dataset import Dataset
 
 
+# Identity variables that ``ensure_person_level_identity_inputs`` backfills as
+# ``<variable>__<base_period>`` columns when an input dataframe lacks them.
+PERSON_LEVEL_IDENTITY_INPUTS = (
+    "person_id",
+    "household_id",
+    "person_household_id",
+    "family_id",
+    "person_family_id",
+    "tax_unit_id",
+    "person_tax_unit_id",
+    "spm_unit_id",
+    "person_spm_unit_id",
+    "marital_unit_id",
+    "person_marital_unit_id",
+)
+
+
+def _row_values(series):
+    """Return unweighted row values from a MicroSeries-like object.
+
+    Reads ``series.array`` when that attribute exists, otherwise
+    ``series.values``, otherwise ``series`` itself. Whichever source is used
+    is passed through ``np.asarray`` before being returned.
+    """
+    if hasattr(series, "array"):
+        return np.asarray(series.array)
+    if hasattr(series, "values"):
+        return np.asarray(series.values)
+    return np.asarray(series)
+
+
+def _person_level_values(sim, variable, *, period):
+    """Return row values of ``variable`` for ``period``, mapped to persons if possible.
+
+    The variable is first calculated with ``map_to="person"``. If that call
+    raises, it is calculated again without ``map_to`` and that result is
+    returned instead, so the output is not guaranteed to have one value per
+    person; callers are expected to check the length themselves.
+    """
+    try:
+        series = sim.calculate(variable, period=period, map_to="person")
+    except Exception:
+        # NOTE(review): deliberately broad best-effort fallback. It also hides
+        # unrelated failures from the first attempt; confirm which exception
+        # the ``map_to`` call actually raises and narrow this clause.
+        series = sim.calculate(variable, period=period)
+    return _row_values(series)
+
+
+def ensure_person_level_identity_inputs(df, sim, *, base_period):
+    """
+    Rehydrate identity columns omitted by newer policyengine-core input exports.
+
+    policyengine-core derives relationships and weights from household-level
+    inputs. The H5 materialization path still needs explicit person-row identity
+    columns so ``Dataset.from_dataframe`` can rebuild a runtime dataset.
+
+    Args:
+        df: Input dataframe. Its length is taken as the expected number of
+            person rows. It is copied, never mutated.
+        sim: Simulation used to calculate any identity variable whose column
+            is missing from ``df``.
+        base_period: Period used both in the ``<variable>__<base_period>``
+            column names and for the calculations.
+
+    Returns:
+        A copy of ``df`` in which every ``PERSON_LEVEL_IDENTITY_INPUTS``
+        column is present. Columns that already existed are left untouched.
+
+    Raises:
+        ValueError: If a calculated variable does not yield exactly one value
+            per row of ``df``.
+    """
+    output = df.copy()
+    person_row_count = len(output)
+    for variable in PERSON_LEVEL_IDENTITY_INPUTS:
+        column = f"{variable}__{base_period}"
+        # Existing columns win; only missing ones are recomputed.
+        if column in output.columns:
+            continue
+        values = _person_level_values(sim, variable, period=base_period)
+        # The helper may fall back to an unmapped calculation, so verify the
+        # result really lines up with the person rows before assigning it.
+        if len(values) != person_row_count:
+            raise ValueError(
+                f"Expected {variable} to map to {person_row_count} person rows; "
+                f"got {len(values)}."
+            )
+        output[column] = values
+    return output
+
+
 def validate_projected_social_security_cap(
     parameter_accessor,
     year: int,
@@ -256,6 +312,7 @@ def create_household_year_h5(
     base_period = int(sim.default_calculation_period)
 
     df = sim.to_input_dataframe()
+    df = ensure_person_level_identity_inputs(df, sim, base_period=base_period)
 
     # Remove pseudo-input variables (aggregates of calculated values)
     pseudo_inputs = get_pseudo_input_variables(sim)
@@ -271,9 +328,9 @@ def create_household_year_h5(
     person_household_id = df[f"person_household_id__{base_period}"]
 
     hh_to_weight = dict(zip(household_ids, household_weights))
-    person_weights = person_household_id.map(hh_to_weight)
+    person_level_household_weights = person_household_id.map(hh_to_weight)
 
-    df[f"household_weight__{year}"] = person_weights
+    df[f"household_weight__{year}"] = person_level_household_weights
     df.drop(
         columns=[
             f"household_weight__{base_period}",
@@ -370,6 +427,8 @@ def calculate_year_statistics(
     income_tax_values = income_tax_hh.values
 
     household_microseries = sim.calculate("household_id", map_to="household")
+    # Explicit weight access is reserved for the household-level calibration
+    # decision vector; ordinary aggregates should use MicroSeries methods.
     baseline_weights_actual = household_microseries.weights.values
 
     ss_values = None
 
@@ -470,6 +470,15 @@ def _period_column(name: str, base_year: int) -> str:
     return f"{name}__{base_year}"
 
 
+def _row_values(series: object) -> np.ndarray:
+    """Return unweighted row values from a MicroSeries-like object.
+
+    Reads ``series.array`` when that attribute exists, otherwise
+    ``series.values``, otherwise ``series`` itself. Whichever source is used
+    is passed through ``np.asarray`` before being returned.
+    """
+    if hasattr(series, "array"):
+        return np.asarray(series.array)
+    if hasattr(series, "values"):
+        return np.asarray(series.values)
+    return np.asarray(series)
+
+
 def classify_archetype(
     *,
     head_age: float,
@@ -520,37 +529,49 @@ def build_tax_unit_summary(
     reform: object | None = None,
 ) -> pd.DataFrame:
     sim = Microsimulation(dataset=dataset, reform=reform)
-    input_df = sim.to_input_dataframe()
+    # policyengine-core derives all microsimulation weights from household_weight.
+    # Build the person-row donor summary from that source of truth.
+    household_weight_at_person_level = _row_values(
+        sim.calculate("household_weight", period=period, map_to="person")
+    ).astype(float)
 
     person_df = pd.DataFrame(
         {
-            "tax_unit_id": sim.calculate("person_tax_unit_id", period=period).values,
-            "household_id": sim.calculate("person_household_id", period=period).values,
-            "age": sim.calculate("age", period=period).values,
-            "is_head": sim.calculate("is_tax_unit_head", period=period).values,
-            "is_spouse": sim.calculate("is_tax_unit_spouse", period=period).values,
-            "is_dependent": sim.calculate(
-                "is_tax_unit_dependent", period=period
-            ).values,
-            "social_security": sim.calculate("social_security", period=period).values,
+            "tax_unit_id": _row_values(
+                sim.calculate("person_tax_unit_id", period=period)
+            ),
+            "household_id": _row_values(
+                sim.calculate("person_household_id", period=period)
+            ),
+            "age": _row_values(sim.calculate("age", period=period)),
+            "is_head": _row_values(sim.calculate("is_tax_unit_head", period=period)),
+            "is_spouse": _row_values(
+                sim.calculate("is_tax_unit_spouse", period=period)
+            ),
+            "is_dependent": _row_values(
+                sim.calculate("is_tax_unit_dependent", period=period)
+            ),
+            "social_security": _row_values(
+                sim.calculate("social_security", period=period)
+            ),
             "payroll": (
-                sim.calculate(
-                    "taxable_earnings_for_social_security", period=period
-                ).values
-                + sim.calculate(
-                    "social_security_taxable_self_employment_income", period=period
-                ).values
+                _row_values(
+                    sim.calculate("taxable_earnings_for_social_security", period=period)
+                )
+                + _row_values(
+                    sim.calculate(
+                        "social_security_taxable_self_employment_income",
+                        period=period,
+                    )
+                )
+            ),
+            "dividend_income": _row_values(
+                sim.calculate("qualified_dividend_income", period=period)
             ),
-            "dividend_income": sim.calculate(
-                "qualified_dividend_income", period=period
-            ).values,
-            "pension_income": sim.calculate(
-                "taxable_pension_income", period=period
-            ).values,
-            "person_weight": input_df[f"person_weight__{period}"].astype(float).values,
-            "household_weight": input_df[f"household_weight__{period}"]
-            .astype(float)
-            .values,
+            "pension_income": _row_values(
+                sim.calculate("taxable_pension_income", period=period)
+            ),
+            "household_weight": household_weight_at_person_level,
         }
     )
 
@@ -594,7 +615,6 @@ def build_tax_unit_summary(
             "dividend_income": float(group["dividend_income"].sum()),
             "pension_income": float(group["pension_income"].sum()),
             "support_count_weight": 1.0,
-            "person_weight_proxy": float(group["person_weight"].max()),
             "household_weight_proxy": float(group["household_weight"].max()),
         }
         row["archetype"] = classify_archetype(
@@ -646,11 +666,11 @@ def attach_person_uprating_factors(
         else np.zeros(len(df), dtype=float)
     )
     uprated_payroll = sum(
-        sim.calculate(component, period=target_year).values.astype(float)
+        _row_values(sim.calculate(component, period=target_year)).astype(float)
         for component in PAYROLL_COMPONENTS
     )
     uprated_ss = sum(
-        sim.calculate(component, period=target_year).values.astype(float)
+        _row_values(sim.calculate(component, period=target_year)).astype(float)
         for component in SS_COMPONENTS
     )
     df[PAYROLL_UPRATING_FACTOR_COLUMN] = np.where(
@@ -666,32 +686,80 @@ def attach_person_uprating_factors(
     return df
 
 
+def _person_level_values(
+    sim: Microsimulation,
+    variable: str,
+    *,
+    period: int,
+) -> np.ndarray:
+    """Return row values of ``variable`` for ``period``, mapped to persons if possible.
+
+    The variable is first calculated with ``map_to="person"``. If that call
+    raises, it is calculated again without ``map_to`` and that result is
+    returned instead, so the output is not guaranteed to have one value per
+    person; callers are expected to check the length themselves.
+    """
+    try:
+        series = sim.calculate(variable, period=period, map_to="person")
+    except Exception:
+        # NOTE(review): deliberately broad best-effort fallback. It also hides
+        # unrelated failures from the first attempt; confirm which exception
+        # the ``map_to`` call actually raises and narrow this clause.
+        series = sim.calculate(variable, period=period)
+    return _row_values(series)
+
+
+def ensure_person_level_core_inputs(
+    input_df: pd.DataFrame,
+    sim: Microsimulation,
+    *,
+    base_year: int,
+) -> pd.DataFrame:
+    """Fill aliases that newer policyengine-core omits from input exports.
+
+    policyengine-core#497 made household_weight the source of truth for all
+    microsimulation weights and stopped relying on redundant stored person
+    weights. The support augmentation code still needs person-row IDs so it can
+    clone donors and assign fresh entity identifiers.
+
+    Args:
+        input_df: Input dataframe. Its length is taken as the expected number
+            of person rows. It is copied, never mutated.
+        sim: Simulation used to calculate any required variable whose column
+            is missing from ``input_df``.
+        base_year: Year used both in the period-suffixed column names and for
+            the calculations.
+
+    Returns:
+        A copy of ``input_df`` that contains the person ID column, every
+        column listed in ``ENTITY_ID_COLUMNS`` and ``household_weight`` for
+        ``base_year``, and no ``person_weight`` column for ``base_year``.
+
+    Raises:
+        ValueError: If a calculated variable does not yield exactly one value
+            per row of ``input_df``.
+    """
+    df = input_df.copy()
+    person_row_count = len(df)
+    required_person_level_inputs = [
+        PERSON_ID_COLUMN,
+        *(column for columns in ENTITY_ID_COLUMNS.values() for column in columns),
+        "household_weight",
+    ]
+    for variable in required_person_level_inputs:
+        column = _period_column(variable, base_year)
+        # Existing columns win; only missing ones are recomputed.
+        if column in df.columns:
+            continue
+        values = _person_level_values(sim, variable, period=base_year)
+        # The helper may fall back to an unmapped calculation, so verify the
+        # result really lines up with the person rows before assigning it.
+        if len(values) != person_row_count:
+            raise ValueError(
+                f"Expected {variable} to map to {person_row_count} person rows; "
+                f"got {len(values)}."
+            )
+        df[column] = values
+    # Remove any stored person weight so household_weight is the only weight
+    # column carried forward; ``errors="ignore"`` makes a missing column a no-op.
+    df.drop(
+        columns=[_period_column("person_weight", base_year)],
+        inplace=True,
+        errors="ignore",
+    )
+    return df
+
+
 def load_base_aggregates(
     base_dataset: str,
     *,
     reform: object | None = None,
 ) -> dict[str, float]:
+    """Return base-year Social Security and payroll totals for ``base_dataset``.
+
+    Builds a ``Microsimulation`` from ``base_dataset`` and ``reform``, then
+    calculates each component for ``BASE_YEAR`` mapped to households and sums
+    it with the returned series' own ``.sum()``.
+
+    NOTE(review): the totals are only weighted if ``.sum()`` on the returned
+    series applies the household weights (presumably a MicroSeries); confirm.
+
+    Returns:
+        A dict with ``"weighted_ss_total"`` (sum of ``social_security``) and
+        ``"weighted_payroll_total"`` (sum of taxable earnings plus sum of
+        taxable self-employment income), both as ``float``.
+    """
     sim = Microsimulation(dataset=base_dataset, reform=reform)
-    household_series = sim.calculate(
-        "household_id", period=BASE_YEAR, map_to="household"
-    )
-    weights = household_series.weights.values.astype(float)
-    ss = sim.calculate("social_security", period=BASE_YEAR, map_to="household").values
-    payroll = (
-        sim.calculate(
-            "taxable_earnings_for_social_security",
-            period=BASE_YEAR,
-            map_to="household",
-        ).values
-        + sim.calculate(
-            "social_security_taxable_self_employment_income",
-            period=BASE_YEAR,
-            map_to="household",
-        ).values
+    ss = sim.calculate("social_security", period=BASE_YEAR, map_to="household")
+    taxable_wages = sim.calculate(
+        "taxable_earnings_for_social_security",
+        period=BASE_YEAR,
+        map_to="household",
+    )
+    taxable_self_employment = sim.calculate(
+        "social_security_taxable_self_employment_income",
+        period=BASE_YEAR,
+        map_to="household",
     )
     return {
-        "weighted_ss_total": float(np.sum(ss * weights)),
-        "weighted_payroll_total": float(np.sum(payroll * weights)),
+        "weighted_ss_total": float(ss.sum()),
+        "weighted_payroll_total": float(
+            taxable_wages.sum() + taxable_self_employment.sum()
+        ),
     }
 
 
@@ -1961,7 +2029,6 @@ def _clone_tax_unit_rows_to_target(
 ) -> tuple[pd.DataFrame, dict[str, int]] | tuple[None, dict[str, int]]:
     age_col = _period_column("age", base_year)
     household_weight_col = _period_column("household_weight", base_year)
-    person_weight_col = _period_column("person_weight", base_year)
     person_id_col = _period_column(PERSON_ID_COLUMN, base_year)
 
     adults = donor_rows[donor_rows[age_col] >= 18].sort_values(age_col, ascending=False)
@@ -1975,7 +2042,10 @@ def _clone_tax_unit_rows_to_target(
     ):
         return None, id_counters
 
-    cloned = donor_rows.copy()
+    cloned = donor_rows.drop(
+        columns=[_period_column("person_weight", base_year)],
+        errors="ignore",
+    ).copy()
     household_id = id_counters["household"]
     id_counters["household"] += 1
     for entity_name, columns in ENTITY_ID_COLUMNS.items():
@@ -2003,13 +2073,6 @@ def _clone_tax_unit_rows_to_target(
             * clone_weight_scale
             / max(clone_weight_divisor, 1)
         )
-    if person_weight_col in cloned.columns:
-        cloned[person_weight_col] = (
-            cloned[person_weight_col].astype(float)
-            * clone_weight_scale
-            / max(clone_weight_divisor, 1)
-        )
-
     adult_indices = adults.index.tolist()
     head_idx = adult_indices[0]
     spouse_idx = adult_indices[1] if target_has_spouse else None
@@ -2115,7 +2178,6 @@ def _compose_role_donor_rows_to_target(
 ) -> tuple[pd.DataFrame, dict[str, int]] | tuple[None, dict[str, int]]:
     age_col = _period_column("age", base_year)
     household_weight_col = _period_column("household_weight", base_year)
-    person_weight_col = _period_column("person_weight", base_year)
     person_id_col = _period_column(PERSON_ID_COLUMN, base_year)
 
     def _adult_rows(df: pd.DataFrame | None) -> pd.DataFrame:
@@ -2248,6 +2310,11 @@ def _dependent_rows(df: pd.DataFrame | None) -> pd.DataFrame:
     # Reset duplicate donor indices so later row-specific retargeting only touches
     # the intended clone row.
     cloned = pd.DataFrame(selected_rows).reset_index(drop=True).copy()
+    cloned.drop(
+        columns=[_period_column("person_weight", base_year)],
+        inplace=True,
+        errors="ignore",
+    )
     cloned_sources = pd.Series(selected_sources, index=cloned.index)
     household_id = id_counters["household"]
     id_counters["household"] += 1
@@ -2276,13 +2343,6 @@ def _dependent_rows(df: pd.DataFrame | None) -> pd.DataFrame:
             * clone_weight_scale
             / max(clone_weight_divisor, 1)
         )
-    if person_weight_col in cloned.columns:
-        cloned[person_weight_col] = (
-            cloned[person_weight_col].astype(float)
-            * clone_weight_scale
-            / max(clone_weight_divisor, 1)
-        )
-
     head_idx = cloned.index[0]
     spouse_idx = cloned.index[1] if target_candidate.spouse_age is not None else None
     dependent_indices = (
@@ -2401,6 +2461,11 @@ def build_donor_backed_augmented_input_dataframe(
         base_year=base_year,
         target_year=target_year,
     )
+    input_df = ensure_person_level_core_inputs(
+        input_df,
+        sim,
+        base_year=base_year,
+    )
     actual_summary = build_actual_tax_unit_summary(base_dataset, reform=reform)
     base_aggregates = load_base_aggregates(base_dataset, reform=reform)
     ss_scale = load_ssa_benefit_projections(target_year) / max(
@@ -2552,6 +2617,11 @@ def build_role_composite_augmented_input_dataframe(
         base_year=base_year,
         target_year=target_year,
     )
+    input_df = ensure_person_level_core_inputs(
+        input_df,
+        sim,
+        base_year=base_year,
+    )
     actual_summary = build_actual_tax_unit_summary(base_dataset, reform=reform)
     base_aggregates = load_base_aggregates(base_dataset, reform=reform)
     ss_scale = load_ssa_benefit_projections(target_year) / max(
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Fix long-run support augmentation and H5 materialization under household-derived microsimulation weights.`