Merge pull request #211 from PolicyEngine/refactor-income-imputation

nikhilwoodruff · web-flow · commit 048f389649bf · 2025-10-21T11:18:19.000+01:00
Refactor income imputation and remove winter fuel allowance from loss calculations
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,5 @@
+- bump: patch
+  changes:
+    changed:
+    - Refactored income imputation to selectively impute only dividend income on the main dataset.
+    - Removed winter fuel allowance from loss calculations.
diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py
@@ -140,6 +140,31 @@ def create_income_model(overwrite_existing: bool = False):
     return save_imputation_models()
 
 
+def impute_over_incomes(
+    dataset: UKSingleYearDataset, model, output_variables: list[str]
+) -> UKSingleYearDataset:
+    """
+    Impute specified income components using trained model.
+
+    Args:
+        dataset: PolicyEngine UK dataset; imputation is applied to a copy.
+        model: Trained model whose ``predict`` takes age, gender and region.
+        output_variables: Predicted columns to write to the person table.
+
+    Returns:
+        Copy of ``dataset`` with each output variable set (NaN filled with 0).
+    """
+    dataset = dataset.copy()
+    input_df = Microsimulation(dataset=dataset).calculate_dataframe(
+        ["age", "gender", "region"]
+    )
+    output_df = model.predict(input_df)
+    for column in output_variables:
+        dataset.person[column] = output_df[column].fillna(0).values
+
+    return dataset
+
+
 def impute_income(dataset: UKSingleYearDataset) -> UKSingleYearDataset:
     """
     Impute detailed income components using trained model.
@@ -161,16 +186,23 @@ def impute_income(dataset: UKSingleYearDataset) -> UKSingleYearDataset:
     zero_weight_copy = subsample_dataset(zero_weight_copy, 10_000)
 
     model = create_income_model()
-    sim = Microsimulation(dataset=zero_weight_copy)
 
-    input_df = sim.calculate_dataframe(["age", "gender", "region"])
+    # Impute only dividend income on the original dataset; impute the full variable set on the zero-weight copy
 
-    output_df = model.predict(input_df)
+    zero_weight_copy = impute_over_incomes(
+        zero_weight_copy,
+        model,
+        IMPUTATIONS,
+    )
 
-    for column in output_df.columns:
-        zero_weight_copy.person[column] = output_df[column].fillna(0).values
+    dataset = impute_over_incomes(
+        dataset,
+        model,
+        ["dividend_income"],
+    )
 
     zero_weight_copy.validate()
+    dataset.validate()
 
     data = stack_datasets(
         dataset,
diff --git a/policyengine_uk_data/utils/loss.py b/policyengine_uk_data/utils/loss.py
@@ -122,7 +122,7 @@ def pe_count(*variables):
         on_uc * ~unemployed
     )
 
-    df["obr/winter_fuel_allowance_count"] = pe_count("winter_fuel_allowance")
+    # df["obr/winter_fuel_allowance_count"] = pe_count("winter_fuel_allowance")
     df["obr/capital_gains_tax"] = pe("capital_gains_tax")
     df["obr/child_benefit"] = pe("child_benefit")
 
@@ -152,7 +152,7 @@ def pe_count(*variables):
     )
 
     df["obr/vat"] = pe("vat")
-    df["obr/winter_fuel_allowance"] = pe("winter_fuel_allowance")
+    # df["obr/winter_fuel_allowance"] = pe("winter_fuel_allowance")
 
     # Not strictly from the OBR but from the 2024 Independent Schools Council census. OBR will be using that.
     df["obr/private_school_students"] = pe("attends_private_school")

Original file line number	Diff line number	Diff line change
`@@ -122,7 +122,7 @@ def pe_count(*variables):`
`122`	`122`	`on_uc * ~unemployed`
`123`	`123`	`)`
`124`	`124`
`125`		`- df["obr/winter_fuel_allowance_count"] = pe_count("winter_fuel_allowance")`
	`125`	`+ # df["obr/winter_fuel_allowance_count"] = pe_count("winter_fuel_allowance")`
`126`	`126`	`df["obr/capital_gains_tax"] = pe("capital_gains_tax")`
`127`	`127`	`df["obr/child_benefit"] = pe("child_benefit")`
`128`	`128`
`@@ -152,7 +152,7 @@ def pe_count(*variables):`
`152`	`152`	`)`
`153`	`153`
`154`	`154`	`df["obr/vat"] = pe("vat")`
`155`		`- df["obr/winter_fuel_allowance"] = pe("winter_fuel_allowance")`
	`155`	`+ # df["obr/winter_fuel_allowance"] = pe("winter_fuel_allowance")`
`156`	`156`
`157`	`157`	`# Not strictly from the OBR but from the 2024 Independent Schools Council census. OBR will be using that.`
`158`	`158`	`df["obr/private_school_students"] = pe("attends_private_school")`