Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/student-loan-balance-imputation.added.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Impute `student_loan_balance` from WAS loan aggregates and retrain stale cached wealth models when that new output is missing.
101 changes: 55 additions & 46 deletions policyengine_uk_data/datasets/imputations/wealth.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from policyengine_uk_data.storage import STORAGE_FOLDER
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk import Microsimulation
from policyengine_uk_data.utils.qrf import QRF

WAS_TAB_FOLDER = STORAGE_FOLDER / "was_2006_20"

Expand Down Expand Up @@ -52,8 +53,51 @@
"non_residential_property_value",
"savings",
"num_vehicles",
"student_loan_balance",
]

# Maps raw WAS column names (keys) to the descriptive column names used in
# this module (values). generate_was_table lower-cases the keys before
# applying them, so the mixed casing below is not significant at lookup time.
WAS_RENAMES = {
    "R7xshhwgt": "household_weight",
    # Components for estimating land holdings.
    "DVLUKValR7_sum": "owned_land",  # In the UK.
    "DVPropertyR7": "property_wealth",
    "DVFESHARESR7_aggr": "emp_shares_options",
    "DVFShUKVR7_aggr": "uk_shares",
    "DVIISAVR7_aggr": "investment_isas",
    "DVFCollVR7_aggr": "unit_investment_trusts",
    "TotpenR7_aggr": "pensions",
    "DvvalDBTR7_aggr": "db_pensions",
    # Predictors for fusing to FRS.
    "dvtotgirR7": "gross_income",
    "NumAdultW7": "num_adults",
    "NumCh18W7": "num_children",
    # Household Gross Annual income from occupational or private pensions
    "DVGIPPENR7_AGGR": "private_pension_income",
    "DVGISER7_AGGR": "self_employment_income",
    # Household Gross annual income from investments
    "DVGIINVR7_aggr": "capital_income",
    # Household Total Annual Gross employee income
    "DVGIEMPR7_AGGR": "employment_income",
    "HBedrmW7": "num_bedrooms",
    "GORR7": "region",
    "DVPriRntW7": "is_renter",  # {1, 2} TODO: Get codebook values.
    "CTAmtW7": "council_tax",
    # Other columns for reference.
    "DVLOSValR7_sum": "non_uk_land",
    "HFINWNTR7_Sum": "net_financial_wealth",
    "DVLUKDebtR7_sum": "uk_land_debt",
    "HFINWR7_Sum": "gross_financial_wealth",
    "TotWlthR7": "wealth",
    "DVhvalueR7": "main_residence_value",
    "DVHseValR7_sum": "other_residential_property_value",
    "DVBlDValR7_sum": "non_residential_property_value",
    "DVTotinc_bhcR7": "household_net_income",
    "DVSaValR7_aggr": "savings",
    "vcarnr7": "num_vehicles",
    # Loan aggregates: generate_was_table derives student_loan_balance as
    # total_loans minus total_loans_exc_slc.
    "Tot_LosR7_aggr": "total_loans",
    "Tot_los_exc_SLCR7_aggr": "total_loans_exc_slc",
}


def generate_was_table(was: pd.DataFrame):
"""
Expand All @@ -70,47 +114,7 @@ def generate_was_table(was: pd.DataFrame):
to_remove = []
to_add = {}

RENAMES = {
"R7xshhwgt": "household_weight",
# Components for estimating land holdings.
"DVLUKValR7_sum": "owned_land", # In the UK.
"DVPropertyR7": "property_wealth",
"DVFESHARESR7_aggr": "emp_shares_options",
"DVFShUKVR7_aggr": "uk_shares",
"DVIISAVR7_aggr": "investment_isas",
"DVFCollVR7_aggr": "unit_investment_trusts",
"TotpenR7_aggr": "pensions",
"DvvalDBTR7_aggr": "db_pensions",
# Predictors for fusing to FRS.
"dvtotgirR7": "gross_income",
"NumAdultW7": "num_adults",
"NumCh18W7": "num_children",
# Household Gross Annual income from occupational or private pensions
"DVGIPPENR7_AGGR": "private_pension_income",
"DVGISER7_AGGR": "self_employment_income",
# Household Gross annual income from investments
"DVGIINVR7_aggr": "capital_income",
# Household Total Annual Gross employee income
"DVGIEMPR7_AGGR": "employment_income",
"HBedrmW7": "num_bedrooms",
"GORR7": "region",
"DVPriRntW7": "is_renter", # {1, 2} TODO: Get codebook values.
"CTAmtW7": "council_tax",
# Other columns for reference.
"DVLOSValR7_sum": "non_uk_land",
"HFINWNTR7_Sum": "net_financial_wealth",
"DVLUKDebtR7_sum": "uk_land_debt",
"HFINWR7_Sum": "gross_financial_wealth",
"TotWlthR7": "wealth",
"DVhvalueR7": "main_residence_value",
"DVHseValR7_sum": "other_residential_property_value",
"DVBlDValR7_sum": "non_residential_property_value",
"DVTotinc_bhcR7": "household_net_income",
"DVSaValR7_aggr": "savings",
"vcarnr7": "num_vehicles",
}

RENAMES = {x.lower(): y for x, y in RENAMES.items()}
RENAMES = {x.lower(): y for x, y in WAS_RENAMES.items()}

for key in RENAMES:
key = key.lower()
Expand Down Expand Up @@ -145,19 +149,24 @@ def generate_was_table(was: pd.DataFrame):
"unit_investment_trusts",
]
].sum(axis=1)
was["student_loan_balance"] = was["total_loans"] - was["total_loans_exc_slc"]
was["region"] = was["region"].map(REGIONS)
return was


def _wealth_model_outputs_are_current(model: QRF) -> bool:
"""Check whether a cached wealth model includes all current output columns."""
trained_outputs = getattr(model.model, "imputed_variables", None)
return list(trained_outputs) == IMPUTE_VARIABLES


def save_imputation_models():
"""
Train and save wealth imputation model.

Returns:
Trained QRF model.
"""
from policyengine_uk_data.utils.qrf import QRF

was = pd.read_csv(
WAS_TAB_FOLDER / "was_round_7_hhold_eul_march_2022.tab",
sep="\t",
Expand Down Expand Up @@ -185,10 +194,10 @@ def create_wealth_model(overwrite_existing: bool = False):
Returns:
QRF model for wealth imputation.
"""
from policyengine_uk_data.utils.qrf import QRF

if (STORAGE_FOLDER / "wealth.pkl").exists() and not overwrite_existing:
return QRF(file_path=STORAGE_FOLDER / "wealth.pkl")
wealth = QRF(file_path=STORAGE_FOLDER / "wealth.pkl")
if _wealth_model_outputs_are_current(wealth):
return wealth
return save_imputation_models()


Expand Down
74 changes: 74 additions & 0 deletions policyengine_uk_data/tests/test_student_loan_balance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import importlib.util
from pathlib import Path
from types import SimpleNamespace

import pandas as pd

# Load wealth.py directly from its file path under a private module name and
# execute it, binding the resulting module object to `wealth` for the tests.
# NOTE(review): presumably this avoids importing the module through the
# package hierarchy — confirm; wealth.py's own top-level imports still run.
_WEALTH_PATH = (
    Path(__file__).resolve().parents[1] / "datasets" / "imputations" / "wealth.py"
)
_WEALTH_SPEC = importlib.util.spec_from_file_location(
    "student_loan_balance_wealth_module",
    _WEALTH_PATH,
)
wealth = importlib.util.module_from_spec(_WEALTH_SPEC)
# Executes the module body so its definitions become attributes of `wealth`.
_WEALTH_SPEC.loader.exec_module(wealth)


def test_generate_was_table_derives_student_loan_balance():
    """student_loan_balance is total loans minus loans excluding SLC."""
    # Start every raw WAS column at zero, then override the few that matter.
    raw_record = dict.fromkeys(wealth.WAS_RENAMES, 0)
    raw_record.update(
        {
            "R7xshhwgt": 1,
            "GORR7": 11,
            "DVPriRntW7": 1,
            "TotpenR7_aggr": 100,
            "DvvalDBTR7_aggr": 25,
            "Tot_LosR7_aggr": 20_000,
            "Tot_los_exc_SLCR7_aggr": 5_000,
        }
    )
    single_row_frame = pd.DataFrame([raw_record])

    result = wealth.generate_was_table(single_row_frame)

    assert "student_loan_balance" in result.columns
    assert result["student_loan_balance"].iloc[0] == 15_000
    assert "student_loan_balance" in wealth.IMPUTE_VARIABLES


def test_create_wealth_model_reuses_current_cached_model(tmp_path, monkeypatch):
    """A cached model with up-to-date outputs is returned without retraining."""
    pickle_path = tmp_path / "wealth.pkl"
    # The stub loader below never reads these bytes.
    pickle_path.write_bytes(b"placeholder")
    stub_inner_model = SimpleNamespace(
        imputed_variables=list(wealth.IMPUTE_VARIABLES)
    )

    class DummyQRF:
        def __init__(self, file_path=None):
            assert file_path == pickle_path
            self.model = stub_inner_model

    def _fail_on_retrain():
        raise AssertionError("should not retrain")

    # Point the module at the temp folder and swap in the stubs.
    for attribute, replacement in (
        ("STORAGE_FOLDER", tmp_path),
        ("QRF", DummyQRF),
        ("save_imputation_models", _fail_on_retrain),
    ):
        monkeypatch.setattr(wealth, attribute, replacement)

    loaded = wealth.create_wealth_model()
    assert loaded.model.imputed_variables == list(wealth.IMPUTE_VARIABLES)


def test_create_wealth_model_retrains_when_cached_outputs_stale(tmp_path, monkeypatch):
    """A cached model with outdated outputs triggers a retrain."""
    pickle_path = tmp_path / "wealth.pkl"
    pickle_path.write_bytes(b"placeholder")
    retrained_sentinel = object()

    class DummyQRF:
        def __init__(self, file_path=None):
            assert file_path == pickle_path
            # Only one output recorded, so it cannot match IMPUTE_VARIABLES.
            self.model = SimpleNamespace(imputed_variables=["owned_land"])

    def _retrain_stub():
        return retrained_sentinel

    monkeypatch.setattr(wealth, "STORAGE_FOLDER", tmp_path)
    monkeypatch.setattr(wealth, "QRF", DummyQRF)
    monkeypatch.setattr(wealth, "save_imputation_models", _retrain_stub)

    outcome = wealth.create_wealth_model()
    assert outcome is retrained_sentinel
Loading