From b1fbc05f22a9c776cc4696329cbbb57263e36de2 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 15 Apr 2026 09:26:28 -0400 Subject: [PATCH] Assign is_parent from FRS microdata --- changelog.d/73.md | 1 + policyengine_uk_data/datasets/frs.py | 47 +++++++++++++++++++ .../tests/test_is_parent_from_frs.py | 27 +++++++++++ .../tests/test_legacy_benefit_proxies.py | 2 + uv.lock | 2 +- 5 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 changelog.d/73.md create mode 100644 policyengine_uk_data/tests/test_is_parent_from_frs.py diff --git a/changelog.d/73.md b/changelog.d/73.md new file mode 100644 index 000000000..80370f5d2 --- /dev/null +++ b/changelog.d/73.md @@ -0,0 +1 @@ +Assign `is_parent` from FRS adult-table membership and benefit-unit dependent-child counts (#73). diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py index b5e6ef9d3..368da0d44 100644 --- a/policyengine_uk_data/datasets/frs.py +++ b/policyengine_uk_data/datasets/frs.py @@ -228,6 +228,36 @@ def attach_legacy_benefit_proxies_from_frs_person( ) +def derive_is_parent_from_frs_microdata( + person_ids, + person_benunit_ids, + adult_person_ids, + benunit_ids, + dependent_children, +) -> np.ndarray: + """Identify FRS adults in benefit units with dependent children. + + FRS benefit units contain either one adult or a couple plus any dependent + children. Using the raw adult table and benefit-unit dependent-child count + avoids ranking adults across the whole household when multiple benefit + units share a household. + """ + + dependent_children_by_benunit = pd.Series( + np.asarray(dependent_children, dtype=float), + index=np.asarray(benunit_ids), + ) + has_dependent_children = ( + pd.Series(np.asarray(person_benunit_ids)) + .map(dependent_children_by_benunit) + .fillna(0) + .to_numpy() + > 0 + ) + is_adult_record = np.isin(np.asarray(person_ids), np.asarray(adult_person_ids)) + return is_adult_record & has_dependent_children + + def _as_non_negative_array(values) -> np.ndarray: values = np.asarray(values, dtype=float) return np.maximum(np.nan_to_num(values, nan=0.0), 0.0) @@ -443,6 +473,23 @@ def create_frs( pe_person["hours_worked"] = np.maximum(person.tothours, 0) * 52 pe_person["is_household_head"] = person.hrpid == 1 pe_person["is_benunit_head"] = person.uperson == 1 + dependent_children = ( + benunit.depchldb + if "depchldb" in benunit + else frs["child"] + .groupby("benunit_id") + .size() + .reindex(benunit.benunit_id) + .fillna(0) + .to_numpy() + ) + pe_person["is_parent"] = derive_is_parent_from_frs_microdata( + person_ids=pe_person.person_id, + person_benunit_ids=pe_person.person_benunit_id, + adult_person_ids=frs["adult"].person_id, + benunit_ids=pe_benunit.benunit_id, + dependent_children=dependent_children, + ) MARITAL = [ "MARRIED", "SINGLE", diff --git a/policyengine_uk_data/tests/test_is_parent_from_frs.py b/policyengine_uk_data/tests/test_is_parent_from_frs.py new file mode 100644 index 000000000..055563c3b --- /dev/null +++ b/policyengine_uk_data/tests/test_is_parent_from_frs.py @@ -0,0 +1,27 @@ +import numpy as np + +from policyengine_uk_data.datasets.frs import derive_is_parent_from_frs_microdata + + +def test_is_parent_uses_benefit_unit_not_household_rank(): + result = derive_is_parent_from_frs_microdata( + person_ids=np.array([1_001, 1_002, 1_003, 1_004]), + person_benunit_ids=np.array([101, 101, 102, 102]), + adult_person_ids=np.array([1_001, 1_002, 1_003]), + benunit_ids=np.array([101, 102]), + dependent_children=np.array([0, 1]), + ) + + assert result.tolist() == [False, False, True, False] + + +def test_is_parent_marks_both_adults_in_couple_with_children(): + result = derive_is_parent_from_frs_microdata( + person_ids=np.array([2_001, 2_002, 2_003]), + person_benunit_ids=np.array([201, 201, 201]), + adult_person_ids=np.array([2_001, 2_002]), + benunit_ids=np.array([201]), + dependent_children=np.array([1]), + ) + + assert result.tolist() == [True, True, False] diff --git a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py index 7332f36e7..d09e57ee1 100644 --- a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py +++ b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py @@ -465,6 +465,8 @@ def fake_read_csv(path, *args, **kwargs): "legacy_jobseeker_proxy", "esa_health_condition_proxy", "esa_support_group_proxy", + "is_parent", }.issubset(dataset.person.columns) + assert not dataset.person["is_parent"].iloc[0] assert dataset.person["education_grants"].iloc[0] == 100 assert dataset.person["disabled_students_allowance_eligible_expenses"].iloc[0] == 0 diff --git a/uv.lock b/uv.lock index d29d61701..3f83c8fc8 100644 --- a/uv.lock +++ b/uv.lock @@ -1366,7 +1366,7 @@ wheels = [ [[package]] name = "policyengine-uk-data" -version = "1.50.0" +version = "1.50.1" source = { editable = "." } dependencies = [ { name = "google-auth" },