Skip to content

Commit 40314a7

Browse files
authored
Construct CPS tax units from household records (#824)
* Construct CPS tax units from household records
* Fix CPS tax unit construction CI
1 parent cd45baa commit 40314a7

10 files changed

Lines changed: 2442 additions & 8 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Construct CPS tax units from ASEC household relationships instead of using Census tax-unit assignments.

policyengine_us_data/datasets/cps/census_cps.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
from zipfile import ZipFile
66
import pandas as pd
77
from policyengine_us_data.storage import STORAGE_FOLDER
8+
from policyengine_us_data.datasets.cps.tax_unit_construction import (
9+
construct_tax_units,
10+
)
811

912

1013
OPTIONAL_PERSON_COLUMNS = {
@@ -26,6 +29,7 @@
2629
"NOW_CHAMPVA",
2730
"NOW_VACARE",
2831
"NOW_IHSFLG",
32+
"PTOTVAL",
2933
}
3034

3135

@@ -59,6 +63,9 @@ class CensusCPS(Dataset):
5963
time_period: int
6064
"""Year of the dataset."""
6165

66+
tax_unit_construction_mode: str = "policyengine"
67+
"""Mode used when constructing tax units from CPS person records."""
68+
6269
def generate(self):
6370
if self._cps_download_url is None:
6471
raise ValueError(f"No raw CPS data URL known for year {self.time_period}.")
@@ -117,6 +124,7 @@ def generate(self):
117124
usecols=person_usecols,
118125
).fillna(0)
119126
person = _fill_missing_optional_person_columns(person)
127+
tax_unit = self._create_tax_unit_table(person)
120128
storage["person"] = person
121129
with zipfile.open(f"{file_prefix}ffpub{file_year_code}.csv") as f:
122130
person_family_id = person.PH_SEQ * 10 + person.PF_SEQ
@@ -130,7 +138,7 @@ def generate(self):
130138
household_id = household.H_SEQ
131139
household = household[household_id.isin(person_household_id)]
132140
storage["household"] = household
133-
storage["tax_unit"] = self._create_tax_unit_table(person)
141+
storage["tax_unit"] = tax_unit
134142
storage["spm_unit"] = self._create_spm_unit_table(
135143
person, self.time_period
136144
)
@@ -139,10 +147,20 @@ def generate(self):
139147
def _cps_download_url(self) -> str:
140148
return CPS_URL_BY_YEAR.get(self.time_period)
141149

142-
def _create_tax_unit_table(self, person: pd.DataFrame) -> pd.DataFrame:
143-
tax_unit_df = person[TAX_UNIT_COLUMNS].groupby(person.TAX_ID).sum()
144-
tax_unit_df["TAX_ID"] = tax_unit_df.index
145-
return tax_unit_df
150+
def _create_tax_unit_table(
    self,
    person: pd.DataFrame,
    mode: str | None = None,
) -> pd.DataFrame:
    """Build the tax-unit table via ``construct_tax_units``.

    The ``person`` frame is modified in place: its incoming ``TAX_ID``
    column is copied to ``CENSUS_TAX_ID``, and ``TAX_ID`` is then
    overwritten with the IDs returned by ``construct_tax_units``.

    Args:
        person: Person-level records; must contain a ``TAX_ID`` column.
        mode: Construction mode forwarded to ``construct_tax_units``.
            A falsy value (``None`` or ``""``) falls back to
            ``self.tax_unit_construction_mode``.

    Returns:
        The ``TAX_ID`` column of the constructed tax-unit table, as a
        one-column DataFrame.
    """
    # Keep the incoming IDs available before TAX_ID is replaced below.
    person["CENSUS_TAX_ID"] = person["TAX_ID"]

    if not mode:
        mode = self.tax_unit_construction_mode

    construction = construct_tax_units(
        person=person,
        year=self.time_period,
        mode=mode,
    )
    constructed_person, tax_unit_df = construction

    # ``.values`` assigns positionally, ignoring index alignment.
    # NOTE(review): this relies on construct_tax_units returning the person
    # rows in the same order as the input — confirm against that helper.
    person["TAX_ID"] = constructed_person["TAX_ID"].values
    return tax_unit_df[["TAX_ID"]]
146164

147165
def _create_spm_unit_table(
148166
self, person: pd.DataFrame, time_period: int
@@ -282,12 +300,18 @@ class CensusCPS_2018(CensusCPS):
282300
"PF_SEQ",
283301
"P_SEQ",
284302
"TAX_ID",
303+
"PECOHAB",
285304
"SPM_ID",
286305
"A_FNLWGT",
287306
"A_LINENO",
288307
"A_SPOUSE",
308+
"A_EXPRRP",
309+
"A_FAMREL",
310+
"A_FAMTYP",
289311
"A_AGE",
290312
"A_SEX",
313+
"A_ENRLW",
314+
"A_FTPT",
291315
"PEDISEYE",
292316
"NOW_COV",
293317
"NOW_DIR",
@@ -318,6 +342,7 @@ class CensusCPS_2018(CensusCPS):
318342
"LKWEEKS", # Weeks looking for work during the year (Census variable)
319343
"ANN_VAL",
320344
"PNSN_VAL",
345+
"PTOTVAL",
321346
"OI_OFF",
322347
"OI_VAL",
323348
"CSP_VAL",

policyengine_us_data/datasets/cps/cps.py

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,16 @@
33
from policyengine_core.data import Dataset
44
from policyengine_us_data.storage import STORAGE_FOLDER, DOCS_FOLDER
55
import h5py
6-
from policyengine_us_data.datasets.cps.census_cps import *
6+
from policyengine_us_data.datasets.cps.census_cps import (
7+
CensusCPS,
8+
CensusCPS_2018,
9+
CensusCPS_2019,
10+
CensusCPS_2020,
11+
CensusCPS_2021,
12+
CensusCPS_2022,
13+
CensusCPS_2023,
14+
CensusCPS_2024,
15+
)
716
from pandas import DataFrame, Series
817
import numpy as np
918
import pandas as pd
@@ -138,6 +147,7 @@ def generate(self):
138147
person, tax_unit, family, spm_unit, household = [
139148
raw_data[entity] for entity in ENTITIES
140149
]
150+
_validate_raw_cps_schema(person, tax_unit, self.raw_cps.name)
141151

142152
logging.info("Adding ID variables")
143153
add_id_variables(cps, person, tax_unit, family, spm_unit, household)
@@ -562,6 +572,33 @@ def uprate_cps_data(data, from_period, to_period):
562572
return data
563573

564574

575+
def _validate_raw_cps_schema(
576+
person: DataFrame,
577+
tax_unit: DataFrame,
578+
raw_cps_name: str,
579+
) -> None:
580+
required_person_columns = {
581+
"CENSUS_TAX_ID",
582+
}
583+
required_tax_unit_columns = set()
584+
585+
missing_person = sorted(required_person_columns - set(person.columns))
586+
missing_tax_unit = sorted(required_tax_unit_columns - set(tax_unit.columns))
587+
if not missing_person and not missing_tax_unit:
588+
return
589+
590+
missing_parts = []
591+
if missing_person:
592+
missing_parts.append("person: " + ", ".join(missing_person))
593+
if missing_tax_unit:
594+
missing_parts.append("tax_unit: " + ", ".join(missing_tax_unit))
595+
596+
raise ValueError(
597+
f"Raw CPS dataset {raw_cps_name} is stale and must be regenerated; "
598+
f"missing constructed tax-unit columns ({'; '.join(missing_parts)})."
599+
)
600+
601+
565602
def add_id_variables(
566603
cps: h5py.File,
567604
person: DataFrame,
@@ -717,8 +754,12 @@ def children_per_parent(col: str) -> pd.DataFrame:
717754
cps["is_surviving_spouse"] = person.A_MARITL == 4
718755
cps["is_separated"] = person.A_MARITL == 6
719756
# High school or college/university enrollment status.
720-
cps["is_full_time_college_student"] = person.A_HSCOL == 2
721-
757+
if "A_FTPT" in person.columns:
758+
cps["is_full_time_college_student"] = (person.A_HSCOL == 2) & (
759+
person.A_FTPT == 1
760+
)
761+
else:
762+
cps["is_full_time_college_student"] = person.A_HSCOL == 2
722763
cps["detailed_occupation_recode"] = person.POCCU2
723764
cps["treasury_tipped_occupation_code"] = derive_treasury_tipped_occupation_code(
724765
person.PEIOOCC

0 commit comments

Comments
 (0)