55from zipfile import ZipFile
66import pandas as pd
77from policyengine_us_data .storage import STORAGE_FOLDER
8+ from policyengine_us_data .datasets .cps .tax_unit_construction import (
9+ construct_tax_units ,
10+ )
811
912
1013OPTIONAL_PERSON_COLUMNS = {
2629 "NOW_CHAMPVA" ,
2730 "NOW_VACARE" ,
2831 "NOW_IHSFLG" ,
32+ "PTOTVAL" ,
2933}
3034
3135
@@ -59,6 +63,9 @@ class CensusCPS(Dataset):
5963 time_period : int
6064 """Year of the dataset."""
6165
66+ tax_unit_construction_mode : str = "policyengine"
67+ """Mode used when constructing tax units from CPS person records."""
68+
6269 def generate (self ):
6370 if self ._cps_download_url is None :
6471 raise ValueError (f"No raw CPS data URL known for year { self .time_period } ." )
@@ -117,6 +124,7 @@ def generate(self):
117124 usecols = person_usecols ,
118125 ).fillna (0 )
119126 person = _fill_missing_optional_person_columns (person )
127+ tax_unit = self ._create_tax_unit_table (person )
120128 storage ["person" ] = person
121129 with zipfile .open (f"{ file_prefix } ffpub{ file_year_code } .csv" ) as f :
122130 person_family_id = person .PH_SEQ * 10 + person .PF_SEQ
@@ -130,7 +138,7 @@ def generate(self):
130138 household_id = household .H_SEQ
131139 household = household [household_id .isin (person_household_id )]
132140 storage ["household" ] = household
133- storage ["tax_unit" ] = self . _create_tax_unit_table ( person )
141+ storage ["tax_unit" ] = tax_unit
134142 storage ["spm_unit" ] = self ._create_spm_unit_table (
135143 person , self .time_period
136144 )
@@ -139,10 +147,20 @@ def generate(self):
def _cps_download_url(self) -> str | None:
    """Return the raw CPS download URL registered for ``self.time_period``.

    The lookup uses ``CPS_URL_BY_YEAR.get``, so a year with no registered
    URL yields ``None`` rather than raising ``KeyError``. ``generate``
    depends on that: it compares this value against ``None`` and raises
    ``ValueError`` itself when no URL is known.

    NOTE(review): ``generate`` reads this as an attribute (no call), so a
    ``@property`` decorator is presumably applied just above this
    definition -- confirm it is still in place.
    """
    return CPS_URL_BY_YEAR.get(self.time_period)
141149
142- def _create_tax_unit_table (self , person : pd .DataFrame ) -> pd .DataFrame :
143- tax_unit_df = person [TAX_UNIT_COLUMNS ].groupby (person .TAX_ID ).sum ()
144- tax_unit_df ["TAX_ID" ] = tax_unit_df .index
145- return tax_unit_df
def _create_tax_unit_table(
    self,
    person: pd.DataFrame,
    mode: str | None = None,
) -> pd.DataFrame:
    """Build the tax-unit table and re-key ``person`` to the built units.

    ``person`` is modified in place:

    * ``CENSUS_TAX_ID`` is added, holding the ``TAX_ID`` values the frame
      had on entry.
    * ``TAX_ID`` is overwritten with the ``TAX_ID`` column of the person
      frame returned by ``construct_tax_units``.

    Args:
        person: Person-level CPS records; must contain a ``TAX_ID``
            column.
        mode: Construction mode forwarded to ``construct_tax_units``.
            Any falsy value (``None``, ``""``) falls back to
            ``self.tax_unit_construction_mode``.

    Returns:
        A one-column DataFrame (``TAX_ID``) selected from the tax-unit
        frame returned by ``construct_tax_units``.
    """
    # Snapshot the incoming identifier before it is replaced below.
    person["CENSUS_TAX_ID"] = person["TAX_ID"]

    if not mode:
        mode = self.tax_unit_construction_mode

    rebuilt_person, tax_units = construct_tax_units(
        person=person,
        year=self.time_period,
        mode=mode,
    )

    # ``.values`` drops the index, so the new IDs are written back by row
    # position rather than by label.
    # NOTE(review): this assumes construct_tax_units returns person rows
    # in the same order as the input -- confirm against its contract.
    new_tax_ids = rebuilt_person["TAX_ID"].values
    person["TAX_ID"] = new_tax_ids

    return tax_units[["TAX_ID"]]
146164
147165 def _create_spm_unit_table (
148166 self , person : pd .DataFrame , time_period : int
@@ -282,12 +300,18 @@ class CensusCPS_2018(CensusCPS):
282300 "PF_SEQ" ,
283301 "P_SEQ" ,
284302 "TAX_ID" ,
303+ "PECOHAB" ,
285304 "SPM_ID" ,
286305 "A_FNLWGT" ,
287306 "A_LINENO" ,
288307 "A_SPOUSE" ,
308+ "A_EXPRRP" ,
309+ "A_FAMREL" ,
310+ "A_FAMTYP" ,
289311 "A_AGE" ,
290312 "A_SEX" ,
313+ "A_ENRLW" ,
314+ "A_FTPT" ,
291315 "PEDISEYE" ,
292316 "NOW_COV" ,
293317 "NOW_DIR" ,
@@ -318,6 +342,7 @@ class CensusCPS_2018(CensusCPS):
318342 "LKWEEKS" , # Weeks looking for work during the year (Census variable)
319343 "ANN_VAL" ,
320344 "PNSN_VAL" ,
345+ "PTOTVAL" ,
321346 "OI_OFF" ,
322347 "OI_VAL" ,
323348 "CSP_VAL" ,
0 commit comments