2828 CENSUS_DOCUMENTED_MODE ,
2929 }
3030)
# Person-record disability indicator columns. `_has_disability` marks a person
# as having a disability when any of these columns equals 1.
DISABILITY_FLAGS = (
    "PEDISDRS",
    "PEDISEAR",
    "PEDISEYE",
    "PEDISOUT",
    "PEDISPHY",
    "PEDISREM",
)

# Helper columns written once by `_precompute_tax_unit_inputs` and read back
# by `_prepare_household_people`, so the per-household path can skip
# recomputing them.
_GROSS_INCOME_COLUMN = "_tax_unit_gross_income"
_CLAIMANT_INCOME_COLUMN = "_tax_unit_claimant_income"
_TOTAL_MONEY_INCOME_COLUMN = "_tax_unit_total_money_income"
_HAS_DISABILITY_COLUMN = "_tax_unit_has_disability"
_IS_FULL_TIME_STUDENT_COLUMN = "_tax_unit_is_full_time_student"
3144
3245
3346@dataclass (frozen = True )
@@ -88,17 +101,27 @@ def _to_optional_parent_line(value) -> int | None:
88101 return value if value > 0 else None
89102
90103
91- def _positive_series (person : pd .DataFrame , column : str ) -> np .ndarray :
104+ def _numeric_array (
105+ person : pd .DataFrame ,
106+ column : str ,
107+ * ,
108+ default : float = 0 ,
109+ ) -> np .ndarray :
92110 if column not in person :
93- return np .zeros (len (person ), dtype = float )
94- values = (
95- pd .to_numeric (person [column ], errors = "coerce" )
96- .fillna (0 )
97- .to_numpy (
111+ return np .full (len (person ), default , dtype = float )
112+ series = person [column ]
113+ if pd .api .types .is_numeric_dtype (series ):
114+ values = series .to_numpy (dtype = float , copy = False )
115+ else :
116+ values = pd .to_numeric (series , errors = "coerce" ).to_numpy (
98117 dtype = float ,
99118 copy = False ,
100119 )
101- )
120+ return np .nan_to_num (values , nan = default )
121+
122+
123+ def _positive_series (person : pd .DataFrame , column : str ) -> np .ndarray :
124+ values = _numeric_array (person , column )
102125 return np .maximum (values , 0 )
103126
104127
@@ -122,64 +145,71 @@ def _estimate_claimant_income(person: pd.DataFrame) -> np.ndarray:
122145 return estimate_dependent_gross_income (person ) + _positive_series (person , "SS_VAL" )
123146
124147
def _has_disability(person: pd.DataFrame) -> np.ndarray:
    """Return a boolean array flagging rows where any disability column is 1.

    Columns listed in ``DISABILITY_FLAGS`` that are absent from ``person``
    are skipped, so a frame with none of them yields all ``False``.
    """
    has_disability = np.zeros(len(person), dtype=bool)
    for flag in DISABILITY_FLAGS:
        if flag in person:
            # Missing or unparseable entries become 0 and so never match.
            has_disability |= _numeric_array(person, flag) == 1
    return has_disability
154+
155+
def _is_full_time_student(person: pd.DataFrame) -> np.ndarray:
    """Return a boolean array flagging rows treated as full-time students.

    A row qualifies when ``A_ENRLW`` is 1 and either ``A_FTPT`` is 1 or
    ``A_HSCOL`` is 1 or 2.  Absent columns are read as all zeros.
    """
    enrolled = _numeric_array(person, "A_ENRLW") == 1
    full_time = _numeric_array(person, "A_FTPT") == 1
    school_level_matches = np.isin(_numeric_array(person, "A_HSCOL"), [1, 2])
    # Limit this to tax-unit construction: CPS TAX_ID behavior treats current
    # high-school or college enrollment as strong student evidence for young
    # adults even when the full-time flag is absent or part-time.
    return enrolled & (full_time | school_level_matches)
166+
167+
def _precompute_tax_unit_inputs(person: pd.DataFrame) -> pd.DataFrame:
    """Attach the per-person tax-unit inputs to ``person`` as helper columns.

    Computes gross income, claimant income (gross income plus non-negative
    ``SS_VAL``), total money income (``PTOTVAL`` when present, otherwise
    claimant income), the disability flag, and the full-time-student flag,
    and stores each under its ``_tax_unit_*`` column name.

    Note: ``person`` is modified in place and the same object is returned,
    so pass a frame you own (the visible caller passes the result of
    ``reset_index(drop=True)``).
    """
    gross_income = estimate_dependent_gross_income(person)
    claimant_income = gross_income + _positive_series(person, "SS_VAL")
    person[_GROSS_INCOME_COLUMN] = gross_income
    person[_CLAIMANT_INCOME_COLUMN] = claimant_income
    person[_TOTAL_MONEY_INCOME_COLUMN] = (
        _numeric_array(person, "PTOTVAL") if "PTOTVAL" in person else claimant_income
    )
    person[_HAS_DISABILITY_COLUMN] = _has_disability(person)
    person[_IS_FULL_TIME_STUDENT_COLUMN] = _is_full_time_student(person)
    return person
180+
181+
125182def _prepare_household_people (
126183 household : pd .DataFrame ,
127184 household_id : int ,
128185) -> list [_HouseholdPerson ]:
129- disability_flags = [
130- "PEDISDRS" ,
131- "PEDISEAR" ,
132- "PEDISEYE" ,
133- "PEDISOUT" ,
134- "PEDISPHY" ,
135- "PEDISREM" ,
136- ]
137- gross_income = estimate_dependent_gross_income (household )
138- claimant_income = _estimate_claimant_income ( household )
186+ gross_income = (
187+ household [ _GROSS_INCOME_COLUMN ]. to_numpy ( dtype = float , copy = False )
188+ if _GROSS_INCOME_COLUMN in household
189+ else estimate_dependent_gross_income ( household )
190+ )
191+ claimant_income = (
192+ household [ _CLAIMANT_INCOME_COLUMN ]. to_numpy ( dtype = float , copy = False )
193+ if _CLAIMANT_INCOME_COLUMN in household
194+ else _estimate_claimant_income (household )
195+ )
139196 total_money_income = (
140- pd . to_numeric ( household ["PTOTVAL" ], errors = "coerce" )
141- . fillna ( 0 )
142- . to_numpy ( dtype = float , copy = False )
197+ household [_TOTAL_MONEY_INCOME_COLUMN ]. to_numpy ( dtype = float , copy = False )
198+ if _TOTAL_MONEY_INCOME_COLUMN in household
199+ else _numeric_array ( household , "PTOTVAL" )
143200 if "PTOTVAL" in household
144201 else claimant_income .copy ()
145202 )
146203 has_disability = (
147- pd .DataFrame (
148- {
149- flag : household [flag ] if flag in household else 0
150- for flag in disability_flags
151- },
152- index = household .index ,
153- )
154- .eq (1 )
155- .any (axis = 1 )
156- .to_numpy ()
204+ household [_HAS_DISABILITY_COLUMN ].to_numpy (dtype = bool , copy = False )
205+ if _HAS_DISABILITY_COLUMN in household
206+ else _has_disability (household )
157207 )
158- enrolled = (
159- household ["A_ENRLW" ]
160- if "A_ENRLW" in household
161- else pd .Series (0 , index = household .index )
162- )
163- full_time = (
164- household ["A_FTPT" ]
165- if "A_FTPT" in household
166- else pd .Series (0 , index = household .index )
167- )
168- school_level = (
169- household ["A_HSCOL" ]
170- if "A_HSCOL" in household
171- else pd .Series (0 , index = household .index )
172- )
173- enrolled_values = pd .to_numeric (enrolled , errors = "coerce" ).fillna (0 )
174- full_time_values = pd .to_numeric (full_time , errors = "coerce" ).fillna (0 )
175- school_level_values = pd .to_numeric (school_level , errors = "coerce" ).fillna (0 )
176- # Limit this to tax-unit construction: CPS TAX_ID behavior treats current
177- # high-school or college enrollment as strong student evidence for young
178- # adults even when the full-time flag is absent or part-time.
179208 is_full_time_student = (
180- ((enrolled_values == 1 ) & (full_time_values == 1 ))
181- | ((enrolled_values == 1 ) & school_level_values .isin ([1 , 2 ]))
182- ).to_numpy ()
209+ household [_IS_FULL_TIME_STUDENT_COLUMN ].to_numpy (dtype = bool , copy = False )
210+ if _IS_FULL_TIME_STUDENT_COLUMN in household
211+ else _is_full_time_student (household )
212+ )
183213 people = []
184214 for row_number , (index , row ) in enumerate (household .iterrows ()):
185215 line_no = int (row ["A_LINENO" ])
@@ -788,7 +818,7 @@ def construct_tax_units(
788818 )
789819
790820 original_index = person .index
791- person = person .reset_index (drop = True )
821+ person = _precompute_tax_unit_inputs ( person .reset_index (drop = True ) )
792822 person_assignments = pd .DataFrame (index = original_index )
793823 unit_key_records : list [tuple ] = []
794824 unit_filing_records : list [str ] = []
0 commit comments