Skip to content

Commit 0a866ea

Browse files
authored
Precompute tax unit construction inputs (#937)
1 parent 417531e commit 0a866ea

2 files changed

Lines changed: 86 additions & 55 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Precompute tax-unit construction inputs once per dataset to speed ACS builds.

policyengine_us_data/datasets/cps/tax_unit_construction.py

Lines changed: 85 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,19 @@
2828
CENSUS_DOCUMENTED_MODE,
2929
}
3030
)
31+
# Person-level columns inspected by _has_disability: a value of 1 in any of
# them marks the person as having a disability.
DISABILITY_FLAGS = (
32+
"PEDISDRS",
33+
"PEDISEAR",
34+
"PEDISEYE",
35+
"PEDISOUT",
36+
"PEDISPHY",
37+
"PEDISREM",
38+
)
39+
# Names of the helper columns that _precompute_tax_unit_inputs writes onto the
# person frame and that _prepare_household_people reads back when present.
_GROSS_INCOME_COLUMN = "_tax_unit_gross_income"
40+
_CLAIMANT_INCOME_COLUMN = "_tax_unit_claimant_income"
41+
_TOTAL_MONEY_INCOME_COLUMN = "_tax_unit_total_money_income"
42+
_HAS_DISABILITY_COLUMN = "_tax_unit_has_disability"
43+
_IS_FULL_TIME_STUDENT_COLUMN = "_tax_unit_is_full_time_student"
3144

3245

3346
@dataclass(frozen=True)
@@ -88,17 +101,27 @@ def _to_optional_parent_line(value) -> int | None:
88101
return value if value > 0 else None
89102

90103

91-
def _positive_series(person: pd.DataFrame, column: str) -> np.ndarray:
104+
def _numeric_array(
105+
person: pd.DataFrame,
106+
column: str,
107+
*,
108+
default: float = 0,
109+
) -> np.ndarray:
92110
if column not in person:
93-
return np.zeros(len(person), dtype=float)
94-
values = (
95-
pd.to_numeric(person[column], errors="coerce")
96-
.fillna(0)
97-
.to_numpy(
111+
return np.full(len(person), default, dtype=float)
112+
series = person[column]
113+
if pd.api.types.is_numeric_dtype(series):
114+
values = series.to_numpy(dtype=float, copy=False)
115+
else:
116+
values = pd.to_numeric(series, errors="coerce").to_numpy(
98117
dtype=float,
99118
copy=False,
100119
)
101-
)
120+
return np.nan_to_num(values, nan=default)
121+
122+
123+
def _positive_series(person: pd.DataFrame, column: str) -> np.ndarray:
124+
values = _numeric_array(person, column)
102125
return np.maximum(values, 0)
103126

104127

@@ -122,64 +145,71 @@ def _estimate_claimant_income(person: pd.DataFrame) -> np.ndarray:
122145
return estimate_dependent_gross_income(person) + _positive_series(person, "SS_VAL")
123146

124147

148+
def _has_disability(person: pd.DataFrame) -> np.ndarray:
    """Return a boolean array marking rows with any disability flag equal to 1.

    Only the ``DISABILITY_FLAGS`` columns that exist in ``person`` are
    consulted; when none exist every entry is ``False``.
    """
    available_flags = [name for name in DISABILITY_FLAGS if name in person]
    any_flag_set = np.zeros(len(person), dtype=bool)
    for name in available_flags:
        flag_is_set = _numeric_array(person, name) == 1
        any_flag_set = any_flag_set | flag_is_set
    return any_flag_set
154+
155+
156+
def _is_full_time_student(person: pd.DataFrame) -> np.ndarray:
    """Return a boolean array marking rows treated as full-time students.

    A row qualifies when ``A_ENRLW`` equals 1 and either ``A_FTPT`` equals 1
    or ``A_HSCOL`` is 1 or 2. Columns missing from ``person`` read as 0.
    """
    is_enrolled = _numeric_array(person, "A_ENRLW") == 1
    is_full_time = _numeric_array(person, "A_FTPT") == 1
    # Limit this to tax-unit construction: CPS TAX_ID behavior treats current
    # high-school or college enrollment as strong student evidence for young
    # adults even when the full-time flag is absent or part-time.
    in_listed_school_level = np.isin(_numeric_array(person, "A_HSCOL"), [1, 2])
    return is_enrolled & (is_full_time | in_listed_school_level)
166+
167+
168+
def _precompute_tax_unit_inputs(person: pd.DataFrame) -> pd.DataFrame:
    """Attach the per-person helper columns used by tax-unit construction.

    The columns are written onto ``person`` itself (the frame is modified in
    place) and the same frame is returned, so callers that need the input
    untouched should pass a copy.
    """
    dependent_gross = estimate_dependent_gross_income(person)
    person[_GROSS_INCOME_COLUMN] = dependent_gross
    person[_CLAIMANT_INCOME_COLUMN] = dependent_gross + _positive_series(
        person, "SS_VAL"
    )

    # Total money income comes from PTOTVAL when that column is available;
    # otherwise the claimant-income estimate stands in for it.
    if "PTOTVAL" in person:
        money_income = _numeric_array(person, "PTOTVAL")
    else:
        money_income = person[_CLAIMANT_INCOME_COLUMN].to_numpy(
            dtype=float, copy=False
        )
    person[_TOTAL_MONEY_INCOME_COLUMN] = money_income

    person[_HAS_DISABILITY_COLUMN] = _has_disability(person)
    person[_IS_FULL_TIME_STUDENT_COLUMN] = _is_full_time_student(person)
    return person
180+
181+
125182
def _prepare_household_people(
126183
household: pd.DataFrame,
127184
household_id: int,
128185
) -> list[_HouseholdPerson]:
129-
disability_flags = [
130-
"PEDISDRS",
131-
"PEDISEAR",
132-
"PEDISEYE",
133-
"PEDISOUT",
134-
"PEDISPHY",
135-
"PEDISREM",
136-
]
137-
gross_income = estimate_dependent_gross_income(household)
138-
claimant_income = _estimate_claimant_income(household)
186+
gross_income = (
187+
household[_GROSS_INCOME_COLUMN].to_numpy(dtype=float, copy=False)
188+
if _GROSS_INCOME_COLUMN in household
189+
else estimate_dependent_gross_income(household)
190+
)
191+
claimant_income = (
192+
household[_CLAIMANT_INCOME_COLUMN].to_numpy(dtype=float, copy=False)
193+
if _CLAIMANT_INCOME_COLUMN in household
194+
else _estimate_claimant_income(household)
195+
)
139196
total_money_income = (
140-
pd.to_numeric(household["PTOTVAL"], errors="coerce")
141-
.fillna(0)
142-
.to_numpy(dtype=float, copy=False)
197+
household[_TOTAL_MONEY_INCOME_COLUMN].to_numpy(dtype=float, copy=False)
198+
if _TOTAL_MONEY_INCOME_COLUMN in household
199+
else _numeric_array(household, "PTOTVAL")
143200
if "PTOTVAL" in household
144201
else claimant_income.copy()
145202
)
146203
has_disability = (
147-
pd.DataFrame(
148-
{
149-
flag: household[flag] if flag in household else 0
150-
for flag in disability_flags
151-
},
152-
index=household.index,
153-
)
154-
.eq(1)
155-
.any(axis=1)
156-
.to_numpy()
204+
household[_HAS_DISABILITY_COLUMN].to_numpy(dtype=bool, copy=False)
205+
if _HAS_DISABILITY_COLUMN in household
206+
else _has_disability(household)
157207
)
158-
enrolled = (
159-
household["A_ENRLW"]
160-
if "A_ENRLW" in household
161-
else pd.Series(0, index=household.index)
162-
)
163-
full_time = (
164-
household["A_FTPT"]
165-
if "A_FTPT" in household
166-
else pd.Series(0, index=household.index)
167-
)
168-
school_level = (
169-
household["A_HSCOL"]
170-
if "A_HSCOL" in household
171-
else pd.Series(0, index=household.index)
172-
)
173-
enrolled_values = pd.to_numeric(enrolled, errors="coerce").fillna(0)
174-
full_time_values = pd.to_numeric(full_time, errors="coerce").fillna(0)
175-
school_level_values = pd.to_numeric(school_level, errors="coerce").fillna(0)
176-
# Limit this to tax-unit construction: CPS TAX_ID behavior treats current
177-
# high-school or college enrollment as strong student evidence for young
178-
# adults even when the full-time flag is absent or part-time.
179208
is_full_time_student = (
180-
((enrolled_values == 1) & (full_time_values == 1))
181-
| ((enrolled_values == 1) & school_level_values.isin([1, 2]))
182-
).to_numpy()
209+
household[_IS_FULL_TIME_STUDENT_COLUMN].to_numpy(dtype=bool, copy=False)
210+
if _IS_FULL_TIME_STUDENT_COLUMN in household
211+
else _is_full_time_student(household)
212+
)
183213
people = []
184214
for row_number, (index, row) in enumerate(household.iterrows()):
185215
line_no = int(row["A_LINENO"])
@@ -788,7 +818,7 @@ def construct_tax_units(
788818
)
789819

790820
original_index = person.index
791-
person = person.reset_index(drop=True)
821+
person = _precompute_tax_unit_inputs(person.reset_index(drop=True))
792822
person_assignments = pd.DataFrame(index=original_index)
793823
unit_key_records: list[tuple] = []
794824
unit_filing_records: list[str] = []

0 commit comments

Comments
 (0)