Skip to content

Commit 15629bd

Browse files
authored
Speed up ACS tax unit link inference (#936)
1 parent 613d5f2 commit 15629bd

2 files changed

Lines changed: 150 additions & 108 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Speed up ACS spouse and parent inference in dataset builds.

policyengine_us_data/datasets/acs/acs_to_cps_columns.py

Lines changed: 149 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -324,81 +324,109 @@ def _infer_spouse_lines(
324324
line_no: pd.Series,
325325
age: pd.Series,
326326
) -> tuple[pd.Series, pd.Series]:
327-
spouse_line = pd.Series(0, index=person.index, dtype=int)
328-
imputed = pd.Series(False, index=person.index, dtype=bool)
329327
mar = _numeric(person, "MAR").astype(int)
330328

331-
frame = pd.DataFrame(
332-
{
333-
"household_id": household_id,
334-
"line_no": line_no,
335-
"age": age,
336-
"rel": rel,
337-
"mar": mar,
338-
"sex": _numeric(person, "SEX").astype(int),
339-
},
340-
index=person.index,
341-
)
329+
n = len(person)
330+
spouse_line = np.zeros(n, dtype=np.int64)
331+
imputed = np.zeros(n, dtype=bool)
332+
positions = pd.Series(np.arange(n), index=person.index)
333+
rel_values = rel.to_numpy(dtype=np.int64, copy=False)
334+
household_values = household_id.to_numpy(dtype=np.int64, copy=False)
335+
line_values = line_no.to_numpy(dtype=np.int64, copy=False)
336+
age_values = age.to_numpy(dtype=np.int64, copy=False)
337+
mar_values = mar.to_numpy(dtype=np.int64, copy=False)
338+
reference_codes = np.fromiter(_reference_codes(relationship_system), dtype=np.int64)
339+
spouse_codes = np.fromiter(_spouse_codes(relationship_system), dtype=np.int64)
340+
341+
for _, household_positions in positions.groupby(household_values, sort=False):
342+
household_index = household_positions.to_numpy(dtype=np.int64, copy=False)
343+
household_rel = rel_values[household_index]
344+
reference_positions = household_index[np.isin(household_rel, reference_codes)]
345+
if len(reference_positions):
346+
reference_pos = int(reference_positions[0])
347+
else:
348+
reference_pos = int(
349+
household_index[np.argmin(line_values[household_index])]
350+
)
351+
reference_line = int(line_values[reference_pos])
342352

343-
for _, household in frame.groupby("household_id", sort=False):
344-
reference = household[
345-
household["rel"].isin(_reference_codes(relationship_system))
353+
direct_spouse_positions = household_index[
354+
np.isin(household_rel, spouse_codes) & (mar_values[household_index] == 1)
346355
]
347-
if reference.empty:
348-
reference = household[household["line_no"] == household["line_no"].min()]
349-
reference_index = reference.index[0]
350-
reference_line = int(frame.loc[reference_index, "line_no"])
351-
352-
direct_spouses = household[
353-
household["rel"].isin(_spouse_codes(relationship_system))
354-
& (household["mar"] == 1)
356+
if len(direct_spouse_positions) and mar_values[reference_pos] == 1:
357+
spouse_pos = int(
358+
direct_spouse_positions[np.argmin(line_values[direct_spouse_positions])]
359+
)
360+
spouse_line[reference_pos] = int(line_values[spouse_pos])
361+
spouse_line[spouse_pos] = reference_line
362+
363+
unlinked = household_index[
364+
(mar_values[household_index] == 1)
365+
& (age_values[household_index] >= 18)
366+
& (spouse_line[household_index] <= 0)
355367
]
356-
if not direct_spouses.empty and frame.loc[reference_index, "mar"] == 1:
357-
spouse_index = direct_spouses.sort_values("line_no").index[0]
358-
spouse_line.loc[reference_index] = int(frame.loc[spouse_index, "line_no"])
359-
spouse_line.loc[spouse_index] = reference_line
360-
361-
unlinked = household[
362-
(household["mar"] == 1)
363-
& (household["age"] >= 18)
364-
& (spouse_line.loc[household.index] <= 0)
365-
].copy()
366-
remaining = set(unlinked.index)
367-
for index in sorted(remaining, key=lambda item: frame.loc[item, "line_no"]):
368-
if index not in remaining:
368+
remaining = set(int(position) for position in unlinked)
369+
for position in sorted(remaining, key=lambda item: line_values[item]):
370+
if position not in remaining:
369371
continue
370372
candidate_indexes = [
371-
candidate for candidate in remaining if candidate != index
373+
candidate for candidate in remaining if candidate != position
372374
]
373375
scored_candidates = []
374376
for candidate in candidate_indexes:
375-
score = _spouse_pair_score(
376-
frame.loc[index],
377-
frame.loc[candidate],
378-
relationship_system,
377+
score = _spouse_pair_score_values(
378+
rel_a=rel_values[position],
379+
rel_b=rel_values[candidate],
380+
age_a=age_values[position],
381+
age_b=age_values[candidate],
382+
line_a=line_values[position],
383+
line_b=line_values[candidate],
384+
relationship_system=relationship_system,
379385
)
380386
if score is not None:
381387
scored_candidates.append((score, candidate))
382388
if not scored_candidates:
383389
continue
384-
_, spouse_index = max(scored_candidates)
385-
spouse_line.loc[index] = int(frame.loc[spouse_index, "line_no"])
386-
spouse_line.loc[spouse_index] = int(frame.loc[index, "line_no"])
387-
imputed.loc[[index, spouse_index]] = True
388-
remaining.discard(index)
389-
remaining.discard(spouse_index)
390-
391-
return spouse_line, imputed
390+
_, spouse_pos = max(scored_candidates)
391+
spouse_line[position] = int(line_values[spouse_pos])
392+
spouse_line[spouse_pos] = int(line_values[position])
393+
imputed[[position, spouse_pos]] = True
394+
remaining.discard(position)
395+
remaining.discard(spouse_pos)
396+
397+
return pd.Series(spouse_line, index=person.index), pd.Series(
398+
imputed, index=person.index
399+
)
392400

393401

394402
def _spouse_pair_score(
395403
person_a: pd.Series,
396404
person_b: pd.Series,
397405
relationship_system: str,
398406
) -> tuple[int, int, int] | None:
399-
rel_a = int(person_a["rel"])
400-
rel_b = int(person_b["rel"])
401-
age_gap = abs(int(person_a["age"]) - int(person_b["age"]))
407+
return _spouse_pair_score_values(
408+
rel_a=int(person_a["rel"]),
409+
rel_b=int(person_b["rel"]),
410+
age_a=int(person_a["age"]),
411+
age_b=int(person_b["age"]),
412+
line_a=int(person_a["line_no"]),
413+
line_b=int(person_b["line_no"]),
414+
relationship_system=relationship_system,
415+
)
416+
417+
418+
def _spouse_pair_score_values(
419+
rel_a: int,
420+
rel_b: int,
421+
age_a: int,
422+
age_b: int,
423+
line_a: int,
424+
line_b: int,
425+
relationship_system: str,
426+
) -> tuple[int, int, int] | None:
427+
rel_a = int(rel_a)
428+
rel_b = int(rel_b)
429+
age_gap = abs(int(age_a) - int(age_b))
402430
if age_gap > 20:
403431
return None
404432

@@ -408,9 +436,9 @@ def _spouse_pair_score(
408436
parent_in_law_codes = _parent_in_law_codes(relationship_system)
409437
pair = {rel_a, rel_b}
410438
if pair & child_codes and pair & child_in_law_codes:
411-
return (100, -age_gap, -min(int(person_a["line_no"]), int(person_b["line_no"])))
439+
return (100, -age_gap, -min(int(line_a), int(line_b)))
412440
if pair & parent_codes and pair & parent_in_law_codes:
413-
return (90, -age_gap, -min(int(person_a["line_no"]), int(person_b["line_no"])))
441+
return (90, -age_gap, -min(int(line_a), int(line_b)))
414442
return None
415443

416444

@@ -423,63 +451,76 @@ def _infer_parent_lines(
423451
age: pd.Series,
424452
spouse_line: pd.Series,
425453
) -> tuple[pd.Series, pd.Series, pd.Series]:
426-
parent1 = pd.Series(0, index=person.index, dtype=int)
427-
parent2 = pd.Series(0, index=person.index, dtype=int)
428-
imputed = pd.Series(False, index=person.index, dtype=bool)
429-
frame = pd.DataFrame(
430-
{
431-
"household_id": household_id,
432-
"line_no": line_no,
433-
"age": age,
434-
"rel": rel,
435-
"spouse_line": spouse_line,
436-
},
437-
index=person.index,
454+
n = len(person)
455+
parent1 = np.zeros(n, dtype=np.int64)
456+
parent2 = np.zeros(n, dtype=np.int64)
457+
imputed = np.zeros(n, dtype=bool)
458+
positions = pd.Series(np.arange(n), index=person.index)
459+
rel_values = rel.to_numpy(dtype=np.int64, copy=False)
460+
household_values = household_id.to_numpy(dtype=np.int64, copy=False)
461+
line_values = line_no.to_numpy(dtype=np.int64, copy=False)
462+
age_values = age.to_numpy(dtype=np.int64, copy=False)
463+
spouse_values = spouse_line.to_numpy(dtype=np.int64, copy=False)
464+
reference_codes = np.fromiter(_reference_codes(relationship_system), dtype=np.int64)
465+
own_child_codes = np.fromiter(
466+
_child_codes(relationship_system) | _foster_child_codes(relationship_system),
467+
dtype=np.int64,
468+
)
469+
grandchild_codes = np.fromiter(
470+
_grandchild_codes(relationship_system), dtype=np.int64
471+
)
472+
parent_candidate_codes = np.fromiter(
473+
_child_codes(relationship_system) | _child_in_law_codes(relationship_system),
474+
dtype=np.int64,
438475
)
439476

440-
for _, household in frame.groupby("household_id", sort=False):
441-
reference = household[
442-
household["rel"].isin(_reference_codes(relationship_system))
443-
]
444-
if reference.empty:
445-
reference = household[household["line_no"] == household["line_no"].min()]
446-
reference_index = reference.index[0]
447-
reference_line = int(frame.loc[reference_index, "line_no"])
448-
reference_spouse_line = int(frame.loc[reference_index, "spouse_line"])
449-
450-
own_child_mask = household["rel"].isin(
451-
_child_codes(relationship_system) | _foster_child_codes(relationship_system)
452-
)
453-
for index in household[own_child_mask].index:
454-
parent1.loc[index] = reference_line
455-
if reference_spouse_line > 0:
456-
parent2.loc[index] = reference_spouse_line
457-
458-
grandchild_indexes = household[
459-
household["rel"].isin(_grandchild_codes(relationship_system))
460-
].index
461-
parent_candidates = household[
462-
household["rel"].isin(
463-
_child_codes(relationship_system)
464-
| _child_in_law_codes(relationship_system)
477+
for _, household_positions in positions.groupby(household_values, sort=False):
478+
household_index = household_positions.to_numpy(dtype=np.int64, copy=False)
479+
household_rel = rel_values[household_index]
480+
reference_positions = household_index[np.isin(household_rel, reference_codes)]
481+
if len(reference_positions):
482+
reference_pos = int(reference_positions[0])
483+
else:
484+
reference_pos = int(
485+
household_index[np.argmin(line_values[household_index])]
465486
)
487+
reference_line = int(line_values[reference_pos])
488+
reference_spouse_line = int(spouse_values[reference_pos])
489+
490+
own_child_positions = household_index[np.isin(household_rel, own_child_codes)]
491+
for position in own_child_positions:
492+
parent1[position] = reference_line
493+
if reference_spouse_line > 0:
494+
parent2[position] = reference_spouse_line
495+
496+
grandchild_positions = household_index[np.isin(household_rel, grandchild_codes)]
497+
parent_candidate_positions = household_index[
498+
np.isin(household_rel, parent_candidate_codes)
466499
]
467-
for index in grandchild_indexes:
468-
possible = parent_candidates[
469-
(parent_candidates["age"] - frame.loc[index, "age"]).between(15, 55)
470-
].copy()
471-
if possible.empty:
500+
for position in grandchild_positions:
501+
age_gap = age_values[parent_candidate_positions] - age_values[position]
502+
possible_positions = parent_candidate_positions[
503+
(age_gap >= 15) & (age_gap <= 55)
504+
]
505+
if len(possible_positions) == 0:
472506
continue
473-
possible["score"] = -(possible["age"] - frame.loc[index, "age"] - 30).abs()
474-
selected_index = possible.sort_values(
475-
["score", "age", "line_no"],
476-
ascending=[False, False, True],
477-
).index[0]
478-
selected_line = int(frame.loc[selected_index, "line_no"])
479-
parent1.loc[index] = selected_line
480-
selected_spouse_line = int(frame.loc[selected_index, "spouse_line"])
507+
selected_pos = max(
508+
(int(candidate) for candidate in possible_positions),
509+
key=lambda candidate: (
510+
-abs(age_values[candidate] - age_values[position] - 30),
511+
age_values[candidate],
512+
-line_values[candidate],
513+
),
514+
)
515+
selected_line = int(line_values[selected_pos])
516+
parent1[position] = selected_line
517+
selected_spouse_line = int(spouse_values[selected_pos])
481518
if selected_spouse_line > 0:
482-
parent2.loc[index] = selected_spouse_line
483-
imputed.loc[index] = True
519+
parent2[position] = selected_spouse_line
520+
imputed[position] = True
484521

485-
return parent1, parent2, imputed
522+
return (
523+
pd.Series(parent1, index=person.index),
524+
pd.Series(parent2, index=person.index),
525+
pd.Series(imputed, index=person.index),
526+
)

0 commit comments

Comments
 (0)