@@ -324,81 +324,109 @@ def _infer_spouse_lines(
324324 line_no : pd .Series ,
325325 age : pd .Series ,
326326) -> tuple [pd .Series , pd .Series ]:
327- spouse_line = pd .Series (0 , index = person .index , dtype = int )
328- imputed = pd .Series (False , index = person .index , dtype = bool )
329327 mar = _numeric (person , "MAR" ).astype (int )
330328
331- frame = pd .DataFrame (
332- {
333- "household_id" : household_id ,
334- "line_no" : line_no ,
335- "age" : age ,
336- "rel" : rel ,
337- "mar" : mar ,
338- "sex" : _numeric (person , "SEX" ).astype (int ),
339- },
340- index = person .index ,
341- )
329+ n = len (person )
330+ spouse_line = np .zeros (n , dtype = np .int64 )
331+ imputed = np .zeros (n , dtype = bool )
332+ positions = pd .Series (np .arange (n ), index = person .index )
333+ rel_values = rel .to_numpy (dtype = np .int64 , copy = False )
334+ household_values = household_id .to_numpy (dtype = np .int64 , copy = False )
335+ line_values = line_no .to_numpy (dtype = np .int64 , copy = False )
336+ age_values = age .to_numpy (dtype = np .int64 , copy = False )
337+ mar_values = mar .to_numpy (dtype = np .int64 , copy = False )
338+ reference_codes = np .fromiter (_reference_codes (relationship_system ), dtype = np .int64 )
339+ spouse_codes = np .fromiter (_spouse_codes (relationship_system ), dtype = np .int64 )
340+
341+ for _ , household_positions in positions .groupby (household_values , sort = False ):
342+ household_index = household_positions .to_numpy (dtype = np .int64 , copy = False )
343+ household_rel = rel_values [household_index ]
344+ reference_positions = household_index [np .isin (household_rel , reference_codes )]
345+ if len (reference_positions ):
346+ reference_pos = int (reference_positions [0 ])
347+ else :
348+ reference_pos = int (
349+ household_index [np .argmin (line_values [household_index ])]
350+ )
351+ reference_line = int (line_values [reference_pos ])
342352
343- for _ , household in frame .groupby ("household_id" , sort = False ):
344- reference = household [
345- household ["rel" ].isin (_reference_codes (relationship_system ))
353+ direct_spouse_positions = household_index [
354+ np .isin (household_rel , spouse_codes ) & (mar_values [household_index ] == 1 )
346355 ]
347- if reference .empty :
348- reference = household [household ["line_no" ] == household ["line_no" ].min ()]
349- reference_index = reference .index [0 ]
350- reference_line = int (frame .loc [reference_index , "line_no" ])
351-
352- direct_spouses = household [
353- household ["rel" ].isin (_spouse_codes (relationship_system ))
354- & (household ["mar" ] == 1 )
356+ if len (direct_spouse_positions ) and mar_values [reference_pos ] == 1 :
357+ spouse_pos = int (
358+ direct_spouse_positions [np .argmin (line_values [direct_spouse_positions ])]
359+ )
360+ spouse_line [reference_pos ] = int (line_values [spouse_pos ])
361+ spouse_line [spouse_pos ] = reference_line
362+
363+ unlinked = household_index [
364+ (mar_values [household_index ] == 1 )
365+ & (age_values [household_index ] >= 18 )
366+ & (spouse_line [household_index ] <= 0 )
355367 ]
356- if not direct_spouses .empty and frame .loc [reference_index , "mar" ] == 1 :
357- spouse_index = direct_spouses .sort_values ("line_no" ).index [0 ]
358- spouse_line .loc [reference_index ] = int (frame .loc [spouse_index , "line_no" ])
359- spouse_line .loc [spouse_index ] = reference_line
360-
361- unlinked = household [
362- (household ["mar" ] == 1 )
363- & (household ["age" ] >= 18 )
364- & (spouse_line .loc [household .index ] <= 0 )
365- ].copy ()
366- remaining = set (unlinked .index )
367- for index in sorted (remaining , key = lambda item : frame .loc [item , "line_no" ]):
368- if index not in remaining :
368+ remaining = set (int (position ) for position in unlinked )
369+ for position in sorted (remaining , key = lambda item : line_values [item ]):
370+ if position not in remaining :
369371 continue
370372 candidate_indexes = [
371- candidate for candidate in remaining if candidate != index
373+ candidate for candidate in remaining if candidate != position
372374 ]
373375 scored_candidates = []
374376 for candidate in candidate_indexes :
375- score = _spouse_pair_score (
376- frame .loc [index ],
377- frame .loc [candidate ],
378- relationship_system ,
377+ score = _spouse_pair_score_values (
378+ rel_a = rel_values [position ],
379+ rel_b = rel_values [candidate ],
380+ age_a = age_values [position ],
381+ age_b = age_values [candidate ],
382+ line_a = line_values [position ],
383+ line_b = line_values [candidate ],
384+ relationship_system = relationship_system ,
379385 )
380386 if score is not None :
381387 scored_candidates .append ((score , candidate ))
382388 if not scored_candidates :
383389 continue
384- _ , spouse_index = max (scored_candidates )
385- spouse_line .loc [index ] = int (frame .loc [spouse_index , "line_no" ])
386- spouse_line .loc [spouse_index ] = int (frame .loc [index , "line_no" ])
387- imputed .loc [[index , spouse_index ]] = True
388- remaining .discard (index )
389- remaining .discard (spouse_index )
390-
391- return spouse_line , imputed
390+ _ , spouse_pos = max (scored_candidates )
391+ spouse_line [position ] = int (line_values [spouse_pos ])
392+ spouse_line [spouse_pos ] = int (line_values [position ])
393+ imputed [[position , spouse_pos ]] = True
394+ remaining .discard (position )
395+ remaining .discard (spouse_pos )
396+
397+ return pd .Series (spouse_line , index = person .index ), pd .Series (
398+ imputed , index = person .index
399+ )
392400
393401
394402def _spouse_pair_score (
395403 person_a : pd .Series ,
396404 person_b : pd .Series ,
397405 relationship_system : str ,
398406) -> tuple [int , int , int ] | None :
399- rel_a = int (person_a ["rel" ])
400- rel_b = int (person_b ["rel" ])
401- age_gap = abs (int (person_a ["age" ]) - int (person_b ["age" ]))
407+ return _spouse_pair_score_values (
408+ rel_a = int (person_a ["rel" ]),
409+ rel_b = int (person_b ["rel" ]),
410+ age_a = int (person_a ["age" ]),
411+ age_b = int (person_b ["age" ]),
412+ line_a = int (person_a ["line_no" ]),
413+ line_b = int (person_b ["line_no" ]),
414+ relationship_system = relationship_system ,
415+ )
416+
417+
418+ def _spouse_pair_score_values (
419+ rel_a : int ,
420+ rel_b : int ,
421+ age_a : int ,
422+ age_b : int ,
423+ line_a : int ,
424+ line_b : int ,
425+ relationship_system : str ,
426+ ) -> tuple [int , int , int ] | None :
427+ rel_a = int (rel_a )
428+ rel_b = int (rel_b )
429+ age_gap = abs (int (age_a ) - int (age_b ))
402430 if age_gap > 20 :
403431 return None
404432
@@ -408,9 +436,9 @@ def _spouse_pair_score(
408436 parent_in_law_codes = _parent_in_law_codes (relationship_system )
409437 pair = {rel_a , rel_b }
410438 if pair & child_codes and pair & child_in_law_codes :
411- return (100 , - age_gap , - min (int (person_a [ "line_no" ] ), int (person_b [ "line_no" ] )))
439+ return (100 , - age_gap , - min (int (line_a ), int (line_b )))
412440 if pair & parent_codes and pair & parent_in_law_codes :
413- return (90 , - age_gap , - min (int (person_a [ "line_no" ] ), int (person_b [ "line_no" ] )))
441+ return (90 , - age_gap , - min (int (line_a ), int (line_b )))
414442 return None
415443
416444
@@ -423,63 +451,76 @@ def _infer_parent_lines(
423451 age : pd .Series ,
424452 spouse_line : pd .Series ,
425453) -> tuple [pd .Series , pd .Series , pd .Series ]:
426- parent1 = pd .Series (0 , index = person .index , dtype = int )
427- parent2 = pd .Series (0 , index = person .index , dtype = int )
428- imputed = pd .Series (False , index = person .index , dtype = bool )
429- frame = pd .DataFrame (
430- {
431- "household_id" : household_id ,
432- "line_no" : line_no ,
433- "age" : age ,
434- "rel" : rel ,
435- "spouse_line" : spouse_line ,
436- },
437- index = person .index ,
454+ n = len (person )
455+ parent1 = np .zeros (n , dtype = np .int64 )
456+ parent2 = np .zeros (n , dtype = np .int64 )
457+ imputed = np .zeros (n , dtype = bool )
458+ positions = pd .Series (np .arange (n ), index = person .index )
459+ rel_values = rel .to_numpy (dtype = np .int64 , copy = False )
460+ household_values = household_id .to_numpy (dtype = np .int64 , copy = False )
461+ line_values = line_no .to_numpy (dtype = np .int64 , copy = False )
462+ age_values = age .to_numpy (dtype = np .int64 , copy = False )
463+ spouse_values = spouse_line .to_numpy (dtype = np .int64 , copy = False )
464+ reference_codes = np .fromiter (_reference_codes (relationship_system ), dtype = np .int64 )
465+ own_child_codes = np .fromiter (
466+ _child_codes (relationship_system ) | _foster_child_codes (relationship_system ),
467+ dtype = np .int64 ,
468+ )
469+ grandchild_codes = np .fromiter (
470+ _grandchild_codes (relationship_system ), dtype = np .int64
471+ )
472+ parent_candidate_codes = np .fromiter (
473+ _child_codes (relationship_system ) | _child_in_law_codes (relationship_system ),
474+ dtype = np .int64 ,
438475 )
439476
440- for _ , household in frame .groupby ("household_id" , sort = False ):
441- reference = household [
442- household ["rel" ].isin (_reference_codes (relationship_system ))
443- ]
444- if reference .empty :
445- reference = household [household ["line_no" ] == household ["line_no" ].min ()]
446- reference_index = reference .index [0 ]
447- reference_line = int (frame .loc [reference_index , "line_no" ])
448- reference_spouse_line = int (frame .loc [reference_index , "spouse_line" ])
449-
450- own_child_mask = household ["rel" ].isin (
451- _child_codes (relationship_system ) | _foster_child_codes (relationship_system )
452- )
453- for index in household [own_child_mask ].index :
454- parent1 .loc [index ] = reference_line
455- if reference_spouse_line > 0 :
456- parent2 .loc [index ] = reference_spouse_line
457-
458- grandchild_indexes = household [
459- household ["rel" ].isin (_grandchild_codes (relationship_system ))
460- ].index
461- parent_candidates = household [
462- household ["rel" ].isin (
463- _child_codes (relationship_system )
464- | _child_in_law_codes (relationship_system )
477+ for _ , household_positions in positions .groupby (household_values , sort = False ):
478+ household_index = household_positions .to_numpy (dtype = np .int64 , copy = False )
479+ household_rel = rel_values [household_index ]
480+ reference_positions = household_index [np .isin (household_rel , reference_codes )]
481+ if len (reference_positions ):
482+ reference_pos = int (reference_positions [0 ])
483+ else :
484+ reference_pos = int (
485+ household_index [np .argmin (line_values [household_index ])]
465486 )
487+ reference_line = int (line_values [reference_pos ])
488+ reference_spouse_line = int (spouse_values [reference_pos ])
489+
490+ own_child_positions = household_index [np .isin (household_rel , own_child_codes )]
491+ for position in own_child_positions :
492+ parent1 [position ] = reference_line
493+ if reference_spouse_line > 0 :
494+ parent2 [position ] = reference_spouse_line
495+
496+ grandchild_positions = household_index [np .isin (household_rel , grandchild_codes )]
497+ parent_candidate_positions = household_index [
498+ np .isin (household_rel , parent_candidate_codes )
466499 ]
467- for index in grandchild_indexes :
468- possible = parent_candidates [
469- (parent_candidates ["age" ] - frame .loc [index , "age" ]).between (15 , 55 )
470- ].copy ()
471- if possible .empty :
500+ for position in grandchild_positions :
501+ age_gap = age_values [parent_candidate_positions ] - age_values [position ]
502+ possible_positions = parent_candidate_positions [
503+ (age_gap >= 15 ) & (age_gap <= 55 )
504+ ]
505+ if len (possible_positions ) == 0 :
472506 continue
473- possible ["score" ] = - (possible ["age" ] - frame .loc [index , "age" ] - 30 ).abs ()
474- selected_index = possible .sort_values (
475- ["score" , "age" , "line_no" ],
476- ascending = [False , False , True ],
477- ).index [0 ]
478- selected_line = int (frame .loc [selected_index , "line_no" ])
479- parent1 .loc [index ] = selected_line
480- selected_spouse_line = int (frame .loc [selected_index , "spouse_line" ])
507+ selected_pos = max (
508+ (int (candidate ) for candidate in possible_positions ),
509+ key = lambda candidate : (
510+ - abs (age_values [candidate ] - age_values [position ] - 30 ),
511+ age_values [candidate ],
512+ - line_values [candidate ],
513+ ),
514+ )
515+ selected_line = int (line_values [selected_pos ])
516+ parent1 [position ] = selected_line
517+ selected_spouse_line = int (spouse_values [selected_pos ])
481518 if selected_spouse_line > 0 :
482- parent2 . loc [ index ] = selected_spouse_line
483- imputed . loc [ index ] = True
519+ parent2 [ position ] = selected_spouse_line
520+ imputed [ position ] = True
484521
485- return parent1 , parent2 , imputed
522+ return (
523+ pd .Series (parent1 , index = person .index ),
524+ pd .Series (parent2 , index = person .index ),
525+ pd .Series (imputed , index = person .index ),
526+ )
0 commit comments