@@ -548,17 +548,27 @@ def extract_named_characters(
548548
549549 variants : list [tuple [str , str , int ]] = []
550550 unknowns : list [tuple [str , int ]] = []
551- roster_token_list = sorted (flat_tokens )
552551 for span , count in sorted (unknown_counts .items (), key = lambda kv : (- kv [1 ], kv [0 ])):
552+ span_token_list = [t .strip (".,;:'\" " ).lower () for t in span .split ()]
553553 match_found : str | None = None
554- if roster_token_list :
555- for t in span .split ():
556- close = difflib .get_close_matches (
557- t .lower (), roster_token_list , n = 1 , cutoff = fuzzy_cutoff ,
554+ # Variant detection: a prose span is a variant of a single roster
555+ # character only when EVERY one of its tokens fuzzy-matches some
556+ # token in that character's name. "Marcus Fellowes" is not a
557+ # variant of "Marcus Reid" because "fellowes" has no fuzzy match
558+ # to "reid"; matching on a shared first name alone would reopen
559+ # the cross-character false-positive bug.
560+ for char_set in per_char_tokens :
561+ char_token_list = sorted (char_set )
562+ close_tokens : list [str ] = []
563+ for tok in span_token_list :
564+ hits = difflib .get_close_matches (
565+ tok , char_token_list , n = 1 , cutoff = fuzzy_cutoff ,
558566 )
559- if close :
560- match_found = close [0 ]
561- break
567+ if hits :
568+ close_tokens .append (hits [0 ])
569+ if close_tokens and len (close_tokens ) == len (span_token_list ):
570+ match_found = close_tokens [0 ]
571+ break
562572 if match_found is not None :
563573 variants .append ((span , match_found , count ))
564574 else :
0 commit comments