|
1 | 1 | """Shared constants, pass-failure helpers, content-policy retry, vocabulary scanning, and length enforcement.""" |
2 | 2 |
|
| 3 | +import difflib |
3 | 4 | import logging |
4 | 5 | import re |
5 | 6 | from collections.abc import Callable |
@@ -397,6 +398,162 @@ def scan_vocabulary_overuse(chapter_text: str, genre: str = "") -> list[str]: |
397 | 398 | return warnings |
398 | 399 |
|
399 | 400 |
|
| 401 | +# --------------------------------------------------------------------------- |
| 402 | +# Named-character detection (for reconciliation against the canonical roster) |
| 403 | +# --------------------------------------------------------------------------- |
| 404 | + |
| 405 | +# Common capitalized English words that are NOT character names. Used to |
| 406 | +# filter sentence-initial and conventional capitalization out of the |
| 407 | +# named-character scanner. Roster-token matching is applied BEFORE this |
| 408 | +# filter, so a character legitimately named "May" or "Crown" is still |
| 409 | +# detected correctly — the stop list only catches spans that have no |
| 410 | +# roster hit. |
| 411 | +_NAMED_CHARACTER_STOP_WORDS: frozenset[str] = frozenset({ |
| 412 | + # Pronouns / sentence-initial |
| 413 | + "i", "he", "she", "they", "it", "we", "you", "me", "him", "her", "them", "us", |
| 414 | + "his", "hers", "theirs", "its", "ours", "yours", "mine", |
| 415 | + "this", "that", "these", "those", "there", "here", |
| 416 | + "then", "when", "where", "why", "how", "what", "who", "whose", "which", |
| 417 | + # Conjunctions / modifiers |
| 418 | + "the", "a", "an", "and", "or", "but", "so", "yet", "as", "if", "while", |
| 419 | + "because", "since", "although", "though", "unless", "until", |
| 420 | + "not", "never", "always", "still", "only", "even", "also", |
| 421 | + "now", "before", "after", "later", "soon", "ago", "once", "twice", |
| 422 | + "yes", "no", "ok", "okay", |
| 423 | + # Days |
| 424 | + "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", |
| 425 | + # Months (excluding May — often a character name; roster check handles it) |
| 426 | + "january", "february", "march", "april", "june", "july", |
| 427 | + "august", "september", "october", "november", "december", |
| 428 | + # Honorifics / titles that commonly appear alone |
| 429 | + "mr", "mrs", "ms", "dr", "sir", "madam", "lord", "lady", |
| 430 | + "captain", "lieutenant", "sergeant", "major", "colonel", "general", |
| 431 | + "professor", "father", "mother", "sister", "brother", "uncle", "aunt", |
| 432 | + "detective", "inspector", "officer", "commander", "admiral", "chief", |
| 433 | + "doctor", "nurse", "reverend", "pastor", |
| 434 | + # Structural / narrative |
| 435 | + "chapter", "book", "part", "act", "scene", "volume", "prologue", "epilogue", |
| 436 | + # Exclamations / religious references |
| 437 | + "god", "christ", "jesus", "heaven", "hell", "lord", |
| 438 | + # Greetings / filler |
| 439 | + "hello", "goodbye", "thanks", "please", |
| 440 | + # Cardinal directions / generic place words |
| 441 | + "north", "south", "east", "west", "street", "road", "avenue", "place", |
| 442 | + "square", "city", "town", "village", "county", "state", "country", |
| 443 | +}) |
| 444 | + |
| 445 | + |
| 446 | +# Candidate-name regex: one to three adjacent capitalized tokens. |
| 447 | +# Matches "Sarah", "Sarah Miller", "John Fitzgerald Kennedy" but does not |
| 448 | +# span apostrophes, hyphens, or punctuation — so "Sarah's" yields "Sarah". |
| 449 | +_NAME_CANDIDATE_RE = re.compile(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2}\b") |
| 450 | + |
| 451 | + |
| 452 | +def _roster_name_tokens(roster: list[dict]) -> set[str]: |
| 453 | + """Return the set of lowercase name tokens from a character roster. |
| 454 | +
|
| 455 | + Each character's ``name`` is split on whitespace; tokens shorter than |
| 456 | + two characters are discarded (they match too many false positives under |
| 457 | + fuzzy matching). |
| 458 | + """ |
| 459 | + tokens: set[str] = set() |
| 460 | + for ch in roster or []: |
| 461 | + if not isinstance(ch, dict): |
| 462 | + continue |
| 463 | + name = str(ch.get("name", "")).strip() |
| 464 | + if not name: |
| 465 | + continue |
| 466 | + for tok in name.split(): |
| 467 | + tok_clean = tok.strip(".,;:'\"").lower() |
| 468 | + if len(tok_clean) >= 2: |
| 469 | + tokens.add(tok_clean) |
| 470 | + return tokens |
| 471 | + |
| 472 | + |
def extract_named_characters(
    chapter_text: str,
    roster: list[dict],
    *,
    min_mentions: int = 2,
    fuzzy_cutoff: float = 0.85,
) -> dict:
    """Detect named characters in chapter prose and classify them against *roster*.

    Pure Python — no LLM call. Uses a capitalized-span regex, a stop-word
    filter, and :func:`difflib.get_close_matches` for variant detection.

    Each capitalized span is processed as follows:

    1. Internal whitespace is collapsed to single spaces, so a name broken
       across a line wrap is counted together with its single-line form.
    2. A span made up entirely of roster tokens is recorded as known as-is
       (this keeps "The Doctor" or "Captain Ahab" intact when the roster
       lists them that way).
    3. Otherwise leading stop words are trimmed ("Then Marcus" and
       "Captain Marcus" both become "Marcus"), so sentence-initial words
       and honorifics neither split a name's count nor let a roster
       honorific mark an unrelated "Captain X" as known.
    4. A span consisting only of stop words is kept as known if any of its
       tokens is a roster token, and dropped otherwise.
    5. The trimmed span is known if any of its tokens is a roster token;
       otherwise it is a candidate unknown, subject to *min_mentions*.

    Parameters
    ----------
    chapter_text: The chapter prose to scan. ``None`` or an empty string
                  yields an empty result.
    roster:       The canonical ``character_list`` (list of dicts with
                  a ``name`` key).
    min_mentions: Minimum number of occurrences of a (trimmed) span before
                  it is reported as an unknown character or a variant.
                  Spans that appear fewer times are treated as likely
                  sentence-initial false positives or throwaway walk-ons.
    fuzzy_cutoff: :mod:`difflib` similarity threshold for variant matching,
                  in the range [0, 1]. Higher = stricter.

    Returns
    -------
    dict with three keys:
        ``known``:    sorted list of capitalized spans that intersect the
                      roster's name tokens (for diagnostic logging).
        ``unknown``:  list of ``(prose_name, count)`` tuples for names with
                      no roster match and at least *min_mentions* occurrences,
                      ordered by descending count, then name.
        ``variants``: list of ``(prose_name, roster_token, count)`` tuples
                      — likely misspellings or diminutives of roster names,
                      in the same order as ``unknown``.
    """
    if not chapter_text:
        return {"known": [], "unknown": [], "variants": []}

    tokens = _roster_name_tokens(roster)
    stop_words = _NAMED_CHARACTER_STOP_WORDS

    known: set[str] = set()
    span_counts: dict[str, int] = {}
    for m in _NAME_CANDIDATE_RE.finditer(chapter_text):
        words = m.group().split()
        lowered = [w.lower() for w in words]

        # Span is exactly roster material: keep it whole, stop words included.
        if all(t in tokens for t in lowered):
            known.add(" ".join(words))
            continue

        # Trim sentence-initial words and honorifics from the front only;
        # trailing words such as "Street" are kept so places stay recognisable.
        start = 0
        while start < len(words) and lowered[start] in stop_words:
            start += 1

        if start == len(words):
            # Nothing but stop words. Still a roster hit if the roster itself
            # uses one of them (e.g. "Doctor" from "The Doctor").
            if any(t in tokens for t in lowered):
                known.add(" ".join(words))
            continue

        span = " ".join(words[start:])
        span_counts[span] = span_counts.get(span, 0) + 1

    unknown_counts: dict[str, int] = {}
    for span, count in span_counts.items():
        if any(t.lower() in tokens for t in span.split()):
            known.add(span)
        elif count >= min_mentions:
            unknown_counts[span] = count

    variants: list[tuple[str, str, int]] = []
    unknowns: list[tuple[str, int]] = []
    roster_token_list = sorted(tokens)
    for span, count in sorted(unknown_counts.items(), key=lambda kv: (-kv[1], kv[0])):
        match_found: str | None = None
        if roster_token_list:
            for t in span.split():
                t_lower = t.lower()
                # A stop word is never evidence of a misspelled roster name.
                if t_lower in stop_words:
                    continue
                close = difflib.get_close_matches(
                    t_lower, roster_token_list, n=1, cutoff=fuzzy_cutoff,
                )
                if close:
                    match_found = close[0]
                    break
        if match_found is not None:
            variants.append((span, match_found, count))
        else:
            unknowns.append((span, count))

    return {
        "known": sorted(known),
        "unknown": unknowns,
        "variants": variants,
    }
| 555 | + |
| 556 | + |
400 | 557 | # --------------------------------------------------------------------------- |
401 | 558 | # Chapter length enforcement |
402 | 559 | # --------------------------------------------------------------------------- |
|
0 commit comments