Skip to content

Commit 57b9924

Browse files
authored
Merge pull request #744 from PlanExeOrg/napkin-math/loosen-quote-match
napkin-math(compress): paraphrase-tolerant quote verification
2 parents cd62982 + 9521b68 commit 57b9924

2 files changed

Lines changed: 147 additions & 1 deletion

File tree

worker_plan/worker_plan_internal/parameter_extraction/compress_report_section.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -918,12 +918,46 @@ def normalise_for_quote_match(text: str) -> str:
918918
return " ".join(text.split())
919919

920920

921+
# Matches maximal runs of Unicode word characters (letters, digits and
# underscore); punctuation and whitespace act as token separators.
WORD_TOKEN_PATTERN: re.Pattern[str] = re.compile(r"\w+", re.UNICODE)
922+
923+
# Quotes with fewer tokens than this are not eligible for the token-overlap
# fallback in quote_is_in_source; only a verbatim substring match verifies them.
QUOTE_MATCH_MIN_TOKENS: int = 3
924+
925+
926+
def tokenize_for_quote_match(text: str) -> list[str]:
    """Return the word tokens of *text* used by the quote-match fallback.

    The text is first run through ``normalise_for_quote_match`` and the
    result is scanned with ``WORD_TOKEN_PATTERN``, so a token is whatever
    the Unicode word class treats as a run of word characters. Punctuation
    separates tokens: a figure such as ``$75,000`` yields ``["75", "000"]``,
    and it does so identically for the quote and for the source.
    """
    normalised = normalise_for_quote_match(text)
    tokens = WORD_TOKEN_PATTERN.findall(normalised)
    return tokens
934+
935+
921936
def _numeric_literals(text: str) -> set[str]:
    """Return the set of whole numeric literals that occur in *text*.

    A literal is either a comma-grouped number with an optional decimal part
    (``75,000`` or ``1,250.50``) or a plain digit run with an optional decimal
    part (``80`` or ``3.5``). Keeping the groups together is what lets the
    caller tell ``80,000`` apart from a source that merely contains ``80``
    and ``000`` in unrelated places.
    """
    return set(re.findall(r"\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?", text))


def quote_is_in_source(quote: str, section_markdown: str) -> bool:
    """Check that the LLM's ``source_quote`` is grounded in the section.

    Verification proceeds in two stages:

    1. Fast path: the normalised quote is a substring of the normalised
       section. This accepts verbatim quotes of any length.
    2. Fallback for paraphrases (dropped or reordered words): every token of
       the quote must occur in the section's token set, and every whole
       numeric literal of the quote must occur as a whole numeric literal in
       the section. The token comparison ignores order and multiplicity, so
       it tolerates reordering and elision but rejects any substituted word.
       Quotes with fewer than ``QUOTE_MATCH_MIN_TOKENS`` tokens never reach
       the fallback, which avoids trivial overlap on a large source.

    Args:
        quote: The quote the LLM claims to have taken from the section.
        section_markdown: The source text the quote is checked against.

    Returns:
        ``True`` when the quote is grounded in the section, ``False``
        otherwise. An empty quote, a quote that is blank after normalisation,
        and the ``NOT IN SOURCE`` sentinel (any case, surrounding whitespace
        ignored) are never grounded.
    """
    if not quote:
        return False
    if quote.strip().upper() == "NOT IN SOURCE":
        return False

    norm_quote = normalise_for_quote_match(quote)
    # A whitespace-only quote normalises to "" and the empty string is a
    # substring of everything; without this guard it would verify.
    if not norm_quote:
        return False

    norm_source = normalise_for_quote_match(section_markdown)
    if norm_quote in norm_source:
        return True

    quote_tokens = tokenize_for_quote_match(quote)
    if len(quote_tokens) < QUOTE_MATCH_MIN_TOKENS:
        return False

    # The word tokenizer splits "$80,000" into "80" and "000"; those fragments
    # can occur independently in the source. Anchor complete numbers so a
    # swapped figure cannot be assembled from unrelated pieces.
    if not _numeric_literals(norm_quote) <= _numeric_literals(norm_source):
        return False

    source_tokens = set(tokenize_for_quote_match(section_markdown))
    return set(quote_tokens) <= source_tokens
927961

928962

929963
def numeric_density_bonus(text: str) -> float:

worker_plan/worker_plan_internal/parameter_extraction/tests/test_compress_report_section.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,3 +445,115 @@ def test_merge_second_pass_items_preserves_emit_order() -> None:
445445
second = [_si("c", quote="q3"), _si("d", quote="q4"), _si("e", quote="q5")]
446446
merged, _ = merge_second_pass_items(first, second)
447447
assert [item.line_english for item in merged] == ["a", "b", "c", "d", "e"]
448+
449+
450+
def test_quote_is_in_source_substring_fast_path() -> None:
    """A quote copied verbatim from the source verifies through the
    substring check, also when quote and source differ only in the dash
    character that the normalisation is expected to unify."""
    from worker_plan_internal.parameter_extraction.compress_report_section import (
        quote_is_in_source,
    )

    section = "If the lowest qualified bid for OPC UA middleware exceeds $75,000, escalate."
    verbatim_quote = "lowest qualified bid for OPC UA middleware"
    assert quote_is_in_source(verbatim_quote, section) is True

    # Dash variant: em-dash in quote, hyphen in source.
    quote_with_em_dash = "qualified bid—for OPC UA middleware"
    source_with_hyphen = "qualified bid-for OPC UA middleware"
    assert quote_is_in_source(quote_with_em_dash, source_with_hyphen) is True
465+
466+
467+
def test_quote_is_in_source_rejects_empty_and_sentinel() -> None:
    """Neither the empty string nor the explicit ``NOT IN SOURCE`` marker
    counts as grounded, even when the marker text occurs in the source."""
    from worker_plan_internal.parameter_extraction.compress_report_section import (
        quote_is_in_source,
    )

    empty_verdict = quote_is_in_source("", "anything goes here")
    assert empty_verdict is False

    sentinel_verdict = quote_is_in_source(
        "NOT IN SOURCE",
        "NOT IN SOURCE appears literally here",
    )
    assert sentinel_verdict is False
475+
476+
477+
def test_quote_is_in_source_accepts_paraphrase_token_overlap() -> None:
    """A paraphrase that reorders or omits words, yet takes every token from
    the source, is accepted by the token-overlap fallback. The substring
    check alone cannot accept it because the word order differs."""
    from worker_plan_internal.parameter_extraction.compress_report_section import (
        quote_is_in_source,
    )

    section = (
        "If the lowest qualified bid for OPC UA middleware exceeds $75,000, "
        "the project re-scopes to a single integration vendor."
    )
    # "middleware" now precedes "bid" and "for OPC UA" has been left out.
    paraphrase = "lowest qualified middleware bid exceeds $75,000"
    assert quote_is_in_source(paraphrase, section) is True
495+
496+
497+
def test_quote_is_in_source_rejects_hallucinated_number() -> None:
    """Numbers must match the source exactly: a paraphrase that keeps the
    wording but changes the figure is not grounded."""
    from worker_plan_internal.parameter_extraction.compress_report_section import (
        quote_is_in_source,
    )

    section = "lowest qualified bid for OPC UA middleware exceeds $75,000"
    # Every word is taken from the source; only 75,000 became 80,000.
    altered_figure = "lowest qualified middleware bid exceeds $80,000"
    assert quote_is_in_source(altered_figure, section) is False
510+
511+
512+
def test_quote_is_in_source_rejects_substituted_content_word() -> None:
    """One substituted word is enough to fail verification. ``highest`` does
    not occur in the source, so the quote is rejected although its six
    remaining tokens all do."""
    from worker_plan_internal.parameter_extraction.compress_report_section import (
        quote_is_in_source,
    )

    section = "lowest qualified bid for OPC UA middleware exceeds $75,000"
    inverted_meaning = "highest qualified middleware bid exceeds $75,000"
    assert quote_is_in_source(inverted_meaning, section) is False
525+
526+
527+
def test_quote_is_in_source_rejects_substitution_in_long_quote() -> None:
    """Length does not buy leniency: a long quote with a single substituted
    word still fails. Requiring every token to be present rejects it on the
    one missing token, where a fractional threshold such as ≥90% overlap
    would have accepted it."""
    from worker_plan_internal.parameter_extraction.compress_report_section import (
        quote_is_in_source,
    )

    section = (
        "If the lowest qualified bid for OPC UA middleware exceeds $75,000, "
        "then the project reverts to the current rule-based integration "
        "vendor and escalates to the steering committee."
    )
    # Only ``lowest`` is swapped for ``highest``; every other token of the
    # quote still occurs in the source. The swap inverts the meaning, so the
    # quote must not verify.
    long_quote = (
        "highest qualified bid for OPC UA middleware exceeds $75,000 "
        "then project reverts to integration vendor"
    )
    verdict = quote_is_in_source(long_quote, section)
    assert verdict is False
549+
550+
551+
def test_quote_is_in_source_rejects_short_unrelated_quote() -> None:
    """Quotes below the minimum token count are excluded from the
    token-overlap fallback, because on a large source such short quotes
    would match by coincidence far too easily."""
    from worker_plan_internal.parameter_extraction.compress_report_section import (
        quote_is_in_source,
    )

    section = "lowest qualified bid for OPC UA middleware exceeds $75,000"
    # Both words occur in the source, but not adjacently and in this order.
    two_token_quote = "middleware bid"
    assert quote_is_in_source(two_token_quote, section) is False

0 commit comments

Comments
 (0)