Merge pull request #744 from PlanExeOrg/napkin-math/loosen-quote-match

neoneye · web-flow · commit 57b992492a37 · 2026-05-21T15:24:37.000+02:00
napkin-math(compress): paraphrase-tolerant quote verification
diff --git a/worker_plan/worker_plan_internal/parameter_extraction/compress_report_section.py b/worker_plan/worker_plan_internal/parameter_extraction/compress_report_section.py
@@ -918,12 +918,46 @@ def normalise_for_quote_match(text: str) -> str:
     return " ".join(text.split())
 
 
+WORD_TOKEN_PATTERN: re.Pattern[str] = re.compile(r"\w+", re.UNICODE)
+
+QUOTE_MATCH_MIN_TOKENS: int = 3
+
+
+def tokenize_for_quote_match(text: str) -> list[str]:
+    """Tokenize on Unicode word boundaries after the standard normalisation.
+
+    Language- and domain-neutral: it splits on whatever the Unicode word
+    class considers a word character. Numeric tokens like ``$75,000`` split
+    into ``["75", "000"]`` consistently in both quote and source.
+    """
+    return WORD_TOKEN_PATTERN.findall(normalise_for_quote_match(text))
+
+
 def quote_is_in_source(quote: str, section_markdown: str) -> bool:
+    """Check that the LLM's ``source_quote`` is grounded in the section.
+
+    Fast path is the existing substring check after normalisation. When the
+    LLM paraphrases (drops intermediate words, reorders the noun phrase),
+    that fast path misses even though every content token came from the
+    source. The fallback requires every quote token to appear in the source
+    token set, which accepts reordering and elision but rejects any
+    substituted word — including numeric substitutions, since digit-bearing
+    tokens fall under the same all-tokens rule. A short-quote floor avoids
+    trivial overlap on a large source.
+    """
     if not quote:
         return False
     if quote.strip().upper() == "NOT IN SOURCE":
         return False
-    return normalise_for_quote_match(quote) in normalise_for_quote_match(section_markdown)
+    norm_quote = normalise_for_quote_match(quote)
+    norm_source = normalise_for_quote_match(section_markdown)
+    if norm_quote in norm_source:
+        return True
+    quote_tokens = tokenize_for_quote_match(quote)
+    if len(quote_tokens) < QUOTE_MATCH_MIN_TOKENS:
+        return False
+    source_tokens = set(tokenize_for_quote_match(section_markdown))
+    return all(tok in source_tokens for tok in quote_tokens)
 
 
 def numeric_density_bonus(text: str) -> float:
diff --git a/worker_plan/worker_plan_internal/parameter_extraction/tests/test_compress_report_section.py b/worker_plan/worker_plan_internal/parameter_extraction/tests/test_compress_report_section.py
@@ -445,3 +445,115 @@ def test_merge_second_pass_items_preserves_emit_order() -> None:
     second = [_si("c", quote="q3"), _si("d", quote="q4"), _si("e", quote="q5")]
     merged, _ = merge_second_pass_items(first, second)
     assert [item.line_english for item in merged] == ["a", "b", "c", "d", "e"]
+
+
+def test_quote_is_in_source_substring_fast_path() -> None:
+    """Verbatim substring match remains the primary signal: a quote copied
+    cleanly from the source should still verify, including across the
+    existing whitespace/dash normalisation."""
+    from worker_plan_internal.parameter_extraction.compress_report_section import (
+        quote_is_in_source,
+    )
+
+    source = "If the lowest qualified bid for OPC UA middleware exceeds $75,000, escalate."
+    assert quote_is_in_source("lowest qualified bid for OPC UA middleware", source) is True
+    # Dash variant: em-dash in quote, hyphen in source.
+    assert quote_is_in_source(
+        "qualified bid—for OPC UA middleware",
+        "qualified bid-for OPC UA middleware",
+    ) is True
+
+
+def test_quote_is_in_source_rejects_empty_and_sentinel() -> None:
+    """Empty and the LLM's explicit absence marker are never grounded."""
+    from worker_plan_internal.parameter_extraction.compress_report_section import (
+        quote_is_in_source,
+    )
+
+    assert quote_is_in_source("", "anything goes here") is False
+    assert quote_is_in_source("NOT IN SOURCE", "NOT IN SOURCE appears literally here") is False
+
+
+def test_quote_is_in_source_accepts_paraphrase_token_overlap() -> None:
+    """When the LLM reorders or drops intermediate words while still
+    drawing every content token from the source, the token-overlap fallback
+    should accept it. Substring match would miss it because the word order
+    differs."""
+    from worker_plan_internal.parameter_extraction.compress_report_section import (
+        quote_is_in_source,
+    )
+
+    source = (
+        "If the lowest qualified bid for OPC UA middleware exceeds $75,000, "
+        "the project re-scopes to a single integration vendor."
+    )
+    # Reordered: "middleware" moved before "bid", "for OPC UA" dropped.
+    assert quote_is_in_source(
+        "lowest qualified middleware bid exceeds $75,000",
+        source,
+    ) is True
+
+
+def test_quote_is_in_source_rejects_hallucinated_number() -> None:
+    """Digit-bearing tokens are anchored exactly. A quote that paraphrases
+    the text but swaps the numeric value is not grounded."""
+    from worker_plan_internal.parameter_extraction.compress_report_section import (
+        quote_is_in_source,
+    )
+
+    source = "lowest qualified bid for OPC UA middleware exceeds $75,000"
+    # 80,000 ≠ 75,000 — must fail even though every other token is present.
+    assert quote_is_in_source(
+        "lowest qualified middleware bid exceeds $80,000",
+        source,
+    ) is False
+
+
+def test_quote_is_in_source_rejects_substituted_content_word() -> None:
+    """All-tokens-in-source rule blocks single-word substitutions that invert
+    meaning while keeping most surface tokens. ``highest`` is not in the
+    source, so even a six-other-tokens overlap fails."""
+    from worker_plan_internal.parameter_extraction.compress_report_section import (
+        quote_is_in_source,
+    )
+
+    source = "lowest qualified bid for OPC UA middleware exceeds $75,000"
+    assert quote_is_in_source(
+        "highest qualified middleware bid exceeds $75,000",
+        source,
+    ) is False
+
+
+def test_quote_is_in_source_rejects_substitution_in_long_quote() -> None:
+    """A longer quote with one substituted content word must still fail —
+    high fractional overlap is not a free pass. The all-tokens rule rejects
+    on the single missing token regardless of quote length, which a
+    fractional threshold like ≥90% would let through."""
+    from worker_plan_internal.parameter_extraction.compress_report_section import (
+        quote_is_in_source,
+    )
+
+    source = (
+        "If the lowest qualified bid for OPC UA middleware exceeds $75,000, "
+        "then the project reverts to the current rule-based integration "
+        "vendor and escalates to the steering committee."
+    )
+    # Same 14-token clause; only ``lowest`` swapped for ``highest``. 13/14 of
+    # the tokens still appear in source, but the substituted word inverts
+    # the meaning and must not verify.
+    quote = (
+        "highest qualified bid for OPC UA middleware exceeds $75,000 "
+        "then project reverts to integration vendor"
+    )
+    assert quote_is_in_source(quote, source) is False
+
+
+def test_quote_is_in_source_rejects_short_unrelated_quote() -> None:
+    """Two- or one-token quotes do not get the token-overlap fallback —
+    too easy to satisfy by coincidence on a large source."""
+    from worker_plan_internal.parameter_extraction.compress_report_section import (
+        quote_is_in_source,
+    )
+
+    source = "lowest qualified bid for OPC UA middleware exceeds $75,000"
+    assert quote_is_in_source("middleware bid", source) is False