@@ -445,3 +445,115 @@ def test_merge_second_pass_items_preserves_emit_order() -> None:
445445 second = [_si ("c" , quote = "q3" ), _si ("d" , quote = "q4" ), _si ("e" , quote = "q5" )]
446446 merged , _ = merge_second_pass_items (first , second )
447447 assert [item .line_english for item in merged ] == ["a" , "b" , "c" , "d" , "e" ]
448+
449+
def test_quote_is_in_source_substring_fast_path() -> None:
    """A quote lifted verbatim from the source must verify.

    Exercises the plain substring case, plus a quote that differs from the
    source only in dash style (em-dash in the quote, hyphen in the source).
    """
    from worker_plan_internal.parameter_extraction.compress_report_section import (
        quote_is_in_source,
    )

    document = "If the lowest qualified bid for OPC UA middleware exceeds $75,000, escalate."
    verbatim_verdict = quote_is_in_source(
        "lowest qualified bid for OPC UA middleware", document
    )
    assert verbatim_verdict is True

    # Same words on both sides; only the dash character differs.
    em_dash_quote = "qualified bid—for OPC UA middleware"
    hyphen_source = "qualified bid-for OPC UA middleware"
    dash_verdict = quote_is_in_source(em_dash_quote, hyphen_source)
    assert dash_verdict is True
465+
466+
def test_quote_is_in_source_rejects_empty_and_sentinel() -> None:
    """An empty quote and the explicit absence marker never count as grounded."""
    from worker_plan_internal.parameter_extraction.compress_report_section import (
        quote_is_in_source,
    )

    empty_verdict = quote_is_in_source("", "anything goes here")
    assert empty_verdict is False

    # The marker must be rejected even when the source contains it literally.
    sentinel_verdict = quote_is_in_source(
        "NOT IN SOURCE", "NOT IN SOURCE appears literally here"
    )
    assert sentinel_verdict is False
475+
476+
def test_quote_is_in_source_accepts_paraphrase_token_overlap() -> None:
    """A reordered or abridged quote built only from source words must verify.

    The quote is not a substring of the source because the word order
    differs, so acceptance has to come from the token-overlap fallback.
    """
    from worker_plan_internal.parameter_extraction.compress_report_section import (
        quote_is_in_source,
    )

    document = (
        "If the lowest qualified bid for OPC UA middleware exceeds $75,000, "
        "the project re-scopes to a single integration vendor."
    )
    # "middleware" now precedes "bid", and "for OPC UA" has been dropped.
    paraphrase = "lowest qualified middleware bid exceeds $75,000"

    verdict = quote_is_in_source(paraphrase, document)
    assert verdict is True
495+
496+
def test_quote_is_in_source_rejects_hallucinated_number() -> None:
    """A paraphrase that changes the numeric value must not verify.

    Tokens containing digits have to match the source exactly.
    """
    from worker_plan_internal.parameter_extraction.compress_report_section import (
        quote_is_in_source,
    )

    document = "lowest qualified bid for OPC UA middleware exceeds $75,000"
    # Every word is taken from the source, but the amount is 80,000, not 75,000.
    wrong_amount_quote = "lowest qualified middleware bid exceeds $80,000"

    verdict = quote_is_in_source(wrong_amount_quote, document)
    assert verdict is False
510+
511+
def test_quote_is_in_source_rejects_substituted_content_word() -> None:
    """One swapped-in word that is absent from the source must sink the quote.

    ``highest`` does not occur in the source, so the quote is rejected even
    though all of its other words do.
    """
    from worker_plan_internal.parameter_extraction.compress_report_section import (
        quote_is_in_source,
    )

    document = "lowest qualified bid for OPC UA middleware exceeds $75,000"
    inverted_quote = "highest qualified middleware bid exceeds $75,000"

    verdict = quote_is_in_source(inverted_quote, document)
    assert verdict is False
525+
526+
def test_quote_is_in_source_rejects_substitution_in_long_quote() -> None:
    """A long quote with one substituted word must still be rejected.

    High fractional overlap is not enough: requiring every token to be
    present rejects the quote on the single missing word, whereas a
    percentage threshold (for example 90% or more) would accept it.
    """
    from worker_plan_internal.parameter_extraction.compress_report_section import (
        quote_is_in_source,
    )

    document = (
        "If the lowest qualified bid for OPC UA middleware exceeds $75,000, "
        "then the project reverts to the current rule-based integration "
        "vendor and escalates to the steering committee."
    )
    # Only ``lowest`` has been replaced (by ``highest``); every other word of
    # the quote occurs in the source. The swap inverts the meaning, so the
    # quote must not verify.
    tampered_quote = (
        "highest qualified bid for OPC UA middleware exceeds $75,000 "
        "then project reverts to integration vendor"
    )

    verdict = quote_is_in_source(tampered_quote, document)
    assert verdict is False
549+
550+
def test_quote_is_in_source_rejects_short_unrelated_quote() -> None:
    """Quotes of only one or two tokens are denied the token-overlap fallback.

    Such short quotes would be too easy to satisfy by coincidence against a
    large source.
    """
    from worker_plan_internal.parameter_extraction.compress_report_section import (
        quote_is_in_source,
    )

    document = "lowest qualified bid for OPC UA middleware exceeds $75,000"
    # Both words occur in the source, but not adjacent in this order.
    two_word_quote = "middleware bid"

    verdict = quote_is_in_source(two_word_quote, document)
    assert verdict is False
0 commit comments