feat: add prompt relevance scoring — gate 2 for candidate ranking

longevityboris · longevityboris · commit 09187f0fcae0 · 2026-04-12T04:06:07.000+01:00
New module stylometry/relevance.rs measures content-word overlap
between prompt and generated text, with fuzzy prefix matching for
morphological variants (dolphins→dolphin).

Wired into:
- Candidate ranker: combined score = style_distance + (1-relevance)*0.3
  so off-topic candidates are penalized even if well-styled
- Eval harness: prompt_relevance tracked in EvalRecord, CSV, and summary

5 unit tests for relevance scoring.
diff --git a/src/commands/eval_style.rs b/src/commands/eval_style.rs
@@ -20,6 +20,7 @@ use writer_cli::stylometry::features::lengths;
 use writer_cli::stylometry::features::punctuation::PunctuationStats;
 use writer_cli::stylometry::features::readability::ReadabilityStats;
 use writer_cli::stylometry::fingerprint::StylometricFingerprint;
+use writer_cli::stylometry::relevance;
 
 use crate::config;
 use crate::error::AppError;
@@ -59,6 +60,7 @@ struct EvalRecord {
     questions_per_1k: f64,
     exclamations_per_1k: f64,
     canon_leakage_score: f64,
+    prompt_relevance: f64,
     // Generation config
     system_prompt_enabled: bool,
     prompt_wrapping_enabled: bool,
@@ -82,6 +84,7 @@ struct EvalSummary {
     mean_questions_per_1k: f64,
     mean_exclamations_per_1k: f64,
     mean_canon_leakage: f64,
+    mean_prompt_relevance: f64,
     raw_mode: bool,
     adapter_used: bool,
     model: String,
@@ -245,6 +248,7 @@ pub async fn run(
             let punct = PunctuationStats::compute(&text);
             let read = ReadabilityStats::compute(&text);
             let canon_leakage = compute_canon_leakage(&text, &prompt_entry.text, &leakage_lexicon);
+            let prompt_rel = relevance::score(&prompt_entry.text, &text);
 
             let record = EvalRecord {
                 prompt: prompt_entry.text.clone(),
@@ -258,6 +262,7 @@ pub async fn run(
                 questions_per_1k: punct.questions_per_1k,
                 exclamations_per_1k: punct.exclamations_per_1k,
                 canon_leakage_score: canon_leakage,
+                prompt_relevance: prompt_rel,
                 system_prompt_enabled: system.is_some(),
                 prompt_wrapping_enabled: !raw,
                 raw_mode: raw,
@@ -329,6 +334,10 @@ pub async fn run(
             summary.mean_questions_per_1k, summary.mean_exclamations_per_1k
         );
         println!("  mean canon leakage: {:.3}", summary.mean_canon_leakage);
+        println!(
+            "  mean prompt relevance: {:.3}",
+            summary.mean_prompt_relevance
+        );
         println!("\n  results: {}", output_dir.display().to_string().dimmed());
     } else {
         crate::output::print_success_or(ctx, &summary, |_| {});
@@ -425,13 +434,13 @@ fn contains_whole_word(needle: &str, haystack: &str) -> bool {
 
 fn write_csv(path: &Path, records: &[EvalRecord]) -> Result<(), AppError> {
     let mut out = String::new();
-    out.push_str("prompt,category,seed,style_distance,sentence_length_mean,sentence_length_sd,fk_grade,questions_per_1k,exclamations_per_1k,canon_leakage_score,system_prompt,prompt_wrapping,raw_mode,adapter,n_candidates,model\n");
+    out.push_str("prompt,category,seed,style_distance,sentence_length_mean,sentence_length_sd,fk_grade,questions_per_1k,exclamations_per_1k,canon_leakage_score,prompt_relevance,system_prompt,prompt_wrapping,raw_mode,adapter,n_candidates,model\n");
 
     for r in records {
         // CSV-escape the prompt
         let prompt_escaped = r.prompt.replace('"', "\"\"");
         out.push_str(&format!(
-            "\"{}\",\"{}\",{},{:.4},{:.2},{:.2},{:.2},{:.2},{:.2},{:.4},{},{},{},{},{},{}\n",
+            "\"{}\",\"{}\",{},{:.4},{:.2},{:.2},{:.2},{:.2},{:.2},{:.4},{:.4},{},{},{},{},{},{}\n",
             prompt_escaped,
             r.category,
             r.seed,
@@ -442,6 +451,7 @@ fn write_csv(path: &Path, records: &[EvalRecord]) -> Result<(), AppError> {
             r.questions_per_1k,
             r.exclamations_per_1k,
             r.canon_leakage_score,
+            r.prompt_relevance,
             r.system_prompt_enabled,
             r.prompt_wrapping_enabled,
             r.raw_mode,
@@ -477,6 +487,7 @@ fn compute_summary(
             mean_questions_per_1k: 0.0,
             mean_exclamations_per_1k: 0.0,
             mean_canon_leakage: 0.0,
+            mean_prompt_relevance: 0.0,
             raw_mode: raw,
             adapter_used: adapter,
             model: model_id.to_string(),
@@ -507,6 +518,7 @@ fn compute_summary(
         mean_questions_per_1k: records.iter().map(|r| r.questions_per_1k).sum::<f64>() / n,
         mean_exclamations_per_1k: records.iter().map(|r| r.exclamations_per_1k).sum::<f64>() / n,
         mean_canon_leakage: records.iter().map(|r| r.canon_leakage_score).sum::<f64>() / n,
+        mean_prompt_relevance: records.iter().map(|r| r.prompt_relevance).sum::<f64>() / n,
         raw_mode: raw,
         adapter_used: adapter,
         model: model_id.to_string(),
diff --git a/src/decoding/mod.rs b/src/decoding/mod.rs
@@ -153,8 +153,8 @@ pub async fn run(
             return Err(DecodingError::Backend(err_detail));
         }
 
-        // Rank candidates by stylometric distance
-        let ranked = ranker::rank(&candidates, fingerprint);
+        // Rank candidates by style distance + prompt relevance
+        let ranked = ranker::rank(&candidates, fingerprint, prompt);
 
         // Filter best candidate
         let (best_vec_idx, best_distance) = ranked[0];
diff --git a/src/decoding/ranker.rs b/src/decoding/ranker.rs
@@ -1,25 +1,36 @@
-//! Rank generated candidates by stylometric distance to the user's fingerprint.
+//! Rank generated candidates by combined style fidelity and prompt relevance.
 //!
 //! Reference: PAN authorship verification — cosine/distance-based ranking.
 use crate::stylometry::fingerprint::StylometricFingerprint;
-use crate::stylometry::scoring;
+use crate::stylometry::{relevance, scoring};
 
-/// Rank candidates by stylometric distance to the fingerprint.
-/// Returns vec of (candidate_index, distance), sorted lowest distance first.
+/// Rank candidates by combined score: style distance penalized by low relevance.
+///
+/// Scoring: `combined = style_distance + relevance_penalty`
+/// where `relevance_penalty = (1.0 - relevance) * 0.3`
+///
+/// This means: a perfectly relevant but stylistically distant candidate (0.5 + 0.0)
+/// beats an off-topic but well-styled candidate (0.3 + 0.3).
+///
+/// Returns vec of (candidate_index, combined_score), sorted lowest first.
 pub fn rank(
     candidates: &[(String, u32, u64)],
     fingerprint: &StylometricFingerprint,
+    prompt: &str,
 ) -> Vec<(usize, f64)> {
     let mut scored: Vec<(usize, f64)> = candidates
         .iter()
         .enumerate()
         .map(|(i, (text, _, _))| {
             let report = scoring::distance(text, fingerprint);
-            (i, report.overall)
+            let rel = relevance::score(prompt, text);
+            // Penalty: up to 0.3 for completely irrelevant output
+            let relevance_penalty = (1.0 - rel) * 0.3;
+            let combined = (report.overall + relevance_penalty).clamp(0.0, 1.0);
+            (i, combined)
         })
         .collect();
 
-    // Sort by distance ascending (closest to user's voice first)
     scored.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
     scored
 }
diff --git a/src/stylometry/mod.rs b/src/stylometry/mod.rs
@@ -2,4 +2,5 @@
 pub mod ai_slop;
 pub mod features;
 pub mod fingerprint;
+pub mod relevance;
 pub mod scoring;
diff --git a/src/stylometry/relevance.rs b/src/stylometry/relevance.rs
@@ -0,0 +1,213 @@
+//! Prompt relevance scoring.
+//!
+//! Measures whether generated text addresses the prompt's topic.
+//! Without this, the ranker could prefer a beautifully-styled off-topic
+//! response over a relevant one.
+//!
+//! Approach: content-word overlap — extract meaningful words from the prompt
+//! (excluding stop words), then measure what fraction appear in the output.
+//! Simple, fast, and sufficient for a generation-time gate.
+
+use unicode_segmentation::UnicodeSegmentation;
+
+/// Common English stop words — excluded from content-word extraction.
+/// Kept minimal to avoid false negatives on short prompts; every entry is unique.
+const STOP_WORDS: &[&str] = &[
+    "a",
+    "an",
+    "the",
+    "and",
+    "or",
+    "but",
+    "in",
+    "on",
+    "at",
+    "to",
+    "for",
+    "of",
+    "with",
+    "by",
+    "from",
+    "is",
+    "are",
+    "was",
+    "were",
+    "be",
+    "been",
+    "being",
+    "have",
+    "has",
+    "had",
+    "do",
+    "does",
+    "did",
+    "will",
+    "would",
+    "could",
+    "should",
+    "may",
+    "might",
+    "shall",
+    "can",
+    "this",
+    "that",
+    "these",
+    "those",
+    "it",
+    "its",
+    "i",
+    "you",
+    "he",
+    "she",
+    "we",
+    "they",
+    "me",
+    "him",
+    "her",
+    "us",
+    "them",
+    "my",
+    "your",
+    "his",
+    "our",
+    "their",
+    "what",
+    "which",
+    "who",
+    "whom",
+    "how",
+    "when",
+    "where",
+    "why",
+    "not",
+    "no",
+    "so",
+    "if",
+    "about",
+    "up",
+    "out",
+    "just",
+    "than",
+    "then",
+    "also",
+    "very",
+    "some",
+    "any",
+    "all",
+    "each",
+    "every",
+    "into",
+    "as",
+    // Prompt-instruction words: they describe the task, not the topic.
+    "write",
+    "writing",
+    "paragraph",
+    "essay",
+    "piece",
+    "describe",
+    "explain",
+    "tell",
+];
+
+/// Collect the content words of `text`: lowercased, purely alphabetic,
+/// at least 3 bytes long (3 letters for ASCII), and not in `STOP_WORDS`.
+fn content_words(text: &str) -> Vec<String> {
+    let is_content = |w: &String| {
+        w.len() >= 3 && w.chars().all(char::is_alphabetic) && !STOP_WORDS.contains(&w.as_str())
+    };
+    text.unicode_words().map(str::to_lowercase).filter(is_content).collect()
+}
+
+/// Compute prompt relevance as the fraction of unique prompt content words
+/// that appear in the output.
+///
+/// Matching is case-insensitive. A prompt word also counts as present when
+/// some output word shares its first `min(4, length)` characters, which
+/// tolerates morphological variants (dolphins→dolphin).
+///
+/// Returns a score in [0.0, 1.0] where:
+/// - 1.0 = every prompt content word appears in the output
+/// - 0.0 = no prompt content words appear in the output
+///
+/// If the prompt has no content words (e.g., "Write about it"), returns 1.0
+/// to avoid penalizing vague prompts.
+pub fn score(prompt: &str, output: &str) -> f64 {
+    let prompt_words = content_words(prompt);
+    if prompt_words.is_empty() {
+        return 1.0;
+    }
+
+    // Deduplicate so a repeated prompt word is only counted once.
+    let unique_prompt: std::collections::HashSet<&str> =
+        prompt_words.iter().map(|s| s.as_str()).collect();
+
+    let output_words: std::collections::HashSet<String> =
+        output.unicode_words().map(|w| w.to_lowercase()).collect();
+
+    let mut found = 0;
+    for word in &unique_prompt {
+        if output_words.contains(*word) {
+            found += 1;
+        } else {
+            // Fuzzy: "dolphins" matches "dolphin", "swimming" matches "swims".
+            // The prefix length is counted in chars, not bytes, so no byte-indexed
+            // slice can split a multi-byte character and panic (e.g. on "café").
+            let min_prefix = word.chars().count().min(4);
+            if output_words.iter().any(|ow| common_prefix_len(word, ow) >= min_prefix) {
+                found += 1;
+            }
+        }
+    }
+
+    found as f64 / unique_prompt.len() as f64
+}
+
+/// Count how many leading `char`s `a` and `b` have in common.
+fn common_prefix_len(a: &str, b: &str) -> usize {
+    a.chars().zip(b.chars()).take_while(|pair| pair.0 == pair.1).count()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn perfect_relevance() {
+        let prompt = "Write about dolphins and ocean life";
+        let output =
+            "Dolphins are fascinating creatures of the ocean. Their life underwater is complex.";
+        let s = score(prompt, output);
+        assert_eq!(s, 1.0, "score {s} should be 1.0: all content words appear verbatim");
+    }
+
+    #[test]
+    fn zero_relevance() {
+        let prompt = "Write about quantum physics and black holes";
+        let output = "The garden was beautiful with roses and tulips blooming everywhere.";
+        let s = score(prompt, output);
+        assert_eq!(s, 0.0, "score {s} should be 0.0 for off-topic output");
+    }
+
+    #[test]
+    fn vague_prompt_returns_one() {
+        // All words are stop words or too short → no content words → 1.0
+        let prompt = "Write about it for me";
+        let output = "Anything at all.";
+        assert_eq!(score(prompt, output), 1.0);
+    }
+
+    #[test]
+    fn empty_output() {
+        let prompt = "Write about dolphins";
+        assert_eq!(score(prompt, ""), 0.0);
+    }
+
+    #[test]
+    fn morphological_variant() {
+        let prompt = "Write about dolphins swimming";
+        let output = "A dolphin swims gracefully through the water.";
+        let s = score(prompt, output);
+        // "dolphins"/"dolphin" and "swimming"/"swims" each share a 4+ char prefix
+        assert_eq!(s, 1.0, "score {s} should count both morphological matches");
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -153,8 +153,8 @@ pub async fn run(`
`153`	`153`	`return Err(DecodingError::Backend(err_detail));`
`154`	`154`	`}`
`155`	`155`
`156`		`- // Rank candidates by stylometric distance`
`157`		`- let ranked = ranker::rank(&candidates, fingerprint);`
	`156`	`+ // Rank candidates by style distance + prompt relevance`
	`157`	`+ let ranked = ranker::rank(&candidates, fingerprint, prompt);`
`158`	`158`
`159`	`159`	`// Filter best candidate`
`160`	`160`	`let (best_vec_idx, best_distance) = ranked[0];`