diff --git a/assets/ruleset.json b/assets/ruleset.json index c37b695..e0ec04b 100644 --- a/assets/ruleset.json +++ b/assets/ruleset.json @@ -1267,7 +1267,8 @@ "type": "cross_strait", "context": "@domain IT。tw「優化」泛用於商業;IT optimize 改「最佳化」以區分 improve", "english": "optimize", - "negative_context_clues": ["流程", "體驗", "服務"] + "negative_context_clues": ["流程", "體驗", "服務"], + "editorial_confidence": "low" }, { "from": "優步", @@ -1302,7 +1303,7 @@ "from": "元數據", "to": [], "type": "cross_strait", - "context": "@domain 資料。源自希臘文 meta- (關於) + data (資料),原意為「描述資料的資料」", + "context": "@domain 資料。源自希臘文 meta- (關於) + data (資料),原意為「描述資料的資料」。preferred: metadata;可接受: 詮釋資料 / 後設資料;rejected: 元資料", "english": "metadata" }, { @@ -1319,6 +1320,13 @@ "context": "@domain 程式設計", "english": "metaprogramming/meta-programming" }, + { + "from": "元資料", + "to": [], + "type": "cross_strait", + "context": "@domain 資料。`元資料` 為機械式 Sinification (從 `元數據` 字面替換而來),無 NAER / MoE 立足點。preferred: metadata;可接受替代: 詮釋資料 / 後設資料", + "english": "metadata" + }, { "from": "元音", "to": ["母音"], @@ -3489,7 +3497,8 @@ "type": "confusable", "context": "限 IT 語境。電影/戲劇場景為正確 tw 用法", "english": "scenario", - "context_clues": ["應用", "部署", "測試", "系統", "開發", "架構", "軟件", "軟體", "程式", "行程", "核心", "記憶體", "CPU"] + "context_clues": ["應用", "部署", "測試", "系統", "開發", "架構", "軟件", "軟體", "程式", "行程", "核心", "記憶體", "CPU"], + "editorial_confidence": "low" }, { "from": "塑料", @@ -7416,8 +7425,9 @@ "from": "消息", "to": ["訊息"], "type": "cross_strait", - "context": "@domain IT", - "english": "message" + "context": "@domain IT。`好消息`/`壞消息`/`消息來源` 為合法 zh-TW 用法", + "english": "message", + "positional_clues": ["not_after:好", "not_after:壞", "not_before:來源"] }, { "from": "消息環", @@ -9097,7 +9107,8 @@ "type": "cross_strait", "context": "@domain 程式設計", "english": "algorithm", - "exceptions": ["演算法"] + "exceptions": ["演算法"], + "editorial_confidence": "low" }, { "from": "箭頭操作符", diff --git a/benches/scanner.rs b/benches/scanner.rs index ba89438..ec6a2fa 100644 --- 
a/benches/scanner.rs +++ b/benches/scanner.rs @@ -480,6 +480,7 @@ fn bench_cpu_attribution_100kb(c: &mut Criterion) { heading_severity_boost: false, political_stance: PoliticalStance::RocCentric, offset_only: false, + exempt_blockquotes: false, }; // Spelling-only config. diff --git a/build.rs b/build.rs index 397374e..02436e9 100644 --- a/build.rs +++ b/build.rs @@ -33,6 +33,16 @@ struct SpellingRule { positional_clues: Option>, #[serde(default)] tags: Option>, + #[serde(default)] + editorial_confidence: Option, +} + +#[derive(serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "lowercase")] +enum EditorialConfidence { + High, + Medium, + Low, } #[derive(serde::Serialize, serde::Deserialize)] diff --git a/scripts/check-ruleset.py b/scripts/check-ruleset.py index ac94270..897d20a 100755 --- a/scripts/check-ruleset.py +++ b/scripts/check-ruleset.py @@ -78,6 +78,7 @@ def dedup_sort( "negative_context_clues", "positional_clues", "tags", + "editorial_confidence", } # Field order for spelling rules (stable, human-scannable output). @@ -93,6 +94,7 @@ def dedup_sort( "positional_clues", "exceptions", "tags", + "editorial_confidence", ] CASE_FIELD_ORDER = ["term", "alternatives", "disabled"] diff --git a/src/cache.rs b/src/cache.rs index ec2fea2..e1e89a2 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -63,6 +63,10 @@ pub struct ScanParams { pub translationese_domain: String, // AI threshold level (formatted f32) — different multipliers produce different results. pub ai_threshold: String, + // Markdown blockquote-exemption flag — changes which spans get + // scanned, so cache hits must be invalidated when toggled. + #[serde(default)] + pub exempt_blockquotes: bool, } /// A single cached entry. 
@@ -420,6 +424,7 @@ mod tests { detect_translationese: false, translationese_domain: "general".into(), ai_threshold: "1.0".into(), + exempt_blockquotes: false, } } @@ -433,6 +438,7 @@ mod tests { detect_translationese: false, translationese_domain: "general".into(), ai_threshold: "1.0".into(), + exempt_blockquotes: false, } } diff --git a/src/config.rs b/src/config.rs index 4131d0c..97e72b6 100644 --- a/src/config.rs +++ b/src/config.rs @@ -26,6 +26,38 @@ pub struct ProjectConfig { pub suppressions: Option, pub packs: Option>, pub translation_memory: Option, + pub markdown: Option, + pub glossary: Option, +} + +/// Markdown-specific scanning options (35.7). +#[derive(Debug, Default, Deserialize)] +#[serde(default)] +pub struct MarkdownConfig { + /// When true, treat pulldown-cmark `Tag::BlockQuote` ranges as + /// exclusion zones. Useful for documents that quote mainland-Chinese + /// sources for illustrative purposes. Off by default. + pub exempt_blockquotes: Option, +} + +/// Project glossary section (35.9). Layered above the embedded ruleset +/// and pack store but below banned-term enforcement and translation +/// memory. Precedence: glossary `banned` > TM > glossary `preferred` > +/// domain pack > embedded ruleset. +#[derive(Debug, Default, Deserialize)] +#[serde(default)] +pub struct GlossaryConfig { + /// Terms that must always be flagged regardless of context clues. + /// E.g. ["線程", "內存"] forces those calques to fire even in + /// otherwise ambiguous prose. + pub banned: Option>, + /// Project-preferred zh-TW forms. Used by the consistency report + /// (35.1) to choose the canonical suggestion when both TW-preferred + /// and CN-preferred variants appear in the same document. + pub preferred: Option>, + /// Names that should never be flagged (added to the suppression + /// list). E.g. ["TSMC", "MediaTek"]. 
+ pub proper_nouns: Option>, } impl ProjectConfig { diff --git a/src/engine/consistency.rs b/src/engine/consistency.rs new file mode 100644 index 0000000..dba8fd4 --- /dev/null +++ b/src/engine/consistency.rs @@ -0,0 +1,342 @@ +// 35.1 — Document-wide terminology consistency report. +// +// Groups scan issues by their `english` field (natural equivalence +// class), then for each group checks whether the canonical zh-TW form +// also appears elsewhere in the document. Mixed usage produces a +// `Consistency` diagnostic alerting the author that the same concept +// is referred to with both regional variants. +// +// TM-suppressed issues are excluded from consistency grouping — those +// are user-approved overrides, not inadvertent inconsistency. + +use std::collections::BTreeMap; + +use serde::Serialize; + +use crate::rules::glossary::ProjectGlossary; +use crate::rules::ruleset::{Issue, IssueType, Severity}; + +/// One occurrence of a calque in the document — used to anchor the +/// consistency diagnostic. +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +pub struct ConsistencyOccurrence { + pub offset: usize, + pub line: usize, + pub col: usize, + pub found: String, +} + +/// Aggregated consistency record for one equivalence class. All fields +/// are populated only when both the calque AND a canonical zh-TW form +/// appear in the same document. +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +pub struct ConsistencyGroup { + /// English anchor (natural equivalence-class key). + pub term_group: String, + /// The TW-preferred form the linter recommends. + pub preferred: String, + /// All occurrences of the calque(s) in this group. + pub occurrences: Vec, +} + +/// Top-level consistency report. Empty `groups` means no mixed usage. 
+#[derive(Debug, Clone, Default, Serialize)] +pub struct ConsistencyReport { + pub groups: Vec, +} + +impl ConsistencyReport { + pub fn is_empty(&self) -> bool { + self.groups.is_empty() + } +} + +/// Build a consistency report from raw scan issues. +/// +/// Algorithm: +/// 1. Filter to CrossStrait / Confusable issues with non-empty +/// `english`. Those are the cleanest equivalence-class anchors. +/// 2. Skip issues whose severity is Info — TM-suppressed downgrades +/// land at Info; they are user-approved and should not count. +/// 3. Group by `english`. For each group, choose the TW-preferred +/// canonical form from `glossary.preferred` when that form is both +/// present in the document and among the group's rule suggestions; +/// otherwise fall back to the first suggestion. +/// 4. Check whether that canonical form ALSO appears as a substring +/// anywhere in `text`. If yes (and the calque is also present), +/// both regional variants coexist → emit a group. +pub fn compute_consistency_report( + text: &str, + issues: &[Issue], + glossary: &ProjectGlossary, +) -> ConsistencyReport { + let mut grouped: BTreeMap> = BTreeMap::new(); + + for issue in issues { + let eligible = matches!( + issue.rule_type, + IssueType::CrossStrait | IssueType::Confusable + ) && issue.severity != Severity::Info; + if !eligible { + continue; + } + let Some(english) = issue.english.as_deref().filter(|e| !e.is_empty()) else { + continue; + }; + grouped.entry(english.to_string()).or_default().push(issue); + } + + let mut report = ConsistencyReport::default(); + + for (english, issues_in_group) in grouped { + let canonical = preferred_canonical_for_group(text, &issues_in_group, glossary); + let Some(canonical) = canonical else { continue }; + // Mixed usage: the canonical TW form must appear independently + // somewhere in the document (i.e. NOT as a substring of an + // already-flagged calque region). Cheap proxy: the canonical + // form is found at an offset that is not covered by any + // `from`-span issue. 
For the typical case where canonical and + // calque differ in characters, plain `text.contains` is + // sufficient because the calque span doesn't contain the + // canonical form as a substring. + if !text.contains(canonical.as_str()) { + continue; + } + + let occurrences: Vec = issues_in_group + .iter() + .map(|i| ConsistencyOccurrence { + offset: i.offset, + line: i.line, + col: i.col, + found: i.found.clone(), + }) + .collect(); + + report.groups.push(ConsistencyGroup { + term_group: english, + preferred: canonical, + occurrences, + }); + } + + report +} + +fn preferred_canonical_for_group( + text: &str, + issues_in_group: &[&Issue], + glossary: &ProjectGlossary, +) -> Option { + // Prefer project glossary house terms when they also appear in the + // document, but only when the rule already surfaced that term as a + // canonical suggestion for this equivalence class. Short zh terms are + // too collision-prone for edit-distance matching. + if !glossary.preferred.is_empty() { + for preferred in &glossary.preferred { + if preferred.is_empty() { + continue; + } + if !text.contains(preferred) { + continue; + } + if glossary_preferred_matches_group(preferred, issues_in_group) { + return Some(preferred.clone()); + } + } + } + + issues_in_group + .iter() + .find_map(|i| i.suggestions.first()) + .filter(|s| !s.is_empty()) + .cloned() +} + +fn glossary_preferred_matches_group(preferred: &str, issues_in_group: &[&Issue]) -> bool { + issues_in_group.iter().any(|issue| { + issue + .suggestions + .iter() + .any(|suggestion| suggestion == preferred) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::Arc; + + fn cross_strait(offset: usize, found: &str, suggestion: &str, english: &str) -> Issue { + let mut issue = Issue::new( + offset, + found.len(), + found, + vec![suggestion.into()], + IssueType::CrossStrait, + Severity::Warning, + ); + issue.english = Some(Arc::from(english)); + issue + } + + #[test] + fn empty_when_no_mixed_usage() { + let text = 
"我們只用線程實作。"; + let issues = vec![cross_strait(3, "線程", "執行緒", "thread")]; + let report = compute_consistency_report(text, &issues, &ProjectGlossary::default()); + assert!(report.is_empty(), "no canonical 執行緒 in text → no group"); + } + + #[test] + fn fires_when_both_forms_present() { + let text = "我們的線程很慢。執行緒設計需要重構。"; + let issues = vec![cross_strait(9, "線程", "執行緒", "thread")]; + let report = compute_consistency_report(text, &issues, &ProjectGlossary::default()); + assert_eq!(report.groups.len(), 1); + let group = &report.groups[0]; + assert_eq!(group.term_group, "thread"); + assert_eq!(group.preferred, "執行緒"); + assert_eq!(group.occurrences.len(), 1); + assert_eq!(group.occurrences[0].found, "線程"); + } + + #[test] + fn groups_multiple_calques_for_same_english() { + // Two occurrences of the calque 線程 (the second inside 線程數量) share + // english="thread" and must collapse into one group with two occurrences. + let text = "我們的線程很慢,線程數量太多。執行緒重構。"; + let issues = vec![ + cross_strait(9, "線程", "執行緒", "thread"), + cross_strait(24, "線程", "執行緒", "thread"), + ]; + let report = compute_consistency_report(text, &issues, &ProjectGlossary::default()); + assert_eq!(report.groups.len(), 1); + assert_eq!(report.groups[0].occurrences.len(), 2); + } + + #[test] + fn ignores_info_severity_issues_tm_suppressed() { + let text = "線程 ... 執行緒"; + let mut issue = cross_strait(0, "線程", "執行緒", "thread"); + issue.severity = Severity::Info; + let report = compute_consistency_report(text, &[issue], &ProjectGlossary::default()); + assert!(report.is_empty(), "Info severity (TM-suppressed) skipped"); + } + + #[test] + fn ignores_issues_without_english_anchor() { + let text = "X ... 
Y"; + let mut issue = Issue::new( + 0, + 1, + "X", + vec!["Y".into()], + IssueType::CrossStrait, + Severity::Warning, + ); + issue.english = None; + let report = compute_consistency_report(text, &[issue], &ProjectGlossary::default()); + assert!(report.is_empty()); + } + + #[test] + fn separates_groups_by_english_anchor() { + let text = "線程 執行緒 用戶 使用者"; + let issues = vec![ + cross_strait(0, "線程", "執行緒", "thread"), + cross_strait(7, "用戶", "使用者", "user"), + ]; + let report = compute_consistency_report(text, &issues, &ProjectGlossary::default()); + assert_eq!(report.groups.len(), 2); + let groups: Vec<&str> = report + .groups + .iter() + .map(|g| g.term_group.as_str()) + .collect(); + assert!(groups.contains(&"thread")); + assert!(groups.contains(&"user")); + } + + #[test] + fn prefers_glossary_preferred_form_over_default_suggestion() { + // The rule lists two acceptable TW forms; the glossary picks + // one as the project-canonical. When both regional variants + // appear in the document AND the glossary's choice is among + // the rule's suggestions (matches_group), the consistency + // report surfaces the glossary's choice instead of the rule's + // first suggestion. 
+ let text = "我們的線程很慢。緒程設計需要重構。"; + let mut issue = Issue::new( + 9, + 6, + "線程", + vec!["執行緒".into(), "緒程".into()], + IssueType::CrossStrait, + Severity::Warning, + ); + issue.english = Some(Arc::from("thread")); + let glossary = ProjectGlossary { + preferred: vec!["緒程".into()], + ..ProjectGlossary::default() + }; + let report = compute_consistency_report(text, &[issue], &glossary); + assert_eq!(report.groups.len(), 1); + assert_eq!(report.groups[0].preferred, "緒程"); + } + + #[test] + fn glossary_preferred_outside_suggestions_falls_back_to_rule_suggestion() { + let text = "我們的線程很慢。緒程設計需要重構。執行緒也要重構。"; + let issues = vec![cross_strait(9, "線程", "執行緒", "thread")]; + let glossary = ProjectGlossary { + preferred: vec!["緒程".into()], + ..ProjectGlossary::default() + }; + let report = compute_consistency_report(text, &issues, &glossary); + assert_eq!(report.groups.len(), 1); + assert_eq!( + report.groups[0].preferred, "執行緒", + "preferred terms outside rule suggestions must not hijack the group" + ); + } + + #[test] + fn edit_distance_neighbor_does_not_hijack_group() { + // Regression guard for short zh terms: sharing one edge + // character with the calque is not enough to join the same + // concept group. 
+ let text = "我們的線程很慢。執行緒設計需要重構。線性代數也出現。"; + let issues = vec![cross_strait(9, "線程", "執行緒", "thread")]; + let glossary = ProjectGlossary { + preferred: vec!["線性".into()], + ..ProjectGlossary::default() + }; + let report = compute_consistency_report(text, &issues, &glossary); + assert_eq!(report.groups.len(), 1); + assert_eq!( + report.groups[0].preferred, "執行緒", + "must fall back to rule suggestion, not pick unrelated 線性" + ); + } + + #[test] + fn glossary_preference_does_not_leak_across_groups() { + let text = "線程與使用者都出現在文件裡。執行緒也出現。"; + let issues = vec![ + cross_strait(0, "線程", "執行緒", "thread"), + cross_strait(3, "用戶", "使用者", "user"), + ]; + let glossary = ProjectGlossary { + preferred: vec!["使用者".into()], + ..ProjectGlossary::default() + }; + let report = compute_consistency_report(text, &issues, &glossary); + let thread_group = report + .groups + .iter() + .find(|group| group.term_group == "thread") + .expect("thread group should exist"); + assert_eq!(thread_group.preferred, "執行緒"); + } +} diff --git a/src/engine/markdown.rs b/src/engine/markdown.rs index 6ef042d..6281ff9 100644 --- a/src/engine/markdown.rs +++ b/src/engine/markdown.rs @@ -14,27 +14,86 @@ use super::excluded::{merge_ranges_pub, ByteRange}; /// and YAML frontmatter (leading --- fences). /// /// The returned ranges are sorted by start position and non-overlapping. +/// Options controlling Markdown structural exclusion. Defaults match +/// the historical behavior of [build_markdown_excluded_ranges]: code +/// blocks excluded, blockquotes scanned. +#[derive(Debug, Clone, Copy, Default)] +pub struct MdScanOptions { + /// When true, fenced/indented code blocks are NOT excluded (i.e. the + /// scanner sees code-block prose). Used by the + /// `MarkdownScanCode` content type. + pub scan_code_blocks: bool, + /// When true, the byte ranges of pulldown-cmark `Tag::BlockQuote` + /// events are excluded. 
Implemented via cmark events so that nested + /// blockquotes (`> >`), lazy continuation lines, and blockquotes + /// inside list items behave correctly. Off by default — adopted + /// blockquote prose is real content (35.7). + pub exempt_blockquotes: bool, +} + +impl MdScanOptions { + /// Construct options for a Markdown scan: pass `scan_code_blocks=true` + /// when running with the `MarkdownScanCode` content type, and propagate + /// the caller's `--exempt-blockquotes` flag. Centralizes the literal + /// previously copy-pasted across the CLI and MCP entry points. + pub fn new(scan_code_blocks: bool, exempt_blockquotes: bool) -> Self { + Self { + scan_code_blocks, + exempt_blockquotes, + } + } +} + pub fn build_markdown_excluded_ranges(text: &str) -> Vec { + build_markdown_excluded_ranges_with_options(text, MdScanOptions::default()) +} + +/// Like [build_markdown_excluded_ranges], but fenced/indented code blocks are +/// NOT excluded. Only inline code (`backtick`), HTML, and YAML frontmatter +/// are excluded. This allows linting Chinese prose inside code blocks +/// (comments, translated output, etc.) while still protecting inline code +/// and HTML from false positives. +pub fn build_markdown_excluded_ranges_no_code(text: &str) -> Vec { + build_markdown_excluded_ranges_with_options( + text, + MdScanOptions { + scan_code_blocks: true, + ..MdScanOptions::default() + }, + ) +} + +/// Build Markdown exclusion ranges with explicit options. Backbone for the +/// two named wrappers above, plus opt-in features like 35.7's +/// `exempt_blockquotes`. +pub fn build_markdown_excluded_ranges_with_options( + text: &str, + opts: MdScanOptions, +) -> Vec { let mut ranges = Vec::new(); // Pre-pass: detect YAML frontmatter (leading --- fence). Exclude only - // the structural tokens (--- fences, key+colon spans), leaving value - // prose scannable so that linting catches issues in title/description. 
+ // the structural tokens (--- fences, key+colon spans, ASCII quote + // delimiters), leaving value prose scannable so that linting catches + // issues in title/description. if let Some(fm_end) = detect_frontmatter(text) { collect_frontmatter_structural_ranges(text, fm_end, &mut ranges); } collect_container_fence_ranges(text, &mut ranges); - let opts = Options::ENABLE_TABLES | Options::ENABLE_STRIKETHROUGH; - let parser = Parser::new_ext(text, opts); + let parser_opts = Options::ENABLE_TABLES | Options::ENABLE_STRIKETHROUGH; + let parser = Parser::new_ext(text, parser_opts); let mut in_code_block = false; let mut code_block_start = 0usize; + let mut blockquote_depth: usize = 0; + let mut blockquote_start: usize = 0; for (event, range) in parser.into_offset_iter() { match event { - // Fenced or indented code blocks: exclude entire block. - Event::Start(Tag::CodeBlock(_)) => { + // Fenced or indented code blocks: exclude entire block (default) + // or scan their prose (when scan_code_blocks is set). + Event::Start(Tag::CodeBlock(_)) if !opts.scan_code_blocks => { in_code_block = true; code_block_start = range.start; } @@ -46,6 +105,22 @@ pub fn build_markdown_excluded_ranges(text: &str) -> Vec { in_code_block = false; } + Event::Start(Tag::BlockQuote(_)) if opts.exempt_blockquotes => { + if blockquote_depth == 0 { + blockquote_start = range.start; + } + blockquote_depth = blockquote_depth.saturating_add(1); + } + Event::End(TagEnd::BlockQuote(_)) if opts.exempt_blockquotes => { + blockquote_depth = blockquote_depth.saturating_sub(1); + if blockquote_depth == 0 { + ranges.push(ByteRange { + start: blockquote_start, + end: range.end, + }); + } + } + // Inline code: exclude the span including backticks. 
Event::Code(_) | Event::Html(_) | Event::InlineHtml(_) => { ranges.push(ByteRange { @@ -62,42 +137,6 @@ pub fn build_markdown_excluded_ranges(text: &str) -> Vec { merge_ranges_pub(ranges) } -/// Like [build_markdown_excluded_ranges], but fenced/indented code blocks are -/// NOT excluded. Only inline code (`backtick`), HTML, and YAML frontmatter -/// are excluded. This allows linting Chinese prose inside code blocks -/// (comments, translated output, etc.) while still protecting inline code -/// and HTML from false positives. -pub fn build_markdown_excluded_ranges_no_code(text: &str) -> Vec { - let mut ranges = Vec::new(); - - // Pre-pass: detect YAML frontmatter — exclude structural tokens only. - if let Some(fm_end) = detect_frontmatter(text) { - collect_frontmatter_structural_ranges(text, fm_end, &mut ranges); - } - - collect_container_fence_ranges(text, &mut ranges); - - let opts = Options::ENABLE_TABLES | Options::ENABLE_STRIKETHROUGH; - let parser = Parser::new_ext(text, opts); - - for (event, range) in parser.into_offset_iter() { - match event { - // Skip code blocks entirely — let them be scanned. - Event::Start(Tag::CodeBlock(_)) | Event::End(TagEnd::CodeBlock) => {} - // Inline code and HTML: still exclude. - Event::Code(_) | Event::Html(_) | Event::InlineHtml(_) => { - ranges.push(ByteRange { - start: range.start, - end: range.end, - }); - } - _ => {} - } - } - - merge_ranges_pub(ranges) -} - /// Build excluded byte ranges for YAML structural tokens. /// /// Excludes YAML key tokens (the key name + colon) so that bare ASCII colons @@ -291,7 +330,10 @@ pub fn extract_heading_ranges(text: &str) -> Vec { } /// Collect YAML frontmatter structural ranges: opening `---` line, closing -/// `---` line, and per-line key+colon spans. Values remain scannable. +/// `---` line, per-line key+colon spans, and bare ASCII `"` / `'` quote +/// bytes used as scalar delimiters. 
Values remain scannable; only the +/// 1-byte delimiters are masked from the punctuation scanner so that +/// downstream YAML parsers continue to see ASCII quotes (35.7). fn collect_frontmatter_structural_ranges(text: &str, fm_end: usize, ranges: &mut Vec) { let fm = &text[..fm_end]; let mut pos = 0usize; @@ -312,6 +354,20 @@ fn collect_frontmatter_structural_ranges(text: &str, fm_end: usize, ranges: &mut }); } + // Preserve ASCII `"` and `'` bytes used as YAML scalar delimiters + // by excluding them from the punctuation scanner. Without this, + // the scanner converts `"` to `「`/`」` inside frontmatter values + // and breaks downstream YAML parsers (regression observed in + // ai-muninn.com calque blindspot sweep, 2026-05). + for (i, b) in raw_line.bytes().enumerate() { + if b == b'"' || b == b'\'' { + ranges.push(ByteRange { + start: pos + i, + end: pos + i + 1, + }); + } + } + pos += line_len + 1; // +1 for the '\n' if pos >= fm_end { break; diff --git a/src/engine/mod.rs b/src/engine/mod.rs index 88360f0..791853c 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -1,4 +1,5 @@ pub mod ai_score; +pub mod consistency; pub mod disambig; pub mod excluded; pub mod lineindex; diff --git a/src/engine/scan/mod.rs b/src/engine/scan/mod.rs index fb2feca..0378ed6 100644 --- a/src/engine/scan/mod.rs +++ b/src/engine/scan/mod.rs @@ -33,8 +33,7 @@ use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; use super::excluded::{build_excluded_ranges, merge_ranges_pub, ByteRange}; use super::lineindex::{ColumnEncoding, LineIndex}; use super::markdown::{ - build_markdown_excluded_ranges, build_markdown_excluded_ranges_no_code, - build_yaml_excluded_ranges, + build_markdown_excluded_ranges_with_options, build_yaml_excluded_ranges, MdScanOptions, }; use super::normalize::{map_offset, normalize_nfc, Normalized}; use super::segment::{BoundaryBitmap, Segmenter}; @@ -710,11 +709,28 @@ fn punct_issue(offset: usize, found: &str, suggestion: &str, context: &str) -> I /// 
structural exclusions appropriate to the content type and inline /// suppression markers. Shared between CLI and MCP pipelines. pub fn build_exclusions_for_content_type(text: &str, content_type: ContentType) -> Vec { + build_exclusions_for_content_type_with_options(text, content_type, MdScanOptions::default()) +} + +/// Build exclusion ranges with explicit Markdown options. Honors the +/// caller-supplied [MdScanOptions] (e.g. 35.7 blockquote exemption); +/// falls back to defaults for non-Markdown content types. +pub fn build_exclusions_for_content_type_with_options( + text: &str, + content_type: ContentType, + md_opts: MdScanOptions, +) -> Vec { let mut excluded = build_excluded_ranges(text); match content_type { - ContentType::Markdown => excluded.extend(build_markdown_excluded_ranges(text)), + ContentType::Markdown => { + excluded.extend(build_markdown_excluded_ranges_with_options(text, md_opts)); + } ContentType::MarkdownScanCode => { - excluded.extend(build_markdown_excluded_ranges_no_code(text)) + // MarkdownScanCode forces scan_code_blocks=true regardless of the + // caller-supplied flag, but still honors exempt_blockquotes so the + // 35.7 opt-in works for source-code content too. + let opts = MdScanOptions::new(true, md_opts.exempt_blockquotes); + excluded.extend(build_markdown_excluded_ranges_with_options(text, opts)); } ContentType::Yaml => excluded.extend(build_yaml_excluded_ranges(text)), ContentType::Plain => {} @@ -986,6 +1002,15 @@ impl Scanner { /// /// content_type controls which structural exclusion pass is applied /// during the NFC-rebuild slow path (Markdown, YAML, or plain text). + /// + /// CALLER CONTRACT: config-driven Markdown options like + /// [`ProfileConfig::exempt_blockquotes`] (35.7) only take effect on + /// the NFC-rebuild path, where exclusions are recomputed from the + /// supplied [`ProfileConfig`]. 
On the fast path the caller-supplied + /// `excluded` slice is used verbatim — if the caller wants + /// blockquotes excluded, they must build the slice with + /// [`build_exclusions_for_content_type_with_options`] using a + /// matching [`MdScanOptions`]. pub fn scan_with_prebuilt_excluded( &self, text: &str, @@ -1044,10 +1069,18 @@ impl Scanner { let scan_text = &norm.text; let nfc_changed = !norm.offset_map.is_empty(); + let md_opts = MdScanOptions::new( + matches!(content_type, ContentType::MarkdownScanCode), + cfg.exempt_blockquotes, + ); let mut output = match prebuilt_excluded { Some(excl) if !nfc_changed => self.scan_with_config(scan_text, excl, cfg), _ => { - let excl = build_exclusions_for_content_type(scan_text, content_type); + let excl = build_exclusions_for_content_type_with_options( + scan_text, + content_type, + md_opts, + ); self.scan_with_config(scan_text, &excl, cfg) } }; @@ -1890,6 +1923,7 @@ mod tests { exceptions: Some(vec!["下著".into()]), positional_clues: None, tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); assert!(scanner.spelling_db.ac_charwise.is_some()); @@ -1917,6 +1951,7 @@ mod tests { exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); assert!(scanner.spelling_db.ac_charwise.is_some()); @@ -1949,6 +1984,7 @@ mod tests { exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); assert!(scanner.spelling_db.ac_charwise.is_some()); @@ -2057,6 +2093,7 @@ mod tests { exceptions: None, positional_clues: Some(vec!["before:函式".into()]), tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); @@ -2088,6 +2125,7 @@ mod tests { exceptions: None, positional_clues: Some(vec!["after:請".into()]), tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); @@ -2118,6 +2156,7 @@ mod tests { exceptions: 
None, positional_clues: Some(vec!["adjacent:函式".into()]), tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); @@ -2160,6 +2199,7 @@ mod tests { exceptions: None, positional_clues: Some(vec!["not_before:的".into()]), tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); @@ -2190,6 +2230,7 @@ mod tests { exceptions: None, positional_clues: Some(vec!["not_after:清單".into()]), tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); @@ -2220,6 +2261,7 @@ mod tests { exceptions: None, positional_clues: Some(vec!["before:函式".into()]), tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); @@ -2261,6 +2303,7 @@ mod tests { exceptions: None, positional_clues: Some(vec!["after:請".into(), "before:函式".into()]), tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); @@ -2301,6 +2344,7 @@ mod tests { exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); let issues = scanner.scan("這個軟件很好用").issues; @@ -2323,6 +2367,7 @@ mod tests { exceptions: None, positional_clues: Some(vec!["before:函式".into()]), tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); @@ -2353,6 +2398,7 @@ mod tests { exceptions: None, positional_clues: Some(vec!["after:請".into()]), tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); @@ -2379,6 +2425,7 @@ mod tests { exceptions: None, positional_clues: Some(vec!["before:函式".into()]), tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); @@ -2420,6 +2467,7 @@ mod tests { exceptions: None, positional_clues: Some(vec!["adjacent:函式".into()]), tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); diff --git a/src/engine/scan/rule_ir.rs b/src/engine/scan/rule_ir.rs index 
7f8505b..ff04c5b 100644 --- a/src/engine/scan/rule_ir.rs +++ b/src/engine/scan/rule_ir.rs @@ -146,6 +146,9 @@ pub struct CompiledSpellingDb { pub spelling_english: Vec>>, /// Pre-interned context clues per rule. Arc bump during inflation. pub spelling_context_clues: Vec>>, + /// Per-rule editorial confidence (35.2). Plain copy at inflation + /// time — `EditorialConfidence` is `Copy`, so no Arc needed. + pub spelling_editorial_confidence: Vec>, /// Per-rule positive clue IDs into the clue AC pattern list. #[allow(dead_code)] pub rule_pos_clue_ids: Vec>>, @@ -179,6 +182,7 @@ impl CompiledSpellingDb { spelling_contexts: Vec::new(), spelling_english: Vec::new(), spelling_context_clues: Vec::new(), + spelling_editorial_confidence: Vec::new(), rule_pos_clue_ids: Vec::new(), rule_neg_clue_ids: Vec::new(), rule_positional_clues: Vec::new(), @@ -610,6 +614,7 @@ fn inflate_spelling_issues_inner( issue.found = s.to_string(); } issue.suggestions = db.spelling_suggestions[idx].clone(); + issue.editorial_confidence = db.spelling_editorial_confidence[idx]; if !offset_only { issue.context.clone_from(&db.spelling_contexts[idx]); issue.english.clone_from(&db.spelling_english[idx]); @@ -748,6 +753,12 @@ pub fn compile_spelling_rules_filtered( .map(|r| r.context_clues.as_ref().map(|v| Arc::from(v.as_slice()))) .collect(); + let spelling_editorial_confidence: Vec> = + spelling_rules + .iter() + .map(|r| r.editorial_confidence) + .collect(); + // Build clue AC: intern all unique clue strings, map per-rule clue // lists to indices, build a bytewise AC for windowed lookups. 
let (clue_ac, mut rule_pos_clue_ids, mut rule_neg_clue_ids) = { @@ -1036,6 +1047,7 @@ pub fn compile_spelling_rules_filtered( spelling_contexts, spelling_english, spelling_context_clues, + spelling_editorial_confidence, rule_pos_clue_ids, rule_neg_clue_ids, rule_positional_clues, diff --git a/src/engine/scan/tests_generated.rs b/src/engine/scan/tests_generated.rs index fae5cb3..626c366 100644 --- a/src/engine/scan/tests_generated.rs +++ b/src/engine/scan/tests_generated.rs @@ -16,6 +16,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }, SpellingRule { from: "數據庫".into(), @@ -30,6 +31,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }, ]; let scanner = Scanner::new(rules, vec![]); @@ -63,6 +65,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }, SpellingRule { from: "軟件".into(), @@ -77,6 +80,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }, ]; let scanner = Scanner::new(rules, vec![]); @@ -104,6 +108,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }] } @@ -149,6 +154,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }, SpellingRule { from: "\u{201d}".into(), @@ -163,6 +169,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }, ] } @@ -243,6 +250,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); let issues = scanner.scan("ABA").issues; @@ -670,6 +678,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); let issues = scanner.scan("軟件,好用").issues; @@ -779,6 +788,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); let issues = scanner.scan("軟件, 
好用.").issues; @@ -1252,6 +1262,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }, SpellingRule { from: "\u{201d}".into(), @@ -1266,6 +1277,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }, ]; let scanner = Scanner::new(rules, vec![]); @@ -1292,6 +1304,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }, SpellingRule { from: "\u{201d}".into(), @@ -1306,6 +1319,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }, ]; let scanner = Scanner::new(rules, vec![]); @@ -1336,6 +1350,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }, SpellingRule { from: "\u{201d}".into(), @@ -1350,6 +1365,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }, ]; let scanner = Scanner::new(rules, vec![]); @@ -1611,6 +1627,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, } } @@ -1627,6 +1644,7 @@ exceptions: Some(exceptions.into_iter().map(String::from).collect()), positional_clues: None, tags: None, + editorial_confidence: None, } } @@ -1779,6 +1797,7 @@ exceptions: None, positional_clues: None, tags: None, + editorial_confidence: None, }, variant_rule("裏", "裡"), ]; @@ -2284,6 +2303,7 @@ negative_context_clues: Some(vec!["的".into(), "等".into()]), positional_clues: None, tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules.clone(), vec![]); @@ -2319,6 +2339,7 @@ negative_context_clues: Some(vec!["獨家".into()]), positional_clues: None, tags: None, + editorial_confidence: None, }]; let scanner = Scanner::new(rules, vec![]); let issues = scanner.scan("這個軟件很好用").issues; diff --git a/src/engine/segment.rs b/src/engine/segment.rs index b81a973..9d8ca32 100644 --- a/src/engine/segment.rs +++ b/src/engine/segment.rs @@ -1094,6 +1094,7 @@ mod tests { negative_context_clues: None, positional_clues: None, 
tags: None, + editorial_confidence: None, }]; let seg = Segmenter::from_rules(&rules); // Dict should contain "軟件", "軟體", and all stop words. @@ -1230,6 +1231,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }]; let seg = Segmenter::from_rules(&rules); // Stop word "的" must have freq=10. @@ -1337,6 +1339,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }]; let seg = Segmenter::from_rules(&rules); // General vocab words should be present. @@ -1366,6 +1369,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }]; let seg = Segmenter::from_rules(&rules); // "提供" and "處理" are general vocab; "數據" and "分析"/"處理" are rule terms. @@ -1390,6 +1394,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }]; let seg = Segmenter::from_rules(&rules); let tokens = seg.segment("目前提供的重要功能"); @@ -1421,6 +1426,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }]; let seg = Segmenter::from_rules(&rules); // Rule term "設計" inserted first with freq=1; general vocab uses diff --git a/src/engine/style_score.rs b/src/engine/style_score.rs index 852f719..ebb1d0b 100644 --- a/src/engine/style_score.rs +++ b/src/engine/style_score.rs @@ -153,10 +153,12 @@ mod tests { english: None, context_clues: None, anchor_match: None, + glossary_banned: false, tier2_outcome: Default::default(), llm_judged: false, spelling_rule_idx: None, table_cell: None, + editorial_confidence: None, } } diff --git a/src/main.rs b/src/main.rs index 5ac164c..825fcee 100644 --- a/src/main.rs +++ b/src/main.rs @@ -78,6 +78,8 @@ fn main() -> Result<()> { let mut dry_run = false; let mut explain = false; let mut relaxed = false; + let mut exempt_blockquotes = false; + let mut consistency = false; let mut detect_ai = false; 
let mut detect_translationese = false; // Emit composite three-axis scorecard when --detect-style is used @@ -166,6 +168,12 @@ fn main() -> Result<()> { "--relaxed" => { relaxed = true; } + "--exempt-blockquotes" => { + exempt_blockquotes = true; + } + "--consistency" => { + consistency = true; + } "--content-type" => { i += 1; let ct = args.get(i).context("--content-type requires a value")?; @@ -540,6 +548,12 @@ fn main() -> Result<()> { .or_else(|| cfg_ref.and_then(|c| c.profile.as_deref())); // CLI --relaxed flag overrides config file relaxed setting. let eff_relaxed = relaxed || cfg_ref.and_then(|c| c.relaxed).unwrap_or(false); + // CLI --exempt-blockquotes flag OR `[markdown] exempt_blockquotes`. + let eff_exempt_blockquotes = exempt_blockquotes + || cfg_ref + .and_then(|c| c.markdown.as_ref()) + .and_then(|m| m.exempt_blockquotes) + .unwrap_or(false); let eff_content_type = content_type_str .as_deref() .or_else(|| cfg_ref.and_then(|c| c.content_type.as_deref())); @@ -574,6 +588,16 @@ fn main() -> Result<()> { zhtw_mcp::rules::store::discover_tm_path(&cwd) }); + // Build project glossary from `[glossary]` section. + let eff_glossary = cfg_ref + .and_then(|c| c.glossary.as_ref()) + .map(|g| zhtw_mcp::rules::glossary::ProjectGlossary { + banned: g.banned.clone().unwrap_or_default(), + preferred: g.preferred.clone().unwrap_or_default(), + proper_nouns: g.proper_nouns.clone().unwrap_or_default(), + }) + .unwrap_or_default(); + if detect_style && !matches!(lint_format, LintFormat::Json) { anyhow::bail!("--detect-style is only supported with --format json"); } @@ -598,12 +622,15 @@ fn main() -> Result<()> { #[cfg(feature = "translate")] verify, relaxed: eff_relaxed, + exempt_blockquotes: eff_exempt_blockquotes, detect_ai, detect_translationese, detect_style, translationese_domain, ai_threshold_multiplier, tm_path: Some(eff_tm_path), + glossary: eff_glossary, + consistency, telemetry, }); } @@ -698,6 +725,10 @@ struct CliFileOutput { /// is active. 
#[serde(skip_serializing_if = "Option::is_none")] style_scorecard: Option, + /// Document-wide consistency report (35.1). Present only when + /// --consistency is set AND mixed regional usage is detected. + #[serde(skip_serializing_if = "Option::is_none")] + consistency: Option, } #[derive(serde::Serialize)] @@ -798,6 +829,7 @@ struct LintBatchParams<'a> { #[cfg(feature = "translate")] verify: bool, relaxed: bool, + exempt_blockquotes: bool, detect_ai: bool, detect_translationese: bool, /// Emit composite three-axis style scorecard alongside the per-axis @@ -808,6 +840,15 @@ struct LintBatchParams<'a> { translationese_domain: zhtw_mcp::engine::translationese_score::TranslationeseDomain, ai_threshold_multiplier: f32, tm_path: Option, + /// Project glossary (`[glossary]` section in `.zhtw-mcp.toml`). + /// Applied as a post-scan step: `proper_nouns` suppress matching + /// issues, `banned` injects synthetic Error issues for any + /// occurrence the embedded ruleset missed. + glossary: zhtw_mcp::rules::glossary::ProjectGlossary, + /// When true, append a `consistency` block to JSON output (35.1): + /// per-equivalence-class diagnostic when both the calque and the + /// canonical TW form appear in the same document. 
+ consistency: bool, telemetry: bool, } @@ -825,6 +866,9 @@ fn run_lint_batch(params: &LintBatchParams<'_>) -> Result<()> { if params.relaxed { cfg = cfg.with_relaxed(); } + if params.exempt_blockquotes { + cfg = cfg.with_exempt_blockquotes(true); + } if params.detect_ai { cfg.ai_filler_detection = true; cfg.ai_semantic_safety = true; @@ -910,6 +954,39 @@ fn run_lint_batch(params: &LintBatchParams<'_>) -> Result<()> { let mut baseline_count: usize = 0; let mut tabular_header_printed = false; + let apply_glossary_to_issues = + |work_text: &str, + content_type: zhtw_mcp::engine::scan::ContentType, + issues: Vec| { + if params.glossary.is_empty() { + return issues; + } + let md_opts = zhtw_mcp::engine::markdown::MdScanOptions::new( + matches!( + content_type, + zhtw_mcp::engine::scan::ContentType::MarkdownScanCode + ), + cfg.exempt_blockquotes, + ); + let excluded = zhtw_mcp::engine::scan::build_exclusions_for_content_type_with_options( + work_text, + content_type, + md_opts, + ); + let mut issues = zhtw_mcp::rules::glossary::apply_glossary( + work_text, + &excluded, + issues, + &params.glossary, + ); + let line_index = zhtw_mcp::engine::lineindex::LineIndex::new(work_text); + line_index.fill_line_col_sorted( + &mut issues, + zhtw_mcp::engine::lineindex::ColumnEncoding::Utf16, + ); + issues + }; + /// Maximum file size for CLI lint mode (16 MiB). const MAX_CLI_FILE_BYTES: u64 = 16 * 1024 * 1024; @@ -951,6 +1028,7 @@ fn run_lint_batch(params: &LintBatchParams<'_>) -> Result<()> { detect_translationese: cfg.translationese_detection, translationese_domain: cfg.translationese_domain.name().to_owned(), ai_threshold: format!("{:.1}", params.ai_threshold_multiplier), + exempt_blockquotes: cfg.exempt_blockquotes, }; // Open file via fd, stat from the fd (TOCTOU-safe).
@@ -974,9 +1052,27 @@ fn run_lint_batch(params: &LintBatchParams<'_>) -> Result<()> { c.check_fast(file_arg, mtime, meta.len(), &cache_params) .into_hit() }); + // Glossary banned-term injection and the consistency report + // both scan the original text buffer; the fast path can + // only short-circuit when neither feature needs it. Same + // story for fix/SC/verify. + let need_text_post_scan = params.fix_mode != zhtw_mcp::fixer::FixMode::None + || !params.glossary.is_empty() + || params.consistency + || { + #[cfg(feature = "translate")] + { + params.verify + } + #[cfg(not(feature = "translate"))] + { + false + } + }; if let Some(hit) = fast_hit { - if !hit.input_was_sc { - // Cache hit for non-SC file — skip file read and scan. + if !hit.input_was_sc && !need_text_post_scan { + // Cache hit AND no later phase needs the text: + // skip file read and scan. return Ok(( String::new(), false, @@ -985,7 +1081,10 @@ fn run_lint_batch(params: &LintBatchParams<'_>) -> Result<()> { content_type, )); } - // SC files need the text for S2T write-back; fall through. + // SC files need the text for S2T write-back; glossary + // / consistency / fix / verify need the original + // buffer. Fall through to the slow path so we read + // the file and reuse the cached scan output below. } // Slow path: read file from the same fd. @@ -1030,16 +1129,22 @@ fn run_lint_batch(params: &LintBatchParams<'_>) -> Result<()> { // Drop text eagerly when not needed for fix/write-back/verify // to avoid accumulating all files' text in parallel scans. - let need_text = input_was_sc || params.fix_mode != zhtw_mcp::fixer::FixMode::None || { - #[cfg(feature = "translate")] - { - params.verify - } - #[cfg(not(feature = "translate"))] - { - false - } - }; + // Project glossary banned-term scanning and the 35.1 + // consistency report both scan the original buffer. 
+ let need_text = input_was_sc + || params.fix_mode != zhtw_mcp::fixer::FixMode::None + || !params.glossary.is_empty() + || params.consistency + || { + #[cfg(feature = "translate")] + { + params.verify + } + #[cfg(not(feature = "translate"))] + { + false + } + }; if !need_text { text = String::new(); } @@ -1102,6 +1207,14 @@ fn run_lint_batch(params: &LintBatchParams<'_>) -> Result<()> { let mut translationese_signature = output.translationese_signature; let mut issues = output.issues; + // 35.9 — Apply project glossary precedence (proper_noun + // suppression + banned-term injection) before disambiguation, + // so the rest of the pipeline sees the canonical issue list. + // Synthetic banned-term issues land with `line: 0, col: 0` from + // `Issue::new`; reapply LineIndex so output formatters and the + // 35.1 consistency report see correct coordinates. + issues = apply_glossary_to_issues(&text, content_type, issues); + // Tier 2: local disambiguation. let disambig_cfg = zhtw_mcp::engine::disambig::DisambigConfig { profile, @@ -1198,7 +1311,7 @@ fn run_lint_batch(params: &LintBatchParams<'_>) -> Result<()> { // Suppress convergent-chain noise from the fixer's own replacements. zhtw_mcp::fixer::suppress_convergent_issues(&mut rescan, &fix.applied_fixes); } - rescan + apply_glossary_to_issues(rescan_text, content_type, rescan) } else { issues }; @@ -1227,6 +1340,8 @@ fn run_lint_batch(params: &LintBatchParams<'_>) -> Result<()> { // Apply TM suppressions: downgrade rejected terms to Info severity. // Only lexical/contextual issue types; orthographic types are immune. + // Glossary-banned issues (35.9 precedence: banned > TM) are also + // immune — the project explicitly asked for these to always fire. 
let mut tm_suppressed: usize = 0; let report_issues = if let Some(ref tm) = tm_store { let mut issues = report_issues; @@ -1239,6 +1354,10 @@ fn run_lint_batch(params: &LintBatchParams<'_>) -> Result<()> { | zhtw_mcp::rules::ruleset::IssueType::AiStyle => continue, _ => {} } + let is_glossary_banned = zhtw_mcp::rules::glossary::is_glossary_banned(issue); + if is_glossary_banned { + continue; + } if tm.should_suppress(&issue.found) && issue.severity != zhtw_mcp::rules::ruleset::Severity::Info { @@ -1336,6 +1455,23 @@ fn run_lint_batch(params: &LintBatchParams<'_>) -> Result<()> { } else { None }, + consistency: params + .consistency + .then(|| { + let consistency_text = if has_text_changes && !params.dry_run { + fix_result + .as_ref() + .map_or(text.as_str(), |f| f.text.as_str()) + } else { + text.as_str() + }; + zhtw_mcp::engine::consistency::compute_consistency_report( + consistency_text, + &report_issues, + &params.glossary, + ) + }) + .filter(|r| !r.is_empty()), }; if multi { all_file_results.push(output); diff --git a/src/mcp/resources.rs b/src/mcp/resources.rs index 20ee9a1..a998878 100644 --- a/src/mcp/resources.rs +++ b/src/mcp/resources.rs @@ -182,6 +182,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }, SpellingRule { from: "軟件".into(), @@ -196,6 +197,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }, ]; diff --git a/src/mcp/tools.rs b/src/mcp/tools.rs index 4f43f48..8d1b521 100644 --- a/src/mcp/tools.rs +++ b/src/mcp/tools.rs @@ -23,9 +23,7 @@ use crate::audit::Trace; use crate::engine::disambig::{disambiguate_batch, DisambigConfig, DisambigStats}; use crate::engine::excluded::ByteRange; use crate::engine::s2t::S2TConverter; -use crate::engine::scan::{ - build_exclusions_for_content_type, is_spaced_acronym_issue, ContentType, Scanner, -}; +use crate::engine::scan::{is_spaced_acronym_issue, ContentType, Scanner}; #[cfg(feature =
"translate")] use crate::engine::translate::calibrate_issues; use crate::engine::zhtype::{detect_chinese_type, ChineseType}; @@ -410,6 +408,18 @@ impl Server { .and_then(|v| v.as_bool()) .unwrap_or(false); + let exempt_blockquotes = args + .get("exempt_blockquotes") + .and_then(|v| v.as_bool()) + .unwrap_or(false); + + let glossary = parse_glossary(args); + + let consistency_requested = args + .get("consistency") + .and_then(|v| v.as_bool()) + .unwrap_or(false); + let include_telemetry = args .get("include_telemetry") .and_then(|v| v.as_bool()) @@ -452,6 +462,9 @@ impl Server { if relaxed { cfg = cfg.with_relaxed(); } + if exempt_blockquotes { + cfg = cfg.with_exempt_blockquotes(true); + } if let Some(st) = stance { cfg = cfg.with_stance(st); } @@ -506,6 +519,30 @@ impl Server { }; } + let apply_glossary_to_issues = |work_text: &str, issues: Vec| -> Vec { + if glossary.is_empty() { + return issues; + } + let md_opts = crate::engine::markdown::MdScanOptions::new( + matches!( + content_type, + crate::engine::scan::ContentType::MarkdownScanCode + ), + cfg.exempt_blockquotes, + ); + let excluded = crate::engine::scan::build_exclusions_for_content_type_with_options( + work_text, + content_type, + md_opts, + ); + let mut issues = + crate::rules::glossary::apply_glossary(work_text, &excluded, issues, &glossary); + let line_index = crate::engine::lineindex::LineIndex::new(work_text); + line_index + .fill_line_col_sorted(&mut issues, crate::engine::lineindex::ColumnEncoding::Utf16); + issues + }; + Ok(match fix_mode { FixMode::None => { // Lint-only path. @@ -559,6 +596,22 @@ impl Server { let tm_suppressed = self.apply_tm(&mut issues); apply_ignore_set(&mut issues, &ignore_set); + // 35.9 — Apply project glossary precedence (banned > TM): + // proper_nouns suppress, banned inject synthetic Errors. + // Recompute exclusion ranges so banned-term scanning + // respects code blocks, URLs, and frontmatter. Re-fill + // line/col on synthetic issues. 
+ issues = apply_glossary_to_issues(text, issues); + + // 35.1 — Document-wide consistency report. + let consistency_report = consistency_requested + .then(|| { + crate::engine::consistency::compute_consistency_report( + text, &issues, &glossary, + ) + }) + .filter(|r| !r.is_empty()); + // Build telemetry if requested. let telemetry = if include_telemetry { Some(build_telemetry( @@ -623,12 +676,24 @@ impl Server { disambig_stats, telemetry, include_stats, + consistency: consistency_report.as_ref(), }) } mode @ (FixMode::Orthographic | FixMode::LexicalSafe | FixMode::LexicalContextual) => { // Fix path: scan, apply fixes, re-scan for residual issues. - let excluded = build_exclusions_for_content_type(text, content_type); + let md_opts = crate::engine::markdown::MdScanOptions::new( + matches!( + content_type, + crate::engine::scan::ContentType::MarkdownScanCode + ), + cfg.exempt_blockquotes, + ); + let excluded = crate::engine::scan::build_exclusions_for_content_type_with_options( + text, + content_type, + md_opts, + ); let scan_out = self.scanner.scan_with_prebuilt_excluded_config( text, &excluded, @@ -679,6 +744,7 @@ impl Server { // prevents fixing TM-rejected terms, and the post-fix apply_tm // handles severity downgrade + counting on the final residual. apply_ignore_set(&mut issues, &ignore_set); + issues = apply_glossary_to_issues(text, issues); // Snapshot AFTER suppressions so restored severity reflects final state. struct PreservedState { @@ -784,10 +850,22 @@ impl Server { // whose offset falls within a byte range written by the fixer. suppress_convergent_issues(&mut remaining_issues, &fix_result.applied_fixes); + remaining_issues = apply_glossary_to_issues(&fix_result.text, remaining_issues); + // Apply TM after preserved state restoration so the count // reflects the true final state, not a pre-fix snapshot. 
let tm_suppressed = self.apply_tm(&mut remaining_issues); + let consistency_report = consistency_requested + .then(|| { + crate::engine::consistency::compute_consistency_report( + &fix_result.text, + &remaining_issues, + &glossary, + ) + }) + .filter(|r| !r.is_empty()); + // Build telemetry if requested. let telemetry = if include_telemetry { Some(build_telemetry( @@ -853,6 +931,7 @@ impl Server { disambig_stats, telemetry, include_stats, + consistency: consistency_report.as_ref(), }) } }) @@ -885,6 +964,15 @@ impl Server { | IssueType::AiStyle => continue, _ => {} } + // Glossary-banned terms are project-wide truth (banned > TM + // per the documented precedence). The provenance tag is + // set by `apply_glossary` either by injecting a synthetic + // Error or by upgrading a covering issue; either way TM + // must not downgrade these. + let is_glossary_banned = crate::rules::glossary::is_glossary_banned(issue); + if is_glossary_banned { + continue; + } if tm.should_suppress(&issue.found) && issue.severity != Severity::Info { issue.severity = Severity::Info; count += 1; @@ -996,9 +1084,12 @@ fn zhtw_known_params() -> &'static [&'static str] { "max_warnings", "profile", "relaxed", + "exempt_blockquotes", "content_type", "political_stance", "ignore_terms", + "glossary", + "consistency", "explain", "fix_output", "verify", @@ -1021,9 +1112,12 @@ fn zhtw_known_params() -> &'static [&'static str] { "max_warnings", "profile", "relaxed", + "exempt_blockquotes", "content_type", "political_stance", "ignore_terms", + "glossary", + "consistency", "explain", "fix_output", "output", @@ -1522,6 +1616,29 @@ fn parse_ignore_terms(args: &Value) -> Vec { .unwrap_or_default() } +/// Parse the optional `glossary` object (35.9). Shape: +/// `{ "banned": [...], "preferred": [...], "proper_nouns": [...] }`. +/// Each field is optional. Missing object → empty glossary. 
+fn parse_glossary(args: &Value) -> crate::rules::glossary::ProjectGlossary { + fn array_of_strings(v: Option<&Value>) -> Vec<String> { + v.and_then(|v| v.as_array()) + .map(|arr| { + arr.iter() + .filter_map(|v| v.as_str().map(|s| s.to_string())) + .collect() + }) + .unwrap_or_default() + } + let Some(glossary) = args.get("glossary").and_then(|v| v.as_object()) else { + return crate::rules::glossary::ProjectGlossary::default(); + }; + crate::rules::glossary::ProjectGlossary { + banned: array_of_strings(glossary.get("banned")), + preferred: array_of_strings(glossary.get("preferred")), + proper_nouns: array_of_strings(glossary.get("proper_nouns")), + } +} + /// Remove political_coloring issues that the given stance suppresses. fn filter_by_stance(issues: &mut Vec<Issue>, stance: PoliticalStance) { issues.retain(|issue| { @@ -1653,6 +1770,136 @@ struct AnchorProvenance<'a> { anchor_match: Option<bool>, } +// `EditorialConfidence` is defined canonically in `crate::rules::ruleset` +// so that `SpellingRule.editorial_confidence` and the per-issue field +// share a single type. Imported here for the explain pipeline. +use crate::rules::ruleset::EditorialConfidence; + +/// Structured per-issue explain metadata (35.2). +/// +/// Surfaced only when `explain` is requested. Helps reviewers understand +/// the confidence behind each suggestion without parsing free-form prose. +#[derive(Serialize)] +struct ExplainMeta<'a> { + /// Why this is flagged. Sourced from rule context + MoE refs when + /// available; falls back to a structured restatement of the + /// suggestion target. + rationale: String, + /// Domain that triggered the rule. Parsed from `@domain X` markers + /// in the rule's context field; omitted when no marker is present. + #[serde(skip_serializing_if = "Option::is_none")] + domain: Option<&'a str>, + /// True when the surface form is identical across zh-CN and zh-TW + /// but the meaning differs (e.g.
+ is_false_friend: bool, + /// Whether `--fix` would safely apply this suggestion. + auto_fix_safe: bool, + /// Whether the suggestion benefits from manual review. + needs_review: bool, + /// Per-issue editorial confidence — distinguishes binary corrections + /// from style preferences (e.g. 優化, 算法 are valid zh-TW general + /// vocabulary; 演算法 is the canonical computing form). + editorial_confidence: EditorialConfidence, +} + +/// Heuristic fallback when an issue lacks a rule-level +/// `editorial_confidence`. Translationese / AI-style / grammar hits and +/// any `Info`-severity or anchor-rejected issue are surfaced as `Low`; +/// hits with explicit context support climb to `Medium`; everything else +/// is `High`. +fn heuristic_editorial_confidence(issue: &Issue) -> EditorialConfidence { + use crate::rules::ruleset::{IssueType, Severity}; + + let always_low = matches!( + issue.rule_type, + IssueType::Translationese | IssueType::AiStyle | IssueType::Grammar + ) || issue.severity == Severity::Info + || issue.anchor_match == Some(false); + if always_low { + return EditorialConfidence::Low; + } + if issue.context_clues.is_some() || issue.anchor_match == Some(true) { + EditorialConfidence::Medium + } else { + EditorialConfidence::High + } +} + +/// Derive structured explain metadata for an issue. +/// +/// Confidence resolution order: +/// 1. Honor `issue.editorial_confidence` if the rule annotated it +/// (set in `assets/ruleset.json` per-rule). +/// 2. Otherwise, fall back to heuristics on rule type / severity / +/// anchor_match / context_clues. +/// +/// Invariants: `editorial_confidence == Low` ⇒ `auto_fix_safe = false` +/// AND `needs_review = true`. +fn derive_explain_meta(issue: &Issue) -> ExplainMeta<'_> { + use crate::rules::ruleset::IssueType; + + // -- Domain extraction from `@domain X` markers in the rule context. 
+ let domain = issue.context.as_deref().and_then(|c| { + let needle = "@domain "; + c.find(needle).map(|i| { + let rest = &c[i + needle.len()..]; + // Take up to the first whitespace, full-width comma, or period. + let end = rest + .find(|c: char| c.is_whitespace() || c == '\u{FF0C}' || c == '\u{3002}') + .unwrap_or(rest.len()); + rest[..end].trim() + }) + }); + + // -- Editorial confidence. + // Rule-level annotation wins (from assets/ruleset.json + // `editorial_confidence`); else heuristics on rule type / severity / + // anchor_match / context_clues. + let editorial_confidence = issue + .editorial_confidence + .unwrap_or_else(|| heuristic_editorial_confidence(issue)); + + // -- False-friend detection. + // Confusable rules are the canonical false friends. Rule-tagged + // low-confidence terms are also surfaced as false friends because + // their surface form is shared across regions with divergent senses. + let is_false_friend = matches!(issue.rule_type, IssueType::Confusable) + || matches!(editorial_confidence, EditorialConfidence::Low) + && issue.editorial_confidence.is_some(); + + // -- Auto-fix safety + review need. + // Invariant: `low` confidence forces auto_fix_safe=false + + // needs_review=true. Otherwise punctuation / case / variant / typo + // hits with a single suggestion are auto-fix safe. 
+ let single_unambiguous = issue.suggestions.len() == 1 + && matches!( + issue.rule_type, + IssueType::Punctuation | IssueType::Case | IssueType::Variant | IssueType::Typo + ); + + let auto_fix_safe = + !matches!(editorial_confidence, EditorialConfidence::Low) && single_unambiguous; + + let needs_review = matches!(editorial_confidence, EditorialConfidence::Low) + || issue.suggestions.len() > 1 + || matches!( + issue.rule_type, + IssueType::Translationese | IssueType::AiStyle | IssueType::Grammar + ); + + let rationale = build_explanation(issue) + .unwrap_or_else(|| format!("'{}' flagged by {:?} rule.", issue.found, issue.rule_type)); + + ExplainMeta { + rationale, + domain, + is_false_friend, + auto_fix_safe, + needs_review, + editorial_confidence, + } +} + /// Anchor provenance for compact mode (owned). #[derive(Serialize)] struct AnchorProvenanceOwned { @@ -1671,6 +1918,11 @@ struct AnnotatedIssue<'a> { explanation: Option, #[serde(skip_serializing_if = "Option::is_none")] anchor_provenance: Option>, + /// Structured per-issue explain metadata (35.2). Present only in + /// explain mode. Carries domain, false-friend flag, auto-fix + /// safety, review burden, and editorial confidence. + #[serde(skip_serializing_if = "Option::is_none")] + explain_meta: Option>, /// Resolution tier: which pipeline stage authored this issue's resolution. /// Present only when `include_stats` is true. #[serde(skip_serializing_if = "Option::is_none")] @@ -1743,6 +1995,11 @@ struct FullOutput<'a> { telemetry: Option<&'a TelemetryMetrics>, #[serde(skip_serializing_if = "Option::is_none")] summary_metrics: Option<&'a SummaryMetrics>, + /// Document-wide consistency report (35.1). Present only when the + /// caller passed `consistency: true` AND mixed regional usage + /// (both `線程` and `執行緒`, etc.) is detected in the document. 
+ #[serde(skip_serializing_if = "Option::is_none")] + consistency: Option<&'a crate::engine::consistency::ConsistencyReport>, } /// Compact tool response (serialized directly, no intermediate Value). @@ -1885,6 +2142,10 @@ struct CheckOutputParams<'a> { telemetry: Option, /// Whether to include per-issue resolution tier and summary_metrics. include_stats: bool, + /// Document-wide consistency report (35.1). Some only when the + /// caller requested `consistency: true` AND mixed regional usage + /// is detected. + consistency: Option<&'a crate::engine::consistency::ConsistencyReport>, } /// Build telemetry metrics from accumulated counters. @@ -2043,6 +2304,7 @@ fn build_check_output(params: &CheckOutputParams<'_>) -> CallToolResult { style_scorecard: params.style_scorecard, telemetry: params.telemetry.as_ref(), summary_metrics: stats_metrics.as_ref(), + consistency: params.consistency, }; serialize_output(&output) } @@ -2163,10 +2425,16 @@ fn build_issues_list<'a>( } else { None }; + let explain_meta = if explain { + Some(derive_explain_meta(issue)) + } else { + None + }; AnnotatedIssue { issue, explanation, anchor_provenance, + explain_meta, resolution, } }) @@ -2568,6 +2836,10 @@ fn tool_definitions() -> Vec { "type": "boolean", "description": "Capability flag for software UI strings: disables colon enforcement, dunhao detection, grammar checks; uses en-dash for ranges" })); + props.insert("exempt_blockquotes".into(), json!({ + "type": "boolean", + "description": "Markdown only: exclude pulldown-cmark `Tag::BlockQuote` ranges from scanning. Useful when a document quotes mainland-Chinese sources for illustrative purposes. Off by default." 
+ })); props.insert("content_type".into(), json!({ "type": "string", "enum": ["plain", "markdown", "markdown-scan-code", "yaml"] @@ -2580,6 +2852,19 @@ fn tool_definitions() -> Vec { "type": "array", "items": { "type": "string" } })); + props.insert("glossary".into(), json!({ + "type": "object", + "description": "Project-level glossary. `banned` terms always fire (project-wide truth, banned > TM); `proper_nouns` suppress matching issues; `preferred` chooses canonical TW form for the consistency report.", + "properties": { + "banned": { "type": "array", "items": { "type": "string" } }, + "preferred": { "type": "array", "items": { "type": "string" } }, + "proper_nouns": { "type": "array", "items": { "type": "string" } }, + } + })); + props.insert("consistency".into(), json!({ + "type": "boolean", + "description": "Emit a `consistency` block when both regional variants of one concept appear in the document (e.g. both 線程 and 執行緒). Off by default." + })); props.insert("explain".into(), json!({ "type": "boolean" })); props.insert("fix_output".into(), json!({ "type": "string", @@ -2647,6 +2932,152 @@ mod tests { use crate::mcp::types::RequestId; use crate::rules::ruleset::Tier2Outcome; + /// 35.2 — high confidence: cross_strait without context_clues, single + /// suggestion. Auto-fix safety still gated on rule_type being one of + /// the unambiguous classes (Punctuation/Case/Variant/Typo); a plain + /// CrossStrait keeps `auto_fix_safe=false` because the choice between + /// suggestions is editorial. 
+ #[test] + fn explain_meta_high_confidence_for_unambiguous_cross_strait() { + let mut issue = Issue::new( + 0, + 6, + "線程", + vec!["執行緒".into()], + IssueType::CrossStrait, + Severity::Warning, + ); + issue.english = Some(std::sync::Arc::from("thread")); + let meta = derive_explain_meta(&issue); + assert!(matches!( + meta.editorial_confidence, + EditorialConfidence::High + )); + assert!(!meta.is_false_friend); + assert!(!meta.needs_review); + } + + /// 35.2 — rule-tagged low confidence (e.g. `優化`, `算法`, `場景` + /// in `assets/ruleset.json`) surfaces as `low` so reviewers know + /// they are editorial preference, not binary error. Invariant: + /// low ⇒ auto_fix_safe=false AND needs_review=true. + #[test] + fn explain_meta_low_confidence_for_rule_tagged_boundary_terms() { + for boundary in &["優化", "算法", "場景"] { + let mut issue = Issue::new( + 0, + boundary.len(), + *boundary, + vec!["演算法".into()], + IssueType::CrossStrait, + Severity::Warning, + ); + issue.editorial_confidence = Some(EditorialConfidence::Low); + let meta = derive_explain_meta(&issue); + assert!( + matches!(meta.editorial_confidence, EditorialConfidence::Low), + "rule-tagged {boundary} must be low confidence", + ); + assert!( + !meta.auto_fix_safe, + "rule-tagged {boundary}: low ⇒ !auto_fix_safe" + ); + assert!( + meta.needs_review, + "rule-tagged {boundary}: low ⇒ needs_review" + ); + assert!( + meta.is_false_friend, + "rule-tagged {boundary}: marked false friend" + ); + } + } + + /// 35.2 — `@domain X` extraction populates the `domain` field. + #[test] + fn explain_meta_extracts_domain_from_context() { + let mut issue = Issue::new( + 0, + 6, + "用戶", + vec!["使用者".into()], + IssueType::CrossStrait, + Severity::Warning, + ); + issue.context = Some(std::sync::Arc::from("@domain IT。其他註解")); + let meta = derive_explain_meta(&issue); + assert_eq!(meta.domain, Some("IT")); + } + + /// 35.2 — Translationese / AiStyle / Grammar always demand review. 
+ #[test] + fn explain_meta_translationese_marks_low_confidence() { + let issue = Issue::new( + 0, + 3, + "被", + vec!["主動句".into()], + IssueType::Translationese, + Severity::Info, + ); + let meta = derive_explain_meta(&issue); + assert!(matches!( + meta.editorial_confidence, + EditorialConfidence::Low + )); + assert!(!meta.auto_fix_safe); + assert!(meta.needs_review); + } + + /// 35.9 — `parse_glossary` extracts banned/preferred/proper_nouns + /// from the tool args object. + #[test] + fn parse_glossary_extracts_three_lists() { + let args = serde_json::json!({ + "glossary": { + "banned": ["線程", "內存"], + "preferred": ["執行緒"], + "proper_nouns": ["TSMC"], + } + }); + let g = parse_glossary(&args); + assert_eq!(g.banned, vec!["線程".to_string(), "內存".to_string()]); + assert_eq!(g.preferred, vec!["執行緒".to_string()]); + assert_eq!(g.proper_nouns, vec!["TSMC".to_string()]); + } + + #[test] + fn parse_glossary_missing_object_returns_empty() { + let args = serde_json::json!({}); + let g = parse_glossary(&args); + assert!(g.is_empty()); + } + + #[test] + fn parse_glossary_partial_fields_default_to_empty() { + let args = serde_json::json!({"glossary": {"banned": ["X"]}}); + let g = parse_glossary(&args); + assert_eq!(g.banned, vec!["X".to_string()]); + assert!(g.preferred.is_empty()); + assert!(g.proper_nouns.is_empty()); + } + + /// 35.2 — Punctuation with single suggestion is auto-fix safe. 
+ #[test] + fn explain_meta_punctuation_is_auto_fix_safe() { + let issue = Issue::new( + 0, + 1, + ",", + vec![",".into()], + IssueType::Punctuation, + Severity::Warning, + ); + let meta = derive_explain_meta(&issue); + assert!(meta.auto_fix_safe); + assert!(!meta.needs_review); + } + #[test] fn issue_summary_omits_zero_sampling_fields() { let summary = IssueSummary { @@ -3054,9 +3485,12 @@ mod tests { "max_warnings", "profile", "relaxed", + "exempt_blockquotes", "content_type", "political_stance", "ignore_terms", + "glossary", + "consistency", "explain", "fix_output", "output", @@ -3229,6 +3663,89 @@ mod tests { assert_eq!(output["accepted"], true); } + #[test] + fn tools_call_fix_respects_exempt_blockquotes() { + let (mut server, _dir) = make_initialized_server(); + let text = "> 用戶輸入需要驗證。\n"; + let resp = call_zhtw( + &mut server, + serde_json::json!({ + "text": text, + "content_type": "markdown", + "exempt_blockquotes": true, + "fix_mode": "lexical_safe" + }), + ); + let output = assert_tool_success(&resp); + assert_eq!(output["text"], text); + let issues = output["issues"].as_array().expect("issues array"); + assert!( + !issues.iter().any(|i| i["found"] == "用戶"), + "blockquote text must stay exempt on MCP fix path; got {issues:?}" + ); + } + + #[test] + fn tools_call_fix_honors_glossary_banned_terms() { + let (mut server, _dir) = make_initialized_server(); + let resp = call_zhtw( + &mut server, + serde_json::json!({ + "text": "ABC 不該出現在文件中。\n", + "fix_mode": "lexical_safe", + "glossary": { "banned": ["ABC"] } + }), + ); + let output = assert_tool_success(&resp); + let issues = output["issues"].as_array().expect("issues array"); + assert!( + issues.iter().any(|i| i["found"] == "ABC"), + "glossary banned terms must remain active on fix path; got {issues:?}" + ); + } + + #[test] + fn tools_call_fix_honors_glossary_proper_nouns() { + let (mut server, _dir) = make_initialized_server(); + let text = "我們的線程實作。\n"; + let resp = call_zhtw( + &mut server, + 
serde_json::json!({ + "text": text, + "fix_mode": "lexical_safe", + "glossary": { "proper_nouns": ["線程"] } + }), + ); + let output = assert_tool_success(&resp); + assert_eq!(output["text"], text); + let issues = output["issues"].as_array().expect("issues array"); + assert!( + !issues.iter().any(|i| i["found"] == "線程"), + "proper_nouns must suppress fix-path issues; got {issues:?}" + ); + } + + #[test] + fn tools_call_fix_returns_consistency_report() { + let (mut server, _dir) = make_initialized_server(); + let resp = call_zhtw( + &mut server, + serde_json::json!({ + "text": "我們的線程太慢,需要重構執行緒。\n", + "fix_mode": "orthographic", + "consistency": true + }), + ); + let output = assert_tool_success(&resp); + let groups = output["consistency"]["groups"] + .as_array() + .expect("consistency groups"); + assert!( + groups.iter().any(|g| g["term_group"] == "thread"), + "fix-path consistency report must be returned; got {groups:?}" + ); + } + #[test] fn tools_call_set_invalid_content_type() { let (mut server, _dir) = make_initialized_server(); diff --git a/src/rules/glossary.rs b/src/rules/glossary.rs new file mode 100644 index 0000000..7ef7bcf --- /dev/null +++ b/src/rules/glossary.rs @@ -0,0 +1,242 @@ +// Project-level glossary (35.9) — `banned`, `preferred`, `proper_nouns`. +// +// Layered above the embedded ruleset and pack store but below banned-term +// enforcement and translation memory. Precedence per TODO 35.9: +// glossary `banned` > TM > glossary `preferred` > domain pack > embedded +// ruleset. +// +// `banned` — terms that must always fire, regardless of context_clues. +// `preferred` — TW forms used by 35.1 to choose the canonical suggestion. +// `proper_nouns` — never flag (added to the suppression list). + +use crate::engine::excluded::{is_excluded, ByteRange}; +use crate::rules::ruleset::{Issue, IssueType, Severity}; + +/// Runtime glossary used by the scan post-processor. 
+#[derive(Debug, Default, Clone)] +pub struct ProjectGlossary { + pub banned: Vec<String>, + pub preferred: Vec<String>, + pub proper_nouns: Vec<String>, +} + +impl ProjectGlossary { + pub fn is_empty(&self) -> bool { + self.banned.is_empty() && self.preferred.is_empty() && self.proper_nouns.is_empty() + } +} + +/// Mark an issue as protected by glossary banned-term precedence. +pub fn mark_glossary_banned(issue: &mut Issue) { + issue.glossary_banned = true; +} + +/// Per TODO 35.9 precedence (banned > TM), TM must NOT downgrade these. +pub fn is_glossary_banned(issue: &Issue) -> bool { + issue.glossary_banned +} + +/// Apply glossary precedence to a freshly-scanned issue list. +/// +/// 1. Suppress any issue whose `found` text exactly matches a proper noun +/// (highest-priority suppression after TM, lowest noise to authors). +/// 2. Inject a synthetic CrossStrait `Error` for each occurrence of a +/// banned term that the embedded ruleset failed to flag (e.g. because +/// `context_clues` didn't match) AND whose offset falls outside any +/// `excluded` range. Banned-term enforcement respects code blocks, +/// URLs, suppression markers, and YAML frontmatter exclusions just +/// like regular rules do. Banned-term enforcement is project-wide +/// truth: the author asked for these to always fire. +/// +/// Returns the modified issue list, sorted by offset. Synthetic +/// banned-term issues carry `line: 0, col: 0` — callers must run +/// [LineIndex::fill_line_col_sorted] before reporting. +pub fn apply_glossary( + text: &str, + excluded: &[ByteRange], + mut issues: Vec<Issue>, + glossary: &ProjectGlossary, +) -> Vec<Issue> { + if glossary.is_empty() { + return issues; + } + + // -- (1) Proper-noun suppression. + if !glossary.proper_nouns.is_empty() { + issues.retain(|i| !glossary.proper_nouns.iter().any(|pn| pn == &i.found)); + } + + // -- (2) Banned-term injection. For each occurrence: + // - If an existing issue covers it, upgrade that issue in place + // (severity → Error, internal glossary-banned flag).
+ // Upgrading instead of injecting prevents duplicate output AND + // guarantees the banned-term report survives TM downgrade, + // which honors the documented `banned > TM` precedence. + // - Otherwise inject a synthetic Error issue. + for banned in &glossary.banned { + if banned.is_empty() { + continue; + } + let pattern_len = banned.len(); + let mut start = 0; + while let Some(rel) = text[start..].find(banned.as_str()) { + let abs = start + rel; + // Skip matches that fall inside an exclusion zone (code + // fences, URLs, file paths, frontmatter, suppression + // markers). Without this guard, banned terms would fire + // inside blocks the rest of the pipeline carefully respects. + if is_excluded(abs, abs + pattern_len, excluded) { + start = abs + pattern_len; + continue; + } + let covering_idx = issues + .iter() + .position(|i| i.offset <= abs && abs + pattern_len <= i.offset + i.length); + match covering_idx { + Some(idx) => { + let i = &mut issues[idx]; + i.severity = Severity::Error; + mark_glossary_banned(i); + } + None => { + let mut synthetic = Issue::new( + abs, + pattern_len, + banned.clone(), + Vec::new(), + IssueType::CrossStrait, + Severity::Error, + ); + mark_glossary_banned(&mut synthetic); + issues.push(synthetic); + } + } + start = abs + pattern_len; + } + } + + issues.sort_by_key(|i| i.offset); + issues +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::Arc; + + fn issue(offset: usize, found: &str) -> Issue { + Issue::new( + offset, + found.len(), + found, + vec!["X".into()], + IssueType::CrossStrait, + Severity::Warning, + ) + } + + #[test] + fn empty_glossary_passes_through() { + let glossary = ProjectGlossary::default(); + let issues = vec![issue(0, "線程")]; + let out = apply_glossary("線程", &[], issues, &glossary); + assert_eq!(out.len(), 1); + } + + #[test] + fn proper_noun_suppresses_matching_issue() { + let glossary = ProjectGlossary { + proper_nouns: vec!["TSMC".into()], + ..ProjectGlossary::default() + }; + let issues = 
vec![issue(0, "TSMC"), issue(10, "線程")]; + let out = apply_glossary("TSMC ... 線程", &[], issues, &glossary); + assert_eq!(out.len(), 1); + assert_eq!(out[0].found, "線程"); + } + + #[test] + fn banned_term_injects_synthetic_when_not_already_flagged() { + let glossary = ProjectGlossary { + banned: vec!["線程".into()], + ..ProjectGlossary::default() + }; + let text = "我們的線程實作。"; + let out = apply_glossary(text, &[], Vec::new(), &glossary); + assert_eq!(out.len(), 1); + assert_eq!(out[0].found, "線程"); + assert_eq!(out[0].severity, Severity::Error); + assert_eq!(out[0].rule_type, IssueType::CrossStrait); + } + + #[test] + fn banned_term_upgrades_existing_same_span_issue() { + // Covering issue keeps its human-facing context but gets + // Severity::Error plus the internal glossary-banned marker so + // TM cannot downgrade the only report. + let glossary = ProjectGlossary { + banned: vec!["線程".into()], + ..ProjectGlossary::default() + }; + let text = "線程"; + let mut existing = issue(0, "線程"); + existing.context = Some(Arc::from("@domain IT。原始說明")); + let out = apply_glossary(text, &[], vec![existing], &glossary); + assert_eq!(out.len(), 1, "existing issue must not be duplicated"); + assert_eq!(out[0].severity, Severity::Error); + assert_eq!(out[0].context.as_deref(), Some("@domain IT。原始說明")); + assert!(is_glossary_banned(&out[0])); + } + + #[test] + fn banned_term_upgrades_larger_covering_issue() { + // Banned 用戶 inside an existing 用戶介面 issue: the compound + // issue is the user-visible alert, but it must carry + // glossary-banned provenance so TM does not downgrade it. 
+ let glossary = ProjectGlossary { + banned: vec!["用戶".into()], + ..ProjectGlossary::default() + }; + let text = "用戶介面"; + let existing = issue(0, "用戶介面"); + let out = apply_glossary(text, &[], vec![existing], &glossary); + assert_eq!( + out.len(), + 1, + "covering issue must absorb the banned hit, not duplicate" + ); + assert_eq!(out[0].found, "用戶介面"); + assert_eq!(out[0].severity, Severity::Error); + assert!(is_glossary_banned(&out[0])); + } + + #[test] + fn banned_term_finds_multiple_occurrences() { + let glossary = ProjectGlossary { + banned: vec!["線程".into()], + ..ProjectGlossary::default() + }; + let text = "線程一、線程二"; + let out = apply_glossary(text, &[], Vec::new(), &glossary); + assert_eq!(out.len(), 2); + assert!(out.iter().all(|i| i.found == "線程")); + } + + #[test] + fn banned_and_proper_noun_compose() { + let glossary = ProjectGlossary { + banned: vec!["內存".into()], + proper_nouns: vec!["MediaTek".into()], + ..ProjectGlossary::default() + }; + let text = "MediaTek 在內存設計上的優勢"; + let issues = vec![issue(0, "MediaTek"), issue(11, "優化")]; + let out = apply_glossary(text, &[], issues, &glossary); + // MediaTek issue suppressed. + assert!(!out.iter().any(|i| i.found == "MediaTek")); + // 內存 banned synthetic added. 
+ assert!(out + .iter() + .any(|i| i.found == "內存" && i.severity == Severity::Error)); + } +} diff --git a/src/rules/loader.rs b/src/rules/loader.rs index 5ddce0f..9f98098 100644 --- a/src/rules/loader.rs +++ b/src/rules/loader.rs @@ -45,6 +45,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }]; let case_rules = vec![CaseRule { term: "JavaScript".into(), @@ -73,6 +74,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }]; let rules_b = vec![SpellingRule { from: "內存".into(), @@ -87,6 +89,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }]; let case_rules: Vec = vec![]; diff --git a/src/rules/mod.rs b/src/rules/mod.rs index 08b0d2d..de0694b 100644 --- a/src/rules/mod.rs +++ b/src/rules/mod.rs @@ -1,3 +1,4 @@ +pub mod glossary; #[cfg(feature = "native")] pub mod judgment_cache; pub mod loader; diff --git a/src/rules/ruleset.rs b/src/rules/ruleset.rs index 77694cb..4cef2ea 100644 --- a/src/rules/ruleset.rs +++ b/src/rules/ruleset.rs @@ -82,6 +82,11 @@ pub struct ProfileConfig { /// When true, skip line/col computation (byte offsets only). /// Used by MCP tool which consumes offsets directly. pub offset_only: bool, + /// When true (Markdown content only), exclude pulldown-cmark + /// `Tag::BlockQuote` ranges from scanning. Off by default — adopted + /// blockquote prose is real content. Opt-in via `--exempt-blockquotes` + /// or `[markdown] exempt_blockquotes = true` (35.7). + pub exempt_blockquotes: bool, } impl ProfileConfig { @@ -101,6 +106,14 @@ impl ProfileConfig { self.range_en_dash = true; self } + + /// Mark blockquote prose as excluded from scanning. Useful when a + /// document contains long mainland-Chinese citations the author + /// cannot rewrite. 
+ pub fn with_exempt_blockquotes(mut self, on: bool) -> Self { + self.exempt_blockquotes = on; + self + } } impl Profile { @@ -148,6 +161,7 @@ impl Profile { heading_severity_boost: true, political_stance: PoliticalStance::RocCentric, offset_only: false, + exempt_blockquotes: false, }, Profile::Strict => ProfileConfig { spelling: true, @@ -171,6 +185,7 @@ impl Profile { heading_severity_boost: true, political_stance: PoliticalStance::RocCentric, offset_only: false, + exempt_blockquotes: false, }, } } @@ -444,6 +459,34 @@ pub struct SpellingRule { /// Optional tags for categorization and filtering in rule packs. #[serde(default, skip_serializing_if = "Option::is_none")] pub tags: Option>, + /// Per-rule editorial confidence (35.2). Distinguishes binary + /// corrections from style-preference suggestions. When set to + /// `Low`, the explain pipeline marks issues from this rule as + /// `auto_fix_safe = false` AND `needs_review = true` — these are + /// terms whose Mainland/Taiwan distinction is genuine but where + /// the calque form is also valid zh-TW vocabulary in some senses + /// (e.g. `優化`, `算法`, `場景`). Defaults to `None` (heuristic + /// derivation in `derive_explain_meta`). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub editorial_confidence: Option, +} + +/// Editorial confidence tier surfaced in explain output (35.2). +/// +/// Per-issue field that distinguishes binary corrections (`線程` → +/// `執行緒`, high) from editorial-judgment terms (`優化` is valid +/// zh-TW; `最佳化` is preferred in formal writing — low). Distinct +/// from `summary_metrics.confidence_distribution`, which tracks +/// resolution-tier confidence across the document. +/// +/// Invariant enforced downstream: `Low` ⇒ `auto_fix_safe = false` +/// AND `needs_review = true`. 
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "lowercase")] +pub enum EditorialConfidence { + High, + Medium, + Low, } impl SpellingRule { @@ -468,6 +511,7 @@ impl SpellingRule { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, } } } @@ -532,6 +576,11 @@ pub struct Issue { /// `None`: calibration not attempted or API failure (no signal). #[serde(default, skip_serializing_if = "Option::is_none")] pub anchor_match: Option, + /// Internal flag for project-glossary banned-term precedence. + /// When true, TM must not downgrade the issue, but the marker + /// stays out of user-facing `context` metadata. + #[serde(skip)] + pub glossary_banned: bool, /// Tier 2 disambiguation outcome. Set by `disambiguate_batch` to /// indicate whether the issue was resolved locally, suppressed, or /// left in the gray zone for Tier 3. Internal — not serialized. @@ -554,6 +603,11 @@ pub struct Issue { /// integrations and SARIF region output. `None` when not in a table. #[serde(default, skip_serializing_if = "Option::is_none")] pub table_cell: Option, + /// Per-issue editorial confidence (35.2). Copied from the source + /// `SpellingRule` during inflation; surfaces in MCP explain output + /// via `derive_explain_meta`. `None` means heuristic derivation. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub editorial_confidence: Option<EditorialConfidence>, } /// Markdown table cell coordinates: `(row, column)` are 0-based, with row 0 @@ -588,10 +642,12 @@ impl Issue { english: None, context_clues: None, anchor_match: None, + glossary_banned: false, tier2_outcome: Tier2Outcome::NotEligible, llm_judged: false, spelling_rule_idx: None, table_cell: None, + editorial_confidence: None, } } @@ -623,10 +679,12 @@ impl Issue { english: None, context_clues: None, anchor_match: None, + glossary_banned: false, tier2_outcome: Tier2Outcome::NotEligible, llm_judged: false, spelling_rule_idx: Some(rule_idx), table_cell: None, + editorial_confidence: None, } } diff --git a/src/rules/store.rs b/src/rules/store.rs index 3508d4b..042250d 100644 --- a/src/rules/store.rs +++ b/src/rules/store.rs @@ -195,6 +195,7 @@ impl OverrideStore { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }; self.upsert_spelling_override(&rule) } @@ -1065,6 +1066,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }, SpellingRule { from: "內存".into(), @@ -1079,6 +1081,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }, ] } @@ -1126,6 +1129,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }; store.upsert_spelling_override(&override_rule).unwrap(); @@ -1148,6 +1152,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }; store.upsert_spelling_override(&new_rule).unwrap(); @@ -1231,6 +1236,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }; store.upsert_spelling_override(&rule).unwrap(); } @@ -1265,6 +1271,7 @@ mod tests { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None,
}], case: vec![], }; diff --git a/tests/consistency_cli.rs b/tests/consistency_cli.rs new file mode 100644 index 0000000..05194b7 --- /dev/null +++ b/tests/consistency_cli.rs @@ -0,0 +1,152 @@ +// 35.1 — `--consistency` flag emits consistency report when both +// regional variants of the same concept appear in one document. + +use std::process::{Command, Stdio}; + +fn binary_path() -> std::path::PathBuf { + let mut path = std::env::current_exe().unwrap(); + path.pop(); + if path.ends_with("deps") { + path.pop(); + } + path.push("zhtw-mcp"); + path +} + +#[test] +fn consistency_block_appears_when_both_forms_present() { + let dir = tempfile::tempdir().unwrap(); + let md = dir.path().join("test.md"); + // Mixed usage: 線程 (mainland) + 執行緒 (TW) both present. + std::fs::write(&md, "我們的線程實作太慢,需要重新設計執行緒。\n").unwrap(); + + let bin = binary_path(); + let output = Command::new(&bin) + .args([ + "lint", + md.to_str().unwrap(), + "--format", + "json", + "--consistency", + ]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .unwrap(); + let stdout = String::from_utf8_lossy(&output.stdout); + let parsed: serde_json::Value = serde_json::from_str(&stdout).expect("valid JSON"); + + let consistency = parsed["consistency"] + .as_object() + .expect("consistency block present"); + let groups = consistency["groups"].as_array().expect("groups array"); + assert!( + groups.iter().any(|g| g["term_group"] == "thread"), + "expected a 'thread' consistency group; got {groups:?}" + ); +} + +#[test] +fn consistency_block_absent_when_only_one_form_present() { + let dir = tempfile::tempdir().unwrap(); + let md = dir.path().join("test.md"); + // Only mainland form, no TW counterpart. 
+ std::fs::write(&md, "我們的線程實作太慢。\n").unwrap(); + + let bin = binary_path(); + let output = Command::new(&bin) + .args([ + "lint", + md.to_str().unwrap(), + "--format", + "json", + "--consistency", + ]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .unwrap(); + let stdout = String::from_utf8_lossy(&output.stdout); + let parsed: serde_json::Value = serde_json::from_str(&stdout).expect("valid JSON"); + assert!( + parsed.get("consistency").is_none() || parsed["consistency"].is_null(), + "no mixed usage → no consistency block; got: {}", + parsed + ); +} + +#[test] +fn consistency_block_omitted_without_flag() { + let dir = tempfile::tempdir().unwrap(); + let md = dir.path().join("test.md"); + std::fs::write(&md, "線程 ... 執行緒\n").unwrap(); + + let bin = binary_path(); + let output = Command::new(&bin) + .args(["lint", md.to_str().unwrap(), "--format", "json"]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .unwrap(); + let stdout = String::from_utf8_lossy(&output.stdout); + let parsed: serde_json::Value = serde_json::from_str(&stdout).expect("valid JSON"); + assert!( + parsed.get("consistency").is_none(), + "consistency must be omitted without --consistency flag" + ); +} + +/// `--consistency` must still report mixed usage when the +/// orthographic fixer actually rewrites the document. Half-width +/// `,` and `.` adjacent to CJK are punctuation issues that +/// orthographic mode rewrites; the 線程/執行緒 lexical pair is left +/// untouched (orthographic skips CrossStrait per src/fixer.rs:197), +/// so the consistency block must still surface the `thread` group on +/// the post-fix issue list. +#[test] +fn consistency_block_present_during_fix_runs() { + let dir = tempfile::tempdir().unwrap(); + let md = dir.path().join("test.md"); + // Half-width `,` and `.` next to CJK trigger Punctuation issues + // (FixMode::Orthographic eligible). 線程/執行緒 stay as residual + // CrossStrait issues for the consistency report to grab. 
+ std::fs::write(&md, "我們的線程太慢, 需要重構執行緒.\n").unwrap(); + + let bin = binary_path(); + let output = Command::new(&bin) + .args([ + "lint", + md.to_str().unwrap(), + "--format", + "json", + "--consistency", + "--fix=orthographic", + ]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .unwrap(); + let stdout = String::from_utf8_lossy(&output.stdout); + let parsed: serde_json::Value = serde_json::from_str(&stdout).expect("valid JSON"); + + // Confirm the orthographic fixer actually rewrote the document — + // otherwise the test would only exercise the pre-fix path. + assert!( + parsed["fixes_applied"].as_u64().unwrap_or(0) > 0, + "orthographic fix must apply at least one rewrite; got fixes_applied={}", + parsed["fixes_applied"], + ); + assert_ne!( + parsed["text"].as_str(), + Some("我們的線程太慢, 需要重構執行緒.\n"), + "post-fix `text` field must reflect the rewritten document", + ); + + let groups = parsed["consistency"]["groups"] + .as_array() + .expect("consistency groups"); + assert!( + groups.iter().any(|g| g["term_group"] == "thread"), + "expected a 'thread' consistency group during fix run; got {groups:?}" + ); +} diff --git a/tests/exclusion-remap.rs b/tests/exclusion-remap.rs index 27641e4..c612e8d 100644 --- a/tests/exclusion-remap.rs +++ b/tests/exclusion-remap.rs @@ -30,6 +30,7 @@ fn cross_strait(from: &str, to: &str) -> SpellingRule { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, } } diff --git a/tests/fixtures/realworld_calques.md b/tests/fixtures/realworld_calques.md new file mode 100644 index 0000000..e2f203c --- /dev/null +++ b/tests/fixtures/realworld_calques.md @@ -0,0 +1,48 @@ +# Real-world calque blindspot fixture + +Anchor for 35.10. Each numbered section pins one mainland-Chinese term +that the linter must flag in everyday zh-TW prose, plus boundary +collocations that must NOT fire. Source: ai-muninn.com calque +blindspot sweep (2026-05). + +## TERMS_MUST_FIRE + +1. 數據結構是程式設計的基礎。 +2. 
大數據分析改變了商業決策。 +3. 數據庫的索引設計影響查詢效率。 +4. 數據分析師需要熟悉統計工具。 +5. 用戶體驗的設計原則。 +6. 使用者輸入需要驗證,避免惡意用戶傳入髒資料。 +7. 連接遠端伺服器需要憑證。 +8. 連接超時的處理策略。 +9. 算法的時間複雜度評估。 +10. 並發請求的鎖競爭問題。 +11. 消息中介軟體的選型。 +12. 內存洩漏的偵測工具。 +13. 線程之間的同步機制。 +14. 程序員的工作效率。 +15. 軟件工程的最佳實務。 +16. 硬件加速的應用場景。 +17. 網絡延遲的測量方法。 +18. 視頻轉碼的效能瓶頸。 + +## METADATA_PARENT_RULE + +19. 元數據描述資料的結構與來源。 + +## BOUNDARY_MUST_NOT_FIRE_ALGORITHM + +20. 演算法是電腦科學的核心議題。 +21. 排序演算法的效能比較。 + +## BOUNDARY_MUST_NOT_FIRE_GOODNEWS + +22. 好消息是專案提前完成。 +23. 壞消息是預算被砍。 +24. 消息來源還沒確認。 + +## BOUNDARY_MUST_NOT_FIRE_TWFORMS + +25. 使用者操作介面設計。 +26. 連線逾時時間設為三十秒。 +27. 訊息佇列的吞吐量測試。 diff --git a/tests/glossary_cli.rs b/tests/glossary_cli.rs new file mode 100644 index 0000000..1996c62 --- /dev/null +++ b/tests/glossary_cli.rs @@ -0,0 +1,162 @@ +// 35.9 — project glossary integration test. +// +// Verifies that `[glossary] banned`, `proper_nouns`, and `preferred` +// fields in `.zhtw-mcp.toml` are honored by the `lint` subcommand. + +use std::process::{Command, Stdio}; + +fn binary_path() -> std::path::PathBuf { + let mut path = std::env::current_exe().unwrap(); + path.pop(); + if path.ends_with("deps") { + path.pop(); + } + path.push("zhtw-mcp"); + path +} + +#[test] +fn glossary_banned_term_fires_even_without_context_clues() { + let dir = tempfile::tempdir().unwrap(); + // 線程 (mainland) — embedded ruleset already flags this, but we want + // to confirm the glossary-driven path also works. We use a banned + // term that the embedded ruleset would NOT flag in isolation: the + // word "ABC" with banned=["ABC"] forces flagging. 
+ std::fs::write( + dir.path().join(".zhtw-mcp.toml"), + "[glossary]\nbanned = [\"ABC\"]\n", + ) + .unwrap(); + let md = dir.path().join("test.md"); + std::fs::write(&md, "ABC 不該出現在文件中。\n").unwrap(); + + let bin = binary_path(); + let output = Command::new(&bin) + .args(["lint", md.to_str().unwrap(), "--format", "json"]) + .current_dir(dir.path()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .unwrap(); + let stdout = String::from_utf8_lossy(&output.stdout); + let parsed: serde_json::Value = serde_json::from_str(&stdout).expect("valid JSON"); + let issues = parsed["issues"].as_array().expect("issues array"); + assert!( + issues.iter().any(|i| i["found"] == "ABC"), + "banned ABC must produce a synthetic issue; got {issues:?}" + ); +} + +#[test] +fn glossary_proper_noun_suppresses_matching_issue() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write( + dir.path().join(".zhtw-mcp.toml"), + "[glossary]\nproper_nouns = [\"線程\"]\n", + ) + .unwrap(); + let md = dir.path().join("test.md"); + std::fs::write(&md, "我們的 線程 實作。\n").unwrap(); + + let bin = binary_path(); + let output = Command::new(&bin) + .args(["lint", md.to_str().unwrap(), "--format", "json"]) + .current_dir(dir.path()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .unwrap(); + let stdout = String::from_utf8_lossy(&output.stdout); + let parsed: serde_json::Value = serde_json::from_str(&stdout).expect("valid JSON"); + let issues = parsed["issues"].as_array().expect("issues array"); + assert!( + !issues.iter().any(|i| i["found"] == "線程"), + "proper_nouns must suppress 線程; got {issues:?}" + ); +} + +#[test] +fn glossary_banned_does_not_duplicate_existing_issues() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write( + dir.path().join(".zhtw-mcp.toml"), + "[glossary]\nbanned = [\"線程\"]\n", + ) + .unwrap(); + let md = dir.path().join("test.md"); + std::fs::write(&md, "我們的線程實作。\n").unwrap(); + + let bin = binary_path(); + let output = Command::new(&bin) + 
.args(["lint", md.to_str().unwrap(), "--format", "json"]) + .current_dir(dir.path()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .unwrap(); + let stdout = String::from_utf8_lossy(&output.stdout); + let parsed: serde_json::Value = serde_json::from_str(&stdout).expect("valid JSON"); + let issues = parsed["issues"].as_array().expect("issues array"); + let line_hits: Vec<_> = issues.iter().filter(|i| i["found"] == "線程").collect(); + assert_eq!( + line_hits.len(), + 1, + "banned 線程 must not duplicate the embedded rule's hit; got {line_hits:?}" + ); +} + +#[test] +fn glossary_banned_is_honored_during_fix_runs() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write( + dir.path().join(".zhtw-mcp.toml"), + "[glossary]\nbanned = [\"ABC\"]\n", + ) + .unwrap(); + let md = dir.path().join("test.md"); + std::fs::write(&md, "ABC 不該出現在文件中。\n").unwrap(); + + let bin = binary_path(); + let output = Command::new(&bin) + .args(["lint", md.to_str().unwrap(), "--format", "json", "--fix"]) + .current_dir(dir.path()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .unwrap(); + let stdout = String::from_utf8_lossy(&output.stdout); + let parsed: serde_json::Value = serde_json::from_str(&stdout).expect("valid JSON"); + let issues = parsed["issues"].as_array().expect("issues array"); + assert!( + issues.iter().any(|i| i["found"] == "ABC"), + "glossary banned terms must remain active during fix runs; got {issues:?}" + ); +} + +#[test] +fn glossary_proper_nouns_are_honored_during_fix_runs() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write( + dir.path().join(".zhtw-mcp.toml"), + "[glossary]\nproper_nouns = [\"線程\"]\n", + ) + .unwrap(); + let md = dir.path().join("test.md"); + std::fs::write(&md, "我們的線程實作。\n").unwrap(); + + let bin = binary_path(); + let output = Command::new(&bin) + .args(["lint", md.to_str().unwrap(), "--format", "json", "--fix"]) + .current_dir(dir.path()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() 
+ .unwrap(); + let stdout = String::from_utf8_lossy(&output.stdout); + let parsed: serde_json::Value = serde_json::from_str(&stdout).expect("valid JSON"); + let issues = parsed["issues"].as_array().expect("issues array"); + assert!( + !issues.iter().any(|i| i["found"] == "線程"), + "glossary proper_nouns must suppress fix-path issues; got {issues:?}" + ); +} diff --git a/tests/ir-parity.rs b/tests/ir-parity.rs index e7d6306..8abbc8f 100644 --- a/tests/ir-parity.rs +++ b/tests/ir-parity.rs @@ -41,6 +41,7 @@ fn spelling_with_clues( negative_context_clues: negative_clues.map(|v| v.into_iter().map(String::from).collect()), positional_clues: None, tags: None, + editorial_confidence: None, } } @@ -57,6 +58,7 @@ fn spelling_with_exceptions(from: &str, to: &[&str], exceptions: Vec<&str>) -> S negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, } } @@ -73,6 +75,7 @@ fn spelling_variant(from: &str, to: &[&str]) -> SpellingRule { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, } } @@ -89,6 +92,7 @@ fn spelling_ai_filler(from: &str, to: &[&str]) -> SpellingRule { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, } } @@ -105,6 +109,7 @@ fn spelling_political(from: &str, to: &[&str]) -> SpellingRule { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, } } @@ -122,6 +127,7 @@ fn spelling_deletion(from: &str) -> SpellingRule { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, } } @@ -138,6 +144,7 @@ fn spelling_with_positional(from: &str, to: &[&str], positional: Vec<&str>) -> S negative_context_clues: None, positional_clues: Some(positional.into_iter().map(String::from).collect()), tags: None, + editorial_confidence: None, } } @@ -160,6 +167,7 @@ fn ir_cross_strait_fires() { negative_context_clues: None, positional_clues: None, tags: None, + 
editorial_confidence: None, }], vec![], ); @@ -349,6 +357,7 @@ fn ir_superstring_absorption() { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }], vec![], ); @@ -637,6 +646,7 @@ fn ir_markdown_code_exclusion() { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, }], vec![], ); diff --git a/tests/markdown_blockquote_yaml.rs b/tests/markdown_blockquote_yaml.rs new file mode 100644 index 0000000..7ee899e --- /dev/null +++ b/tests/markdown_blockquote_yaml.rs @@ -0,0 +1,152 @@ +// 35.7 — Markdown blockquote exemption (opt-in) and YAML scalar +// quote preservation (always-on). +// +// Both behaviors are anchored to the ai-muninn.com calque blindspot +// sweep (2026-05): citation contexts produce ~50 false positives, and +// auto-converting `"` to `「`/`」` inside YAML frontmatter scalar values +// breaks downstream parsers. + +use zhtw_mcp::engine::scan::{ContentType, Scanner}; +use zhtw_mcp::rules::loader::load_embedded_ruleset; +use zhtw_mcp::rules::ruleset::{IssueType, Profile}; + +fn scan_with(text: &str, exempt_blockquotes: bool) -> Vec { + let ruleset = load_embedded_ruleset().expect("embedded ruleset loads"); + let scanner = Scanner::new(ruleset.spelling_rules, ruleset.case_rules); + let cfg = Profile::Base + .config() + .with_exempt_blockquotes(exempt_blockquotes); + scanner + .scan_for_content_type_with_config(text, ContentType::Markdown, cfg) + .issues +} + +/// Blockquote exemption disabled (default): mainland-Chinese calques +/// inside `>`-prefixed citations are still flagged. 
+#[test] +fn blockquote_default_scans_citations() { + let md = "正文裡的中文是 zh-TW。\n\n> 用戶輸入需要驗證。\n"; + let issues = scan_with(md, false); + assert!( + issues + .iter() + .any(|i| matches!(i.rule_type, IssueType::CrossStrait) + && i.english.as_deref() == Some("user")), + "default mode must scan blockquote citations; got {:?}", + issues.iter().map(|i| i.found.as_str()).collect::>() + ); +} + +/// Blockquote exemption enabled: citations are silenced. +#[test] +fn blockquote_exempt_silences_citations() { + let md = "正文裡的中文是 zh-TW。\n\n> 用戶輸入需要驗證。\n"; + let issues = scan_with(md, true); + assert!( + !issues + .iter() + .any(|i| matches!(i.rule_type, IssueType::CrossStrait) + && i.english.as_deref() == Some("user")), + "exempt mode must skip blockquote prose; got {:?}", + issues.iter().map(|i| i.found.as_str()).collect::>() + ); +} + +/// Blockquote exemption only affects blockquotes — body prose still scans. +#[test] +fn blockquote_exempt_keeps_body_scanning() { + let md = "用戶介面在正文裡也應該被覆蓋:用戶帳號要用使用者帳號。\n\n> 用戶輸入需要驗證。\n"; + let issues = scan_with(md, true); + assert!( + issues + .iter() + .any(|i| matches!(i.rule_type, IssueType::CrossStrait)), + "body cross_strait hits must remain; got {:?}", + issues.iter().map(|i| i.found.as_str()).collect::>() + ); +} + +/// Nested blockquotes (`> >`) and blockquotes inside list items must +/// also be exempt under the option. Cmark events provide the correct +/// span tracking; a regex on `>` line prefixes would mishandle these. +#[test] +fn blockquote_exempt_handles_nested_and_listitem_quotes() { + let md = "\ +- 正文 list item. + > 用戶帳號的處理流程。 + > > 巢狀引用的數據庫設計。 +"; + let issues = scan_with(md, true); + assert!( + !issues + .iter() + .any(|i| matches!(i.rule_type, IssueType::CrossStrait)), + "nested + list-item blockquotes must be exempt; got {:?}", + issues.iter().map(|i| i.found.as_str()).collect::>() + ); +} + +/// YAML frontmatter ASCII `"` and `'` quote bytes are preserved (never +/// auto-converted to `「`/`」`). Otherwise `"...". 
to `「...」` would +/// break downstream YAML parsers. This is always-on; no option needed. +#[test] +fn yaml_frontmatter_preserves_ascii_quotes() { + let md = "\ +--- +title: \"用戶手冊\" +description: '使用者體驗指南' +--- + +正文。 +"; + let issues = scan_with(md, false); + + // No punctuation issue should fire on the ASCII `\"` bytes inside + // the frontmatter scalar values. + let punct_quote_hits: Vec<_> = issues + .iter() + .filter(|i| { + matches!(i.rule_type, IssueType::Punctuation) && (i.found == "\"" || i.found == "'") + }) + .collect(); + + assert!( + punct_quote_hits.is_empty(), + "YAML scalar quote bytes must not produce punctuation issues; got {:?}", + punct_quote_hits + .iter() + .map(|i| (i.offset, i.found.as_str())) + .collect::>() + ); +} + +/// Body ASCII `"` adjacent to CJK still converts to `「`/`」` (the +/// frontmatter exemption must not bleed into the body). +#[test] +fn body_ascii_quotes_still_convert_to_brackets() { + let md = "\ +--- +title: \"用戶手冊\" +--- + +他說\"你好\"再見。 +"; + let issues = scan_with(md, false); + + let body_quote_hits: Vec<_> = issues + .iter() + .filter(|i| matches!(i.rule_type, IssueType::Punctuation) && i.found == "\"") + .collect(); + + // The body has two `\"` bytes (open + close) adjacent to CJK; both + // should fire the punctuation conversion suggestion. + assert_eq!( + body_quote_hits.len(), + 2, + "body ASCII quotes adjacent to CJK must convert; got {:?}", + body_quote_hits + .iter() + .map(|i| (i.offset, i.found.as_str())) + .collect::>() + ); +} diff --git a/tests/realworld_calques.rs b/tests/realworld_calques.rs new file mode 100644 index 0000000..fb8c432 --- /dev/null +++ b/tests/realworld_calques.rs @@ -0,0 +1,268 @@ +// 35.10 high-frequency calque coverage audit (real-world corpus). +// +// Anchored to ai-muninn.com calque blindspot sweep (2026-05). Pins +// the linter behavior on the 14 mainland-Chinese terms reported as +// missed in published zh-TW articles, plus the boundary collocations +// that must NOT fire. 
+ +use std::path::{Path, PathBuf}; + +use zhtw_mcp::engine::scan::{ContentType, Scanner}; +use zhtw_mcp::rules::loader::load_embedded_ruleset; +use zhtw_mcp::rules::ruleset::{Issue, IssueType, Profile}; + +fn fixture_path() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join("realworld_calques.md") +} + +fn scan_text(text: &str) -> Vec { + let ruleset = load_embedded_ruleset().expect("embedded ruleset loads"); + let scanner = Scanner::new(ruleset.spelling_rules, ruleset.case_rules); + scanner + .scan_for_content_type(text, ContentType::Markdown, Profile::Base) + .issues +} + +fn issues_on_line(issues: &[Issue], line: usize) -> Vec<&Issue> { + issues.iter().filter(|i| i.line == line).collect() +} + +/// True when the issue is a CrossStrait hit whose english field equals or +/// contains the concept anchor. The compound rule may use a richer +/// english (e.g. `數據庫` → "database", `用戶介面` → "user interface"), +/// so we accept substring containment for concept-level coverage. +fn english_matches(english: Option<&str>, concept: &str) -> bool { + match english { + Some(e) => e == concept || e.contains(concept), + None => false, + } +} + +fn assert_term_fires(issues: &[Issue], line: usize, expected_english: &str) { + let line_issues = issues_on_line(issues, line); + let hit = line_issues.iter().any(|i| { + matches!(i.rule_type, IssueType::CrossStrait) + && english_matches(i.english.as_deref(), expected_english) + }); + assert!( + hit, + "line {line}: expected a CrossStrait issue with english containing {expected_english:?}, got: {:?}", + line_issues + .iter() + .map(|i| (i.found.as_str(), i.english.as_deref(), i.rule_type)) + .collect::>() + ); +} + +fn assert_term_silent(issues: &[Issue], line: usize, forbidden_from: &str) { + // Match by containment, not equality: a regression where the + // scanner emits a longer phrase that contains the forbidden bare + // term (e.g. 
`好消息` slipping through as a phrase-level hit + // covering the inner `消息` calque) would still violate the + // collocation invariant the test guards. Equality would let + // those slip past. + let line_issues = issues_on_line(issues, line); + let hit = line_issues.iter().any(|i| i.found.contains(forbidden_from)); + assert!( + !hit, + "line {line}: expected NO issue containing {forbidden_from:?}, got: {:?}", + line_issues + .iter() + .map(|i| (i.found.as_str(), i.rule_type)) + .collect::>() + ); +} + +/// Phase 1 — coverage audit. Each of the 14 audited terms (plus a few +/// closely related compounds) must produce a non-zero hit when used in +/// realistic prose under the `base` profile. +#[test] +fn phase1_high_frequency_terms_fire() { + let body = std::fs::read_to_string(fixture_path()).expect("fixture exists"); + let issues = scan_text(&body); + + // Each (line, expected term) anchors a section in the fixture. + // Each anchor is (line, expected english field on a CrossStrait issue + // covering some span of that line). Concept-level coverage — the + // longer compound rule is allowed to win over the bare term as long + // as english anchors match. + let anchors: &[(usize, &str)] = &[ + (10, "data"), + (11, "data"), + (12, "data"), + (13, "data"), + (14, "user"), + (15, "user"), + (16, "connect"), + (17, "connect"), + (18, "algorithm"), + (19, "concurrency"), + (20, "message"), + (21, "memory"), + (22, "thread"), + (23, "programmer"), + (24, "software"), + (25, "hardware"), + (26, "network"), + (27, "video"), + ]; + + for (line, english) in anchors { + assert_term_fires(&issues, *line, english); + } +} + +/// Phase 2 — `元數據` parent rule must keep firing exactly once and +/// surface "metadata" verbatim as the suggestion (no `元資料` / +/// `詮釋資料` / `後設資料` translation target). 
+#[test] +fn phase2_metadata_parent_rule_keeps_firing_with_english_anchor() { + let body = std::fs::read_to_string(fixture_path()).expect("fixture exists"); + let issues = scan_text(&body); + + let line_31 = issues_on_line(&issues, 31); + let metadata_hits: Vec<_> = line_31.iter().filter(|i| i.found == "元數據").collect(); + assert_eq!( + metadata_hits.len(), + 1, + "line 31: expected exactly one 元數據 hit (parent), got {}: {:?}", + metadata_hits.len(), + line_31.iter().map(|i| i.found.as_str()).collect::>() + ); + + let parent = metadata_hits[0]; + let english = parent.english.as_deref().unwrap_or(""); + assert_eq!( + english, "metadata", + "元數據 rule must surface english anchor 'metadata' verbatim" + ); + + // The 元數據 rule uses `to: []`, so `effective_suggestions` falls + // back to the english anchor. The user-visible suggestion must be + // exactly "metadata" — neither the rejected mainland form `元資料` + // nor the acceptable-but-not-preferred coinages `詮釋資料` / + // `後設資料`. Asserting the suggestion list literally catches a + // regression where someone adds `to: ["後設資料"]` thinking it's + // a friendlier translation; the engine would surface that instead + // of "metadata", silently violating the gate. + assert_eq!( + parent.suggestions.as_ref(), + ["metadata".to_string()].as_slice(), + "元數據 must surface exactly [\"metadata\"]; got {:?}", + parent.suggestions, + ); + + // Phase 2 invariant: the inner 數據 hit must NOT double-fire on the + // same span — overlap resolution + the parent rule should yield + // exactly one issue covering the full 元數據 span. 
+ let inner_data: Vec<_> = line_31 + .iter() + .filter(|i| { + i.found == "數據" + && (i.offset..i.offset + i.length) != (parent.offset..parent.offset + parent.length) + }) + .collect(); + assert!( + inner_data.is_empty(), + "inner 數據 must not double-fire inside 元數據; got: {inner_data:?}" + ); +} + +/// Three-tier metadata policy: writer prose containing `元資料` (the +/// rejected mainland-style Sinification) must trigger the symmetric +/// rule and surface "metadata" as the suggestion, mirroring the +/// `元數據` rule. Without the sibling rule, `元資料` would slip +/// through unflagged. The acceptable forms `詮釋資料` and `後設資料` +/// are validated separately by the boundary tests below. +#[test] +fn phase2_metadata_rejected_form_is_flagged_symmetrically() { + let issues = scan_text("文件的元資料描述了結構與來源。"); + let yuanziliao_hits: Vec<_> = issues.iter().filter(|i| i.found == "元資料").collect(); + assert_eq!( + yuanziliao_hits.len(), + 1, + "writer prose containing 元資料 must trigger its own rule; got {yuanziliao_hits:?}" + ); + let hit = yuanziliao_hits[0]; + assert_eq!(hit.english.as_deref(), Some("metadata")); + assert_eq!( + hit.suggestions.as_ref(), + ["metadata".to_string()].as_slice() + ); +} + +/// Three-tier metadata policy: the acceptable zh-TW alternatives +/// `詮釋資料` and `後設資料` (NAER terminology) must NOT be flagged +/// in writer prose — they are valid zh-TW forms even though +/// "metadata" is the preferred surface form. +#[test] +fn phase2_metadata_acceptable_forms_pass_through() { + let issues = scan_text("詮釋資料與後設資料皆為合法的中文翻譯。"); + assert!( + !issues + .iter() + .any(|i| i.found == "詮釋資料" || i.found == "後設資料"), + "acceptable zh-TW alternatives must not fire; got {:?}", + issues + .iter() + .map(|i| (i.found.as_str(), i.english.as_deref())) + .collect::>() + ); +} + +/// Phase 2 — `算法` must not fire inside `演算法` (canonical zh-TW form). 
+#[test] +fn phase2_algorithm_silent_inside_canonical_form() { + let body = std::fs::read_to_string(fixture_path()).expect("fixture exists"); + let issues = scan_text(&body); + + for line in [35, 36] { + assert_term_silent(&issues, line, "算法"); + } +} + +/// Phase 3 — `消息` rule must respect legitimate zh-TW collocations. +#[test] +fn phase3_message_silent_in_legitimate_collocations() { + let body = std::fs::read_to_string(fixture_path()).expect("fixture exists"); + let issues = scan_text(&body); + + for line in [40, 41, 42] { + assert_term_silent(&issues, line, "消息"); + } +} + +/// Phase 3 — bare `文件` rule is intentionally disabled +/// (assets/ruleset.json `"disabled": true`). The audit must NOT +/// accidentally re-enable it. This test catches a regression where +/// someone toggles the flag without realizing the bare-word +/// ambiguity rationale. +#[test] +fn phase3_bare_file_rule_remains_disabled() { + let ruleset = load_embedded_ruleset().expect("embedded ruleset loads"); + let bare_file_rule = ruleset + .spelling_rules + .iter() + .find(|r| r.from == "文件") + .expect("文件 rule exists"); + assert!( + bare_file_rule.disabled, + "文件 cross_strait rule must remain disabled — bare-word ambiguity" + ); +} + +/// Boundary — pure zh-TW forms must produce zero hits on these terms. 
+#[test] +fn boundary_pure_tw_forms_silent() { + let body = std::fs::read_to_string(fixture_path()).expect("fixture exists"); + let issues = scan_text(&body); + + let forms: &[(usize, &str)] = &[(46, "用戶"), (47, "連接"), (48, "消息")]; + + for (line, term) in forms { + assert_term_silent(&issues, *line, term); + } +} diff --git a/tests/scanner-integration.rs b/tests/scanner-integration.rs index 879de19..00de044 100644 --- a/tests/scanner-integration.rs +++ b/tests/scanner-integration.rs @@ -25,6 +25,7 @@ fn spelling(from: &str, to: &[&str]) -> SpellingRule { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, } } @@ -41,6 +42,7 @@ fn variant(from: &str, to: &[&str]) -> SpellingRule { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, } } @@ -904,6 +906,7 @@ fn ai_filler_rule(from: &str, to: &[&str]) -> SpellingRule { negative_context_clues: None, positional_clues: None, tags: None, + editorial_confidence: None, } }