sysprog21
diff --git a/assets/ruleset.json b/assets/ruleset.json
Lines changed: 17 additions & 6 deletions
diff --git a/benches/scanner.rs b/benches/scanner.rs
Lines changed: 1 addition & 0 deletions
diff --git a/build.rs b/build.rs
Lines changed: 10 additions & 0 deletions
diff --git a/scripts/check-ruleset.py b/scripts/check-ruleset.py
Lines changed: 2 additions & 0 deletions
diff --git a/src/cache.rs b/src/cache.rs
Lines changed: 6 additions & 0 deletions
diff --git a/src/config.rs b/src/config.rs
Lines changed: 32 additions & 0 deletions
@@ -1267,7 +1267,8 @@
       "type": "cross_strait",
       "context": "@domain IT。tw「優化」泛用於商業；IT optimize 改「最佳化」以區分 improve",
       "english": "optimize",
-      "negative_context_clues": ["流程", "體驗", "服務"]
+      "negative_context_clues": ["流程", "體驗", "服務"],
+      "editorial_confidence": "low"
     },
     {
       "from": "優步",
@@ -1302,7 +1303,7 @@
       "from": "元數據",
       "to": [],
       "type": "cross_strait",
-      "context": "@domain 資料。源自希臘文 meta- (關於) + data (資料)，原意為「描述資料的資料」",
+      "context": "@domain 資料。源自希臘文 meta- (關於) + data (資料)，原意為「描述資料的資料」。preferred: metadata；可接受: 詮釋資料 / 後設資料；rejected: 元資料",
       "english": "metadata"
     },
     {
@@ -1319,6 +1320,13 @@
       "context": "@domain 程式設計",
       "english": "metaprogramming/meta-programming"
     },
+    {
+      "from": "元資料",
+      "to": [],
+      "type": "cross_strait",
+      "context": "@domain 資料。`元資料` 為機械式 Sinification (從 `元數據` 字面替換而來)，無 NAER / MoE 立足點。preferred: metadata；可接受替代: 詮釋資料 / 後設資料",
+      "english": "metadata"
+    },
     {
       "from": "元音",
       "to": ["母音"],
@@ -3489,7 +3497,8 @@
       "type": "confusable",
       "context": "限 IT 語境。電影/戲劇場景為正確 tw 用法",
       "english": "scenario",
-      "context_clues": ["應用", "部署", "測試", "系統", "開發", "架構", "軟件", "軟體", "程式", "行程", "核心", "記憶體", "CPU"]
+      "context_clues": ["應用", "部署", "測試", "系統", "開發", "架構", "軟件", "軟體", "程式", "行程", "核心", "記憶體", "CPU"],
+      "editorial_confidence": "low"
     },
     {
       "from": "塑料",
@@ -7416,8 +7425,9 @@
       "from": "消息",
       "to": ["訊息"],
       "type": "cross_strait",
-      "context": "@domain IT",
-      "english": "message"
+      "context": "@domain IT。`好消息`/`壞消息`/`消息來源` 為合法 zh-TW 用法",
+      "english": "message",
+      "positional_clues": ["not_after:好", "not_after:壞", "not_before:來源"]
     },
     {
       "from": "消息環",
@@ -9097,7 +9107,8 @@
       "type": "cross_strait",
       "context": "@domain 程式設計",
       "english": "algorithm",
-      "exceptions": ["演算法"]
+      "exceptions": ["演算法"],
+      "editorial_confidence": "low"
     },
     {
       "from": "箭頭操作符",
 
@@ -480,6 +480,7 @@ fn bench_cpu_attribution_100kb(c: &mut Criterion) {
         heading_severity_boost: false,
         political_stance: PoliticalStance::RocCentric,
         offset_only: false,
+        exempt_blockquotes: false,
     };
 
     // Spelling-only config.
 
@@ -33,6 +33,16 @@ struct SpellingRule {
     positional_clues: Option<Vec<String>>,
     #[serde(default)]
     tags: Option<Vec<String>>,
+    #[serde(default)]
+    editorial_confidence: Option<EditorialConfidence>,
+}
+
+#[derive(serde::Serialize, serde::Deserialize)]
+#[serde(rename_all = "lowercase")] // JSON spellings: "high", "medium", "low"
+enum EditorialConfidence { // value type of SpellingRule's optional `editorial_confidence`
+    High,
+    Medium,
+    Low,
 }
 
 #[derive(serde::Serialize, serde::Deserialize)]
 
@@ -78,6 +78,7 @@ def dedup_sort(
     "negative_context_clues",
     "positional_clues",
     "tags",
+    "editorial_confidence",
 }
 
 # Field order for spelling rules (stable, human-scannable output).
@@ -93,6 +94,7 @@ def dedup_sort(
     "positional_clues",
     "exceptions",
     "tags",
+    "editorial_confidence",
 ]
 
 CASE_FIELD_ORDER = ["term", "alternatives", "disabled"]
 
@@ -63,6 +63,10 @@ pub struct ScanParams {
     pub translationese_domain: String,
     // AI threshold level (formatted f32) — different multipliers produce different results.
     pub ai_threshold: String,
+    // Markdown blockquote-exemption flag — changes which spans get
+    // scanned, so cache hits must be invalidated when toggled.
+    #[serde(default)]
+    pub exempt_blockquotes: bool,
 }
 
 /// A single cached entry.
@@ -420,6 +424,7 @@ mod tests {
             detect_translationese: false,
             translationese_domain: "general".into(),
             ai_threshold: "1.0".into(),
+            exempt_blockquotes: false,
         }
     }
 
@@ -433,6 +438,7 @@ mod tests {
             detect_translationese: false,
             translationese_domain: "general".into(),
             ai_threshold: "1.0".into(),
+            exempt_blockquotes: false,
         }
     }
 
 
@@ -26,6 +26,38 @@ pub struct ProjectConfig {
     pub suppressions: Option<String>,
     pub packs: Option<Vec<String>>,
     pub translation_memory: Option<String>,
+    pub markdown: Option<MarkdownConfig>,
+    pub glossary: Option<GlossaryConfig>,
+}
+
+/// Markdown-specific scanning options (35.7), held in `ProjectConfig::markdown`.
+#[derive(Debug, Default, Deserialize)]
+#[serde(default)]
+pub struct MarkdownConfig {
+    /// When true, treat pulldown-cmark `Tag::BlockQuote` ranges as
+    /// exclusion zones.  Useful for documents that quote mainland-Chinese
+    /// sources for illustrative purposes.  Off by default (absent => `None`).
+    pub exempt_blockquotes: Option<bool>,
+}
+
+/// Project glossary section (35.9).  Its `preferred` terms rank above
+/// the embedded ruleset and pack store but below translation memory
+/// (TM); its `banned` terms outrank everything.  Precedence: glossary
+/// `banned` > TM > glossary `preferred` > domain pack > embedded ruleset.
+#[derive(Debug, Default, Deserialize)]
+#[serde(default)]
+pub struct GlossaryConfig {
+    /// Terms that must always be flagged regardless of context clues.
+    /// E.g. ["線程", "內存"] forces those calques to fire even in
+    /// otherwise ambiguous prose.
+    pub banned: Option<Vec<String>>,
+    /// Project-preferred zh-TW forms.  Used by the consistency report
+    /// (35.1) to choose the canonical suggestion when both TW-preferred
+    /// and CN-preferred variants appear in the same document.
+    pub preferred: Option<Vec<String>>,
+    /// Names that should never be flagged (added to the suppression
+    /// list).  E.g. ["TSMC", "MediaTek"].
+    pub proper_nouns: Option<Vec<String>>,
 }
 
 impl ProjectConfig {
Original file line number	Diff line number	Diff line change
`@@ -78,6 +78,7 @@ def dedup_sort(`
`78`	`78`	`"negative_context_clues",`
`79`	`79`	`"positional_clues",`
`80`	`80`	`"tags",`
	`81`	`+ "editorial_confidence",`
`81`	`82`	`}`
`82`	`83`
`83`	`84`	`# Field order for spelling rules (stable, human-scannable output).`
`@@ -93,6 +94,7 @@ def dedup_sort(`
`93`	`94`	`"positional_clues",`
`94`	`95`	`"exceptions",`
`95`	`96`	`"tags",`
	`97`	`+ "editorial_confidence",`
`96`	`98`	`]`
`97`	`99`
`98`	`100`	`CASE_FIELD_ORDER = ["term", "alternatives", "disabled"]`
Original file line number	Diff line number	Diff line change
`@@ -63,6 +63,10 @@ pub struct ScanParams {`
`63`	`63`	`pub translationese_domain: String,`
`64`	`64`	`// AI threshold level (formatted f32) — different multipliers produce different results.`
`65`	`65`	`pub ai_threshold: String,`
	`66`	`+ // Markdown blockquote-exemption flag — changes which spans get`
	`67`	`+ // scanned, so cache hits must be invalidated when toggled.`
	`68`	`+ #[serde(default)]`
	`69`	`+ pub exempt_blockquotes: bool,`
`66`	`70`	`}`
`67`	`71`
`68`	`72`	`/// A single cached entry.`
`@@ -420,6 +424,7 @@ mod tests {`
`420`	`424`	`detect_translationese: false,`
`421`	`425`	`translationese_domain: "general".into(),`
`422`	`426`	`ai_threshold: "1.0".into(),`
	`427`	`+ exempt_blockquotes: false,`
`423`	`428`	`}`
`424`	`429`	`}`
`425`	`430`
`@@ -433,6 +438,7 @@ mod tests {`
`433`	`438`	`detect_translationese: false,`
`434`	`439`	`translationese_domain: "general".into(),`
`435`	`440`	`ai_threshold: "1.0".into(),`
	`441`	`+ exempt_blockquotes: false,`
`436`	`442`	`}`
`437`	`443`	`}`
`438`	`444`