Skip to content

Commit cbddee7

Browse files
authored
Add consistency report and project glossary (#82)
A real-world deployment study [1] reported mainland-Chinese terms slipping past the linter in published zh-TW articles, blockquote citation contexts producing ~50 false positives across a 72-article corpus, and ASCII quotes auto-converted to 「」 inside YAML frontmatter breaking downstream parsers.

User-facing additions:
- '--consistency' reports mixed regional usage of one concept (both 線程 and 執行緒 in the same document). Groups by the rule's "english" anchor; skips TM-suppressed terms.
- '--exempt-blockquotes' (CLI + '[markdown]' config) excludes pulldown-cmark 'Tag::BlockQuote' ranges from scanning. Off by default: adopted blockquote prose is real content.
- YAML frontmatter preserves ASCII '"' / ''' scalar delimiters. Body prose still converts to 「」.
- '[glossary]' section in '.zhtw-mcp.toml': banned / preferred / proper_nouns lists. Banned terms inject synthetic Errors that TM cannot downgrade; proper_nouns suppress matching issues; both honor exclusion zones.
- Per-rule 'editorial_confidence' (low / medium / high) flows through issue inflation into MCP explain output. Low forces auto_fix_safe = false and needs_review = true. 優化, 算法, 場景 tagged low because both regional forms are valid zh-TW.

Calque-audit refinements:
- 消息 gains positional_clues; 好消息 / 壞消息 / 消息來源 no longer fire.
- Symmetric 元資料 rule mirrors 元數據 — both use to: [] plus english: "metadata", surfacing the English original as the preferred form. 詮釋資料 and 後設資料 (NAER terminology bank) remain unflagged as acceptable zh-TW alternatives.
- Real-world regression fixture pins the 14 documented blind-spot terms.

[1] https://ai-muninn.com/zh-TW/blog/zhtw-mcp-calque-blindspot-sweep
2 parents d44835f + 269c8dd commit cbddee7

30 files changed

Lines changed: 2384 additions & 72 deletions

assets/ruleset.json

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1267,7 +1267,8 @@
12671267
"type": "cross_strait",
12681268
"context": "@domain IT。tw「優化」泛用於商業;IT optimize 改「最佳化」以區分 improve",
12691269
"english": "optimize",
1270-
"negative_context_clues": ["流程", "體驗", "服務"]
1270+
"negative_context_clues": ["流程", "體驗", "服務"],
1271+
"editorial_confidence": "low"
12711272
},
12721273
{
12731274
"from": "優步",
@@ -1302,7 +1303,7 @@
13021303
"from": "元數據",
13031304
"to": [],
13041305
"type": "cross_strait",
1305-
"context": "@domain 資料。源自希臘文 meta- (關於) + data (資料),原意為「描述資料的資料」",
1306+
"context": "@domain 資料。源自希臘文 meta- (關於) + data (資料),原意為「描述資料的資料」。preferred: metadata;可接受: 詮釋資料 / 後設資料;rejected: 元資料",
13061307
"english": "metadata"
13071308
},
13081309
{
@@ -1319,6 +1320,13 @@
13191320
"context": "@domain 程式設計",
13201321
"english": "metaprogramming/meta-programming"
13211322
},
1323+
{
1324+
"from": "元資料",
1325+
"to": [],
1326+
"type": "cross_strait",
1327+
"context": "@domain 資料。`元資料` 為機械式 Sinification (從 `元數據` 字面替換而來),無 NAER / MoE 立足點。preferred: metadata;可接受替代: 詮釋資料 / 後設資料",
1328+
"english": "metadata"
1329+
},
13221330
{
13231331
"from": "元音",
13241332
"to": ["母音"],
@@ -3489,7 +3497,8 @@
34893497
"type": "confusable",
34903498
"context": "限 IT 語境。電影/戲劇場景為正確 tw 用法",
34913499
"english": "scenario",
3492-
"context_clues": ["應用", "部署", "測試", "系統", "開發", "架構", "軟件", "軟體", "程式", "行程", "核心", "記憶體", "CPU"]
3500+
"context_clues": ["應用", "部署", "測試", "系統", "開發", "架構", "軟件", "軟體", "程式", "行程", "核心", "記憶體", "CPU"],
3501+
"editorial_confidence": "low"
34933502
},
34943503
{
34953504
"from": "塑料",
@@ -7416,8 +7425,9 @@
74167425
"from": "消息",
74177426
"to": ["訊息"],
74187427
"type": "cross_strait",
7419-
"context": "@domain IT",
7420-
"english": "message"
7428+
"context": "@domain IT。`好消息`/`壞消息`/`消息來源` 為合法 zh-TW 用法",
7429+
"english": "message",
7430+
"positional_clues": ["not_after:好", "not_after:壞", "not_before:來源"]
74217431
},
74227432
{
74237433
"from": "消息環",
@@ -9097,7 +9107,8 @@
90979107
"type": "cross_strait",
90989108
"context": "@domain 程式設計",
90999109
"english": "algorithm",
9100-
"exceptions": ["演算法"]
9110+
"exceptions": ["演算法"],
9111+
"editorial_confidence": "low"
91019112
},
91029113
{
91039114
"from": "箭頭操作符",

benches/scanner.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,7 @@ fn bench_cpu_attribution_100kb(c: &mut Criterion) {
480480
heading_severity_boost: false,
481481
political_stance: PoliticalStance::RocCentric,
482482
offset_only: false,
483+
exempt_blockquotes: false,
483484
};
484485

485486
// Spelling-only config.

build.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,16 @@ struct SpellingRule {
3333
positional_clues: Option<Vec<String>>,
3434
#[serde(default)]
3535
tags: Option<Vec<String>>,
36+
#[serde(default)]
37+
editorial_confidence: Option<EditorialConfidence>,
38+
}
39+
40+
#[derive(serde::Serialize, serde::Deserialize)]
41+
#[serde(rename_all = "lowercase")]
42+
enum EditorialConfidence {
43+
High,
44+
Medium,
45+
Low,
3646
}
3747

3848
#[derive(serde::Serialize, serde::Deserialize)]

scripts/check-ruleset.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def dedup_sort(
7878
"negative_context_clues",
7979
"positional_clues",
8080
"tags",
81+
"editorial_confidence",
8182
}
8283

8384
# Field order for spelling rules (stable, human-scannable output).
@@ -93,6 +94,7 @@ def dedup_sort(
9394
"positional_clues",
9495
"exceptions",
9596
"tags",
97+
"editorial_confidence",
9698
]
9799

98100
CASE_FIELD_ORDER = ["term", "alternatives", "disabled"]

src/cache.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,10 @@ pub struct ScanParams {
6363
pub translationese_domain: String,
6464
// AI threshold level (formatted f32) — different multipliers produce different results.
6565
pub ai_threshold: String,
66+
// Markdown blockquote-exemption flag — changes which spans get
67+
// scanned, so cache hits must be invalidated when toggled.
68+
#[serde(default)]
69+
pub exempt_blockquotes: bool,
6670
}
6771

6872
/// A single cached entry.
@@ -420,6 +424,7 @@ mod tests {
420424
detect_translationese: false,
421425
translationese_domain: "general".into(),
422426
ai_threshold: "1.0".into(),
427+
exempt_blockquotes: false,
423428
}
424429
}
425430

@@ -433,6 +438,7 @@ mod tests {
433438
detect_translationese: false,
434439
translationese_domain: "general".into(),
435440
ai_threshold: "1.0".into(),
441+
exempt_blockquotes: false,
436442
}
437443
}
438444

src/config.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,38 @@ pub struct ProjectConfig {
2626
pub suppressions: Option<String>,
2727
pub packs: Option<Vec<String>>,
2828
pub translation_memory: Option<String>,
29+
pub markdown: Option<MarkdownConfig>,
30+
pub glossary: Option<GlossaryConfig>,
31+
}
32+
33+
/// Markdown-specific scanning options (35.7).
34+
#[derive(Debug, Default, Deserialize)]
35+
#[serde(default)]
36+
pub struct MarkdownConfig {
37+
/// When true, treat pulldown-cmark `Tag::BlockQuote` ranges as
38+
/// exclusion zones. Useful for documents that quote mainland-Chinese
39+
/// sources for illustrative purposes. Off by default.
40+
pub exempt_blockquotes: Option<bool>,
41+
}
42+
43+
/// Project glossary section (35.9). Layered above the embedded ruleset
44+
/// and pack store but below banned-term enforcement and translation
45+
/// memory. Precedence: glossary `banned` > TM > glossary `preferred` >
46+
/// domain pack > embedded ruleset.
47+
#[derive(Debug, Default, Deserialize)]
48+
#[serde(default)]
49+
pub struct GlossaryConfig {
50+
/// Terms that must always be flagged regardless of context clues.
51+
/// E.g. ["線程", "內存"] forces those calques to fire even in
52+
/// otherwise ambiguous prose.
53+
pub banned: Option<Vec<String>>,
54+
/// Project-preferred zh-TW forms. Used by the consistency report
55+
/// (35.1) to choose the canonical suggestion when both TW-preferred
56+
/// and CN-preferred variants appear in the same document.
57+
pub preferred: Option<Vec<String>>,
58+
/// Names that should never be flagged (added to the suppression
59+
/// list). E.g. ["TSMC", "MediaTek"].
60+
pub proper_nouns: Option<Vec<String>>,
2961
}
3062

3163
impl ProjectConfig {

0 commit comments

Comments
 (0)