Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 17 additions & 6 deletions assets/ruleset.json
Original file line number Diff line number Diff line change
Expand Up @@ -1267,7 +1267,8 @@
"type": "cross_strait",
"context": "@domain IT。tw「優化」泛用於商業;IT optimize 改「最佳化」以區分 improve",
"english": "optimize",
"negative_context_clues": ["流程", "體驗", "服務"]
"negative_context_clues": ["流程", "體驗", "服務"],
"editorial_confidence": "low"
},
{
"from": "優步",
Expand Down Expand Up @@ -1302,7 +1303,7 @@
"from": "元數據",
"to": [],
"type": "cross_strait",
"context": "@domain 資料。源自希臘文 meta- (關於) + data (資料),原意為「描述資料的資料」",
"context": "@domain 資料。源自希臘文 meta- (關於) + data (資料),原意為「描述資料的資料」。preferred: metadata;可接受: 詮釋資料 / 後設資料;rejected: 元資料",
"english": "metadata"
},
{
Expand All @@ -1319,6 +1320,13 @@
"context": "@domain 程式設計",
"english": "metaprogramming/meta-programming"
},
{
"from": "元資料",
"to": [],
"type": "cross_strait",
"context": "@domain 資料。`元資料` 為機械式 Sinification (從 `元數據` 字面替換而來),無 NAER / MoE 立足點。preferred: metadata;可接受替代: 詮釋資料 / 後設資料",
"english": "metadata"
},
{
"from": "元音",
"to": ["母音"],
Expand Down Expand Up @@ -3489,7 +3497,8 @@
"type": "confusable",
"context": "限 IT 語境。電影/戲劇場景為正確 tw 用法",
"english": "scenario",
"context_clues": ["應用", "部署", "測試", "系統", "開發", "架構", "軟件", "軟體", "程式", "行程", "核心", "記憶體", "CPU"]
"context_clues": ["應用", "部署", "測試", "系統", "開發", "架構", "軟件", "軟體", "程式", "行程", "核心", "記憶體", "CPU"],
"editorial_confidence": "low"
},
{
"from": "塑料",
Expand Down Expand Up @@ -7416,8 +7425,9 @@
"from": "消息",
"to": ["訊息"],
"type": "cross_strait",
"context": "@domain IT",
"english": "message"
"context": "@domain IT。`好消息`/`壞消息`/`消息來源` 為合法 zh-TW 用法",
"english": "message",
"positional_clues": ["not_after:好", "not_after:壞", "not_before:來源"]
},
{
"from": "消息環",
Expand Down Expand Up @@ -9097,7 +9107,8 @@
"type": "cross_strait",
"context": "@domain 程式設計",
"english": "algorithm",
"exceptions": ["演算法"]
"exceptions": ["演算法"],
"editorial_confidence": "low"
},
{
"from": "箭頭操作符",
Expand Down
1 change: 1 addition & 0 deletions benches/scanner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,7 @@ fn bench_cpu_attribution_100kb(c: &mut Criterion) {
heading_severity_boost: false,
political_stance: PoliticalStance::RocCentric,
offset_only: false,
exempt_blockquotes: false,
};

// Spelling-only config.
Expand Down
10 changes: 10 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,16 @@ struct SpellingRule {
positional_clues: Option<Vec<String>>,
#[serde(default)]
tags: Option<Vec<String>>,
#[serde(default)]
editorial_confidence: Option<EditorialConfidence>,
}

/// Editorial confidence level that a spelling rule may carry in its optional
/// `editorial_confidence` field.
///
/// Because of `rename_all = "lowercase"`, the variants are (de)serialized as
/// the strings `"high"`, `"medium"` and `"low"`.
#[derive(serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "lowercase")]
enum EditorialConfidence {
// Serialized as "high".
High,
// Serialized as "medium".
Medium,
// Serialized as "low".
Low,
}

#[derive(serde::Serialize, serde::Deserialize)]
Expand Down
2 changes: 2 additions & 0 deletions scripts/check-ruleset.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def dedup_sort(
"negative_context_clues",
"positional_clues",
"tags",
"editorial_confidence",
}

# Field order for spelling rules (stable, human-scannable output).
Expand All @@ -93,6 +94,7 @@ def dedup_sort(
"positional_clues",
"exceptions",
"tags",
"editorial_confidence",
]

CASE_FIELD_ORDER = ["term", "alternatives", "disabled"]
Expand Down
6 changes: 6 additions & 0 deletions src/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ pub struct ScanParams {
pub translationese_domain: String,
// AI threshold level (formatted f32) — different multipliers produce different results.
pub ai_threshold: String,
// Markdown blockquote-exemption flag — changes which spans get
// scanned, so cache hits must be invalidated when toggled.
#[serde(default)]
pub exempt_blockquotes: bool,
}

/// A single cached entry.
Expand Down Expand Up @@ -420,6 +424,7 @@ mod tests {
detect_translationese: false,
translationese_domain: "general".into(),
ai_threshold: "1.0".into(),
exempt_blockquotes: false,
}
}

Expand All @@ -433,6 +438,7 @@ mod tests {
detect_translationese: false,
translationese_domain: "general".into(),
ai_threshold: "1.0".into(),
exempt_blockquotes: false,
}
}

Expand Down
32 changes: 32 additions & 0 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,38 @@ pub struct ProjectConfig {
pub suppressions: Option<String>,
pub packs: Option<Vec<String>>,
pub translation_memory: Option<String>,
pub markdown: Option<MarkdownConfig>,
pub glossary: Option<GlossaryConfig>,
}

/// Markdown-specific scanning options (35.7).
///
/// The container-level `#[serde(default)]` means any key missing from the
/// deserialized input falls back to the field's `Default` value, so an
/// absent `exempt_blockquotes` key deserializes to `None`.
#[derive(Debug, Default, Deserialize)]
#[serde(default)]
pub struct MarkdownConfig {
/// When true, treat pulldown-cmark `Tag::BlockQuote` ranges as
/// exclusion zones. Useful for documents that quote mainland-Chinese
/// sources for illustrative purposes. Off by default.
///
/// NOTE(review): "off by default" presumably means consumers treat
/// `None` the same as `Some(false)` — confirm at the call site.
pub exempt_blockquotes: Option<bool>,
}

/// Project glossary section (35.9).
///
/// The glossary's `preferred` entries are layered above the domain packs
/// and the embedded ruleset, but below translation memory (TM); its
/// `banned` entries take precedence over everything.
///
/// Precedence, highest first: glossary `banned` > TM > glossary
/// `preferred` > domain pack > embedded ruleset.
///
/// Every field is optional; with the container-level `#[serde(default)]`
/// a key missing from the deserialized input is left as `None`.
#[derive(Debug, Default, Deserialize)]
#[serde(default)]
pub struct GlossaryConfig {
/// Terms that must always be flagged regardless of context clues.
/// E.g. ["線程", "內存"] forces those calques to fire even in
/// otherwise ambiguous prose.
pub banned: Option<Vec<String>>,
/// Project-preferred zh-TW forms. Used by the consistency report
/// (35.1) to choose the canonical suggestion when both TW-preferred
/// and CN-preferred variants appear in the same document.
pub preferred: Option<Vec<String>>,
/// Names that should never be flagged (added to the suppression
/// list). E.g. ["TSMC", "MediaTek"].
pub proper_nouns: Option<Vec<String>>,
}

impl ProjectConfig {
Expand Down
Loading
Loading