|
| 1 | +import re |
| 2 | +import os |
| 3 | + |
# Path of the Markdown post that this one-off script rewrites in place.
file_path = r"D:\RubberTale\RubberTale.github.io\source\_posts\2026年霍尔木兹海峡危机:特朗普政府的海上收费政策、全球航运封锁与地缘经济秩序的系统性重构.md"

with open(file_path, "r", encoding="utf-8") as f:
    content = f.read()

# 1. Fix YAML metadata: strip the stray brackets around the day, minute and
#    second fields of the front-matter date.
content = re.sub(r"date: 2026-04-\[18\] 00:\[07\]:\[34\]", "date: 2026-04-18 00:07:34", content)

# 2. Fix data tables: strip brackets around the fractional part of decimals,
#    e.g. "3.[5]" -> "3.5".
content = re.sub(r"(\d+)\.\[(\d+)\]", r"\1.\2", content)

# 3. Handle Works Cited section (protect it from citation normalization).
# Split at the FIRST occurrence of the heading only.  str.partition keeps
# everything after that point, so a second occurrence of the heading text can
# no longer cause the tail of the file to be dropped (an unbounded
# str.split followed by parts[1] would lose it).  When the heading is absent,
# partition returns (content, "", ""), so `references` is the empty string.
references_heading = "#### **引用的著作**"
body, heading_found, after_heading = content.partition(references_heading)
references = heading_found + after_heading
| 19 | + |
# 4. Normalize citations in text.
# A bare number is NOT treated as a citation when the text right after it
# starts with one of these unit strings (it is a quantity, date part, etc.).
units = [
    "%", "美元", "桶", "桶/日", "英里", "年", "月", "日", "时", "分", "秒",
    "艘", "位", "人", "美元/桶", "亿", "万",
]


def citation_replacer(match: "re.Match[str]") -> str:
    """Return the replacement text for one candidate citation match.

    The match is expected to expose three groups: (1) the text just before
    the number, (2) the number itself, (3) the text that follows it.

    The matched text is returned unchanged when the number is followed by a
    unit from ``units``, is followed by ":" (a clock time such as 14:00), or
    is already enclosed in square brackets.  Otherwise the number is wrapped
    as ``[n]``.  A group that did not participate in the match is treated as
    an empty string.
    """
    prefix = match.group(1) or ""
    num = match.group(2)
    following = match.group(3) or ""

    # str.startswith accepts a tuple, so all units and the time colon are
    # checked in a single call.
    if following.startswith((*units, ":")):
        return match.group(0)

    # Already bracketed: do not produce "[[n]]".
    if prefix.endswith("[") and following.startswith("]"):
        return match.group(0)

    return f"{prefix}[{num}]{following}"
| 37 | + |
| 38 | +# Pattern: (Chinese char|Punctuation|Space) (Number 1-99) (Following char) |
| 39 | +# Note: This is a simplified regex, might need refinement |
| 40 | +# Better approach: find all numbers 1-99 and check context |
def normalize_body(text: str) -> str:
    """Wrap bare one- or two-digit citation numbers in square brackets.

    Example: "断裂 1。" becomes "断裂 [1]。".

    A digit run is left untouched when it is:
      * part of a longer number (so 2026 is never split);
      * already bracketed ("[7]");
      * immediately followed by ":" or by one of the strings in the
        module-level ``units`` list (quantities such as "12%" or "95美元");
      * one component of a decimal, clock time, date or numeric range, i.e.
        joined to another digit by ".", ":" or "-" ("3.5", "14:00",
        "2026-04-18", "10-15").

    The last rule means an unbracketed citation range written as "1-3" is
    also left alone.  Numbers in other contexts (for example Markdown
    ordered-list markers such as "1. ") are still bracketed.

    Args:
        text: Article text to normalize.

    Returns:
        The text with the fixed-string repairs applied and citation numbers
        bracketed.
    """
    # One-off repairs of specific broken phrases in this article.
    text = text.replace("原油进口国 and 海峡能源流量", "原油进口国和海峡能源流量")
    text = text.replace("彻底违背向。", "彻底违背。")

    # Specific fix for an hour that was wrongly bracketed: [14]:00
    text = text.replace("[14]:00", "14:00")

    # Everything that, when it directly follows the digits, means "this is
    # not a citation".  Built as a list so that an empty ``units`` cannot
    # introduce an empty alternative, which would make the negative lookahead
    # reject every candidate.
    blocked_after = [
        r"[\d\]]",    # more digits, or a closing bracket
        r"[.\-]\d",   # decimal point / hyphen leading into another number
        ":",          # clock time such as 14:00
    ]
    blocked_after.extend(re.escape(unit) for unit in units if unit)

    pattern = (
        r"(?<![\d\[])"      # not the tail of a longer number, not after "["
        r"(?<!\d[.:\-])"    # not the part after "3.", "14:" or "2026-"
        r"(\d{1,2})"        # the candidate citation number (1-2 digits)
        r"(?!" + "|".join(blocked_after) + r")"
    )

    return re.sub(pattern, r"[\1]", text)
| 92 | + |
| 93 | +# Actually, the user's examples: |
| 94 | +# 将“断裂 1。”改为“断裂 [1]。” |
| 95 | +# My regex `(?<![\d\[])(\d{1,2})(?![\d\]]|unit|:)` will catch `1` in `断裂 1。` |
| 96 | + |
# Only the body goes through citation normalization; the references section
# split off earlier is appended unchanged, and the combined text overwrites
# the original file.
new_body = normalize_body(body)

with open(file_path, mode="w", encoding="utf-8") as f:
    f.write("".join((new_body, references)))
0 commit comments