-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathexport_wikt_csv.py
More file actions
87 lines (75 loc) · 3.14 KB
/
export_wikt_csv.py
File metadata and controls
87 lines (75 loc) · 3.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse, re, csv, hashlib
from pathlib import Path
from html import unescape
# Part-of-speech names searched for as HTML ``id`` attribute values.
# Multi-word names use an underscore where a space would be (e.g. "Proper_noun").
POS_LIST = [
    "Noun",
    "Verb",
    "Adjective",
    "Adverb",
    "Pronoun",
    "Proper_noun",
    "Interjection",
    "Conjunction",
    "Preposition",
    "Determiner",
    "Article",
    "Numeral",
    "Prefix",
    "Suffix",
]
def safe_filename(word: str) -> str:
    """Build the cache file name used for *word*.

    Characters matched by the substitution pattern (path-reserved punctuation
    and ASCII control characters) are replaced with underscores, and leading
    and trailing dots are stripped. The first eight hex digits of the SHA-1
    digest of the original word are then appended, followed by ``.html``.
    """
    digest = hashlib.sha1(word.encode()).hexdigest()
    sanitized = re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', word)
    stem = sanitized.strip('.')
    return f"{stem}_{digest[:8]}.html"
def subdir_for_word(base_dir: Path, word: str) -> Path:
    """Return the subdirectory of *base_dir* that holds the file for *word*.

    The subdirectory name is the lower-cased first two characters of the word
    (just one for a single-character word), or ``"_"`` when the word is empty.
    """
    # Slicing never raises, so one slice covers the 0-, 1- and 2+-character cases.
    prefix = word[:2].lower()
    if not prefix:
        prefix = "_"
    return base_dir / prefix
def read_html(cache_root: Path, word: str) -> str | None:
    """Load the cached HTML page for *word*, or ``None`` when it is unavailable.

    The file is looked up at
    ``subdir_for_word(cache_root, word) / safe_filename(word)`` and decoded as
    UTF-8 with undecodable bytes replaced, so decoding itself cannot fail.

    Args:
        cache_root: Root directory of the HTML cache.
        word: The word whose page should be loaded.

    Returns:
        The page text, or ``None`` if the file is missing or cannot be read.
    """
    path = subdir_for_word(cache_root, word) / safe_filename(word)
    try:
        # Read directly instead of checking exists() first: one filesystem
        # access, and no window in which the file can vanish between the two.
        return path.read_text(encoding="utf-8", errors="replace")
    except (OSError, ValueError):
        # OSError covers a missing file, a directory in its place and
        # permission problems; ValueError covers paths the OS rejects outright
        # (e.g. an embedded NUL). Anything else is a genuine bug and propagates.
        return None
def strip_tags(s: str) -> str:
    """Reduce an HTML fragment to plain, single-spaced text.

    ``<script>`` and ``<style>`` elements are removed together with their
    contents, every remaining tag is replaced by a space, character references
    are decoded, and runs of whitespace collapse to a single space with both
    ends trimmed.
    """
    without_code = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', s, flags=re.S|re.I)
    # Tags become spaces (not nothing) so adjacent elements do not fuse together.
    text = unescape(re.sub(r'<[^>]+>', ' ', without_code))
    return re.sub(r'\s+', ' ', text).strip()
def find_pos(html: str) -> list[str]:
    """List the parts of speech whose ``id`` anchor occurs in *html*.

    A part of speech counts as present when ``id="<name>"`` or ``id='<name>'``
    appears anywhere in the text. Results follow ``POS_LIST`` order, with
    underscores in the names shown as spaces.
    """
    return [
        pos.replace('_', ' ')
        for pos in POS_LIST
        if any(anchor in html for anchor in (f'id="{pos}"', f"id='{pos}'"))
    ]
def first_definition_after(html: str, anchor: str) -> str | None:
    """Return the text of the first list item that follows *anchor* in *html*.

    Only the 30,000 characters starting at the anchor are searched. An
    ``<li>`` that directly opens an ``<ol>`` is preferred; if none is found,
    any ``<li>`` in that window is used. The item's markup is reduced to plain
    text with ``strip_tags``.

    Returns:
        The item text, or ``None`` when the anchor or a list item is missing.
    """
    start = html.find(anchor)
    if start < 0:
        return None

    segment = html[start:start + 30000]  # scan next 30k chars
    # Ordered by preference: first item of an ordered list, then any list item.
    patterns = (
        r'<ol[^>]*>\s*<li[^>]*>(.*?)</li>',
        r'<li[^>]*>(.*?)</li>',
    )
    for pattern in patterns:
        match = re.search(pattern, segment, flags=re.S|re.I)
        if match:
            return strip_tags(match.group(1))
    return None
def extract_first_def(html: str, pos_list: list[str]) -> str | None:
    """Return the first non-empty definition found for any entry of *pos_list*.

    Parts of speech are tried in the order given. For each one, spaces are
    turned into underscores to form the anchor name, and the double-quoted
    ``id`` form is tried before the single-quoted one. Returns ``None`` when
    no anchor yields a non-empty definition.
    """
    anchors = (
        anchor
        for pos in pos_list
        for anchor in (f'id="{pos.replace(" ","_")}"', f"id='{pos.replace(' ','_')}'")
    )
    # Generators keep the search lazy: lookups stop at the first hit.
    definitions = (first_definition_after(html, anchor) for anchor in anchors)
    return next((text for text in definitions if text), None)
def main():
    """Export parts of speech and first definitions for a word list to CSV.

    Command-line options:
        --in               word list file, one word per line (required)
        --wikt-cache-root  root directory of the cached HTML pages
        --out              path of the CSV file to write

    One row is written per non-blank input line, with the columns ``word``,
    ``is_english``, ``pos`` (names joined with ``;``) and ``first_definition``.
    ``is_english`` is true when at least one part-of-speech anchor was found in
    the cached page; words without a readable cached page get ``False`` and
    empty fields.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--in", dest="infile", required=True, help="word list, one per line")
    ap.add_argument("--wikt-cache-root", default="../cache/wiktionary_cache")
    ap.add_argument("--out", default="words_defs.csv")
    args = ap.parse_args()

    wikt_root = Path(args.wikt_cache_root)

    # "utf-8-sig" reads plain UTF-8 too, but also drops a leading byte-order
    # mark. With plain "utf-8" the BOM would stay glued to the first word
    # (str.strip() does not remove it) and that word's cache lookup would miss.
    lines = Path(args.infile).read_text(encoding="utf-8-sig").splitlines()
    words = [word for line in lines if (word := line.strip())]

    with open(args.out, "w", newline="", encoding="utf-8") as f:
        wr = csv.writer(f)
        wr.writerow(["word", "is_english", "pos", "first_definition"])
        for w in words:
            html = read_html(wikt_root, w)
            if not html:
                # No cached page (or an empty one): emit a placeholder row.
                wr.writerow([w, False, "", ""])
                continue
            pos = find_pos(html)
            is_en = bool(pos)
            first_def = extract_first_def(html, pos) if is_en else ""
            wr.writerow([w, is_en, ";".join(pos), first_def or ""])
# Run the exporter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()