-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcleaner.py
More file actions
38 lines (30 loc) · 1.13 KB
/
cleaner.py
File metadata and controls
38 lines (30 loc) · 1.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import re
class Cleaner:
    """Normalize Dari/Persian text and the text fields of article dicts.

    Normalization maps Arabic letter variants to their Persian/Dari
    equivalents (see ``char_map``), removes zero-width joiner characters,
    and collapses every run of whitespace into a single space.
    """

    # Compiled once at class level so every call reuses the same patterns.
    _ZERO_WIDTH_RE = re.compile(r"[\u200c\u200d]")  # ZWNJ and ZWJ
    _WHITESPACE_RE = re.compile(r"\s+")

    # Article keys whose values clean_article() normalizes.
    _TEXT_FIELDS = ("title", "author", "summary", "content")

    def __init__(self):
        # Mapping Arabic characters to Persian/Dari equivalents
        self.char_map = {
            "ي": "ی",  # Arabic Yeh → Farsi/Dari Yeh
            "ك": "ک",  # Arabic Kaf → Farsi/Dari Kaf
        }

    def normalize_text(self, text: str) -> str:
        """Return a normalized copy of *text*.

        Steps, in order:
          1. Replace each key of ``self.char_map`` with its mapped value.
          2. Remove zero-width characters (U+200C, U+200D).
          3. Collapse whitespace runs (including newlines) to one space
             and strip leading/trailing whitespace.

        Any falsy input (``""``, ``None``) yields ``""``.
        """
        if not text:
            return ""

        # Replace Arabic characters with Persian/Dari equivalents
        for arabic, dari in self.char_map.items():
            text = text.replace(arabic, dari)

        # Zero-width characters are removed BEFORE whitespace is collapsed:
        # they are not matched by ``\s`` nor removed by ``strip()``, so
        # removing them afterwards could leave doubled or leading/trailing
        # spaces behind (e.g. "a \u200d b" or "\u200c hello").
        # NOTE(review): U+200C (ZWNJ) is commonly used inside Persian/Dari
        # words; removing it merges the separated parts. Kept as-is for
        # backward compatibility -- confirm this is the intended behavior.
        text = self._ZERO_WIDTH_RE.sub("", text)

        # Remove multiple spaces and newlines
        return self._WHITESPACE_RE.sub(" ", text).strip()

    def clean_article(self, article: dict) -> dict:
        """Normalize the text fields of *article* in place and return it.

        Only the keys ``title``, ``author``, ``summary`` and ``content`` are
        touched, and only when present with a non-empty string value.
        Missing keys, empty values and non-string values (which cannot be
        normalized as text) are left unchanged.

        The same dict object that was passed in is mutated and returned.
        """
        for field in self._TEXT_FIELDS:
            value = article.get(field)
            if isinstance(value, str) and value:
                article[field] = self.normalize_text(value)
        return article