-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path00_langdetect.py
More file actions
46 lines (39 loc) · 1.22 KB
/
00_langdetect.py
File metadata and controls
46 lines (39 loc) · 1.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# Import submodules
import pandas as pd
import numpy as np
from langdetect import detect, detect_langs
from sklearn import metrics
from tqdm import tqdm
import pycountry
# Helpers
# Conversion table between dataset and langdetect
# Read the tab-separated sentence corpus and drop every row that has a
# missing value in any column.
# NOTE(review): the code further down addresses the language column as `cmn`,
# which suggests the file has no header row and its first record is being
# consumed as the column names -- confirm against the actual file.
print(">> Loading the dataset ...")
lang_set = pd.read_csv('sentences.csv', sep='\t').dropna()
print(">> done.")
# Build a balanced evaluation set: the first SAMPLES_PER_LANG sentences of
# every language that has at least that many rows.
#   X -> sentence texts (taken from the last column of the frame)
#   Y -> the language label of each sentence (value of the `cmn` column)
print(">> Rearranging everything ...")
SAMPLES_PER_LANG = 500
X, Y = [], []
# A single grouped pass replaces re-filtering the whole frame once per
# language; sort=False keeps the languages in order of first appearance,
# which is the same order `unique()` would produce.
for ln, langs in lang_set.groupby('cmn', sort=False):
    # Skip languages that cannot supply a full sample.
    if len(langs) < SAMPLES_PER_LANG:
        continue
    sentences = langs.iloc[:SAMPLES_PER_LANG, -1].tolist()
    X.extend(sentences)
    Y.extend([ln] * len(sentences))
print(">> done.")
# Run langdetect over every sentence in X, map its prediction onto the
# dataset's three-letter label space and print a per-language report.
#
# Get the labels -> required for classification_report (sorted, unique).
labels = np.unique(Y)
# Plain-Python set for O(1) membership tests inside the loop.
known_labels = set(labels.tolist())
# Prediction used whenever detection fails or yields a label outside the
# evaluated set.
FALLBACK_LANG = 'eng'
P = []
exception_count = 0
for x in tqdm(X):
    try:
        # Get the top language
        detected = detect(x)
        # Keep only the part before any '-' in the detected code, then
        # convert the two-letter code to its three-letter form. A code that
        # pycountry cannot resolve ends up in the except branch as well.
        lang_code = pycountry.languages.get(alpha_2=detected.split('-')[0]).alpha_3.lower()
    except Exception:
        # Best effort: count the failure and fall back, but (unlike a bare
        # `except:`) let KeyboardInterrupt / SystemExit stop the run.
        exception_count += 1
        P.append(FALLBACK_LANG)
    else:
        P.append(lang_code if lang_code in known_labels else FALLBACK_LANG)
# Report against the real number of samples rather than a hard-coded total.
print(f"Ran into {exception_count} exception of {len(X)} in total")
# Passing `labels` explicitly keeps `target_names` aligned with the classes
# even when the fallback label is not one of the evaluated languages.
print(metrics.classification_report(Y, P, labels=labels, target_names=labels))