-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess-text-files.py
More file actions
100 lines (74 loc) · 2.82 KB
/
preprocess-text-files.py
File metadata and controls
100 lines (74 loc) · 2.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re
import string
# Load the raw CSV dump for each StackExchange topic we preprocess.
_topics = ("cooking", "crypto", "robotics", "biology", "travel", "diy")
dataframes = {topic: pd.read_csv("../input/" + topic + ".csv") for topic in _topics}
# Matches http(s)://, www., and bare-domain URLs, tolerating balanced parentheses.
url_re = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'
def stripTagsAndUrls(x):
    """
    Strip HTML markup and URLs from a single text cell.

    Args:
        x (str): Raw HTML content of one post; may be NaN/None for
            missing rows when applied via Series.map.

    Returns:
        str: Plain text with every <code> element (including its content),
        all other HTML tags, and all URLs removed; "" when the input is
        missing, empty, or not a string.
    """
    # Guard against pandas missing values: NaN is a truthy float, so the
    # original bare `if x:` passed it into BeautifulSoup and raised.
    if not isinstance(x, str) or not x:
        return ""
    soup = BeautifulSoup(x, "html.parser")
    # Remove EVERY <code> element with its content (code snippets are
    # noise here); `soup.code.decompose()` only removed the first one.
    for code in soup.find_all("code"):
        code.decompose()
    text = soup.get_text()
    # Finally, strip out anything the URL pattern matches.
    return re.sub(url_re, "", text)
def removePunctuation(x):
    """
    Lowercase a string and blank out non-ASCII and punctuation characters.

    Args:
        x (str): Text to clean.

    Returns:
        str: The lowercased input with every non-ASCII character and every
        punctuation character replaced by a single space (not deleted, so
        token boundaries are preserved).
    """
    lowered = x.lower()
    # Any character outside the ASCII range becomes a space.
    ascii_only = re.sub(r'[^\x00-\x7f]', r' ', lowered)
    # Map each punctuation character to a space in one translation pass.
    punct_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    return ascii_only.translate(punct_to_space)
# Build the English stopword set once at module load; set membership makes
# the per-token test O(1).
stops = set(stopwords.words("english"))
def removeStopwords(x):
    """
    Drop English stopwords from a whitespace-tokenized string.

    Args:
        x (str): Cleaned, lowercased text.

    Returns:
        str: The input tokens rejoined with single spaces, minus every
        token found in NLTK's English stopword list.
    """
    kept = (token for token in x.split() if token not in stops)
    return " ".join(kept)
# Run the cleaning pipeline over every topic dataframe, one stage at a time.
# Pass 1: HTML/URL stripping applies only to content (titles carry no HTML).
for df in dataframes.values():
    df["content"] = df["content"].map(stripTagsAndUrls)
# Pass 2: punctuation/non-ASCII removal on both text columns.
for df in dataframes.values():
    for column in ("title", "content"):
        df[column] = df[column].map(removePunctuation)
# Pass 3: stopword removal on both text columns.
for df in dataframes.values():
    for column in ("title", "content"):
        df[column] = df[column].map(removeStopwords)
# Persist each preprocessed dataframe next to the script as <topic>_preprocessed.csv.
for name, df in dataframes.items():
    df.to_csv(name + "_preprocessed.csv", index=False)