-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtweet_cleaner.py
More file actions
111 lines (86 loc) · 2.66 KB
/
tweet_cleaner.py
File metadata and controls
111 lines (86 loc) · 2.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import re
import enchant
import HTMLParser
from constants import original_tweet
from constants import cachedStopWords
from itertools import groupby
def escapeHTMLChars(tweet):
    """Replace HTML character references in *tweet* with plain characters.

    Despite its name this function *un*-escapes: ``&amp;`` becomes ``&``,
    ``&lt;`` becomes ``<`` and so on.

    Args:
        tweet: Text that may contain HTML entities.

    Returns:
        The text with entities replaced by the characters they stand for.
    """
    try:
        # Preferred on Python 3; ``HTMLParser.unescape`` was removed in 3.9.
        from html import unescape
    except ImportError:
        # Python 2 fallback, equivalent to the original implementation.
        import HTMLParser
        unescape = HTMLParser.HTMLParser().unescape
    return unescape(tweet)
def decodeData(tweet):
    """Reduce *tweet* to ASCII, silently dropping every non-ASCII character.

    Accepts either UTF-8 encoded bytes or already-decoded text. The original
    called ``.decode("utf8")`` unconditionally, which fails for text input
    containing non-ASCII characters on Python 2 and does not exist on
    Python 3 ``str``.

    Args:
        tweet: UTF-8 bytes or text.

    Returns:
        The ASCII-only content as the interpreter's native ``str`` type.

    Raises:
        UnicodeDecodeError: If *tweet* is bytes that are not valid UTF-8.
    """
    if isinstance(tweet, bytes):
        tweet = tweet.decode("utf8")
    ascii_bytes = tweet.encode("ascii", "ignore")
    # On Python 2 ``str`` is ``bytes`` and the encoded value is returned as
    # before; on Python 3 convert back so later steps keep working on text.
    if str is bytes:
        return ascii_bytes
    return ascii_bytes.decode("ascii")
def appostropheLookup(tweet):
    """Substitute whitespace-separated words that appear in ``APPOSTROPHES``.

    Each word of *tweet* that is a key of ``constants.APPOSTROPHES`` is
    replaced by the mapped value; all other words are kept. Matching is exact
    (case-sensitive, punctuation included).
    NOTE(review): the table presumably maps contractions to their expanded
    forms -- confirm against ``constants``.

    Returns:
        The words joined back together with single spaces.
    """
    from constants import APPOSTROPHES

    expanded = []
    for token in tweet.split():
        if token in APPOSTROPHES:
            expanded.append(APPOSTROPHES[token])
        else:
            expanded.append(token)
    return " ".join(expanded)
def removeEmoji(tweet):
    """Delete emoji and symbol characters from *tweet*.

    Every run of characters in the code point ranges U+1F300-U+1F64F,
    U+1F680-U+1F6FF, U+2600-U+26FF and U+2700-U+27BF is removed; nothing is
    inserted in its place.

    Returns:
        The tweet without those characters.
    """
    emoji_pattern = (
        u'['
        u'\U0001F300-\U0001F64F'
        u'\U0001F680-\U0001F6FF'
        u'\u2600-\u26FF\u2700-\u27BF]+'
    )
    return re.sub(emoji_pattern, '', tweet, flags=re.UNICODE)
def splitAttachedWords(tweet):
    """Insert a space before each ASCII capital letter to split CamelCase runs.

    ``"ForTheWin"`` becomes ``"For The Win"``. Text before the first capital
    letter is kept as its own chunk (``"helloWorld"`` -> ``"hello World"``);
    the previous pattern matched only chunks starting with a capital and so
    silently discarded any leading lowercase text.

    Returns:
        The chunks joined with single spaces. Whitespace already present in
        *tweet* is kept inside its chunk.
    """
    # First alternative: a capital followed by non-capitals (original rule).
    # Second alternative: a leading run with no capital at all.
    chunks = re.findall(r'[A-Z][^A-Z]*|[^A-Z]+', tweet)
    return " ".join(chunks)
def standardizeWords(tweet):
    """Shorten every run of a repeated character to at most two occurrences.

    For example ``"loooove"`` becomes ``"loove"``. The rule applies to all
    characters, not only letters.

    Returns:
        The tweet with long character runs truncated.
    """
    pieces = []
    for _char, run in groupby(tweet):
        # Keep only the first two characters of each run.
        pieces.append(''.join(run)[:2])
    return ''.join(pieces)
def removeLinks(tweet):
    """Strip @mentions, URLs and non-alphanumeric characters from *tweet*.

    Each match is replaced by a space, after which all whitespace runs are
    collapsed to single spaces and the ends are trimmed.

    Changes from the earlier version: the pattern is a raw string (the old
    literal relied on invalid escape sequences such as ``\\w``), and mentions
    may contain underscores, so ``@john_doe`` is removed as a whole instead
    of leaving ``doe`` behind.

    Returns:
        The cleaned tweet containing only ASCII letters, digits and single
        spaces.
    """
    links_re = re.compile(
        r"(@[A-Za-z0-9_]+)"      # @mentions
        r"|([^0-9A-Za-z \t])"    # any character other than alnum, space, tab
        r"|(\w+://\S+)"          # scheme://... URLs
    )
    tweet = links_re.sub(" ", tweet)
    # split()/join normalises the whitespace left behind by the substitutions.
    return ' '.join(tweet.split())
def removeSlangs(tweet):
    """Blank out every word of *tweet* that is listed in ``constants.SLANGS``.

    A matching word is replaced by a single-space placeholder before the words
    are re-joined with spaces, so the result contains extra spaces where a
    word was removed. Matching is exact and case-sensitive.

    Returns:
        The tweet with the listed words blanked out.
    """
    from constants import SLANGS

    kept = []
    for token in tweet.split():
        if token in SLANGS:
            kept.append(' ')
        else:
            kept.append(token)
    return " ".join(kept)
def removeShortTweets(tweet):
    """Discard tweets with fewer than three whitespace-separated words.

    Returns:
        *tweet* unchanged when it has at least three words, otherwise ``''``.
    """
    word_count = len(tweet.split())
    return tweet if word_count >= 3 else ''
def removeStopWords(tweet):
    """Drop every word of *tweet* that is contained in ``cachedStopWords``.

    Words are compared exactly as they appear (case-sensitive, punctuation
    included).

    Returns:
        The remaining words joined with single spaces.
    """
    kept = []
    for token in tweet.split():
        if token not in cachedStopWords:
            kept.append(token)
    return " ".join(kept)
def removeNonEnglishTweets(tweet):
    """Discard tweets in which most words fail an ``en_US`` spell check.

    Each whitespace-separated word is checked against the enchant ``en_US``
    dictionary. When 60% or more of the words are rejected the tweet is
    dropped.

    Args:
        tweet: Tweet text.

    Returns:
        *tweet* unchanged, or ``''`` when it has no words or when at least
        60% of its words are not recognised.
    """
    words = tweet.split()
    total = len(words)
    # Check for empty input first so no dictionary is built for tweets that
    # earlier steps have already discarded.
    if total == 0:
        return ''

    d = enchant.Dict("en_US")
    count = sum(1 for word in words if not d.check(word))
    percentage = float(count) / total * 100
    if percentage >= 60:
        return ''
    return tweet
def cleanTweet(tweet):
    """Apply all cleaning steps of this module to *tweet*, in a fixed order.

    The output of each step is the input of the next. ``removeShortTweets``
    and ``removeNonEnglishTweets`` return ``''`` for tweets they reject, so
    the result may be an empty string.

    Returns:
        The cleaned tweet.
    """
    # Order matters: each step sees the text as left by the previous one.
    pipeline = (
        escapeHTMLChars,
        decodeData,
        appostropheLookup,
        removeEmoji,
        removeStopWords,
        standardizeWords,
        removeLinks,
        removeSlangs,
        removeShortTweets,
        removeNonEnglishTweets,
    )
    clean_tweet = tweet
    for step in pipeline:
        clean_tweet = step(clean_tweet)
    return clean_tweet