-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathre_modules.py
More file actions
60 lines (49 loc) · 1.97 KB
/
re_modules.py
File metadata and controls
60 lines (49 loc) · 1.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import re
# Compiled once at import time: the pattern never changes between calls.
_PHONE_PATTERN = re.compile(r'''
    # don't match beginning of string, number can start anywhere
    (\d{3})     # area code is 3 digits (e.g. '800')
    \D*         # optional separator is any number of non-digits
    (\d{3})     # trunk is 3 digits (e.g. '555')
    \D*         # optional separator
    (\d{4})     # rest of number is 4 digits (e.g. '1212')
    \D*         # optional separator
    (\d*)       # extension is optional and can be any number of digits
    $           # end of string
    ''', re.VERBOSE)


def contains_phone(text):
    """Replace a text holding a phone number with the placeholder "phonenumber".

    A phone number here is a run of 3 digits, 3 digits and 4 digits, each
    group optionally separated by any non-digit characters.  The match may
    start anywhere in ``text`` but must run to the end of the string: after
    the 4-digit group only non-digit characters and an optional trailing
    digit run (the extension) are allowed.

    Args:
        text: The string to inspect.

    Returns:
        The literal string ``"phonenumber"`` when the pattern is found,
        otherwise ``text`` itself, unchanged.
    """
    # search() stops at the first hit; there is no need to collect every
    # match with findall() just to learn whether one exists.
    if _PHONE_PATTERN.search(text):
        return "phonenumber"
    return text
# Emoticon sub-pattern (verbose syntax); reused on its own to recognise
# tokens whose case must be preserved.
_EMOTICON_PATTERN = r'''
    (?:
        [:=;]                   # Eyes
        [oO\-]?                 # Nose (optional)
        [D\)\]\(\]/\\OpP]       # Mouth
    )'''

# Token alternatives in priority order: the first alternative that matches at
# a given position wins.
_TOKEN_PATTERNS = (
    _EMOTICON_PATTERN,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f]['
    r'0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)',  # anything else
)

# Both regexes are invariant, so they are built once at import time instead
# of on every preprocess() call.
_TOKENS_RE = re.compile(
    r'(' + '|'.join(_TOKEN_PATTERNS) + ')', re.VERBOSE | re.IGNORECASE
)
_EMOTICON_RE = re.compile(
    r'^' + _EMOTICON_PATTERN + '$', re.VERBOSE | re.IGNORECASE
)


def preprocess(s, lowercase=False):
    """Split ``s`` into a list of string tokens.

    ``s`` is converted with ``str()`` first, so non-string input is
    tokenized by its string form.  Tokens are recognised in this priority
    order: emoticons, HTML tags, @-mentions, hash-tags, URLs, numbers,
    words (optionally containing ``-``, ``_`` or ``'``), other word-character
    runs, and finally any single non-whitespace character.  Whitespace
    between tokens is dropped.  Matching is case-insensitive.

    Args:
        s: The text to tokenize.
        lowercase: When true, every token is lower-cased except tokens that
            are, in full, an emoticon (so ``:D`` keeps its capital ``D``).

    Returns:
        The tokens in the order they appear in the text.
    """
    tokens = _TOKENS_RE.findall(str(s))
    if lowercase:
        tokens = [
            tok if _EMOTICON_RE.search(tok) else tok.lower()
            for tok in tokens
        ]
    return tokens
def main():
    """Tokenize a sample sentence with preprocess() and print the tokens."""
    sample_text = "Haryana government will send food package and 25.3 blankets in Nepal 1230,1485 http://t.co/hAiN2bMSQ3"
    tokens = preprocess(sample_text, lowercase=False)
    print(tokens)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()