# get_english_word_test_2.py
# 71 lines (57 loc), 1.97 KB
from bs4 import BeautifulSoup
import requests
import nltk
import distance
import sys
import pprint
# English stop words from the NLTK "stopwords" corpus; the script below uses
# this to drop stop words from the candidate list.
# NOTE(review): presumably the corpus must already be downloaded
# (nltk.download) for this lookup to succeed -- confirm.
STOP_WORDS = nltk.corpus.stopwords.words('english')
# Word list from the NLTK "words" corpus; the script below keeps only
# candidates that appear in it.
ENGLISH_WORDS = nltk.corpus.words.words()
# Pretty-printer with a 2-space indent; not referenced in the visible part of
# this file.
pp = pprint.PrettyPrinter(indent=2)
def __contiguous_string_matches__(main_s, s_compare, threshold=0.8):
    """Return True when the two strings share a long enough common prefix.

    The shorter of the two strings is used as the reference, so the argument
    order does not matter. Characters are compared position by position from
    the start and counting stops at the first mismatch, so only a contiguous
    leading run of matches is counted.

    Args:
        main_s: First string to compare.
        s_compare: Second string to compare.
        threshold: Minimum fraction (0.0 - 1.0) of the shorter string that
            must be covered by the common prefix. Defaults to 0.8.

    Returns:
        True if ``len(common prefix) / len(shorter string) >= threshold``,
        otherwise False. If the shorter string is empty there is nothing to
        match, so the result is False (instead of a ZeroDivisionError).
    """
    # Make sure s_compare is the shorter (or equal-length) string.
    if len(main_s) < len(s_compare):
        main_s, s_compare = s_compare, main_s

    # Guard the division below: an empty string cannot match anything.
    if not s_compare:
        return False

    matched = 0
    for short_ch, long_ch in zip(s_compare, main_s):
        if short_ch != long_ch:
            break  # Only the contiguous leading run of matches counts.
        matched += 1

    return matched / len(s_compare) >= threshold


# Short alias used by the rest of the script.
csm = __contiguous_string_matches__
# Look `word` up on Merriam-Webster and print the candidate base words taken
# from the page's "Dictionary Entries near ..." section. When that section is
# missing, print the page's spelling suggestions instead and exit.
word = 'electricals'
url = f'https://www.merriam-webster.com/dictionary/{word}'

# Browser-like request headers sent with the lookup.
spr_hdr = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/56.0.2924.76 Safari/537.36'
    ),
    'Upgrade-Insecure-Requests': '1',
    'DNT': '1',
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;'
        'q=0.9,*/*;q=0.8'
    ),
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
}

# A timeout keeps the script from hanging forever on an unresponsive server.
# The status code is deliberately not checked: the fallback branch below
# still parses whatever page came back.
response = requests.get(url, headers=spr_hdr, timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')

stuff = soup.find('div', id="near-entries-anchor")
if stuff is None:
    # No "nearby entries" section: fall back to the spelling suggestions.
    other_stuff = soup.find('p', class_="spelling-suggestions")
    if other_stuff is None:
        # Neither section exists; exit with a message rather than crashing
        # with AttributeError on None.get_text().
        sys.exit(
            f'No nearby entries or spelling suggestions found for {word!r}')
    new_stuff = other_stuff.get_text().split()
    print(new_stuff)
    # TODO: pick the suggested word out of new_stuff and report it.
    sys.exit()

# Strip the section's boilerplate labels, leaving only the entry words.
text = stuff.get_text()
text = text.replace('Dictionary Entries near ', '')
text = text.replace(' See More Nearby Entries', '')
poss_base_words = [t.lower() for t in text.split()]
print(poss_base_words)
print()

# Build the lookup sets once so each membership test is O(1) rather than a
# linear scan of the NLTK lists.
stop_words = set(STOP_WORDS)
english_words = set(ENGLISH_WORDS)

# Keep tokens that are real English words, are not stop words, share a long
# common prefix with `word`, and are no longer than `word`.
poss_base_words = sorted({
    t for t in poss_base_words
    if t not in stop_words
    and t in english_words
    and csm(word, t)
    and len(t) <= len(word)
})
print(poss_base_words)