-
Notifications
You must be signed in to change notification settings - Fork 24
Expand file tree
/
Copy pathlanguage_processing.py
More file actions
144 lines (115 loc) · 5.19 KB
/
language_processing.py
File metadata and controls
144 lines (115 loc) · 5.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2024-05-02: created by the Cambridge Crystallographic Data Centre
#
"""
Credit: this code is adapted from 'The impact of the Cambridge Structural Database and the small molecule crystal structures it contains: a bibliographic and literature study'
by Peter Willett, Jason C. Cole and Ian J. Bruno.
@Article{D0CE00045K,
author ="Willett, Peter and Cole, Jason C. and Bruno, Ian J.",
title ="The impact of the Cambridge Structural Database and the small molecule crystal structures it contains: a bibliographic and literature study",
journal ="CrystEngComm",
year ="2020",
volume ="22",
issue ="43",
pages ="7233-7241",
publisher ="The Royal Society of Chemistry",
doi ="10.1039/D0CE00045K",
url ="https://dx.doi.org/10.1039/D0CE00045K",
}
"""
# For now, to prevent deprecation warnings tripping things up
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # noqa
# flake8 doesn't like imports after the line above, but we have to have the line above to
# prevent deprecation warnings from these third-party packages sending spurious output
# which we don't want, as it is 'seen' as an error when using this code through Mercury.
from wordcloud import WordCloud # noqa
import nltk # noqa
import nltk.collocations # noqa
from nltk.stem.wordnet import WordNetLemmatizer # noqa
import pandas as pd # noqa
from csv import writer # noqa
class FrequencyCalculator:
    """
    Calculate single-word and bigram frequencies for a text corpus using nltk.

    On construction the text is lower-cased, tokenised, filtered and
    lemmatised; the resulting word list and its frequency distribution are
    then used by the reporting methods.

    Attributes:
        stop: set of English stop words taken from ``nltk.corpus.stopwords``.
        bigrams_to_ignore: set of bigram tuples that ``right_bigram_type``
            always rejects. Empty by default; callers may add to it.
        bigram_words_to_ignore: set of words that cause any bigram containing
            them to be rejected. Empty by default; callers may add to it.
    """

    def __init__(self, text):
        """
        Tokenise, filter and lemmatise ``text`` and build its word frequencies.

        :param text: the raw corpus as a single string.
        """

        def update_word(word):
            # Fold "bonding" into "bond" so that both are counted as one term.
            return "bond" if word == "bonding" else word

        def keep(word, stop_words):
            # Tokens of three characters or fewer are dropped, except "tin".
            if len(word) <= 3 and word.lower() != "tin":
                return False
            # Tokens that start with a slash are dropped.
            if word.startswith('/'):
                return False
            return word not in stop_words

        self.stop = set(nltk.corpus.stopwords.words('english'))
        self.bigrams_to_ignore = set()
        self.bigram_words_to_ignore = set()

        lem = WordNetLemmatizer()
        # Filtering is applied to the raw token; lemmatisation and the
        # "bonding" -> "bond" substitution are applied to the tokens kept.
        self._words = [
            update_word(lem.lemmatize(word))
            for word in nltk.word_tokenize(text.lower())
            if keep(word, self.stop)
        ]
        self._fdist = nltk.FreqDist(self._words)

    def most_common_words(self, how_many=None):
        """
        Return the total number of kept words and the most common of them.

        :param how_many: maximum number of entries to return, or None for all.
        :return: tuple ``(n_elements, rowdata)`` where ``n_elements`` is the
            length of the processed word list and ``rowdata`` is a list of
            ``(word, count)`` pairs ordered from most to least common.
        """
        n_elements = len(self._words)
        if how_many is None:
            rowdata = self._fdist.most_common()
        else:
            rowdata = self._fdist.most_common(how_many)
        return n_elements, rowdata

    def write_word_frequency_table(self, how_many=None, filename='single_word_frequencies.csv'):
        """
        Write the most common words to a CSV file.

        The file has a header row ``word,count,frequency``; ``frequency`` is
        the count expressed as a percentage of all kept words.

        :param how_many: maximum number of words to write, or None for all.
        :param filename: path of the CSV file to create (overwritten if present).
        """
        n_elements, rowdata = self.most_common_words(how_many)
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            w = writer(csvfile)
            w.writerow(['word', 'count', 'frequency'])
            # rowdata is empty when n_elements is 0, so no division by zero occurs.
            w.writerows([word, count, count * 100.0 / n_elements] for word, count in rowdata)

    # The bigram handling below is based on
    # https://medium.com/@nicharuch/collocations-identifying-phrases-that-act-like-individual-words-in-nlp-f58a93a2f84a

    def right_bigram_type(self, ngram):
        """
        Decide whether a bigram should be kept in the bigram frequency table.

        A bigram is kept only when neither word is an excluded token, the
        bigram itself is not in ``bigrams_to_ignore``, the first word is
        tagged as an adjective or noun (JJ*/NN*) and the second as a noun (NN*).

        :param ngram: a tuple of two words.
        :return: True if the bigram is acceptable, otherwise False.
        """
        # Tuple membership: rejects bigrams where one whole word is '-pron-' or 't'.
        if '-pron-' in ngram or 't' in ngram:
            return False
        for word in ngram:
            if word in self.stop or word.isspace() or word in self.bigram_words_to_ignore:
                return False
        # Cheap set lookup first, so ignored bigrams never reach the POS tagger.
        if ngram in self.bigrams_to_ignore:
            return False
        acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
        second_type = ('NN', 'NNS', 'NNP', 'NNPS')
        tags = nltk.pos_tag(ngram)
        return tags[0][1] in acceptable_types and tags[1][1] in second_type

    def calculate_bigram_frequencies(self):
        """
        Build a table of acceptable bigrams and their counts.

        :return: a pandas DataFrame with columns ``bigram`` (tuple of two
            words) and ``freq`` (count), sorted by descending ``freq`` and
            restricted to the bigrams accepted by ``right_bigram_type``.
        """
        finder = nltk.collocations.BigramCollocationFinder.from_words(self._words)
        frequency_table = pd.DataFrame(
            list(finder.ngram_fd.items()), columns=['bigram', 'freq']
        ).sort_values(by='freq', ascending=False)
        # Force a boolean mask: on an empty table map() yields an empty
        # object-dtype Series, which pandas would otherwise treat as a
        # (column) key list and return a frame without its columns.
        mask = frequency_table['bigram'].map(self.right_bigram_type).astype(bool)
        return frequency_table[mask]
def make_word_cloud_from_text(text, fname):
    """
    Render a word cloud directly from raw text and save it as an image.

    :param text: the text from which the word cloud is generated.
    :param fname: path of the image file to write.
    """
    # The maximum font size is capped at 40.
    cloud = WordCloud(max_font_size=40)
    cloud.generate(text)
    cloud.to_file(fname)
def make_word_cloud_from_frequencies(dataframe, min_count, fname):
    """
    Render a word cloud from a table of phrase frequencies and save it.

    :param dataframe: table whose rows are ``(words, count)`` pairs, where
        ``words`` is a sequence of strings that is joined with spaces to form
        the displayed phrase.
    :param min_count: rows with a count below this value are left out.
    :param fname: path of the image file to write.
    """
    # Phrase -> count, keeping only rows that reach the minimum count.
    phrase_counts = {
        " ".join(words): count
        for words, count in dataframe.values
        if count >= min_count
    }
    cloud = WordCloud(width=800, height=600, background_color='white', prefer_horizontal=0.5)
    cloud.generate_from_frequencies(frequencies=phrase_counts)
    cloud.to_file(fname)
def word_frequency_analysis(min_bigrams, text_to_process, fname):
    """
    Run the bigram analysis on a text and write the result as a word cloud.

    :param min_bigrams: minimum count a bigram needs to appear in the cloud.
    :param text_to_process: the raw text to analyse.
    :param fname: path of the image file to write.
    """
    calculator = FrequencyCalculator(text_to_process)
    bigram_table = calculator.calculate_bigram_frequencies()
    make_word_cloud_from_frequencies(bigram_table, min_bigrams, fname)