-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathextractive.py
More file actions
197 lines (182 loc) · 8.17 KB
/
extractive.py
File metadata and controls
197 lines (182 loc) · 8.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
from collections import Counter

import networkx as nx
import nltk
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
"""
All Extractive Data Processing Methods In One Python File
"""
class TextRank():
    """
    Run TextRank on the data to ensure model is run against the most important summaries.

    Every entry of the 'summary' column is split on the "@ highlight" token,
    the resulting sentences are scored with PageRank over their pairwise
    cosine similarities, and the entry is replaced by its top-scoring sentence.
    """

    # Location of the GloVe vectors read by get_word_embeddings by default.
    EMBEDDINGS_PATH = './glove/glove.6B.100d.txt'
    # Vector size used only when the embedding table is empty.
    DEFAULT_DIM = 100

    def __init__(self, df):
        """
        Keep a reference to df and immediately rewrite its 'summary' column.
        """
        self.df = df
        self.main()

    def main(self):
        """
        Replace each summary in self.df with its highest ranked sentence.
        """
        summaries = self.df['summary']
        self.df['summary'] = [self.rank_summaries(summary) for summary in summaries]

    def rank_summaries(self, summary):
        """
        Rank the sentences of one summary and return the one with the highest score.

        summary: string whose sentences are separated by "@ highlight".
        Returns the chosen sentence exactly as it appears between the tokens.
        """
        summary_split = summary.split("@ highlight")
        embedding_index = self._cached_embeddings()

        # size of the zero vector must match the loaded embeddings
        sample = next(iter(embedding_index.values()), None)
        dim = len(sample) if sample is not None else self.DEFAULT_DIM
        zero_vector = np.zeros((dim,))

        # mean word vector for each sentence
        sentence_vectors = []
        for sentence in summary_split:
            words = nltk.word_tokenize(sentence)
            if words:
                mean_vector = sum(
                    embedding_index.get(word, zero_vector) for word in words
                ) / len(words)
            else:
                # an empty sentence has no words to average; avoid dividing by zero
                mean_vector = zero_vector
            sentence_vectors.append(mean_vector)

        sim_matrix = self.get_similarity_matrix(sentence_vectors)
        # scores are indexable by sentence position (dict from pagerank, list from the fallback)
        scores = self.get_graph(sim_matrix)

        # highest score wins; equal scores fall back to comparing the sentence text
        best = max((scores[i], sent) for i, sent in enumerate(summary_split))
        return best[1]

    def _cached_embeddings(self):
        """
        Return the embedding table, reading it from disk only on first use.
        """
        index = getattr(self, '_embedding_index', None)
        if index is None:
            index = self.get_word_embeddings()
            self._embedding_index = index
        return index

    def get_similarity_matrix(self, sentence_vectors):
        """
        Build the pairwise cosine similarity matrix of the sentence vectors.

        Returns a square float array with a zero diagonal, so a sentence is
        never counted as similar to itself.
        """
        count = len(sentence_vectors)
        if count == 0:
            return np.zeros([0, 0])
        # CSim(d1,d2) = cos(x) - all pairs are computed in a single call
        sim_matrix = np.asarray(
            cosine_similarity(np.vstack(sentence_vectors)), dtype=float)
        np.fill_diagonal(sim_matrix, 0.0)
        return sim_matrix

    def get_graph(self, sim_matrix):
        """
        Score every sentence by running PageRank on the similarity graph.

        Returns the pagerank result keyed by sentence position, or, when the
        power iteration does not converge, a list with the average of each
        matrix row.
        """
        nx_graph = nx.from_numpy_array(sim_matrix)
        try:
            # limit to 50 iterations to speed up processing
            scores = nx.pagerank(nx_graph, max_iter=50, alpha=0.85)
        except nx.PowerIterationFailedConvergence:
            # fall back to averaging the similarities over each matrix row
            scores = [np.average(row) for row in sim_matrix]
        return scores

    def get_word_embeddings(self, path=None):
        """
        Get GloVe Word Embeddings

        path: optional file to read instead of EMBEDDINGS_PATH. Each non-blank
        line holds a word followed by its whitespace-separated coefficients.
        Returns a dict mapping each word to a float32 numpy array.
        """
        if path is None:
            path = self.EMBEDDINGS_PATH
        embedding_index = {}
        with open(path, encoding="utf8") as f:
            for line in f:
                values = line.split()
                if not values:
                    # blank line: nothing to parse
                    continue
                embedding_index[values[0]] = np.asarray(values[1:], dtype='float32')
        return embedding_index
class WordFrequency():
    """
    Run WordFrequency on the data to ensure model is run against the summaries that has the highest word frequency rank with the main article

    Every entry of the 'summary' column is split on the "@ highlight" token and
    replaced by the sentence that shares the largest fraction of its words with
    the 'text' entry of the same row.
    """

    # Optional tokenizer override; None means nltk.word_tokenize is used.
    _tokenizer = None

    def __init__(self, df, tokenizer=None):
        """
        Keep a reference to df and immediately rewrite its 'summary' column.

        tokenizer: optional callable turning a string into a list of words;
        defaults to nltk.word_tokenize.
        """
        self.df = df
        self._tokenizer = tokenizer
        self.main()

    def main(self):
        """
        Score the sentences of every summary and keep the best one per row.
        """
        texts = self.df['text']
        summaries = self.df['summary']
        # cursor into texts; score_sentences advances it once per summary
        self.sent_pos = 0
        # sentence scores = [("sentence1", value1) ... ("sentencex", valuex)]
        sentence_scores = [self.score_sentences(summary, texts) for summary in summaries]
        self.df['summary'] = [self.get_best_summary(sentences) for sentences in sentence_scores]

    def _tokenize(self, text):
        """
        Split text into words with the configured tokenizer.
        """
        tokenizer = self._tokenizer
        if tokenizer is None:
            tokenizer = nltk.word_tokenize
        return tokenizer(text)

    def score_sentences(self, document, texts):
        """
        Score each sentence of a summary by the fraction of its words that also occur in the matching article.

        document: summary string whose sentences are separated by "@ highlight".
        texts: the articles; the one at position self.sent_pos is used and the
        cursor is then advanced by one.
        Returns a list of (sentence, score) pairs in sentence order.
        """
        position = getattr(self, 'sent_pos', 0)
        # positional lookup, so a DataFrame index that is not 0..n-1 still pairs rows correctly
        article = texts.iloc[position] if hasattr(texts, 'iloc') else texts[position]
        self.sent_pos = position + 1

        scorable_words = set(self.word_frequency(article))
        sent_scores = []
        for sentence in document.split("@ highlight"):
            words = self._tokenize(sentence)
            matches = sum(1 for word in words if word in scorable_words)
            # normalise by sentence length so that longer sentences do not get an automatic advantage
            # an empty sentence scores 0
            score = matches / len(words) if words else 0
            sent_scores.append((sentence, score))
        return sent_scores

    def word_frequency(self, document, min_count=1):
        """
        Calculate a word frequency table for the words in a given document.

        min_count: a word must occur at least this many times to be kept.
        Returns the kept words ordered from most to least frequent; words with
        equal counts stay in order of first appearance.
        """
        counts = Counter(self._tokenize(document))
        return [word for word, occ in counts.most_common() if occ >= min_count]

    def get_best_summary(self, sent_scores):
        """
        Get the best summary based on which has the greatest score

        Returns the first sentence with the highest score, or "" when no
        sentence scored above 0.
        """
        best_sent, best_val = "", 0
        for sentence, val in sent_scores:
            if val > best_val:
                best_sent, best_val = sentence, val
        return best_sent
class SentencePosition():
    """
    Run SentencePosition on the data to ensure model is run against the most important sentences in the articles.
    Sentences in the beginning define the theme and sentences at the end conclude the document, so only
    the leading and trailing sentences of each article are kept.
    """

    def __init__(self, df):
        """
        Keep a reference to df and immediately rewrite its 'text' column.
        """
        self.df = df
        self.main()

    def main(self):
        """
        Replace every article in self.df['text'] with its reduced version.
        """
        texts = self.df['text']
        self.df['text'] = [self.sentence_ranker(text) for text in texts]

    def sentence_ranker(self, article, max_rank=5):
        """
        Keep the first and last max_rank sentences of an article.

        article: string whose sentences are separated by the "< eos >" token.
        max_rank: how many sentences to keep from each end (default 5).
        Returns the kept sentences concatenated in their original order with no
        separator; a sentence whose text repeats is kept only at its first
        position.
        """
        sentences = article.split("< eos >")
        total = len(sentences)
        kept = (
            sentence
            for position, sentence in enumerate(sentences)
            # intro sentences, or closing sentences; short articles may match both
            if position < max_rank or position >= total - max_rank
        )
        # dict keys drop repeated sentence text while preserving order
        return "".join(dict.fromkeys(kept))