forked from v1shwa/document-similarity
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathDocSim.py
More file actions
56 lines (47 loc) · 2.12 KB
/
DocSim.py
File metadata and controls
56 lines (47 loc) · 2.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import numpy as np
class DocSim:
    """Score documents against each other using averaged word vectors.

    A document is represented by the mean of the vectors of its
    in-vocabulary, non-stopword tokens; two documents are then compared
    with cosine similarity.
    """

    def __init__(self, w2v_model, stopwords=None):
        """Store the word-vector lookup and the optional stopwords.

        Args:
            w2v_model: Object indexed by word (``w2v_model[word]``) that
                raises ``KeyError`` for words it does not know.
            stopwords: Collection of words to drop before vectorizing.
                Tokens are lowercased before the membership test, so the
                entries should be lowercase. Defaults to no stopwords.
        """
        self.w2v_model = w2v_model
        self.stopwords = stopwords if stopwords is not None else []

    def vectorize(self, doc):
        """Return the mean word vector of ``doc``.

        The document is lowercased and split on any run of whitespace;
        stopwords and words missing from the model are skipped.

        Args:
            doc: Text to vectorize.

        Returns:
            The element-wise mean of the word vectors found, or a scalar
            NaN when no token of ``doc`` is in the vocabulary.
        """
        # split() without an argument also handles tabs, newlines and
        # repeated spaces, which split(" ") would glue into unknown tokens.
        tokens = (w for w in doc.lower().split() if w not in self.stopwords)

        word_vecs = []
        for word in tokens:
            try:
                word_vecs.append(self.w2v_model[word])
            except KeyError:
                # Ignore words that do not exist in the vocabulary.
                continue

        if not word_vecs:
            # np.mean([]) would also yield NaN, but with a RuntimeWarning;
            # return the same sentinel explicitly and quietly.
            return np.float64("nan")

        # Assuming that the document vector is the mean of all the word
        # vectors. There are other and better ways to do it.
        return np.mean(word_vecs, axis=0)

    def _cosine_sim(self, vecA, vecB):
        """Return the cosine similarity between two vectors.

        Args:
            vecA: First vector (or the NaN sentinel from ``vectorize``).
            vecB: Second vector (or the NaN sentinel from ``vectorize``).

        Returns:
            ``dot(vecA, vecB) / (|vecA| * |vecB|)``, or ``0`` when the
            similarity is undefined (NaN input or a zero-length vector).
        """
        vec_a = np.asarray(vecA)
        vec_b = np.asarray(vecB)
        if np.isnan(vec_a).any() or np.isnan(vec_b).any():
            return 0

        denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
        if denom == 0:
            # A zero vector has no direction; avoid dividing by zero.
            return 0

        csim = np.dot(vec_a, vec_b) / denom
        if np.isnan(np.sum(csim)):
            return 0
        return csim

    def calculate_similarity(self, source_doc, target_docs=None, topn=None, threshold=0):
        """Score every target document against the source document.

        Args:
            source_doc: Text to compare the targets against.
            target_docs: A single string or a sequence of strings.
            topn: Maximum number of results to return; a falsy value
                (``None`` or ``0``) returns every match.
            threshold: Only results whose score is strictly greater than
                this value are kept.

        Returns:
            A list of dicts sorted by descending score. Each dict has
            ``"score"`` (cosine similarity), ``"tag"`` (a one-element list
            holding the target's index as a string) and ``"doc"`` (the
            lowercased target text).
        """
        # Wrap a bare string first so it is treated as one document rather
        # than measured or iterated character by character.
        if isinstance(target_docs, str):
            target_docs = [target_docs]
        if not target_docs:
            return []

        source_vec = self.vectorize(source_doc)

        results = []
        for index, target in enumerate(target_docs):
            text = target.lower()
            sim_score = self._cosine_sim(source_vec, self.vectorize(text))
            if sim_score > threshold:
                results.append({"score": sim_score, "tag": [str(index)], "doc": text})

        # Sort results by score in descending order.
        results.sort(key=lambda k: k["score"], reverse=True)
        return results[:topn] if topn else results