-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmatch_engine.py
More file actions
100 lines (78 loc) · 3.01 KB
/
match_engine.py
File metadata and controls
100 lines (78 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from __future__ import print_function
import jellyfish
#import ngram
class MatchEngine(object):
    """Look up dictionary words that are close to a given token.

    Closeness is measured with the Levenshtein edit distance from the
    ``jellyfish`` package; two of the lookups additionally require the
    candidate to share the token's Soundex or Metaphone code.
    """

    def __init__(self, dicts):
        """Store the dictionary to search.

        Args:
            dicts: iterable of dictionary words (byte strings or text). It is
                iterated once per lookup.
        """
        self.dicts = dicts

    @staticmethod
    def _to_text(value):
        """Return *value* as text, decoding UTF-8 byte strings only."""
        # Text strings have no ``decode`` on Python 3, and decoding a
        # non-ASCII ``unicode`` on Python 2 raises, so only bytes are decoded.
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    def _closest_words(self, token, max_distance):
        """Return the lowercased words at the smallest edit distance.

        Only words whose distance to *token* is at most *max_distance* are
        considered, and only those tied for the smallest distance are kept,
        in dictionary iteration order.
        """
        token_text = self._to_text(token)
        best_score = max_distance
        candidates = []
        for word in self.dicts:
            score = jellyfish.levenshtein_distance(
                token_text, self._to_text(word).lower())
            if score > best_score:
                continue
            if score < best_score:
                # A strictly closer word was found: the words collected under
                # the looser threshold are no longer "best", so drop them.
                # Without this the result depends on dictionary order.
                best_score = score
                candidates = []
            # Keep the word in the string type the dictionary supplied.
            candidates.append(word.lower())
        return candidates

    def _same_code(self, token, candidates, encode):
        """Return the *candidates* whose phonetic code equals the token's.

        *encode* is the phonetic function to apply (e.g. Soundex).
        """
        token_code = encode(self._to_text(token))
        return [word for word in candidates
                if encode(self._to_text(word)) == token_code]

    def find_match_levenshtein(self, token, canonical, max_distance=2):
        """Find the dictionary words closest to *token* by edit distance.

        Args:
            token: the word to look up (byte string or text).
            canonical: the expected word, compared against the candidates.
            max_distance: largest Levenshtein distance accepted.

        Returns:
            A ``(candidates, is_match)`` tuple: the lowercased closest words
            and whether *canonical* is one of them.
        """
        candidates = self._closest_words(token, max_distance)
        return candidates, canonical in candidates

    def find_match_levenshtein_soundex(self, token, canonical,
                                       max_distance=2):
        """Find the closest words that also share the token's Soundex code.

        Args:
            token: the word to look up (byte string or text).
            canonical: the expected word, compared against the matches.
            max_distance: largest Levenshtein distance accepted.

        Returns:
            A ``(matches, is_match)`` tuple: the closest words with the same
            Soundex code as *token* and whether *canonical* is one of them.
        """
        candidates = self._closest_words(token, max_distance)
        if not candidates:
            # Nothing to filter, so the token need not be encoded at all.
            return [], False
        matches = self._same_code(token, candidates, jellyfish.soundex)
        return matches, canonical in matches

    def find_match_levenshtein_metaphone(self, token, canonical,
                                         max_distance=2):
        """Find the closest words that also share the token's Metaphone code.

        Args:
            token: the word to look up (byte string or text).
            canonical: the expected word, compared against the matches.
            max_distance: largest Levenshtein distance accepted.

        Returns:
            A ``(matches, is_match)`` tuple: the closest words with the same
            Metaphone code as *token* and whether *canonical* is one of them.
        """
        candidates = self._closest_words(token, max_distance)
        if not candidates:
            # Nothing to filter, so the token need not be encoded at all.
            return [], False
        matches = self._same_code(token, candidates, jellyfish.metaphone)
        return matches, canonical in matches