twitter-lexical-normalisation/match_engine.py at master · martindavid/twitter-lexical-normalisation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from __future__ import print_function
import jellyfish
#import ngram

class MatchEngine(object):
    def __init__(self, dicts):
        self.dicts = dicts

    def find_match_levenshtein(self, token, canonical):
        candidates = []
        best_score = 2
        for word in self.dicts:
            score = jellyfish.levenshtein_distance(
                token, word.decode("utf-8").lower())
            if score <= best_score:
                best_score = score
                candidates.append(word.lower())

        #G = ngram.NGram(candidates)
        #best_candidates = G.search(token, threshold=0.5)

        #results = [item[0] for item in best_candidates]

        is_match = False
        for word in candidates:
            if word == canonical:
                is_match = True
                break

        #if len(best_candidates) > 0:
        #    best_match = best_candidates[0][0]
        #else:
        #    best_match = ""

        return candidates, is_match


    def find_match_levenshtein_soundex(self, token, canonical):
        candidates = []
        best_score = 2
        for word in self.dicts:
            score = jellyfish.levenshtein_distance(
                token, word.decode("utf-8").lower())
            if score <= best_score:
                best_score = score
                candidates.append(word.lower())

        token_soundex = jellyfish.soundex(token.decode("utf-8"))
        match_soundex = [match for match in candidates if jellyfish.soundex(
            match.decode("utf-8")) == token_soundex]

        #G = ngram.NGram(match_soundex)
        #best_candidates = G.search(token, threshold=0.5)

        #results = [item[0] for item in best_candidates]

        is_match = False
        for word in match_soundex:
            if word == canonical:
                is_match = True
                break

        #if len(best_candidates) > 0:
        #    best_match = best_candidates[0][0]
        #else:
        #    best_match = ""

        return match_soundex, is_match

    def find_match_levenshtein_metaphone(self, token, canonical):
        candidates = []
        best_score = 2
        for word in self.dicts:
            score = jellyfish.levenshtein_distance(
                token, word.decode("utf-8").lower())
            if score <= best_score:
                best_score = score
                candidates.append(word.lower())

        token_metaphone = jellyfish.metaphone(token.decode("utf-8"))
        match_metaphone = [match for match in candidates if jellyfish.metaphone(
            match.decode("utf-8")) == token_metaphone]

        #G = ngram.NGram(match_metaphone)
        #best_candidates = G.search(token, threshold=0.5)

        #results = [item[0] for item in best_candidates]

        is_match = False
        for word in match_metaphone:
            if word == canonical:
                is_match = True
                break

        #if len(best_candidates) > 0:
        #    best_match = best_candidates[0][0]
        #else:
        #    best_match = ""

        return match_metaphone, is_match