✨ Add synonym support (thesaurus feature fully implemented)

FerreroJeremy · FerreroJeremy · commit 45adf21675b5 · 2017-11-25T11:11:58.000+01:00
- 🔨 Move to Python 3
- ✨ Add synonym support for columns in WHERE clauses (thesaurus
feature fully implemented)
- 🐛 Fix value assignment in WHERE parsing
diff --git a/Makefile b/Makefile
@@ -1,5 +1,5 @@
 clean:
 	rm -rf *.json *.pyc
 test:
-	python -m unittest test_unit
+	python3 -m unittest test_unit
 	rm -rf *.json *.pyc
diff --git a/Parser.py b/Parser.py
@@ -259,7 +259,7 @@ def join(self):
 
 class WhereParser(Thread):
 
-    def __init__(self, phrases, tables_of_from, columns_of_values_of_where, count_keywords, sum_keywords, average_keywords, max_keywords, min_keywords, greater_keywords, less_keywords, between_keywords, negation_keywords, junction_keywords, disjunction_keywords, like_keywords, distinct_keywords, database_dico):
+    def __init__(self, phrases, tables_of_from, columns_of_values_of_where, count_keywords, sum_keywords, average_keywords, max_keywords, min_keywords, greater_keywords, less_keywords, between_keywords, negation_keywords, junction_keywords, disjunction_keywords, like_keywords, distinct_keywords, database_dico, database_object):
         Thread.__init__(self)
         self.where_objects = []
         self.phrases = phrases
@@ -279,6 +279,7 @@ def __init__(self, phrases, tables_of_from, columns_of_values_of_where, count_ke
         self.like_keywords = like_keywords
         self.distinct_keywords = distinct_keywords
         self.database_dico = database_dico
+        self.database_object = database_object
 
     def get_tables_of_column(self, column):
         tmp_table = []
@@ -359,6 +360,13 @@ def predict_junction(self, previous_column_offset, current_column_offset):
         else:
             return 'OR'
 
+    def uniquify(self, list):  # return the items of `list` in first-seen order, with duplicates dropped
+       already = []  # unique items collected so far, in encounter order
+       for element in list:
+           if element not in already:  # list membership compares with ==, so items need not be hashable
+               already.append(element)
+       return already  # always a new list; the input is not modified
+
     def run(self):
         number_of_where_columns = 0
         columns_of_where = []
@@ -379,13 +387,18 @@ def run(self):
 
         for phrase in self.phrases:
             for i in range(0, len(phrase)):
-                for table in self.database_dico:
-                    if phrase[i] in self.database_dico[table]:
-                        number_of_where_columns += 1
-                        columns_of_where.append(phrase[i])
-                        offset_of[phrase[i]] = i
-                        column_offset.append(i)
-                        break
+                for table_name in self.database_dico:
+                    columns = self.database_object.get_table_by_name(table_name).get_columns()
+                    for column in columns:
+                        if (phrase[i] == column.get_name()) or (phrase[i] in column.get_equivalences()):
+                            number_of_where_columns += 1
+                            columns_of_where.append(column.get_name())
+                            offset_of[phrase[i]] = i
+                            column_offset.append(i)
+                            break
+                    else:
+                        continue
+                    break
 
                 phrase_keyword = str(phrase[i]).lower()  # for robust keyword matching
 
@@ -416,6 +429,8 @@ def run(self):
                 if phrase_keyword in self.like_keywords:  # after the column
                     self.like_keyword_offset.append(i)
 
+        print(self.columns_of_values_of_where)
+        print(columns_of_where)
 
         for table_of_from in self.tables_of_from:
             where_object = Where()
@@ -437,7 +452,7 @@ def run(self):
                 operation_type = self.predict_operation_type(previous, current)
 
                 if len(self.columns_of_values_of_where) > i:
-                    value = self.columns_of_values_of_where[i]
+                    value = self.columns_of_values_of_where[len(self.columns_of_values_of_where) - len(columns_of_where) + i]
                 else:
                     value = 'OOV'  # Out Of Vocabulary: default value
 
@@ -452,12 +467,13 @@ def join(self):
 
 class GroupByParser(Thread):
 
-    def __init__(self, phrases, tables_of_from, database_dico):
+    def __init__(self, phrases, tables_of_from, database_dico, database_object):
         Thread.__init__(self)
         self.group_by_objects = []
         self.phrases = phrases
         self.tables_of_from = tables_of_from
         self.database_dico = database_dico
+        self.database_object = database_object
 
     def get_tables_of_column(self, column):
         tmp_table = []
@@ -479,11 +495,12 @@ def run(self):
             group_by_object = GroupBy()
             for phrase in self.phrases:
                 for i in range(0, len(phrase)):
-                    for table in self.database_dico:
-                        if phrase[i] in self.database_dico[table]:
-                            column = self.get_column_name_with_alias_table(
-                                phrase[i], table_of_from)
-                            group_by_object.set_column(column)
+                    for table_name in self.database_dico:
+                        columns = self.database_object.get_table_by_name(table_name).get_columns()
+                        for column in columns:
+                            if (phrase[i] == column.get_name()) or (phrase[i] in column.get_equivalences()):
+                            	column_with_alias = self.get_column_name_with_alias_table(column.get_name(), table_of_from)
+                            	group_by_object.set_column(column_with_alias)
             self.group_by_objects.append(group_by_object)
 
     def join(self):
@@ -493,14 +510,15 @@ def join(self):
 
 class OrderByParser(Thread):
 
-    def __init__(self, phrases, tables_of_from, asc_keywords, desc_keywords, database_dico):
+    def __init__(self, phrases, tables_of_from, asc_keywords, desc_keywords, database_dico, database_object):
         Thread.__init__(self)
         self.order_by_objects = []
         self.phrases = phrases
         self.tables_of_from = tables_of_from
         self.asc_keywords = asc_keywords
         self.desc_keywords = desc_keywords
         self.database_dico = database_dico
+        self.database_object = database_object
 
     def get_tables_of_column(self, column):
         tmp_table = []
@@ -531,10 +549,12 @@ def run(self):
             order_by_object = OrderBy()
             for phrase in self.phrases:
                 for i in range(0, len(phrase)):
-                    for table in self.database_dico:
-                        if phrase[i] in self.database_dico[table]:
-                            column = self.get_column_name_with_alias_table(phrase[i], table_of_from)
-                            order_by_object.add_column(column, self.predict_order(phrase))
+                    for table_name in self.database_dico:
+                        columns = self.database_object.get_table_by_name(table_name).get_columns()
+                        for column in columns:
+                            if (phrase[i] == column.get_name()) or (phrase[i] in column.get_equivalences()):
+                                column_with_alias = self.get_column_name_with_alias_table(column.get_name(), table_of_from)
+                                order_by_object.add_column(column_with_alias, self.predict_order(phrase))
             self.order_by_objects.append(order_by_object)
 
     def join(self):
@@ -624,27 +644,35 @@ def parse_sentence(self, sentence, stopwordsFilter=None):
         med_phrase = ''
         end_phrase = ''
 
+        ''' @todo merge this part of the algorithm (detection of values of where) in the rest of the parsing algorithm (about line 725) '''
+
         for i in range(0, len(input_word_list)):
-            if input_word_list[i] in self.database_dico:
-                if number_of_table_temp == 0:
-                    start_phrase = input_word_list[:i]
-                number_of_table_temp += 1
-                last_table_position_temp = i
-            for table in self.database_dico:
-                if input_word_list[i] in self.database_dico[table]:
-                    if number_of_where_column_temp == 0:
-                        med_phrase = input_word_list[
-                            len(start_phrase):last_table_position_temp + 1]
-                    number_of_where_column_temp += 1
-                    break
+            for table_name in self.database_dico:
+                if (input_word_list[i] == table_name) or (input_word_list[i] in self.database_object.get_table_by_name(table_name).get_equivalences()):
+                    if number_of_table_temp == 0:
+                        start_phrase = input_word_list[:i]
+                    number_of_table_temp += 1
+                    last_table_position_temp = i
+
+                columns = self.database_object.get_table_by_name(table_name).get_columns()
+                for column in columns:
+                    if (input_word_list[i] == column.get_name()) or (input_word_list[i] in column.get_equivalences()):
+                        if number_of_where_column_temp == 0:
+                            med_phrase = input_word_list[len(start_phrase):last_table_position_temp + 1]
+                        number_of_where_column_temp += 1
+                        break
+                    else:
+                        if (number_of_table_temp != 0) and (number_of_where_column_temp == 0) and (i == (len(input_word_list) - 1)):
+                            med_phrase = input_word_list[len(start_phrase):]
                 else:
-                    if (number_of_table_temp != 0) and (number_of_where_column_temp == 0) and (i == (len(input_word_list) - 1)):
-                        med_phrase = input_word_list[len(start_phrase):]
+                    continue
+                break
 
         end_phrase = input_word_list[len(start_phrase) + len(med_phrase):]
+
         irext = ' '.join(end_phrase)
 
-        ''' @todo set this part of the algorithm (detection of values of where) in the part of the phrases where parsing '''
+        ''' @todo set this part of the algorithm (detection of values of where) in the WhereParser thread '''
 
         if irext:
             irext = self.remove_accents(irext.lower())
@@ -698,6 +726,8 @@ def parse_sentence(self, sentence, stopwordsFilter=None):
                         # replace back <_> to spaces from the values assigned
                         columns_of_values_of_where.append(str("'" + str(irext_list[index]).replace('<_>', ' ') + "'"))
 
+        ''' ----------------------------------------------------------------------------------------------------------- '''
+        
         tables_of_from = []
         select_phrase = ''
         from_phrase = ''
@@ -819,9 +849,9 @@ def parse_sentence(self, sentence, stopwordsFilter=None):
         try:
             select_parser = SelectParser(columns_of_select, tables_of_from, select_phrase, self.count_keywords, self.sum_keywords, self.average_keywords, self.max_keywords, self.min_keywords, self.distinct_keywords, self.database_dico, self.database_object)
             from_parser = FromParser(tables_of_from, columns_of_select, columns_of_where, self.database_object)
-            where_parser = WhereParser(new_where_phrase, tables_of_from, columns_of_values_of_where, self.count_keywords, self.sum_keywords, self.average_keywords, self.max_keywords, self.min_keywords, self.greater_keywords, self.less_keywords, self.between_keywords, self.negation_keywords, self.junction_keywords, self.disjunction_keywords, self.like_keywords, self.distinct_keywords, self.database_dico)
-            group_by_parser = GroupByParser(group_by_phrase, tables_of_from, self.database_dico)
-            order_by_parser = OrderByParser(order_by_phrase, tables_of_from, self.asc_keywords, self.desc_keywords, self.database_dico)
+            where_parser = WhereParser(new_where_phrase, tables_of_from, columns_of_values_of_where, self.count_keywords, self.sum_keywords, self.average_keywords, self.max_keywords, self.min_keywords, self.greater_keywords, self.less_keywords, self.between_keywords, self.negation_keywords, self.junction_keywords, self.disjunction_keywords, self.like_keywords, self.distinct_keywords, self.database_dico, self.database_object)
+            group_by_parser = GroupByParser(group_by_phrase, tables_of_from, self.database_dico, self.database_object)
+            order_by_parser = OrderByParser(order_by_phrase, tables_of_from, self.asc_keywords, self.desc_keywords, self.database_dico, self.database_object)
 
             select_parser.start()
             from_parser.start()
diff --git a/lang/english.csv b/lang/english.csv
@@ -13,6 +13,6 @@ ASC: ascending, increasing
 DESC: descending, decreasing, inverse, reverse, opposite
 GROUP: group, grouped
 NEGATION: not, no
-EQUAL: is, equal, equals, equal to, equals to
+EQUAL: is, equal, equals, equal to, equals to, are
 LIKE: like, likes
 DISTINCT: distinct, different, distinctive, distinctly
diff --git a/lang/french.csv b/lang/french.csv
@@ -13,6 +13,6 @@ ASC: ascendant, ascendante, croissant
 DESC: descendant, descendante, décroissant, inverse, inversé, inversée
 GROUP: groupe, groupé, rangé
 NEGATION: ne, pas, aucun
-EQUAL: est, égal, égal à
+EQUAL: est, égal, égal à, sont
 LIKE: comme
 DISTINCT: distinct, distincte, distincts, distinctes, distinctive, distinctement, distinctivement
diff --git a/test_unit.py b/test_unit.py
@@ -180,6 +180,12 @@ def test_main(self):
                 'database': './database/city.sql',
                 'language': './lang/english.csv',
                 'output': "SELECT DISTINCT emp.name FROM city INNER JOIN emp ON city.id = emp.cityId WHERE emp.score = '9';"
+            },
+            {
+                'input': "Compte les nom des élève dont les nom sont BELLE",
+                'database': './database/ecole.sql',
+                'language': './lang/french.csv',
+                'output': "SELECT COUNT(eleve.nom) FROM eleve WHERE eleve.nom = 'belle';"
             }
         ]
 
@@ -252,6 +258,30 @@ def test_main(self):
                 'language': './lang/french.csv',
                 'thesaurus': 'thesaurus/th_french.dat',
                 'output': "SELECT classe.salle FROM classe;"
+            },
+            {
+                'input': "Compte les dénomination des étudiant dont les dénomination sont BELLE",
+                'database': './database/ecole.sql',
+                'language': './lang/french.csv',
+                'thesaurus': 'thesaurus/th_french.dat',
+                'output': "SELECT COUNT(eleve.nom) FROM eleve WHERE eleve.nom = 'belle';"
+            },
+            {
+                'input': "Compte les dénomination des étudiant dont les dénomination sont BELLE et l'ancienneté est 25",
+                'database': './database/ecole.sql',
+                'language': './lang/french.csv',
+                'thesaurus': 'thesaurus/th_french.dat',
+                'output': "SELECT COUNT(eleve.nom) FROM eleve WHERE eleve.nom = 'belle' AND eleve.age = '25';"
+            }
+        ]
+
+        thesaurusTest2 = [
+            {
+                'input': "Quel est le cours où la pièce est B45",
+                'database': './database/ecole.sql',
+                'language': './lang/french.csv',
+                'thesaurus': 'thesaurus/th_french.dat',
+                'output': "SELECT * FROM classe WHERE classe.salle = 'b45';"
             }
         ]