Skip to content

Commit 45adf21

Browse files
committed
✨ Add synonyms support (thesaurus feature fully implemented)
- 🔨 Move to Python 3 - ✨ Add synonym support for columns in WHERE clauses (thesaurus feature fully implemented) - 🐛 Fix value assignment in WHERE parsing
1 parent bec3735 commit 45adf21

5 files changed

Lines changed: 101 additions & 41 deletions

File tree

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
clean:
22
rm -rf *.json *.pyc
33
test:
4-
python -m unittest test_unit
4+
python3 -m unittest test_unit
55
rm -rf *.json *.pyc

Parser.py

Lines changed: 68 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ def join(self):
259259

260260
class WhereParser(Thread):
261261

262-
def __init__(self, phrases, tables_of_from, columns_of_values_of_where, count_keywords, sum_keywords, average_keywords, max_keywords, min_keywords, greater_keywords, less_keywords, between_keywords, negation_keywords, junction_keywords, disjunction_keywords, like_keywords, distinct_keywords, database_dico):
262+
def __init__(self, phrases, tables_of_from, columns_of_values_of_where, count_keywords, sum_keywords, average_keywords, max_keywords, min_keywords, greater_keywords, less_keywords, between_keywords, negation_keywords, junction_keywords, disjunction_keywords, like_keywords, distinct_keywords, database_dico, database_object):
263263
Thread.__init__(self)
264264
self.where_objects = []
265265
self.phrases = phrases
@@ -279,6 +279,7 @@ def __init__(self, phrases, tables_of_from, columns_of_values_of_where, count_ke
279279
self.like_keywords = like_keywords
280280
self.distinct_keywords = distinct_keywords
281281
self.database_dico = database_dico
282+
self.database_object = database_object
282283

283284
def get_tables_of_column(self, column):
284285
tmp_table = []
@@ -359,6 +360,13 @@ def predict_junction(self, previous_column_offset, current_column_offset):
359360
else:
360361
return 'OR'
361362

363+
def uniquify(self, list):
364+
already = []
365+
for element in list:
366+
if element not in already:
367+
already.append(element)
368+
return already
369+
362370
def run(self):
363371
number_of_where_columns = 0
364372
columns_of_where = []
@@ -379,13 +387,18 @@ def run(self):
379387

380388
for phrase in self.phrases:
381389
for i in range(0, len(phrase)):
382-
for table in self.database_dico:
383-
if phrase[i] in self.database_dico[table]:
384-
number_of_where_columns += 1
385-
columns_of_where.append(phrase[i])
386-
offset_of[phrase[i]] = i
387-
column_offset.append(i)
388-
break
390+
for table_name in self.database_dico:
391+
columns = self.database_object.get_table_by_name(table_name).get_columns()
392+
for column in columns:
393+
if (phrase[i] == column.get_name()) or (phrase[i] in column.get_equivalences()):
394+
number_of_where_columns += 1
395+
columns_of_where.append(column.get_name())
396+
offset_of[phrase[i]] = i
397+
column_offset.append(i)
398+
break
399+
else:
400+
continue
401+
break
389402

390403
phrase_keyword = str(phrase[i]).lower() # for robust keyword matching
391404

@@ -416,6 +429,8 @@ def run(self):
416429
if phrase_keyword in self.like_keywords: # after the column
417430
self.like_keyword_offset.append(i)
418431

432+
print(self.columns_of_values_of_where)
433+
print(columns_of_where)
419434

420435
for table_of_from in self.tables_of_from:
421436
where_object = Where()
@@ -437,7 +452,7 @@ def run(self):
437452
operation_type = self.predict_operation_type(previous, current)
438453

439454
if len(self.columns_of_values_of_where) > i:
440-
value = self.columns_of_values_of_where[i]
455+
value = self.columns_of_values_of_where[len(self.columns_of_values_of_where) - len(columns_of_where) + i]
441456
else:
442457
value = 'OOV' # Out Of Vocabulary: default value
443458

@@ -452,12 +467,13 @@ def join(self):
452467

453468
class GroupByParser(Thread):
454469

455-
def __init__(self, phrases, tables_of_from, database_dico):
470+
def __init__(self, phrases, tables_of_from, database_dico, database_object):
456471
Thread.__init__(self)
457472
self.group_by_objects = []
458473
self.phrases = phrases
459474
self.tables_of_from = tables_of_from
460475
self.database_dico = database_dico
476+
self.database_object = database_object
461477

462478
def get_tables_of_column(self, column):
463479
tmp_table = []
@@ -479,11 +495,12 @@ def run(self):
479495
group_by_object = GroupBy()
480496
for phrase in self.phrases:
481497
for i in range(0, len(phrase)):
482-
for table in self.database_dico:
483-
if phrase[i] in self.database_dico[table]:
484-
column = self.get_column_name_with_alias_table(
485-
phrase[i], table_of_from)
486-
group_by_object.set_column(column)
498+
for table_name in self.database_dico:
499+
columns = self.database_object.get_table_by_name(table_name).get_columns()
500+
for column in columns:
501+
if (phrase[i] == column.get_name()) or (phrase[i] in column.get_equivalences()):
502+
column_with_alias = self.get_column_name_with_alias_table(column.get_name(), table_of_from)
503+
group_by_object.set_column(column_with_alias)
487504
self.group_by_objects.append(group_by_object)
488505

489506
def join(self):
@@ -493,14 +510,15 @@ def join(self):
493510

494511
class OrderByParser(Thread):
495512

496-
def __init__(self, phrases, tables_of_from, asc_keywords, desc_keywords, database_dico):
513+
def __init__(self, phrases, tables_of_from, asc_keywords, desc_keywords, database_dico, database_object):
497514
Thread.__init__(self)
498515
self.order_by_objects = []
499516
self.phrases = phrases
500517
self.tables_of_from = tables_of_from
501518
self.asc_keywords = asc_keywords
502519
self.desc_keywords = desc_keywords
503520
self.database_dico = database_dico
521+
self.database_object = database_object
504522

505523
def get_tables_of_column(self, column):
506524
tmp_table = []
@@ -531,10 +549,12 @@ def run(self):
531549
order_by_object = OrderBy()
532550
for phrase in self.phrases:
533551
for i in range(0, len(phrase)):
534-
for table in self.database_dico:
535-
if phrase[i] in self.database_dico[table]:
536-
column = self.get_column_name_with_alias_table(phrase[i], table_of_from)
537-
order_by_object.add_column(column, self.predict_order(phrase))
552+
for table_name in self.database_dico:
553+
columns = self.database_object.get_table_by_name(table_name).get_columns()
554+
for column in columns:
555+
if (phrase[i] == column.get_name()) or (phrase[i] in column.get_equivalences()):
556+
column_with_alias = self.get_column_name_with_alias_table(column.get_name(), table_of_from)
557+
order_by_object.add_column(column_with_alias, self.predict_order(phrase))
538558
self.order_by_objects.append(order_by_object)
539559

540560
def join(self):
@@ -624,27 +644,35 @@ def parse_sentence(self, sentence, stopwordsFilter=None):
624644
med_phrase = ''
625645
end_phrase = ''
626646

647+
''' @todo merge this part of the algorithm (detection of values of where) in the rest of the parsing algorithm (about line 725) '''
648+
627649
for i in range(0, len(input_word_list)):
628-
if input_word_list[i] in self.database_dico:
629-
if number_of_table_temp == 0:
630-
start_phrase = input_word_list[:i]
631-
number_of_table_temp += 1
632-
last_table_position_temp = i
633-
for table in self.database_dico:
634-
if input_word_list[i] in self.database_dico[table]:
635-
if number_of_where_column_temp == 0:
636-
med_phrase = input_word_list[
637-
len(start_phrase):last_table_position_temp + 1]
638-
number_of_where_column_temp += 1
639-
break
650+
for table_name in self.database_dico:
651+
if (input_word_list[i] == table_name) or (input_word_list[i] in self.database_object.get_table_by_name(table_name).get_equivalences()):
652+
if number_of_table_temp == 0:
653+
start_phrase = input_word_list[:i]
654+
number_of_table_temp += 1
655+
last_table_position_temp = i
656+
657+
columns = self.database_object.get_table_by_name(table_name).get_columns()
658+
for column in columns:
659+
if (input_word_list[i] == column.get_name()) or (input_word_list[i] in column.get_equivalences()):
660+
if number_of_where_column_temp == 0:
661+
med_phrase = input_word_list[len(start_phrase):last_table_position_temp + 1]
662+
number_of_where_column_temp += 1
663+
break
664+
else:
665+
if (number_of_table_temp != 0) and (number_of_where_column_temp == 0) and (i == (len(input_word_list) - 1)):
666+
med_phrase = input_word_list[len(start_phrase):]
640667
else:
641-
if (number_of_table_temp != 0) and (number_of_where_column_temp == 0) and (i == (len(input_word_list) - 1)):
642-
med_phrase = input_word_list[len(start_phrase):]
668+
continue
669+
break
643670

644671
end_phrase = input_word_list[len(start_phrase) + len(med_phrase):]
672+
645673
irext = ' '.join(end_phrase)
646674

647-
''' @todo set this part of the algorithm (detection of values of where) in the part of the phrases where parsing '''
675+
''' @todo set this part of the algorithm (detection of values of where) in the WhereParser thread '''
648676

649677
if irext:
650678
irext = self.remove_accents(irext.lower())
@@ -698,6 +726,8 @@ def parse_sentence(self, sentence, stopwordsFilter=None):
698726
# replace back <_> to spaces from the values assigned
699727
columns_of_values_of_where.append(str("'" + str(irext_list[index]).replace('<_>', ' ') + "'"))
700728

729+
''' ----------------------------------------------------------------------------------------------------------- '''
730+
701731
tables_of_from = []
702732
select_phrase = ''
703733
from_phrase = ''
@@ -819,9 +849,9 @@ def parse_sentence(self, sentence, stopwordsFilter=None):
819849
try:
820850
select_parser = SelectParser(columns_of_select, tables_of_from, select_phrase, self.count_keywords, self.sum_keywords, self.average_keywords, self.max_keywords, self.min_keywords, self.distinct_keywords, self.database_dico, self.database_object)
821851
from_parser = FromParser(tables_of_from, columns_of_select, columns_of_where, self.database_object)
822-
where_parser = WhereParser(new_where_phrase, tables_of_from, columns_of_values_of_where, self.count_keywords, self.sum_keywords, self.average_keywords, self.max_keywords, self.min_keywords, self.greater_keywords, self.less_keywords, self.between_keywords, self.negation_keywords, self.junction_keywords, self.disjunction_keywords, self.like_keywords, self.distinct_keywords, self.database_dico)
823-
group_by_parser = GroupByParser(group_by_phrase, tables_of_from, self.database_dico)
824-
order_by_parser = OrderByParser(order_by_phrase, tables_of_from, self.asc_keywords, self.desc_keywords, self.database_dico)
852+
where_parser = WhereParser(new_where_phrase, tables_of_from, columns_of_values_of_where, self.count_keywords, self.sum_keywords, self.average_keywords, self.max_keywords, self.min_keywords, self.greater_keywords, self.less_keywords, self.between_keywords, self.negation_keywords, self.junction_keywords, self.disjunction_keywords, self.like_keywords, self.distinct_keywords, self.database_dico, self.database_object)
853+
group_by_parser = GroupByParser(group_by_phrase, tables_of_from, self.database_dico, self.database_object)
854+
order_by_parser = OrderByParser(order_by_phrase, tables_of_from, self.asc_keywords, self.desc_keywords, self.database_dico, self.database_object)
825855

826856
select_parser.start()
827857
from_parser.start()

lang/english.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,6 @@ ASC: ascending, increasing
1313
DESC: descending, decreasing, inverse, reverse, opposite
1414
GROUP: group, grouped
1515
NEGATION: not, no
16-
EQUAL: is, equal, equals, equal to, equals to
16+
EQUAL: is, equal, equals, equal to, equals to, are
1717
LIKE: like, likes
1818
DISTINCT: distinct, different, distinctive, distinctly

lang/french.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,6 @@ ASC: ascendant, ascendante, croissant
1313
DESC: descendant, descendante, décroissant, inverse, inversé, inversée
1414
GROUP: groupe, groupé, rangé
1515
NEGATION: ne, pas, aucun
16-
EQUAL: est, égal, égal à
16+
EQUAL: est, égal, égal à, sont
1717
LIKE: comme
1818
DISTINCT: distinct, distincte, distincts, distinctes, distinctive, distinctement, distinctivement

test_unit.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,12 @@ def test_main(self):
180180
'database': './database/city.sql',
181181
'language': './lang/english.csv',
182182
'output': "SELECT DISTINCT emp.name FROM city INNER JOIN emp ON city.id = emp.cityId WHERE emp.score = '9';"
183+
},
184+
{
185+
'input': "Compte les nom des élève dont les nom sont BELLE",
186+
'database': './database/ecole.sql',
187+
'language': './lang/french.csv',
188+
'output': "SELECT COUNT(eleve.nom) FROM eleve WHERE eleve.nom = 'belle';"
183189
}
184190
]
185191

@@ -252,6 +258,30 @@ def test_main(self):
252258
'language': './lang/french.csv',
253259
'thesaurus': 'thesaurus/th_french.dat',
254260
'output': "SELECT classe.salle FROM classe;"
261+
},
262+
{
263+
'input': "Compte les dénomination des étudiant dont les dénomination sont BELLE",
264+
'database': './database/ecole.sql',
265+
'language': './lang/french.csv',
266+
'thesaurus': 'thesaurus/th_french.dat',
267+
'output': "SELECT COUNT(eleve.nom) FROM eleve WHERE eleve.nom = 'belle';"
268+
},
269+
{
270+
'input': "Compte les dénomination des étudiant dont les dénomination sont BELLE et l'ancienneté est 25",
271+
'database': './database/ecole.sql',
272+
'language': './lang/french.csv',
273+
'thesaurus': 'thesaurus/th_french.dat',
274+
'output': "SELECT COUNT(eleve.nom) FROM eleve WHERE eleve.nom = 'belle' AND eleve.age = '25';"
275+
}
276+
]
277+
278+
thesaurusTest2 = [
279+
{
280+
'input': "Quel est le cours où la pièce est B45",
281+
'database': './database/ecole.sql',
282+
'language': './lang/french.csv',
283+
'thesaurus': 'thesaurus/th_french.dat',
284+
'output': "SELECT * FROM classe WHERE classe.salle = 'b45';"
255285
}
256286
]
257287

0 commit comments

Comments
 (0)