RTX/code/autocomplete/rtxcomplete.py at master · RTXteam/RTX · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import sqlite3
import re
import timeit
import sys
import os
def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs)

pathlist = os.path.realpath(__file__).split(os.path.sep)
RTXindex = pathlist.index("RTX")
sys.path.append(os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code']))
from RTXConfiguration import RTXConfiguration

RTXConfig = RTXConfiguration()
autocomplete_filepath = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'autocomplete'])


conn = None
cursor = None
cache_conn = None
cache_cursor = None


def load():
    global conn
    global cursor
    global cache_conn
    global cache_cursor
    database_name = f"{autocomplete_filepath}{os.path.sep}{RTXConfig.autocomplete_path.split('/')[-1]}"
    conn = sqlite3.connect(database_name)
    cursor = conn.cursor()
    try:
        conn.execute(f"SELECT term FROM terms LIMIT 1")
        print(f"INFO: Connected to {database_name}",file=sys.stderr)
    except:
        print(f"WARN: Could NOT connect to {database_name}. Please check that file and database exist!",file=sys.stderr)

    cache_database_name = os.path.dirname(os.path.abspath(__file__)) + '/rtxcomplete_cache.sqlite'
    cache_conn = sqlite3.connect(cache_database_name)
    cache_cursor = cache_conn.cursor()
    print(f"INFO: Connected to {cache_database_name}",file=sys.stderr)
    cache_cursor.execute("CREATE TABLE IF NOT EXISTS cached_fragments(fragment VARCHAR(1024))")
    cache_cursor.execute("CREATE TABLE IF NOT EXISTS cached_fragment_terms(fragment_id VARCHAR(1024), term VARCHAR(1024))")

    return True


def get_nodes_like(word,requested_limit):

    debug = False

    t0 = timeit.default_timer()
    requested_limit = int(requested_limit)

    values = []
    n_values = 0

    if len(word) < 2:
        return values

    #### Try to avoid SQL injection exploits by sanitizing input #1823
    word = word.replace('"','')

    floor = word[:-1]
    ceiling = floor + 'zz'

    #### Get a list of matching node names that begin with these letters
    if debug:
        eprint(f"INFO: Query 1")
    #cursor.execute("SELECT term FROM term WHERE term LIKE \"%s%%\" ORDER BY length(term),term LIMIT %s" % (word,1000))
    cursor.execute(f"SELECT term FROM terms WHERE term > \"{floor}\" AND term < \"{ceiling}\" AND term LIKE \"{word}%%\" ORDER BY length(term),term LIMIT {requested_limit}")
    rows = cursor.fetchall()
    values_dict = {}
    for row in rows:
        term = row[0]
        if term.upper() not in values_dict:
            if debug:
                eprint(f"    - {term}")
            properties = { "curie": '??', "name": term, "type": '??' }
            values.append(properties)
            values_dict[term.upper()] = 1
            n_values += 1
            if n_values >= requested_limit:
                break
    t1 = timeit.default_timer()
    if debug:
        eprint(f"INFO: Query 1 in {t1-t0} sec")

    #### If we haven't reached the limit yet, add a list of matching terms that contain this string
    if n_values < requested_limit:
        if debug:
            eprint(f"INFO: Query 2")

        #### See if there is a cached entry already
        word_part = word
        found_fragment = None
        while len(word_part) > 2:
            cache_cursor.execute(f"SELECT rowid, fragment FROM cached_fragments WHERE fragment == \"{word_part}\"")
            rows = cache_cursor.fetchall()
            if len(rows) > 0:
                fragment_id = rows[0][0]
                found_fragment = rows[0][1]
                break
            word_part = word_part[:-1]

        if found_fragment:
            if debug:
                eprint(f"Found matching fragment {found_fragment} as fragment_id {fragment_id}")

            cache_cursor.execute(f"SELECT term FROM cached_fragment_terms WHERE fragment_id = {fragment_id} AND term LIKE \"%%{word}%%\"")
            rows = cache_cursor.fetchall()

            for row in rows:
                term = row[0]
                if term.upper() not in values_dict:

                    if n_values < requested_limit:
                        if debug:
                            print(f"    - {term}")
                        properties = { "curie": '??', "name": term, "type": '??' }
                        values.append(properties)
                        n_values += 1


        if found_fragment is None:

            #### Cache this fragment in the database
            try:
                cache_cursor.execute("INSERT INTO cached_fragments(fragment) VALUES(?)", (word,))
                fragment_id = cache_cursor.lastrowid
            except:
                eprint(f"ERROR: Unable to INSERT into cached_fragments(fragment)",file=sys.stderr)
                fragment_id = 0
            if debug:
                eprint(f"fragment_id = {fragment_id}")

            #### Execute an expensive LIKE query
            cursor.execute("SELECT term FROM terms WHERE term LIKE \"%%%s%%\" ORDER BY length(term),term LIMIT %s" % (word,10000))
            rows = cursor.fetchall()

            for row in rows:
                term = row[0]
                if term.upper() not in values_dict:

                    if n_values < requested_limit:
                        if debug:
                            eprint(f"    - {term}")
                        properties = { "curie": '??', "name": term, "type": '??' }
                        values.append(properties)
                        n_values += 1

                    values_dict[term.upper()] = 1
                    cache_cursor.execute("INSERT INTO cached_fragment_terms(fragment_id, term) VALUES(?,?)", (fragment_id, term,))
            cache_conn.commit()

        t2 = timeit.default_timer()
        if debug:
            eprint(f"INFO: Query 2 in {t2-t1} sec")


    return(values)