splitgraph-llm-demo/query.py at main · splitgraph/splitgraph-llm-demo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# from: https://python.langchain.com/en/latest/modules/chains/examples/sqlite.html
import sys
import os

from langchain_demo.repo_info import get_repo_list
from langchain_demo.repo_search import RepoSearcher
from langchain_demo.splitgraph_db import get_splitgraph_db
from langchain_demo.splitgraph_db_chain import invoke_chain
import pprint

# # Step 1: Select potentially useful repositories based on query
# Start by searching for available repos based on the query.
# This search uses pg_vector to compare the query expression's OpenAI
# embedding to the pre-calculated embeddings stored in PostgreSQL.
# Example query: 'How many hospitals are there in Chicago?'
# RepoSearcher expects a collection name, specified when indexing repos,
# which doesn't have to be a Splitgraph namespace. Here the collection is
# 'repo_embeddings'; the Splitgraph namespace whose repositories are listed
# further down is passed on the command line (e.g. 'cityofchicago', the
# namespace used for this demo).
collection_name = "repo_embeddings"
# Both positional arguments are required. Exit with a usage message instead
# of a bare IndexError traceback when they are missing; any extra arguments
# are ignored, as before.
if len(sys.argv) < 3:
    sys.exit("Usage: python query.py <namespace> <query>")
namespace = sys.argv[1]
query = sys.argv[2]
repo_searcher = RepoSearcher(collection_name, os.environ["PG_CONN_STR_LOCAL"])
qualified_repo_names = repo_searcher.find_repos(query)
# Note: assuming there isn't a collision in repo names. This is true for this
# demo, as all repos come from a single namespace ('cityofchicago'), but will
# break when multiple namespaces are used.
# The set is built once so the membership test below is a constant-time
# lookup; its elements are compared against (namespace, repository) tuples.
repo_name_set = set(qualified_repo_names)
print("Identified the following repositories which may be relevant to the question")
pprint.pprint(repo_name_set)
# Now that we have the potentially relevant repos, we can use the graphql
# API to get the table schemas for each repository.
# Only repositories of the requested namespace that the search identified
# are kept.
relevant_repos = [
    r
    for r in get_repo_list(namespace)
    if (r["namespace"], r["repository"]) in repo_name_set
]

# # Step 2: Use GPT to create an SQL statement using any of the tables in the
# selected repositories, then execute the resulting query on the DDN.
# The DDN connection string is read from the environment and handed, together
# with the selected repositories, to the langchain SQLAlchemy wrapper adapted
# to the DDN.
ddn_conn_str = os.environ["PG_CONN_STR_DDN"]
db = get_splitgraph_db(ddn_conn_str, relevant_repos)

# Run the chain on the user's question, then report both the SQL the LLM
# generated and the rows that came back.
result = invoke_chain(db, query)
print(f"SQL query generated by LLM: {result.sql_query}")
print("SQL resultset returned by the DDN:")
pprint.pprint(result.result)