Skip to content

Commit 9e52c37

Browse files
committed
Text-to-SQL: Help agents turn natural language into SQL queries
DataQuery is the little sister of Google's QueryData product.
1 parent 408ec74 commit 9e52c37

11 files changed

Lines changed: 461 additions & 25 deletions

File tree

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
- Kinesis: Added `ctk kinesis` CLI group with `list-checkpoints` and
66
`prune-checkpoints` commands for checkpoint table maintenance
77
- Dependencies: Permitted installation of click 8.3
8+
- DataQuery: Help agents turn natural language into SQL queries
89

910
## 2026/03/16 v0.0.46
1011
- I/O: API improvements: `ctk {load,save} table` became `ctk {load,save}`

cratedb_toolkit/query/cli.py

Lines changed: 4 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,9 @@
1-
import logging
2-
3-
import click
4-
from click_aliases import ClickAliasedGroup
5-
6-
from ..util.cli import boot_click
1+
from ..util.app import make_cli
72
from .convert.cli import convert_query
3+
from .llm.cli import llm_cli
84
from .mcp.cli import cli as mcp_cli
95

10-
logger = logging.getLogger(__name__)
11-
12-
13-
@click.group(cls=ClickAliasedGroup)
14-
@click.option("--verbose", is_flag=True, required=False, help="Turn on logging")
15-
@click.option("--debug", is_flag=True, required=False, help="Turn on logging with debug level")
16-
@click.version_option()
17-
@click.pass_context
18-
def cli(ctx: click.Context, verbose: bool, debug: bool):
19-
"""
20-
Query utilities.
21-
"""
22-
return boot_click(ctx, verbose, debug)
23-
24-
6+
cli = make_cli()
257
cli.add_command(convert_query, name="convert")
8+
cli.add_command(llm_cli, name="llm")
269
cli.add_command(mcp_cli, name="mcp")

cratedb_toolkit/query/llm/__init__.py

Whitespace-only changes.

cratedb_toolkit/query/llm/api.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
"""
2+
Use an LLM to query a database in human language via NLSQLTableQueryEngine.
3+
Example code using LlamaIndex with vanilla OpenAI, Azure OpenAI, or Ollama.
4+
"""
5+
6+
import dataclasses
7+
import logging
8+
from typing import Optional
9+
10+
from cratedb_toolkit.query.llm.model import DatabaseInfo, ModelInfo
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
try:
16+
from llama_index.core.base.embeddings.base import BaseEmbedding
17+
from llama_index.core.base.response.schema import RESPONSE_TYPE
18+
from llama_index.core.llms import LLM
19+
from llama_index.core.query_engine import NLSQLTableQueryEngine
20+
from llama_index.core.utilities.sql_wrapper import SQLDatabase
21+
except ImportError:
22+
pass
23+
24+
25+
@dataclasses.dataclass
class DataQuery:
    """
    DataQuery helps agents turn natural language into SQL queries.
    It's the little sister of Google's QueryData product. [1]

    We recommend evaluating the Text-to-SQL interface using the Gemma models if you are
    looking at non-frontier variants that need less resources for inference. However,
    depending on the complexity of your problem, you may also want to use cutting-edge
    models with your provider of choice at the cost of higher resource usage.

    Attention: Any natural language SQL table query engine and Text-to-SQL application
    should be aware that executing arbitrary SQL queries can be a security risk.
    It is recommended to take precautions as needed, such as using restricted roles,
    read-only databases, sandboxing, etc.

    Unless a ready-made `query_engine` is passed to the constructor, one is
    created from `db` and `model` right after initialization.

    References:

    [1] https://cloud.google.com/blog/products/databases/introducing-querydata-for-near-100-percent-accurate-data-agents
    [2] https://github.com/kupp0/multi-db-property-search-data-agents
    """

    # Database engine, schema, and table filters to expose to the query engine.
    db: DatabaseInfo
    # LLM provider and model selection.
    model: ModelInfo
    # Natural-language SQL query engine; created by `setup()` when not supplied.
    query_engine: Optional["NLSQLTableQueryEngine"] = None

    def __post_init__(self):
        """Create the query engine, unless the caller already supplied one."""
        # Running `setup()` unconditionally would discard an injected engine
        # and configure an LLM that is never used.
        if self.query_engine is None:
            self.setup()

    def setup(self):
        """
        Configure the LLM and (re-)create the query engine.

        Any engine currently stored in `query_engine` is replaced.
        """
        logger.info("Connecting to CrateDB")

        # Configure model.
        logger.info("Configuring LLM model")
        llm: LLM
        embed_model: BaseEmbedding
        # Imported here rather than at module level, so that importing this
        # module does not immediately pull in the LLM integration packages.
        from cratedb_toolkit.query.llm.util import configure_llm

        llm, embed_model = configure_llm(self.model)

        # Configure query engine.
        logger.info("Creating query engine")
        sql_database = SQLDatabase(
            self.db.engine,
            schema=self.db.schema,
            ignore_tables=self.db.ignore_tables,
            include_tables=self.db.include_tables,
        )
        self.query_engine = NLSQLTableQueryEngine(
            sql_database=sql_database,
            llm=llm,
            embed_model=embed_model,
        )

    def ask(self, question: str) -> "RESPONSE_TYPE":
        """
        Invoke an inquiry to the LLM.

        :param question: The question in natural language.
        :return: The response object produced by the query engine.
        :raises ValueError: If no query engine is configured.
        """
        if self.query_engine is None:
            raise ValueError("Query engine not configured")
        logger.debug("Running query: %s", question)
        return self.query_engine.query(question)

cratedb_toolkit/query/llm/cli.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import logging
2+
import os
3+
from typing import Optional
4+
5+
import click
6+
from dotenv import load_dotenv
7+
8+
from cratedb_toolkit import DatabaseCluster
9+
from cratedb_toolkit.query.llm.api import DataQuery
10+
from cratedb_toolkit.query.llm.model import DatabaseInfo, ModelInfo, ModelProvider
11+
from cratedb_toolkit.util.common import setup_logging
12+
13+
logger = logging.getLogger(__name__)
14+
15+
16+
# Carrier for long-form help text: the function has no body besides its
# docstring, and calling it returns None.
# NOTE(review): nothing in the visible part of this module references
# `help_llm` -- confirm it is wired into the command's help output.
def help_llm() -> None:
    """
    Use an LLM to query the database in human language.

    Synopsis
    ========

    export CRATEDB_CLUSTER_URL=crate://localhost/
    ctk query llm "What is the average value for sensor 1?"

    """  # noqa: E501
27+
28+
29+
@click.command()
@click.argument("question")
@click.option("--schema", type=str, required=False, help="Schema where to operate on")
@click.option("--llm-provider", type=str, required=False, help="LLM provider name")
@click.option("--llm-endpoint", type=str, required=False, help="LLM endpoint URL")
@click.option("--llm-name", type=str, required=False, help="LLM model name for completions")
@click.option("--llm-embedding-name", type=str, required=False, help="LLM model name for embeddings")
@click.option("--llm-api-key", type=str, required=False, help="LLM API key")
@click.option("--llm-api-version", type=str, required=False, help="LLM API version")
@click.pass_context
def llm_cli(
    ctx: click.Context,
    question: str,
    schema: Optional[str],
    llm_provider: Optional[str],
    llm_endpoint: Optional[str],
    llm_name: Optional[str],
    llm_embedding_name: Optional[str],
    llm_api_key: Optional[str],
    llm_api_version: Optional[str],
):
    """
    Use an LLM to query a database in human language.
    """
    # The docstring above is the command's `--help` text, so implementation
    # notes live in comments instead.
    #
    # Flow: resolve options, validate the provider, obtain the database engine
    # from the address stored in `ctx.meta["address"]`, run the question
    # through `DataQuery`, log the outcome, and return the response object.
    setup_logging()
    load_dotenv()

    # Command-line options take precedence; environment variables (read after
    # `load_dotenv()` has run) are the fallback.
    schema = schema or os.getenv("CRATEDB_SCHEMA") or "doc"
    llm_provider = llm_provider or os.getenv("LLM_PROVIDER")
    llm_endpoint = llm_endpoint or os.getenv("LLM_ENDPOINT")
    llm_name = llm_name or os.getenv("LLM_NAME")
    llm_embedding_name = llm_embedding_name or os.getenv("LLM_EMBEDDING_NAME")
    llm_api_key = llm_api_key or os.getenv("LLM_API_KEY")
    # Same fallback as the sibling options, which was missing for the API version.
    llm_api_version = llm_api_version or os.getenv("LLM_API_VERSION")
    if not llm_provider:
        raise click.UsageError("LLM provider name is required")

    # Validate the provider before touching the database, and report an
    # unknown name as a usage error instead of a raw `ValueError` traceback.
    try:
        provider = ModelProvider(llm_provider)
    except ValueError as ex:
        choices = ", ".join(item.value for item in ModelProvider)
        raise click.UsageError(f"Unknown LLM provider '{llm_provider}'. Choose one of: {choices}") from ex

    # Connect to database and configure LLM.
    dc = DatabaseCluster.from_options(ctx.meta["address"])
    engine = dc.adapter.engine

    # Submit query.
    dq = DataQuery(
        db=DatabaseInfo(
            engine=engine,
            schema=schema,
        ),
        model=ModelInfo.from_options(
            provider=provider,
            llm_name=llm_name,
            llm_embedding_name=llm_embedding_name,
            llm_endpoint=llm_endpoint,
            llm_api_key=llm_api_key,
            llm_api_version=llm_api_version,
        ),
    )

    logger.info("Selected LLM: completion=%s, embedding=%s", dq.model.completion, dq.model.embedding)

    response = dq.ask(question)

    logger.info("Query was: %s", question)
    logger.info("Answer was: %s", response)
    logger.info("More (metadata, formatted sources):")
    logger.info(response.get_formatted_sources())
    logger.info(response.metadata)
    return response

cratedb_toolkit/query/llm/model.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import dataclasses
2+
import os
3+
from enum import Enum
4+
from typing import List, Optional
5+
6+
import sqlalchemy as sa
7+
8+
9+
class ModelProvider(Enum):
    """Supported LLM providers, each identified by a lowercase string value."""

    # Vanilla OpenAI.
    OPENAI = "openai"
    # OpenAI models served through Azure.
    AZURE = "azure"
    # Models served through Ollama.
    OLLAMA = "ollama"
15+
16+
17+
@dataclasses.dataclass
class ModelInfo:
    """
    Information about the model.

    Bundles the provider, the completion and embedding model names, and the
    connection details needed to reach the provider.
    """

    # Which LLM provider to talk to.
    provider: ModelProvider
    # Model name used for completions.
    completion: str
    # Model name used for embeddings.
    embedding: str
    # Provider endpoint URL, if any.
    endpoint: Optional[str] = None
    # Passed as `engine` to the Azure OpenAI client by `configure_llm`.
    # NOTE(review): presumably the Azure deployment name -- confirm.
    instance: Optional[str] = None
    # API key, if the provider needs one.
    api_key: Optional[str] = None
    # API version, if the provider needs one.
    api_version: Optional[str] = None

    @classmethod
    def from_options(
        cls,
        provider: ModelProvider,
        llm_endpoint: Optional[str],
        llm_name: Optional[str],
        llm_embedding_name: Optional[str],
        llm_api_key: Optional[str],
        llm_api_version: Optional[str],
        llm_instance: Optional[str] = None,
    ) -> "ModelInfo":
        """
        Read options and apply parameter sanity checks and heuristics.

        Missing model names fall back to per-provider defaults. For OpenAI
        and Azure, a missing API key falls back to the `OPENAI_API_KEY`
        environment variable.

        :param provider: The LLM provider.
        :param llm_endpoint: Provider endpoint URL.
        :param llm_name: Completion model name; a default is used if empty.
        :param llm_embedding_name: Embedding model name; a default is used if empty.
        :param llm_api_key: API key.
        :param llm_api_version: API version.
        :param llm_instance: Value for the `instance` field. Optional and last
            in the signature, so existing callers keep working unchanged.
        :raises ValueError: If no model name is given and the provider has no default.
        """
        completion_defaults = {
            ModelProvider.OPENAI: "gpt-4.1",
            ModelProvider.AZURE: "gpt-4.1",
            ModelProvider.OLLAMA: "gemma3:1b",
        }
        embedding_defaults = {
            # Canonical embedding models with Open AI: text-embedding-ada-002, text-embedding-3-large
            ModelProvider.OPENAI: "text-embedding-ada-002",
            ModelProvider.AZURE: "text-embedding-ada-002",
            # Popular embedding models with Ollama: nomic-embed-text, embeddinggemma, mxbai-embed-large
            ModelProvider.OLLAMA: "nomic-embed-text",
        }

        if not llm_name:
            llm_name = completion_defaults.get(provider)
            if not llm_name:
                raise ValueError("LLM completion model not selected")
        if not llm_embedding_name:
            llm_embedding_name = embedding_defaults.get(provider)
            if not llm_embedding_name:
                raise ValueError("LLM embedding model not selected")
        if not llm_api_key and provider in (ModelProvider.OPENAI, ModelProvider.AZURE):
            llm_api_key = os.getenv("OPENAI_API_KEY")

        return cls(
            provider=provider,
            endpoint=llm_endpoint,
            instance=llm_instance,
            completion=llm_name,
            embedding=llm_embedding_name,
            api_key=llm_api_key,
            api_version=llm_api_version,
        )
67+
68+
69+
@dataclasses.dataclass
class DatabaseInfo:
    """Database engine plus schema and table selection for the query engine."""

    # SQLAlchemy engine used to access the database.
    engine: sa.engine.Engine
    # Schema to operate on; optional.
    schema: Optional[str] = None
    # Names of tables to leave out; optional.
    ignore_tables: Optional[List[str]] = None
    # Names of tables to include; optional.
    include_tables: Optional[List[str]] = None

cratedb_toolkit/query/llm/util.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# ty: ignore[unresolved-import]
2+
from typing import Tuple
3+
4+
import llama_index.core
5+
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
6+
from llama_index.core.base.embeddings.base import BaseEmbedding
7+
from llama_index.core.llms import LLM
8+
from llama_index.embeddings.langchain import LangchainEmbedding
9+
from llama_index.embeddings.ollama import OllamaEmbedding
10+
from llama_index.llms.azure_openai import AzureOpenAI
11+
from llama_index.llms.ollama import Ollama
12+
from llama_index.llms.openai import OpenAI
13+
14+
from cratedb_toolkit.query.llm.model import ModelInfo, ModelProvider
15+
16+
17+
def configure_llm(info: ModelInfo, debug: bool = False) -> Tuple[LLM, BaseEmbedding]:
    """
    Configure LLM access and model types. Use either vanilla Open AI, Azure Open AI, or Ollama.

    TODO: What about Hugging Face, Runpod, vLLM, and others?

    Notes about text embedding models:

    > The new model, `text-embedding-ada-002`, replaces five separate models for text search,
    > text similarity, and code search, and outperforms our previous most capable model,
    > Davinci, at most tasks, while being priced 99.8% lower.

    - https://openai.com/index/new-and-improved-embedding-model/
    - https://community.openai.com/t/models-embedding-vs-similarity-vs-search-models/291265

    :param info: Provider, model names, and connection details.
    :param debug: When true, install the "simple" global LlamaIndex handler.
    :return: Tuple of completion model and embedding model.
    :raises ValueError: If the provider or completion model name is missing,
        or the provider is not supported.
    """

    completion_model = info.completion

    if not info.provider:
        raise ValueError("LLM model type not defined")
    if not completion_model:
        raise ValueError("LLM model name not defined")

    # https://docs.llamaindex.ai/en/stable/understanding/tracing_and_debugging/tracing_and_debugging/
    if debug:
        llama_index.core.set_global_handler("simple")

    llm: LLM
    embed_model: BaseEmbedding

    # Select completions and embeddings model per provider. Handling both in
    # one branch guarantees that every supported provider yields both.
    if info.provider is ModelProvider.OPENAI:
        llm = OpenAI(
            model=completion_model,
            temperature=0.0,
            api_key=info.api_key,
            api_version=info.api_version,
        )
        embed_model = LangchainEmbedding(
            OpenAIEmbeddings(
                model=info.embedding,
                api_key=info.api_key,  # ty: ignore[unknown-argument]
            )
        )
    elif info.provider is ModelProvider.AZURE:
        llm = AzureOpenAI(
            model=completion_model,
            temperature=0.0,
            engine=info.instance,
            azure_endpoint=info.endpoint,
            api_key=info.api_key,
            api_version=info.api_version,
        )
        embed_model = LangchainEmbedding(
            AzureOpenAIEmbeddings(
                azure_endpoint=info.endpoint,
                model=info.embedding,
                api_key=info.api_key,  # ty: ignore[unknown-argument]
                api_version=info.api_version,  # ty: ignore[unknown-argument]
            )
        )
    elif info.provider is ModelProvider.OLLAMA:
        # Both Ollama clients fall back to the same local default endpoint.
        ollama_url = info.endpoint or "http://localhost:11434"
        # https://docs.llamaindex.ai/en/stable/api_reference/llms/ollama/
        llm = Ollama(
            base_url=ollama_url,
            model=completion_model,
            temperature=0.0,
            request_timeout=120.0,
            keep_alive=-1,
        )
        # https://pypi.org/project/llama-index-embeddings-ollama/
        # https://developers.llamaindex.ai/python/framework/integrations/embeddings/ollama_embedding/
        # https://developers.llamaindex.ai/typescript/framework/modules/models/embeddings/
        # Popular embedding models with Ollama: nomic-embed-text, embeddinggemma, mxbai-embed-large
        embed_model = OllamaEmbedding(
            model_name=info.embedding,
            base_url=ollama_url,
        )
    else:
        # Exceptions do not apply %-style formatting to their arguments, so
        # the provider is interpolated explicitly.
        raise ValueError(f"LLM model type invalid: {info.provider}")

    return llm, embed_model

0 commit comments

Comments
 (0)