Skip to content

Commit 7daf8ff

Browse files
committed
Text-to-SQL: Help agents turn natural language into SQL queries
DataQuery is the little sister of Google's QueryData product.
1 parent 408ec74 commit 7daf8ff

11 files changed

Lines changed: 440 additions & 25 deletions

File tree

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
- Kinesis: Added `ctk kinesis` CLI group with `list-checkpoints` and
66
`prune-checkpoints` commands for checkpoint table maintenance
77
- Dependencies: Permitted installation of click 8.3
8+
- DataQuery: Help agents turn natural language into SQL queries
89

910
## 2026/03/16 v0.0.46
1011
- I/O: API improvements: `ctk {load,save} table` became `ctk {load,save}`

cratedb_toolkit/query/cli.py

Lines changed: 4 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,9 @@
from ..util.app import make_cli
from .convert.cli import convert_query
from .llm.cli import llm_cli
from .mcp.cli import cli as mcp_cli

# The "ctk query" command group, assembled from its subcommand modules.
cli = make_cli()
for command, label in (
    (convert_query, "convert"),
    (llm_cli, "llm"),
    (mcp_cli, "mcp"),
):
    cli.add_command(command, name=label)

cratedb_toolkit/query/llm/__init__.py

Whitespace-only changes.

cratedb_toolkit/query/llm/api.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
"""
2+
Use an LLM to query a database in human language via NLSQLTableQueryEngine.
3+
Example code using LlamaIndex with vanilla Open AI, Azure Open AI, or Ollama.
4+
"""
5+
6+
import dataclasses
7+
import logging
8+
from typing import Optional
9+
10+
from cratedb_toolkit.query.llm.model import DatabaseInfo, ModelInfo
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
try:
16+
from llama_index.core.base.embeddings.base import BaseEmbedding
17+
from llama_index.core.base.response.schema import RESPONSE_TYPE
18+
from llama_index.core.llms import LLM
19+
from llama_index.core.query_engine import NLSQLTableQueryEngine
20+
from llama_index.core.utilities.sql_wrapper import SQLDatabase
21+
except ImportError:
22+
pass
23+
24+
25+
@dataclasses.dataclass
class DataQuery:
    """
    DataQuery helps agents turn natural language into SQL queries.
    It's the little sister of Google's QueryData product. [1]

    We recommend evaluating the Text-to-SQL interface with the Gemma models when
    looking at non-frontier variants that need fewer resources for inference.
    Depending on the complexity of your problem, you may instead prefer
    cutting-edge models with your provider of choice, at the cost of higher
    resource usage.

    Attention: Any natural language SQL table query engine and Text-to-SQL
    application should be aware that executing arbitrary SQL queries can be a
    security risk. Take precautions as needed, such as using restricted roles,
    read-only databases, sandboxing, etc.

    [1] https://cloud.google.com/blog/products/databases/introducing-querydata-for-near-100-percent-accurate-data-agents
    [2] https://github.com/kupp0/multi-db-property-search-data-agents
    """

    # Database connectivity and table selection.
    db: DatabaseInfo
    # LLM provider, model names, endpoint, credentials.
    model: ModelInfo
    # Populated by `setup()`; `None` until then.
    query_engine: Optional["NLSQLTableQueryEngine"] = None

    def __post_init__(self):
        self.setup()

    def setup(self):
        """Wire up database connection, LLM models, and the query engine."""
        # Imported lazily, so the module stays importable without the LLM extras.
        from cratedb_toolkit.query.llm.util import configure_llm

        # Configure database connection and query engine.
        logger.info("Connecting to CrateDB")

        # Configure completion and embedding models.
        logger.info("Configuring LLM model")
        llm: LLM
        embed_model: BaseEmbedding
        llm, embed_model = configure_llm(self.model)

        # Configure query engine.
        logger.info("Creating query engine")
        database = SQLDatabase(
            self.db.engine,
            schema=self.db.schema,
            ignore_tables=self.db.ignore_tables,
            include_tables=self.db.include_tables,
        )
        self.query_engine = NLSQLTableQueryEngine(
            sql_database=database,
            llm=llm,
            embed_model=embed_model,
        )

    def ask(self, question: str) -> "RESPONSE_TYPE":
        """Submit a natural-language inquiry to the LLM and return its response."""
        if not self.query_engine:
            raise ValueError("Query engine not configured")
        logger.debug("Running query: %s", question)
        return self.query_engine.query(question)

cratedb_toolkit/query/llm/cli.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import logging
2+
import os
3+
from typing import Optional
4+
5+
import click
6+
from dotenv import load_dotenv
7+
8+
from cratedb_toolkit import DatabaseCluster
9+
from cratedb_toolkit.query.llm.api import DataQuery
10+
from cratedb_toolkit.query.llm.model import DatabaseInfo, ModelInfo, ModelProvider
11+
from cratedb_toolkit.util.common import setup_logging
12+
13+
logger = logging.getLogger(__name__)
14+
15+
16+
def help_llm():
    """
    Use an LLM to query the database in human language.

    Synopsis
    ========

        export CRATEDB_CLUSTER_URL=crate://localhost/
        ctk query llm "What is the average value for sensor 1?"
    """  # noqa: E501
27+
28+
29+
# Provider-specific default model names, applied when no model was selected
# explicitly via CLI option or environment variable.
DEFAULT_COMPLETION_MODELS = {
    ModelProvider.OPENAI: "gpt-4.1",
    ModelProvider.AZURE: "gpt-4.1",
    ModelProvider.OLLAMA: "gemma3:1b",
}

# Canonical embedding models with Open AI: text-embedding-ada-002, text-embedding-3-large
# Popular embedding models with Ollama: nomic-embed-text, embeddinggemma, mxbai-embed-large
DEFAULT_EMBEDDING_MODELS = {
    ModelProvider.OPENAI: "text-embedding-ada-002",
    ModelProvider.AZURE: "text-embedding-ada-002",
    ModelProvider.OLLAMA: "nomic-embed-text",
}


@click.command()
@click.argument("question")
@click.option("--schema", type=str, required=False, help="Schema where to operate on")
@click.option("--llm-provider", type=str, required=False, help="LLM provider name")
@click.option("--llm-endpoint", type=str, required=False, help="LLM endpoint URL")
@click.option("--llm-name", type=str, required=False, help="LLM model name for completions")
@click.option("--llm-embedding-name", type=str, required=False, help="LLM model name for embeddings")
@click.option("--llm-api-key", type=str, required=False, help="LLM API key")
@click.pass_context
def llm_cli(
    ctx: click.Context,
    question: str,
    schema: Optional[str],
    llm_provider: Optional[str],
    llm_endpoint: Optional[str],
    llm_name: Optional[str],
    llm_embedding_name: Optional[str],
    llm_api_key: Optional[str],
):
    """
    Use an LLM to query a database in human language.

    Options fall back to environment variables (CRATEDB_SCHEMA, LLM_PROVIDER,
    LLM_ENDPOINT, LLM_NAME, LLM_EMBEDDING_NAME, LLM_API_KEY) when not given.
    Raises `click.UsageError` when no LLM provider or model can be determined.
    """
    setup_logging()
    load_dotenv()

    # Resolve options, falling back to environment variables.
    schema = schema or os.getenv("CRATEDB_SCHEMA") or "doc"
    llm_provider = llm_provider or os.getenv("LLM_PROVIDER")
    llm_endpoint = llm_endpoint or os.getenv("LLM_ENDPOINT")
    llm_name = llm_name or os.getenv("LLM_NAME")
    llm_embedding_name = llm_embedding_name or os.getenv("LLM_EMBEDDING_NAME")
    llm_api_key = llm_api_key or os.getenv("LLM_API_KEY")
    if not llm_provider:
        raise click.UsageError("LLM provider name is required")

    # Connect to database and configure LLM.
    dc = DatabaseCluster.from_options(ctx.meta["address"])
    engine = dc.adapter.engine
    provider = ModelProvider(llm_provider)

    # Parameter sanity checks and heuristics: fill in provider-specific defaults.
    # Consistency fix: raise `click.UsageError` (not a bare `ValueError`) for
    # missing selections, matching the provider check above.
    llm_name = llm_name or DEFAULT_COMPLETION_MODELS.get(provider)
    if not llm_name:
        raise click.UsageError("LLM completion model not selected")
    llm_embedding_name = llm_embedding_name or DEFAULT_EMBEDDING_MODELS.get(provider)
    if not llm_embedding_name:
        raise click.UsageError("LLM embedding model not selected")
    if not llm_api_key and provider in (ModelProvider.OPENAI, ModelProvider.AZURE):
        llm_api_key = os.getenv("OPENAI_API_KEY")

    logger.info("Selected LLM: completion=%s, embedding=%s", llm_name, llm_embedding_name)

    # Submit query.
    dq = DataQuery(
        db=DatabaseInfo(
            engine=engine,
            schema=schema,
        ),
        model=ModelInfo(
            provider=provider,
            completion=llm_name,
            embedding=llm_embedding_name,
            endpoint=llm_endpoint,
            api_key=llm_api_key,
        ),
    )
    response = dq.ask(question)

    logger.info("Query was: %s", question)
    logger.info("Answer was: %s", response)
    logger.info("More (metadata, formatted sources):")
    logger.info(response.get_formatted_sources())
    logger.info(response.metadata)
    return response
113+
114+
# assert "Answer was: The average value for sensor 1 is approximately 17.03." in out # noqa: ERA001

cratedb_toolkit/query/llm/model.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import dataclasses
2+
from enum import Enum
3+
from typing import List, Optional
4+
5+
import sqlalchemy as sa
6+
7+
8+
class ModelProvider(Enum):
    """Supported LLM provider backends."""

    # Vanilla OpenAI API.
    OPENAI = "openai"
    # Azure-hosted OpenAI deployments.
    AZURE = "azure"
    # Local inference through Ollama.
    OLLAMA = "ollama"
14+
15+
16+
@dataclasses.dataclass
class ModelInfo:
    """Describe which LLM models to use and how to reach the provider."""

    # Backend serving the models.
    provider: ModelProvider
    # Model name used for text completions.
    completion: str
    # Model name used for computing embeddings.
    embedding: str
    # API endpoint URL, when the provider needs one.
    endpoint: Optional[str] = None
    # Deployment/instance name — presumably Azure-specific; confirm with consumers.
    instance: Optional[str] = None
    # Credential for the provider API.
    api_key: Optional[str] = None
    # Provider API version string.
    api_version: Optional[str] = None
27+
28+
29+
@dataclasses.dataclass
class DatabaseInfo:
    """Describe the database connection and which tables to expose."""

    # SQLAlchemy engine bound to the target database.
    engine: sa.engine.Engine
    # Schema to operate in; `None` means the connection default.
    schema: Optional[str] = None
    # Tables to exclude from introspection.
    ignore_tables: Optional[List[str]] = None
    # Tables to restrict introspection to.
    include_tables: Optional[List[str]] = None

cratedb_toolkit/query/llm/util.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# ty: ignore[unresolved-import]
2+
from typing import Tuple
3+
4+
import llama_index.core
5+
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
6+
from llama_index.core.base.embeddings.base import BaseEmbedding
7+
from llama_index.core.llms import LLM
8+
from llama_index.embeddings.langchain import LangchainEmbedding
9+
from llama_index.embeddings.ollama import OllamaEmbedding
10+
from llama_index.llms.azure_openai import AzureOpenAI
11+
from llama_index.llms.ollama import Ollama
12+
from llama_index.llms.openai import OpenAI
13+
14+
from cratedb_toolkit.query.llm.model import ModelInfo, ModelProvider
15+
16+
17+
def configure_llm(info: ModelInfo, debug: bool = False) -> Tuple[LLM, BaseEmbedding]:
    """
    Configure LLM access and model types. Use either vanilla Open AI, Azure Open AI, or Ollama.

    :param info: Provider, model names, endpoint, and credentials.
    :param debug: When true, enable LlamaIndex's "simple" global trace handler.
    :return: Tuple of (completion LLM, embedding model).
    :raises ValueError: When the provider or completion model is missing or invalid.

    TODO: What about Hugging Face, Runpod, vLLM, and others?

    Notes about text embedding models:

    > The new model, `text-embedding-ada-002`, replaces five separate models for text search,
    > text similarity, and code search, and outperforms our previous most capable model,
    > Davinci, at most tasks, while being priced 99.8% lower.

    - https://openai.com/index/new-and-improved-embedding-model/
    - https://community.openai.com/t/models-embedding-vs-similarity-vs-search-models/291265
    """

    completion_model = info.completion

    if not info.provider:
        raise ValueError("LLM model type not defined")
    if not completion_model:
        raise ValueError("LLM model name not defined")

    # https://docs.llamaindex.ai/en/stable/understanding/tracing_and_debugging/tracing_and_debugging/
    if debug:
        llama_index.core.set_global_handler("simple")

    # Select completions model.
    if info.provider is ModelProvider.OPENAI:
        llm = OpenAI(
            model=completion_model,
            temperature=0.0,
            api_key=info.api_key,
            api_version=info.api_version,
        )
    elif info.provider is ModelProvider.AZURE:
        llm = AzureOpenAI(
            model=completion_model,
            temperature=0.0,
            engine=info.instance,
            azure_endpoint=info.endpoint,
            api_key=info.api_key,
            api_version=info.api_version,
        )
    elif info.provider is ModelProvider.OLLAMA:
        # https://docs.llamaindex.ai/en/stable/api_reference/llms/ollama/
        llm = Ollama(
            base_url=info.endpoint or "http://localhost:11434",
            model=completion_model,
            temperature=0.0,
            request_timeout=120.0,
            keep_alive=-1,
        )
    else:
        # Bug fix: the original passed a %-style format string plus a separate
        # argument to `ValueError`, so the message was never interpolated.
        raise ValueError(f"LLM model type invalid: {info.provider}")

    # Select embeddings model.
    if info.provider is ModelProvider.OPENAI:
        embed_model = LangchainEmbedding(
            OpenAIEmbeddings(
                model=info.embedding,
                api_key=info.api_key,  # ty: ignore[unknown-argument]
            )
        )
    elif info.provider is ModelProvider.AZURE:
        embed_model = LangchainEmbedding(
            AzureOpenAIEmbeddings(
                azure_endpoint=info.endpoint,
                model=info.embedding,
                api_key=info.api_key,  # ty: ignore[unknown-argument]
                api_version=info.api_version,  # ty: ignore[unknown-argument]
            )
        )
    # https://pypi.org/project/llama-index-embeddings-ollama/
    # https://developers.llamaindex.ai/python/framework/integrations/embeddings/ollama_embedding/
    # https://developers.llamaindex.ai/typescript/framework/modules/models/embeddings/
    # Popular embedding models with Ollama: nomic-embed-text, embeddinggemma, mxbai-embed-large
    elif info.provider is ModelProvider.OLLAMA:
        embed_model = OllamaEmbedding(
            model_name=info.embedding,
            base_url=info.endpoint or "http://localhost:11434",
        )
    else:
        # Unreachable in practice: the completion branch above already raised for
        # unknown providers. Raise here too, honoring the declared return type,
        # instead of silently returning `None` as the original did.
        raise ValueError(f"LLM embedding model type invalid: {info.provider}")

    return llm, embed_model

doc/query/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ expressions: Adapters, converters, migration support tasks, etc.
66
```{toctree}
77
:maxdepth: 2
88
9+
llm/index
910
mcp/index
1011
convert
1112
```

0 commit comments

Comments
 (0)