crate
diff --git a/CHANGES.md b/CHANGES.md
Lines changed: 1 addition & 0 deletions
diff --git a/cratedb_toolkit/query/cli.py b/cratedb_toolkit/query/cli.py
Lines changed: 4 additions & 21 deletions
diff --git a/cratedb_toolkit/query/llm/__init__.py b/cratedb_toolkit/query/llm/__init__.py
diff --git a/cratedb_toolkit/query/llm/api.py b/cratedb_toolkit/query/llm/api.py
Lines changed: 83 additions & 0 deletions
diff --git a/cratedb_toolkit/query/llm/cli.py b/cratedb_toolkit/query/llm/cli.py
Lines changed: 95 additions & 0 deletions
diff --git a/cratedb_toolkit/query/llm/model.py b/cratedb_toolkit/query/llm/model.py
Lines changed: 76 additions & 0 deletions
diff --git a/cratedb_toolkit/query/llm/util.py b/cratedb_toolkit/query/llm/util.py
Lines changed: 102 additions & 0 deletions
@@ -5,6 +5,7 @@
 - Kinesis: Added `ctk kinesis` CLI group with `list-checkpoints` and
   `prune-checkpoints` commands for checkpoint table maintenance
 - Dependencies: Permitted installation of click 8.3
+- QueryData: Help agents turn natural language into SQL queries
 
 ## 2026/03/16 v0.0.46
 - I/O: API improvements: `ctk {load,save} table` became `ctk {load,save}`
 
@@ -1,26 +1,9 @@
-import logging
-
-import click
-from click_aliases import ClickAliasedGroup
-
-from ..util.cli import boot_click
+from ..util.app import make_cli
 from .convert.cli import convert_query
+from .llm.cli import llm_cli
 from .mcp.cli import cli as mcp_cli
 
-logger = logging.getLogger(__name__)
-
-
-@click.group(cls=ClickAliasedGroup)
-@click.option("--verbose", is_flag=True, required=False, help="Turn on logging")
-@click.option("--debug", is_flag=True, required=False, help="Turn on logging with debug level")
-@click.version_option()
-@click.pass_context
-def cli(ctx: click.Context, verbose: bool, debug: bool):
-    """
-    Query utilities.
-    """
-    return boot_click(ctx, verbose, debug)
-
-
+# Build the command group for the query utilities and register its subcommands
+# under their public names. NOTE(review): `make_cli()` presumably returns a
+# click group, since `add_command` is called on it -- confirm in `util.app`.
+cli = make_cli()
 cli.add_command(convert_query, name="convert")
+cli.add_command(llm_cli, name="llm")
 cli.add_command(mcp_cli, name="mcp")
@@ -0,0 +1,83 @@
+"""
+Use an LLM to query a database in human language via NLSQLTableQueryEngine.
+Example code using LlamaIndex with vanilla OpenAI, Azure OpenAI, or Ollama.
+"""
+
+import dataclasses
+import logging
+from typing import Optional
+
+from cratedb_toolkit.query.llm.model import DatabaseInfo, ModelInfo
+
+logger = logging.getLogger(__name__)
+
+
+try:
+    from llama_index.core.base.embeddings.base import BaseEmbedding
+    from llama_index.core.base.response.schema import RESPONSE_TYPE
+    from llama_index.core.llms import LLM
+    from llama_index.core.query_engine import NLSQLTableQueryEngine
+    from llama_index.core.utilities.sql_wrapper import SQLDatabase
+except ImportError:
+    pass
+
+
+@dataclasses.dataclass
+class DataQuery:
+    """
+    DataQuery helps agents turn natural language into SQL queries.
+    It's the little sister of Google's QueryData product. [1]
+
+    On construction, a LlamaIndex `NLSQLTableQueryEngine` is built from the
+    given database and model settings. When the caller already supplies a
+    `query_engine`, it is used as-is and no engine is built.
+
+    We recommend evaluating the Text-to-SQL interface using the Gemma models if you are
+    looking at non-frontier variants that need less resources for inference. However,
+    depending on the complexity of your problem, you may also want to use cutting-edge
+    models with your provider of choice at the cost of higher resource usage.
+
+    Attention: Any natural language SQL table query engine and Text-to-SQL application
+    should be aware that executing arbitrary SQL queries can be a security risk.
+    It is recommended to take precautions as needed, such as using restricted roles,
+    read-only databases, sandboxing, etc.
+
+    [1] https://cloud.google.com/blog/products/databases/introducing-querydata-for-near-100-percent-accurate-data-agents
+    [2] https://github.com/kupp0/multi-db-property-search-data-agents
+    """
+
+    # Database engine plus schema and table filters the query engine operates on.
+    db: DatabaseInfo
+    # Provider, model names, and credentials for the completion and embedding models.
+    model: ModelInfo
+    # Engine answering the questions; built by `setup()` unless provided by the caller.
+    query_engine: Optional["NLSQLTableQueryEngine"] = None
+
+    def __post_init__(self) -> None:
+        # Respect an engine handed in through the constructor instead of
+        # discarding it; only build one when none was provided.
+        if self.query_engine is None:
+            self.setup()
+
+    def setup(self) -> None:
+        """
+        Configure database connection and query engine.
+
+        Creates the completion and embedding models via `configure_llm`, wraps
+        `self.db.engine` into a LlamaIndex `SQLDatabase` honoring the schema and
+        table filters, and stores the resulting `NLSQLTableQueryEngine` on
+        `self.query_engine`, replacing any previous one.
+        """
+        logger.info("Connecting to CrateDB")
+
+        # Configure model.
+        logger.info("Configuring LLM model")
+        llm: LLM
+        embed_model: BaseEmbedding
+        # Imported here rather than at module level, because `util` imports the
+        # LLM libraries unconditionally at its top.
+        from cratedb_toolkit.query.llm.util import configure_llm
+
+        llm, embed_model = configure_llm(self.model)
+
+        # Configure query engine.
+        logger.info("Creating query engine")
+        sql_database = SQLDatabase(
+            self.db.engine,
+            schema=self.db.schema,
+            ignore_tables=self.db.ignore_tables,
+            include_tables=self.db.include_tables,
+        )
+        self.query_engine = NLSQLTableQueryEngine(
+            sql_database=sql_database,
+            llm=llm,
+            embed_model=embed_model,
+        )
+
+    def ask(self, question: str) -> "RESPONSE_TYPE":
+        """
+        Invoke an inquiry to the LLM.
+
+        :param question: The question in natural language.
+        :return: The response object returned by the query engine.
+        :raises ValueError: When no query engine is configured.
+        """
+        if not self.query_engine:
+            raise ValueError("Query engine not configured")
+        logger.debug("Running query: %s", question)
+        return self.query_engine.query(question)
@@ -0,0 +1,95 @@
+import logging
+import os
+from typing import Optional
+
+import click
+from dotenv import load_dotenv
+
+from cratedb_toolkit import DatabaseCluster
+from cratedb_toolkit.query.llm.api import DataQuery
+from cratedb_toolkit.query.llm.model import DatabaseInfo, ModelInfo, ModelProvider
+from cratedb_toolkit.util.common import setup_logging
+
+logger = logging.getLogger(__name__)
+
+
+# Carrier for the long-form help text of `ctk query llm`: the function has no
+# body besides its docstring.
+# NOTE(review): nothing in this module references `help_llm` -- presumably it
+# was meant to feed the command's help output; confirm, wire it up, or remove.
+def help_llm():
+    """
+    Use an LLM to query the database in human language.
+
+    Synopsis
+    ========
+
+    export CRATEDB_CLUSTER_URL=crate://localhost/
+    ctk query llm "What is the average value for sensor 1?"
+
+    """  # noqa: E501
+
+
+# Command `ctk query llm`: answer a natural-language QUESTION about the database.
+#
+# Each option falls back to an environment variable when not given on the
+# command line: CRATEDB_SCHEMA (default "doc"), LLM_PROVIDER, LLM_ENDPOINT,
+# LLM_NAME, LLM_EMBEDDING_NAME, LLM_API_KEY, and LLM_API_VERSION. A `.env`
+# file is loaded beforehand. The database address is read from
+# `ctx.meta["address"]`, which must have been populated by the caller.
+#
+# Raises `click.UsageError` when the provider is missing or unknown.
+# Returns the response object produced by `DataQuery.ask`.
+@click.command()
+@click.argument("question")
+@click.option("--schema", type=str, required=False, help="Schema where to operate on")
+@click.option("--llm-provider", type=str, required=False, help="LLM provider name")
+@click.option("--llm-endpoint", type=str, required=False, help="LLM endpoint URL")
+@click.option("--llm-name", type=str, required=False, help="LLM model name for completions")
+@click.option("--llm-embedding-name", type=str, required=False, help="LLM model name for embeddings")
+@click.option("--llm-api-key", type=str, required=False, help="LLM API key")
+@click.option("--llm-api-version", type=str, required=False, help="LLM API version")
+@click.pass_context
+def llm_cli(
+    ctx: click.Context,
+    question: str,
+    schema: Optional[str],
+    llm_provider: Optional[str],
+    llm_endpoint: Optional[str],
+    llm_name: Optional[str],
+    llm_embedding_name: Optional[str],
+    llm_api_key: Optional[str],
+    llm_api_version: Optional[str],
+):
+    """
+    Use an LLM to query a database in human language.
+    """
+    setup_logging()
+    load_dotenv()
+
+    # Command-line options take precedence; environment variables are fallbacks.
+    schema = schema or os.getenv("CRATEDB_SCHEMA") or "doc"
+    llm_provider = llm_provider or os.getenv("LLM_PROVIDER")
+    llm_endpoint = llm_endpoint or os.getenv("LLM_ENDPOINT")
+    llm_name = llm_name or os.getenv("LLM_NAME")
+    llm_embedding_name = llm_embedding_name or os.getenv("LLM_EMBEDDING_NAME")
+    llm_api_key = llm_api_key or os.getenv("LLM_API_KEY")
+    # Same fallback scheme as for all other LLM options.
+    llm_api_version = llm_api_version or os.getenv("LLM_API_VERSION")
+    if not llm_provider:
+        raise click.UsageError("LLM provider name is required")
+
+    # Validate the provider before touching the database, and report an unknown
+    # name as a usage error instead of a raw `ValueError` traceback.
+    try:
+        provider = ModelProvider(llm_provider.lower())
+    except ValueError as ex:
+        choices = ", ".join(item.value for item in ModelProvider)
+        raise click.UsageError(f"Unknown LLM provider '{llm_provider}'. Choose one of: {choices}") from ex
+
+    # Connect to database and configure LLM.
+    dc = DatabaseCluster.from_options(ctx.meta["address"])
+    engine = dc.adapter.engine
+
+    # Submit query.
+    dq = DataQuery(
+        db=DatabaseInfo(
+            engine=engine,
+            schema=schema,
+        ),
+        model=ModelInfo.from_options(
+            provider=provider,
+            llm_name=llm_name,
+            llm_embedding_name=llm_embedding_name,
+            llm_endpoint=llm_endpoint,
+            llm_api_key=llm_api_key,
+            llm_api_version=llm_api_version,
+        ),
+    )
+
+    logger.info("Selected LLM: completion=%s, embedding=%s", dq.model.completion, dq.model.embedding)
+
+    response = dq.ask(question)
+
+    logger.info("Query was: %s", question)
+    logger.info("Answer was: %s", response)
+    logger.info("More (metadata, formatted sources):")
+    logger.info(response.get_formatted_sources())
+    logger.info(response.metadata)
+    return response
@@ -0,0 +1,76 @@
+import dataclasses
+import os
+from enum import Enum
+from typing import List, Optional
+
+import sqlalchemy as sa
+
+
+class ModelProvider(Enum):
+    """Model provider choices."""
+
+    # Member values are the lowercase provider names; a member is looked up
+    # by value, e.g. `ModelProvider("ollama")`.
+
+    # Vanilla OpenAI.
+    OPENAI = "openai"
+    # Azure OpenAI.
+    AZURE = "azure"
+    # Ollama.
+    OLLAMA = "ollama"
+
+
+# Completion model per provider, applied when the caller selects none.
+_DEFAULT_COMPLETION_MODELS = {
+    ModelProvider.OPENAI: "gpt-4.1",
+    ModelProvider.AZURE: "gpt-4.1",
+    ModelProvider.OLLAMA: "gemma3:1b",
+}
+
+# Embedding model per provider, applied when the caller selects none.
+# Canonical embedding models with OpenAI: text-embedding-ada-002, text-embedding-3-large
+# Popular embedding models with Ollama: nomic-embed-text, embeddinggemma, mxbai-embed-large
+_DEFAULT_EMBEDDING_MODELS = {
+    ModelProvider.OPENAI: "text-embedding-ada-002",
+    ModelProvider.AZURE: "text-embedding-ada-002",
+    ModelProvider.OLLAMA: "nomic-embed-text",
+}
+
+
+@dataclasses.dataclass
+class ModelInfo:
+    """Information about the model."""
+
+    # Which provider serves the models.
+    provider: ModelProvider
+    # Name of the model used for completions.
+    completion: str
+    # Name of the model used for embeddings.
+    embedding: str
+    # Endpoint URL of the provider, if any.
+    endpoint: Optional[str] = None
+    # Provider-specific instance name, if any.
+    instance: Optional[str] = None
+    # API key and API version, if the provider needs them.
+    api_key: Optional[str] = None
+    api_version: Optional[str] = None
+
+    @classmethod
+    def from_options(
+        cls,
+        provider: ModelProvider,
+        llm_endpoint: Optional[str],
+        llm_name: Optional[str],
+        llm_embedding_name: Optional[str],
+        llm_api_key: Optional[str],
+        llm_api_version: Optional[str],
+        llm_instance: Optional[str] = None,
+    ) -> "ModelInfo":
+        """
+        Read options and apply parameter sanity checks and heuristics.
+
+        Missing model names are filled in with per-provider defaults. For the
+        OpenAI and Azure providers, a missing API key is read from the
+        `OPENAI_API_KEY` environment variable.
+
+        :param provider: Provider serving the models.
+        :param llm_endpoint: Endpoint URL; stored unchanged.
+        :param llm_name: Completion model name, or empty to use the default.
+        :param llm_embedding_name: Embedding model name, or empty to use the default.
+        :param llm_api_key: API key, or empty to use the environment fallback.
+        :param llm_api_version: API version; stored unchanged.
+        :param llm_instance: Optional value for the `instance` field, which could
+            otherwise not be populated through this factory.
+        :return: The populated `ModelInfo`.
+        :raises ValueError: When no model name is given and the provider has no default.
+        """
+        if not llm_name:
+            llm_name = _DEFAULT_COMPLETION_MODELS.get(provider)
+            if not llm_name:
+                raise ValueError("LLM completion model not selected")
+        if not llm_embedding_name:
+            llm_embedding_name = _DEFAULT_EMBEDDING_MODELS.get(provider)
+            if not llm_embedding_name:
+                raise ValueError("LLM embedding model not selected")
+        if not llm_api_key and provider in (ModelProvider.OPENAI, ModelProvider.AZURE):
+            llm_api_key = os.getenv("OPENAI_API_KEY")
+        return cls(
+            provider=provider,
+            endpoint=llm_endpoint,
+            instance=llm_instance,
+            completion=llm_name,
+            embedding=llm_embedding_name,
+            api_key=llm_api_key,
+            api_version=llm_api_version,
+        )
+
+
+@dataclasses.dataclass
+class DatabaseInfo:
+    """Information about the database."""
+
+    # SQLAlchemy engine used to talk to the database.
+    engine: sa.engine.Engine
+    # Database schema to operate on; None leaves the choice to the consumer.
+    schema: Optional[str] = None
+    # Optional table filters.
+    # NOTE(review): presumably "exclude these" / "only these" tables -- confirm
+    # against the consumer of this class.
+    ignore_tables: Optional[List[str]] = None
+    include_tables: Optional[List[str]] = None
@@ -0,0 +1,102 @@
+# ty: ignore[unresolved-import]
+from typing import Tuple
+
+import llama_index.core
+from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
+from llama_index.core.base.embeddings.base import BaseEmbedding
+from llama_index.core.llms import LLM
+from llama_index.embeddings.langchain import LangchainEmbedding
+from llama_index.embeddings.ollama import OllamaEmbedding
+from llama_index.llms.azure_openai import AzureOpenAI
+from llama_index.llms.ollama import Ollama
+from llama_index.llms.openai import OpenAI
+
+from cratedb_toolkit.query.llm.model import ModelInfo, ModelProvider
+
+
+def configure_llm(info: ModelInfo, debug: bool = False) -> Tuple[LLM, BaseEmbedding]:
+    """
+    Configure LLM access and model types. Use either vanilla OpenAI, Azure OpenAI, or Ollama.
+
+    :param info: Provider, model names, endpoint, and credentials to use.
+    :param debug: When true, install LlamaIndex's "simple" global handler for tracing.
+    :return: Tuple of completion model and embedding model.
+    :raises ValueError: When the provider or the completion model name is missing,
+        or when the provider is not supported.
+
+    TODO: What about Hugging Face, Runpod, vLLM, and others?
+
+    Notes about text embedding models:
+
+    > The new model, `text-embedding-ada-002`, replaces five separate models for text search,
+    > text similarity, and code search, and outperforms our previous most capable model,
+    > Davinci, at most tasks, while being priced 99.8% lower.
+
+    - https://openai.com/index/new-and-improved-embedding-model/
+    - https://community.openai.com/t/models-embedding-vs-similarity-vs-search-models/291265
+    """
+
+    completion_model = info.completion
+
+    if not info.provider:
+        raise ValueError("LLM model type not defined")
+    if not completion_model:
+        raise ValueError("LLM model name not defined")
+
+    # https://docs.llamaindex.ai/en/stable/understanding/tracing_and_debugging/tracing_and_debugging/
+    if debug:
+        llama_index.core.set_global_handler("simple")
+
+    # Select completions and embeddings model per provider. Handling both in a
+    # single chain guarantees that an unsupported provider fails loudly and that
+    # the embedding model is never left undefined.
+    llm: LLM
+    embed_model: BaseEmbedding
+    if info.provider is ModelProvider.OPENAI:
+        llm = OpenAI(
+            model=completion_model,
+            temperature=0.0,
+            api_key=info.api_key,
+            api_version=info.api_version,
+        )
+        embed_model = LangchainEmbedding(
+            OpenAIEmbeddings(
+                model=info.embedding,
+                api_key=info.api_key,  # ty: ignore[unknown-argument]
+            )
+        )
+    elif info.provider is ModelProvider.AZURE:
+        llm = AzureOpenAI(
+            model=completion_model,
+            temperature=0.0,
+            # `engine` is the Azure deployment name. Fall back to the model name,
+            # a common naming convention, when no instance is configured.
+            engine=info.instance or completion_model,
+            azure_endpoint=info.endpoint,
+            api_key=info.api_key,
+            api_version=info.api_version,
+        )
+        embed_model = LangchainEmbedding(
+            AzureOpenAIEmbeddings(
+                azure_endpoint=info.endpoint,
+                model=info.embedding,
+                api_key=info.api_key,  # ty: ignore[unknown-argument]
+                api_version=info.api_version,  # ty: ignore[unknown-argument]
+            )
+        )
+    elif info.provider is ModelProvider.OLLAMA:
+        ollama_url = info.endpoint or "http://localhost:11434"
+        # https://docs.llamaindex.ai/en/stable/api_reference/llms/ollama/
+        llm = Ollama(
+            base_url=ollama_url,
+            model=completion_model,
+            temperature=0.0,
+            request_timeout=120.0,
+            keep_alive=-1,
+        )
+        # https://pypi.org/project/llama-index-embeddings-ollama/
+        # https://developers.llamaindex.ai/python/framework/integrations/embeddings/ollama_embedding/
+        # https://developers.llamaindex.ai/typescript/framework/modules/models/embeddings/
+        # Popular embedding models with Ollama: nomic-embed-text, embeddinggemma, mxbai-embed-large
+        embed_model = OllamaEmbedding(
+            model_name=info.embedding,
+            base_url=ollama_url,
+        )
+    else:
+        # Exceptions do not interpolate logging-style arguments; format explicitly.
+        raise ValueError(f"LLM model type invalid: {info.provider}")
+
+    return llm, embed_model