diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..700da4b1 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,30 @@ +# Repository Guidelines + +## Project Structure & Module Organization +- `rust/lance-graph/` hosts the Rust Cypher engine; keep new modules under `src/` and co-locate helpers inside `query/` or feature-specific submodules. +- `python/src/` contains the PyO3 bridge; `python/python/lance_graph/` holds the pure-Python facade and packaging metadata. +- `python/python/tests/` stores functional tests; mirror new features with targeted cases here and in the corresponding Rust module. +- `examples/` demonstrates Cypher usage; update or add examples when introducing new public APIs. + +## Build, Test, and Development Commands +- `cargo check` / `cargo test --all` (run inside `rust/lance-graph`) validate Rust code paths. +- `cargo bench --bench graph_execution` measures performance-critical changes; include shortened runs with `--warm-up-time 1`. +- `uv venv --python 3.11 .venv` and `uv pip install -e '.[tests]'` bootstrap the Python workspace. +- `maturin develop` rebuilds the extension after Rust edits; `pytest python/python/tests/ -v` exercises Python bindings. +- `make lint` (in `python/`) runs `ruff`, formatting checks, and `pyright`. + +## Coding Style & Naming Conventions +- Format Rust with `cargo fmt --all`; keep modules and functions snake_case, types PascalCase, and reuse `snafu` error patterns. +- Run `cargo clippy --all-targets --all-features` to catch lint regressions. +- Use 4-space indentation in Python; maintain snake_case modules, CamelCase classes, and type-annotated public APIs. +- Apply `ruff format python/` before committing; `ruff check` and `pyright` enforce import hygiene and typing. + +## Testing Guidelines +- Add Rust unit tests alongside implementations via `#[cfg(test)]`; prefer focused scenarios over broad integration. 
+- Python tests belong in `python/python/tests/`; name files `test_*.py` and use markers (`gpu`, `cuda`, `integration`, `slow`) consistently. +- When touching performance-sensitive code, capture representative `cargo bench` or large-table pytest timing notes in the PR. + +## Commit & Pull Request Guidelines +- Follow the existing history style (`feat(graph):`, `docs:`, `refactor(query):`), using imperative, ≤72-character subjects. +- Reference issues or discussions when relevant and include brief context in the body. +- PRs should describe scope, list test commands run, mention benchmark deltas when applicable, and highlight impacts on bindings or examples. diff --git a/python/pyproject.toml b/python/pyproject.toml index 29914cc8..07946f91 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -28,6 +28,7 @@ classifiers = [ [tool.maturin] python-source = "python" +python-packages = ["lance_graph", "knowledge_graph"] [build-system] requires = ["maturin>=1.4"] @@ -37,6 +38,9 @@ build-backend = "maturin" tests = ["pytest", "pyarrow>=14", "pandas"] dev = ["ruff", "pyright"] +[project.scripts] +knowledge_graph = "knowledge_graph.main:main" + [tool.ruff] lint.select = ["F", "E", "W", "I", "G", "TCH", "PERF", "B019"] diff --git a/python/python/knowledge_graph/__init__.py b/python/python/knowledge_graph/__init__.py new file mode 100644 index 00000000..374e0d7b --- /dev/null +++ b/python/python/knowledge_graph/__init__.py @@ -0,0 +1,103 @@ +"""High-level helpers for working with Lance-backed knowledge graphs.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Dict, Mapping, Optional + +import pyarrow as pa +from lance_graph import CypherQuery, GraphConfig + +try: # Prefer to import for typing without raising at runtime. + from lance_graph import GraphConfigBuilder +except ImportError: # pragma: no cover - builder is available in normal installs. 
# Alias for the name -> Arrow table mappings accepted throughout this module.
TableMapping = Mapping[str, pa.Table]


def _ensure_table(name: str, table: pa.Table) -> pa.Table:
    """Return ``table`` unchanged after checking it is a ``pyarrow.Table``.

    Args:
        name: Dataset name, used only to build the error message.
        table: Candidate object to validate.

    Raises:
        TypeError: If ``table`` is not a ``pyarrow.Table`` instance.
    """
    if not isinstance(table, pa.Table):
        raise TypeError(
            f"Dataset '{name}' must be a pyarrow.Table (got {type(table)!r})"
        )
    return table


# NOTE(review): ``frozen=True`` with the default ``eq=True`` makes dataclass
# generate a field-based ``__hash__``; because ``_tables`` is a dict,
# ``hash(instance)`` raises TypeError. Left as-is to keep equality semantics.
@dataclass(frozen=True)
class KnowledgeGraph:
    """Wraps a ``GraphConfig`` alongside the Arrow tables backing it."""

    # Graph configuration attached to every query issued through ``run``.
    config: GraphConfig
    # Validated datasets keyed by name; copied on the way in and out.
    _tables: Dict[str, pa.Table]

    def __init__(self, config: GraphConfig, datasets: TableMapping) -> None:
        """Store ``config`` and a validated copy of ``datasets``.

        Raises:
            TypeError: If any value in ``datasets`` is not a ``pyarrow.Table``.
        """
        # The dataclass is frozen, so normal attribute assignment is blocked;
        # object.__setattr__ is the way to initialise fields here.
        object.__setattr__(self, "config", config)
        normalized = {
            name: _ensure_table(name, table) for name, table in datasets.items()
        }
        object.__setattr__(self, "_tables", normalized)

    def run(
        self,
        statement: str,
        *,
        datasets: Optional[TableMapping] = None,
    ):
        """Execute a Cypher statement, overriding tables when provided.

        Args:
            statement: Query text handed to ``CypherQuery``.
            datasets: Optional per-call tables; entries replace registered
                tables of the same name for this call only.

        Returns:
            Whatever ``CypherQuery.execute`` returns for the merged datasets.

        Raises:
            TypeError: If any override is not a ``pyarrow.Table``.
        """
        query = CypherQuery(statement).with_config(self.config)
        # Work on a copy so per-call overrides never leak into the instance.
        sources: Dict[str, pa.Table] = dict(self._tables)
        if datasets:
            sources.update(
                {name: _ensure_table(name, table) for name, table in datasets.items()}
            )
        return query.execute(sources)

    def tables(self) -> Dict[str, pa.Table]:
        """Return a shallow copy of the registered datasets."""
        return dict(self._tables)


class KnowledgeGraphBuilder:
    """Collects nodes, relationships, and datasets before building a graph."""

    def __init__(self) -> None:
        """Start from an empty ``GraphConfig`` builder and no datasets."""
        builder = GraphConfig.builder()
        self._builder: GraphConfigBuilder = builder  # type: ignore[annotation-unchecked]
        self._datasets: Dict[str, pa.Table] = {}

    def with_node(
        self,
        label: str,
        primary_key: str,
        table: pa.Table,
    ) -> KnowledgeGraphBuilder:
        """Register a node label and Arrow table.

        Raises:
            TypeError: If ``table`` is not a ``pyarrow.Table``; in that case
                neither the config builder nor the datasets are modified.
        """
        # Validate before touching the config builder so a bad table cannot
        # leave a label registered without a backing dataset.
        checked = _ensure_table(label, table)
        self._builder = self._builder.with_node_label(label, primary_key)
        self._datasets[label] = checked
        return self

    def with_relationship(
        self,
        name: str,
        source_key: str,
        target_key: str,
        table: pa.Table,
    ) -> KnowledgeGraphBuilder:
        """Register a relationship and its underlying table.

        Raises:
            TypeError: If ``table`` is not a ``pyarrow.Table``; in that case
                neither the config builder nor the datasets are modified.
        """
        # Same ordering as ``with_node``: validate first, then mutate.
        checked = _ensure_table(name, table)
        self._builder = self._builder.with_relationship(name, source_key, target_key)
        self._datasets[name] = checked
        return self

    def with_dataset(self, name: str, table: pa.Table) -> KnowledgeGraphBuilder:
        """Attach arbitrary supporting datasets (e.g., reference tables).

        Raises:
            TypeError: If ``table`` is not a ``pyarrow.Table``.
        """
        self._datasets[name] = _ensure_table(name, table)
        return self

    def build(self) -> KnowledgeGraph:
        """Materialize the ``KnowledgeGraph`` instance.

        ``KnowledgeGraph.__init__`` copies the dataset mapping, so later
        builder calls do not alter an already-built graph.
        """
        config = self._builder.build()
        return KnowledgeGraph(config, self._datasets)


__all__ = ["KnowledgeGraph", "KnowledgeGraphBuilder"]
def init_graph() -> None:
    """Initialize storage for the knowledge graph (not yet implemented)."""


def run_interactive() -> None:
    """Enter an interactive shell for issuing commands (not yet implemented)."""


def execute_query(text: str) -> None:
    """Execute a single knowledge graph query (not yet implemented)."""
    del text  # placeholder until implementation


def preview_extraction(path: Path) -> None:
    """Preview extracted knowledge from a text source (not yet implemented)."""
    del path  # placeholder until implementation


def extract_and_add(path: Path) -> None:
    """Extract knowledge and append it to the backing graph (not yet implemented)."""
    del path  # placeholder until implementation


def ask_question(question: str) -> None:
    """Answer a natural-language question using the graph (not yet implemented)."""
    del question  # placeholder until implementation


def _build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser.

    ``--init``, ``--extract-preview``, ``--extract-and-add`` and ``--ask`` are
    mutually exclusive; the optional positional ``query`` sits outside that
    group, so its conflict with the flags is checked in ``main``.
    """
    parser = argparse.ArgumentParser(
        prog="knowledge_graph",
        description="Operate the Lance-backed knowledge graph.",
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--init",
        action="store_true",
        help="Initialize the knowledge graph storage.",
    )
    group.add_argument(
        "--extract-preview",
        metavar="PATH",
        help="Preview extracted entities and relations from a text file.",
    )
    group.add_argument(
        "--extract-and-add",
        metavar="PATH",
        help="Extract and insert knowledge from a text file.",
    )
    group.add_argument(
        "--ask",
        metavar="QUESTION",
        help="Ask a natural-language question over the knowledge graph.",
    )
    parser.add_argument(
        "query",
        nargs="?",
        help="Execute a single Cypher or semantic query.",
    )
    return parser


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Parse ``argv`` and dispatch to the matching command.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` after the selected command returns.

    Raises:
        SystemExit: Raised by argparse (exit status 2) on invalid usage,
            including a positional query combined with any flag.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # Presence is tested with ``is not None`` everywhere so that an explicitly
    # supplied empty value (e.g. ``--ask ""``) is still treated as "given"
    # instead of silently falling through to the interactive shell.
    flag_given = args.init or any(
        value is not None
        for value in (args.extract_preview, args.extract_and_add, args.ask)
    )
    if args.query is not None and flag_given:
        parser.error("Query argument cannot be combined with flags.")

    if args.init:
        init_graph()
    elif args.extract_preview is not None:
        preview_extraction(Path(args.extract_preview))
    elif args.extract_and_add is not None:
        extract_and_add(Path(args.extract_and_add))
    elif args.ask is not None:
        ask_question(args.ask)
    elif args.query is not None:
        execute_query(args.query)
    else:
        # No flag and no query: default to the interactive shell.
        run_interactive()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())