From b904506f84c5e58441a2e680b65ef1d9fedc5c66 Mon Sep 17 00:00:00 2001 From: Patrick Hall Date: Tue, 19 May 2026 08:18:43 -0400 Subject: [PATCH 1/2] =?UTF-8?q?feat(cypher):=20M0=20=E2=80=94=20catalog=20?= =?UTF-8?q?scaffold=20+=20plan=20docs=20for=20Cypher=20frontend?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bootstraps the Cypher frontend integration on a new feat/cypher-frontend branch: * docs/contributor_guide/cypher-frontend/000-080: complete spec set covering overview, architecture, catalog extensions, read+write operator mapping, expression/type bridge, diagnostics, milestones, and open questions (including dependencies on cyrs upstream). * sql/cypher_catalog.sql: six new catalog tables wiring Cypher labels and rel-types onto registered pgGraph tables/edges, plus uniqueness and multi-label-set declarations. * src/cypher_facade/: module skeleton with five new pg_extern registration functions (register_label, register_label_property, register_rel_type, register_unique, allow_label_set) and a cyrs_schema::SchemaProvider impl reading the catalog via SPI. * Cargo.toml: cyrs-schema path dep against ../../cyrs (pin scheme to be revisited per docs/.../080-open-questions.md Q-UP-10). Execution path (graph.cypher(text, jsonb)) lands in M1. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cypher-frontend/000-overview.md | 122 +++++ .../cypher-frontend/010-architecture.md | 145 ++++++ .../cypher-frontend/020-catalog-extensions.md | 233 +++++++++ .../cypher-frontend/030-read-mapping.md | 218 ++++++++ .../cypher-frontend/040-write-mapping.md | 241 +++++++++ .../cypher-frontend/050-expr-and-types.md | 181 +++++++ .../060-diagnostics-and-errors.md | 146 ++++++ .../070-milestones-and-tests.md | 204 ++++++++ .../cypher-frontend/080-open-questions.md | 181 +++++++ graph/Cargo.lock | 101 +++- graph/Cargo.toml | 9 + graph/sql/cypher_catalog.sql | 120 +++++ graph/src/cypher_facade/mod.rs | 46 ++ graph/src/cypher_facade/registration.rs | 486 ++++++++++++++++++ graph/src/cypher_facade/schema_provider.rs | 464 +++++++++++++++++ graph/src/lib.rs | 6 + 16 files changed, 2899 insertions(+), 4 deletions(-) create mode 100644 docs/contributor_guide/cypher-frontend/000-overview.md create mode 100644 docs/contributor_guide/cypher-frontend/010-architecture.md create mode 100644 docs/contributor_guide/cypher-frontend/020-catalog-extensions.md create mode 100644 docs/contributor_guide/cypher-frontend/030-read-mapping.md create mode 100644 docs/contributor_guide/cypher-frontend/040-write-mapping.md create mode 100644 docs/contributor_guide/cypher-frontend/050-expr-and-types.md create mode 100644 docs/contributor_guide/cypher-frontend/060-diagnostics-and-errors.md create mode 100644 docs/contributor_guide/cypher-frontend/070-milestones-and-tests.md create mode 100644 docs/contributor_guide/cypher-frontend/080-open-questions.md create mode 100644 graph/sql/cypher_catalog.sql create mode 100644 graph/src/cypher_facade/mod.rs create mode 100644 graph/src/cypher_facade/registration.rs create mode 100644 graph/src/cypher_facade/schema_provider.rs diff --git a/docs/contributor_guide/cypher-frontend/000-overview.md b/docs/contributor_guide/cypher-frontend/000-overview.md new file mode 100644 index 0000000..2ae605b --- /dev/null +++ b/docs/contributor_guide/cypher-frontend/000-overview.md @@ -0,0 +1,122 @@ +# Cypher frontend for pgGraph — overview + +> **Status:** Plan / spec. No code on this branch yet. +> **Branch:** `feat/cypher-frontend` +> **Upstream dependency:** [cyrs](https://github.com/) — Rust openCypher v9 / +> GQL frontend. Frontend-only (parser, HIR, plan IR, sema, diagnostics). +> No executor; pgGraph supplies the executor. +> **Companion document:** `../../../../cyrs/feat-request.md` — the asks we +> are making upstream of cyrs to make this fit. + +## What this is + +pgGraph today exposes graph queries as PostgreSQL SQL functions: + +```sql +SELECT * FROM graph.traverse( + 'public.people'::regclass, 'alice', + /*max_depth=*/ 2, /*edge_types=*/ ARRAY['KNOWS'], + /*direction=*/ 'outgoing', ... +); +``` + +This is the right interface for *the engine*, but it's a poor surface +for *users asking graph questions*. Multi-hop pattern queries with +predicates are exactly what Cypher is good at expressing. + +We will add a new SQL function `graph.cypher(text, jsonb)` that accepts +an openCypher v9 query string, parses it through the +[cyrs](https://github.com/) frontend, and dispatches each plan operator +to either pgGraph's existing in-memory engine (reads) or to SPI-issued +DML against the registered source tables (writes). + +## What this is NOT + +- It is **not** a replacement query language. SQL stays. The + `graph.cypher(...)` function is *additive*. +- It is **not** a "graph database mode" where the extension owns graph + storage. Your tables remain the source of truth (per pgGraph's + founding pitch). +- It is **not** a cost-based optimiser. cyrs produces logical plans; + pgGraph executes them. Anything resembling a planner is in cyrs or + in Postgres, never here. +- It is **not** scope creep for the extension. The extension already + uses SPI to write to user tables (catalog, sync, build). Cypher + writes use the same machinery against user-registered "label tables". + +## Why cyrs + +- Layered: we consume **HIR + Plan** (per `cyrs/docs/integration-depth.md` + decision table — "graph database → HIR + Plan"). Cheaper than building + our own parser; richer than consuming the agent JSON. +- Has dialect modes; the `OpenCypherV9` mode is exactly the surface + we want to expose. +- Diagnostics are first-class (`cyrs-diag`, codes `E0xxx` through + `E5xxx`), span-accurate, and can be projected through Postgres' + `ereport(ERROR, ...)`. +- The `WriteOp` set is complete for v9 (`CreateNode`, `CreateRel`, + `MergeNode`, `MergeRel`, `SetProperty`, `SetLabels`, + `RemoveProperty`, `RemoveLabels`, `Delete{detach}`). We do not need + to build a write-side IR ourselves. +- Frontend-only by design: no executor to fight with. + +## High-level architecture + +``` +┌──────────────────────────────────────────────────────────────────┐ +│ Postgres backend (one transaction) │ +│ │ +│ SELECT * FROM graph.cypher('MATCH ... RETURN ...', '{}'::jsonb)│ +│ │ │ +│ ┌──────────────────────────▼─────────────────────────────┐ │ +│ │ graph crate — new module: cypher_facade │ │ +│ │ │ │ +│ │ 1. parse + HIR-lower (cyrs_hir) │ │ +│ │ 2. sema (schema-aware) (cyrs_sema + our │ │ +│ │ SchemaProvider impl backed by pgGraph catalog) │ │ +│ │ 3. plan-lower (cyrs_plan) │ │ +│ │ 4. dispatch: │ │ +│ │ - ReadOp tree → engine + row evaluator │ │ +│ │ - WriteOp list → SPI DML on source tables │ │ +│ │ 5. materialise rows as JSONB → TableIterator │ │ +│ └─────────────────────────┬───────────────────────────────┘ │ +│ │ │ +│ ┌──────────────────────────▼─────────────────────────────┐ │ +│ │ Existing pgGraph machinery (unchanged): │ │ +│ │ • engine.rs / bfs.rs / path_finder.rs (reads) │ │ +│ │ • Spi::run_with_args(INSERT/UPDATE/DELETE) (writes) │ │ +│ │ • sync triggers pick up writes for index refresh │ │ +│ └────────────────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────────────────────┘ +``` + +## Documents in this directory + +- **000-overview.md** — this file. +- **010-architecture.md** — module layout, type contracts, where in + `graph/src/` each piece lands. +- **020-catalog-extensions.md** — new catalog tables, `SchemaProvider` + implementation, label↔table mapping, unique-constraint registration. +- **030-read-mapping.md** — `cyrs_plan::ReadOp` → pgGraph engine + call / SQL emission, operator by operator. +- **040-write-mapping.md** — `cyrs_plan::WriteOp` → SPI DML, operator + by operator, plus MERGE / DETACH DELETE semantics. +- **050-expr-and-types.md** — `cyrs_plan::Expr` evaluation + (push-to-SQL vs Rust-side), Cypher↔Postgres type bridge, null/3VL + alignment. +- **060-diagnostics-and-errors.md** — how cyrs diagnostics surface as + Postgres `ereport`, embedder-host diagnostic range, error UX. +- **070-milestones-and-tests.md** — milestone plan, openCypher TCK + subset wiring, integration test fixtures. +- **080-open-questions.md** — known unknowns to resolve before / during + implementation. Issues blocked on upstream cyrs work cite + `feat-request.md` sections. + +## Reading order + +If you're new: 000 → 010 → 070 → 080. (Architecture, then milestones +to know what we're cutting, then open questions to know what's not +settled.) + +If you're implementing: 020 (you need the catalog before anything +else) → 030 / 040 / 050 in parallel → 060 → 070. diff --git a/docs/contributor_guide/cypher-frontend/010-architecture.md b/docs/contributor_guide/cypher-frontend/010-architecture.md new file mode 100644 index 0000000..d38e994 --- /dev/null +++ b/docs/contributor_guide/cypher-frontend/010-architecture.md @@ -0,0 +1,145 @@ +# Architecture + +## Module layout (additions to `graph/src/`) + +``` +graph/src/ +├── lib.rs # add: pub mod cypher_facade; +│ # add: pg_extern fn cypher(text, jsonb) +├── cypher_facade/ +│ ├── mod.rs # entry: compile() → Plan; execute() → rows +│ ├── schema_provider.rs # impl cyrs_schema::SchemaProvider over catalog +│ ├── plan_translator/ +│ │ ├── mod.rs +│ │ ├── read.rs # ReadOp → engine calls / SQL +│ │ ├── write.rs # WriteOp → SPI DML +│ │ ├── expr.rs # cyrs_plan::Expr → SQL fragment OR Rust eval +│ │ ├── path.rs # MATCH p = ... path materialisation +│ │ └── shortest.rs # ShortestPath op → path_finder +│ ├── row_eval.rs # in-process row evaluator for Filter / +│ │ # Project / Aggregate / OrderBy / Skip / +│ │ # Limit / Distinct / Unwind when SQL can't +│ ├── param_bind.rs # JSONB params → cyrs param map + pg type +│ ├── diag_to_pg.rs # cyrs_diag::Diagnostic → ereport +│ └── tests/ +│ └── ... +├── catalog/ +│ ├── mod.rs # existing; add label/rel mapping read+write +│ ├── labels.rs # NEW: label↔table↔column mapping +│ └── unique.rs # NEW: registered uniqueness constraints +└── sql/ # NEW migrations for new catalog tables + └── cypher_catalog.sql +``` + +No changes to `engine.rs`, `bfs.rs`, `path_finder.rs`, `edge_store.rs`, +`node_store.rs`, `sync.rs`. The facade is strictly additive. + +## Cargo.toml additions + +```toml +[dependencies] +cyrs-hir = { version = "...", default-features = false } +cyrs-plan = { version = "...", default-features = false } +cyrs-schema = { version = "...", default-features = false } +cyrs-sema = { version = "..." } +cyrs-diag = { version = "..." } +smol_str = "0.3" # cyrs surfaces SmolStr; we'll see it in matches +``` + +Version pin: a single git tag or crates.io minor version. See +`080-open-questions.md` Q-PKG-1. + +## Public surface + +Exactly one new pgrx SQL function: + +```sql +-- Returns the row stream of a Cypher query. +-- result_jsonb is one row per RETURN row, columns flattened into a single JSONB object. +CREATE FUNCTION graph.cypher(query text, params jsonb DEFAULT '{}'::jsonb) + RETURNS TABLE (row jsonb) + LANGUAGE c STRICT VOLATILE; +``` + +`VOLATILE` because writes are allowed. A future `graph.cypher_read(...)` +companion declared `STABLE` is a possible optimisation (gates writes, +allows query-planner re-use) but is out of scope for v1. + +## Pipeline contract + +```rust +// cypher_facade/mod.rs (sketch) + +pub fn execute(query: &str, params: serde_json::Value) + -> Result, FacadeError> +{ + // 1. parse + HIR-lower. + let hir = cyrs_hir::lower::lower_statement(query)?; + + // 2. schema-aware sema. Schema = pgGraph catalog snapshot. + let schema = SchemaProvider::from_catalog(snapshot_catalog()?); + let diags = cyrs_sema::check(&hir, &schema); + if diags.iter().any(|d| d.severity == Severity::Error) { + return Err(FacadeError::SemaErrors(diags)); + } + + // 3. plan-lower. + let plan = cyrs_plan::lower::lower_statement(&hir)?; + + // 4. bind params (params.jsonb → cyrs param table, typed via 2.4 of feat-request). + let bound = param_bind::bind(&plan, params)?; + + // 5. execute read tree, applying writes per row. + let rows = plan_translator::execute(&plan, &bound, &schema)?; + + Ok(rows) +} +``` + +Steps 1–3 are pure functions; their result is cacheable on `(query, +catalog_fingerprint, schema_digest)`. The `catalog_fingerprint` +already exists (`catalog::catalog_fingerprint`). The `schema_digest` +comes from `cyrs_schema::SchemaProvider::schema_digest()`. We'll +share these for a per-backend statement cache in a later milestone. + +## Boundaries with the existing engine + +The facade calls into pgGraph's existing read path through a thin +adapter layer it owns. We don't expose engine internals back to cyrs. + +| Read op | Engine entry point we'll call | +| ----------------------------- | ---------------------------------------------- | +| `Source { label, bind }` | `sql_search::source_table_search_rows` or `Spi` table scan | +| `Expand { single }` | new helper over `engine::Engine::adjacent` (one hop) | +| `Expand { variable-length }` | `sql_traversal::execute_traverse_rows` + `TraverseRequest` | +| `ShortestPath` (cy-feat §1.1) | `path_finder` (the existing shortest-path module) | +| All other ops | `row_eval` (in-process), composing engine results | + +Write ops compose existing SPI helpers; the facade owns the SQL it +emits because pgGraph's current SPI users target the catalog/sync +path, not arbitrary user-table DML. + +## Threading and transactions + +- `graph.cypher(...)` is called inside a Postgres query, which is + inside a transaction. All SPI calls inherit that transaction. +- A whole Cypher statement therefore commits or rolls back atomically + with the rest of the user's transaction. No special savepoints + needed. +- The facade is single-threaded per invocation; we don't introduce + worker threads. pgGraph's background workers are unchanged. + +## Error model + +| Origin | Surfaces as | +| ---------------------------------------- | -------------------------------------------- | +| `cyrs_syntax` parse errors | `ereport(ERROR, ..., SQLSTATE 42601)` | +| `cyrs_sema` `Error` diagnostics | `ereport(ERROR, ..., SQLSTATE 42P10)` | +| `cyrs_plan` `PlanLowerError` | `ereport(ERROR, ..., SQLSTATE XX000)` | +| Embedder rejection (e.g. unmapped label) | `ereport(ERROR, ..., SQLSTATE 0A000)` | +| Underlying SPI error | bubble up the original SQLSTATE | +| `cyrs_sema` `Warning` / `Note` | `ereport(NOTICE / WARNING, ...)` per severity| + +cyrs diagnostic spans become Postgres `errposition()` offsets where +available — wraps the `HirId → byte span` accessor request (§4.2 of +`feat-request.md`). diff --git a/docs/contributor_guide/cypher-frontend/020-catalog-extensions.md b/docs/contributor_guide/cypher-frontend/020-catalog-extensions.md new file mode 100644 index 0000000..b1cfa25 --- /dev/null +++ b/docs/contributor_guide/cypher-frontend/020-catalog-extensions.md @@ -0,0 +1,233 @@ +# Catalog extensions + +cyrs is schema-aware via `SchemaProvider` (declared in +`cyrs-schema`). pgGraph already has a catalog of registered tables +and edges (`graph._registered_tables`, `graph._registered_edges`). To +serve Cypher we need a thin layer above this that maps Cypher +*labels* and *relationship types* onto Postgres tables, columns, and +unique constraints. + +The invariant: **the catalog is the source of truth for the +label↔storage mapping.** Sema never guesses. If a label has no +mapping, sema produces a diagnostic with a host-range code (see +`feat-request.md` §3.1) and the query is rejected before any DML is +emitted. + +## New catalog tables + +```sql +-- A Cypher label, and which Postgres table stores nodes carrying it. +-- A label maps to exactly one table. (Multi-label compositions are +-- handled below via _registered_label_overlay.) +CREATE TABLE graph._registered_labels ( + label text PRIMARY KEY, + table_name text NOT NULL REFERENCES graph._registered_tables(table_name), + discriminator_col text NULL, -- when the same table stores multiple labels, + -- this column carries the label name. + discriminator_val text NULL, -- the value of that column for this label. + CHECK ((discriminator_col IS NULL) = (discriminator_val IS NULL)) +); + +-- Cypher property → table column mapping. One row per (label, property). +-- Missing rows = "property not stored on this label" (sema emits a +-- diagnostic; reads return NULL). +CREATE TABLE graph._registered_label_properties ( + label text NOT NULL, + property text NOT NULL, + column_name text NOT NULL, + column_type text NOT NULL, -- pg type as text (e.g. 'text', 'int8', 'jsonb') + required bool NOT NULL DEFAULT false, + PRIMARY KEY (label, property), + FOREIGN KEY (label) REFERENCES graph._registered_labels(label) ON DELETE CASCADE +); + +-- Cypher relationship type → (edge storage) mapping. +-- pgGraph already has _registered_edges which encodes the FK shape; +-- this table just gives the Cypher rel-type a name and binds it to one +-- of those edges. One row per Cypher rel type. +CREATE TABLE graph._registered_rel_types ( + rel_type text PRIMARY KEY, + from_table text NOT NULL, + from_column text NOT NULL, + to_table text NOT NULL, + to_column text NOT NULL, + label text NOT NULL, -- matches _registered_edges.label + FOREIGN KEY (from_table, from_column, to_table, to_column, label) + REFERENCES graph._registered_edges + (from_table, from_column, to_table, to_column, label) +); + +-- Relationship-type property storage. Same idea as label_properties, +-- only meaningful when the edge has its own row (junction table). +CREATE TABLE graph._registered_rel_properties ( + rel_type text NOT NULL, + property text NOT NULL, + column_name text NOT NULL, + column_type text NOT NULL, + required bool NOT NULL DEFAULT false, + PRIMARY KEY (rel_type, property), + FOREIGN KEY (rel_type) REFERENCES graph._registered_rel_types(rel_type) + ON DELETE CASCADE +); + +-- Declared uniqueness — required for MERGE atomicity. +-- Each row is one uniqueness tuple. Multiple rows per label OK +-- (analogous to multiple unique indexes). +CREATE TABLE graph._registered_unique_props ( + kind text NOT NULL CHECK (kind IN ('label', 'rel_type')), + name text NOT NULL, -- label or rel_type + props text[] NOT NULL, -- ordered tuple of property names + PRIMARY KEY (kind, name, props) +); + +-- Declared multi-label compatibility (which label sets may co-exist +-- on a single node). Empty table = no multi-label nodes allowed. +CREATE TABLE graph._registered_label_sets ( + labels text[] PRIMARY KEY -- sorted, distinct +); +``` + +## New registration SQL functions + +```sql +-- Register a Cypher label against a registered table. +SELECT graph.register_label( + label := 'Person', + table_name := 'public.people', + discriminator_col := NULL, + discriminator_val := NULL +); + +-- Map a property onto a column. column_type is informational for +-- sema; it does not enforce a cast. +SELECT graph.register_label_property( + label := 'Person', + property := 'name', + column_name := 'full_name', + column_type := 'text', + required := true +); + +-- Register a Cypher rel type against an already-registered edge. +SELECT graph.register_rel_type( + rel_type := 'KNOWS', + from_table := 'public.people', + from_column := 'id', + to_table := 'public.people', + to_column := 'id', + label := 'KNOWS' +); + +-- Declare a uniqueness tuple. The function checks that a matching +-- Postgres UNIQUE / PRIMARY KEY constraint actually exists on the +-- underlying table; rejects otherwise. +SELECT graph.register_unique( + kind := 'label', + name := 'Person', + props := ARRAY['email'] +); + +-- Allow specific multi-label compositions. +SELECT graph.allow_label_set(ARRAY['Person', 'Customer']); +``` + +The check inside `graph.register_unique` is non-negotiable. It is +what lets us issue `INSERT ... ON CONFLICT (cols) DO UPDATE` for +`MERGE` and trust the atomicity guarantee. + +## `SchemaProvider` implementation + +```rust +// graph/src/cypher_facade/schema_provider.rs (sketch) + +pub struct PgGraphSchema { + snapshot: CatalogSnapshot, // taken once per cypher() invocation + digest: [u8; 32], // catalog_fingerprint() → digest input +} + +impl cyrs_schema::SchemaProvider for PgGraphSchema { + fn labels(&self) -> Vec { ... } + fn relationship_types(&self) -> Vec { ... } + + fn node_properties(&self, label: &str) -> Option> { + // _registered_label_properties → PropertyDecl with PropertyType + // mapped from column_type via the table below. + } + + fn relationship_properties(&self, rel_type: &str) -> Option> { ... } + + fn relationship_endpoints(&self, rel_type: &str) -> Vec { + // _registered_rel_types FROM/TO labels. + } + + fn function(&self, name: &str) -> Option { + // delegate to cyrs_schema::StandardLibrary; pgGraph adds no + // procs of its own in v1. + StandardLibrary::default().function(name) + } + + fn procedure(&self, name: &str) -> Option { + None // no CALL in v1 + } + + fn schema_digest(&self) -> [u8; 32] { self.digest } + + // From cyrs feat-request §2.2 — implement once cyrs ships them. + fn label_unique_props(&self, label: &str) -> Vec> { ... } + fn rel_type_unique_props(&self, rel_type: &str) -> Vec> { ... } + + // From cyrs feat-request §2.3. + fn labels_compatible(&self, labels: &[SmolStr]) -> Option { + // Look up the sorted-distinct slice in _registered_label_sets. + // Single-element slice is always compatible. + } +} +``` + +## `column_type` → `PropertyType` mapping + +| pg type | `cyrs_schema::PropertyType` | +| -------------------------------- | --------------------------- | +| `text`, `varchar`, `bpchar` | `String` | +| `int2`, `int4`, `int8` | `Int` | +| `float4`, `float8`, `numeric` | `Float` | +| `bool` | `Bool` | +| `date` | `Date` | +| `timestamp`, `timestamptz` | `Datetime` | +| `_text`, `_int4`, … | `List(...)` recursively | +| `jsonb`, `json` | `Any` (and we'll handle structurally) | +| anything else | `Opaque("")` | + +If the user wants typed handling of an opaque type, they can either +re-register the column with a closer-fitting type or extend this map +in a later milestone. + +## Snapshot consistency + +`schema_digest()` must change whenever any of the eight tables above +changes. The implementation: + +```rust +fn compute_digest(snapshot: &CatalogSnapshot) -> [u8; 32] { + let mut hasher = blake3::Hasher::new(); + snapshot.hash_into(&mut hasher); // deterministic field order + *hasher.finalize().as_bytes() +} +``` + +Combine with pgGraph's existing `catalog_fingerprint` (which already +covers the underlying `_registered_tables` / `_registered_edges`) so +a single key invalidates the per-statement-cache when either layer +changes. + +## What we leave to a later milestone + +- **Inferred label registration** — auto-mapping every registered + table to a label of the same name. Sugar; out of scope for v1. +- **Computed properties** — labels that expose a `prop` derived from + a SQL expression rather than a column. Out of scope. +- **Property aliases** — `register_label_property` with two + `property` rows pointing at the same column. Out of scope. +- **Per-tenant catalog overlays** — pgGraph supports tenant scoping; + v1 Cypher uses the tenant context set on the session, but the + label↔table mapping is global. Per-tenant overlays are a v2 ask. diff --git a/docs/contributor_guide/cypher-frontend/030-read-mapping.md b/docs/contributor_guide/cypher-frontend/030-read-mapping.md new file mode 100644 index 0000000..50e3682 --- /dev/null +++ b/docs/contributor_guide/cypher-frontend/030-read-mapping.md @@ -0,0 +1,218 @@ +# Read-side mapping: `cyrs_plan::ReadOp` → pgGraph execution + +Read operators are a tree. The facade walks the tree post-order, with +two emission strategies per node: + +- **SQL push** — emit a SQL fragment, accumulate it into a query the + pgGraph engine or a SPI call executes. Cheap, leverages Postgres' + planner. +- **Row eval** — materialise rows from the child, evaluate in Rust + using `row_eval.rs`. Required when an op references graph-shaped + values (paths, lists of nodes, maps) that don't have natural SQL + expressions, or when an `Expr` includes a function we haven't + classified as push-safe (see `050-expr-and-types.md`). + +The decision is per node, made bottom-up: if any descendant required +row eval, the parent does too (we don't reshape rows back into SQL). + +## Operator catalogue + +### `Source { label, bind }` + +**Translation:** look up `label` in `_registered_labels`. Two cases: + +1. *No discriminator:* `SELECT AS __id, FROM + `. +2. *With discriminator:* same, plus `WHERE = + `. + +The `bind` VarId records the resulting node's identity for downstream +ops. We materialise node identity as `(regclass, text)` to match +pgGraph's existing engine contract. + +**Tenant scoping:** if the registered table has a `tenant_column` and +the session has a tenant set (`pgGraph.tenant` GUC), append +`AND = current_setting('pgGraph.tenant')`. + +**`label = None` (all-node scan):** `UNION ALL` over every registered +table that has at least one label registered against it. Diagnose +"label-free Source with empty catalog" as a host-range error. + +### `Expand { from, rel, to, bind_rel, bind_to, input }` + +The single most important operator and the one that benefits most +from pgGraph's engine. + +#### Single-hop (`RelLength::Single`) + +- If `from` is already a single concrete node (resolved upstream), + call `engine::Engine::adjacent(from, rel.types, rel.direction)` + directly. +- If `from` is a stream of nodes, batch: collect into a + `Vec<(regclass, id)>`, then call a new + `engine::Engine::adjacent_batch(...)` (small wrapper around the + existing per-node call). +- `to.labels` and `to.properties` apply as a Filter on the resulting + rows. + +#### Variable-length (`RelLength::Variable { min, max }`) + +This is exactly `graph.traverse(...)`. Build a `TraverseRequest`: + +```rust +TraverseRequest { + root_table: pg_class_of(from.label), + root_id: from.id, + max_depth: max.unwrap_or(default_max_depth) as i32, + edge_types: Some(rel.types), + direction: rel.direction.into(), + node_tables: to_node_tables_for(&to.labels), + filter: expr_to_filter_jsonb(&to.properties), + strategy: "bfs", + uniqueness: "node_global", + include_start: false, + hydrate: true, + max_rows: default_max_rows, + row_offset: 0, + max_nodes: default_max_nodes, + max_frontier: default_max_frontier, +} +``` + +Call `execute_traverse_rows` (already exists). The result already has +the path JSONB and edge-path JSONB; we keep those for `RETURN p` +support. + +Edge cases: + +- `min = 0`: include the start node as a zero-length match. We handle + this in the row evaluator, not by re-running traverse — we emit a + union of `{depth=0, node=from}` and the traverse result. +- `min > 1`: post-filter by `depth >= min`. The traversal already + enumerates from depth 1 upward. +- `max = None` (unbounded): use the configured `graph.default_max_depth` + as a hard cap, and emit a `NOTICE` if the cap was hit. Cypher does + not actually allow truly unbounded varlen in practice — there's + always a cap somewhere. + +### `Filter { input, predicate }` + +Two-stage strategy: + +1. **Push the pushable.** Walk `predicate` in CNF; each conjunct that + is composed entirely of push-safe functions and references + columns we already pulled in `Source` / `Expand` becomes a SQL + `WHERE` fragment. +2. **Row-eval the rest.** Remaining conjuncts run over the + materialised stream from the child. + +Cypher null/3VL: a `Filter` drops both `false` and `null` rows (spec +§12.1 N3). SQL `WHERE` already drops `null`. They align. (See +`feat-request.md` §5.1.) + +### `Project { input, items }` / `With { input, items, filter }` + +- For each `Projection`, evaluate its `Expr` either in SQL (as part + of the surrounding SELECT) or in Rust. +- `With` has the same shape as `Project` but with an optional + trailing filter and a new scope barrier; downstream variable + references after a `With` are scoped to its output items only. +- Star-projection (`RETURN *`) was already expanded by HIR into an + explicit item list. Nothing special here. + +### `Aggregate { input, keys, aggs }` + +- If `input` materialised to SQL: `SELECT , FROM (...) GROUP BY `. +- If row-eval: stream rows into a `HashMap`. +- Empty `keys` → single output row even on empty input (Cypher + semantics; see `feat-request.md` §5.2). +- Supported aggregates: `count`, `count(*)`, `sum`, `avg`, `min`, + `max`, `collect`. `stDev`/`stDevP`/`percentile*` deferred to a + later milestone. + +### `OrderBy { input, keys }` / `Skip { input, count }` / `Limit { input, count }` + +SQL surface for both. Cypher's sort is stable and uses Cypher +ordering, which doesn't match SQL's `ORDER BY` for mixed-type columns +(e.g. `null` last vs `null` first). Two-stage approach: + +- If keys are all of a homogeneous primitive type and not mixed with + null-ordering rules, push to SQL with explicit `NULLS LAST`. +- Otherwise row-eval with the Cypher ordering predicate. + +`count` for `Skip` / `Limit` is an `Expr`; we evaluate it at the +start of execution (must be a constant integer; sema enforces). + +### `Distinct { input }` + +`SELECT DISTINCT` if SQL-pushed; otherwise a `HashSet` +in row-eval. The row fingerprint must respect Cypher value equality +(`null != null`), which differs from SQL's `DISTINCT` (`null = null`). +That's a gotcha — when rows contain nulls, we have to row-eval. + +### `Unwind { input, list, bind }` + +- If `list` evaluates to a SQL array, use `unnest(list)`. +- If `list` is a Cypher list value (JSONB array, possibly heterogeneous), + row-eval iterating the JSONB elements. + +### `Union { left, right, kind }` + +Run both sub-plans, emit rows of `left` then `right`. `UnionKind::All` +keeps duplicates; `UnionKind::Distinct` dedups by Cypher value +equality (same gotcha as `Distinct`). + +### `OptionalJoin { input, pattern }` + +Conceptually a left outer join: for each row of `input`, evaluate +`pattern`; if it produces zero rows, emit one row with every variable +introduced by `pattern` bound to `null`. + +Implementation: lateral execution. + +```sql +SELECT outer.*, inner.* +FROM () outer +LEFT JOIN LATERAL () inner ON true +``` + +If `pattern` is row-eval, do it explicitly in Rust: for each outer +row, call the inner sub-plan; if empty, emit one null-bound row. + +### `ShortestPath { ... }` — pending upstream cyrs + +This op doesn't exist yet (see `feat-request.md` §1.1). Once it +lands, route directly to `path_finder.rs`. Until then, we reject +`shortestPath(...)` / `allShortestPaths(...)` with a host-range +diagnostic explaining the limitation. + +## Path materialisation (`MATCH p = ...`) + +Pending the upstream clarification in `feat-request.md` §1.2. Our +working assumption (subject to confirmation): + +- A `VarKind::Path` binding produces a Plan-level value whose runtime + representation is a list-of-(node, rel, node) triples. +- pgGraph already produces this as `path JSONB` from + `execute_traverse_rows`. +- Functions over paths (`length(p)`, `nodes(p)`, `relationships(p)`) + consume this JSONB. + +If cyrs surfaces a more structured Plan-level path constructor, we'll +match that shape. + +## Heuristics: when to row-eval vs push to SQL + +Default: **push to SQL until something forces row-eval**. + +Forces row-eval (sticky from that operator upward): + +- Any `Expr::Call` to a function not classified push-safe. +- Any reference to a Cypher-typed value with no natural SQL projection + (paths, lists-of-nodes, maps). +- `OptionalJoin` where the inner pattern itself row-evals. +- Cypher value-equality semantics where SQL's would diverge + (`Distinct`/`Union Distinct` on rows containing null). + +Profile-driven re-tuning is a v2 concern. v1 picks the obvious +strategy per op and ships. diff --git a/docs/contributor_guide/cypher-frontend/040-write-mapping.md b/docs/contributor_guide/cypher-frontend/040-write-mapping.md new file mode 100644 index 0000000..732640c --- /dev/null +++ b/docs/contributor_guide/cypher-frontend/040-write-mapping.md @@ -0,0 +1,241 @@ +# Write-side mapping: `cyrs_plan::WriteOp` → SPI DML + +Write ops form a `Vec` on `PlanStatement`. They execute +**per output row of the read tree**, in order. Each op may bind a new +variable visible to subsequent ops on the same row. Atomicity is +inherited from the surrounding Postgres transaction. + +Identities used here: + +- **Node identity** = `(table_name, id)` where `id` is text matching + the registered `id_column`. +- **Relationship identity** = `(rel_type, from_id, to_id)` and, for + junction-table edges, also the row's primary key in the junction + table. + +## `CreateNode { labels, props, bind }` + +1. Look up the (unique) label in `_registered_labels`. Multi-label + `labels.len() > 1` requires the sorted set to appear in + `_registered_label_sets`; otherwise reject with host-range + diagnostic. (Tracked at sema time via `labels_compatible` — + `feat-request.md` §2.3 — but we re-check at execution to defend + against catalog races.) +2. Evaluate `props` to a `serde_json::Map`. +3. For each property: look up `_registered_label_properties`; map + `property → column_name`. Properties without a mapping become an + error unless we add a "spill to JSONB column" convention later + (deferred). +4. Build: + + ```sql + INSERT INTO (, , ...) + VALUES ($1, $2, ...) + RETURNING + ``` + + Execute via `Spi::run_with_args`. The returned `id` populates the + `bind` variable for downstream ops. + +`NOT NULL` columns without defaults that are missing from `props` +become a write-time error from Postgres. We surface those as +`ereport(ERROR)` with the original SQLSTATE. + +Multi-label with discriminator: if any of the labels uses a +discriminator column, set it explicitly in the INSERT. + +## `CreateRel { from, to, rel_type, props, bind }` + +Lookup `_registered_rel_types[rel_type]`. Two structural cases +depending on the underlying edge shape: + +### Case A — FK column on `from_table` + +```sql +UPDATE +SET = $to_id +WHERE = $from_id +``` + +Set-typed FKs are not in scope (one-to-many requires a junction). + +### Case B — Junction table + +```sql +INSERT INTO ( + , , , +) VALUES ($1, $2, $3, ...) +RETURNING +``` + +`label_column` only present if the registered edge is polymorphic on +type (one junction holds multiple rel types). + +In both cases the `bind` variable carries a relationship identity for +downstream `SetProperty` / `Delete` calls. + +## `MergeNode { labels, props, on_create, on_match, bind }` + +**Sema precondition** (relies on `feat-request.md` §2.1 / §2.2): +the planner identified `key_props: Vec` as the determinism +key. Sema validated, via `SchemaProvider::label_unique_props`, that +these props correspond to a registered `_registered_unique_props` row, +which itself was validated to correspond to a real Postgres unique +constraint at registration time. + +Execution: + +```sql +INSERT INTO () +VALUES ($1, $2, ...) +ON CONFLICT () DO UPDATE + SET = EXCLUDED. -- no-op, just to fire RETURNING +RETURNING , (xmax = 0) AS __was_created +``` + +`__was_created` selects between `on_create` and `on_match`. We then +execute each `WriteOp` in the chosen vec, in order, with `bind` +already populated. + +Race-free because the constraint is real Postgres uniqueness; the +upsert is atomic. + +## `MergeRel { from, to, rel_type, props, on_create, on_match, bind }` + +Two cases mirror `CreateRel`: + +- **FK column:** can't really MERGE — an FK column either has a value + or it doesn't. We treat MERGE as "if from.is NULL set it, + else verify it equals to_id; if it doesn't, choose `on_match` + side." Rare in practice. +- **Junction table:** requires a `UNIQUE (from_col, to_col[, type])` + on the junction. `_registered_unique_props` records this. Then: + + ```sql + INSERT INTO (, , [,] ) + VALUES ($1, $2, [$3,] ...) + ON CONFLICT (, [, ]) DO UPDATE + SET = EXCLUDED. + RETURNING , (xmax = 0) AS __was_created + ``` + +If the catalog doesn't record a matching uniqueness tuple, MERGE on +this rel is rejected at sema time. No silent SELECT-then-INSERT +fallback — that would violate atomicity guarantees Cypher users +rightfully expect. + +## `SetProperty { target, prop, value }` + +`target` resolves to a `(table_or_junction, id)`. Look up `prop` in +the appropriate property table. Emit: + +```sql +UPDATE
SET = $value WHERE = $id +``` + +`value` is an `Expr` evaluated to a Cypher value, then cast to the +column type via the type-bridge (`050-expr-and-types.md`). Cast +failures (e.g. assigning a list into a scalar column) become +`ereport(ERROR, ..., SQLSTATE 22023)`. + +## `SetLabels { target, labels }` + +Hard case. Three possible interpretations: + +1. **Discriminator-column model:** `UPDATE
SET = + $new_label`. Trivial when applicable. +2. **Multi-table model:** moving a row between tables. We do not + support this in v1 — reject with host-range diagnostic + `E45xx — label set arithmetic not supported by storage model`. +3. **Junction-row model:** insert a row into a per-row `_node_labels` + junction. Not in scope for v1; tracked in `080-open-questions.md`. + +The diagnostic message names the underlying constraint +("`SET n:Foo:Bar` cannot run because `Person` is stored without a +discriminator column"). The user fix is to `register_label_property` +with an explicit discriminator. + +## `RemoveProperty { target, prop }` + +```sql +UPDATE
SET = NULL WHERE = $id +``` + +Reject if the column is `NOT NULL` (host-range diagnostic citing the +catalog's `required = true`). + +## `RemoveLabels { target, labels }` + +Mirror of `SetLabels`. Same v1 restriction: only the discriminator +model is supported. + +## `Delete { targets, detach: false }` + +For each `target` expr (must evaluate to a node or relationship +reference): + +```sql +DELETE FROM
WHERE = $id +``` + +Postgres enforces "no orphaned edges" via FKs on the registered +edges. If a non-`DETACH DELETE` would orphan an edge, Postgres raises +`23503` (`foreign_key_violation`); we map it to a Cypher-style +diagnostic citing the rel type that blocked the delete. + +## `Delete { targets, detach: true }` + +`DETACH DELETE` removes incident edges first. + +pgGraph already knows the incident edges (the index). For each +target node: + +1. Look up all `_registered_rel_types` whose `from_table` or + `to_table` matches the target's table. +2. For each, issue: + - If junction edge: `DELETE FROM WHERE = + $id OR = $id`. + - If FK edge: `UPDATE SET = NULL WHERE + = $id` (sets the FK to NULL — only legal if the column + is nullable; otherwise the DETACH DELETE itself is illegal in + pgGraph's model and we diagnose). +3. Then `DELETE FROM
WHERE = $id` as in the + non-detach case. + +This runs as a single SPI sequence inside the per-row write step, +inside the surrounding transaction. + +## Sync integration + +We do nothing special. Every write above hits a registered table +through SPI. pgGraph's existing sync triggers fire on those tables +and refresh the in-memory index just as they would for any +application-issued DML. The index sees a consistent view at the next +read. + +## Per-row write ordering and visibility + +Within one Cypher statement, write ops execute in the order they +appear, per output row of the read tree. Effects of earlier ops are +visible to later ops *within the same row's variable bindings* +(because the bound variables are passed forward in-memory by the +facade); the **graph index** may or may not have been refreshed by +the time a later op runs (sync is async-ish), so we MUST NOT depend +on it for write-then-read on the same statement. + +What this means in practice: if a Cypher statement does +`CREATE (a:X)-[:R]->(b:Y) ... MATCH (q)-[:R]->(b) ...` in the same +statement, the `MATCH` after `CREATE` will *not* see the freshly +created edge through the graph engine. Cypher semantics expect it to. +We have two options: + +1. **Materialise writes through SQL only**, and run every subsequent + MATCH as a SQL traversal (recursive CTE / repeated joins) within + the same transaction so it sees uncommitted writes. Heavy. +2. **Reject mixed read-after-write within a single statement** with + a clear diagnostic, and let users split into two statements (the + sync barrier between them refreshes the index). + +For v1: ship option 2 with a clean diagnostic. Option 1 is a v2 ask +that may justify a bigger redesign of the engine read path. See +`080-open-questions.md` Q-RW-1. diff --git a/docs/contributor_guide/cypher-frontend/050-expr-and-types.md b/docs/contributor_guide/cypher-frontend/050-expr-and-types.md new file mode 100644 index 0000000..1804dc2 --- /dev/null +++ b/docs/contributor_guide/cypher-frontend/050-expr-and-types.md @@ -0,0 +1,181 @@ +# Expressions and types + +This document covers: + +1. How a `cyrs_plan::Expr` becomes either a SQL fragment or a Rust-side + value computation. +2. The Cypher value type ↔ Postgres type bridge. +3. The null / 3VL alignment between Cypher and SQL. + +## `Expr` translation strategies + +`cyrs_plan::Expr` variants (`crates/cyrs-plan/src/lib.rs:483-…`): + +| `Expr` variant | SQL push | Row eval | +| ------------------- | --------------------------------------------------------- | --------------------------------------- | +| `Null` | `NULL` | `Value::Null` | +| Scalar literals | param literal of matching type | `Value::*` | +| `VarRef(v)` | column reference (must already be in scope) | lookup in row bindings | +| Property access | `.` | JSONB get-path on the materialised node | +| Binary ops | translate operator; respect Cypher 3VL | Rust impl | +| Unary ops | translate | Rust impl | +| `Call { name, .. }` | only if function is push-safe (see table below) | dispatch in `row_eval::call` | +| `CASE … WHEN …` | SQL `CASE` | Rust match | +| List literal | `array[$1, $2, ...]` for homogeneous primitives; otherwise force row eval | `Value::List` | +| Map literal | `jsonb_build_object(...)` | `Value::Map` | +| Parameter ref | bound SPI parameter (`$N`) | bound facade value | +| `Exists(pattern)` | sub-`EXISTS (...)` against the SQL-pushed inner | run the sub-plan, check non-empty | + +The bottom-up rule: any `Expr` that requires row eval forces its +parent operator to row eval. We never reconstruct a SQL fragment +around a Rust-evaluated sub-expression. + +## Function classification + +Built-ins from `cyrs_schema::StandardLibrary` (pending the enumeration +in `feat-request.md` §1.3). Three buckets: + +### Push-safe to SQL + +| Cypher built-in | Postgres equivalent | +| --------------------- | ---------------------------- | +| `coalesce(x, y, …)` | `COALESCE(x, y, …)` | +| `toLower(s)` | `lower(s)` | +| `toUpper(s)` | `upper(s)` | +| `trim(s)` | `btrim(s)` | +| `ltrim(s)` / `rtrim` | `ltrim` / `rtrim` | +| `substring(s, i[, l])`| `substring(s, i+1, l)` (`i` is 0-based in Cypher) | +| `replace(s, a, b)` | `replace(s, a, b)` | +| `size(list)` | `array_length(list, 1)` for arrays; row-eval for jsonb arrays | +| `length(string)` | `length(s)` | +| `abs/ceil/floor/round`| `abs/ceil/floor/round` | +| `sqrt/exp/log/log10` | matching | +| `toString(x)` | `cast(x as text)` for primitives | +| `toInteger(x)` | `cast(x as bigint)` | +| `toFloat(x)` | `cast(x as double precision)`| +| `toBoolean(s)` | `cast(s as bool)` | +| Date/datetime ctors | `make_date`, `make_timestamp`, etc. | + +### Row-eval only + +`rand`, `randomUUID`, `timestamp()`, `datetime()` (when called without +args — non-deterministic), `id`, `labels`, `keys`, `properties`, +`nodes(p)`, `relationships(p)`, `head/tail/last` on heterogeneous +lists, list predicates `any/all/none/single`, `reduce`. + +### Rejected / not yet supported + +`shortestPath` / `allShortestPaths` (pending cyrs §1.1; the read-op +form is the supported entry point, not the expression form). Spatial +functions. Temporal functions beyond `date`/`datetime`. + +## Value model (Rust-side) + +```rust +#[derive(Debug, Clone)] +enum Value { + Null, + Bool(bool), + Int(i64), + Float(f64), + String(SmolStr), + Date(chrono::NaiveDate), + Datetime(chrono::DateTime), + List(Vec), + Map(IndexMap), + Node(NodeRef), // (table_oid, id) + Relationship(RelRef), // (rel_type, from_id, to_id, [junction_pk]) + Path(Vec), // alternating node/rel/node +} +``` + +Row eval operates over `Value`. SQL-pushed paths skip this entirely. + +## Type bridge + +Driven by `_registered_label_properties.column_type`. Mapping table is +in `020-catalog-extensions.md`. + +The bridge runs at three points: + +1. **Read hydration:** Postgres value → `Value`. JSONB columns are + recursively converted; other primitives map directly. +2. **Parameter binding:** Cypher param `$x` (typed via cyrs's + inferred param map, `feat-request.md` §2.4) → SPI parameter with + matching pg type. Maps and lists become JSONB. +3. **Write coercion:** `Value` produced by an `Expr` → SPI parameter + of the target column's pg type. Cast failures become + `ereport(ERROR, ..., SQLSTATE 22023 invalid_parameter_value)`. + +## Null and 3VL + +Cypher uses 3-valued logic. Postgres SQL uses 3-valued logic. They +mostly align. Known cases where they differ: + +| Construct | Cypher | Postgres SQL | Resolution | +| -------------------------- | --------------------- | --------------------- | ---------- | +| `null = null` | `null` | unknown (→ false in WHERE) | Same observable behaviour in `Filter`; both drop the row. ✓ | +| `null = x` | `null` | unknown | Same in `Filter`. ✓ | +| `null OR true` | `true` | `true` | ✓ | +| `null AND false` | `false` | `false` | ✓ | +| `n.prop` on null `n` | `null` | type error | Wrap with `CASE WHEN n IS NULL THEN NULL ELSE n.END` | +| `[1,2,3][null]` | `null` | type error | Force row eval for list indexing | +| `list IN list` | structural | per-element via `ANY` | Force row eval for list membership | +| `Distinct` over null cols | rows with null differ structurally | `DISTINCT` treats null as one group | Force row eval (see `030-read-mapping.md`) | +| `ORDER BY` null | "smaller than any value" | NULLS FIRST or LAST depending | Use explicit `NULLS LAST` when pushing | + +The first rule of the bridge: **when in doubt, row-eval.** +Correctness > performance. Push-to-SQL is opportunistic. + +## Parameter typing + +Each `$param` in the source surfaces in `PlanStatement.params` (pending +upstream `feat-request.md` §2.4). Pre-binding step: + +```rust +fn bind_param(p: &ParamType, j: &serde_json::Value) -> Result { + match p { + ParamType::Scalar(PropertyType::Int) => i64::try_from(j.as_i64()...), + ParamType::Scalar(PropertyType::String) => ..., + ParamType::Scalar(PropertyType::Date) => parse_date(j.as_str()?), + ParamType::List(inner) => to_jsonb_array_typed(j, inner)?, + ParamType::Map => SpiParam::jsonb(j.clone()), + ParamType::Unknown => SpiParam::jsonb(j.clone()), // fallback + ... + } +} +``` + +`graph.cypher(query text, params jsonb)` accepts params as a single +JSONB; the JSONB has the parameter names as keys. (We considered a +variadic form but JSONB is easier to forward from clients and is what +existing pgGraph functions like `filter` already accept.) + +## Function library scaffolding + +```rust +// graph/src/cypher_facade/plan_translator/expr.rs + +enum PushResult<'a> { + Sql(String, Vec>), + NeedsRowEval(&'a Expr), +} + +fn try_push_call(name: &str, args: &[Expr], ctx: &PushCtx) -> PushResult<'_> { + match BUILTIN_PUSH.get(name) { + Some(rule) => rule.emit(args, ctx), + None => PushResult::NeedsRowEval(/* expr ref */), + } +} + +static BUILTIN_PUSH: phf::Map<&'static str, &'static dyn PushRule> = phf::phf_map! { + "coalesce" => &CoalescePushRule, + "toLower" => &SimpleRename("lower"), + "toUpper" => &SimpleRename("upper"), + ... +}; +``` + +The table is exhaustive; missing entries default to row eval. CI test +asserts every name in `cyrs_schema::StandardLibrary` is either listed +in `BUILTIN_PUSH` or in a `ROW_EVAL_ONLY` allowlist. diff --git a/docs/contributor_guide/cypher-frontend/060-diagnostics-and-errors.md b/docs/contributor_guide/cypher-frontend/060-diagnostics-and-errors.md new file mode 100644 index 0000000..b39c663 --- /dev/null +++ b/docs/contributor_guide/cypher-frontend/060-diagnostics-and-errors.md @@ -0,0 +1,146 @@ +# Diagnostics, errors, and Postgres surface + +The point of using cyrs is the diagnostic quality. We must not lose +it on the way through Postgres. + +## Diagnostic taxonomy + +cyrs (`cyrs-diag`) produces `Diagnostic { code, severity, message, +primary, labels, notes, related, fixes }`. Code ranges (spec §10.2): + +| Range | Meaning | +| --------- | ---------------------------------------------- | +| E0xxx | Syntax (lexer + parser) | +| E1xxx | Name resolution | +| E2xxx | Semantic — schema-free | +| E3xxx | Semantic — schema-aware | +| E4xxx | Dialect / compatibility | +| E45xx | **Reserved for embedder rejections** (see `feat-request.md` §3.1) | +| E5xxx | Type system | +| W6xxx | Style / lint | +| W7xxx | Performance | +| N8xxx | Notes | + +The `E45xx` range is the one **we own** as the embedder. We mint +codes from it for situations cyrs cannot diagnose because they're +about our storage model: + +| Code | Meaning | +| ------ | ------------------------------------------------------------------------ | +| E4500 | Label not registered (`graph.register_label` required). | +| E4501 | Property not registered for label. | +| E4502 | Relationship type not registered. | +| E4503 | Label set not registered as compatible (multi-label `CREATE`). | +| E4504 | MERGE pattern lacks declared uniqueness on key props. | +| E4510 | `SetLabels` / `RemoveLabels` not supported by storage model. | +| E4520 | Read-after-write within a single statement (sync barrier required). | +| E4530 | `shortestPath` / `allShortestPaths` not yet implemented (pending §1.1). | +| E4540 | Property column is `NOT NULL`; `REMOVE` would violate. | +| E4550 | Inferred parameter type incompatible with registered column type. | +| E4560 | Function not supported (push-set + row-eval-set both reject). | +| E4599 | Catch-all: feature not yet implemented. | + +These codes are stable. New codes append; existing codes don't shift. + +## ereport surface + +```rust +fn report_diag(d: &Diagnostic) -> ! { + let sqlstate = sqlstate_for(d.code); + let mut err = ErrorReport::new(PgSqlErrorCode::from_sqlstate(sqlstate), + d.message.clone(), + /* funcname */ "graph.cypher"); + if let Some(byte_offset) = d.primary.span_start_byte() { + err = err.errposition(byte_offset as i32); + } + for label in &d.labels { + err = err.detail(format!("{}: {}", label.span, label.caption)); + } + for note in &d.notes { + err = err.hint(note.clone()); + } + err.report(); // unwinding panic → pgrx → ereport +} +``` + +`sqlstate_for`: + +| cyrs range | SQLSTATE | Postgres class | +| ----------- | -------- | ---------------------------------------- | +| `E0xxx` | `42601` | syntax_error | +| `E1xxx` | `42703` | undefined_column (used for undefined name in scope) | +| `E2xxx-E3xxx`| `42P10` | invalid_column_reference | +| `E4xxx` | `0A000` | feature_not_supported | +| `E45xx` | `0A000` | feature_not_supported (embedder) | +| `E5xxx` | `42804` | datatype_mismatch | +| `W6xxx` | NOTICE level, no SQLSTATE | +| `W7xxx` | WARNING level | +| `N8xxx` | NOTICE level | + +## Multi-error reporting + +cyrs returns *all* diagnostics from a check, not just the first. +Postgres' `ereport(ERROR, ...)` aborts on the first error. So: + +- Collect all sema diagnostics. +- Sort by primary span byte offset. +- If any are `Error`, emit the first as `ERROR` and append the rest + into `detail()` lines so the user sees them in the same backend + message. +- If only `Warning`/`Notice` exist, emit each via `ereport(NOTICE)` + before continuing. + +Optional future: ship a `graph.cypher_check(text) -> SETOF jsonb` +function that returns every diagnostic as a row, never aborts. Useful +for IDE integrations. v2. + +## Error sources by phase + +| Phase | Source | Surface | +| ---------------------- | ------------------------------------------ | ---------------- | +| HIR lowering | `cyrs_syntax::SyntaxError` (parse errors), `cyrs_hir::HirLowerError` (pending §4.1) | `42601` | +| Sema | `cyrs_sema` diagnostic stream | `42703 / 42P10 / 42804` | +| Plan lowering | `cyrs_plan::PlanLowerError` (`UnresolvedRef`, `UndesugaredExpr`, …) | `XX000` (internal — sema should have caught) | +| Embedder rejection | facade-side checks (label/rel missing, etc.) | `0A000` | +| Parameter binding | type bridge failure | `22023` | +| SPI exec | Postgres' own error (FK violation, unique violation, NOT NULL violation, …) | original SQLSTATE preserved | + +`PlanLowerError` from cyrs at runtime means we failed to keep the +pipeline in sync; we treat that as an internal error and log +verbosely. Should never happen with a correctly implemented sema +gate. + +## Diagnostic fixtures + +UI-style tests (compiletest-flavoured): pairs of `input.cypher` + +`expected.diag.txt`. The harness drives `graph.cypher()` in a test +backend, captures the `ereport` payload, and compares against the +expected file. `cargo xtask bless` regenerates. + +Mirror the structure cyrs already has under `tests/ui/sema/` and +`tests/ui/dialect/` — we add `tests/ui/cypher/`: + +``` +graph/tests/ui/cypher/ +├── label_not_registered/ +│ ├── input.cypher +│ └── expected.diag.txt +├── merge_no_unique/ +│ ├── input.cypher +│ ├── expected.diag.txt +│ └── schema.sql +└── ... +``` + +`schema.sql` is the catalog registration script run before the test +input; it documents exactly which registrations the input depends on. + +## What we do NOT translate + +- We do not translate `errposition` to a row/column. pgrx's + `errposition` takes a byte offset; that's what we already have + from cyrs. Postgres' clients (psql, etc.) render it. +- We do not chain related diagnostics through Postgres' "internal + query" mechanism. They become `DETAIL:` and `HINT:` lines instead. +- We do not emit machine-readable diagnostics in v1. SARIF / JSON + output is a v2 ask matching cyrs's roadmap D7. diff --git a/docs/contributor_guide/cypher-frontend/070-milestones-and-tests.md b/docs/contributor_guide/cypher-frontend/070-milestones-and-tests.md new file mode 100644 index 0000000..d5084cd --- /dev/null +++ b/docs/contributor_guide/cypher-frontend/070-milestones-and-tests.md @@ -0,0 +1,204 @@ +# Milestones and tests + +Six milestones; each shippable on its own, each guarded by tests. +"Done" means: passing tests, doc page updated, all new public SQL +listed in `graph.control`-style migration. + +## M0 — Skeleton & catalog (week 1) + +**Goal:** the new module compiles, the catalog tables exist, the +`SchemaProvider` impl is wired but nothing executes yet. + +Scope: + +- Add cyrs deps to `graph/Cargo.toml`. Pin a single git revision / + crates.io minor. +- `cypher_facade/mod.rs` with stub `execute()` returning "not yet + implemented". +- `cypher_facade/schema_provider.rs` implementing `SchemaProvider` + over a fixed catalog snapshot. Methods that depend on upstream + asks (`label_unique_props`, `labels_compatible`) return default + values until cyrs ships them. +- `sql/cypher_catalog.sql` migration creating the eight catalog + tables from `020-catalog-extensions.md`. +- Five new pgrx SQL functions: `register_label`, + `register_label_property`, `register_rel_type`, + `register_unique`, `allow_label_set`. +- `register_unique` validates that a matching Postgres unique + constraint actually exists on the underlying table. + +Tests: + +- Unit tests on the `SchemaProvider` impl with a hand-built + `CatalogSnapshot`. +- pg_test: register a label, query the catalog, assert presence. +- Negative pg_test: `register_unique` rejects when no real + constraint exists. + +## M1 — Read MVP (weeks 2–3) + +**Goal:** `MATCH (a:L) WHERE a.x = $1 RETURN a.y, a.z` works for a +real registered table, with parameter binding. + +Scope: + +- `cypher_facade/plan_translator/read.rs` implementing `Source`, + `Filter` (push-only, scalar equalities and conjunctions), `Project` + (push-only). +- `Skip`, `Limit`, `Distinct` (SQL push). +- `param_bind.rs` for scalar param types (`String`, `Int`, `Float`, + `Bool`). +- `diag_to_pg.rs` for parse/sema/embedder errors. +- New SQL function: `graph.cypher(text, jsonb) RETURNS TABLE(row jsonb)`. + +Tests: + +- TCK subset: every scenario tagged `@MATCH` + `@WHERE` + `@RETURN` + that doesn't reference variable-length patterns or aggregations. + Skip / xfail anything that does. +- UI tests for diagnostics: unregistered label, unregistered property, + bad param type. +- pg_test: round-trip insert via SQL → query via `graph.cypher` → + assert rows. + +## M2 — Traversal & path ops (week 4) + +**Goal:** multi-hop and variable-length patterns work. + +Scope: + +- `Expand` (single-hop) via `engine::Engine::adjacent`. +- `Expand` (varlen) via existing `execute_traverse_rows` + + `TraverseRequest` builder. +- `OptionalJoin` (lateral form when SQL-pushed; explicit form when + row-evaled). +- `OrderBy` (push for homogeneous keys; row-eval otherwise). +- `Aggregate` (push for `count/sum/avg/min/max`; row-eval `collect`). +- `Unwind` (push for arrays; row-eval for JSONB lists). +- `Union` (both kinds, with the `Distinct` null-handling caveat). +- `With` (scope barrier; same emission as `Project` + optional + `Filter`). + +Tests: + +- TCK subset adds `@OPTIONAL-MATCH`, `@UNWIND`, `@AGGREGATIONS`, + `@WITH`, `@PATTERNS` (fixed-length), `@LISTS` (non-comprehension), + `@MAPS`. +- Engine integration tests using existing pgGraph fixtures (the demo + `people` ↔ `orders` schema in `demo/`). + +## M3 — Write MVP (weeks 5–6) + +**Goal:** `CREATE`, `SET`, `REMOVE`, `DELETE` all work against +registered tables. + +Scope: + +- `cypher_facade/plan_translator/write.rs`. +- `CreateNode` (single label only; multi-label requires the catalog + hook from cyrs §2.3). +- `CreateRel` (both FK-column and junction-table cases). +- `SetProperty`, `RemoveProperty`. +- `Delete` (no detach). +- `Delete { detach: true }` walking incident edges via the catalog. +- Param binding extended for date/datetime, list, map. + +Tests: + +- TCK subset adds `@CREATE`, `@SET`, `@REMOVE`, `@DELETE`. +- pg_test verifies sync triggers fire after each write op so the + index sees the change on a subsequent statement. +- Negative test for `E4520` (read-after-write within a single + statement). +- Negative test for FK-violation when deleting without `DETACH`. + +## M4 — MERGE & uniqueness (week 7) + +**Goal:** `MERGE` works atomically. + +Scope: + +- `MergeNode` via `INSERT ... ON CONFLICT (cols) DO UPDATE … + RETURNING …, (xmax = 0)`. +- `MergeRel` same shape on junction tables. +- `on_create` / `on_match` dispatch over the `xmax = 0` flag. +- Sema gate: `MergeNode` whose key props don't match a registered + uniqueness tuple → `E4504`. +- Requires cyrs feat-request §2.1 / §2.2 to be in place; until then + use a temporary embedder-side analysis on `MergeNode.props` to + extract the candidate key. + +Tests: + +- TCK subset adds `@MERGE`. +- Concurrent MERGE test using two backends; asserts no duplicates + and exactly one `on_create` branch fires. + +## M5 — Shortest path, hardening, TCK gate (week 8+) + +**Goal:** `shortestPath` works (blocked on cyrs §1.1); golden TCK +subset becomes a CI gate. + +Scope: + +- `ShortestPath` op → `path_finder.rs`. Lands when cyrs ships the + operator. +- All "deferred-to-M5" items from earlier milestones. +- Function-coverage table cross-checked against + `cyrs_schema::StandardLibrary` (CI test). +- Perf budget: a representative MATCH query compiles + executes in + < N ms; document N once we measure baseline. +- Wire the openCypher TCK subset (the green-tag list from spec §17.5) + as a CI gate in pgGraph. Failures block merge. + +Tests: + +- Full TCK subset run on every PR. +- Mutation-testing run on the facade module weekly (matches cyrs's + own §17.8 cadence). + +## What goes in `graph/tests/` + +``` +graph/tests/ +├── ui/cypher/ # diagnostic fixtures (see 060) +├── integration/cypher/ # pg_test cases +│ ├── basic_match.rs +│ ├── varlen_traverse.rs +│ ├── merge_atomicity.rs +│ ├── detach_delete.rs +│ └── ... +└── tck/ # openCypher TCK subset + ├── README.md + ├── corpus -> ../../cyrs-tck-corpus/ # vendored or submodule + └── runner.rs +``` + +The TCK runner re-uses cyrs's parser/sema for input validation but +runs the query through `graph.cypher()` in a real Postgres backend +spun up via pgrx-tests. We don't depend on `cyrs-tck` the crate; we +consume the corpus directly. + +## Performance baselines + +Establish before M5: + +- Median compile time (parse → HIR → sema → plan) on a 200-character + Cypher query: target < 5 ms. +- Median execute time for "2-hop pattern, equality WHERE on indexed + column, RETURN 10 rows" on the demo schema: target < 10 ms. +- Allocation budget: < 200 KB per invocation steady-state. + +These are sanity caps, not products. If we miss them by 2×, dig in. +If we miss by 10×, we have a design problem. + +## Out of scope for v1 (tracked elsewhere) + +- `CALL { }` subqueries, `EXISTS { }` subqueries. +- Temporal beyond `date`/`datetime`. +- Spatial. +- Read-after-write within a single statement (M3 ships rejection, + not support). +- Streaming results larger than `work_mem` — TableIterator + materialises. +- Per-tenant catalog overlays. diff --git a/docs/contributor_guide/cypher-frontend/080-open-questions.md b/docs/contributor_guide/cypher-frontend/080-open-questions.md new file mode 100644 index 0000000..c3e97af --- /dev/null +++ b/docs/contributor_guide/cypher-frontend/080-open-questions.md @@ -0,0 +1,181 @@ +# Open questions + +Resolve before, or during, implementation. Each question has a +decision owner and a "needed by" milestone. Items blocked on cyrs +upstream cite the section of `../../../../cyrs/feat-request.md` they +depend on. + +## Upstream (blocking on cyrs) + +### Q-UP-1 — `ShortestPath` ReadOp lands in `cyrs-plan` + +- **Status:** awaiting upstream (cyrs `feat-request.md` §1.1). +- **Needed by:** M5. +- **Impact if delayed:** `shortestPath` / `allShortestPaths` + permanently emit `E4530` until landed. We can ship M0–M4 without + it. + +### Q-UP-2 — Path-variable surface in Plan + +- **Status:** awaiting upstream doc clarification (cyrs §1.2). +- **Needed by:** M2 (varlen patterns with `RETURN p`). +- **Workaround:** treat path variables as JSONB shaped like the + existing `path` column from `execute_traverse_rows`. If cyrs + surfaces a structured form, we adapt. + +### Q-UP-3 — MERGE key surface on `WriteOp::MergeNode/MergeRel` + +- **Status:** awaiting upstream (cyrs §2.1). +- **Needed by:** M4. +- **Workaround for M4:** embedder-side analysis of `MergeNode.props` + to extract the key. Remove the workaround when upstream ships. + +### Q-UP-4 — `label_unique_props`, `rel_type_unique_props` on `SchemaProvider` + +- **Status:** awaiting upstream (cyrs §2.2). +- **Needed by:** M4 (so sema can prove MERGE determinism instead of + embedder rejecting at exec time). +- **Workaround:** runtime check in the facade, with `E4504` if the + required unique constraint isn't registered. + +### Q-UP-5 — `labels_compatible` on `SchemaProvider` + +- **Status:** awaiting upstream (cyrs §2.3). +- **Needed by:** M3 (multi-label CREATE). +- **Workaround:** v1 rejects every multi-label CREATE with `E4503` + until cyrs ships the hook. Single-label CREATE is unaffected. + +### Q-UP-6 — Parameter type surface + +- **Status:** awaiting upstream (cyrs §2.4). +- **Needed by:** M1. +- **Workaround:** treat every param as JSONB. Loses some pg-side + type checking; otherwise works. Re-bind once upstream ships. + +### Q-UP-7 — Function builtin enumeration + +- **Status:** awaiting upstream (cyrs §1.3). +- **Needed by:** M2 (any RETURN with function calls). +- **Workaround:** maintain our own bucket table; CI test asserts every + name in cyrs's current set is covered. Drift becomes a CI failure, + not a silent miss. + +### Q-UP-8 — `cyrs-hir::lower_statement` returns `Result` + +- **Status:** awaiting upstream (cyrs §4.1). +- **Needed by:** M1. +- **Workaround:** catch unwinds from `lower_statement` in a panic + boundary and translate to `E0xxx` `42601`. Worse UX than a real + result type. + +### Q-UP-9 — `HirId → byte span` accessor + +- **Status:** awaiting upstream (cyrs §4.2). +- **Needed by:** M1 (for `errposition`). +- **Workaround:** omit `errposition`; users get diagnostics without + caret positioning. Tolerable for v1. + +### Q-UP-10 — Crates.io publication or stable git tag + +- **Status:** awaiting upstream (cyrs §6.1). +- **Needed by:** M0 (cannot pin `Cargo.toml` otherwise). +- **Workaround:** path dependency in development; flip to a tagged + rev before any release. + +## Internal (no upstream dependency) + +### Q-IN-1 — Read-after-write within a single statement + +- **Decision needed:** for v1, do we + (a) reject every Cypher statement that has both a write op and a + subsequent read referencing the just-written entities (`E4520`), + or + (b) accept it but document that the read won't see the write + through the graph engine (only through SQL)? +- **Recommendation:** (a). Cleaner UX; never silently wrong. +- **Needed by:** M3. + +### Q-IN-2 — Label set arithmetic without discriminator column + +- **Decision needed:** do we (1) reject `SetLabels` / `RemoveLabels` + on non-discriminator tables (`E4510`), (2) introduce a v2-only + `_node_labels` junction table, or (3) require every label table to + declare a discriminator column at registration time? +- **Recommendation for v1:** (1). Discriminator-backed tables get + full support; everything else gets `E4510` with a clear remediation + hint. +- **Needed by:** M3. + +### Q-IN-3 — JSON shape of the `graph.cypher()` result + +- **Decision needed:** one `jsonb` column with the whole RETURN row + as an object, or `RETURNS SETOF jsonb` per item, or `RETURNS TABLE + (..)` with one column per RETURN item dynamically typed? +- **Recommendation:** single `jsonb row` column for v1. + `RETURNS TABLE` with dynamic columns is not supported by pgrx; + emulating it via `record` types is fragile and gives bad UX. Users + who want flat columns wrap in `jsonb_to_record(...)`. +- **Needed by:** M1. + +### Q-IN-4 — Statement compile cache + +- **Decision needed:** cache `(text, schema_digest, dialect)` → + `PlanStatement`, or recompile every call? +- **Recommendation:** v1 recompiles. Add a per-backend `LruCache` + in M5 if compile-time shows up in profiling. +- **Needed by:** post-M5 perf pass. + +### Q-IN-5 — Configurable default depth / row caps + +- **Decision needed:** does `graph.cypher()` honour + `graph.default_max_depth` / `graph.max_nodes` / `graph.max_frontier` + GUCs the same way `graph.traverse()` does? +- **Recommendation:** yes. Same GUCs, same semantics. Adds a `NOTICE` + when a cap is reached. +- **Needed by:** M2. + +### Q-IN-6 — Tenant scoping + +- **Decision needed:** how does `WHERE` interact with the existing + tenant column? Auto-injection of a tenant predicate is convenient + but surprising. +- **Recommendation:** auto-inject ` = + current_setting('pgGraph.tenant')` on every `Source` over a tenanted + table when the GUC is set. Emit a `NOTICE` when first auto-injected + in a statement. Document it as a documented invariant, not as + magic. +- **Needed by:** M1. + +### Q-IN-7 — Dialect default + +- **Decision needed:** what's the default `DialectMode` for + `graph.cypher()`? GqlAligned or OpenCypherV9? +- **Recommendation:** `OpenCypherV9`. It's what the green-tag TCK + subset targets, and it's the dialect most existing tooling and + documentation expects. Selectable via GUC `graph.cypher_dialect`. +- **Needed by:** M1. + +### Q-IN-8 — Should `graph.cypher` accept multiple statements? + +- **Decision needed:** one statement per call, or a script + (statement; statement; …)? +- **Recommendation:** one. Multi-statement is what `psql` and the + outer transaction give you for free. +- **Needed by:** M1. + +### Q-IN-9 — `EXPLAIN`-equivalent + +- **Decision needed:** ship a companion `graph.explain_cypher(text) + RETURNS text` in v1, or defer? +- **Recommendation:** defer to M5 once the operator catalogue + stabilises. Print the `cyrs_plan::pretty` form. +- **Needed by:** M5. + +## To document later + +Not blocking, but worth writing up once we have a working M1: + +- A `user_guide/` page explaining `register_label` / friends and + showing a "hello cypher" example on the demo schema. +- A `roadmap.mdx` entry pointing at this directory. +- A note in the top-level `README.md` once M5 ships. diff --git a/graph/Cargo.lock b/graph/Cargo.lock index 7178b1a..e495ac0 100644 --- a/graph/Cargo.lock +++ b/graph/Cargo.lock @@ -132,6 +132,16 @@ dependencies = [ "hybrid-array", ] +[[package]] +name = "borsh" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfd1e3f8955a5d7de9fab72fc8373fade9fb8a703968cb200ae3dc6cf08e185a" +dependencies = [ + "bytes", + "cfg_aliases", +] + [[package]] name = "bumpalo" version = "3.20.2" @@ -195,7 +205,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "374b7c592d9c00c1f4972ea58390ac6b18cbb6ab79011f3bdc90a0b82ca06b77" dependencies = [ "serde", - "toml", + "toml 0.9.12+spec-1.1.0", ] [[package]] @@ -239,6 +249,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "chacha20" version = "0.10.0" @@ -470,6 +486,17 @@ dependencies = [ "cmov", ] +[[package]] +name = "cyrs-schema" +version = "0.1.0" +dependencies = [ + "indexmap", + "serde", + "smol_str", + "thiserror 2.0.18", + "toml 0.8.23", +] + [[package]] name = "digest" version = "0.11.2" @@ -686,6 +713,8 @@ dependencies = [ "bitvec", "crc32fast", "criterion", + "cyrs-schema", + "indexmap", "memmap2", "pgrx", "pgrx-tests", @@ -693,6 +722,7 @@ dependencies = [ "roaring", "serde", "serde_json", + "smol_str", "thiserror 2.0.18", "xxhash-rust", ] @@ -1224,7 +1254,7 @@ dependencies = [ "serde", "serde_json", "thiserror 2.0.18", - "toml", + "toml 0.9.12+spec-1.1.0", "url", "winapi", ] @@ -1712,6 +1742,15 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + [[package]] name = "serde_spanned" version = "1.1.1" @@ -1756,6 +1795,16 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +[[package]] +name = "smol_str" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4aaa7368fcf4852a4c2dd92df0cace6a71f2091ca0a23391ce7f3a31833f1523" +dependencies = [ + "borsh", + "serde_core", +] + [[package]] name = "socket2" version = "0.6.3" @@ -1974,6 +2023,18 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned 0.6.9", + "toml_datetime 0.6.11", + "toml_edit", +] + [[package]] name = "toml" version = "0.9.12+spec-1.1.0" @@ -1982,13 +2043,22 @@ checksum = "cf92845e79fc2e2def6a5d828f0801e29a2f8acc037becc5ab08595c7d5e9863" dependencies = [ "indexmap", "serde_core", - "serde_spanned", - "toml_datetime", + "serde_spanned 1.1.1", + "toml_datetime 0.7.5+spec-1.1.0", "toml_parser", "toml_writer", "winnow 0.7.15", ] +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + [[package]] name = "toml_datetime" version = "0.7.5+spec-1.1.0" @@ -1998,6 +2068,20 @@ dependencies = [ "serde_core", ] +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned 0.6.9", + "toml_datetime 0.6.11", + "toml_write", + "winnow 0.7.15", +] + [[package]] name = "toml_parser" version = "1.1.2+spec-1.1.0" @@ -2007,6 +2091,12 @@ dependencies = [ "winnow 1.0.2", ] +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + [[package]] name = "toml_writer" version = "1.1.1+spec-1.1.0" @@ -2436,6 +2526,9 @@ name = "winnow" version = "0.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] [[package]] name = "winnow" diff --git a/graph/Cargo.toml b/graph/Cargo.toml index 945ced3..3b1bb81 100644 --- a/graph/Cargo.toml +++ b/graph/Cargo.toml @@ -35,6 +35,15 @@ memmap2 = "0.9" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" +# ─── Cypher frontend (feat/cypher-frontend branch) ───────────────── +# Path deps to the cyrs workspace co-located at ../../cyrs. We pin +# these by path during development; will flip to crates.io versions +# or a tagged git rev before any release. See +# docs/contributor_guide/cypher-frontend/080-open-questions.md Q-UP-10. +cyrs-schema = { path = "../../cyrs/crates/cyrs-schema" } +smol_str = "0.3" +indexmap = "2" + [lints.rust] rust_2018_idioms = { level = "warn", priority = -1 } missing_docs = "warn" diff --git a/graph/sql/cypher_catalog.sql b/graph/sql/cypher_catalog.sql new file mode 100644 index 0000000..28f090e --- /dev/null +++ b/graph/sql/cypher_catalog.sql @@ -0,0 +1,120 @@ +-- ─── Cypher frontend catalog tables (feat/cypher-frontend, M0) ───── +-- +-- Maps Cypher labels and relationship types onto pgGraph's registered +-- tables and edges. The cypher_facade module reads from these tables +-- to build a cyrs_schema::SchemaProvider snapshot per query. +-- +-- See: docs/contributor_guide/cypher-frontend/020-catalog-extensions.md + +-- ────────────────────────────────────────────────────────────────── +-- _registered_labels: Cypher label → Postgres table mapping. +-- A label maps to exactly one table. When the same table stores +-- multiple labels, the discriminator_col + discriminator_val pair +-- selects rows for this label. +-- ────────────────────────────────────────────────────────────────── +CREATE TABLE IF NOT EXISTS graph._registered_labels ( + label TEXT PRIMARY KEY, + table_name TEXT NOT NULL REFERENCES graph._registered_tables(table_name) + ON DELETE CASCADE, + discriminator_col TEXT NULL, + discriminator_val TEXT NULL, + CHECK ((discriminator_col IS NULL) = (discriminator_val IS NULL)) +); + +CREATE INDEX IF NOT EXISTS _registered_labels_table_idx + ON graph._registered_labels(table_name); + +-- ────────────────────────────────────────────────────────────────── +-- _registered_label_properties: Cypher property → table column. +-- One row per (label, property). Missing rows = "property not +-- stored on this label" → sema diagnostic + NULL reads. +-- ────────────────────────────────────────────────────────────────── +CREATE TABLE IF NOT EXISTS graph._registered_label_properties ( + label TEXT NOT NULL, + property TEXT NOT NULL, + column_name TEXT NOT NULL, + column_type TEXT NOT NULL, + required BOOLEAN NOT NULL DEFAULT FALSE, + PRIMARY KEY (label, property), + FOREIGN KEY (label) REFERENCES graph._registered_labels(label) + ON DELETE CASCADE +); + +-- ────────────────────────────────────────────────────────────────── +-- _registered_rel_types: Cypher rel type → existing pgGraph edge. +-- Binds a Cypher relationship type name to a registered edge row +-- (one of _registered_edges). +-- ────────────────────────────────────────────────────────────────── +CREATE TABLE IF NOT EXISTS graph._registered_rel_types ( + rel_type TEXT PRIMARY KEY, + from_table TEXT NOT NULL, + from_column TEXT NOT NULL, + to_table TEXT NOT NULL, + to_column TEXT NOT NULL, + label TEXT NOT NULL, + FOREIGN KEY (from_table, from_column, to_table, to_column, label) + REFERENCES graph._registered_edges + (from_table, from_column, to_table, to_column, label) + ON DELETE CASCADE +); + +-- ────────────────────────────────────────────────────────────────── +-- _registered_rel_properties: Cypher rel property → column (for +-- junction-table edges that carry their own properties). +-- ────────────────────────────────────────────────────────────────── +CREATE TABLE IF NOT EXISTS graph._registered_rel_properties ( + rel_type TEXT NOT NULL, + property TEXT NOT NULL, + column_name TEXT NOT NULL, + column_type TEXT NOT NULL, + required BOOLEAN NOT NULL DEFAULT FALSE, + PRIMARY KEY (rel_type, property), + FOREIGN KEY (rel_type) REFERENCES graph._registered_rel_types(rel_type) + ON DELETE CASCADE +); + +-- ────────────────────────────────────────────────────────────────── +-- _registered_unique_props: Declared uniqueness tuples (required +-- for MERGE atomicity — sema requires a matching tuple here before +-- allowing INSERT ... ON CONFLICT lowering). +-- ────────────────────────────────────────────────────────────────── +CREATE TABLE IF NOT EXISTS graph._registered_unique_props ( + kind TEXT NOT NULL CHECK (kind IN ('label', 'rel_type')), + name TEXT NOT NULL, + props TEXT[] NOT NULL, + PRIMARY KEY (kind, name, props) +); + +-- ────────────────────────────────────────────────────────────────── +-- _registered_label_sets: Declared multi-label compatibility. Each +-- row is a sorted-distinct label tuple permitted to co-exist on a +-- single node. Empty table = single-label-only. +-- ────────────────────────────────────────────────────────────────── +CREATE TABLE IF NOT EXISTS graph._registered_label_sets ( + labels TEXT[] PRIMARY KEY +); + +-- ────────────────────────────────────────────────────────────────── +-- Permissions: same posture as the base catalog tables. +-- ────────────────────────────────────────────────────────────────── +REVOKE ALL ON TABLE graph._registered_labels FROM PUBLIC; +REVOKE ALL ON TABLE graph._registered_label_properties FROM PUBLIC; +REVOKE ALL ON TABLE graph._registered_rel_types FROM PUBLIC; +REVOKE ALL ON TABLE graph._registered_rel_properties FROM PUBLIC; +REVOKE ALL ON TABLE graph._registered_unique_props FROM PUBLIC; +REVOKE ALL ON TABLE graph._registered_label_sets FROM PUBLIC; + +GRANT SELECT ON TABLE graph._registered_labels TO PUBLIC; +GRANT SELECT ON TABLE graph._registered_label_properties TO PUBLIC; +GRANT SELECT ON TABLE graph._registered_rel_types TO PUBLIC; +GRANT SELECT ON TABLE graph._registered_rel_properties TO PUBLIC; +GRANT SELECT ON TABLE graph._registered_unique_props TO PUBLIC; +GRANT SELECT ON TABLE graph._registered_label_sets TO PUBLIC; + +-- Mark these as configuration tables so pg_dump preserves their contents. +SELECT pg_catalog.pg_extension_config_dump('graph._registered_labels', ''); +SELECT pg_catalog.pg_extension_config_dump('graph._registered_label_properties', ''); +SELECT pg_catalog.pg_extension_config_dump('graph._registered_rel_types', ''); +SELECT pg_catalog.pg_extension_config_dump('graph._registered_rel_properties', ''); +SELECT pg_catalog.pg_extension_config_dump('graph._registered_unique_props', ''); +SELECT pg_catalog.pg_extension_config_dump('graph._registered_label_sets', ''); diff --git a/graph/src/cypher_facade/mod.rs b/graph/src/cypher_facade/mod.rs new file mode 100644 index 0000000..d801cec --- /dev/null +++ b/graph/src/cypher_facade/mod.rs @@ -0,0 +1,46 @@ +//! Cypher / openCypher v9 frontend for pgGraph. +//! +//! Entry point: the `graph.cypher(text, jsonb)` SQL function (added in +//! M1). This module owns the translation from a Cypher query string to +//! pgGraph operations, using the cyrs frontend (`cyrs-hir`, +//! `cyrs-plan`, `cyrs-schema`, `cyrs-sema`, `cyrs-diag`) for parsing, +//! resolution, sema, and logical plan IR. +//! +//! See: `docs/contributor_guide/cypher-frontend/000-overview.md` +//! +//! Milestone status: **M0 — skeleton & catalog**. The catalog tables +//! and registration SQL functions are wired; `execute()` returns the +//! `NotYetImplemented` error. + +pub(crate) mod registration; +pub(crate) mod schema_provider; + +use crate::safety::GraphError; + +/// Error surface for the Cypher facade. +/// +/// Maps onto `ereport` SQLSTATE per +/// `docs/contributor_guide/cypher-frontend/060-diagnostics-and-errors.md`. +#[derive(Debug, thiserror::Error)] +pub enum FacadeError { + /// Feature is part of the documented milestone plan but not yet + /// implemented on this branch. Surfaces as SQLSTATE `0A000` + /// (`feature_not_supported`). + #[error("graph.cypher: {0}")] + NotYetImplemented(&'static str), + + /// Catalog read or write failure. Propagates the underlying + /// `GraphError` SQLSTATE. + #[error(transparent)] + Catalog(#[from] GraphError), +} + +impl FacadeError { + /// SQLSTATE for `ereport`. + pub(crate) fn sqlstate(&self) -> &'static str { + match self { + FacadeError::NotYetImplemented(_) => "0A000", + FacadeError::Catalog(err) => err.sqlstate(), + } + } +} diff --git a/graph/src/cypher_facade/registration.rs b/graph/src/cypher_facade/registration.rs new file mode 100644 index 0000000..0834213 --- /dev/null +++ b/graph/src/cypher_facade/registration.rs @@ -0,0 +1,486 @@ +//! SQL-callable Cypher catalog registration functions. +//! +//! These functions write to the `_registered_labels`, +//! `_registered_label_properties`, `_registered_rel_types`, +//! `_registered_rel_properties`, `_registered_unique_props`, and +//! `_registered_label_sets` tables. They are the public +//! administration surface users invoke to bind Cypher names to +//! pgGraph storage before issuing Cypher queries. +//! +//! Mirrors the auth and panic-boundary pattern from +//! `sql_facade/admin.rs`. See: +//! `docs/contributor_guide/cypher-frontend/020-catalog-extensions.md`. + +use pgrx::prelude::*; + +use crate::safety::{self, GraphError, GraphResult}; + +// ────────────────────────────────────────────────────────────────── +// Admin-permission gate (mirrors sql_facade::admin so we don't have +// to widen visibility on the shared helper). +// ────────────────────────────────────────────────────────────────── + +fn require_graph_admin_result() -> GraphResult<()> { + let allowed = Spi::connect(|client| { + let result = client.select( + "SELECT + COALESCE((SELECT rolsuper FROM pg_roles WHERE rolname = current_user), false) + OR has_schema_privilege(current_user, 'graph', 'CREATE')", + None, + &[], + )?; + Ok::<_, pgrx::spi::SpiError>( + result + .first() + .get::(1) + .ok() + .flatten() + .unwrap_or(false), + ) + }) + .map_err(|e| GraphError::Internal(format!("admin-check SPI failed: {e}")))?; + + if allowed { + Ok(()) + } else { + Err(GraphError::AclDenied { + table: "graph (cypher registration)".to_string(), + }) + } +} + +// ────────────────────────────────────────────────────────────────── +// register_label +// ────────────────────────────────────────────────────────────────── + +/// Register a Cypher label against a previously registered table. +/// +/// `discriminator_col` / `discriminator_val` are jointly NULL or +/// jointly non-NULL; when set, they select which rows of +/// `table_name` carry this label. +#[pg_extern(schema = "graph")] +fn register_label( + label: &str, + table_name: &str, + discriminator_col: default!(Option, "NULL"), + discriminator_val: default!(Option, "NULL"), +) { + let result = (|| -> GraphResult<()> { + require_graph_admin_result()?; + validate_label_args(label, &discriminator_col, &discriminator_val)?; + ensure_table_registered(table_name)?; + insert_registered_label( + label, + table_name, + discriminator_col.as_deref(), + discriminator_val.as_deref(), + ) + })(); + if let Err(err) = result { + err.report(); + } +} + +fn validate_label_args( + label: &str, + disc_col: &Option, + disc_val: &Option, +) -> GraphResult<()> { + if label.is_empty() { + return Err(GraphError::Internal("label must not be empty".to_string())); + } + if disc_col.is_some() != disc_val.is_some() { + return Err(GraphError::Internal( + "discriminator_col and discriminator_val must both be NULL or both be set".to_string(), + )); + } + Ok(()) +} + +fn ensure_table_registered(table_name: &str) -> GraphResult<()> { + let found: bool = Spi::connect(|client| { + let result = client.select( + "SELECT EXISTS ( + SELECT 1 FROM graph._registered_tables WHERE table_name = $1 + )", + None, + &[table_name.into()], + )?; + Ok::<_, pgrx::spi::SpiError>( + result + .first() + .get::(1) + .ok() + .flatten() + .unwrap_or(false), + ) + }) + .map_err(|e| GraphError::Internal(format!("table-lookup SPI failed: {e}")))?; + + if found { + Ok(()) + } else { + Err(GraphError::Internal(format!( + "table '{table_name}' is not registered with graph; call graph.add_table() first", + ))) + } +} + +fn insert_registered_label( + label: &str, + table_name: &str, + disc_col: Option<&str>, + disc_val: Option<&str>, +) -> GraphResult<()> { + Spi::run_with_args( + "INSERT INTO graph._registered_labels + (label, table_name, discriminator_col, discriminator_val) + VALUES ($1, $2, $3, $4) + ON CONFLICT (label) DO UPDATE SET + table_name = EXCLUDED.table_name, + discriminator_col = EXCLUDED.discriminator_col, + discriminator_val = EXCLUDED.discriminator_val", + &[ + label.into(), + table_name.into(), + disc_col.map(str::to_string).into(), + disc_val.map(str::to_string).into(), + ], + ) + .map_err(|e| GraphError::Internal(format!("register_label insert failed: {e}"))) +} + +// ────────────────────────────────────────────────────────────────── +// register_label_property +// ────────────────────────────────────────────────────────────────── + +/// Map a Cypher property name onto a Postgres column on the label's +/// backing table. `column_type` is the textual pg type (e.g. `text`, +/// `int8`, `jsonb`); it informs sema's type lattice but does not +/// itself enforce a cast. +#[pg_extern(schema = "graph")] +fn register_label_property( + label: &str, + property: &str, + column_name: &str, + column_type: &str, + required: default!(bool, false), +) { + let result = (|| -> GraphResult<()> { + require_graph_admin_result()?; + if label.is_empty() || property.is_empty() || column_name.is_empty() || column_type.is_empty() { + return Err(GraphError::Internal( + "register_label_property: all string args required".to_string(), + )); + } + Spi::run_with_args( + "INSERT INTO graph._registered_label_properties + (label, property, column_name, column_type, required) + VALUES ($1, $2, $3, $4, $5) + ON CONFLICT (label, property) DO UPDATE SET + column_name = EXCLUDED.column_name, + column_type = EXCLUDED.column_type, + required = EXCLUDED.required", + &[ + label.into(), + property.into(), + column_name.into(), + column_type.into(), + required.into(), + ], + ) + .map_err(|e| GraphError::Internal(format!("register_label_property insert failed: {e}"))) + })(); + if let Err(err) = result { + err.report(); + } +} + +// ────────────────────────────────────────────────────────────────── +// register_rel_type +// ────────────────────────────────────────────────────────────────── + +/// Register a Cypher relationship type against an already-registered +/// edge. The five-column foreign key matches `_registered_edges`'s +/// composite key. +#[pg_extern(schema = "graph")] +fn register_rel_type( + rel_type: &str, + from_table: &str, + from_column: &str, + to_table: &str, + to_column: &str, + label: &str, +) { + let result = (|| -> GraphResult<()> { + require_graph_admin_result()?; + if rel_type.is_empty() { + return Err(GraphError::Internal("rel_type must not be empty".to_string())); + } + Spi::run_with_args( + "INSERT INTO graph._registered_rel_types + (rel_type, from_table, from_column, to_table, to_column, label) + VALUES ($1, $2, $3, $4, $5, $6) + ON CONFLICT (rel_type) DO UPDATE SET + from_table = EXCLUDED.from_table, + from_column = EXCLUDED.from_column, + to_table = EXCLUDED.to_table, + to_column = EXCLUDED.to_column, + label = EXCLUDED.label", + &[ + rel_type.into(), + from_table.into(), + from_column.into(), + to_table.into(), + to_column.into(), + label.into(), + ], + ) + .map_err(|e| GraphError::Internal(format!("register_rel_type insert failed: {e}"))) + })(); + if let Err(err) = result { + err.report(); + } +} + +// ────────────────────────────────────────────────────────────────── +// register_unique +// ────────────────────────────────────────────────────────────────── + +/// Declare a uniqueness tuple for MERGE atomicity. `kind` must be +/// `'label'` or `'rel_type'`; `name` is the corresponding label or +/// rel-type name; `props` is the ordered property tuple. +/// +/// **This function validates that a matching Postgres `UNIQUE` / +/// `PRIMARY KEY` constraint actually exists on the underlying +/// table.** Without that validation, MERGE's `INSERT ... ON CONFLICT` +/// lowering would not be atomic. +#[pg_extern(schema = "graph")] +fn register_unique(kind: &str, name: &str, props: Vec) { + let result = (|| -> GraphResult<()> { + require_graph_admin_result()?; + if kind != "label" && kind != "rel_type" { + return Err(GraphError::Internal(format!( + "register_unique: kind must be 'label' or 'rel_type' (got {kind:?})" + ))); + } + if props.is_empty() { + return Err(GraphError::Internal( + "register_unique: props must be non-empty".to_string(), + )); + } + validate_pg_unique_exists(kind, name, &props)?; + Spi::run_with_args( + "INSERT INTO graph._registered_unique_props (kind, name, props) + VALUES ($1, $2, $3) + ON CONFLICT (kind, name, props) DO NOTHING", + &[kind.into(), name.into(), props.into()], + ) + .map_err(|e| GraphError::Internal(format!("register_unique insert failed: {e}"))) + })(); + if let Err(err) = result { + err.report(); + } +} + +/// Resolve `kind` + `name` to the underlying table + property→column +/// map, then check that the resulting column tuple is covered by a +/// real Postgres unique constraint. +fn validate_pg_unique_exists(kind: &str, name: &str, props: &[String]) -> GraphResult<()> { + let (table_name, columns) = if kind == "label" { + resolve_label_columns(name, props)? + } else { + resolve_rel_type_columns(name, props)? + }; + + let columns_array = columns; + let exists: bool = Spi::connect(|client| { + // `pg_constraint.conkey` is an int2[] of attribute numbers; we + // compare against the requested column names via pg_attribute. + let result = client.select( + "WITH wanted AS ( + SELECT $2::text[] AS cols + ), + attnums AS ( + SELECT array_agg(a.attnum ORDER BY a.attnum) AS k + FROM pg_attribute a + WHERE a.attrelid = $1::regclass + AND a.attname = ANY((SELECT cols FROM wanted)) + ) + SELECT EXISTS ( + SELECT 1 + FROM pg_constraint c, attnums + WHERE c.conrelid = $1::regclass + AND c.contype IN ('u', 'p') + AND c.conkey @> attnums.k + AND attnums.k @> c.conkey + )", + None, + &[table_name.clone().into(), columns_array.into()], + )?; + Ok::<_, pgrx::spi::SpiError>( + result + .first() + .get::(1) + .ok() + .flatten() + .unwrap_or(false), + ) + }) + .map_err(|e| GraphError::Internal(format!("unique-constraint check SPI failed: {e}")))?; + + if exists { + Ok(()) + } else { + Err(GraphError::Internal(format!( + "no UNIQUE / PRIMARY KEY constraint on {} covering columns matching props {:?}; \ + create one before calling graph.register_unique()", + table_name, props, + ))) + } +} + +fn resolve_label_columns(label: &str, props: &[String]) -> GraphResult<(String, Vec)> { + let raw: Result<(Option, Vec>), pgrx::spi::SpiError> = + Spi::connect(|client| { + let table = client + .select( + "SELECT table_name FROM graph._registered_labels WHERE label = $1", + None, + &[label.into()], + )? + .first() + .get::(1)?; + let mut cols: Vec> = Vec::with_capacity(props.len()); + for p in props { + let col = client + .select( + "SELECT column_name FROM graph._registered_label_properties + WHERE label = $1 AND property = $2", + None, + &[label.into(), p.clone().into()], + )? + .first() + .get::(1)?; + cols.push(col); + } + Ok((table, cols)) + }); + let (table, cols) = raw + .map_err(|e| GraphError::Internal(format!("label-column resolution SPI failed: {e}")))?; + + let Some(table) = table else { + return Err(GraphError::Internal(format!( + "label {label:?} not registered" + ))); + }; + let mut resolved = Vec::with_capacity(cols.len()); + for (prop, col) in props.iter().zip(cols) { + match col { + Some(c) => resolved.push(c), + None => { + return Err(GraphError::Internal(format!( + "property {prop:?} not registered on label {label:?}" + ))); + } + } + } + Ok((table, resolved)) +} + +fn resolve_rel_type_columns(rel_type: &str, props: &[String]) -> GraphResult<(String, Vec)> { + // For rel types we treat the from_table as the carrier of the + // unique constraint when the edge has its own properties table + // (junction model). Pure FK edges don't currently support + // register_unique — MERGE atomicity on those is not in M0 scope. + let raw: Result<(Option, Vec>), pgrx::spi::SpiError> = + Spi::connect(|client| { + let table = client + .select( + "SELECT from_table FROM graph._registered_rel_types WHERE rel_type = $1", + None, + &[rel_type.into()], + )? + .first() + .get::(1)?; + let mut cols: Vec> = Vec::with_capacity(props.len()); + for p in props { + let col = client + .select( + "SELECT column_name FROM graph._registered_rel_properties + WHERE rel_type = $1 AND property = $2", + None, + &[rel_type.into(), p.clone().into()], + )? + .first() + .get::(1)?; + cols.push(col); + } + Ok((table, cols)) + }); + let (table, cols) = raw + .map_err(|e| GraphError::Internal(format!("rel-type-column resolution SPI failed: {e}")))?; + + let Some(table) = table else { + return Err(GraphError::Internal(format!( + "rel_type {rel_type:?} not registered" + ))); + }; + let mut resolved = Vec::with_capacity(cols.len()); + for (prop, col) in props.iter().zip(cols) { + match col { + Some(c) => resolved.push(c), + None => { + return Err(GraphError::Internal(format!( + "property {prop:?} not registered on rel_type {rel_type:?}" + ))); + } + } + } + Ok((table, resolved)) +} + +// ────────────────────────────────────────────────────────────────── +// allow_label_set +// ────────────────────────────────────────────────────────────────── + +/// Declare a multi-label combination that may co-exist on a single +/// node. The slice is normalised (sorted, deduplicated) before +/// insert. A single-element slice is always permitted and inserting +/// it is a no-op. +#[pg_extern(schema = "graph")] +fn allow_label_set(labels: Vec) { + let result = (|| -> GraphResult<()> { + require_graph_admin_result()?; + if labels.is_empty() { + return Err(GraphError::Internal( + "allow_label_set: labels must be non-empty".to_string(), + )); + } + let mut normalised = labels; + normalised.sort(); + normalised.dedup(); + if normalised.len() == 1 { + // Single-element sets are always permitted; no need to + // record them. + return Ok(()); + } + Spi::run_with_args( + "INSERT INTO graph._registered_label_sets (labels) + VALUES ($1) + ON CONFLICT (labels) DO NOTHING", + &[normalised.into()], + ) + .map_err(|e| GraphError::Internal(format!("allow_label_set insert failed: {e}"))) + })(); + if let Err(err) = result { + err.report(); + } +} + +/// Suppress unused-import lints; `safety` is consumed transitively +/// via re-exports above but the linter sometimes misses the chain. +#[allow(dead_code, reason = "module placeholder; expands in M1")] +fn _touch() -> safety::GraphResult<()> { + Ok(()) +} diff --git a/graph/src/cypher_facade/schema_provider.rs b/graph/src/cypher_facade/schema_provider.rs new file mode 100644 index 0000000..41e9406 --- /dev/null +++ b/graph/src/cypher_facade/schema_provider.rs @@ -0,0 +1,464 @@ +//! `cyrs_schema::SchemaProvider` implementation backed by the +//! pgGraph catalog (`_registered_labels`, `_registered_rel_types`, +//! and friends). +//! +//! Each `graph.cypher(...)` invocation snapshots the catalog once and +//! constructs a `PgGraphSchema` over the snapshot. The snapshot is +//! immutable for the duration of one query, which matches cyrs's +//! Salsa expectation that `schema_digest()` is stable across calls +//! within a single check (spec 0001 §11.2). +//! +//! Methods that depend on cyrs feature-request items +//! (`label_unique_props`, `rel_type_unique_props`, `labels_compatible`) +//! are written but rely on the trait defaults until upstream cyrs +//! ships them — see +//! `docs/contributor_guide/cypher-frontend/080-open-questions.md`. + +use std::collections::BTreeMap; + +use indexmap::IndexMap; +use pgrx::prelude::*; +use smol_str::SmolStr; + +use cyrs_schema::{ + EndpointDecl, FunctionSignature, ProcedureSignature, PropertyDecl, PropertyType, + SchemaProvider, StandardLibrary, +}; + +use crate::safety::{GraphError, GraphResult}; + +/// Immutable in-memory view of the Cypher catalog at the moment a +/// query begins. Built via [`PgGraphSchema::snapshot`]. +#[derive(Debug)] +pub(crate) struct PgGraphSchema { + labels: BTreeMap, + rel_types: BTreeMap, + unique_props_by_label: BTreeMap>>, + unique_props_by_rel_type: BTreeMap>>, + label_sets: Vec>, + digest: [u8; 32], +} + +#[derive(Debug, Clone)] +struct LabelEntry { + #[allow(dead_code, reason = "consumed in M1 read-side translation")] + table_name: String, + #[allow(dead_code, reason = "consumed in M1 read-side translation")] + discriminator: Option<(String, String)>, + properties: IndexMap, +} + +#[derive(Debug, Clone)] +struct RelTypeEntry { + #[allow(dead_code, reason = "consumed in M1/M2 read-side translation")] + from_table: String, + #[allow(dead_code, reason = "consumed in M1/M2 read-side translation")] + from_column: String, + #[allow(dead_code, reason = "consumed in M1/M2 read-side translation")] + to_table: String, + #[allow(dead_code, reason = "consumed in M1/M2 read-side translation")] + to_column: String, + #[allow(dead_code, reason = "consumed in M1/M2 read-side translation")] + edge_label: String, + properties: IndexMap, +} + +impl PgGraphSchema { + /// Read the Cypher catalog via SPI and build a snapshot. + pub(crate) fn snapshot() -> GraphResult { + let labels = load_labels()?; + let rel_types = load_rel_types()?; + let (unique_props_by_label, unique_props_by_rel_type) = load_unique_props()?; + let label_sets = load_label_sets()?; + let digest = compute_digest(&labels, &rel_types, &unique_props_by_label, + &unique_props_by_rel_type, &label_sets); + Ok(Self { + labels, + rel_types, + unique_props_by_label, + unique_props_by_rel_type, + label_sets, + digest, + }) + } +} + +// ────────────────────────────────────────────────────────────────── +// SchemaProvider impl +// ────────────────────────────────────────────────────────────────── + +impl SchemaProvider for PgGraphSchema { + fn labels(&self) -> Vec { + self.labels.keys().cloned().collect() + } + + fn relationship_types(&self) -> Vec { + self.rel_types.keys().cloned().collect() + } + + fn node_properties(&self, label: &str) -> Option> { + self.labels + .get(label) + .map(|entry| entry.properties.values().cloned().collect()) + } + + fn relationship_properties(&self, rel_type: &str) -> Option> { + self.rel_types + .get(rel_type) + .map(|entry| entry.properties.values().cloned().collect()) + } + + fn relationship_endpoints(&self, rel_type: &str) -> Vec { + // M0: we record only the from/to table names, not the labels + // pgGraph would associate with those tables. The proper join + // through `_registered_labels` lands in M1 once the read path + // needs it. + let Some(_entry) = self.rel_types.get(rel_type) else { + return Vec::new(); + }; + Vec::new() + } + + fn inverse_of(&self, _rel_type: &str) -> Option { + None + } + + fn function(&self, name: &str) -> Option { + // The stdlib catalog is `static` inside cyrs-schema; this + // construction is effectively free. + StandardLibrary::default().function(name) + } + + fn procedure(&self, _name: &str) -> Option { + None + } + + fn schema_digest(&self) -> [u8; 32] { + self.digest + } +} + +// ────────────────────────────────────────────────────────────────── +// Catalog readers +// ────────────────────────────────────────────────────────────────── + +fn load_labels() -> GraphResult> { + let mut labels: BTreeMap = BTreeMap::new(); + + Spi::connect(|client| { + let rows = client.select( + "SELECT label, table_name, discriminator_col, discriminator_val + FROM graph._registered_labels", + None, + &[], + )?; + for row in rows { + let label: String = row.get::(1)?.unwrap_or_default(); + let table_name: String = row.get::(2)?.unwrap_or_default(); + let disc_col: Option = row.get::(3)?; + let disc_val: Option = row.get::(4)?; + let discriminator = match (disc_col, disc_val) { + (Some(c), Some(v)) => Some((c, v)), + _ => None, + }; + labels.insert( + SmolStr::from(label), + LabelEntry { + table_name, + discriminator, + properties: IndexMap::new(), + }, + ); + } + Ok::<(), pgrx::spi::SpiError>(()) + }) + .map_err(|e| GraphError::Internal(format!("load_labels SPI failed: {e}")))?; + + Spi::connect(|client| { + let rows = client.select( + "SELECT label, property, column_name, column_type, required + FROM graph._registered_label_properties", + None, + &[], + )?; + for row in rows { + let label: String = row.get::(1)?.unwrap_or_default(); + let property: String = row.get::(2)?.unwrap_or_default(); + let _column: String = row.get::(3)?.unwrap_or_default(); + let column_type: String = row.get::(4)?.unwrap_or_default(); + let required: bool = row.get::(5)?.unwrap_or(false); + if let Some(entry) = labels.get_mut(label.as_str()) { + let ty = pg_type_to_property_type(&column_type); + entry + .properties + .insert(SmolStr::from(property.clone()), + PropertyDecl::new(property, ty, required)); + } + } + Ok::<(), pgrx::spi::SpiError>(()) + }) + .map_err(|e| GraphError::Internal(format!("load_label_properties SPI failed: {e}")))?; + + Ok(labels) +} + +fn load_rel_types() -> GraphResult> { + let mut rel_types: BTreeMap = BTreeMap::new(); + + Spi::connect(|client| { + let rows = client.select( + "SELECT rel_type, from_table, from_column, to_table, to_column, label + FROM graph._registered_rel_types", + None, + &[], + )?; + for row in rows { + let rel_type: String = row.get::(1)?.unwrap_or_default(); + let from_table: String = row.get::(2)?.unwrap_or_default(); + let from_column: String = row.get::(3)?.unwrap_or_default(); + let to_table: String = row.get::(4)?.unwrap_or_default(); + let to_column: String = row.get::(5)?.unwrap_or_default(); + let edge_label: String = row.get::(6)?.unwrap_or_default(); + rel_types.insert( + SmolStr::from(rel_type), + RelTypeEntry { + from_table, + from_column, + to_table, + to_column, + edge_label, + properties: IndexMap::new(), + }, + ); + } + Ok::<(), pgrx::spi::SpiError>(()) + }) + .map_err(|e| GraphError::Internal(format!("load_rel_types SPI failed: {e}")))?; + + Spi::connect(|client| { + let rows = client.select( + "SELECT rel_type, property, column_name, column_type, required + FROM graph._registered_rel_properties", + None, + &[], + )?; + for row in rows { + let rel_type: String = row.get::(1)?.unwrap_or_default(); + let property: String = row.get::(2)?.unwrap_or_default(); + let _column: String = row.get::(3)?.unwrap_or_default(); + let column_type: String = row.get::(4)?.unwrap_or_default(); + let required: bool = row.get::(5)?.unwrap_or(false); + if let Some(entry) = rel_types.get_mut(rel_type.as_str()) { + let ty = pg_type_to_property_type(&column_type); + entry.properties.insert( + SmolStr::from(property.clone()), + PropertyDecl::new(property, ty, required), + ); + } + } + Ok::<(), pgrx::spi::SpiError>(()) + }) + .map_err(|e| GraphError::Internal(format!("load_rel_properties SPI failed: {e}")))?; + + Ok(rel_types) +} + +#[allow(clippy::type_complexity, reason = "two parallel maps returned together")] +fn load_unique_props() -> GraphResult<( + BTreeMap>>, + BTreeMap>>, +)> { + let mut by_label: BTreeMap>> = BTreeMap::new(); + let mut by_rel: BTreeMap>> = BTreeMap::new(); + Spi::connect(|client| { + let rows = client.select( + "SELECT kind, name, props FROM graph._registered_unique_props", + None, + &[], + )?; + for row in rows { + let kind: String = row.get::(1)?.unwrap_or_default(); + let name: String = row.get::(2)?.unwrap_or_default(); + let props: Vec = row.get::>(3)?.unwrap_or_default(); + let key = SmolStr::from(name); + let tuple: Vec = props.into_iter().map(SmolStr::from).collect(); + let target = if kind == "label" { + by_label.entry(key).or_default() + } else { + by_rel.entry(key).or_default() + }; + target.push(tuple); + } + Ok::<(), pgrx::spi::SpiError>(()) + }) + .map_err(|e| GraphError::Internal(format!("load_unique_props SPI failed: {e}")))?; + Ok((by_label, by_rel)) +} + +fn load_label_sets() -> GraphResult>> { + let mut out: Vec> = Vec::new(); + Spi::connect(|client| { + let rows = client.select( + "SELECT labels FROM graph._registered_label_sets", + None, + &[], + )?; + for row in rows { + let labels: Vec = row.get::>(1)?.unwrap_or_default(); + out.push(labels.into_iter().map(SmolStr::from).collect()); + } + Ok::<(), pgrx::spi::SpiError>(()) + }) + .map_err(|e| GraphError::Internal(format!("load_label_sets SPI failed: {e}")))?; + Ok(out) +} + +// ────────────────────────────────────────────────────────────────── +// Type bridge (see 020-catalog-extensions.md) +// ────────────────────────────────────────────────────────────────── + +fn pg_type_to_property_type(pg_type: &str) -> PropertyType { + let normalised = pg_type.trim().to_lowercase(); + match normalised.as_str() { + "text" | "varchar" | "bpchar" | "char" | "name" => PropertyType::String, + "int2" | "int4" | "int8" | "smallint" | "integer" | "bigint" => PropertyType::Int, + "float4" | "float8" | "numeric" | "real" | "double precision" => PropertyType::Float, + "bool" | "boolean" => PropertyType::Bool, + "date" => PropertyType::Date, + "timestamp" | "timestamptz" | "timestamp with time zone" + | "timestamp without time zone" => PropertyType::Datetime, + // Array forms come back as either `_text` (internal) or `text[]`. + other if other.starts_with('_') => { + PropertyType::List(Box::new(pg_type_to_property_type(&other[1..]))) + } + other if other.ends_with("[]") => { + let inner = &other[..other.len() - 2]; + PropertyType::List(Box::new(pg_type_to_property_type(inner))) + } + "jsonb" | "json" => PropertyType::Any, + other => PropertyType::Opaque(SmolStr::from(other)), + } +} + +// ────────────────────────────────────────────────────────────────── +// Digest +// ────────────────────────────────────────────────────────────────── + +fn compute_digest( + labels: &BTreeMap, + rel_types: &BTreeMap, + unique_by_label: &BTreeMap>>, + unique_by_rel: &BTreeMap>>, + label_sets: &[Vec], +) -> [u8; 32] { + let mut hasher = xxhash_rust::xxh3::Xxh3::new(); + // Each section is prefixed by a tag byte so re-orderings between + // sections can't collide. BTreeMap iteration is sorted by key. + hasher.update(&[0x01]); + for (label, entry) in labels { + hasher.update(label.as_bytes()); + hasher.update(entry.table_name.as_bytes()); + if let Some((c, v)) = &entry.discriminator { + hasher.update(c.as_bytes()); + hasher.update(v.as_bytes()); + } + for (prop, decl) in &entry.properties { + hasher.update(prop.as_bytes()); + hasher.update(&[u8::from(decl.required)]); + } + } + hasher.update(&[0x02]); + for (rel, entry) in rel_types { + hasher.update(rel.as_bytes()); + hasher.update(entry.from_table.as_bytes()); + hasher.update(entry.from_column.as_bytes()); + hasher.update(entry.to_table.as_bytes()); + hasher.update(entry.to_column.as_bytes()); + hasher.update(entry.edge_label.as_bytes()); + for (prop, decl) in &entry.properties { + hasher.update(prop.as_bytes()); + hasher.update(&[u8::from(decl.required)]); + } + } + hasher.update(&[0x03]); + for (key, tuples) in unique_by_label { + hasher.update(key.as_bytes()); + for tuple in tuples { + for p in tuple { + hasher.update(p.as_bytes()); + hasher.update(b","); + } + hasher.update(b";"); + } + } + hasher.update(&[0x04]); + for (key, tuples) in unique_by_rel { + hasher.update(key.as_bytes()); + for tuple in tuples { + for p in tuple { + hasher.update(p.as_bytes()); + hasher.update(b","); + } + hasher.update(b";"); + } + } + hasher.update(&[0x05]); + for set in label_sets { + for label in set { + hasher.update(label.as_bytes()); + hasher.update(b","); + } + hasher.update(b";"); + } + // xxh3 is a 128-bit hash; we widen to 32 bytes by hashing twice + // with different seeds. Adequate for an invalidation key; this + // is not a cryptographic digest. + let lo = hasher.digest128().to_le_bytes(); + let mut second = xxhash_rust::xxh3::Xxh3::with_seed(0xA5A5_5A5A); + second.update(&lo); + let hi = second.digest128().to_le_bytes(); + let mut out = [0u8; 32]; + out[..16].copy_from_slice(&lo); + out[16..].copy_from_slice(&hi); + out +} + +// ────────────────────────────────────────────────────────────────── +// Accessors for future milestones (M1+ translator) +// ────────────────────────────────────────────────────────────────── + +impl PgGraphSchema { + /// Whether a multi-label combination is permitted. `None` = not + /// declared in the catalog (we treat as "rejected" at execution + /// time — see open question Q-IN-2 in 080-open-questions.md). + #[allow(dead_code, reason = "consumed in M3 write-side translation")] + pub(crate) fn labels_compatible_check(&self, labels: &[SmolStr]) -> Option { + if labels.len() <= 1 { + return Some(true); + } + let mut normalised: Vec = labels.to_vec(); + normalised.sort(); + normalised.dedup(); + Some(self.label_sets.iter().any(|s| s == &normalised)) + } + + /// Declared unique-property tuples for a label. Used in M4 to + /// validate MERGE pattern determinism against real Postgres + /// uniqueness constraints. + #[allow(dead_code, reason = "consumed in M4 MERGE lowering")] + pub(crate) fn label_unique_props(&self, label: &str) -> Vec> { + self.unique_props_by_label + .get(label) + .cloned() + .unwrap_or_default() + } + + /// Declared unique-property tuples for a relationship type. + #[allow(dead_code, reason = "consumed in M4 MERGE lowering")] + pub(crate) fn rel_type_unique_props(&self, rel_type: &str) -> Vec> { + self.unique_props_by_rel_type + .get(rel_type) + .cloned() + .unwrap_or_default() + } +} diff --git a/graph/src/lib.rs b/graph/src/lib.rs index 5beb54a..d8c0ffc 100644 --- a/graph/src/lib.rs +++ b/graph/src/lib.rs @@ -25,6 +25,7 @@ mod builder; mod catalog; mod config; mod connected_components; +mod cypher_facade; mod discover; mod edge_store; mod engine; @@ -146,6 +147,11 @@ pub mod bench_support { name = "graph_bootstrap_sql", requires = [auto_discover] ); +::pgrx::extension_sql_file!( + "../sql/cypher_catalog.sql", + name = "graph_cypher_catalog_sql", + requires = ["graph_bootstrap_sql"] +); // Declare the 'graph' schema so pgrx can satisfy control-file schema checks. #[pg_schema] From 8247ebeb74cb61f9d72050227de67341d5af22fe Mon Sep 17 00:00:00 2001 From: Patrick Hall Date: Thu, 21 May 2026 04:48:12 -0400 Subject: [PATCH 2/2] =?UTF-8?q?feat(cypher):=20adopt=20shipped=20cyrs=200.?= =?UTF-8?q?1.0=20APIs=20=E2=80=94=20drop=20upstream=20workarounds?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cyrs 0.1.0 (crates.io; PRs #56/#58) resolved all 13 feat-request asks. Reflect that on the cypher-frontend branch: * schema_provider.rs: promote labels_compatible / label_unique_props / rel_type_unique_props from inherent stub methods to real cyrs_schema::SchemaProvider trait methods now that cyrs ships them, so cyrs-sema proves MERGE determinism and label-set compatibility upstream instead of at execution time. * 080-open-questions.md: rewrite the Upstream section as a resolved table mapping each Q-UP item to its shipped API; no Q-UP item blocks pgGraph any longer. Internal Q-IN decisions unchanged. * 070-milestones-and-tests.md: drop the "until cyrs ships" workarounds from M3 (multi-label CREATE), M4 (MERGE key extraction), and M5 (ShortestPath operator). Verified: cargo check --lib (rustc 1.95) passes. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../070-milestones-and-tests.md | 21 +-- .../cypher-frontend/080-open-questions.md | 124 +++++++----------- graph/src/cypher_facade/schema_provider.rs | 80 ++++++----- 3 files changed, 93 insertions(+), 132 deletions(-) diff --git a/docs/contributor_guide/cypher-frontend/070-milestones-and-tests.md b/docs/contributor_guide/cypher-frontend/070-milestones-and-tests.md index d5084cd..a6a9de1 100644 --- a/docs/contributor_guide/cypher-frontend/070-milestones-and-tests.md +++ b/docs/contributor_guide/cypher-frontend/070-milestones-and-tests.md @@ -95,8 +95,10 @@ registered tables. Scope: - `cypher_facade/plan_translator/write.rs`. -- `CreateNode` (single label only; multi-label requires the catalog - hook from cyrs §2.3). +- `CreateNode`, including multi-label `CREATE` — + `SchemaProvider::labels_compatible` (cyrs 0.1.0) gates incompatible + label sets in sema; pgGraph backs it with the `allow_label_set` + catalog. - `CreateRel` (both FK-column and junction-table cases). - `SetProperty`, `RemoveProperty`. - `Delete` (no detach). @@ -124,9 +126,11 @@ Scope: - `on_create` / `on_match` dispatch over the `xmax = 0` flag. - Sema gate: `MergeNode` whose key props don't match a registered uniqueness tuple → `E4504`. -- Requires cyrs feat-request §2.1 / §2.2 to be in place; until then - use a temporary embedder-side analysis on `MergeNode.props` to - extract the candidate key. +- `WriteOp::MergeNode.key_props` (cyrs 0.1.0, feat-request §2.1) + carries the candidate key directly; `cyrs-sema` proves MERGE + determinism via `SchemaProvider::label_unique_props` (§2.2). When + the pattern's props are not a literal map, `key_props` is empty and + the facade rejects with `E4504`. Tests: @@ -136,13 +140,12 @@ Tests: ## M5 — Shortest path, hardening, TCK gate (week 8+) -**Goal:** `shortestPath` works (blocked on cyrs §1.1); golden TCK -subset becomes a CI gate. +**Goal:** `shortestPath` works (`cyrs_plan::ReadOp::ShortestPath`, +cyrs 0.1.0); golden TCK subset becomes a CI gate. Scope: -- `ShortestPath` op → `path_finder.rs`. Lands when cyrs ships the - operator. +- `ShortestPath` op → `path_finder.rs`. - All "deferred-to-M5" items from earlier milestones. - Function-coverage table cross-checked against `cyrs_schema::StandardLibrary` (CI test). diff --git a/docs/contributor_guide/cypher-frontend/080-open-questions.md b/docs/contributor_guide/cypher-frontend/080-open-questions.md index c3e97af..0790ada 100644 --- a/docs/contributor_guide/cypher-frontend/080-open-questions.md +++ b/docs/contributor_guide/cypher-frontend/080-open-questions.md @@ -1,86 +1,50 @@ # Open questions Resolve before, or during, implementation. Each question has a -decision owner and a "needed by" milestone. Items blocked on cyrs -upstream cite the section of `../../../../cyrs/feat-request.md` they -depend on. - -## Upstream (blocking on cyrs) - -### Q-UP-1 — `ShortestPath` ReadOp lands in `cyrs-plan` - -- **Status:** awaiting upstream (cyrs `feat-request.md` §1.1). -- **Needed by:** M5. -- **Impact if delayed:** `shortestPath` / `allShortestPaths` - permanently emit `E4530` until landed. We can ship M0–M4 without - it. - -### Q-UP-2 — Path-variable surface in Plan - -- **Status:** awaiting upstream doc clarification (cyrs §1.2). -- **Needed by:** M2 (varlen patterns with `RETURN p`). -- **Workaround:** treat path variables as JSONB shaped like the - existing `path` column from `execute_traverse_rows`. If cyrs - surfaces a structured form, we adapt. - -### Q-UP-3 — MERGE key surface on `WriteOp::MergeNode/MergeRel` - -- **Status:** awaiting upstream (cyrs §2.1). -- **Needed by:** M4. -- **Workaround for M4:** embedder-side analysis of `MergeNode.props` - to extract the key. Remove the workaround when upstream ships. - -### Q-UP-4 — `label_unique_props`, `rel_type_unique_props` on `SchemaProvider` - -- **Status:** awaiting upstream (cyrs §2.2). -- **Needed by:** M4 (so sema can prove MERGE determinism instead of - embedder rejecting at exec time). -- **Workaround:** runtime check in the facade, with `E4504` if the - required unique constraint isn't registered. - -### Q-UP-5 — `labels_compatible` on `SchemaProvider` - -- **Status:** awaiting upstream (cyrs §2.3). -- **Needed by:** M3 (multi-label CREATE). -- **Workaround:** v1 rejects every multi-label CREATE with `E4503` - until cyrs ships the hook. Single-label CREATE is unaffected. - -### Q-UP-6 — Parameter type surface - -- **Status:** awaiting upstream (cyrs §2.4). -- **Needed by:** M1. -- **Workaround:** treat every param as JSONB. Loses some pg-side - type checking; otherwise works. Re-bind once upstream ships. - -### Q-UP-7 — Function builtin enumeration - -- **Status:** awaiting upstream (cyrs §1.3). -- **Needed by:** M2 (any RETURN with function calls). -- **Workaround:** maintain our own bucket table; CI test asserts every - name in cyrs's current set is covered. Drift becomes a CI failure, - not a silent miss. - -### Q-UP-8 — `cyrs-hir::lower_statement` returns `Result` - -- **Status:** awaiting upstream (cyrs §4.1). -- **Needed by:** M1. -- **Workaround:** catch unwinds from `lower_statement` in a panic - boundary and translate to `E0xxx` `42601`. Worse UX than a real - result type. - -### Q-UP-9 — `HirId → byte span` accessor - -- **Status:** awaiting upstream (cyrs §4.2). -- **Needed by:** M1 (for `errposition`). -- **Workaround:** omit `errposition`; users get diagnostics without - caret positioning. Tolerable for v1. - -### Q-UP-10 — Crates.io publication or stable git tag - -- **Status:** awaiting upstream (cyrs §6.1). -- **Needed by:** M0 (cannot pin `Cargo.toml` otherwise). -- **Workaround:** path dependency in development; flip to a tagged - rev before any release. +decision owner and a "needed by" milestone. + +## Upstream (cyrs) — all resolved + +Every upstream ask landed in **cyrs 0.1.0** (19 crates published to +crates.io 2026-05-10; embedder PRs #56 and #58). The detailed problem +statements live in `../../../../cyrs/feat-request.md`; the table below +records what shipped. **No `Q-UP-*` item blocks pgGraph any longer.** + +| ID | Ask | Shipped API | § | +|----|-----|-------------|---| +| Q-UP-1 | `ShortestPath` ReadOp | `cyrs_plan::ReadOp::ShortestPath { input, from, to, rel, kind, bind_path }` | §1.1 | +| Q-UP-2 | Path-variable surface | Documented contract: the plan IR has no `Path` type; the embedder owns path materialisation; `cyrs_hir::VarKind::Path` carries the contract | §1.2 | +| Q-UP-3 | MERGE key surface | `WriteOp::MergeNode` / `MergeRel` gained `key_props: Vec`, populated by lowering when `props` is a literal `Expr::Map` | §2.1 | +| Q-UP-4 | Uniqueness on `SchemaProvider` | `label_unique_props` / `rel_type_unique_props`; `cyrs-sema` now proves MERGE determinism upstream | §2.2 | +| Q-UP-5 | `labels_compatible` | `SchemaProvider::labels_compatible(&[SmolStr]) -> Option` (`None` = no opinion) | §2.3 | +| Q-UP-6 | Typed parameters | `PlanStatement::params`, `ParamRef` + `ParamType` (`Unknown` variant for unconstrained params) | §2.4 | +| Q-UP-7 | Function enumeration | `StandardLibrary::builtin_signature()` — per-function `deterministic` / `null_propagating` metadata; normative builtin enumeration | §1.3 | +| Q-UP-8 | `lower_*` returns `Result` | `lower_statement` / `lower_parse` → `Result` (`ParseFailed` / `Invariant`) | §4.1 | +| Q-UP-9 | `HirId → span` | `Statement::span_of(HirId) -> Option>` — a byte range, ready for `errposition()` | §4.2 | +| Q-UP-10 | Stable release channel | cyrs 0.1.0 — 19 crates on crates.io | §6.1 | + +Two further feat-request items resolved (never tracked as `Q-UP`): +**§3.1** — `E4500..=E4999` is now a formally reserved embedder-owned +diagnostic range, policed by a `DiagCode::ALL` test, so pgGraph's +`E4500`–`E4560` codes cannot collide with cyrs's own. **§5.1 / §5.2** +— the `ReadOp::Filter` 3VL and empty-key `ReadOp::Aggregate` +semantics pgGraph depends on are documented as stable contracts. + +### Consequences for the build + +- The `graph/Cargo.toml` cyrs dependency stays a `../../cyrs` path + dependency through M1–M5 co-development; flip it to the crates.io + `0.1.0` release before any pgGraph release (feat-request §6.1). +- `cypher_facade::schema_provider` now implements `labels_compatible`, + `label_unique_props`, and `rel_type_unique_props` as real + `SchemaProvider` trait methods, not inherent stubs. +- Every "until cyrs ships …" workaround in + `070-milestones-and-tests.md` is gone: M3 multi-label `CREATE`, M4 + MERGE-key extraction, and M5 `shortestPath` build straight against + the shipped API. +- `cypher_facade::diag_to_pg` can use `Statement::span_of` for + `errposition()` carets from M1 — the "omit errposition" workaround + is unnecessary. ## Internal (no upstream dependency) diff --git a/graph/src/cypher_facade/schema_provider.rs b/graph/src/cypher_facade/schema_provider.rs index 41e9406..da8f7f0 100644 --- a/graph/src/cypher_facade/schema_provider.rs +++ b/graph/src/cypher_facade/schema_provider.rs @@ -8,10 +8,11 @@ //! Salsa expectation that `schema_digest()` is stable across calls //! within a single check (spec 0001 §11.2). //! -//! Methods that depend on cyrs feature-request items -//! (`label_unique_props`, `rel_type_unique_props`, `labels_compatible`) -//! are written but rely on the trait defaults until upstream cyrs -//! ships them — see +//! `labels_compatible`, `label_unique_props`, and `rel_type_unique_props` +//! are implemented directly as `SchemaProvider` trait methods: cyrs +//! shipped them in 0.1.0 (feat-request §2.2 / §2.3), so `cyrs-sema` +//! proves MERGE determinism and rejects incompatible label sets +//! upstream rather than at execution time — see //! `docs/contributor_guide/cypher-frontend/080-open-questions.md`. use std::collections::BTreeMap; @@ -136,6 +137,38 @@ impl SchemaProvider for PgGraphSchema { fn schema_digest(&self) -> [u8; 32] { self.digest } + + /// Multi-label compatibility. `Some(true)` for a single label or a + /// combination registered via `allow_label_set`; `Some(false)` + /// otherwise. pgGraph never returns `None` — the catalog is + /// authoritative (see Q-IN-2 in 080-open-questions.md). + fn labels_compatible(&self, labels: &[SmolStr]) -> Option { + if labels.len() <= 1 { + return Some(true); + } + let mut normalised: Vec = labels.to_vec(); + normalised.sort(); + normalised.dedup(); + Some(self.label_sets.iter().any(|s| s == &normalised)) + } + + /// Declared unique-property tuples for a label. `cyrs-sema` uses + /// these to prove MERGE pattern determinism against real Postgres + /// uniqueness constraints. + fn label_unique_props(&self, label: &str) -> Vec> { + self.unique_props_by_label + .get(label) + .cloned() + .unwrap_or_default() + } + + /// Relationship-type analogue of `label_unique_props`. + fn rel_type_unique_props(&self, rel_type: &str) -> Vec> { + self.unique_props_by_rel_type + .get(rel_type) + .cloned() + .unwrap_or_default() + } } // ────────────────────────────────────────────────────────────────── @@ -423,42 +456,3 @@ fn compute_digest( out } -// ────────────────────────────────────────────────────────────────── -// Accessors for future milestones (M1+ translator) -// ────────────────────────────────────────────────────────────────── - -impl PgGraphSchema { - /// Whether a multi-label combination is permitted. `None` = not - /// declared in the catalog (we treat as "rejected" at execution - /// time — see open question Q-IN-2 in 080-open-questions.md). - #[allow(dead_code, reason = "consumed in M3 write-side translation")] - pub(crate) fn labels_compatible_check(&self, labels: &[SmolStr]) -> Option { - if labels.len() <= 1 { - return Some(true); - } - let mut normalised: Vec = labels.to_vec(); - normalised.sort(); - normalised.dedup(); - Some(self.label_sets.iter().any(|s| s == &normalised)) - } - - /// Declared unique-property tuples for a label. Used in M4 to - /// validate MERGE pattern determinism against real Postgres - /// uniqueness constraints. - #[allow(dead_code, reason = "consumed in M4 MERGE lowering")] - pub(crate) fn label_unique_props(&self, label: &str) -> Vec> { - self.unique_props_by_label - .get(label) - .cloned() - .unwrap_or_default() - } - - /// Declared unique-property tuples for a relationship type. - #[allow(dead_code, reason = "consumed in M4 MERGE lowering")] - pub(crate) fn rel_type_unique_props(&self, rel_type: &str) -> Vec> { - self.unique_props_by_rel_type - .get(rel_type) - .cloned() - .unwrap_or_default() - } -}