diff --git a/.gitignore b/.gitignore index 6fdf759..dcb1cd7 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,10 @@ dev/ .wrangler/ .env .mcp.json -.vscode/ \ No newline at end of file +.vscode/ + +# Local-only planning doc (public repo) +PLAN.md + +# XSD/spec artifacts: pulled by scripts/fetch-xsd.ts; never committed. +data/xsd-cache/ \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index fefba79..0ec3ed4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -29,15 +29,22 @@ apps/ src/data/docs.ts ← All doc pages live here (single source of truth) src/components/ UI components (Sidebar, SuperDocPreview, etc.) src/pages/ Route pages (Home, Docs, SpecExplorer, Mcp) - mcp-server/ Cloudflare Worker — MCP server for AI spec search + mcp-server/ Cloudflare Worker - MCP server (semantic + structural tools) packages/ shared/ Database client, embedding client, types scripts/ - ingest/ PDF → chunks → embeddings → database pipeline + ingest-pdf/ ECMA PDF -> spec_content (semantic search corpus) + ingest-xsd/ ECMA XSDs -> schema graph (structural query corpus) + sources-sync.ts data/sources.json -> reference_sources + db-migrate.ts Apply db/migrations/*.sql in order db/ - schema.sql PostgreSQL + pgvector schema + schema.sql PostgreSQL + pgvector + XSD schema graph + migrations/ Numbered, idempotent SQL migrations +data/ + sources.json Source manifest (artifact URLs, sha256, license notes) + xsd-cache/ Local-only XSD download cache (gitignored) dev/ - data/ Extracted/chunked/embedded spec content + data/ Extracted/chunked/embedded PDF content ``` ## Commands @@ -97,23 +104,52 @@ The XML you provide is wrapped in a minimal `w:document > w:body` structure auto ## MCP Server -Cloudflare Worker exposing three MCP tools for semantic spec search: +Cloudflare Worker exposing two flavors of MCP tools backed by the same database. 
-- `search_ecma_spec` — semantic vector search across 18,000+ spec chunks -- `get_section` — fetch a specific section by ID (e.g., "17.3.1.24") -- `list_parts` — browse the spec structure +Semantic search over the spec PDF (powered by `spec_content`): + +- `search_ecma_spec` - semantic vector search across 18,000+ spec chunks +- `get_section` - fetch a specific section by ID (e.g., "17.3.1.24") +- `list_parts` - browse the spec structure + +Structural queries over the XSD schema graph (powered by `xsd_*` tables): + +- `ooxml_lookup_element` / `ooxml_lookup_type` - canonical symbol info +- `ooxml_children` - legal children of an element/type/group, in document order +- `ooxml_attributes` - attributes including those inherited and unfolded from attributeGroup refs +- `ooxml_enum` - simpleType enumeration values +- `ooxml_namespace_info` - vocabularies and per-profile symbol counts for a namespace URI Uses PostgreSQL with pgvector (Neon serverless in production, Docker locally). -## Data Pipeline +## Data Pipelines + +Two ingest paths feed the same database. Both are reproducible from `data/sources.json`. -Ingests ECMA-376 PDFs into the vector database: +**PDF (semantic corpus, into `spec_content`)**: ``` PDF → extract (Python) → chunk (6KB) → embed (Voyage) → upload (PostgreSQL) ``` -Run the full pipeline: `bun scripts/ingest/pipeline.ts` +```bash +bun run pdf:ingest 1 ./pdfs/ECMA-376-Part1.pdf # full pipeline for one part +``` + +See `scripts/ingest-pdf/README.md`. + +**XSD (structural corpus, into `xsd_*` tables)**: + +``` +ECMA Part 4 zip → fetch+verify (sha256) → parse → ingest (single transaction) +``` + +```bash +bun run xsd:fetch # URL + sha256 from data/sources.json +bun run xsd:ingest +``` + +See `scripts/ingest-xsd/README.md`. ## Database diff --git a/README.md b/README.md index 2f0da7e..768affc 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,10 @@ The OOXML spec, explained by people who actually implemented it. 
An interactive reference for ECMA-376 (Office Open XML) built by the [SuperDoc — DOCX editing and tooling](https://superdoc.dev) team. Every page combines XML structure, live rendered previews, and implementation notes that tell you what the spec doesn't. -- **Live previews** — Edit XML and see it render in real-time. Every example is a working document. -- **Implementation notes** — Where Word diverges from the spec, what will break your code, and what to do about it. -- **Semantic spec search** — 18,000+ spec chunks searchable by meaning via MCP server. +- **Live previews** - Edit XML and see it render in real-time. Every example is a working document. +- **Implementation notes** - Where Word diverges from the spec, what will break your code, and what to do about it. +- **Semantic spec search** - 18,000+ spec chunks searchable by meaning via MCP server. +- **Structural schema lookup** - Element children, attributes, types, enums, namespaces. Same MCP server, deterministic answers from the parsed XSDs. ## Why? @@ -22,13 +23,16 @@ We faced this at SuperDoc — building a document engine on native OOXML with no ## MCP Server -Search the ECMA-376 spec with AI. Ask questions in natural language, get answers grounded in the actual specification. +Ask questions in natural language and get answers grounded in the spec, or query the schema graph for precise structural answers. ```bash claude mcp add --transport http ecma-spec https://api.ooxml.dev/mcp ``` -Works with Claude Code, Cursor, and any MCP-compatible client. Three tools: `search_ecma_spec` (semantic search), `get_section` (by ID), and `list_parts` (browse structure). +Works with Claude Code, Cursor, and any MCP-compatible client. 
Two flavors of tools share one server: + +- **Semantic** (over the spec PDF): `search_ecma_spec`, `get_section`, `list_parts` +- **Structural** (over the parsed XSDs): `ooxml_lookup_element`, `ooxml_lookup_type`, `ooxml_children`, `ooxml_attributes`, `ooxml_enum`, `ooxml_namespace_info` ## Development diff --git a/apps/mcp-server/src/mcp.ts b/apps/mcp-server/src/mcp.ts index 9bf7f22..6af618f 100644 --- a/apps/mcp-server/src/mcp.ts +++ b/apps/mcp-server/src/mcp.ts @@ -7,6 +7,7 @@ import { createDb } from "./db"; import { embedQuery } from "./embeddings"; import type { Env } from "./index"; +import { callOoxmlTool, isOoxmlTool, OOXML_TOOL_DEFS } from "./ooxml-tools"; // JSON-RPC types interface JsonRpcRequest { @@ -136,9 +137,7 @@ function handleToolsList(id: number | string | null): JsonRpcResponse { return { jsonrpc: "2.0", id, - result: { - tools: TOOLS, - }, + result: { tools: [...TOOLS, ...OOXML_TOOL_DEFS] }, }; } @@ -162,6 +161,17 @@ async function handleToolsCall( try { let resultText: string; + // Structural OOXML tools share the dispatch with the existing semantic + // tools below. + if (isOoxmlTool(name)) { + resultText = await callOoxmlTool(name, args ?? {}, env); + return { + jsonrpc: "2.0", + id, + result: { content: [{ type: "text", text: resultText }] }, + }; + } + switch (name) { case "search_ecma_spec": { const query = args?.query as string; diff --git a/apps/mcp-server/src/ooxml-queries.ts b/apps/mcp-server/src/ooxml-queries.ts new file mode 100644 index 0000000..7950974 --- /dev/null +++ b/apps/mcp-server/src/ooxml-queries.ts @@ -0,0 +1,615 @@ +/** + * Read-only schema-graph queries powering the OOXML MCP tools: + * ooxml_lookup_element, ooxml_lookup_type, ooxml_children, + * ooxml_attributes, ooxml_enum, ooxml_namespace_info. + * + * These take a tagged-template SQL function (Neon in the deployed Worker, + * postgres.js in local tests). All queries are profile-scoped and walk + * inheritance chains where it matters. 
+ */ + +// biome-ignore lint/suspicious/noExplicitAny: tagged-template sql differs between neon and postgres. +type Sql = any; + +/** + * Common OOXML prefix -> namespace map for parsing user qnames like "w:tbl". + * Documents may use other bindings; for those, callers can pass Clark form + * `{namespace}localName` or just `localName` and accept the WML default. + * + * `xsd` and `xs` are both bound to the XML Schema namespace. + */ +const COMMON_PREFIXES: Record<string, string> = { + w: "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + r: "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + s: "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes", + m: "http://schemas.openxmlformats.org/officeDocument/2006/math", + a: "http://schemas.openxmlformats.org/drawingml/2006/main", + wp: "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + pic: "http://schemas.openxmlformats.org/drawingml/2006/picture", + c: "http://schemas.openxmlformats.org/drawingml/2006/chart", + dgm: "http://schemas.openxmlformats.org/drawingml/2006/diagram", + xsd: "http://www.w3.org/2001/XMLSchema", + xs: "http://www.w3.org/2001/XMLSchema", + xml: "http://www.w3.org/XML/1998/namespace", +}; + +/** Namespace assumed for bare local names: the WordprocessingML main namespace (the `w` binding). */ +const DEFAULT_NAMESPACE = COMMON_PREFIXES.w; + +export interface ParsedQName { + namespace: string; + localName: string; + rawPrefix: string | null; +} + +export type QNameParseResult = { ok: true; qname: ParsedQName } | { ok: false; reason: string }; + +/** + * Parse a user-supplied qname. 
Accepts: + * - `prefix:localName` for known OOXML prefixes (w, r, s, m, a, wp, pic, c, dgm, xsd, xs, xml) + * - `{namespace}localName` Clark form + * - bare `localName` (assumes WML main namespace) + * + * Prefixes are matched against the own keys of COMMON_PREFIXES only, and an + * empty local name is rejected in both the Clark and the prefixed form. + * + * @returns `{ ok: true, qname }` on success, or `{ ok: false, reason }` with a + * human-readable explanation when the input cannot be parsed. + */ +export function parseQName(raw: string): QNameParseResult { + if (!raw) return { ok: false, reason: "empty qname" }; + if (raw.startsWith("{")) { + const close = raw.indexOf("}"); + if (close < 0) return { ok: false, reason: "malformed Clark qname (missing })" }; + const namespace = raw.slice(1, close); + const localName = raw.slice(close + 1); + if (!localName) return { ok: false, reason: "missing local name in Clark qname" }; + return { ok: true, qname: { namespace, localName, rawPrefix: null } }; + } + const colon = raw.indexOf(":"); + if (colon < 0) { + return { + ok: true, + qname: { namespace: DEFAULT_NAMESPACE, localName: raw, rawPrefix: null }, + }; + } + const prefix = raw.slice(0, colon); + const localName = raw.slice(colon + 1); + /* Own-key check: a plain index lookup would also match inherited Object.prototype members such as "constructor" or "toString". */ + const namespace = Object.prototype.hasOwnProperty.call(COMMON_PREFIXES, prefix) + ? COMMON_PREFIXES[prefix] + : undefined; + if (!namespace) { + return { + ok: false, + reason: `unknown prefix '${prefix}'. Use a known prefix (w, r, s, m, a, wp, pic, c, dgm), or Clark form {namespace}localName.`, + }; + } + /* Mirror the Clark branch: "w:" names nothing, so report it as a parse failure. */ + if (!localName) return { ok: false, reason: "missing local name after prefix" }; + return { ok: true, qname: { namespace, localName, rawPrefix: prefix } }; +} + +/** A symbol matched by lookupSymbol, together with the namespace, profile and source it was matched in. */ +export interface SymbolHit { + id: number; + vocabularyId: string; + localName: string; + kind: string; + typeRef: string | null; + namespaceUri: string; + profileName: string; + sourceName: string | null; +} + +/** + * Look up a top-level symbol by namespace + localName + kind in a given profile. + * + * Local element symbols (parent_symbol_id IS NOT NULL) are intentionally excluded: + * an inline `<xsd:element>` declared in two different + * complexTypes is two distinct symbols whose identity depends on context. Reach + * those through `getChildren(parentTypeSymbolId, profile)` instead. 
+ */ +export async function lookupSymbol( + sql: Sql, + namespace: string, + localName: string, + kind: string, + profile: string, +): Promise<SymbolHit | null> { + const rows = await sql` + SELECT s.id, s.vocabulary_id, s.local_name, s.kind, s.type_ref, + ns.uri AS namespace_uri, p.name AS profile_name, src.name AS source_name + FROM xsd_symbols s + JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id + JOIN xsd_namespaces ns ON ns.id = sp.namespace_id + JOIN xsd_profiles p ON p.id = sp.profile_id + LEFT JOIN reference_sources src ON src.id = sp.source_id + WHERE s.local_name = ${localName} + AND s.kind = ${kind} + AND s.parent_symbol_id IS NULL + AND ns.uri = ${namespace} + AND p.name = ${profile} + LIMIT 1 + `; + const r = rows[0]; + if (!r) return null; + return { + id: r.id as number, + vocabularyId: r.vocabulary_id as string, + localName: r.local_name as string, + kind: r.kind as string, + typeRef: r.type_ref as string | null, + namespaceUri: r.namespace_uri as string, + profileName: r.profile_name as string, + sourceName: r.source_name as string | null, + }; +} + +/** Look up an element by qname in a profile. Resolves to null when no top-level element matches. */ +export function lookupElement( + sql: Sql, + namespace: string, + localName: string, + profile: string, +): Promise<SymbolHit | null> { + return lookupSymbol(sql, namespace, localName, "element", profile); +} + +/** + * Look up a type symbol (complexType OR simpleType) by qname. + * Tries complexType first, then simpleType. Resolves to null when neither exists. + */ +export async function lookupType( + sql: Sql, + namespace: string, + localName: string, + profile: string, +): Promise<SymbolHit | null> { + const ct = await lookupSymbol(sql, namespace, localName, "complexType", profile); + if (ct) return ct; + return lookupSymbol(sql, namespace, localName, "simpleType", profile); +} + +/** + * Resolve a Clark-style type_ref (e.g. {ns}local) to the type symbol it points at. 
+ */ +export async function lookupSymbolByTypeRef( + sql: Sql, + typeRef: string, + profile: string, +): Promise<SymbolHit | null> { + if (!typeRef.startsWith("{")) return null; + const close = typeRef.indexOf("}"); + if (close < 0) return null; + const namespace = typeRef.slice(1, close); + const localName = typeRef.slice(close + 1); + return lookupType(sql, namespace, localName, profile); +} + +export interface ChildEdge { + kind: "element" | "group"; + localName: string; + vocabularyId: string; + namespaceUri: string | null; + minOccurs: number; + maxOccurs: number | null; + orderIndex: number; + compositorKind: string | null; + compositorId: number | null; + parentCompositorId: number | null; + /** Compositor stack from outermost to direct parent, e.g. ["sequence(1..1)", "choice(0..unbounded)"]. */ + compositorPath: string[]; + source: "self" | "inherited"; + owningTypeName: string; +} + +interface InheritanceEdgeRow { + baseId: number; + relation: "extension" | "restriction"; +} + +/** Fetch the inheritance edge (base symbol id + relation) recorded for a symbol in a profile, or null when there is none. */ +async function getInheritanceEdge( + sql: Sql, + symbolId: number, + profile: string, +): Promise<InheritanceEdgeRow | null> { + const rows = await sql` + SELECT e.base_symbol_id, e.relation + FROM xsd_inheritance_edges e + JOIN xsd_profiles p ON p.id = e.profile_id + WHERE e.symbol_id = ${symbolId} AND p.name = ${profile} + LIMIT 1 + `; + if (rows.length === 0) return null; + return { + baseId: rows[0].base_symbol_id as number, + relation: rows[0].relation as InheritanceEdgeRow["relation"], + }; +} + +/** Local name of a symbol by id; falls back to "(unknown)" when no row is found. */ +async function getSymbolName(sql: Sql, symbolId: number): Promise<string> { + const rows = await sql`SELECT local_name FROM xsd_symbols WHERE id = ${symbolId} LIMIT 1`; + return (rows[0]?.local_name as string | undefined) ?? "(unknown)"; +} + +interface CompositorRow { + id: number; + kind: "sequence" | "choice" | "all"; + minOccurs: number; + maxOccurs: number | null; + orderIndex: number; +} + +/** Render an occurrence range as "min..max", writing "unbounded" when max is null. */ +function formatOccurs(min: number, max: number | null): string { + const maxStr = max === null ? 
"unbounded" : String(max); + return `${min}..${maxStr}`; +} + +/** + * Walk a single compositor's content tree in document order, descending into + * nested compositors. Each emitted child carries the full compositor path so + * callers can reconstruct nesting. + */ +async function walkCompositor( + sql: Sql, + compositor: CompositorRow, + profile: string, + pathSoFar: string[], + source: ChildEdge["source"], + owningTypeName: string, +): Promise<ChildEdge[]> { + const path = [ + ...pathSoFar, + `${compositor.kind}(${formatOccurs(compositor.minOccurs, compositor.maxOccurs)})`, + ]; + + const elemRows = await sql` + SELECT 'element' AS entry_kind, s.local_name, s.vocabulary_id, ns.uri AS namespace_uri, + e.min_occurs, e.max_occurs, e.order_index, NULL::int AS nested_compositor_id + FROM xsd_child_edges e + JOIN xsd_symbols s ON s.id = e.child_symbol_id + LEFT JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id AND sp.profile_id = e.profile_id + LEFT JOIN xsd_namespaces ns ON ns.id = sp.namespace_id + JOIN xsd_profiles p ON p.id = e.profile_id + WHERE e.compositor_id = ${compositor.id} AND p.name = ${profile} + `; + const groupRows = await sql` + SELECT 'group' AS entry_kind, g.local_name, g.vocabulary_id, NULL AS namespace_uri, + ge.min_occurs, ge.max_occurs, ge.order_index, NULL::int AS nested_compositor_id + FROM xsd_group_edges ge + JOIN xsd_symbols g ON g.id = ge.group_symbol_id + JOIN xsd_profiles p ON p.id = ge.profile_id + WHERE ge.compositor_id = ${compositor.id} AND ge.ref_kind = 'group' AND p.name = ${profile} + `; + const nestedRows = await sql` + SELECT 'compositor' AS entry_kind, NULL AS local_name, NULL AS vocabulary_id, NULL AS namespace_uri, + c.min_occurs, c.max_occurs, c.order_index, c.id AS nested_compositor_id, c.kind + FROM xsd_compositors c + JOIN xsd_profiles p ON p.id = c.profile_id + WHERE c.parent_compositor_id = ${compositor.id} AND p.name = ${profile} + `; + + const all = [...elemRows, ...groupRows, 
...nestedRows]; + /* Interleave elements, group refs and nested compositors back into declaration order. */ + all.sort((a, b) => (a.order_index as number) - (b.order_index as number)); + + const out: ChildEdge[] = []; + for (const r of all) { + if (r.entry_kind === "compositor") { + const nested: CompositorRow = { + id: r.nested_compositor_id as number, + kind: r.kind as CompositorRow["kind"], + minOccurs: r.min_occurs as number, + maxOccurs: r.max_occurs as number | null, + orderIndex: r.order_index as number, + }; + const inner = await walkCompositor(sql, nested, profile, path, source, owningTypeName); + out.push(...inner); + } else { + out.push({ + kind: r.entry_kind as "element" | "group", + localName: r.local_name as string, + vocabularyId: r.vocabulary_id as string, + namespaceUri: (r.namespace_uri as string | null) ?? null, + minOccurs: r.min_occurs as number, + maxOccurs: r.max_occurs as number | null, + orderIndex: r.order_index as number, + compositorKind: compositor.kind, + compositorId: compositor.id, + parentCompositorId: null, + compositorPath: path, + source, + owningTypeName, + }); + } + } + return out; +} + +/** + * Children of a type symbol in correct document order. Walks inheritance per + * XSD semantics: complexContent/extension prepends the base's effective content + * before the derived type's; complexContent/restriction REPLACES the base's + * content (we don't include the base). Within a type, walks the compositor + * tree DFS so nested sequences/choices flatten in document order. + * + * Group refs are returned as edges; resolve them by calling getChildren on the + * group symbol. + */ +export async function getChildren( + sql: Sql, + rootSymbolId: number, + profile: string, +): Promise<ChildEdge[]> { + return getChildrenRecursive(sql, rootSymbolId, profile, true); +} + +async function getChildrenRecursive( + sql: Sql, + symbolId: number, + profile: string, + isRoot: boolean, +): Promise<ChildEdge[]> { + const out: ChildEdge[] = []; + + // Inheritance: extension prepends base content; restriction replaces it. 
+ // Recursing with isRoot=false sets source="inherited" inside the base call, + // so we just push the entries through. + const inherit = await getInheritanceEdge(sql, symbolId, profile); + if (inherit && inherit.relation === "extension") { + const base = await getChildrenRecursive(sql, inherit.baseId, profile, false); + out.push(...base); + } + + // Walk this type's own top-level compositors. + const topCompositors = await sql` + SELECT c.id, c.kind, c.min_occurs, c.max_occurs, c.order_index + FROM xsd_compositors c + JOIN xsd_profiles p ON p.id = c.profile_id + WHERE c.parent_symbol_id = ${symbolId} AND p.name = ${profile} + ORDER BY c.order_index + `; + const ownName = await getSymbolName(sql, symbolId); + const source: ChildEdge["source"] = isRoot ? "self" : "inherited"; + for (const r of topCompositors) { + const c: CompositorRow = { + id: r.id as number, + kind: r.kind as CompositorRow["kind"], + minOccurs: r.min_occurs as number, + maxOccurs: r.max_occurs as number | null, + orderIndex: r.order_index as number, + }; + const inner = await walkCompositor(sql, c, profile, [], source, ownName); + out.push(...inner); + } + + // Top-level group refs that hang directly off the type (compositor_id IS NULL). 
+ const topLevelGroups = await sql` + SELECT g.local_name, g.vocabulary_id, ge.min_occurs, ge.max_occurs, ge.order_index + FROM xsd_group_edges ge + JOIN xsd_symbols g ON g.id = ge.group_symbol_id + JOIN xsd_profiles p ON p.id = ge.profile_id + WHERE ge.parent_symbol_id = ${symbolId} + AND ge.ref_kind = 'group' + AND ge.compositor_id IS NULL + AND p.name = ${profile} + ORDER BY ge.order_index + `; + for (const r of topLevelGroups) { + out.push({ + kind: "group", + localName: r.local_name as string, + vocabularyId: r.vocabulary_id as string, + namespaceUri: null, + minOccurs: r.min_occurs as number, + maxOccurs: r.max_occurs as number | null, + orderIndex: r.order_index as number, + compositorKind: null, + compositorId: null, + parentCompositorId: null, + compositorPath: [], + source, + owningTypeName: ownName, + }); + } + + return out; +} + +export interface AttrEntry { + localName: string; + attrUse: "required" | "optional" | "prohibited"; + defaultValue: string | null; + fixedValue: string | null; + typeRef: string | null; + source: "self" | "inherited" | "attributeGroup"; + owningName: string; +} + +/** + * Attributes on a type symbol, applying XSD §3.4.2.2 inheritance: + * - extension: derived's own attribute uses are unioned with the base's. + * - restriction: derived's attribute uses also union with the base's, with + * the derived narrowing or prohibiting individual entries. Restriction + * CANNOT silently drop a base attribute; only `use="prohibited"` does. + * + * Walk order emits the derived type's own attributes first, then attributeGroup + * refs the derived holds, then recurses into the base. Names are de-duplicated + * by first occurrence, so a derived redeclaration wins and base attrs only + * surface when the derived didn't override them. attributeGroup nesting is + * walked recursively with a visited-set against cycles. 
+ */ +export async function getAttributes( + sql: Sql, + rootSymbolId: number, + profile: string, +): Promise<AttrEntry[]> { + const out: AttrEntry[] = []; + /* Attribute local names already emitted; first occurrence wins. */ + const seenAttrs = new Set<string>(); + /* attributeGroup symbol ids already expanded; guards against reference cycles. */ + const visitedGroups = new Set<number>(); + await collectAttrsForType(sql, rootSymbolId, profile, true, out, seenAttrs, visitedGroups); + return out; +} + +async function collectAttrsForType( + sql: Sql, + symbolId: number, + profile: string, + isRoot: boolean, + out: AttrEntry[], + seenAttrs: Set<string>, + visitedGroups: Set<number>, +): Promise<void> { + const ownName = await getSymbolName(sql, symbolId); + + // Direct attribute declarations on this symbol (whether complexType or + // attributeGroup; both can carry xsd:attribute children). Emit first so + // derived redeclarations override base attrs found below. + const directAttrs = await sql` + SELECT a.local_name, a.attr_use, a.default_value, a.fixed_value, a.type_ref, a.order_index + FROM xsd_attr_edges a + JOIN xsd_profiles p ON p.id = a.profile_id + WHERE a.symbol_id = ${symbolId} AND p.name = ${profile} + ORDER BY a.order_index + `; + for (const r of directAttrs) { + const name = r.local_name as string; + if (seenAttrs.has(name)) continue; + seenAttrs.add(name); + out.push({ + localName: name, + attrUse: r.attr_use as "required" | "optional" | "prohibited", + defaultValue: r.default_value as string | null, + fixedValue: r.fixed_value as string | null, + typeRef: r.type_ref as string | null, + source: isRoot ? "self" : "inherited", + owningName: ownName, + }); + } + + // attributeGroup refs the derived itself holds; recurse into each before + // touching the base so a derived's group-bundled attr also wins. 
+ const agRefs = await sql` + SELECT ge.group_symbol_id + FROM xsd_group_edges ge + JOIN xsd_profiles p ON p.id = ge.profile_id + WHERE ge.parent_symbol_id = ${symbolId} + AND ge.ref_kind = 'attributeGroup' + AND p.name = ${profile} + ORDER BY ge.order_index + `; + for (const ag of agRefs) { + await collectAttrsFromAttributeGroup( + sql, + ag.group_symbol_id as number, + profile, + out, + seenAttrs, + visitedGroups, + ); + } + + // Inherited base attrs. Both extension and restriction inherit attribute uses + // per XSD §3.4.2.2; restriction can override or prohibit but cannot drop + // silently. Dedup by seenAttrs so the derived's redeclarations win. + const inherit = await getInheritanceEdge(sql, symbolId, profile); + if (inherit) { + await collectAttrsForType(sql, inherit.baseId, profile, false, out, seenAttrs, visitedGroups); + } +} + +async function collectAttrsFromAttributeGroup( + sql: Sql, + groupSymbolId: number, + profile: string, + out: AttrEntry[], + seenAttrs: Set<string>, + visitedGroups: Set<number>, +): Promise<void> { + if (visitedGroups.has(groupSymbolId)) return; + visitedGroups.add(groupSymbolId); + + const groupName = await getSymbolName(sql, groupSymbolId); + + const directAttrs = await sql` + SELECT a.local_name, a.attr_use, a.default_value, a.fixed_value, a.type_ref, a.order_index + FROM xsd_attr_edges a + JOIN xsd_profiles p ON p.id = a.profile_id + WHERE a.symbol_id = ${groupSymbolId} AND p.name = ${profile} + ORDER BY a.order_index + `; + for (const r of directAttrs) { + const name = r.local_name as string; + if (seenAttrs.has(name)) continue; + seenAttrs.add(name); + out.push({ + localName: name, + attrUse: r.attr_use as "required" | "optional" | "prohibited", + defaultValue: r.default_value as string | null, + fixedValue: r.fixed_value as string | null, + typeRef: r.type_ref as string | null, + source: "attributeGroup", + owningName: groupName, + }); + } + + // Nested attributeGroup refs inside this group. 
+ const innerRefs = await sql` + SELECT ge.group_symbol_id + FROM xsd_group_edges ge + JOIN xsd_profiles p ON p.id = ge.profile_id + WHERE ge.parent_symbol_id = ${groupSymbolId} + AND ge.ref_kind = 'attributeGroup' + AND p.name = ${profile} + ORDER BY ge.order_index + `; + for (const ref of innerRefs) { + await collectAttrsFromAttributeGroup( + sql, + ref.group_symbol_id as number, + profile, + out, + seenAttrs, + visitedGroups, + ); + } +} + +export interface EnumEntry { + value: string; + orderIndex: number; +} + +/** + * Enumeration values recorded for a symbol in a profile, ordered by order_index. + * Resolves to an empty array when the symbol has no enum rows. + */ +export async function getEnums(sql: Sql, symbolId: number, profile: string): Promise<EnumEntry[]> { + const rows = await sql` + SELECT e.value, e.order_index + FROM xsd_enums e + JOIN xsd_profiles p ON p.id = e.profile_id + WHERE e.symbol_id = ${symbolId} AND p.name = ${profile} + ORDER BY e.order_index + `; + return rows.map((r: Record<string, unknown>) => ({ + value: r.value as string, + orderIndex: r.order_index as number, + })); +} + +export interface NamespaceInfo { + uri: string; + vocabularies: string[]; + profiles: Array<{ name: string; symbolCount: number }>; +} + +/** + * Summarize a namespace URI: the distinct vocabulary ids seen across all profiles + * (sorted) and, per profile, how many symbols are registered under the namespace. + * Resolves to null when the URI is not present in xsd_namespaces. + */ +export async function getNamespaceInfo(sql: Sql, uri: string): Promise<NamespaceInfo | null> { + const nsRows = await sql`SELECT id FROM xsd_namespaces WHERE uri = ${uri} LIMIT 1`; + if (nsRows.length === 0) return null; + const nsId = nsRows[0].id as number; + + const profileRows = await sql` + SELECT p.name AS profile_name, COUNT(*)::int AS symbol_count, + array_agg(DISTINCT s.vocabulary_id) AS vocabularies + FROM xsd_symbol_profiles sp + JOIN xsd_profiles p ON p.id = sp.profile_id + JOIN xsd_symbols s ON s.id = sp.symbol_id + WHERE sp.namespace_id = ${nsId} + GROUP BY p.name + ORDER BY p.name + `; + + const vocabSet = new Set<string>(); + const profiles: NamespaceInfo["profiles"] = []; + for (const r of profileRows) { + profiles.push({ + name: r.profile_name as string, + symbolCount: r.symbol_count as number, + }); + for (const v of (r.vocabularies as string[]) ?? 
[]) vocabSet.add(v); + } + return { uri, vocabularies: [...vocabSet].sort(), profiles }; +} diff --git a/apps/mcp-server/src/ooxml-tools.ts b/apps/mcp-server/src/ooxml-tools.ts new file mode 100644 index 0000000..3dcaad4 --- /dev/null +++ b/apps/mcp-server/src/ooxml-tools.ts @@ -0,0 +1,408 @@ +/** + * Read-only structural MCP tools backed by the OOXML schema graph. + * + * Tools: + * ooxml_lookup_element, ooxml_lookup_type, ooxml_children, + * ooxml_attributes, ooxml_enum, ooxml_namespace_info. + * + * Default profile is `transitional`. Future profiles (e.g. word-compatible-docx) + * will compose Transitional with Office extension schemas. + */ + +import { neon } from "@neondatabase/serverless"; +import { + type AttrEntry, + type ChildEdge, + type EnumEntry, + getAttributes, + getChildren, + getEnums, + getNamespaceInfo, + lookupElement, + lookupSymbol, + lookupSymbolByTypeRef, + lookupType, + type NamespaceInfo, + parseQName, + type SymbolHit, +} from "./ooxml-queries"; + +export const DEFAULT_PROFILE = "transitional"; + +export interface OoxmlEnv { + DATABASE_URL: string; +} + +export const OOXML_TOOL_DEFS = [ + { + name: "ooxml_lookup_element", + description: + "Look up an OOXML element by qname in a profile. Returns canonical symbol info (vocabulary, namespace, declared @type, profile membership, source). Accepts 'w:tbl', '{namespace}localName' (Clark form), or bare 'localName' (defaults to wml-main).", + inputSchema: { + type: "object" as const, + properties: { + qname: { type: "string", description: "Element qname, e.g. 'w:tbl' or '{...}tbl'." }, + profile: { + type: "string", + description: "Profile name (default: 'transitional').", + }, + }, + required: ["qname"], + }, + }, + { + name: "ooxml_lookup_type", + description: + "Look up a complexType or simpleType by qname in a profile. Tries complexType first, then simpleType.", + inputSchema: { + type: "object" as const, + properties: { + qname: { type: "string", description: "Type qname, e.g. 
'w:CT_Tbl' or 'CT_Tbl'." }, + profile: { type: "string", description: "Profile name (default: 'transitional')." }, + }, + required: ["qname"], + }, + }, + { + name: "ooxml_children", + description: + "List the legal children of an element or complexType in document order. For an element, follows @type to its complexType first. Walks inheritance to union content from base types. Group refs are surfaced as-is; resolve them by calling ooxml_children on the group qname.", + inputSchema: { + type: "object" as const, + properties: { + qname: { + type: "string", + description: + "Element, complexType, or group qname (e.g. 'w:tbl', 'CT_Tbl', 'EG_PContent').", + }, + profile: { type: "string", description: "Profile name (default: 'transitional')." }, + }, + required: ["qname"], + }, + }, + { + name: "ooxml_attributes", + description: + "List the attributes of an element or complexType. For an element, follows @type to its complexType first. Walks inheritance and unfolds attributeGroup refs recursively. Each entry includes use (required/optional/prohibited), default, fixed, and type_ref.", + inputSchema: { + type: "object" as const, + properties: { + qname: { type: "string", description: "Element or complexType qname." }, + profile: { type: "string", description: "Profile name (default: 'transitional')." }, + }, + required: ["qname"], + }, + }, + { + name: "ooxml_enum", + description: + "List enumeration values for a simpleType. Pass the simpleType qname (e.g. 'w:ST_Jc' or 'ST_Jc') and get back the values in declaration order.", + inputSchema: { + type: "object" as const, + properties: { + qname: { type: "string", description: "simpleType qname." }, + profile: { type: "string", description: "Profile name (default: 'transitional')." 
}, + }, + required: ["qname"], + }, + }, + { + name: "ooxml_namespace_info", + description: + "Show what's known about a namespace URI: vocabularies, profiles that include it, and how many symbols each profile contributes.", + inputSchema: { + type: "object" as const, + properties: { + uri: { type: "string", description: "Namespace URI." }, + }, + required: ["uri"], + }, + }, +]; + +export type OoxmlToolName = + | "ooxml_lookup_element" + | "ooxml_lookup_type" + | "ooxml_children" + | "ooxml_attributes" + | "ooxml_enum" + | "ooxml_namespace_info"; + +/** Tool names derived from OOXML_TOOL_DEFS, so the guard below follows the definitions list. */ +const OOXML_TOOL_NAMES: ReadonlySet<string> = new Set(OOXML_TOOL_DEFS.map((t) => t.name)); + +/** Type guard: true when `name` is one of the structural OOXML tools defined in OOXML_TOOL_DEFS. */ +export function isOoxmlTool(name: string): name is OoxmlToolName { + return OOXML_TOOL_NAMES.has(name); +} + +// biome-ignore lint/suspicious/noExplicitAny: neon's tagged template is loosely typed. +type Sql = any; + +/** + * Worker-side entry point: constructs a Neon HTTP client from env and dispatches. + * Local CLIs and tests should call `runOoxmlTool` directly with their own sql + * (e.g. postgres.js against a local Postgres) to avoid the Neon HTTP path. + */ +export async function callOoxmlTool( + name: OoxmlToolName, + args: Record<string, unknown>, + env: OoxmlEnv, +): Promise<string> { + const sql = neon(env.DATABASE_URL); + return runOoxmlTool(name, args, sql); +} + +/** + * Driver-agnostic dispatch. `sql` is any tagged-template SQL function whose + * shape matches `(strings, ...values) => Promise<Row[]>` (Neon and postgres.js + * both qualify). + */ +export async function runOoxmlTool( + name: OoxmlToolName, + args: Record<string, unknown>, + sql: Sql, +): Promise<string> { + const profile = (args.profile as string | undefined) ?? DEFAULT_PROFILE; + + switch (name) { + case "ooxml_lookup_element": { + const q = parseQName(String(args.qname ?? 
"")); + if (!q.ok) return formatNotFound(`could not parse qname: ${q.reason}`); + const hit = await lookupElement(sql, q.qname.namespace, q.qname.localName, profile); + if (!hit) { + return formatNotFound( + `element ${q.qname.localName} in namespace ${q.qname.namespace}`, + profile, + ); + } + return formatSymbolReport("Element", hit, profile); + } + + case "ooxml_lookup_type": { + const q = parseQName(String(args.qname ?? "")); + if (!q.ok) return formatNotFound(`could not parse qname: ${q.reason}`); + const hit = await lookupType(sql, q.qname.namespace, q.qname.localName, profile); + if (!hit) { + return formatNotFound( + `type ${q.qname.localName} in namespace ${q.qname.namespace}`, + profile, + ); + } + return formatSymbolReport( + hit.kind === "simpleType" ? "SimpleType" : "ComplexType", + hit, + profile, + ); + } + + case "ooxml_children": { + const q = parseQName(String(args.qname ?? "")); + if (!q.ok) return formatNotFound(`could not parse qname: ${q.reason}`); + + let typeSym = await lookupType(sql, q.qname.namespace, q.qname.localName, profile); + let elementSym: SymbolHit | null = null; + if (!typeSym) { + elementSym = await lookupElement(sql, q.qname.namespace, q.qname.localName, profile); + if (elementSym?.typeRef) { + typeSym = await lookupSymbolByTypeRef(sql, elementSym.typeRef, profile); + } else if (!elementSym) { + // Fall back to looking for a named xsd:group with this qname (so + // EG_PContent and friends are reachable directly). + typeSym = await lookupSymbol(sql, q.qname.namespace, q.qname.localName, "group", profile); + } + } + if (!typeSym) { + return formatNotFound( + `children for ${q.qname.localName} in namespace ${q.qname.namespace}`, + profile, + ); + } + const children = await getChildren(sql, typeSym.id, profile); + return formatChildrenReport(elementSym, typeSym, children, profile); + } + + case "ooxml_attributes": { + const q = parseQName(String(args.qname ?? 
"")); + if (!q.ok) return formatNotFound(`could not parse qname: ${q.reason}`); + let typeSym = await lookupType(sql, q.qname.namespace, q.qname.localName, profile); + let elementSym: SymbolHit | null = null; + if (!typeSym) { + elementSym = await lookupElement(sql, q.qname.namespace, q.qname.localName, profile); + if (elementSym?.typeRef) { + typeSym = await lookupSymbolByTypeRef(sql, elementSym.typeRef, profile); + } + } + if (!typeSym) { + return formatNotFound( + `attributes for ${q.qname.localName} in namespace ${q.qname.namespace}`, + profile, + ); + } + const attrs = await getAttributes(sql, typeSym.id, profile); + return formatAttributesReport(elementSym, typeSym, attrs, profile); + } + + case "ooxml_enum": { + const q = parseQName(String(args.qname ?? "")); + if (!q.ok) return formatNotFound(`could not parse qname: ${q.reason}`); + const sym = await lookupType(sql, q.qname.namespace, q.qname.localName, profile); + if (!sym || sym.kind !== "simpleType") { + return formatNotFound( + `simpleType ${q.qname.localName} in namespace ${q.qname.namespace}`, + profile, + ); + } + const enums = await getEnums(sql, sym.id, profile); + return formatEnumReport(sym, enums, profile); + } + + case "ooxml_namespace_info": { + const uri = String(args.uri ?? 
""); + if (!uri) return formatNotFound("namespace URI not provided"); + const info = await getNamespaceInfo(sql, uri); + if (!info) return formatNotFound(`namespace URI '${uri}' not present in any profile`); + return formatNamespaceReport(info); + } + + default: { + const _exhaustive: never = name; + throw new Error(`Unhandled OOXML tool: ${_exhaustive}`); + } + } +} + +// --- Formatting -------------------------------------------------------- + +function formatSymbolReport(label: string, hit: SymbolHit, profile: string): string { + const lines: string[] = []; + lines.push(`## ${label}: ${hit.localName}`); + lines.push(""); + lines.push(`- profile: ${profile}`); + lines.push( + `- canonical: (vocabulary=${hit.vocabularyId}, kind=${hit.kind}, name=${hit.localName})`, + ); + lines.push(`- namespace: ${hit.namespaceUri}`); + if (hit.typeRef) lines.push(`- type_ref: ${hit.typeRef}`); + if (hit.sourceName) lines.push(`- source: ${hit.sourceName}`); + return lines.join("\n"); +} + +function formatChildrenReport( + element: SymbolHit | null, + type: SymbolHit, + children: ChildEdge[], + profile: string, +): string { + const lines: string[] = []; + const heading = element + ? `Children of ${element.localName} (via type ${type.localName})` + : `Children of ${type.localName}`; + lines.push(`## ${heading}`); + lines.push(""); + lines.push(`- profile: ${profile}`); + lines.push(`- type vocabulary: ${type.vocabularyId}`); + lines.push(`- type namespace: ${type.namespaceUri}`); + if (type.sourceName) lines.push(`- source: ${type.sourceName}`); + lines.push(""); + + if (children.length === 0) { + lines.push("_no direct or inherited children._"); + return lines.join("\n"); + } + + lines.push("| order | kind | name | min | max | compositor | from |"); + lines.push("| --- | --- | --- | --- | --- | --- | --- |"); + for (const c of children) { + const max = c.maxOccurs === null ? "unbounded" : String(c.maxOccurs); + const comp = c.compositorKind ?? 
"-"; + const from = c.source === "self" ? "self" : `inherited (${c.owningTypeName})`; + lines.push( + `| ${c.orderIndex} | ${c.kind} | ${c.localName} | ${c.minOccurs} | ${max} | ${comp} | ${from} |`, + ); + } + lines.push(""); + lines.push( + "_group entries are returned as-is; call `ooxml_children` on the group qname to expand them._", + ); + return lines.join("\n"); +} + +function formatAttributesReport( + element: SymbolHit | null, + type: SymbolHit, + attrs: AttrEntry[], + profile: string, +): string { + const lines: string[] = []; + const heading = element + ? `Attributes of ${element.localName} (via type ${type.localName})` + : `Attributes of ${type.localName}`; + lines.push(`## ${heading}`); + lines.push(""); + lines.push(`- profile: ${profile}`); + lines.push(`- type vocabulary: ${type.vocabularyId}`); + if (type.sourceName) lines.push(`- source: ${type.sourceName}`); + lines.push(""); + + if (attrs.length === 0) { + lines.push("_no attributes._"); + return lines.join("\n"); + } + + lines.push("| name | use | type | default | fixed | from |"); + lines.push("| --- | --- | --- | --- | --- | --- |"); + for (const a of attrs) { + const from = + a.source === "self" + ? "self" + : a.source === "inherited" + ? `inherited (${a.owningName})` + : `attributeGroup (${a.owningName})`; + lines.push( + `| ${a.localName} | ${a.attrUse} | ${a.typeRef ?? "-"} | ${a.defaultValue ?? "-"} | ${a.fixedValue ?? 
"-"} | ${from} |`, + ); + } + return lines.join("\n"); +} + +function formatEnumReport(sym: SymbolHit, enums: EnumEntry[], profile: string): string { + const lines: string[] = []; + lines.push(`## Enum values for ${sym.localName}`); + lines.push(""); + lines.push(`- profile: ${profile}`); + lines.push(`- vocabulary: ${sym.vocabularyId}`); + lines.push(`- namespace: ${sym.namespaceUri}`); + if (sym.sourceName) lines.push(`- source: ${sym.sourceName}`); + lines.push(""); + if (enums.length === 0) { + lines.push("_no enum values; this simpleType is constrained by base type or pattern only._"); + } else { + for (const e of enums) lines.push(`- ${e.value}`); + } + return lines.join("\n"); +} + +function formatNamespaceReport(info: NamespaceInfo): string { + const lines: string[] = []; + lines.push(`## Namespace ${info.uri}`); + lines.push(""); + lines.push(`- vocabularies: ${info.vocabularies.join(", ") || "(none)"}`); + if (info.profiles.length === 0) { + lines.push("- profiles: (no symbols in any profile)"); + } else { + lines.push("- profiles:"); + for (const p of info.profiles) lines.push(` - ${p.name}: ${p.symbolCount} symbols`); + } + return lines.join("\n"); +} + +function formatNotFound(what: string, profile?: string): string { + const lines: string[] = []; + lines.push(`## Not found: ${what}`); + if (profile) lines.push(`Searched in profile '${profile}'.`); + lines.push(""); + lines.push("Try one of:"); + lines.push("- a known prefix qname like `w:tbl`, `r:id`, `s:ST_OnOff`, `m:oMath`, `a:blip`"); + lines.push("- Clark form `{namespace-uri}localName`"); + lines.push("- a different profile (currently only `transitional` is populated)"); + return lines.join("\n"); +} diff --git a/bun.lock b/bun.lock index 9a06be8..a026652 100644 --- a/bun.lock +++ b/bun.lock @@ -10,6 +10,7 @@ "@semantic-release/exec": "^7.1.0", "@semantic-release/git": "^10.0.1", "@semantic-release/github": "^12.0.2", + "fast-xml-parser": "^5.7.2", "lefthook": "^2.0.16", "semantic-release": 
"^25.0.2", "typescript": "~5.9.3", @@ -17,7 +18,7 @@ }, "apps/mcp-server": { "name": "@ooxml-dev/mcp-server", - "version": "0.0.1", + "version": "0.13.1", "dependencies": { "@modelcontextprotocol/sdk": "^1.25.3", "@neondatabase/serverless": "^1.0.2", @@ -31,7 +32,7 @@ }, "apps/web": { "name": "@ooxml-dev/web", - "version": "0.1.3", + "version": "0.13.1", "dependencies": { "clsx": "^2.1.1", "fumadocs-core": "^16.4.9", @@ -284,6 +285,8 @@ "@neondatabase/serverless": ["@neondatabase/serverless@1.0.2", "", { "dependencies": { "@types/node": "^22.15.30", "@types/pg": "^8.8.0" } }, "sha512-I5sbpSIAHiB+b6UttofhrN/UJXII+4tZPAq1qugzwCwLIL8EZLV7F/JyHUrEIiGgQpEXzpnjlJ+zwcEhheGvCw=="], + "@nodable/entities": ["@nodable/entities@2.1.0", "", {}, "sha512-nyT7T3nbMyBI/lvr6L5TyWbFJAI9FTgVRakNoBqCD+PmID8DzFrrNdLLtHMwMszOtqZa8PAOV24ZqDnQrhQINA=="], + "@octokit/auth-token": ["@octokit/auth-token@6.0.0", "", {}, "sha512-P4YJBPdPSpWTQ1NU4XYdvHvXJJDxM6YwpS0FZHRgP7YFkdVxsWcpWGy/NVqlAA7PcPCnMacXlRm1y2PFZRWL/w=="], "@octokit/core": ["@octokit/core@7.0.6", "", { "dependencies": { "@octokit/auth-token": "^6.0.0", "@octokit/graphql": "^9.0.3", "@octokit/request": "^10.0.6", "@octokit/request-error": "^7.0.2", "@octokit/types": "^16.0.0", "before-after-hook": "^4.0.0", "universal-user-agent": "^7.0.0" } }, "sha512-DhGl4xMVFGVIyMwswXeyzdL4uXD5OGILGX5N8Y+f6W7LhC1Ze2poSNrkF/fedpVDHEEZ+PHFW0vL14I+mm8K3Q=="], @@ -762,6 +765,10 @@ "fast-uri": ["fast-uri@3.1.0", "", {}, "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA=="], + "fast-xml-builder": ["fast-xml-builder@1.1.5", "", { "dependencies": { "path-expression-matcher": "^1.1.3" } }, "sha512-4TJn/8FKLeslLAH3dnohXqE3QSoxkhvaMzepOIZytwJXZO69Bfz0HBdDHzOTOon6G59Zrk6VQ2bEiv1t61rfkA=="], + + "fast-xml-parser": ["fast-xml-parser@5.7.2", "", { "dependencies": { "@nodable/entities": "^2.1.0", "fast-xml-builder": "^1.1.5", "path-expression-matcher": "^1.5.0", "strnum": "^2.2.3" }, "bin": { "fxparser": 
"src/cli/cli.js" } }, "sha512-P7oW7tLbYnhOLQk/Gv7cZgzgMPP/XN03K02/Jy6Y/NHzyIAIpxuZIM/YqAkfiXFPxA2CTm7NtCijK9EDu09u2w=="], + "fdir": ["fdir@6.5.0", "", { "peerDependencies": { "picomatch": "^3 || ^4" }, "optionalPeers": ["picomatch"] }, "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg=="], "figures": ["figures@6.1.0", "", { "dependencies": { "is-unicode-supported": "^2.0.0" } }, "sha512-d+l3qxjSesT4V7v2fh+QnmFnUWv9lSpjarhShNTgBOfA0ttejbQUAlHLitbjkoRiDulW0OPoQPYIGhIC8ohejg=="], @@ -1212,6 +1219,8 @@ "path-exists": ["path-exists@3.0.0", "", {}, "sha512-bpC7GYwiDYQ4wYLe+FA8lhRjhQCMcQGuSgGGqDkg/QerRWw9CmGRT0iSOVRSZJ29NMLZgIzqaljJ63oaL4NIJQ=="], + "path-expression-matcher": ["path-expression-matcher@1.5.0", "", {}, "sha512-cbrerZV+6rvdQrrD+iGMcZFEiiSrbv9Tfdkvnusy6y0x0GKBXREFg/Y65GhIfm0tnLntThhzCnfKwp1WRjeCyQ=="], + "path-key": ["path-key@3.1.1", "", {}, "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q=="], "path-to-regexp": ["path-to-regexp@6.3.0", "", {}, "sha512-Yhpw4T9C6hPpgPeA28us07OJeqZ5EzQTkbfwuhsUg0c237RomFoETJgmp2sa3F/41gfLE6G5cqcYwznmeEeOlQ=="], @@ -1408,6 +1417,8 @@ "strip-json-comments": ["strip-json-comments@2.0.1", "", {}, "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ=="], + "strnum": ["strnum@2.2.3", "", {}, "sha512-oKx6RUCuHfT3oyVjtnrmn19H1SiCqgJSg+54XqURKp5aCMbrXrhLjRN9TjuwMjiYstZ0MzDrHqkGZ5dFTKd+zg=="], + "style-to-js": ["style-to-js@1.1.21", "", { "dependencies": { "style-to-object": "1.0.14" } }, "sha512-RjQetxJrrUJLQPHbLku6U/ocGtzyjbJMP9lCNK7Ag0CNh690nSH8woqWH9u16nMjYBAok+i7JO1NP2pOy8IsPQ=="], "style-to-object": ["style-to-object@1.0.14", "", { "dependencies": { "inline-style-parser": "0.2.7" } }, "sha512-LIN7rULI0jBscWQYaSswptyderlarFkjQ+t79nzty8tcIAceVomEVlLzH5VP4Cmsv6MtKhs7qaAiwlcp+Mgaxw=="], diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..31f829d --- /dev/null +++ 
b/data/README.md @@ -0,0 +1,27 @@ +# data/ + +Repository data root. Three categories live here: + +- **`sources.json`** (committed): canonical source manifest. One entry per + artifact (ECMA-376 PDFs, ECMA Part 4 XSD zip, future MS-OI29500, etc.) with + url, edition, sha256, and a license note. `bun run sources:sync` upserts these + rows into the `reference_sources` table. Edit by hand; the sync script reads + it. + +- **`xsd-cache/`** (gitignored): local XSD download cache. Populated by + `bun run xsd:fetch`. Contents are not load-bearing for the schema graph + itself - the graph lives in Postgres - they're just the source artifacts + the ingest reads. Safe to delete; regenerated on the next fetch. + +- **`behavior-notes/`** (committed when populated): curated YAML files + documenting how Microsoft Office actually behaves vs. the spec. A future + ingest will load these into the `behavior_notes` table so structural tool + responses can carry "what Word actually does" alongside the schema-level + answer. Empty until that workflow lands. + +What does NOT live here: + +- Generated build output: `dist/`, `dev/data/extracted/`, `dev/data/chunks/`, + `dev/data/embedded/` (all under `dev/`, gitignored). +- Database state: lives in Postgres; reproducible from the manifest + + ingest scripts. diff --git a/data/sources.json b/data/sources.json new file mode 100644 index 0000000..4503849 --- /dev/null +++ b/data/sources.json @@ -0,0 +1,50 @@ +{ + "$comment": "Source manifest. Human-edited; scripts/sources-sync.ts upserts these rows into reference_sources.", + "sources": [ + { + "name": "ecma-376-part1", + "kind": "spec_pdf", + "edition": "5th", + "version": "2016-12", + "url": "https://ecma-international.org/wp-content/uploads/ECMA-376-1_5th_edition_december_2016.zip", + "license_note": "Published by Ecma International. 
See the ECMA-376 publications page for the current download and licensing terms before redistribution.", + "sha256": "9d0bcad9cf06054785b03762fcfadbf6bab7e54a5f9d69434e34b7fd464d4129" + }, + { + "name": "ecma-376-part2", + "kind": "spec_pdf", + "edition": "5th", + "version": "2021-12", + "url": "https://ecma-international.org/wp-content/uploads/ECMA-376-2_5th_edition_december_2021.zip", + "license_note": "Published by Ecma International. See the ECMA-376 publications page for the current download and licensing terms before redistribution.", + "sha256": "1d489dc491168ea1f9e9a59063acc8dd5f02b4ad1d21aa7ec19ba9a58d020c70" + }, + { + "name": "ecma-376-part3", + "kind": "spec_pdf", + "edition": "5th", + "version": "2015-12", + "url": "https://ecma-international.org/wp-content/uploads/ECMA-376-3_5th_edition_december_2015.zip", + "license_note": "Published by Ecma International. See the ECMA-376 publications page for the current download and licensing terms before redistribution.", + "sha256": "42294159fbbbe9393ccadac95b859d7729cc68d908898bcbe31034dda059daa8" + }, + { + "name": "ecma-376-part4", + "kind": "spec_pdf", + "edition": "5th", + "version": "2016-12", + "url": "https://ecma-international.org/wp-content/uploads/ECMA-376-4_5th_edition_december_2016.zip", + "license_note": "Published by Ecma International. See the ECMA-376 publications page for the current download and licensing terms before redistribution.", + "sha256": "bd25da1109f73762356596918bf5ff8b74a1331642dba5f1c1d1dfc6bed34ecd" + }, + { + "name": "ecma-376-transitional", + "kind": "xsd", + "edition": "5th", + "version": "2016-12", + "url": "https://ecma-international.org/wp-content/uploads/ECMA-376-4_5th_edition_december_2016.zip", + "license_note": "Published by Ecma International. 
See the ECMA-376 publications page for the current download and licensing terms before redistribution.", + "sha256": "bd25da1109f73762356596918bf5ff8b74a1331642dba5f1c1d1dfc6bed34ecd" + } + ] +} diff --git a/db/migrations/0001_reference_sources.sql b/db/migrations/0001_reference_sources.sql new file mode 100644 index 0000000..483c4a2 --- /dev/null +++ b/db/migrations/0001_reference_sources.sql @@ -0,0 +1,22 @@ +-- Provenance foundation: reference_sources catalog + source_id FK on spec_content. +-- Idempotent: safe to run against fresh installs (matches db/schema.sql) or existing DBs. + +CREATE EXTENSION IF NOT EXISTS vector; + +CREATE TABLE IF NOT EXISTS reference_sources ( + id SERIAL PRIMARY KEY, + name TEXT NOT NULL UNIQUE, + kind TEXT NOT NULL, + edition TEXT, + version TEXT, + url TEXT, + license_note TEXT, + sha256 TEXT, + fetched_at TIMESTAMPTZ DEFAULT NOW(), + created_at TIMESTAMPTZ DEFAULT NOW() +); + +ALTER TABLE spec_content + ADD COLUMN IF NOT EXISTS source_id INT REFERENCES reference_sources(id); + +CREATE INDEX IF NOT EXISTS idx_content_source ON spec_content(source_id); diff --git a/db/migrations/0002_xsd_schema.sql b/db/migrations/0002_xsd_schema.sql new file mode 100644 index 0000000..6a8282d --- /dev/null +++ b/db/migrations/0002_xsd_schema.sql @@ -0,0 +1,151 @@ +-- Profile-scoped XSD schema graph. All tables empty after this migration; data +-- lands when scripts/ingest-xsd/ingest.ts runs against a populated cache. +-- Idempotent: safe to run against fresh installs (matches db/schema.sql) or existing DBs. + +CREATE TABLE IF NOT EXISTS xsd_profiles ( + id SERIAL PRIMARY KEY, + name TEXT NOT NULL UNIQUE, -- 'transitional', 'strict', 'office-extension', 'word-compatible-docx' + description TEXT, + created_at TIMESTAMPTZ DEFAULT NOW() +); + +CREATE TABLE IF NOT EXISTS xsd_namespaces ( + id SERIAL PRIMARY KEY, + uri TEXT NOT NULL UNIQUE, + created_at TIMESTAMPTZ DEFAULT NOW() +); + +-- Canonical symbol identity: (vocabulary_id, local_name, kind). 
+-- vocabulary_id is a normalized id like 'wml-main', 'dml-main', 'shared-types'. +-- Namespace URIs are profile aliases, not part of identity (see xsd_symbol_profiles). +CREATE TABLE IF NOT EXISTS xsd_symbols ( + id SERIAL PRIMARY KEY, + vocabulary_id TEXT NOT NULL, + local_name TEXT NOT NULL, + kind TEXT NOT NULL, -- element, complexType, simpleType, attribute, attributeGroup, group + payload JSONB DEFAULT '{}'::jsonb, -- long-tail XSD details (annotations, app-info, rare attrs) + created_at TIMESTAMPTZ DEFAULT NOW(), + UNIQUE (vocabulary_id, local_name, kind) +); + +-- Profile membership + per-profile namespace alias for a symbol. +CREATE TABLE IF NOT EXISTS xsd_symbol_profiles ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + namespace_id INT NOT NULL REFERENCES xsd_namespaces(id), + source_id INT REFERENCES reference_sources(id), + created_at TIMESTAMPTZ DEFAULT NOW(), + UNIQUE (symbol_id, profile_id) +); + +-- Content-model compositors (xs:sequence | xs:choice | xs:all). Profile-scoped. +-- Exactly one of parent_symbol_id (top-level on a type/group) or +-- parent_compositor_id (nested inside another compositor) is set. +CREATE TABLE IF NOT EXISTS xsd_compositors ( + id SERIAL PRIMARY KEY, + parent_symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE, + parent_compositor_id INT REFERENCES xsd_compositors(id) ON DELETE CASCADE, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + kind TEXT NOT NULL CHECK (kind IN ('sequence', 'choice', 'all')), + min_occurs INT DEFAULT 1, + max_occurs INT, -- NULL = unbounded + order_index INT DEFAULT 0, + CHECK ((parent_symbol_id IS NOT NULL) <> (parent_compositor_id IS NOT NULL)) +); + +-- Child element edges. parent_symbol_id is denormalized for fast "children of X" queries +-- without walking through compositor rows first. 
+CREATE TABLE IF NOT EXISTS xsd_child_edges ( + id SERIAL PRIMARY KEY, + parent_symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + compositor_id INT NOT NULL REFERENCES xsd_compositors(id) ON DELETE CASCADE, + child_symbol_id INT NOT NULL REFERENCES xsd_symbols(id), + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + min_occurs INT DEFAULT 1, + max_occurs INT, -- NULL = unbounded + order_index INT DEFAULT 0 +); + +-- Attribute edges. attr_symbol_id is set when the attribute is a top-level symbol +-- (declared globally and referenced by ref); NULL for inline attributes. +CREATE TABLE IF NOT EXISTS xsd_attr_edges ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + attr_symbol_id INT REFERENCES xsd_symbols(id), + local_name TEXT NOT NULL, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + attr_use TEXT NOT NULL CHECK (attr_use IN ('required', 'optional', 'prohibited')) DEFAULT 'optional', + default_value TEXT, + fixed_value TEXT, + type_ref TEXT, + order_index INT DEFAULT 0 +); + +-- Group / attributeGroup references. resolved=true means the group's contents +-- have been expanded into xsd_child_edges or xsd_attr_edges on the parent. +CREATE TABLE IF NOT EXISTS xsd_group_edges ( + id SERIAL PRIMARY KEY, + parent_symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + group_symbol_id INT NOT NULL REFERENCES xsd_symbols(id), + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + ref_kind TEXT NOT NULL CHECK (ref_kind IN ('group', 'attributeGroup')), + resolved BOOLEAN DEFAULT FALSE, + order_index INT DEFAULT 0 +); + +-- Inheritance: extension or restriction of a base type. A derived type has +-- exactly one base per profile. 
+CREATE TABLE IF NOT EXISTS xsd_inheritance_edges ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + base_symbol_id INT NOT NULL REFERENCES xsd_symbols(id), + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + relation TEXT NOT NULL CHECK (relation IN ('extension', 'restriction')), + UNIQUE (symbol_id, profile_id) +); + +-- Enum values from xs:simpleType / xs:restriction. +CREATE TABLE IF NOT EXISTS xsd_enums ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + value TEXT NOT NULL, + annotation TEXT, + order_index INT DEFAULT 0 +); + +-- Curated Word/Office behavior claims keyed to symbols. +-- claim_type enum is locked now; the table stays empty until curated behavior +-- notes start landing. +CREATE TABLE IF NOT EXISTS behavior_notes ( + id SERIAL PRIMARY KEY, + symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE, + app TEXT NOT NULL, -- 'Word', 'Office', 'LibreOffice' + version_scope TEXT, -- e.g. 
'Word 2007+', 'Word 365' + claim_type TEXT NOT NULL CHECK (claim_type IN ( + 'ignores', + 'requires_despite_optional', + 'writes', + 'reads_but_does_not_write', + 'repairs', + 'layout_behavior' + )), + summary TEXT NOT NULL, + source_id INT REFERENCES reference_sources(id), + section_id TEXT, + confidence TEXT CHECK (confidence IN ('high', 'medium', 'low')), + created_at TIMESTAMPTZ DEFAULT NOW() +); + +-- Indexes (UNIQUE constraints already create implicit indexes for canonical lookups) +CREATE INDEX IF NOT EXISTS idx_xsd_symbols_lookup ON xsd_symbols(vocabulary_id, local_name, kind); +CREATE INDEX IF NOT EXISTS idx_xsd_child_edges_parent ON xsd_child_edges(parent_symbol_id); +CREATE INDEX IF NOT EXISTS idx_xsd_child_edges_compositor ON xsd_child_edges(compositor_id); +CREATE INDEX IF NOT EXISTS idx_xsd_attr_edges_symbol ON xsd_attr_edges(symbol_id); +CREATE INDEX IF NOT EXISTS idx_xsd_compositors_parent_symbol ON xsd_compositors(parent_symbol_id); +CREATE INDEX IF NOT EXISTS idx_xsd_compositors_parent_compositor ON xsd_compositors(parent_compositor_id); +CREATE INDEX IF NOT EXISTS idx_xsd_group_edges_parent ON xsd_group_edges(parent_symbol_id); +CREATE INDEX IF NOT EXISTS idx_xsd_inheritance_edges_symbol ON xsd_inheritance_edges(symbol_id); +CREATE INDEX IF NOT EXISTS idx_xsd_enums_symbol ON xsd_enums(symbol_id); +CREATE INDEX IF NOT EXISTS idx_behavior_notes_symbol ON behavior_notes(symbol_id); diff --git a/db/migrations/0003_xsd_metadata.sql b/db/migrations/0003_xsd_metadata.sql new file mode 100644 index 0000000..725a97b --- /dev/null +++ b/db/migrations/0003_xsd_metadata.sql @@ -0,0 +1,12 @@ +-- Preserve element/attribute @type and group-ref compositor context so the +-- structural lookup tools can resolve element-to-type chains and attach refs +-- to their enclosing compositor. +-- Idempotent. 
+ +ALTER TABLE xsd_symbols + ADD COLUMN IF NOT EXISTS type_ref TEXT; + +ALTER TABLE xsd_group_edges + ADD COLUMN IF NOT EXISTS compositor_id INT REFERENCES xsd_compositors(id) ON DELETE CASCADE, + ADD COLUMN IF NOT EXISTS min_occurs INT DEFAULT 1, + ADD COLUMN IF NOT EXISTS max_occurs INT; diff --git a/db/migrations/0004_local_element_scoping.sql b/db/migrations/0004_local_element_scoping.sql new file mode 100644 index 0000000..54774bc --- /dev/null +++ b/db/migrations/0004_local_element_scoping.sql @@ -0,0 +1,49 @@ +-- Scope local element symbols by their owner. +-- +-- Before this migration, an inline <xs:element> declared +-- inside two different complexTypes/groups collapsed to a single symbol keyed +-- on (vocabulary_id, local_name, kind). The first-seen type_ref won and the +-- later one was silently dropped, so e.g. WML's `tblGrid` (CT_TblGridBase +-- inside CT_TblGridChange vs CT_TblGrid inside CT_Tbl) gave a wrong answer +-- for ooxml_children("w:tblGrid"). +-- +-- Fix: add parent_symbol_id to xsd_symbols and include it in the canonical key +-- with NULLS NOT DISTINCT so two top-level declarations (parent NULL) still +-- collide while local declarations are scoped per-owner. +-- +-- Idempotent. + +ALTER TABLE xsd_symbols + ADD COLUMN IF NOT EXISTS parent_symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE; + +DO $$ +DECLARE cname TEXT; +BEGIN + -- Drop the auto-named 3-tuple unique constraint, regardless of what postgres + -- ended up calling it. 
+ SELECT conname INTO cname + FROM pg_constraint + WHERE conrelid = 'xsd_symbols'::regclass + AND contype = 'u' + AND conkey = ( + SELECT array_agg(attnum ORDER BY attnum) + FROM pg_attribute + WHERE attrelid = 'xsd_symbols'::regclass + AND attname IN ('vocabulary_id', 'local_name', 'kind') + ); + IF cname IS NOT NULL THEN + EXECUTE 'ALTER TABLE xsd_symbols DROP CONSTRAINT ' || quote_ident(cname); + END IF; + + IF NOT EXISTS ( + SELECT 1 FROM pg_constraint + WHERE conname = 'xsd_symbols_canonical_key' + AND conrelid = 'xsd_symbols'::regclass + ) THEN + ALTER TABLE xsd_symbols + ADD CONSTRAINT xsd_symbols_canonical_key + UNIQUE NULLS NOT DISTINCT (vocabulary_id, local_name, kind, parent_symbol_id); + END IF; +END $$; + +CREATE INDEX IF NOT EXISTS idx_xsd_symbols_parent ON xsd_symbols(parent_symbol_id); diff --git a/db/migrations/0005_drop_legacy_ecma376.sql b/db/migrations/0005_drop_legacy_ecma376.sql new file mode 100644 index 0000000..89afb4a --- /dev/null +++ b/db/migrations/0005_drop_legacy_ecma376.sql @@ -0,0 +1,17 @@ +-- Drop the legacy `ecma-376` placeholder source row. +-- +-- An earlier version of data/sources.json had a single placeholder +-- (`ecma-376`, edition=unknown, sha256=null) that stood in for the whole +-- spec corpus before per-part entries existed. The manifest now pins the +-- four ECMA-376 parts individually (`ecma-376-partN`), so the placeholder +-- is obsolete. +-- +-- This migration only deletes the row when nothing in spec_content +-- references it, so a developer who already backfilled source_id to the +-- legacy id stays safe. Idempotent. 
+ +DELETE FROM reference_sources +WHERE name = 'ecma-376' + AND NOT EXISTS ( + SELECT 1 FROM spec_content WHERE spec_content.source_id = reference_sources.id + ); diff --git a/db/migrations/README.md b/db/migrations/README.md new file mode 100644 index 0000000..de32d12 --- /dev/null +++ b/db/migrations/README.md @@ -0,0 +1,28 @@ +# Migrations + +Each phase that changes the schema adds one numbered SQL file here. Files are applied in lexical order (`0001_*.sql`, `0002_*.sql`, ...). + +## Conventions + +- **Idempotent**: every statement uses `IF NOT EXISTS`, `ADD COLUMN IF NOT EXISTS`, or equivalent. Re-running a migration is a no-op. +- **Forward-only**: no `down` scripts. Reverting means writing a new migration. +- **Source of truth split**: + - `db/schema.sql` reflects the full schema after all migrations are applied. Used by `docker-compose` to initialize fresh dev databases via `db:reset`. + - Migration files are for incrementally upgrading existing databases (production / long-lived dev). + +## Applying migrations + +For now, apply manually against an existing database: + +```bash +psql "$DATABASE_URL" -f db/migrations/0001_reference_sources.sql +``` + +A small runner script can be added later if/when phases need it. + +## Adding a new migration + +1. Pick the next number (`0002`, `0003`, ...). +2. Write idempotent SQL. +3. Update `db/schema.sql` to match the new full state. +4. If the migration introduces curated data (e.g., source rows), let a script populate it (e.g., `scripts/sources-sync.ts`), not the SQL file. diff --git a/db/schema.sql b/db/schema.sql index 34e66e5..d7b1ef5 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -1,9 +1,30 @@ --- ECMA-376 Spec Vector Database Schema --- Simple single-table design - evolve as needed +-- ooxml.dev database schema +-- Single source of truth for fresh installs (loaded by docker-compose at init). +-- For incremental updates against an existing DB, apply files in db/migrations/ in order. 
CREATE EXTENSION IF NOT EXISTS vector; --- Single table for all spec content +-- Reference sources: provenance for every chunk and (later) every schema symbol. +-- Source artifacts (PDFs, XSDs) are NOT committed. Manifest at data/sources.json +-- is the human-edited source of truth; scripts/sources-sync.ts upserts rows from it. +-- name is the stable identity. edition/version are updatable attributes: +-- when 'unknown' is later verified to '5th', we update in place rather than +-- inserting a duplicate row that would orphan existing source_id references. +-- To track multiple editions side-by-side, use distinct names ('ecma-376-4th'). +CREATE TABLE reference_sources ( + id SERIAL PRIMARY KEY, + name TEXT NOT NULL UNIQUE, -- stable id, e.g. 'ecma-376-part1' + kind TEXT NOT NULL, -- 'spec_pdf', 'xsd', 'reference_doc' + edition TEXT, -- '4th', '5th', or 'unknown' until verified + version TEXT, -- semver / date / null + url TEXT, -- canonical fetch URL + license_note TEXT, -- redistribution constraint + sha256 TEXT, -- artifact hash if fetched + fetched_at TIMESTAMPTZ DEFAULT NOW(), + created_at TIMESTAMPTZ DEFAULT NOW() +); + +-- Specification content: prose chunks for semantic search CREATE TABLE spec_content ( id SERIAL PRIMARY KEY, part_number INT NOT NULL, @@ -13,12 +34,168 @@ CREATE TABLE spec_content ( content_type TEXT DEFAULT 'text', page_number INT, embedding vector(1024), + source_id INT REFERENCES reference_sources(id), created_at TIMESTAMPTZ DEFAULT NOW() ); --- Vector similarity search CREATE INDEX idx_content_embedding ON spec_content USING hnsw (embedding vector_cosine_ops); - --- Filtering indexes CREATE INDEX idx_content_part ON spec_content(part_number); CREATE INDEX idx_content_section ON spec_content(section_id); +CREATE INDEX idx_content_source ON spec_content(source_id); + +-- ---------------------------------------------------------------------------- +-- XSD schema graph +-- +-- Profile-scoped symbol graph for OOXML schemas. 
Canonical symbol identity is +-- (vocabulary_id, local_name, kind, parent_symbol_id); namespace URIs are +-- profile aliases. Profile membership lives on edges/profile join tables, not +-- duplicated symbols. +-- ---------------------------------------------------------------------------- + +CREATE TABLE xsd_profiles ( + id SERIAL PRIMARY KEY, + name TEXT NOT NULL UNIQUE, + description TEXT, + created_at TIMESTAMPTZ DEFAULT NOW() +); + +CREATE TABLE xsd_namespaces ( + id SERIAL PRIMARY KEY, + uri TEXT NOT NULL UNIQUE, + created_at TIMESTAMPTZ DEFAULT NOW() +); + +-- type_ref holds the Clark-style {namespace}localName for elements and attributes +-- that declare a @type. NULL for complexType/simpleType/group/attributeGroup. +-- The lookup tools follow type_ref to resolve element -> type when reading children. +-- +-- parent_symbol_id is NULL for top-level declarations and set to the owning +-- type/group symbol for inline (local) element declarations. The canonical +-- key is 4-tuple with NULLS NOT DISTINCT so top-level decls still collide on +-- name while local decls remain scoped per-owner. 
+CREATE TABLE xsd_symbols ( + id SERIAL PRIMARY KEY, + vocabulary_id TEXT NOT NULL, + local_name TEXT NOT NULL, + kind TEXT NOT NULL, + type_ref TEXT, + parent_symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE, + payload JSONB DEFAULT '{}'::jsonb, + created_at TIMESTAMPTZ DEFAULT NOW(), + CONSTRAINT xsd_symbols_canonical_key + UNIQUE NULLS NOT DISTINCT (vocabulary_id, local_name, kind, parent_symbol_id) +); + +CREATE TABLE xsd_symbol_profiles ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + namespace_id INT NOT NULL REFERENCES xsd_namespaces(id), + source_id INT REFERENCES reference_sources(id), + created_at TIMESTAMPTZ DEFAULT NOW(), + UNIQUE (symbol_id, profile_id) +); + +-- Exactly one of parent_symbol_id (top-level) or parent_compositor_id (nested) is set. +CREATE TABLE xsd_compositors ( + id SERIAL PRIMARY KEY, + parent_symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE, + parent_compositor_id INT REFERENCES xsd_compositors(id) ON DELETE CASCADE, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + kind TEXT NOT NULL CHECK (kind IN ('sequence', 'choice', 'all')), + min_occurs INT DEFAULT 1, + max_occurs INT, + order_index INT DEFAULT 0, + CHECK ((parent_symbol_id IS NOT NULL) <> (parent_compositor_id IS NOT NULL)) +); + +CREATE TABLE xsd_child_edges ( + id SERIAL PRIMARY KEY, + parent_symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + compositor_id INT NOT NULL REFERENCES xsd_compositors(id) ON DELETE CASCADE, + child_symbol_id INT NOT NULL REFERENCES xsd_symbols(id), + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + min_occurs INT DEFAULT 1, + max_occurs INT, + order_index INT DEFAULT 0 +); + +CREATE TABLE xsd_attr_edges ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + attr_symbol_id INT REFERENCES 
xsd_symbols(id), + local_name TEXT NOT NULL, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + attr_use TEXT NOT NULL CHECK (attr_use IN ('required', 'optional', 'prohibited')) DEFAULT 'optional', + default_value TEXT, + fixed_value TEXT, + type_ref TEXT, + order_index INT DEFAULT 0 +); + +-- compositor_id is the enclosing compositor when a group reference (xsd:group ref) appears inside +-- a sequence/choice/all (NULL for refs at the type's top level or for +-- attributeGroup refs which don't live in a compositor). +-- min/max_occurs capture the ref site's own cardinality. +CREATE TABLE xsd_group_edges ( + id SERIAL PRIMARY KEY, + parent_symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + compositor_id INT REFERENCES xsd_compositors(id) ON DELETE CASCADE, + group_symbol_id INT NOT NULL REFERENCES xsd_symbols(id), + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + ref_kind TEXT NOT NULL CHECK (ref_kind IN ('group', 'attributeGroup')), + resolved BOOLEAN DEFAULT FALSE, + min_occurs INT DEFAULT 1, + max_occurs INT, + order_index INT DEFAULT 0 +); + +CREATE TABLE xsd_inheritance_edges ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + base_symbol_id INT NOT NULL REFERENCES xsd_symbols(id), + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + relation TEXT NOT NULL CHECK (relation IN ('extension', 'restriction')), + UNIQUE (symbol_id, profile_id) +); + +CREATE TABLE xsd_enums ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + value TEXT NOT NULL, + annotation TEXT, + order_index INT DEFAULT 0 +); + +CREATE TABLE behavior_notes ( + id SERIAL PRIMARY KEY, + symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE, + app TEXT NOT NULL, + version_scope TEXT, + claim_type TEXT NOT NULL CHECK (claim_type IN ( + 'ignores', + 
'requires_despite_optional', + 'writes', + 'reads_but_does_not_write', + 'repairs', + 'layout_behavior' + )), + summary TEXT NOT NULL, + source_id INT REFERENCES reference_sources(id), + section_id TEXT, + confidence TEXT CHECK (confidence IN ('high', 'medium', 'low')), + created_at TIMESTAMPTZ DEFAULT NOW() +); + +CREATE INDEX idx_xsd_symbols_lookup ON xsd_symbols(vocabulary_id, local_name, kind); +CREATE INDEX idx_xsd_symbols_parent ON xsd_symbols(parent_symbol_id); +CREATE INDEX idx_xsd_child_edges_parent ON xsd_child_edges(parent_symbol_id); +CREATE INDEX idx_xsd_child_edges_compositor ON xsd_child_edges(compositor_id); +CREATE INDEX idx_xsd_attr_edges_symbol ON xsd_attr_edges(symbol_id); +CREATE INDEX idx_xsd_compositors_parent_symbol ON xsd_compositors(parent_symbol_id); +CREATE INDEX idx_xsd_compositors_parent_compositor ON xsd_compositors(parent_compositor_id); +CREATE INDEX idx_xsd_group_edges_parent ON xsd_group_edges(parent_symbol_id); +CREATE INDEX idx_xsd_inheritance_edges_symbol ON xsd_inheritance_edges(symbol_id); +CREATE INDEX idx_xsd_enums_symbol ON xsd_enums(symbol_id); +CREATE INDEX idx_behavior_notes_symbol ON behavior_notes(symbol_id); diff --git a/package.json b/package.json index bc3deeb..1d4317b 100644 --- a/package.json +++ b/package.json @@ -19,11 +19,16 @@ "db:down": "docker compose down", "db:reset": "docker compose down -v && docker compose up -d", "db:shell": "docker compose exec db psql -U postgres -d ecma_spec", - "ingest": "bun scripts/ingest/pipeline.ts", - "ingest:chunk": "bun scripts/ingest/chunk.ts", - "ingest:embed": "bun scripts/ingest/embed.ts", - "ingest:upload": "bun scripts/ingest/upload.ts", - "ingest:setup": "pip install -r scripts/requirements.txt" + "db:migrate": "bun scripts/db-migrate.ts", + "sources:sync": "bun scripts/sources-sync.ts", + "pdf:ingest": "bun scripts/ingest-pdf/pipeline.ts", + "pdf:chunk": "bun scripts/ingest-pdf/chunk.ts", + "pdf:embed": "bun scripts/ingest-pdf/embed.ts", + "pdf:upload": "bun 
scripts/ingest-pdf/upload.ts", + "pdf:setup": "pip install -r scripts/requirements.txt", + "xsd:fetch": "bun scripts/ingest-xsd/fetch.ts", + "xsd:ingest": "bun scripts/ingest-xsd/ingest.ts", + "test": "export TEST_DATABASE_URL=${TEST_DATABASE_URL:-postgresql://postgres:postgres@localhost:5432/ecma_spec} && bun test tests/db/ && bun test tests/ingest-xsd/ && bun test tests/mcp-server/" }, "devDependencies": { "@biomejs/biome": "^2.3.13", @@ -31,6 +36,7 @@ "@semantic-release/exec": "^7.1.0", "@semantic-release/git": "^10.0.1", "@semantic-release/github": "^12.0.2", + "fast-xml-parser": "^5.7.2", "lefthook": "^2.0.16", "semantic-release": "^25.0.2", "typescript": "~5.9.3" diff --git a/scripts/db-migrate.ts b/scripts/db-migrate.ts new file mode 100644 index 0000000..61ffea6 --- /dev/null +++ b/scripts/db-migrate.ts @@ -0,0 +1,49 @@ +/** + * Apply migrations in order from db/migrations/*.sql against $DATABASE_URL. + * All migrations are idempotent; re-running is safe. + * + * Usage: + * bun scripts/db-migrate.ts + * + * Environment: + * DATABASE_URL - PostgreSQL connection string + */ + +import { readdirSync } from "node:fs"; +import { join } from "node:path"; +import { createDbClient } from "../packages/shared/src/db/index.ts"; + +async function main() { + const databaseUrl = process.env.DATABASE_URL; + if (!databaseUrl) { + console.error("Missing DATABASE_URL environment variable"); + process.exit(1); + } + + const dir = "./db/migrations"; + const files = readdirSync(dir) + .filter((f) => f.endsWith(".sql")) + .sort(); + + if (files.length === 0) { + console.log("No migrations found."); + return; + } + + const db = createDbClient(databaseUrl); + try { + for (const f of files) { + const content = await Bun.file(join(dir, f)).text(); + console.log(`Applying ${f}...`); + await db.sql.unsafe(content); + } + console.log(`Applied ${files.length} migration(s).`); + } finally { + await db.close(); + } +} + +main().catch((err) => { + console.error("Migration failed:", err); + 
process.exit(1); +}); diff --git a/scripts/ingest-pdf/README.md b/scripts/ingest-pdf/README.md new file mode 100644 index 0000000..8e051c0 --- /dev/null +++ b/scripts/ingest-pdf/README.md @@ -0,0 +1,53 @@ +# PDF ingest (ECMA-376 prose corpus) + +Builds the semantic-search corpus that powers `search_ecma_spec` / +`get_section` / `list_parts`. Each ECMA-376 part PDF is extracted into +section-aware markdown, chunked at ~6 KB boundaries, embedded with the +configured provider, and uploaded into `spec_content`. + +``` +PDF -> extract (Python) -> chunk (6KB, section-aware) -> embed -> upload +``` + +## Prerequisites + +- Python with `pymupdf4llm`: `bun run pdf:setup` +- `DATABASE_URL` pointed at a Postgres with `db/schema.sql` applied +- An embedding provider key (one of): + - `OPENAI_API_KEY` (default) + - `VOYAGE_API_KEY` + - `GOOGLE_API_KEY` + - `COHERE_API_KEY` + +## Run the full pipeline + +```bash +bun run pdf:ingest 1 ./pdfs/ECMA-376-Part1.pdf +bun run pdf:ingest 2 ./pdfs/ECMA-376-Part2.pdf +bun run pdf:ingest 3 ./pdfs/ECMA-376-Part3.pdf +bun run pdf:ingest 4 ./pdfs/ECMA-376-Part4.pdf +``` + +Each run extracts to `dev/data/extracted/partN/`, chunks to +`dev/data/chunks/partN-chunks.json`, embeds to +`dev/data/embedded/partN-embedded.json`, then uploads. + +## Run individual stages + +```bash +bun run pdf:chunk ./extracted/part1 ./chunks/part1.json +bun run pdf:embed ./chunks/part1.json ./embedded/part1.json +bun run pdf:upload 1 ./embedded/part1.json +``` + +Useful when iterating on chunking or trying a different embedding provider +without re-extracting. 
+ +## Files + +- `pipeline.ts` - orchestrator (extract -> chunk -> embed -> upload) +- `extract.py` - PDF -> section-aware markdown via pymupdf4llm +- `fix-page-numbers.py` - PDF prelude-aware page-number alignment +- `chunk.ts` - markdown -> 6 KB chunks with section IDs +- `embed.ts` - chunks -> chunks + 1024-dim embeddings +- `upload.ts` - bulk insert into `spec_content` diff --git a/scripts/ingest/chunk.ts b/scripts/ingest-pdf/chunk.ts similarity index 93% rename from scripts/ingest/chunk.ts rename to scripts/ingest-pdf/chunk.ts index c3d2d15..4fe0c87 100644 --- a/scripts/ingest/chunk.ts +++ b/scripts/ingest-pdf/chunk.ts @@ -5,10 +5,10 @@ * Respects section boundaries and handles XML examples specially. * * Usage: - * bun scripts/ingest/chunk.ts + * bun scripts/ingest-pdf/chunk.ts * * Example: - * bun scripts/ingest/chunk.ts ./extracted/part1 ./chunks/part1-chunks.json + * bun scripts/ingest-pdf/chunk.ts ./extracted/part1 ./chunks/part1-chunks.json */ interface ExtractedSection { @@ -151,10 +151,10 @@ async function main() { const args = process.argv.slice(2); if (args.length < 2) { - console.log("Usage: bun scripts/ingest/chunk.ts "); + console.log("Usage: bun scripts/ingest-pdf/chunk.ts "); console.log(""); console.log("Example:"); - console.log(" bun scripts/ingest/chunk.ts ./extracted/part1 ./chunks/part1-chunks.json"); + console.log(" bun scripts/ingest-pdf/chunk.ts ./extracted/part1 ./chunks/part1-chunks.json"); process.exit(1); } diff --git a/scripts/ingest/embed.ts b/scripts/ingest-pdf/embed.ts similarity index 90% rename from scripts/ingest/embed.ts rename to scripts/ingest-pdf/embed.ts index cacde79..8ab7e53 100644 --- a/scripts/ingest/embed.ts +++ b/scripts/ingest-pdf/embed.ts @@ -4,14 +4,14 @@ * Generates embeddings for chunks using the configured provider. 
* * Usage: - * bun scripts/ingest/embed.ts + * bun scripts/ingest-pdf/embed.ts * * Environment variables: * EMBEDDING_PROVIDER - openai, google, voyage, or cohere (default: openai) * OPENAI_API_KEY / GOOGLE_API_KEY / etc. * * Example: - * EMBEDDING_PROVIDER=openai bun scripts/ingest/embed.ts ./chunks/part1-chunks.json ./embedded/part1-embedded.json + * EMBEDDING_PROVIDER=openai bun scripts/ingest-pdf/embed.ts ./chunks/part1-chunks.json ./embedded/part1-embedded.json */ import { @@ -93,7 +93,7 @@ async function main() { const args = process.argv.slice(2); if (args.length < 2) { - console.log("Usage: bun scripts/ingest/embed.ts "); + console.log("Usage: bun scripts/ingest-pdf/embed.ts "); console.log(""); console.log("Environment variables:"); console.log(" EMBEDDING_PROVIDER - openai, google, voyage, or cohere (default: openai)"); @@ -101,7 +101,7 @@ async function main() { console.log(""); console.log("Example:"); console.log( - " EMBEDDING_PROVIDER=openai bun scripts/ingest/embed.ts ./chunks/part1.json ./embedded/part1.json", + " EMBEDDING_PROVIDER=openai bun scripts/ingest-pdf/embed.ts ./chunks/part1.json ./embedded/part1.json", ); process.exit(1); } diff --git a/scripts/ingest/extract-pdf.py b/scripts/ingest-pdf/extract.py similarity index 100% rename from scripts/ingest/extract-pdf.py rename to scripts/ingest-pdf/extract.py diff --git a/scripts/ingest/fix-page-numbers.py b/scripts/ingest-pdf/fix-page-numbers.py similarity index 100% rename from scripts/ingest/fix-page-numbers.py rename to scripts/ingest-pdf/fix-page-numbers.py diff --git a/scripts/ingest/pipeline.ts b/scripts/ingest-pdf/pipeline.ts similarity index 85% rename from scripts/ingest/pipeline.ts rename to scripts/ingest-pdf/pipeline.ts index 660090a..dacc32a 100644 --- a/scripts/ingest/pipeline.ts +++ b/scripts/ingest-pdf/pipeline.ts @@ -4,7 +4,7 @@ * Runs the complete ingestion process: extract -> chunk -> embed -> upload * * Usage: - * bun scripts/ingest/pipeline.ts + * bun 
scripts/ingest-pdf/pipeline.ts * * Environment variables: * DATABASE_URL - PostgreSQL connection string @@ -12,7 +12,7 @@ * OPENAI_API_KEY / GOOGLE_API_KEY / etc. * * Example: - * bun scripts/ingest/pipeline.ts 1 ./pdfs/ECMA-376-Part1.pdf + * bun scripts/ingest-pdf/pipeline.ts 1 ./pdfs/ECMA-376-Part1.pdf */ import { $ } from "bun"; @@ -21,7 +21,7 @@ async function main() { const args = process.argv.slice(2); if (args.length < 2) { - console.log("Usage: bun scripts/ingest/pipeline.ts "); + console.log("Usage: bun scripts/ingest-pdf/pipeline.ts "); console.log(""); console.log("Environment variables:"); console.log(" DATABASE_URL - PostgreSQL connection string"); @@ -29,7 +29,7 @@ async function main() { console.log(" OPENAI_API_KEY / GOOGLE_API_KEY / etc."); console.log(""); console.log("Example:"); - console.log(" bun scripts/ingest/pipeline.ts 1 ./pdfs/ECMA-376-Part1.pdf"); + console.log(" bun scripts/ingest-pdf/pipeline.ts 1 ./pdfs/ECMA-376-Part1.pdf"); process.exit(1); } @@ -92,7 +92,7 @@ async function main() { try { await $`${pythonPath} -c "import pymupdf4llm" 2>/dev/null`; console.log(`Using Python: ${pythonPath}`); - await $`${pythonPath} scripts/ingest/extract-pdf.py ${pdfPath} ${extractedDir}`; + await $`${pythonPath} scripts/ingest-pdf/extract.py ${pdfPath} ${extractedDir}`; extractSuccess = true; break; } catch { @@ -110,17 +110,17 @@ async function main() { // Step 2: Chunk console.log("\n[2/4] Chunking content..."); console.log("-".repeat(40)); - await $`bun scripts/ingest/chunk.ts ${extractedDir} ${chunksFile}`; + await $`bun scripts/ingest-pdf/chunk.ts ${extractedDir} ${chunksFile}`; // Step 3: Embed console.log("\n[3/4] Generating embeddings..."); console.log("-".repeat(40)); - await $`bun scripts/ingest/embed.ts ${chunksFile} ${embeddedFile}`; + await $`bun scripts/ingest-pdf/embed.ts ${chunksFile} ${embeddedFile}`; // Step 4: Upload console.log("\n[4/4] Uploading to database..."); console.log("-".repeat(40)); - await $`bun 
scripts/ingest/upload.ts ${partNumber} ${embeddedFile}`; + await $`bun scripts/ingest-pdf/upload.ts ${partNumber} ${embeddedFile}`; console.log(`\n${"=".repeat(60)}`); console.log("Pipeline complete!"); diff --git a/scripts/ingest/upload.ts b/scripts/ingest-pdf/upload.ts similarity index 88% rename from scripts/ingest/upload.ts rename to scripts/ingest-pdf/upload.ts index db19b41..c17cffc 100644 --- a/scripts/ingest/upload.ts +++ b/scripts/ingest-pdf/upload.ts @@ -4,13 +4,13 @@ * Uploads embedded chunks to the database. * * Usage: - * bun scripts/ingest/upload.ts + * bun scripts/ingest-pdf/upload.ts * * Environment variables: * DATABASE_URL - PostgreSQL connection string * * Example: - * bun scripts/ingest/upload.ts 1 ./embedded/part1-embedded.json + * bun scripts/ingest-pdf/upload.ts 1 ./embedded/part1-embedded.json */ import { createDbClient } from "../../packages/shared/src/db/index.ts"; @@ -30,13 +30,13 @@ async function main() { const args = process.argv.slice(2); if (args.length < 2) { - console.log("Usage: bun scripts/ingest/upload.ts "); + console.log("Usage: bun scripts/ingest-pdf/upload.ts "); console.log(""); console.log("Environment variables:"); console.log(" DATABASE_URL - PostgreSQL connection string"); console.log(""); console.log("Example:"); - console.log(" bun scripts/ingest/upload.ts 1 ./embedded/part1-embedded.json"); + console.log(" bun scripts/ingest-pdf/upload.ts 1 ./embedded/part1-embedded.json"); process.exit(1); } diff --git a/scripts/ingest-xsd/README.md b/scripts/ingest-xsd/README.md new file mode 100644 index 0000000..78cdba2 --- /dev/null +++ b/scripts/ingest-xsd/README.md @@ -0,0 +1,87 @@ +# XSD ingest (ECMA-376 schema graph) + +Builds the structural-query corpus that powers `ooxml_lookup_element`, +`ooxml_children`, `ooxml_attributes`, etc. The XSDs published by Ecma +International for ECMA-376 Transitional are parsed and persisted as a +profile-scoped relational graph. 
+ +``` +ECMA Part 4 zip -> fetch + verify (sha256) -> parse (preserveOrder) + -> ingest (single transaction) -> 11 tables in Postgres +``` + +## Prerequisites + +- `DATABASE_URL` pointed at a Postgres with `db/schema.sql` applied +- A row in `reference_sources` named `ecma-376-transitional`. Run + `bun run sources:sync` after editing `data/sources.json`. + +## Fetch the schemas + +The Part 4 zip is published on the ECMA-376 publications page. It contains +`OfficeOpenXML-XMLSchema-Transitional.zip`, which contains the 26 +Transitional XSDs (`wml.xsd`, `dml-main.xsd`, `sml.xsd`, `pml.xsd`, +`shared-*.xsd`, ...). + +```bash +bun run xsd:fetch +``` + +URL and sha256 are read from `data/sources.json`'s `ecma-376-transitional` +entry (currently pinned to ECMA-376 5th edition, December 2016). The script +verifies the outer-zip sha256, extracts the inner zip, and lands the XSDs +in `data/xsd-cache/ecma-376-transitional/`. The cache is gitignored; +nothing binary lands in the repo. + +To test a new edition before pinning it: + +```bash +bun run xsd:fetch -- --url # override URL +bun run xsd:fetch -- --expected-sha256 # override hash +``` + +## Ingest + +```bash +bun run xsd:ingest +``` + +By default it walks `wml.xsd` plus its import closure (12 documents) and +populates: `xsd_profiles`, `xsd_namespaces`, `xsd_symbols`, +`xsd_symbol_profiles`, `xsd_inheritance_edges`, `xsd_compositors`, +`xsd_child_edges`, `xsd_group_edges`, `xsd_attr_edges`, `xsd_enums`. Wraps +the whole thing in a single transaction; idempotent across runs. 
+ +To ingest a different working set: + +```bash +bun run xsd:ingest --entrypoint dml-main.xsd +bun run xsd:ingest --schema-dir --entrypoint \ + --profile --source +``` + +## Files + +- `fetch.ts` - download Part 4 zip, verify sha256, extract XSDs +- `parse-schema.ts` - load XSDs into an in-memory schema set with ordered + AST + namespace map + import graph + qname declaration index +- `vocabulary.ts` - canonical namespace URI -> vocabulary id map +- `qname.ts` - canonical-key + qname-attribute resolution +- `ast.ts` - helpers for walking fast-xml-parser preserveOrder output +- `types.ts` - shared types +- `ingest.ts` - parser output -> 11 DB tables, single transaction + +## Smoke-test the result + +The query layer is exercised by `tests/mcp-server/ooxml-queries.test.ts` +against the same fixtures the ingest tests use. Run with: + +```bash +bun test tests/mcp-server/ +``` + +To hit the live MCP, deploy the Worker and call the tools through any +MCP client. For local poking against the dev DB, write a small bun +script that imports `runOoxmlTool` from +`apps/mcp-server/src/ooxml-tools.ts` with a `postgres.js`-backed sql +function. diff --git a/scripts/ingest-xsd/ast.ts b/scripts/ingest-xsd/ast.ts new file mode 100644 index 0000000..0e98d55 --- /dev/null +++ b/scripts/ingest-xsd/ast.ts @@ -0,0 +1,73 @@ +/** + * Helpers for navigating the preserveOrder AST emitted by fast-xml-parser. + * + * AST shape: every element is a single-key object { tagName: children[], ":@"?: { "@_attrName": value } }. + * Text nodes are { "#text": string }. Children always live in an array, so sibling + * order is preserved across different tag names. + */ + +import type { PreserveOrderDocument, PreserveOrderNode } from "./types.ts"; + +/** Strip an XML namespace prefix from a tag name: "xsd:element" → "element". */ +export function stripPrefix(tag: string): string { + const colon = tag.indexOf(":"); + return colon < 0 ? 
tag : tag.slice(colon + 1); +} + +/** Return the first key on a preserveOrder node other than ":@": the tag name for an element, "#text" for a text node. Returns null only when the node has no such key. */ +export function nodeTag(node: PreserveOrderNode): string | null { + for (const k of Object.keys(node)) { + if (k !== ":@") return k; + } + return null; +} + +/** Return the children array of a preserveOrder element, or [] when the node has no tag or its value is not an array (e.g. a text node). */ +export function nodeChildren(node: PreserveOrderNode): PreserveOrderNode[] { + const tag = nodeTag(node); + if (!tag) return []; + const v = node[tag]; + return Array.isArray(v) ? (v as PreserveOrderNode[]) : []; +} + +/** Return attributes on a preserveOrder element as a name -> string map. fast-xml-parser nests them under ":@" with "@_" prefix; the prefix is stripped, non-string values are stringified, and null/undefined values are skipped. */ +export function nodeAttrs(node: PreserveOrderNode): Record<string, string> { + const raw = node[":@"]; + if (!raw || typeof raw !== "object") return {}; + const out: Record<string, string> = {}; + for (const [k, v] of Object.entries(raw as Record<string, unknown>)) { + const name = k.startsWith("@_") ? k.slice(2) : k; + if (typeof v === "string") out[name] = v; + else if (v != null) out[name] = String(v); + } + return out; +} + +/** + * Find the first element in `nodes` (a document's top-level list or any + * child list) whose stripped tag name matches one of the given local names. + * Used to locate the xsd:schema root regardless of whether the file uses + * `xsd:`, `xs:`, or no prefix. Returns null when nothing matches. + */ +export function findFirstByLocalName( + nodes: PreserveOrderDocument | PreserveOrderNode[], + localNames: string[], +): PreserveOrderNode | null { + for (const node of nodes) { + const tag = nodeTag(node); + if (tag && localNames.includes(stripPrefix(tag))) return node; + } + return null; +} + +/** + * Iterate immediate children of an element whose stripped tag name matches `localName`. 
+ */ +export function* eachChildByLocalName( + parent: PreserveOrderNode, + localName: string, +): Generator { + for (const child of nodeChildren(parent)) { + const tag = nodeTag(child); + if (tag && stripPrefix(tag) === localName) yield child; + } +} diff --git a/scripts/ingest-xsd/fetch.ts b/scripts/ingest-xsd/fetch.ts new file mode 100644 index 0000000..1484601 --- /dev/null +++ b/scripts/ingest-xsd/fetch.ts @@ -0,0 +1,183 @@ +/** + * Fetch ECMA-376 Transitional XSDs from the ECMA Part 4 zip. + * + * The Part 4 zip is published by Ecma International on the ECMA-376 + * publications page. It contains OfficeOpenXML-XMLSchema-Transitional.zip, + * which in turn contains the 26 Transitional XSDs (wml.xsd, dml-main.xsd, + * sml.xsd, pml.xsd, shared-*.xsd, and friends). + * + * URL and sha256 are read from data/sources.json's ecma-376-transitional + * entry by default. CLI flags and env vars override; useful for testing a + * new edition before pinning it in the manifest. + * + * Cache layout: + * data/xsd-cache/ + * _staging/ (outer + inner zip extraction scratch) + * ecma-376-transitional/ (final XSDs land here) + * + * Usage: + * bun run xsd:fetch (manifest default) + * bun run xsd:fetch -- --url (override URL) + * bun run xsd:fetch -- --expected-sha256 (override hash) + * XSD_PART4_URL= bun run xsd:fetch (override via env) + */ + +import { createHash } from "node:crypto"; +import { readdirSync } from "node:fs"; +import { mkdir, rm } from "node:fs/promises"; +import { join } from "node:path"; + +const CACHE_ROOT = "./data/xsd-cache"; +const STAGING_DIR = join(CACHE_ROOT, "_staging"); +const FINAL_DIR = join(CACHE_ROOT, "ecma-376-transitional"); +const DEFAULT_INNER_ZIP = "OfficeOpenXML-XMLSchema-Transitional.zip"; + +interface Args { + url: string; + expectedSha256: string | null; + innerZip: string; +} + +interface SourceManifestEntry { + name: string; + url?: string; + sha256?: string | null; +} + +interface SourceManifest { + sources: SourceManifestEntry[]; +} + 
+/** Read the url/sha256 of the "ecma-376-transitional" entry from ./data/sources.json. Any read or parse failure is swallowed and reported as nulls. */ +async function loadManifestDefault(): Promise<{ url: string | null; sha256: string | null }> { + try { + const raw = await Bun.file("./data/sources.json").text(); + const manifest = JSON.parse(raw) as SourceManifest; + const ecma = manifest.sources?.find((s) => s.name === "ecma-376-transitional"); + return { + url: ecma?.url ?? null, + sha256: ecma?.sha256 ?? null, + }; + } catch { + return { url: null, sha256: null }; + } +} + +/** Resolve fetch settings: XSD_PART4_URL and --url / --expected-sha256 / --inner-zip first, then data/sources.json for an unset url or sha256. Exits with status 1 when no URL can be determined. */ +async function parseArgs(): Promise<Args> { + const argv = process.argv.slice(2); + let url: string | null = process.env.XSD_PART4_URL ?? null; + let expectedSha256: string | null = null; + let innerZip = DEFAULT_INNER_ZIP; + + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + if (a === "--url") url = argv[++i] ?? null; + else if (a === "--expected-sha256") expectedSha256 = argv[++i] ?? null; + else if (a === "--inner-zip") innerZip = argv[++i] ?? DEFAULT_INNER_ZIP; + } + + // Fall back to the manifest for any unset values. data/sources.json is + // the canonical pin; we treat it as the default config so the common case + // is just `bun run xsd:fetch`. + if (!url || !expectedSha256) { + const fromManifest = await loadManifestDefault(); + if (!url) url = fromManifest.url; + if (!expectedSha256) expectedSha256 = fromManifest.sha256; + } + + if (!url) { + console.error( + "No URL configured. 
Set 'url' on the ecma-376-transitional entry in data/sources.json,", + ); + console.error("or pass --url / XSD_PART4_URL."); + process.exit(1); + } + return { url, expectedSha256, innerZip }; +} + +/** Hex-encoded SHA-256 of the file at `path`; the whole file is read into memory. */ +async function sha256(path: string): Promise<string> { + const buf = await Bun.file(path).arrayBuffer(); + return createHash("sha256").update(new Uint8Array(buf)).digest("hex"); +} + +/** Download `url` and write the body to `dest`; throws when the response status is not ok. */ +async function downloadTo(url: string, dest: string): Promise<void> { + console.log(`Downloading ${url}`); + const res = await fetch(url); + if (!res.ok) { + throw new Error(`Fetch failed: ${res.status} ${res.statusText}`); + } + const buf = await res.arrayBuffer(); + await Bun.write(dest, buf); + console.log(` wrote ${dest} (${(buf.byteLength / 1024 / 1024).toFixed(2)} MiB)`); +} + +/** Extract `zipPath` into `dir` (created if missing) with the system `unzip` binary; throws on a non-zero exit code. */ +async function unzipInto(zipPath: string, dir: string): Promise<void> { + await mkdir(dir, { recursive: true }); + const proc = Bun.spawn(["unzip", "-o", "-q", zipPath, "-d", dir], { + stdout: "inherit", + stderr: "inherit", + }); + const code = await proc.exited; + if (code !== 0) throw new Error(`unzip exited ${code} on ${zipPath}`); +} + +/** Search `dir` recursively for a file named exactly `name`; returns the first path found or null. */ +function findFile(dir: string, name: string): string | null { + const stack = [dir]; + while (stack.length) { + const cur = stack.pop()!; + for (const entry of readdirSync(cur, { withFileTypes: true })) { + const p = join(cur, entry.name); + if (entry.isDirectory()) stack.push(p); + else if (entry.name === name) return p; + } + } + return null; +} + +/** Wipe both cache dirs, download the outer zip into staging, verify its sha256 when a pin is known, then unpack the inner zip into the final cache dir and list the XSDs found there. */ +async function main() { + const args = await parseArgs(); + + await rm(STAGING_DIR, { recursive: true, force: true }); + await rm(FINAL_DIR, { recursive: true, force: true }); + await mkdir(STAGING_DIR, { recursive: true }); + + const outerPath = join(STAGING_DIR, "part4.zip"); + await downloadTo(args.url, outerPath); + + const outerHash = await sha256(outerPath); + console.log(`outer zip sha256: ${outerHash}`); + // digest("hex") is lowercase; lower-case the pin so an uppercase hash in the manifest still matches. + if (args.expectedSha256 && outerHash !== args.expectedSha256.toLowerCase()) { + throw new Error(`sha256 mismatch: expected ${args.expectedSha256}, 
got ${outerHash}`); + } + + console.log(`Extracting outer zip into ${STAGING_DIR}`); + await unzipInto(outerPath, STAGING_DIR); + + const innerPath = findFile(STAGING_DIR, args.innerZip); + if (!innerPath) { + throw new Error(`Did not find ${args.innerZip} inside the outer zip.`); + } + console.log(`Found inner zip at ${innerPath}`); + + console.log(`Extracting Transitional XSDs into ${FINAL_DIR}`); + await unzipInto(innerPath, FINAL_DIR); + + const wml = findFile(FINAL_DIR, "wml.xsd"); + if (!wml) { + throw new Error(`wml.xsd not found in extracted XSD set; aborting.`); + } + + const xsdFiles = readdirSync(FINAL_DIR).filter((f) => f.endsWith(".xsd")); + console.log(`\nDone. ${xsdFiles.length} XSD files in ${FINAL_DIR}:`); + for (const f of xsdFiles.slice().sort()) console.log(` ${f}`); + + if (!args.expectedSha256) { + console.log("\nTo pin this fetch for reproducibility, paste the sha256 above into"); + console.log("data/sources.json under the 'ecma-376-transitional' entry, then re-run"); + console.log("`bun run sources:sync` to update the row."); + } +} + +main().catch((err) => { + console.error("Fetch failed:", err); + process.exit(1); +}); diff --git a/scripts/ingest-xsd/ingest.ts b/scripts/ingest-xsd/ingest.ts new file mode 100644 index 0000000..f8adc2d --- /dev/null +++ b/scripts/ingest-xsd/ingest.ts @@ -0,0 +1,1046 @@ +/** + * Ingest the OOXML schema graph from parseSchemaSet output. 
Runs in a single + * transaction and writes: + * + * - xsd_profiles, xsd_namespaces, xsd_symbols, xsd_symbol_profiles + * (bootstrap + per-symbol membership; symbol/inheritance passes use + * ON CONFLICT for natural-key idempotency) + * - xsd_inheritance_edges (complexContent/simpleContent extension/restriction + * and simpleType restriction) + * - xsd_compositors, xsd_child_edges, xsd_group_edges (content models; + * content-model rows have no natural unique key, so this pass uses + * delete-and-rewrite per profile) + * - xsd_attr_edges, xsd_enums (attributes, attributeGroup refs, and + * simpleType enumeration values; same delete-and-rewrite pattern) + * + * Re-running against the same source is idempotent: identical row counts on + * every run. Stale-row cleanup (when symbols vanish in a future edition) is + * deferred until needed. + * + * Library usage: + * await ingestSchemaSet({ schemaDir, entrypoints, profileName, sourceName, db }) + * + * CLI usage: + * bun run xsd:ingest + * bun run xsd:ingest --schema-dir --entrypoint wml.xsd \ + * --profile transitional --source ecma-376-transitional + */ + +import { createDbClient, type DbClient } from "../../packages/shared/src/db/index.ts"; +import { nodeAttrs } from "./ast.ts"; +import { parseSchemaSet } from "./parse-schema.ts"; +import { resolveQNameAttr } from "./qname.ts"; +import type { Declaration, ParsedSchemaSet, PreserveOrderNode } from "./types.ts"; +import { vocabularyForNamespace } from "./vocabulary.ts"; + +// biome-ignore lint/suspicious/noExplicitAny: postgres library typing is intricate; helpers stay generic. +type Sql = any; + +export interface IngestSchemaSetOptions { + schemaDir: string; + entrypoints: string[]; + /** Profile name to attach symbols to (e.g. "transitional"). Bootstrap if missing. */ + profileName: string; + /** Source name in reference_sources; used for source_id on xsd_symbol_profiles. */ + sourceName: string; + /** Existing DbClient. The ingest opens its own transaction inside. 
/**
 * Parse an XSD working set and write its schema graph for one profile/source.
 *
 * Parsing happens first; every database write then runs inside a single
 * `opts.db.sql.begin(...)` transaction, in this order:
 *
 *   0. Purge rows previously written for this source (edges that reference
 *      its symbols, then the symbols themselves).
 *   1. Namespaces, top-level symbols, and profile memberships.
 *   2. Inheritance edges; xsd-builtin base types are created on demand.
 *   3. Content models: compositors, child edges, group refs.
 *   4. Attribute edges, attributeGroup refs, and simpleType enumerations.
 *
 * @param opts - Schema location, profile/source names, and the DbClient.
 * @returns Counters for what was inserted and what could not be resolved.
 * @throws If the source has no reference_sources row (see lookupSourceId), or
 *   if a declaration's namespace was not registered during Pass 1.
 */
export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise<IngestStats> {
  const parseResult = await parseSchemaSet({
    schemaDir: opts.schemaDir,
    entrypoints: opts.entrypoints,
  });

  const stats: IngestStats = {
    documents: parseResult.documents.size,
    symbolsInserted: 0,
    symbolsExisting: 0,
    namespacesEnsured: 0,
    profileMembershipsInserted: 0,
    inheritanceEdgesInserted: 0,
    inheritanceUnresolved: 0,
    compositorsInserted: 0,
    childEdgesInserted: 0,
    childEdgesUnresolved: 0,
    groupRefsInserted: 0,
    groupRefsUnresolved: 0,
    localElementsCreated: 0,
    attrEdgesInserted: 0,
    attrEdgesUnresolved: 0,
    attrGroupRefsInserted: 0,
    attrGroupRefsUnresolved: 0,
    enumsInserted: 0,
  };

  // Kinds tried, in this order, when resolving an inheritance base qname to an
  // already-ingested symbol. Loop-invariant, so built once rather than per decl.
  const candidateKinds: ReadonlyArray<Declaration["kind"]> = [
    "complexType",
    "simpleType",
    "element",
    "group",
    "attributeGroup",
    "attribute",
  ];

  await opts.db.sql.begin(async (sql: Sql) => {
    const profileId = await ensureProfile(sql, opts.profileName);
    const sourceId = await lookupSourceId(sql, opts.sourceName);

    // Purge anything this source previously wrote so re-ingest is a clean
    // rewrite and tolerant of schema migrations that change the symbol
    // shape (e.g. the addition of parent_symbol_id for local-element
    // scoping).
    //
    // Several FKs reference xsd_symbols.id WITHOUT cascade
    // (xsd_inheritance_edges.base_symbol_id, xsd_child_edges.child_symbol_id,
    // xsd_attr_edges.attr_symbol_id, xsd_group_edges.group_symbol_id), so
    // those rows must be cleaned explicitly before the symbol delete. The
    // LHS FKs (parent_symbol_id, symbol_id) DO cascade, as does
    // xsd_symbol_profiles.symbol_id and behavior_notes.symbol_id. (When
    // curated behavior_notes start landing, switch to natural-key
    // reconciliation rather than cascade-delete.)
    await sql`
      DELETE FROM xsd_inheritance_edges
      WHERE base_symbol_id IN (SELECT symbol_id FROM xsd_symbol_profiles WHERE source_id = ${sourceId})
    `;
    await sql`
      DELETE FROM xsd_child_edges
      WHERE child_symbol_id IN (SELECT symbol_id FROM xsd_symbol_profiles WHERE source_id = ${sourceId})
    `;
    await sql`
      DELETE FROM xsd_attr_edges
      WHERE attr_symbol_id IN (SELECT symbol_id FROM xsd_symbol_profiles WHERE source_id = ${sourceId})
    `;
    await sql`
      DELETE FROM xsd_group_edges
      WHERE group_symbol_id IN (SELECT symbol_id FROM xsd_symbol_profiles WHERE source_id = ${sourceId})
    `;
    await sql`
      DELETE FROM xsd_symbols
      WHERE id IN (SELECT symbol_id FROM xsd_symbol_profiles WHERE source_id = ${sourceId})
    `;

    // Pass 1: namespaces, symbols, profile memberships.
    const namespaceIds = new Map<string, number>(); // namespace URI -> id
    const symbolIds = new Map<string, number>(); // canonical (vocab|local|kind) -> id

    for (const doc of parseResult.documents.values()) {
      if (!namespaceIds.has(doc.targetNamespace)) {
        const id = await ensureNamespace(sql, doc.targetNamespace);
        namespaceIds.set(doc.targetNamespace, id);
        stats.namespacesEnsured++;
      }
    }

    for (const decls of parseResult.declarationsByQName.values()) {
      for (const decl of decls) {
        const key = symbolKey(decl.vocabularyId, decl.localName, decl.kind);
        // Several declarations can share a canonical key; only the first is written.
        if (symbolIds.has(key)) continue;

        // Capture @type for elements and attributes; resolved Clark-style.
        const typeRef = resolveDeclTypeRef(decl, parseResult);

        const { id, inserted } = await upsertSymbol(
          sql,
          decl.vocabularyId,
          decl.localName,
          decl.kind,
          typeRef,
        );
        symbolIds.set(key, id);
        if (inserted) stats.symbolsInserted++;
        else stats.symbolsExisting++;

        const nsId = namespaceIds.get(decl.namespace);
        if (nsId == null) {
          throw new Error(
            `Internal: missing namespace id for ${decl.namespace} (decl ${decl.localName})`,
          );
        }
        const linked = await linkSymbolToProfile(sql, id, profileId, nsId, sourceId);
        if (linked) stats.profileMembershipsInserted++;
      }
    }

    // Pass 2: inheritance edges. Resolve base qname through the document's
    // prefix map; ensure built-in xsd:* placeholders exist on demand.
    for (const decls of parseResult.declarationsByQName.values()) {
      for (const decl of decls) {
        const inherit = findInheritance(decl);
        if (!inherit) continue;

        const prefixMap = parseResult.namespaceByPrefix.get(decl.documentPath);
        if (!prefixMap) continue;
        const resolved = resolveQNameAttr(inherit.baseQName, prefixMap, decl.namespace);
        if (!resolved.resolved) {
          stats.inheritanceUnresolved++;
          continue;
        }
        const baseQ = resolved.qname;
        if (!baseQ.vocabularyId) {
          stats.inheritanceUnresolved++;
          continue;
        }

        // Look up existing symbol; for xsd-builtin, ensure on demand.
        let baseId: number | null = null;
        for (const k of candidateKinds) {
          const id = symbolIds.get(symbolKey(baseQ.vocabularyId, baseQ.localName, k));
          if (id != null) {
            baseId = id;
            break;
          }
        }
        if (baseId == null && baseQ.vocabularyId === "xsd-builtin") {
          const { id, inserted } = await upsertSymbol(
            sql,
            "xsd-builtin",
            baseQ.localName,
            "simpleType",
          );
          symbolIds.set(symbolKey("xsd-builtin", baseQ.localName, "simpleType"), id);
          baseId = id;
          if (inserted) stats.symbolsInserted++;
          else stats.symbolsExisting++;
          // Link to a profile so ooxml_lookup_type / lookupSymbolByTypeRef can
          // follow type_refs into the W3C XSD namespace.
          let xsdNsId = namespaceIds.get(baseQ.namespace);
          if (xsdNsId == null) {
            xsdNsId = await ensureNamespace(sql, baseQ.namespace);
            namespaceIds.set(baseQ.namespace, xsdNsId);
            stats.namespacesEnsured++;
          }
          const linked = await linkSymbolToProfile(sql, id, profileId, xsdNsId, sourceId);
          if (linked) stats.profileMembershipsInserted++;
        }
        if (baseId == null) {
          stats.inheritanceUnresolved++;
          continue;
        }

        const childId = symbolIds.get(symbolKey(decl.vocabularyId, decl.localName, decl.kind));
        if (childId == null) continue;

        const edgeInserted = await insertInheritance(
          sql,
          childId,
          baseId,
          profileId,
          inherit.relation,
        );
        if (edgeInserted) stats.inheritanceEdgesInserted++;
      }
    }

    // Pass 3: content models. Walk every complexType and group declaration,
    // emit xsd_compositors / xsd_child_edges / xsd_group_edges. Local element
    // declarations are keyed per owning symbol (see handleElement).
    //
    // Idempotency strategy: content-model rows have no natural unique key
    // (a single complexType can hold multiple sibling compositors of the same
    // kind), so we delete-and-rewrite per profile. xsd_child_edges FK on
    // xsd_compositors with ON DELETE CASCADE handles child_edges cleanup.
    // Assumes one source per profile, which holds today; revisit when
    // multiple sources contribute to the same profile.
    await sql`DELETE FROM xsd_compositors WHERE profile_id = ${profileId}`;
    await sql`DELETE FROM xsd_group_edges WHERE profile_id = ${profileId}`;

    for (const decls of parseResult.declarationsByQName.values()) {
      for (const decl of decls) {
        if (decl.kind !== "complexType" && decl.kind !== "group") continue;

        const ownerSymbolId = symbolIds.get(
          symbolKey(decl.vocabularyId, decl.localName, decl.kind),
        );
        if (ownerSymbolId == null) continue;
        const prefixMap = parseResult.namespaceByPrefix.get(decl.documentPath);
        if (!prefixMap) continue;

        const ctx: WalkCtx = {
          sql,
          profileId,
          sourceId,
          ownerSymbolId,
          ownerDecl: decl,
          prefixMap,
          symbolIds,
          namespaceIds,
          parseResult,
          stats,
        };

        // order_index for top-level particles runs across all particle parents.
        const particleParents = findContentModelParents(decl);
        let topOrder = 0;
        for (const parent of particleParents) {
          for (const child of nodeChildrenLocal(parent)) {
            const tag = stripPrefixLocal(nodeTagLocal(child));
            if (tag === "sequence" || tag === "choice" || tag === "all") {
              await walkCompositor(child, tag, null, topOrder, ctx);
              topOrder++;
            } else if (tag === "group") {
              await handleGroupRef(child, null, topOrder, ctx);
              topOrder++;
            }
          }
        }
      }
    }

    // Pass 4: attributes, attributeGroup refs, and simpleType enumerations.
    // Same delete-and-rewrite strategy as Pass 3. xsd_group_edges already
    // cleared by Pass 3, so attributeGroup ref inserts here are fresh.
    await sql`DELETE FROM xsd_attr_edges WHERE profile_id = ${profileId}`;
    await sql`DELETE FROM xsd_enums WHERE profile_id = ${profileId}`;

    for (const decls of parseResult.declarationsByQName.values()) {
      for (const decl of decls) {
        const ownerSymbolId = symbolIds.get(
          symbolKey(decl.vocabularyId, decl.localName, decl.kind),
        );
        if (ownerSymbolId == null) continue;
        const prefixMap = parseResult.namespaceByPrefix.get(decl.documentPath);
        if (!prefixMap) continue;

        if (decl.kind === "complexType" || decl.kind === "attributeGroup") {
          const parents = findAttributeParents(decl);
          // Attributes and attributeGroup refs share one order counter.
          let order = 0;
          for (const parent of parents) {
            for (const child of nodeChildrenLocal(parent)) {
              const tag = stripPrefixLocal(nodeTagLocal(child));
              if (tag === "attribute") {
                await handleAttribute(
                  sql,
                  child,
                  ownerSymbolId,
                  profileId,
                  prefixMap,
                  decl.namespace,
                  symbolIds,
                  parseResult,
                  order,
                  stats,
                );
                order++;
              } else if (tag === "attributeGroup") {
                const a = nodeAttrs(child);
                if (!a.ref) continue;
                const resolved = resolveQNameAttr(a.ref, prefixMap, decl.namespace);
                if (!resolved.resolved || !resolved.qname.vocabularyId) {
                  stats.attrGroupRefsUnresolved++;
                  continue;
                }
                const groupSymbolId = symbolIds.get(
                  symbolKey(
                    resolved.qname.vocabularyId,
                    resolved.qname.localName,
                    "attributeGroup",
                  ),
                );
                if (groupSymbolId == null) {
                  stats.attrGroupRefsUnresolved++;
                  continue;
                }
                // attributeGroup refs don't live inside content compositors;
                // compositor_id stays null and min/max default to 1.
                await insertGroupEdge(
                  sql,
                  ownerSymbolId,
                  null,
                  groupSymbolId,
                  profileId,
                  "attributeGroup",
                  1,
                  1,
                  order,
                );
                stats.attrGroupRefsInserted++;
                order++;
              }
            }
          }
        } else if (decl.kind === "simpleType") {
          let order = 0;
          for (const value of findEnumValues(decl)) {
            await insertEnum(sql, ownerSymbolId, profileId, value, order);
            stats.enumsInserted++;
            order++;
          }
        }
      }
    }
  });

  return stats;
}
/**
 * Insert one xsd:sequence / xsd:choice / xsd:all compositor and recurse into
 * its particles.
 *
 * A top-level compositor (`parentCompositorId === null`) is attached to the
 * owning symbol; a nested one is attached to its parent compositor instead,
 * so exactly one of the two parent columns is set per row.
 *
 * @param node - The compositor node from the preserve-order AST.
 * @param kind - Which compositor `node` is.
 * @param parentCompositorId - Enclosing compositor id, or null at top level.
 * @param orderIndex - Position of this compositor among its siblings.
 * @param ctx - Shared walk state (sql handle, owner symbol, lookup maps, stats).
 */
async function walkCompositor(
  node: PreserveOrderNode,
  kind: "sequence" | "choice" | "all",
  parentCompositorId: number | null,
  orderIndex: number,
  ctx: WalkCtx,
): Promise<void> {
  const a = nodeAttrs(node);
  const compositorId = await insertCompositor(
    ctx.sql,
    parentCompositorId === null ? ctx.ownerSymbolId : null,
    parentCompositorId,
    ctx.profileId,
    kind,
    parseMinOccurs(a.minOccurs),
    parseMaxOccurs(a.maxOccurs),
    orderIndex,
  );
  ctx.stats.compositorsInserted++;

  // Elements, nested compositors and group refs share one order counter;
  // any other child tag does not consume an index.
  let childOrder = 0;
  for (const child of nodeChildrenLocal(node)) {
    const tag = stripPrefixLocal(nodeTagLocal(child));
    if (tag === "element") {
      await handleElement(child, compositorId, childOrder, ctx);
      childOrder++;
    } else if (tag === "sequence" || tag === "choice" || tag === "all") {
      await walkCompositor(child, tag, compositorId, childOrder, ctx);
      childOrder++;
    } else if (tag === "group") {
      await handleGroupRef(child, compositorId, childOrder, ctx);
      childOrder++;
    }
    // xsd:any: skipped for now.
  }
}
/**
 * Parse an xsd `minOccurs` attribute value.
 *
 * @param raw - The attribute text, or undefined when the attribute is absent.
 * @returns The parsed count, or the XSD default of 1 when the attribute is
 *   absent or is not a non-negative integer.
 */
function parseMinOccurs(raw: string | undefined): number {
  if (raw === undefined) return 1;
  const text = raw.trim();
  // parseInt alone would accept "-1" and "2x"; require an all-digit value.
  if (!/^[+-]?\d+$/.test(text)) return 1;
  const n = Number.parseInt(text, 10);
  // Math.abs only normalises "-0" to 0; real negatives fall back to the default.
  return n < 0 ? 1 : Math.abs(n);
}
/**
 * Look up the id of a reference_sources row by name.
 *
 * @param sql - Tagged-template query function (transaction or connection).
 * @param name - Value of reference_sources.name to match.
 * @returns The id of the matching row.
 * @throws Error when no row matches; the message points at `sources:sync`.
 */
async function lookupSourceId(sql: Sql, name: string): Promise<number> {
  const [row] = await sql`SELECT id FROM reference_sources WHERE name = ${name} LIMIT 1`;
  if (!row) {
    throw new Error(
      `reference_sources row not found for name='${name}'. Run \`bun run sources:sync\` first.`,
    );
  }
  return row.id;
}
/**
 * Insert one xsd_compositors row and return its generated id.
 *
 * @param sql - Tagged-template query function.
 * @param parentSymbolId - Owning symbol id, or null (callers pass null for
 *   nested compositors).
 * @param parentCompositorId - Enclosing compositor id, or null at top level.
 * @param profileId - Profile the row belongs to.
 * @param kind - Compositor kind.
 * @param minOccurs - Minimum occurrence count.
 * @param maxOccurs - Maximum occurrence count; null is stored as SQL NULL
 *   (parseMaxOccurs returns null for "unbounded").
 * @param orderIndex - Position among siblings.
 * @returns The id of the inserted row.
 */
async function insertCompositor(
  sql: Sql,
  parentSymbolId: number | null,
  parentCompositorId: number | null,
  profileId: number,
  kind: "sequence" | "choice" | "all",
  minOccurs: number,
  maxOccurs: number | null,
  orderIndex: number,
): Promise<number> {
  const [row] = await sql`
    INSERT INTO xsd_compositors
      (parent_symbol_id, parent_compositor_id, profile_id, kind, min_occurs, max_occurs, order_index)
    VALUES
      (${parentSymbolId}, ${parentCompositorId}, ${profileId}, ${kind}, ${minOccurs}, ${maxOccurs}, ${orderIndex})
    RETURNING id
  `;
  return row.id;
}
/**
 * Insert one xsd_attr_edges row linking an owner symbol to an attribute.
 *
 * @param sql - Tagged-template query function.
 * @param symbolId - Owner (complexType or attributeGroup) symbol id.
 * @param attrSymbolId - Symbol id of the referenced top-level attribute, or
 *   null when there is none (local attribute or unresolved ref target).
 * @param localName - Attribute local name.
 * @param profileId - Profile the row belongs to.
 * @param attrUse - Value of the `use` attribute.
 * @param defaultValue - `default` value, or null.
 * @param fixedValue - `fixed` value, or null.
 * @param typeRef - Type reference string, or null.
 * @param orderIndex - Position among the owner's attributes.
 */
async function insertAttrEdge(
  sql: Sql,
  symbolId: number,
  attrSymbolId: number | null,
  localName: string,
  profileId: number,
  attrUse: "required" | "optional" | "prohibited",
  defaultValue: string | null,
  fixedValue: string | null,
  typeRef: string | null,
  orderIndex: number,
): Promise<void> {
  await sql`
    INSERT INTO xsd_attr_edges
      (symbol_id, attr_symbol_id, local_name, profile_id, attr_use, default_value, fixed_value, type_ref, order_index)
    VALUES
      (${symbolId}, ${attrSymbolId}, ${localName}, ${profileId}, ${attrUse}, ${defaultValue}, ${fixedValue}, ${typeRef}, ${orderIndex})
  `;
}
/**
 * Record one xsd:attribute child of a complexType / attributeGroup as an
 * xsd_attr_edges row.
 *
 * Two forms are handled:
 *  - `ref="..."`: the qname is resolved; type, and any default/fixed not given
 *    at the ref site, are taken from the top-level attribute declaration.
 *  - `name="..."`: a local attribute; `@type` is resolved to Clark notation,
 *    or stored raw when it cannot be resolved.
 *
 * An attribute with neither `ref` nor `name` is ignored. An unresolvable
 * `ref` increments `stats.attrEdgesUnresolved` and writes nothing.
 *
 * @param sql - Tagged-template query function.
 * @param node - The xsd:attribute node.
 * @param ownerSymbolId - Symbol id of the owning complexType/attributeGroup.
 * @param profileId - Profile the edge belongs to.
 * @param prefixMap - Prefix -> namespace URI map of the owning document.
 * @param defaultNamespace - Namespace handed to resolveQNameAttr as fallback.
 * @param symbolIds - Canonical symbol key -> symbol id.
 * @param parseResult - Parsed schema set, for top-level declaration lookup.
 * @param orderIndex - Position among the owner's attributes.
 * @param stats - Counters updated in place.
 */
async function handleAttribute(
  sql: Sql,
  node: PreserveOrderNode,
  ownerSymbolId: number,
  profileId: number,
  prefixMap: Map<string, string>,
  defaultNamespace: string,
  symbolIds: Map<string, number>,
  parseResult: ParsedSchemaSet,
  orderIndex: number,
  stats: IngestStats,
): Promise<void> {
  const a = nodeAttrs(node);
  let localName: string | null = null;
  let attrSymbolId: number | null = null;
  let typeRef: string | null = null;
  let defaultValue: string | null = a.default ?? null;
  let fixedValue: string | null = a.fixed ?? null;

  if (a.ref) {
    const resolved = resolveQNameAttr(a.ref, prefixMap, defaultNamespace);
    if (!resolved.resolved || !resolved.qname.vocabularyId) {
      stats.attrEdgesUnresolved++;
      return;
    }
    localName = resolved.qname.localName;
    // A missing symbol is tolerated: the edge is still written with a null
    // attr_symbol_id.
    const id = symbolIds.get(
      symbolKey(resolved.qname.vocabularyId, resolved.qname.localName, "attribute"),
    );
    if (id != null) attrSymbolId = id;

    // XSD forbids @type at a ref site, so the type always comes from the
    // top-level declaration. @default/@fixed ARE allowed at the ref site;
    // a ref-site value wins and the declaration's value is only a fallback.
    // Key format mirrors the declarationsByQName keys: {namespace}kind:localName.
    const declKey = `{${resolved.qname.namespace}}attribute:${resolved.qname.localName}`;
    const topDecl = parseResult.declarationsByQName.get(declKey)?.[0];
    if (topDecl) {
      const declAttrs = nodeAttrs(topDecl.node);
      if (declAttrs.type) {
        // Resolve against the DECLARING document's prefixes, not the ref site's.
        const declPrefixMap = parseResult.namespaceByPrefix.get(topDecl.documentPath);
        if (declPrefixMap) {
          const t = resolveQNameAttr(declAttrs.type, declPrefixMap, topDecl.namespace);
          typeRef = t.resolved ? `{${t.qname.namespace}}${t.qname.localName}` : declAttrs.type;
        } else {
          typeRef = declAttrs.type;
        }
      }
      if (defaultValue == null) defaultValue = declAttrs.default ?? null;
      if (fixedValue == null) fixedValue = declAttrs.fixed ?? null;
    }
  } else if (a.name) {
    localName = a.name;
    if (a.type) {
      const resolved = resolveQNameAttr(a.type, prefixMap, defaultNamespace);
      if (resolved.resolved) {
        typeRef = `{${resolved.qname.namespace}}${resolved.qname.localName}`;
      } else {
        typeRef = a.type; // store raw if unresolvable; never lose info
      }
    }
  }

  if (!localName) return;

  // Anything other than the three legal values falls back to "optional".
  const rawUse = a.use;
  const attrUse: "required" | "optional" | "prohibited" =
    rawUse === "required" || rawUse === "optional" || rawUse === "prohibited" ? rawUse : "optional";

  await insertAttrEdge(
    sql,
    ownerSymbolId,
    attrSymbolId,
    localName,
    profileId,
    attrUse,
    defaultValue,
    fixedValue,
    typeRef,
    orderIndex,
  );
  stats.attrEdgesInserted++;
}
/**
 * Parse the ingest CLI flags from `process.argv`.
 *
 * Recognised flags: `--schema-dir`, `--entrypoint` (repeatable), `--profile`,
 * `--source`. Unknown arguments are ignored. A flag given without a value
 * leaves the corresponding default in place.
 *
 * @returns Parsed arguments; `entrypoints` defaults to `["wml.xsd"]` when no
 *   usable `--entrypoint` was supplied.
 */
function parseCliArgs(): CliArgs {
  const argv = process.argv.slice(2);
  let schemaDir = "./data/xsd-cache/ecma-376-transitional";
  const entrypoints: string[] = [];
  let profileName = "transitional";
  let sourceName = "ecma-376-transitional";
  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (a === "--schema-dir") schemaDir = argv[++i] ?? schemaDir;
    else if (a === "--entrypoint") {
      // A trailing `--entrypoint` with no value used to push "" here, which
      // later resolves to the schema directory itself. Skip it instead so the
      // default below applies, matching how the other flags fall back.
      const value = argv[++i];
      if (value) entrypoints.push(value);
    } else if (a === "--profile") profileName = argv[++i] ?? profileName;
    else if (a === "--source") sourceName = argv[++i] ?? sourceName;
  }
  if (entrypoints.length === 0) entrypoints.push("wml.xsd");
  return { schemaDir, entrypoints, profileName, sourceName };
}
console.log(`inheritance edges: ${stats.inheritanceEdgesInserted}`); + console.log(`inheritance unres.: ${stats.inheritanceUnresolved}`); + console.log(`compositors: ${stats.compositorsInserted}`); + console.log(`child edges: ${stats.childEdgesInserted}`); + console.log(`child edges unres.: ${stats.childEdgesUnresolved}`); + console.log(`group refs: ${stats.groupRefsInserted}`); + console.log(`group refs unres.: ${stats.groupRefsUnresolved}`); + console.log(`local elements: ${stats.localElementsCreated}`); + console.log(`attr edges: ${stats.attrEdgesInserted}`); + console.log(`attr edges unres.: ${stats.attrEdgesUnresolved}`); + console.log(`attrGroup refs: ${stats.attrGroupRefsInserted}`); + console.log(`attrGroup refs unr.: ${stats.attrGroupRefsUnresolved}`); + console.log(`enums: ${stats.enumsInserted}`); + console.log(`elapsed: ${ms}ms`); + } finally { + await db.close(); + } +} + +if (import.meta.path === Bun.main) { + main().catch((err) => { + console.error("ingest failed:", err); + process.exit(1); + }); +} diff --git a/scripts/ingest-xsd/parse-schema.ts b/scripts/ingest-xsd/parse-schema.ts new file mode 100644 index 0000000..b6b933e --- /dev/null +++ b/scripts/ingest-xsd/parse-schema.ts @@ -0,0 +1,187 @@ +/** + * Parse a working set of XSDs into an in-memory schema set. + * + * Walks xsd:import schemaLocation references recursively starting from + * `entrypoints`, and indexes every top-level declaration by canonical qname. + * + * No DB writes here. Subsequent phases (3c+) walk documents/declarations to + * produce xsd_symbols, edges, etc. 
/**
 * Maps the local tag name of an xsd:schema child to the declaration kind it
 * introduces; tags that are not listed are not indexed.
 *
 * Built on a null-prototype object so that a plain `TOP_LEVEL_KINDS[tag]`
 * lookup cannot hit inherited members such as "constructor" or "toString".
 */
const TOP_LEVEL_KINDS: Record<string, DeclarationKind> = Object.assign<
  Record<string, DeclarationKind>,
  Record<string, DeclarationKind>
>(Object.create(null), {
  element: "element",
  complexType: "complexType",
  simpleType: "simpleType",
  group: "group",
  attributeGroup: "attributeGroup",
  attribute: "attribute",
});
/**
 * Collect the namespace declarations from a schema root's attributes.
 *
 * @param attrs - Attribute name -> value pairs of the xsd:schema element.
 * @returns Prefix -> namespace URI. The default namespace (`xmlns="..."`) is
 *   stored under the empty-string prefix; other attributes are ignored.
 */
function extractNamespacePrefixes(attrs: Record<string, string>): Map<string, string> {
  const prefixes = new Map<string, string>();
  for (const [name, value] of Object.entries(attrs)) {
    if (name === "xmlns") prefixes.set("", value);
    else if (name.startsWith("xmlns:")) prefixes.set(name.slice("xmlns:".length), value);
  }
  return prefixes;
}
/**
 * Collect the xsd:import children of a schema element as ImportEdge records.
 *
 * Only `import` children are collected here. A schemaLocation is resolved
 * relative to the importing file's directory and then expressed relative to
 * schemaDir; imports without a schemaLocation get a null target.
 *
 * @throws Error (from relPath) when a schemaLocation resolves outside schemaDir.
 */
function extractImports(
  schemaNode: PreserveOrderNode,
  schemaDir: string,
  currentAbsPath: string,
): ImportEdge[] {
  const imports: ImportEdge[] = [];
  for (const importNode of eachChildByLocalName(schemaNode, "import")) {
    const a = nodeAttrs(importNode);
    const schemaLocation = a.schemaLocation ?? null;
    let target: string | null = null;
    if (schemaLocation) {
      // ".." steps from the importing file to its directory.
      const importedAbs = resolve(currentAbsPath, "..", schemaLocation);
      target = relPath(schemaDir, importedAbs);
    }
    imports.push({
      namespace: a.namespace ?? "",
      schemaLocation,
      target,
    });
  }
  return imports;
}

/**
 * Add every named top-level declaration of `doc` to `declarationsByQName`.
 *
 * Children whose local tag name is not one of the TOP_LEVEL_KINDS keys, or
 * that carry no `name` attribute, are skipped. Declarations sharing a key are
 * appended to the same array rather than overwriting each other.
 */
function indexTopLevelDeclarations(
  doc: ParsedSchemaDocument,
  declarationsByQName: Map<string, Declaration[]>,
): void {
  for (const child of nodeChildrenLocal(doc.schemaNode)) {
    const tag = nodeTagLocal(child);
    if (!tag) continue;
    const local = stripPrefix(tag);
    // Own-property check: a plain-object lookup would also match inherited
    // keys such as "constructor" and yield a non-kind value.
    if (!Object.prototype.hasOwnProperty.call(TOP_LEVEL_KINDS, local)) continue;
    const kind = TOP_LEVEL_KINDS[local];
    if (!kind) continue;

    const a = nodeAttrs(child);
    const localName = a.name;
    if (!localName) continue;

    const decl: Declaration = {
      kind,
      namespace: doc.targetNamespace,
      vocabularyId: doc.vocabularyId,
      localName,
      documentPath: doc.path,
      node: child,
    };
    const key = declarationQNameKey(doc.targetNamespace, kind, localName);
    const existing = declarationsByQName.get(key);
    if (existing) existing.push(decl);
    else declarationsByQName.set(key, [decl]);
  }
}

// Local helpers (avoid pulling extra exports from ast.ts).

/** Return the first key of a preserveOrder node other than the ":@" attribute bag, or null. */
function nodeTagLocal(node: PreserveOrderNode): string | null {
  for (const k of Object.keys(node)) {
    if (k !== ":@") return k;
  }
  return null;
}

/** Return the child-node array stored under the node's tag key, or [] when it is not an array. */
function nodeChildrenLocal(node: PreserveOrderNode): PreserveOrderNode[] {
  const tag = nodeTagLocal(node);
  if (!tag) return [];
  const v = node[tag];
  return Array.isArray(v) ? (v as PreserveOrderNode[]) : [];
}

/**
 * Express `abs` relative to `base`, refusing anything outside `base`.
 *
 * @throws Error when the result climbs out of `base` ("..", "../x") or is
 *   itself absolute. `relative()` returns an absolute path when no relative
 *   route exists (e.g. different Windows drives), which would otherwise slip
 *   past a ".."-only check.
 */
function relPath(base: string, abs: string): string {
  const r = relative(base, normalize(abs));
  // Guard against escapes outside schemaDir.
  if (r === ".." || r.startsWith(`..${sep}`) || isAbsolute(r)) {
    throw new Error(`Resolved path escapes schemaDir: ${abs} (base ${base})`);
  }
  return r;
}
/** A qname whose prefix was mapped to a namespace URI. */
export interface ResolvedQName {
  prefix: string;
  localName: string;
  namespace: string;
  /** Vocabulary id for the namespace, or null when the URI is not in NAMESPACE_TO_VOCABULARY. */
  vocabularyId: string | null;
}

/** A qname that could not be mapped to a namespace; `raw` keeps the original text. */
export interface UnresolvedQName {
  prefix: string;
  localName: string;
  raw: string;
  reason: "unknown-prefix" | "unknown-namespace";
}

export type QNameResult =
  | { resolved: true; qname: ResolvedQName }
  | { resolved: false; qname: UnresolvedQName };

/**
 * Namespace the `xml` prefix is bound to by definition (Namespaces in XML 1.0);
 * documents may use `xml:*` names without declaring the prefix.
 */
const XML_NAMESPACE_URI = "http://www.w3.org/XML/1998/namespace";

/**
 * Canonical key for the declarationsByQName map.
 * Clark-style namespace prefix plus the kind, e.g.:
 *   {http://schemas.openxmlformats.org/wordprocessingml/2006/main}complexType:CT_Tbl
 */
export function declarationQNameKey(namespace: string, kind: string, localName: string): string {
  return `{${namespace}}${kind}:${localName}`;
}

/**
 * Resolve a qname string ("prefix:localName" or just "localName") in the context
 * of a document's prefix → URI map.
 *
 * - Prefixed names use the map entry for the prefix. The reserved `xml` prefix
 *   falls back to its fixed namespace when the document does not declare it.
 * - Unprefixed names use the empty-prefix entry (xmlns="..." default), falling
 *   back to `defaultNamespace`.
 *
 * Never throws. Returns `{ resolved: false }` with reason "unknown-prefix" when
 * `raw` is empty or no namespace can be determined. A namespace that is found
 * but absent from NAMESPACE_TO_VOCABULARY still resolves, with
 * `vocabularyId: null`, so the caller decides how to treat it.
 *
 * @param raw - attribute value, split at the first ":".
 * @param prefixMap - the document's prefix → namespace URI bindings.
 * @param defaultNamespace - namespace for unprefixed names when no default is bound.
 */
export function resolveQNameAttr(
  raw: string,
  prefixMap: Map<string, string>,
  defaultNamespace: string,
): QNameResult {
  if (!raw) {
    return {
      resolved: false,
      qname: { prefix: "", localName: "", raw, reason: "unknown-prefix" },
    };
  }

  const colon = raw.indexOf(":");
  const prefix = colon >= 0 ? raw.slice(0, colon) : "";
  const localName = colon >= 0 ? raw.slice(colon + 1) : raw;

  let namespace: string | undefined;
  if (prefix) {
    namespace = prefixMap.get(prefix) ?? (prefix === "xml" ? XML_NAMESPACE_URI : undefined);
  } else {
    namespace = prefixMap.get("") ?? defaultNamespace;
  }

  // An empty-string binding counts as "no namespace", same as a missing one.
  if (!namespace) {
    return {
      resolved: false,
      qname: { prefix, localName, raw, reason: "unknown-prefix" },
    };
  }

  // Own-property lookup so URIs like "constructor" cannot pick up inherited members.
  const vocabularyId = Object.prototype.hasOwnProperty.call(NAMESPACE_TO_VOCABULARY, namespace)
    ? (NAMESPACE_TO_VOCABULARY[namespace] ?? null)
    : null;
  return {
    resolved: true,
    qname: { prefix, localName, namespace, vocabularyId },
  };
}
/**
 * fast-xml-parser preserveOrder output:
 *   - Documents are arrays of single-key objects (one per top-level node).
 *   - Each element node has shape { tagName: children[], ":@"?: { "@_attr": value } }.
 *   - Text leaves are { "#text": string }.
 * We type these loosely and use helpers in ast.ts to navigate.
 */
export type PreserveOrderNode = Record<string, unknown>;
export type PreserveOrderDocument = PreserveOrderNode[];

/** A single XSD file, after parsing. */
export interface ParsedSchemaDocument {
  /** Path relative to schemaDir (e.g. "wml.xsd"). */
  path: string;
  absolutePath: string;
  targetNamespace: string;
  /** Stable id derived from targetNamespace via NAMESPACE_TO_VOCABULARY. */
  vocabularyId: string;
  /** The xsd:schema element from the preserveOrder AST; later passes walk it. */
  schemaNode: PreserveOrderNode;
}

/** xsd:import declared on a document. */
export interface ImportEdge {
  namespace: string;
  schemaLocation: string | null;
  /**
   * Relative path of the resolved imported document (within schemaDir),
   * or null when schemaLocation is absent (xml namespace, externally-supplied schemas).
   */
  target: string | null;
}

/**
 * A top-level declaration discovered in a schema (xsd:element, complexType,
 * simpleType, group, attributeGroup, or globally-declared attribute).
 *
 * Top-level declarations are always in the document's targetNamespace; the
 * vocabularyId is therefore the document's vocabularyId.
 */
export type DeclarationKind =
  | "element"
  | "complexType"
  | "simpleType"
  | "group"
  | "attributeGroup"
  | "attribute";

export interface Declaration {
  kind: DeclarationKind;
  namespace: string;
  vocabularyId: string;
  localName: string;
  /** Path (relative to schemaDir) of the document that declares this symbol. */
  documentPath: string;
  /** The declaration's own node in the preserveOrder AST. */
  node: PreserveOrderNode;
}

/**
 * Result of parsing a working set of XSDs.
 *
 *   - documents:           every loaded file, keyed by path relative to schemaDir
 *   - namespaceByPrefix:   per-document prefix → URI maps (each .xsd declares its own)
 *   - importGraph:         per-document outgoing xsd:import edges, with resolved targets
 *   - declarationsByQName: canonical qname (Clark notation + kind) → declarations
 */
export interface ParsedSchemaSet {
  documents: Map<string, ParsedSchemaDocument>;
  namespaceByPrefix: Map<string, Map<string, string>>;
  importGraph: Map<string, ImportEdge[]>;
  declarationsByQName: Map<string, Declaration[]>;
}
/** Namespace URI → canonical vocabulary id. Extend deliberately when a new namespace appears. */
export const NAMESPACE_TO_VOCABULARY: Record<string, string> = {
  // WordprocessingML
  "http://schemas.openxmlformats.org/wordprocessingml/2006/main": "wml-main",

  // SpreadsheetML
  "http://schemas.openxmlformats.org/spreadsheetml/2006/main": "sml-main",

  // PresentationML
  "http://schemas.openxmlformats.org/presentationml/2006/main": "pml-main",

  // DrawingML
  "http://schemas.openxmlformats.org/drawingml/2006/main": "dml-main",
  "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing": "dml-wp",
  "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing": "dml-sp",
  "http://schemas.openxmlformats.org/drawingml/2006/picture": "dml-pic",
  "http://schemas.openxmlformats.org/drawingml/2006/chart": "dml-chart",
  "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing": "dml-chartDrawing",
  "http://schemas.openxmlformats.org/drawingml/2006/diagram": "dml-diagram",
  "http://schemas.openxmlformats.org/drawingml/2006/lockedCanvas": "dml-lockedCanvas",

  // VML (legacy)
  "urn:schemas-microsoft-com:vml": "vml-main",
  "urn:schemas-microsoft-com:office:office": "vml-office",
  "urn:schemas-microsoft-com:office:word": "vml-word",
  "urn:schemas-microsoft-com:office:excel": "vml-excel",
  "urn:schemas-microsoft-com:office:powerpoint": "vml-powerpoint",

  // Shared / officeDocument family
  "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes": "shared-types",
  "http://schemas.openxmlformats.org/officeDocument/2006/math": "shared-math",
  "http://schemas.openxmlformats.org/officeDocument/2006/relationships": "shared-relationships",
  "http://schemas.openxmlformats.org/officeDocument/2006/customXml": "shared-customXml",
  "http://schemas.openxmlformats.org/officeDocument/2006/bibliography": "shared-bibliography",
  "http://schemas.openxmlformats.org/officeDocument/2006/characteristics":
    "shared-additionalCharacteristics",
  "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties":
    "shared-extendedProperties",
  "http://schemas.openxmlformats.org/officeDocument/2006/custom-properties":
    "shared-customProperties",
  "http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes": "shared-docPropsVTypes",

  // Schema library (XML schema references)
  "http://schemas.openxmlformats.org/schemaLibrary/2006/main": "schemaLibrary-main",

  // W3C built-ins
  "http://www.w3.org/XML/1998/namespace": "xml",
  "http://www.w3.org/2001/XMLSchema": "xsd-builtin",
};

/**
 * Look up the vocabulary id for a namespace URI.
 *
 * @param uri - namespace URI exactly as it appears in the schema.
 * @returns the id registered in NAMESPACE_TO_VOCABULARY.
 * @throws Error when the URI is not registered, so new namespaces are added
 *   to the map deliberately instead of landing under an inferred id.
 */
export function vocabularyForNamespace(uri: string): string {
  // Own-property check: a bare index would return inherited members for
  // inputs such as "constructor" or "__proto__" instead of failing.
  const v = Object.prototype.hasOwnProperty.call(NAMESPACE_TO_VOCABULARY, uri)
    ? NAMESPACE_TO_VOCABULARY[uri]
    : undefined;
  if (!v) {
    throw new Error(
      `Unknown namespace URI: ${uri}. Add it to NAMESPACE_TO_VOCABULARY in scripts/ingest-xsd/vocabulary.ts.`,
    );
  }
  return v;
}
/** One entry of data/sources.json; fields map one-to-one onto reference_sources columns. */
interface SourceEntry {
  name: string;
  kind: string;
  edition: string | null;
  version: string | null;
  url: string | null;
  license_note: string | null;
  sha256: string | null;
}

interface Manifest {
  sources: SourceEntry[];
}

/**
 * Check that a parsed JSON value is an object with a non-empty `sources` array.
 * Individual entries are not inspected; a malformed entry surfaces as a
 * database error during the upsert.
 */
function isManifest(value: unknown): value is Manifest {
  if (typeof value !== "object" || value === null) return false;
  const sources = (value as { sources?: unknown }).sources;
  return Array.isArray(sources) && sources.length > 0;
}

/**
 * Sync reference_sources from ./data/sources.json, then backfill
 * spec_content.source_id.
 *
 * Exits with status 1 when DATABASE_URL is unset or the manifest has no
 * usable `sources` array. Rows are upserted on the `name` conflict target;
 * an existing sha256 is kept when the manifest entry supplies none.
 * The database client is always closed, including on failure.
 */
async function main() {
  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    console.error("Missing DATABASE_URL environment variable");
    process.exit(1);
  }

  const manifestPath = "./data/sources.json";
  const raw = await Bun.file(manifestPath).text();
  // Validate before use: a bare cast would crash with a TypeError on e.g. `null`.
  const manifest: unknown = JSON.parse(raw);

  if (!isManifest(manifest)) {
    console.error(`Invalid manifest at ${manifestPath}: 'sources' must be a non-empty array`);
    process.exit(1);
  }

  console.log(`Syncing ${manifest.sources.length} source(s) from ${manifestPath}`);

  const db = createDbClient(databaseUrl);
  const sql = db.sql;

  try {
    for (const s of manifest.sources) {
      // xmax is non-zero for a row that the ON CONFLICT branch updated.
      const [row] = await sql<[{ id: number; existed: boolean }]>`
        INSERT INTO reference_sources (name, kind, edition, version, url, license_note, sha256)
        VALUES (${s.name}, ${s.kind}, ${s.edition}, ${s.version}, ${s.url}, ${s.license_note}, ${s.sha256})
        ON CONFLICT (name) DO UPDATE
          SET kind = EXCLUDED.kind,
              edition = EXCLUDED.edition,
              version = EXCLUDED.version,
              url = EXCLUDED.url,
              license_note = EXCLUDED.license_note,
              sha256 = COALESCE(EXCLUDED.sha256, reference_sources.sha256)
        RETURNING id, (xmax <> 0) AS existed
      `;
      console.log(
        `  ${row.existed ? "updated " : "inserted"} ${s.name} (id=${row.id}, edition=${s.edition ?? "null"})`,
      );
    }

    // Backfill spec_content.source_id by part_number to the matching
    // ecma-376-partN row. Idempotent: only touches rows where source_id IS NULL.
    for (let part = 1; part <= 4; part++) {
      const sourceName = `ecma-376-part${part}`;
      const [src] = await sql<[{ id: number } | undefined]>`
        SELECT id FROM reference_sources WHERE name = ${sourceName} LIMIT 1
      `;
      // No source row for this part: nothing to point spec_content at.
      if (!src) continue;
      const result = await sql`
        UPDATE spec_content SET source_id = ${src.id}
        WHERE part_number = ${part} AND source_id IS NULL
      `;
      if (result.count > 0) {
        console.log(
          `Backfilled ${result.count} spec_content row(s) (part ${part}) -> source_id=${src.id}`,
        );
      }
    }
  } finally {
    await db.close();
  }
}

main().catch((err) => {
  console.error("Sync failed:", err);
  process.exit(1);
});
beforeEach(async () => {
  // Wipe phase-2 tables; spec_content / reference_sources untouched.
  await db.sql.unsafe(TRUNCATE_SQL);
});

/**
 * Assert that awaiting `fn()` rejects.
 *
 * expect(promise).rejects.toThrow() doesn't trigger the postgres library's lazy
 * query execution reliably; using an explicit try/catch instead.
 * Any rejection counts; the error itself is not inspected.
 */
async function expectThrows(fn: () => PromiseLike<unknown>): Promise<void> {
  let threw = false;
  try {
    await fn();
  } catch {
    threw = true;
  }
  expect(threw).toBe(true);
}

test("xsd_symbols enforces canonical identity (vocabulary_id, local_name, kind)", async () => {
  await db.sql`INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'tbl', 'element')`;

  await expectThrows(
    () => db.sql`INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'tbl', 'element')`,
  );

  // Same name, different kind is allowed (an element and complexType can share names).
  await db.sql`INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'tbl', 'complexType')`;
});

test("xsd_compositors CHECK constraints", async () => {
  const [profile] = await db.sql`INSERT INTO xsd_profiles (name) VALUES ('test-profile') RETURNING id`;
  const [symbol] = await db.sql`
    INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'CT_Tbl', 'complexType') RETURNING id
  `;

  // Top-level: parent_symbol_id only.
  const [topLevel] = await db.sql`
    INSERT INTO xsd_compositors (parent_symbol_id, profile_id, kind)
    VALUES (${symbol.id}, ${profile.id}, 'sequence')
    RETURNING id
  `;

  // Nested: parent_compositor_id only.
  await db.sql`
    INSERT INTO xsd_compositors (parent_compositor_id, profile_id, kind)
    VALUES (${topLevel.id}, ${profile.id}, 'choice')
  `;

  // kind must be sequence/choice/all.
  await expectThrows(() => db.sql`
    INSERT INTO xsd_compositors (parent_symbol_id, profile_id, kind)
    VALUES (${symbol.id}, ${profile.id}, 'group')
  `);

  // Neither parent set is rejected.
  await expectThrows(
    () => db.sql`INSERT INTO xsd_compositors (profile_id, kind) VALUES (${profile.id}, 'sequence')`,
  );

  // Both parents set is rejected (top-level vs nested are mutually exclusive).
  await expectThrows(() => db.sql`
    INSERT INTO xsd_compositors (parent_symbol_id, parent_compositor_id, profile_id, kind)
    VALUES (${symbol.id}, ${topLevel.id}, ${profile.id}, 'sequence')
  `);
});

test("xsd_attr_edges attr_use enum and default", async () => {
  const [profile] = await db.sql`INSERT INTO xsd_profiles (name) VALUES ('test-profile') RETURNING id`;
  const [symbol] = await db.sql`
    INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'CT_Tbl', 'complexType') RETURNING id
  `;

  // attr_use omitted: the column default applies.
  const [defaulted] = await db.sql`
    INSERT INTO xsd_attr_edges (symbol_id, local_name, profile_id)
    VALUES (${symbol.id}, 'someAttr', ${profile.id})
    RETURNING attr_use
  `;
  expect(defaulted.attr_use).toBe("optional");

  await db.sql`
    INSERT INTO xsd_attr_edges (symbol_id, local_name, profile_id, attr_use)
    VALUES (${symbol.id}, 'requiredAttr', ${profile.id}, 'required')
  `;

  await expectThrows(() => db.sql`
    INSERT INTO xsd_attr_edges (symbol_id, local_name, profile_id, attr_use)
    VALUES (${symbol.id}, 'badAttr', ${profile.id}, 'whatever')
  `);
});

test("behavior_notes claim_type enum is enforced", async () => {
  await db.sql`
    INSERT INTO behavior_notes (app, claim_type, summary)
    VALUES ('Word', 'ignores', 'test')
  `;

  await expectThrows(() => db.sql`
    INSERT INTO behavior_notes (app, claim_type, summary)
    VALUES ('Word', 'does_something', 'test')
  `);
});

test("xsd_inheritance_edges allows one base per (symbol, profile)", async () => {
  const [profile] = await db.sql`INSERT INTO xsd_profiles (name) VALUES ('test-profile') RETURNING id`;
  const [derived] = await db.sql`
    INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'CT_Derived', 'complexType') RETURNING id
  `;
  const [base1] = await db.sql`
    INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'CT_Base1', 'complexType') RETURNING id
  `;
  const [base2] = await db.sql`
    INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'CT_Base2', 'complexType') RETURNING id
  `;

  await db.sql`
    INSERT INTO xsd_inheritance_edges (symbol_id, base_symbol_id, profile_id, relation)
    VALUES (${derived.id}, ${base1.id}, ${profile.id}, 'extension')
  `;

  // A second base for the same (symbol, profile) must be rejected.
  await expectThrows(() => db.sql`
    INSERT INTO xsd_inheritance_edges (symbol_id, base_symbol_id, profile_id, relation)
    VALUES (${derived.id}, ${base2.id}, ${profile.id}, 'restriction')
  `);
});

test("CASCADE delete cleans up dependent rows", async () => {
  const [profile] = await db.sql`INSERT INTO xsd_profiles (name) VALUES ('test-profile') RETURNING id`;
  const [namespace] = await db.sql`INSERT INTO xsd_namespaces (uri) VALUES ('http://example.com/test') RETURNING id`;
  const [symbol] = await db.sql`
    INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'w:tbl', 'element') RETURNING id
  `;

  await db.sql`
    INSERT INTO xsd_symbol_profiles (symbol_id, profile_id, namespace_id)
    VALUES (${symbol.id}, ${profile.id}, ${namespace.id})
  `;
  await db.sql`
    INSERT INTO xsd_compositors (parent_symbol_id, profile_id, kind)
    VALUES (${symbol.id}, ${profile.id}, 'sequence')
  `;

  await db.sql`DELETE FROM xsd_symbols WHERE id = ${symbol.id}`;

  const [remainingProfiles] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_symbol_profiles WHERE symbol_id = ${symbol.id}`;
  const [remainingCompositors] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_compositors WHERE parent_symbol_id = ${symbol.id}`;
  expect(remainingProfiles.c).toBe(0);
  expect(remainingCompositors.c).toBe(0);
});

test("realistic insert and lookup: 'children of w:tbl in profile transitional'", async () => {
  const [profile] = await db.sql`INSERT INTO xsd_profiles (name) VALUES ('transitional') RETURNING id`;
  const [namespace] = await db.sql`
    INSERT INTO xsd_namespaces (uri) VALUES ('http://schemas.openxmlformats.org/wordprocessingml/2006/main') RETURNING id
  `;
  const [tbl] = await db.sql`
    INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'tbl', 'element') RETURNING id
  `;
  const [tblPr] = await db.sql`
    INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'tblPr', 'element') RETURNING id
  `;
  const [tblGrid] = await db.sql`
    INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'tblGrid', 'element') RETURNING id
  `;

  await db.sql`
    INSERT INTO xsd_symbol_profiles (symbol_id, profile_id, namespace_id)
    VALUES (${tbl.id}, ${profile.id}, ${namespace.id})
  `;

  const [seq] = await db.sql`
    INSERT INTO xsd_compositors (parent_symbol_id, profile_id, kind)
    VALUES (${tbl.id}, ${profile.id}, 'sequence')
    RETURNING id
  `;

  await db.sql`
    INSERT INTO xsd_child_edges (parent_symbol_id, compositor_id, child_symbol_id, profile_id, min_occurs, max_occurs, order_index)
    VALUES
      (${tbl.id}, ${seq.id}, ${tblPr.id}, ${profile.id}, 1, 1, 0),
      (${tbl.id}, ${seq.id}, ${tblGrid.id}, ${profile.id}, 1, 1, 1)
  `;

  const children = await db.sql`
    SELECT s.local_name, e.min_occurs, e.max_occurs, e.order_index
    FROM xsd_child_edges e
    JOIN xsd_symbols s ON s.id = e.child_symbol_id
    WHERE e.parent_symbol_id = ${tbl.id} AND e.profile_id = ${profile.id}
    ORDER BY e.order_index
  `;

  expect(children).toHaveLength(2);
  expect(children[0]).toMatchObject({ local_name: "tblPr", min_occurs: 1, max_occurs: 1, order_index: 0 });
  expect(children[1]).toMatchObject({ local_name: "tblGrid", min_occurs: 1, max_occurs: 1, order_index: 1 });
});
test("xsd_namespaces and xsd_profiles have unique constraints on natural keys", async () => {
  // Each insert is issued twice: the first must succeed, the repeat must be rejected.
  const insertProfile = () => db.sql`INSERT INTO xsd_profiles (name) VALUES ('transitional')`;
  const insertNamespace = () => db.sql`INSERT INTO xsd_namespaces (uri) VALUES ('http://example.com/x')`;

  await insertProfile();
  await expectThrows(insertProfile);

  await insertNamespace();
  await expectThrows(insertNamespace);
});
const FIXTURES_DIR = join(import.meta.dir, "fixtures");
const REAL_CACHE_DIR = "./data/xsd-cache/ecma-376-transitional";
// True only when the real ECMA XSDs have been fetched into the local cache.
const realCacheReady = existsSync(join(REAL_CACHE_DIR, "wml.xsd"));

import { getTestDatabaseUrl } from "../test-db.ts";

const databaseUrl = getTestDatabaseUrl();

let db: DbClient;

const TRUNCATE_SQL = `
  TRUNCATE
    behavior_notes,
    xsd_enums,
    xsd_inheritance_edges,
    xsd_group_edges,
    xsd_attr_edges,
    xsd_child_edges,
    xsd_compositors,
    xsd_symbol_profiles,
    xsd_symbols,
    xsd_namespaces,
    xsd_profiles
  RESTART IDENTITY CASCADE
`;

beforeAll(async () => {
  db = createDbClient(databaseUrl);
  // Make sure ecma-376-transitional row exists; the ingest looks it up by name.
  await db.sql`
    INSERT INTO reference_sources (name, kind)
    VALUES ('ecma-376-transitional', 'xsd')
    ON CONFLICT (name) DO NOTHING
  `;
});

afterAll(async () => {
  await db.sql.unsafe(TRUNCATE_SQL);
  await db.close();
});

// Tables are truncated both before and after every test, so each test starts
// empty and leaves nothing behind even if it fails midway.
beforeEach(async () => {
  await db.sql.unsafe(TRUNCATE_SQL);
});

afterEach(async () => {
  await db.sql.unsafe(TRUNCATE_SQL);
});

const WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
const SHARED_TYPES_NS = "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes";

/**
 * Ingest the fixture schemas with the options shared by the tests below.
 * `db` is read at call time, after beforeAll has created the client.
 */
function ingestFixtures() {
  return ingestSchemaSet({
    schemaDir: FIXTURES_DIR,
    entrypoints: ["main.xsd"],
    profileName: "transitional",
    sourceName: "ecma-376-transitional",
    db,
  });
}

test("ingest writes symbols, namespaces, memberships, and the transitional profile", async () => {
  const stats = await ingestFixtures();

  expect(stats.documents).toBe(2);

  // Profile bootstrapped.
  const [profile] = await db.sql`SELECT id, name FROM xsd_profiles WHERE name = 'transitional'`;
  expect(profile?.name).toBe("transitional");

  // Both fixture target namespaces present.
  const namespaces = await db.sql`SELECT uri FROM xsd_namespaces ORDER BY uri`;
  const uris = namespaces.map((r: { uri: string }) => r.uri);
  expect(uris).toContain(WML_NS);
  expect(uris).toContain(SHARED_TYPES_NS);

  // Symbol count matches fixture: 1 element + 4 complexType + 3 simpleType +
  // 1 group + 1 attributeGroup = 10 (plus 1 xsd-builtin auto-created for restrictions).
  // Asserted as a lower bound only.
  const [symbolCount] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_symbols`;
  expect(symbolCount.c).toBeGreaterThanOrEqual(10);

  // CT_Para is in wml-main / transitional.
  const [ctPara] = await db.sql`
    SELECT s.id, s.vocabulary_id, s.kind, sp.profile_id, sp.namespace_id
    FROM xsd_symbols s
    JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id
    WHERE s.local_name = 'CT_Para' AND s.kind = 'complexType'
  `;
  expect(ctPara?.vocabulary_id).toBe("wml-main");

  // ST_OnOff is in shared-types via the imported schema.
  const [stOnOff] = await db.sql`
    SELECT s.vocabulary_id FROM xsd_symbols s
    WHERE s.local_name = 'ST_OnOff' AND s.kind = 'simpleType'
  `;
  expect(stOnOff?.vocabulary_id).toBe("shared-types");
});

test("ingest writes inheritance edges for extension and restriction", async () => {
  const stats = await ingestFixtures();

  // Fixture inheritance:
  //   CT_Extended extends CT_Empty (complexContent)
  //   CT_Restricted restricts CT_Para (complexContent)
  //   ST_Jc restricts xsd:string (simpleType)
  //   ST_OnOff restricts xsd:boolean
  //   ST_String restricts xsd:string
  // 6 from the original fixture + 2 new restrictions (CT_TrackedRestricted,
  // CT_OverrideDerived).
  // NOTE(review): the list above names five original edges, not six; confirm
  // the sixth against the fixture and add it here.
  expect(stats.inheritanceEdgesInserted).toBe(8);
  expect(stats.inheritanceUnresolved).toBe(0);

  // Verify the CT_Extended → CT_Empty extension edge.
  const [ext] = await db.sql`
    SELECT base.local_name AS base_name, e.relation
    FROM xsd_inheritance_edges e
    JOIN xsd_symbols child ON child.id = e.symbol_id
    JOIN xsd_symbols base ON base.id = e.base_symbol_id
    WHERE child.local_name = 'CT_Extended'
  `;
  expect(ext?.base_name).toBe("CT_Empty");
  expect(ext?.relation).toBe("extension");

  // Verify CT_Restricted → CT_Para restriction.
  const [restr] = await db.sql`
    SELECT base.local_name AS base_name, e.relation
    FROM xsd_inheritance_edges e
    JOIN xsd_symbols child ON child.id = e.symbol_id
    JOIN xsd_symbols base ON base.id = e.base_symbol_id
    WHERE child.local_name = 'CT_Restricted'
  `;
  expect(restr?.base_name).toBe("CT_Para");
  expect(restr?.relation).toBe("restriction");

  // xsd-builtin placeholder symbol auto-created for the simpleType restrictions.
  const [builtin] = await db.sql`
    SELECT COUNT(*)::int AS c FROM xsd_symbols WHERE vocabulary_id = 'xsd-builtin'
  `;
  expect(builtin.c).toBeGreaterThan(0);
});

test("ingest is idempotent: re-running adds no new symbols/edges", async () => {
  const first = await ingestFixtures();
  const second = await ingestFixtures();

  // Re-ingest purges everything this source previously wrote and re-creates
  // it, so every stat equals the first run and symbolsExisting stays at 0.
  // What matters for idempotency is that the DB row counts are stable across
  // runs (asserted below).
  expect(second.symbolsInserted).toBe(first.symbolsInserted);
  expect(second.symbolsExisting).toBe(0);
  expect(second.profileMembershipsInserted).toBe(first.profileMembershipsInserted);
  expect(second.inheritanceEdgesInserted).toBe(first.inheritanceEdgesInserted);
  expect(second.compositorsInserted).toBe(first.compositorsInserted);
  expect(second.childEdgesInserted).toBe(first.childEdgesInserted);
  expect(second.groupRefsInserted).toBe(first.groupRefsInserted);
  expect(second.attrEdgesInserted).toBe(first.attrEdgesInserted);
  expect(second.attrGroupRefsInserted).toBe(first.attrGroupRefsInserted);
  expect(second.enumsInserted).toBe(first.enumsInserted);

  // Row counts unchanged between first and second runs.
  const [c1] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_symbols`;
  const [c2] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_symbol_profiles`;
  const [c3] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_inheritance_edges`;
  const [c4] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_compositors`;
  const [c5] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_child_edges`;
  const [c6] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_group_edges`;
  const [c7] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_attr_edges`;
  const [c8] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_enums`;
  expect(c1.c).toBe(first.symbolsInserted);
  expect(c2.c).toBe(first.profileMembershipsInserted);
  expect(c3.c).toBe(first.inheritanceEdgesInserted);
  expect(c4.c).toBe(first.compositorsInserted);
  expect(c5.c).toBe(first.childEdgesInserted);
  // xsd_group_edges holds both ref_kind='group' and ref_kind='attributeGroup'.
  expect(c6.c).toBe(first.groupRefsInserted + first.attrGroupRefsInserted);
  expect(c7.c).toBe(first.attrEdgesInserted);
  expect(c8.c).toBe(first.enumsInserted);
});
+ const ctParaChildren = await db.sql` + SELECT s.local_name, e.min_occurs, e.max_occurs, e.order_index, c.kind AS compositor_kind + FROM xsd_child_edges e + JOIN xsd_symbols s ON s.id = e.child_symbol_id + JOIN xsd_compositors c ON c.id = e.compositor_id + JOIN xsd_symbols parent ON parent.id = e.parent_symbol_id + WHERE parent.local_name = 'CT_Para' AND parent.kind = 'complexType' + ORDER BY e.order_index + `; + expect(ctParaChildren).toHaveLength(1); + expect(ctParaChildren[0]).toMatchObject({ + local_name: "text", + min_occurs: 1, + max_occurs: 1, + order_index: 0, + compositor_kind: "sequence", + }); + + // CT_Body: top sequence + nested choice. Two compositors for CT_Body. + const ctBodyCompositors = await db.sql` + SELECT c.kind, c.parent_symbol_id, c.parent_compositor_id, c.min_occurs, c.max_occurs, c.order_index + FROM xsd_compositors c + JOIN xsd_symbols s ON s.id = c.parent_symbol_id + WHERE s.local_name = 'CT_Body' AND s.kind = 'complexType' + ORDER BY c.order_index + `; + // Only the TOP-level compositor has parent_symbol_id set; nested has parent_compositor_id. + expect(ctBodyCompositors).toHaveLength(1); + expect(ctBodyCompositors[0]).toMatchObject({ kind: "sequence", min_occurs: 1, max_occurs: 1 }); + const topId: number = ctBodyCompositors[0].id ?? null; + void topId; + + const nestedCompositors = await db.sql` + SELECT c.kind, c.min_occurs, c.max_occurs, c.parent_compositor_id + FROM xsd_compositors c + JOIN xsd_compositors parent ON parent.id = c.parent_compositor_id + JOIN xsd_symbols owner ON owner.id = parent.parent_symbol_id + WHERE owner.local_name = 'CT_Body' + `; + expect(nestedCompositors).toHaveLength(1); + expect(nestedCompositors[0]).toMatchObject({ + kind: "choice", + min_occurs: 0, + max_occurs: null, // unbounded + }); + + // CT_Body's top sequence has 1 child edge (ref="document"). The break element is + // inside the nested choice, not the top sequence. 
+ const ctBodyTopChildren = await db.sql` + SELECT s.local_name, e.order_index + FROM xsd_child_edges e + JOIN xsd_symbols s ON s.id = e.child_symbol_id + JOIN xsd_compositors c ON c.id = e.compositor_id + JOIN xsd_symbols parent ON parent.id = c.parent_symbol_id + WHERE parent.local_name = 'CT_Body' AND c.kind = 'sequence' + ORDER BY e.order_index + `; + expect(ctBodyTopChildren).toHaveLength(1); + expect(ctBodyTopChildren[0].local_name).toBe("document"); + + // CT_Body's nested choice has 1 child edge (local element "break"); the group ref + // goes to xsd_group_edges, not child_edges. + const ctBodyNestedChildren = await db.sql` + SELECT s.local_name + FROM xsd_child_edges e + JOIN xsd_symbols s ON s.id = e.child_symbol_id + JOIN xsd_compositors c ON c.id = e.compositor_id + WHERE c.kind = 'choice' AND c.parent_compositor_id IS NOT NULL + `; + const names = ctBodyNestedChildren.map((r: { local_name: string }) => r.local_name); + expect(names).toContain("break"); + + // Group ref for EG_PContent under CT_Body. 
+ const groupEdges = await db.sql` + SELECT g.local_name AS group_name, ref_kind + FROM xsd_group_edges ge + JOIN xsd_symbols parent ON parent.id = ge.parent_symbol_id + JOIN xsd_symbols g ON g.id = ge.group_symbol_id + WHERE parent.local_name = 'CT_Body' + `; + expect(groupEdges).toHaveLength(1); + expect(groupEdges[0]).toMatchObject({ group_name: "EG_PContent", ref_kind: "group" }); +}); + +test("ingest writes attributes, attributeGroup refs, and enum values", async () => { + const stats = await ingestSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + + // Fixture attributes: + // CT_Para/bold (optional, type s:ST_OnOff) + // CT_Extended/extra (optional, type xsd:string, under complexContent/extension) + // AG_TableProps/cols (optional, type xsd:int) + // CT_TableUser/caption (required, type xsd:string) + // CT_RefTest/space (required, ref="s:space"; type/default copied from decl) + // AG_Inner/innerAttr (optional, type xsd:string) + // AG_Outer/outerAttr (optional, type xsd:string) + // CT_TrackedBase/id (required, type xsd:string) + // CT_TrackedBase/author (optional, type xsd:string) + // CT_OverrideDerived/id (optional override, type xsd:string) + expect(stats.attrEdgesInserted).toBe(10); + expect(stats.attrEdgesUnresolved).toBe(0); + + // Fixture attributeGroup refs: + // CT_TableUser -> AG_TableProps + // AG_Outer -> AG_Inner (nested attributeGroup ref) + // CT_NestedAttrUser -> AG_Outer + expect(stats.attrGroupRefsInserted).toBe(3); + expect(stats.attrGroupRefsUnresolved).toBe(0); + + // Fixture enums: ST_Jc has 3 values; ST_OnOff and ST_String have base restrictions + // without xsd:enumeration children, so 0 enum values from those. + expect(stats.enumsInserted).toBe(3); + + // CT_Para/bold attribute resolves to s:ST_OnOff in shared-types namespace. 
+ const [bold] = await db.sql` + SELECT a.local_name, a.attr_use, a.type_ref + FROM xsd_attr_edges a + JOIN xsd_symbols s ON s.id = a.symbol_id + WHERE s.local_name = 'CT_Para' AND a.local_name = 'bold' + `; + expect(bold?.attr_use).toBe("optional"); + expect(bold?.type_ref).toBe( + "{http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes}ST_OnOff", + ); + + // CT_Extended/extra is on complexContent/extension. + const [extra] = await db.sql` + SELECT a.local_name, a.attr_use + FROM xsd_attr_edges a + JOIN xsd_symbols s ON s.id = a.symbol_id + WHERE s.local_name = 'CT_Extended' AND a.local_name = 'extra' + `; + expect(extra?.attr_use).toBe("optional"); + + // CT_TableUser/caption is required. + const [caption] = await db.sql` + SELECT a.local_name, a.attr_use + FROM xsd_attr_edges a + JOIN xsd_symbols s ON s.id = a.symbol_id + WHERE s.local_name = 'CT_TableUser' AND a.local_name = 'caption' + `; + expect(caption?.attr_use).toBe("required"); + + // CT_TableUser has an attributeGroup ref to AG_TableProps. + const agRefs = await db.sql` + SELECT g.local_name AS group_name + FROM xsd_group_edges ge + JOIN xsd_symbols parent ON parent.id = ge.parent_symbol_id + JOIN xsd_symbols g ON g.id = ge.group_symbol_id + WHERE parent.local_name = 'CT_TableUser' AND ge.ref_kind = 'attributeGroup' + `; + expect(agRefs).toHaveLength(1); + expect(agRefs[0].group_name).toBe("AG_TableProps"); + + // ST_Jc enum values, in declared order. 
+ const enumValues = await db.sql` + SELECT e.value, e.order_index + FROM xsd_enums e + JOIN xsd_symbols s ON s.id = e.symbol_id + WHERE s.local_name = 'ST_Jc' AND s.kind = 'simpleType' + ORDER BY e.order_index + `; + expect(enumValues.map((r: { value: string }) => r.value)).toEqual(["left", "center", "right"]); +}); + +test("ingest preserves element/attribute @type, local-element profile membership, and group-ref compositor context", async () => { + await ingestSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + + // Top-level element: + // type_ref must point at CT_Empty in wml-main. + const [docSym] = await db.sql` + SELECT type_ref FROM xsd_symbols + WHERE local_name = 'document' AND kind = 'element' AND vocabulary_id = 'wml-main' + `; + expect(docSym?.type_ref).toBe( + "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}CT_Empty", + ); + + // Local element: inside CT_Para. + // Should have type_ref AND profile membership so ooxml_lookup_element finds it. + const [textSym] = await db.sql` + SELECT s.id, s.type_ref FROM xsd_symbols s + WHERE s.local_name = 'text' AND s.kind = 'element' AND s.vocabulary_id = 'wml-main' + `; + expect(textSym?.type_ref).toBe("{http://www.w3.org/2001/XMLSchema}string"); + + const [textMembership] = await db.sql` + SELECT sp.id FROM xsd_symbol_profiles sp + JOIN xsd_profiles p ON p.id = sp.profile_id + WHERE sp.symbol_id = ${textSym.id} AND p.name = 'transitional' + `; + expect(textMembership?.id).toBeDefined(); + + // Group ref inside a nested choice (CT_Body's choice contains ). + // compositor_id must point at the choice, not be null. Min/max occurs default to 1 + // since the ref itself has no minOccurs/maxOccurs in our fixture. 
+ const [groupRef] = await db.sql` + SELECT ge.compositor_id, ge.min_occurs, ge.max_occurs, c.kind AS compositor_kind, + c.parent_compositor_id IS NOT NULL AS is_nested + FROM xsd_group_edges ge + JOIN xsd_compositors c ON c.id = ge.compositor_id + JOIN xsd_symbols g ON g.id = ge.group_symbol_id + JOIN xsd_symbols parent ON parent.id = ge.parent_symbol_id + WHERE parent.local_name = 'CT_Body' AND g.local_name = 'EG_PContent' + `; + expect(groupRef?.compositor_id).toBeDefined(); + expect(groupRef?.compositor_kind).toBe("choice"); + expect(groupRef?.is_nested).toBe(true); + + // Attribute ref: inside CT_RefTest. + // type_ref and default_value must be recovered from the top-level . + // attr_use must come from the ref site (required, not the declaration's optional default). + const [refAttr] = await db.sql` + SELECT a.local_name, a.attr_use, a.default_value, a.type_ref, + a.attr_symbol_id IS NOT NULL AS has_attr_sym + FROM xsd_attr_edges a + JOIN xsd_symbols s ON s.id = a.symbol_id + WHERE s.local_name = 'CT_RefTest' AND s.kind = 'complexType' + `; + expect(refAttr?.local_name).toBe("space"); + expect(refAttr?.attr_use).toBe("required"); + expect(refAttr?.default_value).toBe("preserve"); + expect(refAttr?.type_ref).toBe("{http://www.w3.org/2001/XMLSchema}string"); + expect(refAttr?.has_attr_sym).toBe(true); +}); + +test.skipIf(!realCacheReady)( + "smoke: ingest WML closure into the dev DB and verify counts", + async () => { + // Real WML ingest writes thousands of rows; bump timeout from default 5s. + const stats = await ingestSchemaSet({ + schemaDir: REAL_CACHE_DIR, + entrypoints: ["wml.xsd"], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + + // Real WML closure has 12 documents. 
+ expect(stats.documents).toBe(12); + expect(stats.symbolsInserted).toBeGreaterThan(1300); + expect(stats.inheritanceEdgesInserted).toBeGreaterThan(300); + expect(stats.compositorsInserted).toBeGreaterThan(500); + expect(stats.childEdgesInserted).toBeGreaterThan(1000); + expect(stats.groupRefsInserted).toBeGreaterThan(20); + expect(stats.childEdgesUnresolved).toBe(0); + expect(stats.groupRefsUnresolved).toBe(0); + // Attribute / attributeGroup / enum coverage: + expect(stats.attrEdgesInserted).toBeGreaterThan(500); + expect(stats.attrGroupRefsInserted).toBeGreaterThan(10); + expect(stats.enumsInserted).toBeGreaterThan(200); + // A handful of attribute refs target namespaces with no schemaLocation + // (notably xml:space / xml:lang). They resolve to the xml namespace but + // have no symbol because we don't load XSD's xml namespace schema. + expect(stats.attrEdgesUnresolved).toBeLessThan(10); + expect(stats.attrGroupRefsUnresolved).toBe(0); + + // w:tbl is the global element; its content type is CT_Tbl. Verify CT_Tbl has children. + const ctTblChildren = await db.sql` + SELECT s.local_name FROM xsd_child_edges e + JOIN xsd_symbols s ON s.id = e.child_symbol_id + JOIN xsd_symbols parent ON parent.id = e.parent_symbol_id + WHERE parent.local_name = 'CT_Tbl' AND parent.vocabulary_id = 'wml-main' + ORDER BY e.order_index + `; + expect(ctTblChildren.length).toBeGreaterThan(0); + }, + 30_000, +); diff --git a/tests/ingest-xsd/parse-schema.test.ts b/tests/ingest-xsd/parse-schema.test.ts new file mode 100644 index 0000000..0e4384d --- /dev/null +++ b/tests/ingest-xsd/parse-schema.test.ts @@ -0,0 +1,156 @@ +/** + * Parser scaffolding tests. + * + * Primary tests use tiny fixture XSDs to keep the suite fast and independent + * of the local cache. One optional smoke test runs against the real + * data/xsd-cache/ecma-376-transitional/ if present. 
+ */ + +import { existsSync } from "node:fs"; +import { join } from "node:path"; +import { expect, test } from "bun:test"; +import { parseSchemaSet } from "../../scripts/ingest-xsd/parse-schema.ts"; +import { declarationQNameKey, resolveQNameAttr } from "../../scripts/ingest-xsd/qname.ts"; +import type { Declaration, DeclarationKind } from "../../scripts/ingest-xsd/types.ts"; + +const FIXTURES_DIR = join(import.meta.dir, "fixtures"); +const REAL_CACHE_DIR = "./data/xsd-cache/ecma-376-transitional"; +const realCacheReady = existsSync(join(REAL_CACHE_DIR, "wml.xsd")); + +const WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; +const SHARED_TYPES_NS = "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes"; +const XSD_NS = "http://www.w3.org/2001/XMLSchema"; + +function countByKind(decls: Map): Record { + const out: Record = { + element: 0, + complexType: 0, + simpleType: 0, + group: 0, + attributeGroup: 0, + attribute: 0, + }; + for (const arr of decls.values()) { + for (const d of arr) out[d.kind]++; + } + return out; +} + +test("parseSchemaSet loads fixtures and follows imports transitively", async () => { + const set = await parseSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + }); + + expect(set.documents.size).toBe(2); + expect(set.documents.has("main.xsd")).toBe(true); + expect(set.documents.has("shared.xsd")).toBe(true); + + const main = set.documents.get("main.xsd"); + expect(main?.targetNamespace).toBe(WML_NS); + expect(main?.vocabularyId).toBe("wml-main"); + + const shared = set.documents.get("shared.xsd"); + expect(shared?.targetNamespace).toBe(SHARED_TYPES_NS); + expect(shared?.vocabularyId).toBe("shared-types"); +}); + +test("namespaceByPrefix is per-document and captures default + named prefixes", async () => { + const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] }); + const mainPrefixes = set.namespaceByPrefix.get("main.xsd"); + 
expect(mainPrefixes?.get("")).toBe(WML_NS); + expect(mainPrefixes?.get("s")).toBe(SHARED_TYPES_NS); + expect(mainPrefixes?.get("xsd")).toBe(XSD_NS); + + // shared.xsd has its own prefix map. + const sharedPrefixes = set.namespaceByPrefix.get("shared.xsd"); + expect(sharedPrefixes?.get("")).toBe(SHARED_TYPES_NS); + expect(sharedPrefixes?.has("s")).toBe(false); +}); + +test("importGraph resolves schemaLocation to relative target paths", async () => { + const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] }); + const mainImports = set.importGraph.get("main.xsd"); + expect(mainImports).toHaveLength(1); + expect(mainImports?.[0]).toMatchObject({ + namespace: SHARED_TYPES_NS, + schemaLocation: "shared.xsd", + target: "shared.xsd", + }); + + expect(set.importGraph.get("shared.xsd")).toEqual([]); +}); + +test("declarationsByQName indexes all top-level declarations across documents", async () => { + const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] }); + + const counts = countByKind(set.declarationsByQName); + // main.xsd: 1 element, 16 complexType, 1 simpleType, 1 group, 3 attributeGroup + // shared.xsd: 2 simpleType, 1 attribute + expect(counts.element).toBe(1); + expect(counts.complexType).toBe(16); + expect(counts.simpleType).toBe(3); + expect(counts.group).toBe(1); + expect(counts.attributeGroup).toBe(3); + expect(counts.attribute).toBe(1); + + // Specific decl lookup by canonical key. 
+ const ctPara = set.declarationsByQName.get(declarationQNameKey(WML_NS, "complexType", "CT_Para")); + expect(ctPara).toHaveLength(1); + expect(ctPara?.[0].vocabularyId).toBe("wml-main"); + + const stOnOff = set.declarationsByQName.get( + declarationQNameKey(SHARED_TYPES_NS, "simpleType", "ST_OnOff"), + ); + expect(stOnOff).toHaveLength(1); + expect(stOnOff?.[0].documentPath).toBe("shared.xsd"); +}); + +test("resolveQNameAttr: prefixed, unprefixed, and unresolved", async () => { + const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] }); + const prefixes = set.namespaceByPrefix.get("main.xsd"); + if (!prefixes) throw new Error("missing prefix map for fixture"); + + const r1 = resolveQNameAttr("s:ST_OnOff", prefixes, WML_NS); + expect(r1.resolved).toBe(true); + if (r1.resolved) { + expect(r1.qname.namespace).toBe(SHARED_TYPES_NS); + expect(r1.qname.localName).toBe("ST_OnOff"); + expect(r1.qname.vocabularyId).toBe("shared-types"); + } + + const r2 = resolveQNameAttr("CT_Para", prefixes, WML_NS); + expect(r2.resolved).toBe(true); + if (r2.resolved) expect(r2.qname.namespace).toBe(WML_NS); + + const r3 = resolveQNameAttr("zzz:Whatever", prefixes, WML_NS); + expect(r3.resolved).toBe(false); + if (!r3.resolved) expect(r3.qname.reason).toBe("unknown-prefix"); +}); + +test.skipIf(!realCacheReady)( + "smoke: parses real wml.xsd from cache, counts declarations", + async () => { + const set = await parseSchemaSet({ + schemaDir: REAL_CACHE_DIR, + entrypoints: ["wml.xsd"], + }); + + expect(set.documents.size).toBeGreaterThan(5); + const wml = set.documents.get("wml.xsd"); + expect(wml?.vocabularyId).toBe("wml-main"); + expect(wml?.targetNamespace).toBe(WML_NS); + + // wml.xsd imports 5 schemas with schemaLocation + 1 (xml) without. + const wmlImports = set.importGraph.get("wml.xsd"); + expect(wmlImports).toHaveLength(6); + + const counts = countByKind(set.declarationsByQName); + // Sanity floors against the WML+imports working set. 
Real counts (5th ed): + // complexType=820, simpleType=389, group=67, element=47, attribute=14, attributeGroup=8. + expect(counts.complexType).toBeGreaterThan(500); + expect(counts.simpleType).toBeGreaterThan(200); + expect(counts.group).toBeGreaterThan(40); + expect(counts.element).toBeGreaterThan(40); + }, +); diff --git a/tests/mcp-server/ooxml-queries.test.ts b/tests/mcp-server/ooxml-queries.test.ts new file mode 100644 index 0000000..7d5c5f2 --- /dev/null +++ b/tests/mcp-server/ooxml-queries.test.ts @@ -0,0 +1,361 @@ +/** + * Query layer tests. Ingests the same fixture XSDs the ingest tests use, + * then exercises each MCP-tool query function against the populated DB. + */ + +import { join } from "node:path"; +import { afterAll, beforeAll, expect, test } from "bun:test"; +import { createDbClient, type DbClient } from "../../packages/shared/src/db/index.ts"; +import { ingestSchemaSet } from "../../scripts/ingest-xsd/ingest.ts"; +import { + getAttributes, + getChildren, + getEnums, + getNamespaceInfo, + lookupElement, + lookupSymbolByTypeRef, + lookupType, + parseQName, +} from "../../apps/mcp-server/src/ooxml-queries.ts"; + +const FIXTURES_DIR = join(import.meta.dir, "..", "ingest-xsd", "fixtures"); +import { getTestDatabaseUrl } from "../test-db.ts"; + +const databaseUrl = getTestDatabaseUrl(); + +let db: DbClient; + +const TRUNCATE_SQL = ` + TRUNCATE + behavior_notes, + xsd_enums, + xsd_inheritance_edges, + xsd_group_edges, + xsd_attr_edges, + xsd_child_edges, + xsd_compositors, + xsd_symbol_profiles, + xsd_symbols, + xsd_namespaces, + xsd_profiles + RESTART IDENTITY CASCADE +`; + +beforeAll(async () => { + db = createDbClient(databaseUrl); + await db.sql` + INSERT INTO reference_sources (name, kind) + VALUES ('ecma-376-transitional', 'xsd') + ON CONFLICT (name) DO NOTHING + `; + await db.sql.unsafe(TRUNCATE_SQL); + await ingestSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + profileName: "transitional", + sourceName: 
"ecma-376-transitional", + db, + }); +}); + +afterAll(async () => { + await db.sql.unsafe(TRUNCATE_SQL); + await db.close(); +}); + +const WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; +const SHARED_NS = "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes"; + +test("parseQName: prefixed, Clark, bare", () => { + const a = parseQName("w:tbl"); + expect(a.ok).toBe(true); + if (a.ok) { + expect(a.qname.namespace).toBe(WML_NS); + expect(a.qname.localName).toBe("tbl"); + } + + const b = parseQName("{http://example.com}foo"); + expect(b.ok).toBe(true); + if (b.ok) { + expect(b.qname.namespace).toBe("http://example.com"); + expect(b.qname.localName).toBe("foo"); + } + + const c = parseQName("CT_Tbl"); + expect(c.ok).toBe(true); + if (c.ok) expect(c.qname.namespace).toBe(WML_NS); // bare default + + const d = parseQName("zzz:something"); + expect(d.ok).toBe(false); +}); + +test("lookupElement: top-level element with type_ref", async () => { + const hit = await lookupElement(db.sql, WML_NS, "document", "transitional"); + expect(hit?.localName).toBe("document"); + expect(hit?.kind).toBe("element"); + expect(hit?.typeRef).toBe(`{${WML_NS}}CT_Empty`); + expect(hit?.profileName).toBe("transitional"); + expect(hit?.namespaceUri).toBe(WML_NS); +}); + +test("lookupElement returns null for local-only names (no qname-addressable identity)", async () => { + // 'text' is declared inline in CT_Para and is not a top-level . + // Per XSD it has no global qname; reach it via getChildren(CT_Para) instead. + const hit = await lookupElement(db.sql, WML_NS, "text", "transitional"); + expect(hit).toBeNull(); +}); + +test("lookupElement returns null for ambiguous local names (the tblGrid case)", async () => { + // 'shared' is declared inline in CT_OuterA (type ST_Jc) and in CT_OuterB + // (type xsd:string). Returning either would be wrong; lookupElement scopes + // by parent_symbol_id IS NULL and refuses to pick one. 
+ const hit = await lookupElement(db.sql, WML_NS, "shared", "transitional"); + expect(hit).toBeNull(); +}); + +test("lookupType: complexType vs simpleType disambiguation", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_Para", "transitional"); + expect(ct?.kind).toBe("complexType"); + + const st = await lookupType(db.sql, WML_NS, "ST_Jc", "transitional"); + expect(st?.kind).toBe("simpleType"); + + const sharedSt = await lookupType(db.sql, SHARED_NS, "ST_OnOff", "transitional"); + expect(sharedSt?.vocabularyId).toBe("shared-types"); +}); + +test("lookupSymbolByTypeRef resolves Clark form", async () => { + const hit = await lookupSymbolByTypeRef(db.sql, `{${WML_NS}}CT_Empty`, "transitional"); + expect(hit?.localName).toBe("CT_Empty"); + expect(hit?.kind).toBe("complexType"); +}); + +test("getChildren: CT_Para has the local 'text' element via its sequence", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_Para", "transitional"); + if (!ct) throw new Error("CT_Para not found"); + const children = await getChildren(db.sql, ct.id, "transitional"); + expect(children).toHaveLength(1); + expect(children[0].localName).toBe("text"); + expect(children[0].compositorKind).toBe("sequence"); + expect(children[0].source).toBe("self"); +}); + +test("getChildren: CT_Body returns ordered mix of elements + group ref", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_Body", "transitional"); + if (!ct) throw new Error("CT_Body not found"); + const children = await getChildren(db.sql, ct.id, "transitional"); + // CT_Body content (top sequence): element ref="document", choice(group EG_PContent, element name="break") + // getChildren returns the top sequence's edges; the nested choice's content is reachable via compositorId + // pivot but not flattened automatically. 
+ const localNames = children.map((c) => c.localName).sort(); + expect(localNames).toContain("document"); + expect(localNames).toContain("EG_PContent"); + expect(localNames).toContain("break"); +}); + +test("getChildren: inheritance is unioned (CT_Extended inherits from CT_Empty)", async () => { + // CT_Extended extends CT_Empty (which has no content); CT_Extended itself has no + // content model either, so children should be empty. + const ct = await lookupType(db.sql, WML_NS, "CT_Extended", "transitional"); + if (!ct) throw new Error("CT_Extended not found"); + const children = await getChildren(db.sql, ct.id, "transitional"); + expect(children).toHaveLength(0); +}); + +test("getAttributes: CT_Para has 'bold' with type_ref to ST_OnOff", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_Para", "transitional"); + if (!ct) throw new Error("CT_Para not found"); + const attrs = await getAttributes(db.sql, ct.id, "transitional"); + const bold = attrs.find((a) => a.localName === "bold"); + expect(bold?.attrUse).toBe("optional"); + expect(bold?.typeRef).toBe(`{${SHARED_NS}}ST_OnOff`); +}); + +test("getAttributes: CT_TableUser unfolds AG_TableProps via attributeGroup ref", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_TableUser", "transitional"); + if (!ct) throw new Error("CT_TableUser not found"); + const attrs = await getAttributes(db.sql, ct.id, "transitional"); + const names = attrs.map((a) => a.localName).sort(); + // caption is direct, cols comes from AG_TableProps. 
+ expect(names).toContain("caption"); + expect(names).toContain("cols"); + + const cols = attrs.find((a) => a.localName === "cols"); + expect(cols?.source).toBe("attributeGroup"); + expect(cols?.owningName).toBe("AG_TableProps"); + + const caption = attrs.find((a) => a.localName === "caption"); + expect(caption?.attrUse).toBe("required"); +}); + +test("getAttributes: CT_Extended inherits 'extra' (declared on the extension)", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_Extended", "transitional"); + if (!ct) throw new Error("CT_Extended not found"); + const attrs = await getAttributes(db.sql, ct.id, "transitional"); + const extra = attrs.find((a) => a.localName === "extra"); + expect(extra?.attrUse).toBe("optional"); + expect(extra?.typeRef).toBe("{http://www.w3.org/2001/XMLSchema}string"); +}); + +test("getEnums: ST_Jc returns left/center/right in order", async () => { + const st = await lookupType(db.sql, WML_NS, "ST_Jc", "transitional"); + if (!st) throw new Error("ST_Jc not found"); + const enums = await getEnums(db.sql, st.id, "transitional"); + expect(enums.map((e) => e.value)).toEqual(["left", "center", "right"]); +}); + +test("getNamespaceInfo: reports profile membership and vocabularies", async () => { + const info = await getNamespaceInfo(db.sql, WML_NS); + expect(info?.uri).toBe(WML_NS); + expect(info?.vocabularies).toContain("wml-main"); + expect(info?.profiles.find((p) => p.name === "transitional")?.symbolCount).toBeGreaterThan(0); + + // Unknown URI → null + const none = await getNamespaceInfo(db.sql, "http://example.com/does-not-exist"); + expect(none).toBeNull(); +}); + +test("lookupElement: returns null for unknown qname", async () => { + const hit = await lookupElement(db.sql, WML_NS, "doesNotExist", "transitional"); + expect(hit).toBeNull(); +}); + +test("getChildren: extension prepends base content (CT_DerivedExtended -> alpha, beta, gamma)", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_DerivedExtended", 
"transitional"); + if (!ct) throw new Error("CT_DerivedExtended not found"); + const children = await getChildren(db.sql, ct.id, "transitional"); + const names = children.map((c) => c.localName); + // XSD extension semantics: base content first, then derived. + expect(names).toEqual(["alpha", "beta", "gamma"]); + // Provenance distinguishes base-derived from self-derived. + expect(children[0].source).toBe("inherited"); + expect(children[0].owningTypeName).toBe("CT_BaseWithChildren"); + expect(children[2].source).toBe("self"); + expect(children[2].owningTypeName).toBe("CT_DerivedExtended"); +}); + +test("getChildren: nested compositor flatten preserves document order (CT_NestedOrder)", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_NestedOrder", "transitional"); + if (!ct) throw new Error("CT_NestedOrder not found"); + const children = await getChildren(db.sql, ct.id, "transitional"); + // Top sequence: head, choice(branchA, branchB), tail. + // Document order should be head, branchA, branchB, tail (NOT branchA first because + // its order_index=0 inside the choice). + const names = children.map((c) => c.localName); + expect(names).toEqual(["head", "branchA", "branchB", "tail"]); + + // Compositor path makes the nesting visible. + const head = children.find((c) => c.localName === "head"); + expect(head?.compositorPath).toEqual(["sequence(1..1)"]); + + const branchA = children.find((c) => c.localName === "branchA"); + expect(branchA?.compositorPath).toEqual(["sequence(1..1)", "choice(0..unbounded)"]); +}); + +test("local element symbols are scoped per-owner (no cross-CT collapse)", async () => { + // Mirrors the WML tblGrid case (CT_TblGridBase inside CT_TblGridChange vs + // CT_TblGrid inside CT_Tbl). CT_OuterA / CT_OuterB both declare an inline + // 'shared' element but with different @type. They must produce distinct + // per-parent symbols, each carrying its own type_ref. 
+ const ctA = await lookupType(db.sql, WML_NS, "CT_OuterA", "transitional"); + const ctB = await lookupType(db.sql, WML_NS, "CT_OuterB", "transitional"); + if (!ctA || !ctB) throw new Error("CT_OuterA / CT_OuterB not found"); + + const aChildren = await getChildren(db.sql, ctA.id, "transitional"); + const bChildren = await getChildren(db.sql, ctB.id, "transitional"); + expect(aChildren).toHaveLength(1); + expect(bChildren).toHaveLength(1); + expect(aChildren[0].localName).toBe("shared"); + expect(bChildren[0].localName).toBe("shared"); + + // The two `shared` symbols carry different type_refs. + const sharedSymbols = await db.sql` + SELECT s.id, s.type_ref, s.parent_symbol_id, parent.local_name AS parent_name + FROM xsd_symbols s + JOIN xsd_symbols parent ON parent.id = s.parent_symbol_id + WHERE s.local_name = 'shared' AND s.kind = 'element' + ORDER BY parent.local_name + `; + expect(sharedSymbols).toHaveLength(2); + expect(sharedSymbols[0].parent_name).toBe("CT_OuterA"); + expect(sharedSymbols[0].type_ref).toBe(`{${WML_NS}}ST_Jc`); + expect(sharedSymbols[1].parent_name).toBe("CT_OuterB"); + expect(sharedSymbols[1].type_ref).toBe("{http://www.w3.org/2001/XMLSchema}string"); +}); + +test("xsd-builtin symbols have profile membership (lookupSymbolByTypeRef can follow xsd:string)", async () => { + // Built-ins like xsd:string are auto-created during inheritance resolution and + // must be linked to xsd_symbol_profiles, otherwise ooxml_lookup_type for + // 'xsd:string' and lookupSymbolByTypeRef for {...XMLSchema}string return null. + const t = await lookupSymbolByTypeRef( + db.sql, + "{http://www.w3.org/2001/XMLSchema}string", + "transitional", + ); + expect(t).not.toBeNull(); + expect(t?.localName).toBe("string"); + expect(t?.vocabularyId).toBe("xsd-builtin"); +}); + +test("getAttributes: complexContent/restriction inherits base attributes", async () => { + // CT_TrackedRestricted restricts CT_TrackedBase but redeclares nothing. 
+ // Per XSD §3.4.2.2 the base's attribute uses are inherited; restriction can + // narrow or prohibit but cannot drop silently. + const ct = await lookupType(db.sql, WML_NS, "CT_TrackedRestricted", "transitional"); + if (!ct) throw new Error("CT_TrackedRestricted not found"); + const attrs = await getAttributes(db.sql, ct.id, "transitional"); + const names = attrs.map((a) => a.localName).sort(); + expect(names).toEqual(["author", "id"]); + + const idAttr = attrs.find((a) => a.localName === "id"); + expect(idAttr?.attrUse).toBe("required"); + expect(idAttr?.source).toBe("inherited"); + expect(idAttr?.owningName).toBe("CT_TrackedBase"); +}); + +test("getAttributes: derived redeclaration wins over inherited base attribute", async () => { + // CT_OverrideDerived restricts CT_TrackedBase and overrides 'id' from + // required to optional. The derived's redeclaration must win; the base's + // 'author' should still be inherited unchanged. + const ct = await lookupType(db.sql, WML_NS, "CT_OverrideDerived", "transitional"); + if (!ct) throw new Error("CT_OverrideDerived not found"); + const attrs = await getAttributes(db.sql, ct.id, "transitional"); + + const idAttr = attrs.find((a) => a.localName === "id"); + expect(idAttr?.attrUse).toBe("optional"); + expect(idAttr?.source).toBe("self"); + expect(idAttr?.owningName).toBe("CT_OverrideDerived"); + + const authorAttr = attrs.find((a) => a.localName === "author"); + expect(authorAttr?.attrUse).toBe("optional"); + expect(authorAttr?.source).toBe("inherited"); +}); + +test("getAttributes: nested attributeGroup chain unfolds (CT_NestedAttrUser -> innerAttr + outerAttr)", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_NestedAttrUser", "transitional"); + if (!ct) throw new Error("CT_NestedAttrUser not found"); + const attrs = await getAttributes(db.sql, ct.id, "transitional"); + const names = attrs.map((a) => a.localName).sort(); + // CT_NestedAttrUser refs AG_Outer; AG_Outer refs AG_Inner. 
+ // Both attributes must surface. + expect(names).toEqual(["innerAttr", "outerAttr"]); + + const inner = attrs.find((a) => a.localName === "innerAttr"); + expect(inner?.source).toBe("attributeGroup"); + expect(inner?.owningName).toBe("AG_Inner"); + + const outer = attrs.find((a) => a.localName === "outerAttr"); + expect(outer?.source).toBe("attributeGroup"); + expect(outer?.owningName).toBe("AG_Outer"); +}); + +test("element-to-type chain: lookup w-style element, follow type_ref, fetch children", async () => { + // document → CT_Empty (no content) ⇒ children empty. + const elem = await lookupElement(db.sql, WML_NS, "document", "transitional"); + expect(elem).not.toBeNull(); + if (!elem?.typeRef) throw new Error("expected type_ref"); + const type = await lookupSymbolByTypeRef(db.sql, elem.typeRef, "transitional"); + expect(type?.localName).toBe("CT_Empty"); + const children = await getChildren(db.sql, type!.id, "transitional"); + expect(children).toHaveLength(0); +}); diff --git a/tests/test-db.ts b/tests/test-db.ts new file mode 100644 index 0000000..b313640 --- /dev/null +++ b/tests/test-db.ts @@ -0,0 +1,38 @@ +/** + * Shared database guard for integration tests. + * + * The test suites TRUNCATE xsd_* tables aggressively (TRUNCATE ... CASCADE) and + * delete from spec_content's foreign-key sphere. They MUST NOT run against any + * non-local Postgres - in particular, never against a Neon production URL. + * + * Rules: + * 1. TEST_DATABASE_URL must be set explicitly. There is no fallback to + * DATABASE_URL: a developer who accidentally has DATABASE_URL pointed at + * Neon would otherwise wipe their schema graph data on `bun test`. + * 2. The hostname in TEST_DATABASE_URL must be local + * (localhost / 127.0.0.1 / host.docker.internal). + * + * If either rule fails, throw and refuse to run. 
/** Host names the integration tests are allowed to connect to (and TRUNCATE). */
const LOCAL_HOSTS = new Set(["localhost", "127.0.0.1", "host.docker.internal"]);

/**
 * Extract the lower-cased host name(s) from a Postgres connection string.
 *
 * Only the authority component (the text between `://` and the first `/`,
 * `?` or `#`) is inspected, and userinfo is stripped at the LAST `@` inside
 * it. Searching the whole string for the first `@` would let an `@localhost`
 * that merely appears in a password, path or query parameter masquerade as
 * the host.
 *
 * A comma-separated host list yields one entry per host. Ports are dropped.
 * Bracketed IPv6 literals are returned with their brackets. A string without
 * a host yields `[""]`.
 *
 * Parsing is manual on purpose: avoid `new URL()` on `postgresql://` because
 * some Node URL parsers reject the scheme.
 */
function extractHosts(url: string): string[] {
  const schemeEnd = url.indexOf("://");
  const afterScheme = schemeEnd === -1 ? url : url.slice(schemeEnd + 3);
  const authority = afterScheme.split(/[\/?#]/, 1)[0] ?? "";
  // lastIndexOf returns -1 when there is no userinfo, so slice(0) keeps it all.
  const hostList = authority.slice(authority.lastIndexOf("@") + 1);

  return hostList.split(",").map((entry) => {
    const bare = entry.startsWith("[")
      ? entry.slice(0, entry.indexOf("]") + 1) // "[::1]:5432" -> "[::1]"
      : (entry.split(":", 1)[0] ?? ""); // "localhost:5432" -> "localhost"
    return bare.toLowerCase();
  });
}

/**
 * Return the connection string the integration tests must use.
 *
 * Reads `TEST_DATABASE_URL` only; there is deliberately no fallback to
 * `DATABASE_URL`. Every host named in the URL's authority must be one of
 * `LOCAL_HOSTS`, otherwise the function refuses to hand the URL out.
 *
 * @returns The unmodified value of `TEST_DATABASE_URL`.
 * @throws Error if the variable is unset or empty, or if any host in the URL
 *   is not in `LOCAL_HOSTS`.
 */
export function getTestDatabaseUrl(): string {
  const url = process.env.TEST_DATABASE_URL;
  if (!url) {
    throw new Error(
      "TEST_DATABASE_URL is not set. Integration tests TRUNCATE xsd_* tables and refuse to run without an explicit test database URL. Example: TEST_DATABASE_URL=postgresql://postgres:postgres@localhost:5432/ecma_spec",
    );
  }

  // Fail closed: one non-local host anywhere in the list is enough to refuse.
  const offending = extractHosts(url).find((host) => !LOCAL_HOSTS.has(host));
  if (offending !== undefined) {
    throw new Error(
      `TEST_DATABASE_URL hostname '${offending}' is not a local host. Refusing to TRUNCATE against a non-local database. Allowed hosts: ${[...LOCAL_HOSTS].join(", ")}.`,
    );
  }
  return url;
}