From bcc4d691ccc34bf6b44beb570832e36022e62c1b Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 07:47:28 -0300 Subject: [PATCH 01/24] feat(db): add provenance foundation Adds reference_sources and source_id FK on spec_content so every chunk can be traced to a known source. data/sources.json is the human-edited manifest; sync-sources upserts and backfills. name is the stable identity; edition/version update in place when verified, so re-tagging an existing source does not orphan its references. Establishes db/migrations/ convention with a small runner. --- .gitignore | 5 +- data/sources.json | 14 ++++ db/migrations/0001_reference_sources.sql | 23 ++++++ db/migrations/README.md | 28 ++++++++ db/schema.sql | 32 +++++++-- package.json | 2 + scripts/db-migrate.ts | 49 +++++++++++++ scripts/sync-sources.ts | 90 ++++++++++++++++++++++++ 8 files changed, 236 insertions(+), 7 deletions(-) create mode 100644 data/sources.json create mode 100644 db/migrations/0001_reference_sources.sql create mode 100644 db/migrations/README.md create mode 100644 scripts/db-migrate.ts create mode 100644 scripts/sync-sources.ts diff --git a/.gitignore b/.gitignore index 6fdf759..6e8b029 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,7 @@ dev/ .wrangler/ .env .mcp.json -.vscode/ \ No newline at end of file +.vscode/ + +# Local-only planning doc (public repo) +PLAN.md \ No newline at end of file diff --git a/data/sources.json b/data/sources.json new file mode 100644 index 0000000..c67f439 --- /dev/null +++ b/data/sources.json @@ -0,0 +1,14 @@ +{ + "$comment": "Source manifest. 
Human-edited; scripts/sync-sources.ts upserts these rows into reference_sources.", + "sources": [ + { + "name": "ecma-376", + "kind": "spec_pdf", + "edition": "unknown", + "version": null, + "url": "https://ecma-international.org/publications-and-standards/standards/ecma-376/", + "license_note": "ECMA-376 is freely available under ECMA terms; redistribution permitted with attribution.", + "sha256": null + } + ] +} diff --git a/db/migrations/0001_reference_sources.sql b/db/migrations/0001_reference_sources.sql new file mode 100644 index 0000000..0c8263d --- /dev/null +++ b/db/migrations/0001_reference_sources.sql @@ -0,0 +1,23 @@ +-- Phase 1: Provenance foundation +-- Adds reference_sources and source_id FK on spec_content. +-- Idempotent: safe to run against fresh installs (matches db/schema.sql) or existing DBs. + +CREATE EXTENSION IF NOT EXISTS vector; + +CREATE TABLE IF NOT EXISTS reference_sources ( + id SERIAL PRIMARY KEY, + name TEXT NOT NULL UNIQUE, + kind TEXT NOT NULL, + edition TEXT, + version TEXT, + url TEXT, + license_note TEXT, + sha256 TEXT, + fetched_at TIMESTAMPTZ DEFAULT NOW(), + created_at TIMESTAMPTZ DEFAULT NOW() +); + +ALTER TABLE spec_content + ADD COLUMN IF NOT EXISTS source_id INT REFERENCES reference_sources(id); + +CREATE INDEX IF NOT EXISTS idx_content_source ON spec_content(source_id); diff --git a/db/migrations/README.md b/db/migrations/README.md new file mode 100644 index 0000000..e9a8062 --- /dev/null +++ b/db/migrations/README.md @@ -0,0 +1,28 @@ +# Migrations + +Each phase that changes the schema adds one numbered SQL file here. Files are applied in lexical order (`0001_*.sql`, `0002_*.sql`, ...). + +## Conventions + +- **Idempotent**: every statement uses `IF NOT EXISTS`, `ADD COLUMN IF NOT EXISTS`, or equivalent. Re-running a migration is a no-op. +- **Forward-only**: no `down` scripts. Reverting means writing a new migration. 
+- **Source of truth split**: + - `db/schema.sql` reflects the full schema after all migrations are applied. Used by `docker-compose` to initialize fresh dev databases via `db:reset`. + - Migration files are for incrementally upgrading existing databases (production / long-lived dev). + +## Applying migrations + +Apply a single file manually against an existing database: + +```bash +psql "$DATABASE_URL" -f db/migrations/0001_reference_sources.sql +``` + +To apply every file in order, run `bun run db:migrate` (`scripts/db-migrate.ts`). + +## Adding a new migration + +1. Pick the next number (`0002`, `0003`, ...). +2. Write idempotent SQL. +3. Update `db/schema.sql` to match the new full state. +4. If the migration introduces curated data (e.g., source rows), let a script populate it (e.g., `scripts/sync-sources.ts`), not the SQL file. diff --git a/db/schema.sql b/db/schema.sql index 34e66e5..86f36ee 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -1,9 +1,30 @@ --- ECMA-376 Spec Vector Database Schema --- Simple single-table design - evolve as needed +-- ooxml.dev database schema +-- Single source of truth for fresh installs (loaded by docker-compose at init). +-- For incremental updates against an existing DB, apply files in db/migrations/ in order. CREATE EXTENSION IF NOT EXISTS vector; --- Single table for all spec content +-- Reference sources: provenance for every chunk and (later) every schema symbol. +-- Source artifacts (PDFs, XSDs) are NOT committed. Manifest at data/sources.json +-- is the human-edited source of truth; scripts/sync-sources.ts upserts rows from it. +-- name is the stable identity. edition/version are updatable attributes: +-- when 'unknown' is later verified to '5th', we update in place rather than +-- inserting a duplicate row that would orphan existing source_id references. +-- To track multiple editions side-by-side, use distinct names ('ecma-376-4th'). +CREATE TABLE reference_sources ( + id SERIAL PRIMARY KEY, + name TEXT NOT NULL UNIQUE, -- stable id, e.g. 
'ecma-376' + kind TEXT NOT NULL, -- 'spec_pdf', 'xsd', 'reference_doc' + edition TEXT, -- '4th', '5th', or 'unknown' until verified + version TEXT, -- semver / date / null + url TEXT, -- canonical fetch URL + license_note TEXT, -- redistribution constraint + sha256 TEXT, -- artifact hash if fetched + fetched_at TIMESTAMPTZ DEFAULT NOW(), + created_at TIMESTAMPTZ DEFAULT NOW() +); + +-- Specification content: prose chunks for semantic search CREATE TABLE spec_content ( id SERIAL PRIMARY KEY, part_number INT NOT NULL, @@ -13,12 +34,11 @@ CREATE TABLE spec_content ( content_type TEXT DEFAULT 'text', page_number INT, embedding vector(1024), + source_id INT REFERENCES reference_sources(id), created_at TIMESTAMPTZ DEFAULT NOW() ); --- Vector similarity search CREATE INDEX idx_content_embedding ON spec_content USING hnsw (embedding vector_cosine_ops); - --- Filtering indexes CREATE INDEX idx_content_part ON spec_content(part_number); CREATE INDEX idx_content_section ON spec_content(section_id); +CREATE INDEX idx_content_source ON spec_content(source_id); diff --git a/package.json b/package.json index bc3deeb..4126c41 100644 --- a/package.json +++ b/package.json @@ -19,6 +19,8 @@ "db:down": "docker compose down", "db:reset": "docker compose down -v && docker compose up -d", "db:shell": "docker compose exec db psql -U postgres -d ecma_spec", + "db:migrate": "bun scripts/db-migrate.ts", + "db:sync-sources": "bun scripts/sync-sources.ts", "ingest": "bun scripts/ingest/pipeline.ts", "ingest:chunk": "bun scripts/ingest/chunk.ts", "ingest:embed": "bun scripts/ingest/embed.ts", diff --git a/scripts/db-migrate.ts b/scripts/db-migrate.ts new file mode 100644 index 0000000..61ffea6 --- /dev/null +++ b/scripts/db-migrate.ts @@ -0,0 +1,49 @@ +/** + * Apply migrations in order from db/migrations/*.sql against $DATABASE_URL. + * All migrations are idempotent; re-running is safe. 
+ * + * Usage: + * bun scripts/db-migrate.ts + * + * Environment: + * DATABASE_URL - PostgreSQL connection string + */ + +import { readdirSync } from "node:fs"; +import { join } from "node:path"; +import { createDbClient } from "../packages/shared/src/db/index.ts"; + +async function main() { + const databaseUrl = process.env.DATABASE_URL; + if (!databaseUrl) { + console.error("Missing DATABASE_URL environment variable"); + process.exit(1); + } + + const dir = "./db/migrations"; + const files = readdirSync(dir) + .filter((f) => f.endsWith(".sql")) + .sort(); + + if (files.length === 0) { + console.log("No migrations found."); + return; + } + + const db = createDbClient(databaseUrl); + try { + for (const f of files) { + const content = await Bun.file(join(dir, f)).text(); + console.log(`Applying ${f}...`); + await db.sql.unsafe(content); + } + console.log(`Applied ${files.length} migration(s).`); + } finally { + await db.close(); + } +} + +main().catch((err) => { + console.error("Migration failed:", err); + process.exit(1); +}); diff --git a/scripts/sync-sources.ts b/scripts/sync-sources.ts new file mode 100644 index 0000000..74c4418 --- /dev/null +++ b/scripts/sync-sources.ts @@ -0,0 +1,90 @@ +/** + * Sync reference_sources from data/sources.json. + * + * - Upserts each source row (matched by name; edition and version are updated in place). + * - Backfills NULL source_id on spec_content to point at the ecma-376 source. + * The backfill is a one-time concern; once all rows have source_id it is a no-op. 
+ * + * Usage: + * bun scripts/sync-sources.ts + * + * Environment: + * DATABASE_URL - PostgreSQL connection string + */ + +import { createDbClient } from "../packages/shared/src/db/index.ts"; + +interface SourceEntry { + name: string; + kind: string; + edition: string | null; + version: string | null; + url: string | null; + license_note: string | null; + sha256: string | null; +} + +interface Manifest { + sources: SourceEntry[]; +} + +async function main() { + const databaseUrl = process.env.DATABASE_URL; + if (!databaseUrl) { + console.error("Missing DATABASE_URL environment variable"); + process.exit(1); + } + + const manifestPath = "./data/sources.json"; + const raw = await Bun.file(manifestPath).text(); + const manifest = JSON.parse(raw) as Manifest; + + if (!Array.isArray(manifest.sources) || manifest.sources.length === 0) { + console.error(`Invalid manifest at ${manifestPath}: 'sources' must be a non-empty array`); + process.exit(1); + } + + console.log(`Syncing ${manifest.sources.length} source(s) from ${manifestPath}`); + + const db = createDbClient(databaseUrl); + const sql = db.sql; + + try { + for (const s of manifest.sources) { + const [row] = await sql<[{ id: number; existed: boolean }]>` + INSERT INTO reference_sources (name, kind, edition, version, url, license_note, sha256) + VALUES (${s.name}, ${s.kind}, ${s.edition}, ${s.version}, ${s.url}, ${s.license_note}, ${s.sha256}) + ON CONFLICT (name) DO UPDATE + SET kind = EXCLUDED.kind, + edition = EXCLUDED.edition, + version = EXCLUDED.version, + url = EXCLUDED.url, + license_note = EXCLUDED.license_note, + sha256 = COALESCE(EXCLUDED.sha256, reference_sources.sha256) + RETURNING id, (xmax <> 0) AS existed + `; + console.log( + ` ${row.existed ? "updated " : "inserted"} ${s.name} (id=${row.id}, edition=${s.edition ?? 
"null"})`, + ); + } + + const [ecma] = await sql<[{ id: number } | undefined]>` + SELECT id FROM reference_sources WHERE name = 'ecma-376' LIMIT 1 + `; + if (ecma) { + const result = await sql` + UPDATE spec_content SET source_id = ${ecma.id} WHERE source_id IS NULL + `; + console.log(`Backfilled ${result.count} spec_content row(s) -> source_id=${ecma.id}`); + } else { + console.warn("No ecma-376 source row found; skipped spec_content backfill."); + } + } finally { + await db.close(); + } +} + +main().catch((err) => { + console.error("Sync failed:", err); + process.exit(1); +}); From a0516ad2f458e6e17cf62093e257d26ac6502493 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 08:08:42 -0300 Subject: [PATCH 02/24] feat(db): add empty XSD schema tables (Phase 2) Profile-scoped symbol graph for OOXML schemas: xsd_profiles, xsd_namespaces, xsd_symbols, xsd_symbol_profiles, xsd_compositors (with parent_compositor_id for nested sequences/choices), xsd_child_edges (parent_symbol_id denormalized for fast 'children of X' queries), xsd_attr_edges, xsd_group_edges, xsd_inheritance_edges, xsd_enums, behavior_notes (claim_type enum locked now; Phase 5 populates). All tables empty after this migration. Integration tests verify constraint enforcement, CASCADE delete, and a realistic 'children of w:tbl in transitional' query path. --- db/migrations/0002_xsd_schema.sql | 149 +++++++++++++++++++ db/schema.sql | 136 ++++++++++++++++++ package.json | 1 + tests/db/xsd-schema.test.ts | 228 ++++++++++++++++++++++++++++++ 4 files changed, 514 insertions(+) create mode 100644 db/migrations/0002_xsd_schema.sql create mode 100644 tests/db/xsd-schema.test.ts diff --git a/db/migrations/0002_xsd_schema.sql b/db/migrations/0002_xsd_schema.sql new file mode 100644 index 0000000..45d63e6 --- /dev/null +++ b/db/migrations/0002_xsd_schema.sql @@ -0,0 +1,149 @@ +-- Phase 2: XSD schema tables (empty) +-- Profile-scoped symbol graph. 
All tables empty after this migration; data lands in Phase 3+. +-- Idempotent: safe to run against fresh installs (matches db/schema.sql) or existing DBs. + +CREATE TABLE IF NOT EXISTS xsd_profiles ( + id SERIAL PRIMARY KEY, + name TEXT NOT NULL UNIQUE, -- 'transitional', 'strict', 'office-extension', 'word-compatible-docx' + description TEXT, + created_at TIMESTAMPTZ DEFAULT NOW() +); + +CREATE TABLE IF NOT EXISTS xsd_namespaces ( + id SERIAL PRIMARY KEY, + uri TEXT NOT NULL UNIQUE, + created_at TIMESTAMPTZ DEFAULT NOW() +); + +-- Canonical symbol identity: (vocabulary_id, local_name, kind). +-- vocabulary_id is a normalized id like 'wml-main', 'dml-main', 'shared-types'. +-- Namespace URIs are profile aliases, not part of identity (see xsd_symbol_profiles). +CREATE TABLE IF NOT EXISTS xsd_symbols ( + id SERIAL PRIMARY KEY, + vocabulary_id TEXT NOT NULL, + local_name TEXT NOT NULL, + kind TEXT NOT NULL, -- element, complexType, simpleType, attribute, attributeGroup, group + payload JSONB DEFAULT '{}'::jsonb, -- long-tail XSD details (annotations, app-info, rare attrs) + created_at TIMESTAMPTZ DEFAULT NOW(), + UNIQUE (vocabulary_id, local_name, kind) +); + +-- Profile membership + per-profile namespace alias for a symbol. +CREATE TABLE IF NOT EXISTS xsd_symbol_profiles ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + namespace_id INT NOT NULL REFERENCES xsd_namespaces(id), + source_id INT REFERENCES reference_sources(id), + created_at TIMESTAMPTZ DEFAULT NOW(), + UNIQUE (symbol_id, profile_id) +); + +-- Content-model compositors (xs:sequence | xs:choice | xs:all). Profile-scoped. +-- Either parent_symbol_id or parent_compositor_id is set (top-level vs nested). 
+CREATE TABLE IF NOT EXISTS xsd_compositors ( + id SERIAL PRIMARY KEY, + parent_symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE, + parent_compositor_id INT REFERENCES xsd_compositors(id) ON DELETE CASCADE, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + kind TEXT NOT NULL CHECK (kind IN ('sequence', 'choice', 'all')), + min_occurs INT DEFAULT 1, + max_occurs INT, -- NULL = unbounded + order_index INT DEFAULT 0, + CHECK (parent_symbol_id IS NOT NULL OR parent_compositor_id IS NOT NULL) +); + +-- Child element edges. parent_symbol_id is denormalized for fast "children of X" queries +-- without walking through compositor rows first. +CREATE TABLE IF NOT EXISTS xsd_child_edges ( + id SERIAL PRIMARY KEY, + parent_symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + compositor_id INT NOT NULL REFERENCES xsd_compositors(id) ON DELETE CASCADE, + child_symbol_id INT NOT NULL REFERENCES xsd_symbols(id), + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + min_occurs INT DEFAULT 1, + max_occurs INT, -- NULL = unbounded + order_index INT DEFAULT 0 +); + +-- Attribute edges. attr_symbol_id is set when the attribute is a top-level symbol +-- (declared globally and referenced by ref); NULL for inline attributes. +CREATE TABLE IF NOT EXISTS xsd_attr_edges ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + attr_symbol_id INT REFERENCES xsd_symbols(id), + local_name TEXT NOT NULL, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + attr_use TEXT NOT NULL CHECK (attr_use IN ('required', 'optional', 'prohibited')) DEFAULT 'optional', + default_value TEXT, + fixed_value TEXT, + type_ref TEXT, + order_index INT DEFAULT 0 +); + +-- Group / attributeGroup references. resolved=true means the group's contents +-- have been expanded into xsd_child_edges or xsd_attr_edges on the parent. 
+CREATE TABLE IF NOT EXISTS xsd_group_edges ( + id SERIAL PRIMARY KEY, + parent_symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + group_symbol_id INT NOT NULL REFERENCES xsd_symbols(id), + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + ref_kind TEXT NOT NULL CHECK (ref_kind IN ('group', 'attributeGroup')), + resolved BOOLEAN DEFAULT FALSE, + order_index INT DEFAULT 0 +); + +-- Inheritance: extension or restriction of a base type. A derived type has +-- exactly one base per profile. +CREATE TABLE IF NOT EXISTS xsd_inheritance_edges ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + base_symbol_id INT NOT NULL REFERENCES xsd_symbols(id), + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + relation TEXT NOT NULL CHECK (relation IN ('extension', 'restriction')), + UNIQUE (symbol_id, profile_id) +); + +-- Enum values from xs:simpleType / xs:restriction. +CREATE TABLE IF NOT EXISTS xsd_enums ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + value TEXT NOT NULL, + annotation TEXT, + order_index INT DEFAULT 0 +); + +-- Curated Word/Office behavior claims keyed to symbols. +-- claim_type enum is locked now (Phase 5 will populate). +CREATE TABLE IF NOT EXISTS behavior_notes ( + id SERIAL PRIMARY KEY, + symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE, + app TEXT NOT NULL, -- 'Word', 'Office', 'LibreOffice' + version_scope TEXT, -- e.g. 
'Word 2007+', 'Word 365' + claim_type TEXT NOT NULL CHECK (claim_type IN ( + 'ignores', + 'requires_despite_optional', + 'writes', + 'reads_but_does_not_write', + 'repairs', + 'layout_behavior' + )), + summary TEXT NOT NULL, + source_id INT REFERENCES reference_sources(id), + section_id TEXT, + confidence TEXT CHECK (confidence IN ('high', 'medium', 'low')), + created_at TIMESTAMPTZ DEFAULT NOW() +); + +-- Indexes (UNIQUE constraints already create implicit indexes for canonical lookups) +CREATE INDEX IF NOT EXISTS idx_xsd_symbols_lookup ON xsd_symbols(vocabulary_id, local_name, kind); +CREATE INDEX IF NOT EXISTS idx_xsd_child_edges_parent ON xsd_child_edges(parent_symbol_id); +CREATE INDEX IF NOT EXISTS idx_xsd_child_edges_compositor ON xsd_child_edges(compositor_id); +CREATE INDEX IF NOT EXISTS idx_xsd_attr_edges_symbol ON xsd_attr_edges(symbol_id); +CREATE INDEX IF NOT EXISTS idx_xsd_compositors_parent_symbol ON xsd_compositors(parent_symbol_id); +CREATE INDEX IF NOT EXISTS idx_xsd_compositors_parent_compositor ON xsd_compositors(parent_compositor_id); +CREATE INDEX IF NOT EXISTS idx_xsd_group_edges_parent ON xsd_group_edges(parent_symbol_id); +CREATE INDEX IF NOT EXISTS idx_xsd_inheritance_edges_symbol ON xsd_inheritance_edges(symbol_id); +CREATE INDEX IF NOT EXISTS idx_xsd_enums_symbol ON xsd_enums(symbol_id); +CREATE INDEX IF NOT EXISTS idx_behavior_notes_symbol ON behavior_notes(symbol_id); diff --git a/db/schema.sql b/db/schema.sql index 86f36ee..467909d 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -42,3 +42,139 @@ CREATE INDEX idx_content_embedding ON spec_content USING hnsw (embedding vector_ CREATE INDEX idx_content_part ON spec_content(part_number); CREATE INDEX idx_content_section ON spec_content(section_id); CREATE INDEX idx_content_source ON spec_content(source_id); + +-- ---------------------------------------------------------------------------- +-- XSD schema graph (Phase 2) +-- +-- Profile-scoped symbol graph for OOXML schemas. 
Canonical symbol identity is +-- (vocabulary_id, local_name, kind); namespace URIs are profile aliases. +-- Profile membership lives on edges/profile join tables, not duplicated symbols. +-- ---------------------------------------------------------------------------- + +CREATE TABLE xsd_profiles ( + id SERIAL PRIMARY KEY, + name TEXT NOT NULL UNIQUE, + description TEXT, + created_at TIMESTAMPTZ DEFAULT NOW() +); + +CREATE TABLE xsd_namespaces ( + id SERIAL PRIMARY KEY, + uri TEXT NOT NULL UNIQUE, + created_at TIMESTAMPTZ DEFAULT NOW() +); + +CREATE TABLE xsd_symbols ( + id SERIAL PRIMARY KEY, + vocabulary_id TEXT NOT NULL, + local_name TEXT NOT NULL, + kind TEXT NOT NULL, + payload JSONB DEFAULT '{}'::jsonb, + created_at TIMESTAMPTZ DEFAULT NOW(), + UNIQUE (vocabulary_id, local_name, kind) +); + +CREATE TABLE xsd_symbol_profiles ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + namespace_id INT NOT NULL REFERENCES xsd_namespaces(id), + source_id INT REFERENCES reference_sources(id), + created_at TIMESTAMPTZ DEFAULT NOW(), + UNIQUE (symbol_id, profile_id) +); + +CREATE TABLE xsd_compositors ( + id SERIAL PRIMARY KEY, + parent_symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE, + parent_compositor_id INT REFERENCES xsd_compositors(id) ON DELETE CASCADE, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + kind TEXT NOT NULL CHECK (kind IN ('sequence', 'choice', 'all')), + min_occurs INT DEFAULT 1, + max_occurs INT, + order_index INT DEFAULT 0, + CHECK (parent_symbol_id IS NOT NULL OR parent_compositor_id IS NOT NULL) +); + +CREATE TABLE xsd_child_edges ( + id SERIAL PRIMARY KEY, + parent_symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + compositor_id INT NOT NULL REFERENCES xsd_compositors(id) ON DELETE CASCADE, + child_symbol_id INT NOT NULL REFERENCES xsd_symbols(id), + profile_id INT NOT NULL 
REFERENCES xsd_profiles(id) ON DELETE CASCADE, + min_occurs INT DEFAULT 1, + max_occurs INT, + order_index INT DEFAULT 0 +); + +CREATE TABLE xsd_attr_edges ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + attr_symbol_id INT REFERENCES xsd_symbols(id), + local_name TEXT NOT NULL, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + attr_use TEXT NOT NULL CHECK (attr_use IN ('required', 'optional', 'prohibited')) DEFAULT 'optional', + default_value TEXT, + fixed_value TEXT, + type_ref TEXT, + order_index INT DEFAULT 0 +); + +CREATE TABLE xsd_group_edges ( + id SERIAL PRIMARY KEY, + parent_symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + group_symbol_id INT NOT NULL REFERENCES xsd_symbols(id), + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + ref_kind TEXT NOT NULL CHECK (ref_kind IN ('group', 'attributeGroup')), + resolved BOOLEAN DEFAULT FALSE, + order_index INT DEFAULT 0 +); + +CREATE TABLE xsd_inheritance_edges ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + base_symbol_id INT NOT NULL REFERENCES xsd_symbols(id), + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + relation TEXT NOT NULL CHECK (relation IN ('extension', 'restriction')), + UNIQUE (symbol_id, profile_id) +); + +CREATE TABLE xsd_enums ( + id SERIAL PRIMARY KEY, + symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, + value TEXT NOT NULL, + annotation TEXT, + order_index INT DEFAULT 0 +); + +CREATE TABLE behavior_notes ( + id SERIAL PRIMARY KEY, + symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE, + app TEXT NOT NULL, + version_scope TEXT, + claim_type TEXT NOT NULL CHECK (claim_type IN ( + 'ignores', + 'requires_despite_optional', + 'writes', + 'reads_but_does_not_write', + 'repairs', + 'layout_behavior' + )), + 
summary TEXT NOT NULL, + source_id INT REFERENCES reference_sources(id), + section_id TEXT, + confidence TEXT CHECK (confidence IN ('high', 'medium', 'low')), + created_at TIMESTAMPTZ DEFAULT NOW() +); + +CREATE INDEX idx_xsd_symbols_lookup ON xsd_symbols(vocabulary_id, local_name, kind); +CREATE INDEX idx_xsd_child_edges_parent ON xsd_child_edges(parent_symbol_id); +CREATE INDEX idx_xsd_child_edges_compositor ON xsd_child_edges(compositor_id); +CREATE INDEX idx_xsd_attr_edges_symbol ON xsd_attr_edges(symbol_id); +CREATE INDEX idx_xsd_compositors_parent_symbol ON xsd_compositors(parent_symbol_id); +CREATE INDEX idx_xsd_compositors_parent_compositor ON xsd_compositors(parent_compositor_id); +CREATE INDEX idx_xsd_group_edges_parent ON xsd_group_edges(parent_symbol_id); +CREATE INDEX idx_xsd_inheritance_edges_symbol ON xsd_inheritance_edges(symbol_id); +CREATE INDEX idx_xsd_enums_symbol ON xsd_enums(symbol_id); +CREATE INDEX idx_behavior_notes_symbol ON behavior_notes(symbol_id); diff --git a/package.json b/package.json index 4126c41..ccd100e 100644 --- a/package.json +++ b/package.json @@ -21,6 +21,7 @@ "db:shell": "docker compose exec db psql -U postgres -d ecma_spec", "db:migrate": "bun scripts/db-migrate.ts", "db:sync-sources": "bun scripts/sync-sources.ts", + "test": "bun test tests/", "ingest": "bun scripts/ingest/pipeline.ts", "ingest:chunk": "bun scripts/ingest/chunk.ts", "ingest:embed": "bun scripts/ingest/embed.ts", diff --git a/tests/db/xsd-schema.test.ts b/tests/db/xsd-schema.test.ts new file mode 100644 index 0000000..0a81328 --- /dev/null +++ b/tests/db/xsd-schema.test.ts @@ -0,0 +1,228 @@ +/** + * Phase 2 acceptance tests: XSD schema integrity. + * + * Each test starts with an empty xsd_* / behavior_notes set. spec_content and + * reference_sources are left alone. The XSD tables are empty by design in Phase 2; + * once Phase 3 ingests data, tests should move to a separate TEST_DATABASE_URL. + * + * Usage: + * DATABASE_URL=postgresql://... 
bun test tests/db/xsd-schema.test.ts + */ + +import { afterAll, beforeAll, beforeEach, expect, test } from "bun:test"; +import { createDbClient, type DbClient } from "../../packages/shared/src/db/index.ts"; + +const databaseUrl = process.env.DATABASE_URL; +if (!databaseUrl) { + throw new Error("Missing DATABASE_URL for integration tests"); +} + +let db: DbClient; + +beforeAll(() => { + db = createDbClient(databaseUrl); +}); + +afterAll(async () => { + await db.close(); +}); + +beforeEach(async () => { + // Wipe phase-2 tables; spec_content / reference_sources untouched. + await db.sql` + TRUNCATE + behavior_notes, + xsd_enums, + xsd_inheritance_edges, + xsd_group_edges, + xsd_attr_edges, + xsd_child_edges, + xsd_compositors, + xsd_symbol_profiles, + xsd_symbols, + xsd_namespaces, + xsd_profiles + RESTART IDENTITY CASCADE + `; +}); + +// expect(promise).rejects.toThrow() doesn't trigger the postgres library's lazy +// query execution reliably; using an explicit try/catch instead. +async function expectThrows(fn: () => Promise): Promise { + let threw = false; + try { + await fn(); + } catch { + threw = true; + } + expect(threw).toBe(true); +} + +test("xsd_symbols enforces canonical identity (vocabulary_id, local_name, kind)", async () => { + await db.sql`INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'tbl', 'element')`; + + await expectThrows( + () => db.sql`INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'tbl', 'element')`, + ); + + // Same name, different kind is allowed (an element and complexType can share names). 
+ await db.sql`INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'tbl', 'complexType')`; +}); + +test("xsd_compositors CHECK constraints", async () => { + const [profile] = await db.sql`INSERT INTO xsd_profiles (name) VALUES ('test-profile') RETURNING id`; + const [symbol] = await db.sql` + INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'CT_Tbl', 'complexType') RETURNING id + `; + + await db.sql` + INSERT INTO xsd_compositors (parent_symbol_id, profile_id, kind) + VALUES (${symbol.id}, ${profile.id}, 'sequence') + `; + + await expectThrows(() => db.sql` + INSERT INTO xsd_compositors (parent_symbol_id, profile_id, kind) + VALUES (${symbol.id}, ${profile.id}, 'group') + `); + + await expectThrows( + () => db.sql`INSERT INTO xsd_compositors (profile_id, kind) VALUES (${profile.id}, 'sequence')`, + ); +}); + +test("xsd_attr_edges attr_use enum and default", async () => { + const [profile] = await db.sql`INSERT INTO xsd_profiles (name) VALUES ('test-profile') RETURNING id`; + const [symbol] = await db.sql` + INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'CT_Tbl', 'complexType') RETURNING id + `; + + const [defaulted] = await db.sql` + INSERT INTO xsd_attr_edges (symbol_id, local_name, profile_id) + VALUES (${symbol.id}, 'someAttr', ${profile.id}) + RETURNING attr_use + `; + expect(defaulted.attr_use).toBe("optional"); + + await db.sql` + INSERT INTO xsd_attr_edges (symbol_id, local_name, profile_id, attr_use) + VALUES (${symbol.id}, 'requiredAttr', ${profile.id}, 'required') + `; + + await expectThrows(() => db.sql` + INSERT INTO xsd_attr_edges (symbol_id, local_name, profile_id, attr_use) + VALUES (${symbol.id}, 'badAttr', ${profile.id}, 'whatever') + `); +}); + +test("behavior_notes claim_type enum is enforced", async () => { + await db.sql` + INSERT INTO behavior_notes (app, claim_type, summary) + VALUES ('Word', 'ignores', 'test') + `; + + await expectThrows(() => db.sql` + 
INSERT INTO behavior_notes (app, claim_type, summary) + VALUES ('Word', 'does_something', 'test') + `); +}); + +test("xsd_inheritance_edges allows one base per (symbol, profile)", async () => { + const [profile] = await db.sql`INSERT INTO xsd_profiles (name) VALUES ('test-profile') RETURNING id`; + const [derived] = await db.sql` + INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'CT_Derived', 'complexType') RETURNING id + `; + const [base1] = await db.sql` + INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'CT_Base1', 'complexType') RETURNING id + `; + const [base2] = await db.sql` + INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'CT_Base2', 'complexType') RETURNING id + `; + + await db.sql` + INSERT INTO xsd_inheritance_edges (symbol_id, base_symbol_id, profile_id, relation) + VALUES (${derived.id}, ${base1.id}, ${profile.id}, 'extension') + `; + + await expectThrows(() => db.sql` + INSERT INTO xsd_inheritance_edges (symbol_id, base_symbol_id, profile_id, relation) + VALUES (${derived.id}, ${base2.id}, ${profile.id}, 'restriction') + `); +}); + +test("CASCADE delete cleans up dependent rows", async () => { + const [profile] = await db.sql`INSERT INTO xsd_profiles (name) VALUES ('test-profile') RETURNING id`; + const [namespace] = await db.sql`INSERT INTO xsd_namespaces (uri) VALUES ('http://example.com/test') RETURNING id`; + const [symbol] = await db.sql` + INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'w:tbl', 'element') RETURNING id + `; + + await db.sql` + INSERT INTO xsd_symbol_profiles (symbol_id, profile_id, namespace_id) + VALUES (${symbol.id}, ${profile.id}, ${namespace.id}) + `; + await db.sql` + INSERT INTO xsd_compositors (parent_symbol_id, profile_id, kind) + VALUES (${symbol.id}, ${profile.id}, 'sequence') + `; + + await db.sql`DELETE FROM xsd_symbols WHERE id = ${symbol.id}`; + + const [remainingProfiles] = await db.sql`SELECT 
COUNT(*)::int AS c FROM xsd_symbol_profiles WHERE symbol_id = ${symbol.id}`; + const [remainingCompositors] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_compositors WHERE parent_symbol_id = ${symbol.id}`; + expect(remainingProfiles.c).toBe(0); + expect(remainingCompositors.c).toBe(0); +}); + +test("realistic insert and lookup: 'children of w:tbl in profile transitional'", async () => { + const [profile] = await db.sql`INSERT INTO xsd_profiles (name) VALUES ('transitional') RETURNING id`; + const [namespace] = await db.sql` + INSERT INTO xsd_namespaces (uri) VALUES ('http://schemas.openxmlformats.org/wordprocessingml/2006/main') RETURNING id + `; + const [tbl] = await db.sql` + INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'tbl', 'element') RETURNING id + `; + const [tblPr] = await db.sql` + INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'tblPr', 'element') RETURNING id + `; + const [tblGrid] = await db.sql` + INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'tblGrid', 'element') RETURNING id + `; + + await db.sql` + INSERT INTO xsd_symbol_profiles (symbol_id, profile_id, namespace_id) + VALUES (${tbl.id}, ${profile.id}, ${namespace.id}) + `; + + const [seq] = await db.sql` + INSERT INTO xsd_compositors (parent_symbol_id, profile_id, kind) + VALUES (${tbl.id}, ${profile.id}, 'sequence') + RETURNING id + `; + + await db.sql` + INSERT INTO xsd_child_edges (parent_symbol_id, compositor_id, child_symbol_id, profile_id, min_occurs, max_occurs, order_index) + VALUES + (${tbl.id}, ${seq.id}, ${tblPr.id}, ${profile.id}, 1, 1, 0), + (${tbl.id}, ${seq.id}, ${tblGrid.id}, ${profile.id}, 1, 1, 1) + `; + + const children = await db.sql` + SELECT s.local_name, e.min_occurs, e.max_occurs, e.order_index + FROM xsd_child_edges e + JOIN xsd_symbols s ON s.id = e.child_symbol_id + WHERE e.parent_symbol_id = ${tbl.id} AND e.profile_id = ${profile.id} + ORDER BY e.order_index + `; + + 
expect(children).toHaveLength(2); + expect(children[0]).toMatchObject({ local_name: "tblPr", min_occurs: 1, max_occurs: 1, order_index: 0 }); + expect(children[1]).toMatchObject({ local_name: "tblGrid", min_occurs: 1, max_occurs: 1, order_index: 1 }); +}); + +test("xsd_namespaces and xsd_profiles have unique constraints on natural keys", async () => { + await db.sql`INSERT INTO xsd_profiles (name) VALUES ('transitional')`; + await expectThrows(() => db.sql`INSERT INTO xsd_profiles (name) VALUES ('transitional')`); + + await db.sql`INSERT INTO xsd_namespaces (uri) VALUES ('http://example.com/x')`; + await expectThrows(() => db.sql`INSERT INTO xsd_namespaces (uri) VALUES ('http://example.com/x')`); +}); From 98348c1bc5ffa42fc48bdfc2331d6164bc151b85 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 10:12:36 -0300 Subject: [PATCH 03/24] fix(db): xsd_compositors XOR check on parent Top-level (parent_symbol_id) and nested (parent_compositor_id) compositors are mutually exclusive in the model. The previous OR check let a single row claim both, which would make traversal/children queries ambiguous. Tightened to XOR in both schema.sql and the migration; test now also rejects the both-set case. --- db/migrations/0002_xsd_schema.sql | 5 +++-- db/schema.sql | 3 ++- tests/db/xsd-schema.test.ts | 18 +++++++++++++++++- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/db/migrations/0002_xsd_schema.sql b/db/migrations/0002_xsd_schema.sql index 45d63e6..2e40aa6 100644 --- a/db/migrations/0002_xsd_schema.sql +++ b/db/migrations/0002_xsd_schema.sql @@ -40,7 +40,8 @@ CREATE TABLE IF NOT EXISTS xsd_symbol_profiles ( ); -- Content-model compositors (xs:sequence | xs:choice | xs:all). Profile-scoped. --- Either parent_symbol_id or parent_compositor_id is set (top-level vs nested). +-- Exactly one of parent_symbol_id (top-level on a type/group) or +-- parent_compositor_id (nested inside another compositor) is set. 
CREATE TABLE IF NOT EXISTS xsd_compositors ( id SERIAL PRIMARY KEY, parent_symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE, @@ -50,7 +51,7 @@ CREATE TABLE IF NOT EXISTS xsd_compositors ( min_occurs INT DEFAULT 1, max_occurs INT, -- NULL = unbounded order_index INT DEFAULT 0, - CHECK (parent_symbol_id IS NOT NULL OR parent_compositor_id IS NOT NULL) + CHECK ((parent_symbol_id IS NOT NULL) <> (parent_compositor_id IS NOT NULL)) ); -- Child element edges. parent_symbol_id is denormalized for fast "children of X" queries diff --git a/db/schema.sql b/db/schema.sql index 467909d..3f2e1e6 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -84,6 +84,7 @@ CREATE TABLE xsd_symbol_profiles ( UNIQUE (symbol_id, profile_id) ); +-- Exactly one of parent_symbol_id (top-level) or parent_compositor_id (nested) is set. CREATE TABLE xsd_compositors ( id SERIAL PRIMARY KEY, parent_symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE, @@ -93,7 +94,7 @@ CREATE TABLE xsd_compositors ( min_occurs INT DEFAULT 1, max_occurs INT, order_index INT DEFAULT 0, - CHECK (parent_symbol_id IS NOT NULL OR parent_compositor_id IS NOT NULL) + CHECK ((parent_symbol_id IS NOT NULL) <> (parent_compositor_id IS NOT NULL)) ); CREATE TABLE xsd_child_edges ( diff --git a/tests/db/xsd-schema.test.ts b/tests/db/xsd-schema.test.ts index 0a81328..ecbd398 100644 --- a/tests/db/xsd-schema.test.ts +++ b/tests/db/xsd-schema.test.ts @@ -75,19 +75,35 @@ test("xsd_compositors CHECK constraints", async () => { INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) VALUES ('wml-main', 'CT_Tbl', 'complexType') RETURNING id `; - await db.sql` + // Top-level: parent_symbol_id only. + const [topLevel] = await db.sql` INSERT INTO xsd_compositors (parent_symbol_id, profile_id, kind) VALUES (${symbol.id}, ${profile.id}, 'sequence') + RETURNING id + `; + + // Nested: parent_compositor_id only. 
+ await db.sql` + INSERT INTO xsd_compositors (parent_compositor_id, profile_id, kind) + VALUES (${topLevel.id}, ${profile.id}, 'choice') `; + // kind must be sequence/choice/all. await expectThrows(() => db.sql` INSERT INTO xsd_compositors (parent_symbol_id, profile_id, kind) VALUES (${symbol.id}, ${profile.id}, 'group') `); + // Neither parent set is rejected. await expectThrows( () => db.sql`INSERT INTO xsd_compositors (profile_id, kind) VALUES (${profile.id}, 'sequence')`, ); + + // Both parents set is rejected (top-level vs nested are mutually exclusive). + await expectThrows(() => db.sql` + INSERT INTO xsd_compositors (parent_symbol_id, parent_compositor_id, profile_id, kind) + VALUES (${symbol.id}, ${topLevel.id}, ${profile.id}, 'sequence') + `); }); test("xsd_attr_edges attr_use enum and default", async () => { From 305873a6ddc90168235e5a13efdeda70d2169ae2 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 10:47:37 -0300 Subject: [PATCH 04/24] feat(xsd): scaffold ECMA Transitional fetch (Phase 3a) Adds scripts/fetch-xsd.ts which downloads the ECMA-376 Part 4 zip, verifies sha256, extracts the inner OfficeOpenXML-XMLSchema-Transitional zip, and lands the 26 XSDs under data/xsd-cache/ecma-376-transitional/. Cache is gitignored; manifest tracks the source identity, the canonical publications URL, and (after first fetch) the outer-zip sha256 for reproducibility. The Part 4 URL is supplied at fetch time via --url or XSD_PART4_URL. Also softens the ECMA license_note to neutral wording. 
--- .gitignore | 5 +- data/sources.json | 11 +++- package.json | 1 + scripts/fetch-xsd.ts | 149 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 164 insertions(+), 2 deletions(-) create mode 100644 scripts/fetch-xsd.ts diff --git a/.gitignore b/.gitignore index 6e8b029..dcb1cd7 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,7 @@ dev/ .vscode/ # Local-only planning doc (public repo) -PLAN.md \ No newline at end of file +PLAN.md + +# XSD/spec artifacts: pulled by scripts/fetch-xsd.ts; never committed. +data/xsd-cache/ \ No newline at end of file diff --git a/data/sources.json b/data/sources.json index c67f439..9755419 100644 --- a/data/sources.json +++ b/data/sources.json @@ -7,7 +7,16 @@ "edition": "unknown", "version": null, "url": "https://ecma-international.org/publications-and-standards/standards/ecma-376/", - "license_note": "ECMA-376 is freely available under ECMA terms; redistribution permitted with attribution.", + "license_note": "Published by Ecma International. See the ECMA-376 publications page for the current download and licensing terms before redistribution.", + "sha256": null + }, + { + "name": "ecma-376-transitional", + "kind": "xsd", + "edition": "5th", + "version": "2016-12", + "url": "https://ecma-international.org/publications-and-standards/standards/ecma-376/", + "license_note": "Published by Ecma International. 
See the ECMA-376 publications page for the current download and licensing terms before redistribution.", "sha256": null } ] diff --git a/package.json b/package.json index ccd100e..f548469 100644 --- a/package.json +++ b/package.json @@ -21,6 +21,7 @@ "db:shell": "docker compose exec db psql -U postgres -d ecma_spec", "db:migrate": "bun scripts/db-migrate.ts", "db:sync-sources": "bun scripts/sync-sources.ts", + "xsd:fetch": "bun scripts/fetch-xsd.ts", "test": "bun test tests/", "ingest": "bun scripts/ingest/pipeline.ts", "ingest:chunk": "bun scripts/ingest/chunk.ts", diff --git a/scripts/fetch-xsd.ts b/scripts/fetch-xsd.ts new file mode 100644 index 0000000..cfeb60b --- /dev/null +++ b/scripts/fetch-xsd.ts @@ -0,0 +1,149 @@ +/** + * Fetch ECMA-376 Transitional XSDs from the ECMA Part 4 zip. + * + * The Part 4 zip is published by Ecma International on the ECMA-376 + * publications page. It contains OfficeOpenXML-XMLSchema-Transitional.zip, + * which in turn contains the 26 Transitional XSDs (wml.xsd, dml-main.xsd, + * sml.xsd, pml.xsd, shared-*.xsd, and friends). + * + * Cache layout: + * data/xsd-cache/ + * _staging/ (outer + inner zip extraction scratch) + * ecma-376-transitional/ (final XSDs land here) + * + * Usage: + * bun scripts/fetch-xsd.ts --url URL + * bun scripts/fetch-xsd.ts --url URL --expected-sha256 HEX + * + * Or via env: + * XSD_PART4_URL=URL bun scripts/fetch-xsd.ts + * + * After a successful fetch the script prints the outer-zip sha256; + * paste it into data/sources.json under the ecma-376-transitional entry + * to pin reproducibility.
+ */ + +import { createHash } from "node:crypto"; +import { existsSync, readdirSync } from "node:fs"; +import { mkdir, rm } from "node:fs/promises"; +import { join } from "node:path"; + +const CACHE_ROOT = "./data/xsd-cache"; +const STAGING_DIR = join(CACHE_ROOT, "_staging"); +const FINAL_DIR = join(CACHE_ROOT, "ecma-376-transitional"); +const DEFAULT_INNER_ZIP = "OfficeOpenXML-XMLSchema-Transitional.zip"; + +interface Args { + url: string; + expectedSha256: string | null; + innerZip: string; +} + +function parseArgs(): Args { + const argv = process.argv.slice(2); + let url = process.env.XSD_PART4_URL ?? ""; + let expectedSha256: string | null = null; + let innerZip = DEFAULT_INNER_ZIP; + + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + if (a === "--url") url = argv[++i] ?? ""; + else if (a === "--expected-sha256") expectedSha256 = argv[++i] ?? null; + else if (a === "--inner-zip") innerZip = argv[++i] ?? DEFAULT_INNER_ZIP; + } + + if (!url) { + console.error("Missing --url (or XSD_PART4_URL env var)."); + console.error("Pass the canonical ECMA-376 5th edition Part 4 zip URL."); + process.exit(1); + } + return { url, expectedSha256, innerZip }; +} + +async function sha256(path: string): Promise { + const buf = await Bun.file(path).arrayBuffer(); + return createHash("sha256").update(new Uint8Array(buf)).digest("hex"); +} + +async function downloadTo(url: string, dest: string): Promise { + console.log(`Downloading ${url}`); + const res = await fetch(url); + if (!res.ok) { + throw new Error(`Fetch failed: ${res.status} ${res.statusText}`); + } + const buf = await res.arrayBuffer(); + await Bun.write(dest, buf); + console.log(` wrote ${dest} (${(buf.byteLength / 1024 / 1024).toFixed(2)} MiB)`); +} + +async function unzipInto(zipPath: string, dir: string): Promise { + await mkdir(dir, { recursive: true }); + const proc = Bun.spawn(["unzip", "-o", "-q", zipPath, "-d", dir], { + stdout: "inherit", + stderr: "inherit", + }); + const code = await 
proc.exited; + if (code !== 0) throw new Error(`unzip exited ${code} on ${zipPath}`); +} + +function findFile(dir: string, name: string): string | null { + const stack = [dir]; + while (stack.length) { + const cur = stack.pop()!; + for (const entry of readdirSync(cur, { withFileTypes: true })) { + const p = join(cur, entry.name); + if (entry.isDirectory()) stack.push(p); + else if (entry.name === name) return p; + } + } + return null; +} + +async function main() { + const args = parseArgs(); + + await rm(STAGING_DIR, { recursive: true, force: true }); + await rm(FINAL_DIR, { recursive: true, force: true }); + await mkdir(STAGING_DIR, { recursive: true }); + + const outerPath = join(STAGING_DIR, "part4.zip"); + await downloadTo(args.url, outerPath); + + const outerHash = await sha256(outerPath); + console.log(`outer zip sha256: ${outerHash}`); + if (args.expectedSha256 && outerHash !== args.expectedSha256) { + throw new Error(`sha256 mismatch: expected ${args.expectedSha256}, got ${outerHash}`); + } + + console.log(`Extracting outer zip into ${STAGING_DIR}`); + await unzipInto(outerPath, STAGING_DIR); + + const innerPath = findFile(STAGING_DIR, args.innerZip); + if (!innerPath) { + throw new Error(`Did not find ${args.innerZip} inside the outer zip.`); + } + console.log(`Found inner zip at ${innerPath}`); + + console.log(`Extracting Transitional XSDs into ${FINAL_DIR}`); + await unzipInto(innerPath, FINAL_DIR); + + const wml = findFile(FINAL_DIR, "wml.xsd"); + if (!wml) { + throw new Error(`wml.xsd not found in extracted XSD set; aborting.`); + } + + const xsdFiles = readdirSync(FINAL_DIR).filter((f) => f.endsWith(".xsd")); + console.log(`\nDone. 
${xsdFiles.length} XSD files in ${FINAL_DIR}:`); + for (const f of xsdFiles.slice().sort()) console.log(` ${f}`); + + if (!args.expectedSha256) { + console.log("\nTo pin this fetch for reproducibility, paste the sha256 above into"); + console.log("data/sources.json under the 'ecma-376-transitional' entry, then re-run"); + console.log("`bun run db:sync-sources` to update the row."); + } +} + +main().catch((err) => { + console.error("Fetch failed:", err); + process.exit(1); +}); From b8c36ac457b9cc9fd948c0e296be748dc484d852 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 10:47:57 -0300 Subject: [PATCH 05/24] chore(xsd): drop unused existsSync import --- scripts/fetch-xsd.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/fetch-xsd.ts b/scripts/fetch-xsd.ts index cfeb60b..82a3c21 100644 --- a/scripts/fetch-xsd.ts +++ b/scripts/fetch-xsd.ts @@ -24,7 +24,7 @@ */ import { createHash } from "node:crypto"; -import { existsSync, readdirSync } from "node:fs"; +import { readdirSync } from "node:fs"; import { mkdir, rm } from "node:fs/promises"; import { join } from "node:path"; From d6be8856553bc79fe4b2442f2a17c120732c7483 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 11:01:04 -0300 Subject: [PATCH 06/24] chore(xsd): pin ECMA Transitional Part 4 zip hash Direct ECMA download URL plus outer-zip sha256 captured after a successful fetch+extract; reproducible via `bun run xsd:fetch`. --- data/sources.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/sources.json b/data/sources.json index 9755419..642bd50 100644 --- a/data/sources.json +++ b/data/sources.json @@ -15,9 +15,9 @@ "kind": "xsd", "edition": "5th", "version": "2016-12", - "url": "https://ecma-international.org/publications-and-standards/standards/ecma-376/", + "url": "https://ecma-international.org/wp-content/uploads/ECMA-376-4_5th_edition_december_2016.zip", "license_note": "Published by Ecma International. 
See the ECMA-376 publications page for the current download and licensing terms before redistribution.", - "sha256": null + "sha256": "bd25da1109f73762356596918bf5ff8b74a1331642dba5f1c1d1dfc6bed34ecd" } ] } From 0f319c72becd63a644098d5aa603f0a0cdf8bb06 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 11:18:40 -0300 Subject: [PATCH 07/24] feat(xsd): parser scaffolding (Phase 3b) parseSchemaSet({ schemaDir, entrypoints }) loads a working set of XSDs, follows xsd:import schemaLocation references recursively, and indexes every top-level declaration (element/complexType/simpleType/group/ attributeGroup/attribute) by canonical Clark-style qname. fast-xml-parser configured with preserveOrder so sibling order across different tag names is retained, and no value coercion that would mutate XSD attribute strings. Returns a typed schema set: - documents: per-file metadata + raw schemaNode - namespaceByPrefix: per-document prefix -> URI maps - importGraph: per-document outgoing imports with resolved targets - declarationsByQName: canonical qname -> declarations[] QName resolution is conservative: declaration qnames use the document's target namespace; attribute qnames (ref/type/base) resolve through the document's prefix map and surface as { resolved: false } when the prefix or namespace is unknown rather than guessing. No DB writes in this phase. Smoke command bun run xsd:smoke parses wml.xsd from the cache and reports counts (820 complexTypes, 389 simpleTypes, 67 groups, 47 elements, etc). Also tightens DB test isolation: an afterAll TRUNCATE leaves the dev DB clean instead of carrying the last test's xsd_profiles row. 
--- bun.lock | 15 ++- package.json | 2 + scripts/ingest-xsd/ast.ts | 73 ++++++++++ scripts/ingest-xsd/parse-schema.ts | 187 ++++++++++++++++++++++++++ scripts/ingest-xsd/qname.ts | 88 ++++++++++++ scripts/ingest-xsd/smoke.ts | 80 +++++++++++ scripts/ingest-xsd/types.ts | 75 +++++++++++ scripts/ingest-xsd/vocabulary.ts | 67 +++++++++ tests/db/xsd-schema.test.ts | 34 ++--- tests/ingest-xsd/fixtures/main.xsd | 31 +++++ tests/ingest-xsd/fixtures/shared.xsd | 11 ++ tests/ingest-xsd/parse-schema.test.ts | 156 +++++++++++++++++++++ 12 files changed, 802 insertions(+), 17 deletions(-) create mode 100644 scripts/ingest-xsd/ast.ts create mode 100644 scripts/ingest-xsd/parse-schema.ts create mode 100644 scripts/ingest-xsd/qname.ts create mode 100644 scripts/ingest-xsd/smoke.ts create mode 100644 scripts/ingest-xsd/types.ts create mode 100644 scripts/ingest-xsd/vocabulary.ts create mode 100644 tests/ingest-xsd/fixtures/main.xsd create mode 100644 tests/ingest-xsd/fixtures/shared.xsd create mode 100644 tests/ingest-xsd/parse-schema.test.ts diff --git a/bun.lock b/bun.lock index 9a06be8..a026652 100644 --- a/bun.lock +++ b/bun.lock @@ -10,6 +10,7 @@ "@semantic-release/exec": "^7.1.0", "@semantic-release/git": "^10.0.1", "@semantic-release/github": "^12.0.2", + "fast-xml-parser": "^5.7.2", "lefthook": "^2.0.16", "semantic-release": "^25.0.2", "typescript": "~5.9.3", @@ -17,7 +18,7 @@ }, "apps/mcp-server": { "name": "@ooxml-dev/mcp-server", - "version": "0.0.1", + "version": "0.13.1", "dependencies": { "@modelcontextprotocol/sdk": "^1.25.3", "@neondatabase/serverless": "^1.0.2", @@ -31,7 +32,7 @@ }, "apps/web": { "name": "@ooxml-dev/web", - "version": "0.1.3", + "version": "0.13.1", "dependencies": { "clsx": "^2.1.1", "fumadocs-core": "^16.4.9", @@ -284,6 +285,8 @@ "@neondatabase/serverless": ["@neondatabase/serverless@1.0.2", "", { "dependencies": { "@types/node": "^22.15.30", "@types/pg": "^8.8.0" } }, 
"sha512-I5sbpSIAHiB+b6UttofhrN/UJXII+4tZPAq1qugzwCwLIL8EZLV7F/JyHUrEIiGgQpEXzpnjlJ+zwcEhheGvCw=="], + "@nodable/entities": ["@nodable/entities@2.1.0", "", {}, "sha512-nyT7T3nbMyBI/lvr6L5TyWbFJAI9FTgVRakNoBqCD+PmID8DzFrrNdLLtHMwMszOtqZa8PAOV24ZqDnQrhQINA=="], + "@octokit/auth-token": ["@octokit/auth-token@6.0.0", "", {}, "sha512-P4YJBPdPSpWTQ1NU4XYdvHvXJJDxM6YwpS0FZHRgP7YFkdVxsWcpWGy/NVqlAA7PcPCnMacXlRm1y2PFZRWL/w=="], "@octokit/core": ["@octokit/core@7.0.6", "", { "dependencies": { "@octokit/auth-token": "^6.0.0", "@octokit/graphql": "^9.0.3", "@octokit/request": "^10.0.6", "@octokit/request-error": "^7.0.2", "@octokit/types": "^16.0.0", "before-after-hook": "^4.0.0", "universal-user-agent": "^7.0.0" } }, "sha512-DhGl4xMVFGVIyMwswXeyzdL4uXD5OGILGX5N8Y+f6W7LhC1Ze2poSNrkF/fedpVDHEEZ+PHFW0vL14I+mm8K3Q=="], @@ -762,6 +765,10 @@ "fast-uri": ["fast-uri@3.1.0", "", {}, "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA=="], + "fast-xml-builder": ["fast-xml-builder@1.1.5", "", { "dependencies": { "path-expression-matcher": "^1.1.3" } }, "sha512-4TJn/8FKLeslLAH3dnohXqE3QSoxkhvaMzepOIZytwJXZO69Bfz0HBdDHzOTOon6G59Zrk6VQ2bEiv1t61rfkA=="], + + "fast-xml-parser": ["fast-xml-parser@5.7.2", "", { "dependencies": { "@nodable/entities": "^2.1.0", "fast-xml-builder": "^1.1.5", "path-expression-matcher": "^1.5.0", "strnum": "^2.2.3" }, "bin": { "fxparser": "src/cli/cli.js" } }, "sha512-P7oW7tLbYnhOLQk/Gv7cZgzgMPP/XN03K02/Jy6Y/NHzyIAIpxuZIM/YqAkfiXFPxA2CTm7NtCijK9EDu09u2w=="], + "fdir": ["fdir@6.5.0", "", { "peerDependencies": { "picomatch": "^3 || ^4" }, "optionalPeers": ["picomatch"] }, "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg=="], "figures": ["figures@6.1.0", "", { "dependencies": { "is-unicode-supported": "^2.0.0" } }, "sha512-d+l3qxjSesT4V7v2fh+QnmFnUWv9lSpjarhShNTgBOfA0ttejbQUAlHLitbjkoRiDulW0OPoQPYIGhIC8ohejg=="], @@ -1212,6 +1219,8 @@ "path-exists": ["path-exists@3.0.0", "", {}, 
"sha512-bpC7GYwiDYQ4wYLe+FA8lhRjhQCMcQGuSgGGqDkg/QerRWw9CmGRT0iSOVRSZJ29NMLZgIzqaljJ63oaL4NIJQ=="], + "path-expression-matcher": ["path-expression-matcher@1.5.0", "", {}, "sha512-cbrerZV+6rvdQrrD+iGMcZFEiiSrbv9Tfdkvnusy6y0x0GKBXREFg/Y65GhIfm0tnLntThhzCnfKwp1WRjeCyQ=="], + "path-key": ["path-key@3.1.1", "", {}, "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q=="], "path-to-regexp": ["path-to-regexp@6.3.0", "", {}, "sha512-Yhpw4T9C6hPpgPeA28us07OJeqZ5EzQTkbfwuhsUg0c237RomFoETJgmp2sa3F/41gfLE6G5cqcYwznmeEeOlQ=="], @@ -1408,6 +1417,8 @@ "strip-json-comments": ["strip-json-comments@2.0.1", "", {}, "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ=="], + "strnum": ["strnum@2.2.3", "", {}, "sha512-oKx6RUCuHfT3oyVjtnrmn19H1SiCqgJSg+54XqURKp5aCMbrXrhLjRN9TjuwMjiYstZ0MzDrHqkGZ5dFTKd+zg=="], + "style-to-js": ["style-to-js@1.1.21", "", { "dependencies": { "style-to-object": "1.0.14" } }, "sha512-RjQetxJrrUJLQPHbLku6U/ocGtzyjbJMP9lCNK7Ag0CNh690nSH8woqWH9u16nMjYBAok+i7JO1NP2pOy8IsPQ=="], "style-to-object": ["style-to-object@1.0.14", "", { "dependencies": { "inline-style-parser": "0.2.7" } }, "sha512-LIN7rULI0jBscWQYaSswptyderlarFkjQ+t79nzty8tcIAceVomEVlLzH5VP4Cmsv6MtKhs7qaAiwlcp+Mgaxw=="], diff --git a/package.json b/package.json index f548469..a9633ae 100644 --- a/package.json +++ b/package.json @@ -22,6 +22,7 @@ "db:migrate": "bun scripts/db-migrate.ts", "db:sync-sources": "bun scripts/sync-sources.ts", "xsd:fetch": "bun scripts/fetch-xsd.ts", + "xsd:smoke": "bun scripts/ingest-xsd/smoke.ts", "test": "bun test tests/", "ingest": "bun scripts/ingest/pipeline.ts", "ingest:chunk": "bun scripts/ingest/chunk.ts", @@ -35,6 +36,7 @@ "@semantic-release/exec": "^7.1.0", "@semantic-release/git": "^10.0.1", "@semantic-release/github": "^12.0.2", + "fast-xml-parser": "^5.7.2", "lefthook": "^2.0.16", "semantic-release": "^25.0.2", "typescript": "~5.9.3" diff --git a/scripts/ingest-xsd/ast.ts 
b/scripts/ingest-xsd/ast.ts new file mode 100644 index 0000000..0e98d55 --- /dev/null +++ b/scripts/ingest-xsd/ast.ts @@ -0,0 +1,73 @@ +/** + * Helpers for navigating the preserveOrder AST emitted by fast-xml-parser. + * + * AST shape: every element is a single-key object { tagName: children[], ":@"?: { "@_attrName": value } }. + * Text nodes are { "#text": string }. Children always live in an array, so sibling + * order is preserved across different tag names. + */ + +import type { PreserveOrderDocument, PreserveOrderNode } from "./types.ts"; + +/** Strip an XML namespace prefix from a tag name: "xsd:element" → "element". */ +export function stripPrefix(tag: string): string { + const colon = tag.indexOf(":"); + return colon < 0 ? tag : tag.slice(colon + 1); +} + +/** Return the single tag name on a preserveOrder node, or null for non-element nodes. */ +export function nodeTag(node: PreserveOrderNode): string | null { + for (const k of Object.keys(node)) { + if (k !== ":@") return k; + } + return null; +} + +/** Return the children array of a preserveOrder element. */ +export function nodeChildren(node: PreserveOrderNode): PreserveOrderNode[] { + const tag = nodeTag(node); + if (!tag) return []; + const v = node[tag]; + return Array.isArray(v) ? (v as PreserveOrderNode[]) : []; +} + +/** Return attributes on a preserveOrder element. fast-xml-parser nests them under ":@" with "@_" prefix. */ +export function nodeAttrs(node: PreserveOrderNode): Record { + const raw = node[":@"]; + if (!raw || typeof raw !== "object") return {}; + const out: Record = {}; + for (const [k, v] of Object.entries(raw as Record)) { + const name = k.startsWith("@_") ? k.slice(2) : k; + if (typeof v === "string") out[name] = v; + else if (v != null) out[name] = String(v); + } + return out; +} + +/** + * Find the first element in `doc` (or under `parent`) whose stripped tag name + * matches one of the given local names. 
Used to locate the xsd:schema root + * regardless of whether the file uses `xsd:`, `xs:`, or no prefix. + */ +export function findFirstByLocalName( + nodes: PreserveOrderDocument | PreserveOrderNode[], + localNames: string[], +): PreserveOrderNode | null { + for (const node of nodes) { + const tag = nodeTag(node); + if (tag && localNames.includes(stripPrefix(tag))) return node; + } + return null; +} + +/** + * Iterate immediate children of an element whose stripped tag name matches `localName`. + */ +export function* eachChildByLocalName( + parent: PreserveOrderNode, + localName: string, +): Generator { + for (const child of nodeChildren(parent)) { + const tag = nodeTag(child); + if (tag && stripPrefix(tag) === localName) yield child; + } +} diff --git a/scripts/ingest-xsd/parse-schema.ts b/scripts/ingest-xsd/parse-schema.ts new file mode 100644 index 0000000..b6b933e --- /dev/null +++ b/scripts/ingest-xsd/parse-schema.ts @@ -0,0 +1,187 @@ +/** + * Parse a working set of XSDs into an in-memory schema set. + * + * Walks xsd:import schemaLocation references recursively starting from + * `entrypoints`, and indexes every top-level declaration by canonical qname. + * + * No DB writes here. Subsequent phases (3c+) walk documents/declarations to + * produce xsd_symbols, edges, etc. 
+ */ + +import { readFile } from "node:fs/promises"; +import { isAbsolute, normalize, relative, resolve, sep } from "node:path"; +import { XMLParser } from "fast-xml-parser"; +import { eachChildByLocalName, findFirstByLocalName, nodeAttrs, stripPrefix } from "./ast.ts"; +import { declarationQNameKey } from "./qname.ts"; +import type { + Declaration, + DeclarationKind, + ImportEdge, + ParsedSchemaDocument, + ParsedSchemaSet, + PreserveOrderDocument, + PreserveOrderNode, +} from "./types.ts"; +import { vocabularyForNamespace } from "./vocabulary.ts"; + +const xmlParser = new XMLParser({ + preserveOrder: true, + ignoreAttributes: false, + attributeNamePrefix: "@_", + parseAttributeValue: false, + parseTagValue: false, + trimValues: true, +}); + +const TOP_LEVEL_KINDS: Record = { + element: "element", + complexType: "complexType", + simpleType: "simpleType", + group: "group", + attributeGroup: "attributeGroup", + attribute: "attribute", +}; + +export interface ParseSchemaSetOptions { + schemaDir: string; + entrypoints: string[]; +} + +export async function parseSchemaSet(opts: ParseSchemaSetOptions): Promise { + const schemaDir = isAbsolute(opts.schemaDir) ? 
opts.schemaDir : resolve(opts.schemaDir); + + const documents = new Map(); + const namespaceByPrefix = new Map>(); + const importGraph = new Map(); + const declarationsByQName = new Map(); + + const queue: string[] = opts.entrypoints.map((p) => relPath(schemaDir, resolve(schemaDir, p))); + + while (queue.length) { + const relPathInDir = queue.shift()!; + if (documents.has(relPathInDir)) continue; + + const absolutePath = resolve(schemaDir, relPathInDir); + const text = await readFile(absolutePath, "utf-8"); + const ast = xmlParser.parse(text) as PreserveOrderDocument; + + const schemaNode = findFirstByLocalName(ast, ["schema"]); + if (!schemaNode) { + throw new Error(`No xsd:schema root in ${absolutePath}`); + } + + const attrs = nodeAttrs(schemaNode); + const targetNamespace = attrs.targetNamespace; + if (!targetNamespace) { + throw new Error(`Schema in ${absolutePath} is missing targetNamespace`); + } + + const prefixes = extractNamespacePrefixes(attrs); + const imports = extractImports(schemaNode, schemaDir, absolutePath); + + const doc: ParsedSchemaDocument = { + path: relPathInDir, + absolutePath, + targetNamespace, + vocabularyId: vocabularyForNamespace(targetNamespace), + schemaNode, + }; + + documents.set(relPathInDir, doc); + namespaceByPrefix.set(relPathInDir, prefixes); + importGraph.set(relPathInDir, imports); + + indexTopLevelDeclarations(doc, declarationsByQName); + + for (const edge of imports) { + if (edge.target && !documents.has(edge.target)) { + queue.push(edge.target); + } + } + } + + return { documents, namespaceByPrefix, importGraph, declarationsByQName }; +} + +function extractNamespacePrefixes(attrs: Record): Map { + const map = new Map(); + for (const [name, value] of Object.entries(attrs)) { + if (name === "xmlns") map.set("", value); + else if (name.startsWith("xmlns:")) map.set(name.slice("xmlns:".length), value); + } + return map; +} + +function extractImports( + schemaNode: PreserveOrderNode, + schemaDir: string, + currentAbsPath: 
string, +): ImportEdge[] { + const imports: ImportEdge[] = []; + for (const importNode of eachChildByLocalName(schemaNode, "import")) { + const a = nodeAttrs(importNode); + const schemaLocation = a.schemaLocation ?? null; + let target: string | null = null; + if (schemaLocation) { + const importedAbs = resolve(currentAbsPath, "..", schemaLocation); + target = relPath(schemaDir, importedAbs); + } + imports.push({ + namespace: a.namespace ?? "", + schemaLocation, + target, + }); + } + return imports; +} + +function indexTopLevelDeclarations( + doc: ParsedSchemaDocument, + declarationsByQName: Map, +): void { + for (const child of nodeChildrenLocal(doc.schemaNode)) { + const tag = nodeTagLocal(child); + if (!tag) continue; + const local = stripPrefix(tag); + const kind = TOP_LEVEL_KINDS[local]; + if (!kind) continue; + + const a = nodeAttrs(child); + const localName = a.name; + if (!localName) continue; + + const decl: Declaration = { + kind, + namespace: doc.targetNamespace, + vocabularyId: doc.vocabularyId, + localName, + documentPath: doc.path, + node: child, + }; + const key = declarationQNameKey(doc.targetNamespace, kind, localName); + const arr = declarationsByQName.get(key); + if (arr) arr.push(decl); + else declarationsByQName.set(key, [decl]); + } +} + +// Local helpers (avoid pulling extra exports from ast.ts). +function nodeTagLocal(node: PreserveOrderNode): string | null { + for (const k of Object.keys(node)) if (k !== ":@") return k; + return null; +} +function nodeChildrenLocal(node: PreserveOrderNode): PreserveOrderNode[] { + const tag = nodeTagLocal(node); + if (!tag) return []; + const v = node[tag]; + return Array.isArray(v) ? (v as PreserveOrderNode[]) : []; +} + +function relPath(base: string, abs: string): string { + const r = relative(base, normalize(abs)); + // Guard against escapes outside schemaDir. 
+ if (r.startsWith(`..${sep}`) || r === "..") { + throw new Error(`Resolved path escapes schemaDir: ${abs} (base ${base})`); + } + return r; +} diff --git a/scripts/ingest-xsd/qname.ts b/scripts/ingest-xsd/qname.ts new file mode 100644 index 0000000..fa884d3 --- /dev/null +++ b/scripts/ingest-xsd/qname.ts @@ -0,0 +1,88 @@ +/** + * QName resolution and canonical keys. + * + * Two distinct concerns: + * + * 1. Top-level declaration qnames are formed from the document's targetNamespace + * plus the local @name attribute. Use `declarationQNameKey(namespace, kind, localName)` + * to produce the canonical Clark-style key used in declarationsByQName. + * + * 2. QName-valued attributes (ref, type, base, substitutionGroup, etc.) hold a + * "prefix:localName" string. Resolution uses the document's xmlns:* declarations. + * `resolveQNameAttr` returns either a resolved tuple or "unresolved" — we never + * invent a namespace for an unknown prefix. + */ + +import { NAMESPACE_TO_VOCABULARY } from "./vocabulary.ts"; + +export interface ResolvedQName { + prefix: string; + localName: string; + namespace: string; + vocabularyId: string | null; +} + +export interface UnresolvedQName { + prefix: string; + localName: string; + raw: string; + reason: "unknown-prefix" | "unknown-namespace"; +} + +export type QNameResult = { resolved: true; qname: ResolvedQName } | { resolved: false; qname: UnresolvedQName }; + +/** + * Canonical key for the declarationsByQName map. + * Clark-style namespace prefix plus the kind, e.g.: + * {http://schemas.openxmlformats.org/wordprocessingml/2006/main}complexType:CT_Tbl + */ +export function declarationQNameKey( + namespace: string, + kind: string, + localName: string, +): string { + return `{${namespace}}${kind}:${localName}`; +} + +/** + * Resolve a qname string ("prefix:localName" or just "localName") in the context + * of a document's prefix → URI map. Unprefixed names use the empty-prefix entry + * (xmlns="..." 
default), falling back to the supplied default namespace. + * + * Never throws: returns { resolved: false, ... } for an empty string or when no namespace can be determined for the prefix, + * so the caller can decide whether to surface or persist as-is. A namespace without a known vocabulary still resolves (vocabularyId: null). + */ +export function resolveQNameAttr( + raw: string, + prefixMap: Map, + defaultNamespace: string, +): QNameResult { + if (!raw) { + return { + resolved: false, + qname: { prefix: "", localName: "", raw, reason: "unknown-prefix" }, + }; + } + + const colon = raw.indexOf(":"); + let prefix = ""; + let localName = raw; + if (colon >= 0) { + prefix = raw.slice(0, colon); + localName = raw.slice(colon + 1); + } + + const namespace = prefix ? prefixMap.get(prefix) : (prefixMap.get("") ?? defaultNamespace); + if (!namespace) { + return { + resolved: false, + qname: { prefix, localName, raw, reason: "unknown-prefix" }, + }; + } + + const vocabularyId = NAMESPACE_TO_VOCABULARY[namespace] ?? null; + return { + resolved: true, + qname: { prefix, localName, namespace, vocabularyId }, + }; +} diff --git a/scripts/ingest-xsd/smoke.ts b/scripts/ingest-xsd/smoke.ts new file mode 100644 index 0000000..210cab4 --- /dev/null +++ b/scripts/ingest-xsd/smoke.ts @@ -0,0 +1,80 @@ +/** + * Phase 3b smoke: parse the real WML working set and print a summary. + * + * Verifies the parser end-to-end against the live cache before Phase 3c + * starts writing symbols/edges to the DB. + * + * Usage: + * bun scripts/ingest-xsd/smoke.ts + * bun scripts/ingest-xsd/smoke.ts --schema-dir ./some/dir --entrypoint wml.xsd + */ + +import { parseSchemaSet } from "./parse-schema.ts"; +import type { DeclarationKind } from "./types.ts"; + +interface Args { + schemaDir: string; + entrypoints: string[]; +} + +function parseArgs(): Args { + const argv = process.argv.slice(2); + let schemaDir = "./data/xsd-cache/ecma-376-transitional"; + const entrypoints: string[] = []; + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + if (a === "--schema-dir") schemaDir = argv[++i] ??
schemaDir; + else if (a === "--entrypoint") entrypoints.push(argv[++i] ?? ""); + } + if (entrypoints.length === 0) entrypoints.push("wml.xsd"); + return { schemaDir, entrypoints }; +} + +async function main() { + const args = parseArgs(); + const set = await parseSchemaSet({ + schemaDir: args.schemaDir, + entrypoints: args.entrypoints, + }); + + console.log(`schemaDir: ${args.schemaDir}`); + console.log(`entrypoints: ${args.entrypoints.join(", ")}`); + console.log(`documents loaded: ${set.documents.size}\n`); + + for (const ep of args.entrypoints) { + const doc = set.documents.get(ep); + if (!doc) continue; + console.log(`${ep}`); + console.log(` targetNamespace: ${doc.targetNamespace}`); + console.log(` vocabularyId: ${doc.vocabularyId}`); + const imports = set.importGraph.get(ep) ?? []; + console.log(` imports (${imports.length}):`); + for (const imp of imports) { + console.log( + ` ${imp.namespace} → ${imp.target ?? "(no schemaLocation)"}`, + ); + } + console.log(); + } + + const counts: Record = { + element: 0, + complexType: 0, + simpleType: 0, + group: 0, + attributeGroup: 0, + attribute: 0, + }; + for (const arr of set.declarationsByQName.values()) { + for (const d of arr) counts[d.kind]++; + } + console.log("declaration counts (across all loaded documents):"); + for (const k of Object.keys(counts).sort() as DeclarationKind[]) { + console.log(` ${k.padEnd(16)} ${counts[k]}`); + } +} + +main().catch((err) => { + console.error("smoke failed:", err); + process.exit(1); +}); diff --git a/scripts/ingest-xsd/types.ts b/scripts/ingest-xsd/types.ts new file mode 100644 index 0000000..5bd1644 --- /dev/null +++ b/scripts/ingest-xsd/types.ts @@ -0,0 +1,75 @@ +/** + * Shared types for the XSD parser/ingest pipeline. + */ + +/** + * fast-xml-parser preserveOrder output: + * - Documents are arrays of single-key objects (one per top-level node). + * - Each element node has shape { tagName: children[], ":@"?: { "@_attr": value } }. + * - Text leaves are { "#text": string }. 
+ * We type these loosely and use helpers in ast.ts to navigate. + */ +export type PreserveOrderNode = Record; +export type PreserveOrderDocument = PreserveOrderNode[]; + +/** A single XSD file, after parsing. */ +export interface ParsedSchemaDocument { + /** Path relative to schemaDir (e.g. "wml.xsd"). */ + path: string; + absolutePath: string; + targetNamespace: string; + /** Stable id derived from targetNamespace via NAMESPACE_TO_VOCABULARY. */ + vocabularyId: string; + /** The xsd:schema element from the preserveOrder AST; later passes walk it. */ + schemaNode: PreserveOrderNode; +} + +/** xsd:import declared on a document. */ +export interface ImportEdge { + namespace: string; + schemaLocation: string | null; + /** + * Relative path of the resolved imported document (within schemaDir), + * or null when schemaLocation is absent (xml namespace, externally-supplied schemas). + */ + target: string | null; +} + +/** + * A top-level declaration discovered in a schema (xsd:element, complexType, + * simpleType, group, attributeGroup, or globally-declared attribute). + * + * Top-level declarations are always in the document's targetNamespace; the + * vocabularyId is therefore the document's vocabularyId. + */ +export type DeclarationKind = + | "element" + | "complexType" + | "simpleType" + | "group" + | "attributeGroup" + | "attribute"; + +export interface Declaration { + kind: DeclarationKind; + namespace: string; + vocabularyId: string; + localName: string; + documentPath: string; + node: PreserveOrderNode; +} + +/** + * Result of parsing a working set of XSDs. 
+ * + * - documents: every loaded file, keyed by path relative to schemaDir + * - namespaceByPrefix: per-document prefix → URI maps (each .xsd declares its own) + * - importGraph: per-document outgoing xsd:import edges, with resolved targets + * - declarationsByQName: canonical qname (Clark notation + kind) → declarations + */ +export interface ParsedSchemaSet { + documents: Map; + namespaceByPrefix: Map>; + importGraph: Map; + declarationsByQName: Map; +} diff --git a/scripts/ingest-xsd/vocabulary.ts b/scripts/ingest-xsd/vocabulary.ts new file mode 100644 index 0000000..3a9ae9a --- /dev/null +++ b/scripts/ingest-xsd/vocabulary.ts @@ -0,0 +1,67 @@ +/** + * Canonical vocabulary IDs for OOXML namespaces. + * + * vocabulary_id is the stable identity used in xsd_symbols. Namespace URIs + * are profile-scoped aliases (a future profile could rebind a vocabulary to + * a different URI), so we don't key symbols by URI directly. + * + * Add an entry here when a new namespace appears. Unknown namespaces in + * input XSDs are an error: bail loudly so we extend the map deliberately + * rather than letting symbols land under an inferred id. 
+ */ + +export const NAMESPACE_TO_VOCABULARY: Record = { + // WordprocessingML + "http://schemas.openxmlformats.org/wordprocessingml/2006/main": "wml-main", + + // SpreadsheetML + "http://schemas.openxmlformats.org/spreadsheetml/2006/main": "sml-main", + + // PresentationML + "http://schemas.openxmlformats.org/presentationml/2006/main": "pml-main", + + // DrawingML + "http://schemas.openxmlformats.org/drawingml/2006/main": "dml-main", + "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing": "dml-wp", + "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing": "dml-sp", + "http://schemas.openxmlformats.org/drawingml/2006/picture": "dml-pic", + "http://schemas.openxmlformats.org/drawingml/2006/chart": "dml-chart", + "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing": "dml-chartDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/diagram": "dml-diagram", + "http://schemas.openxmlformats.org/drawingml/2006/lockedCanvas": "dml-lockedCanvas", + + // VML (legacy) + "urn:schemas-microsoft-com:vml": "vml-main", + "urn:schemas-microsoft-com:office:office": "vml-office", + "urn:schemas-microsoft-com:office:word": "vml-word", + "urn:schemas-microsoft-com:office:excel": "vml-excel", + "urn:schemas-microsoft-com:office:powerpoint": "vml-powerpoint", + + // Shared / officeDocument family + "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes": "shared-types", + "http://schemas.openxmlformats.org/officeDocument/2006/math": "shared-math", + "http://schemas.openxmlformats.org/officeDocument/2006/relationships": "shared-relationships", + "http://schemas.openxmlformats.org/officeDocument/2006/customXml": "shared-customXml", + "http://schemas.openxmlformats.org/officeDocument/2006/bibliography": "shared-bibliography", + "http://schemas.openxmlformats.org/officeDocument/2006/characteristics": "shared-additionalCharacteristics", + "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties": 
"shared-extendedProperties", + "http://schemas.openxmlformats.org/officeDocument/2006/custom-properties": "shared-customProperties", + "http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes": "shared-docPropsVTypes", + + // Schema library (XML schema references) + "http://schemas.openxmlformats.org/schemaLibrary/2006/main": "schemaLibrary-main", + + // W3C built-ins + "http://www.w3.org/XML/1998/namespace": "xml", + "http://www.w3.org/2001/XMLSchema": "xsd-builtin", +}; + +export function vocabularyForNamespace(uri: string): string { + const v = NAMESPACE_TO_VOCABULARY[uri]; + if (!v) { + throw new Error( + `Unknown namespace URI: ${uri}. Add it to NAMESPACE_TO_VOCABULARY in scripts/ingest-xsd/vocabulary.ts.`, + ); + } + return v; +} diff --git a/tests/db/xsd-schema.test.ts b/tests/db/xsd-schema.test.ts index ecbd398..926d902 100644 --- a/tests/db/xsd-schema.test.ts +++ b/tests/db/xsd-schema.test.ts @@ -19,31 +19,35 @@ if (!databaseUrl) { let db: DbClient; +const TRUNCATE_SQL = ` + TRUNCATE + behavior_notes, + xsd_enums, + xsd_inheritance_edges, + xsd_group_edges, + xsd_attr_edges, + xsd_child_edges, + xsd_compositors, + xsd_symbol_profiles, + xsd_symbols, + xsd_namespaces, + xsd_profiles + RESTART IDENTITY CASCADE +`; + beforeAll(() => { db = createDbClient(databaseUrl); }); afterAll(async () => { + // Final cleanup so the dev DB doesn't carry the last test's rows. + await db.sql.unsafe(TRUNCATE_SQL); await db.close(); }); beforeEach(async () => { // Wipe phase-2 tables; spec_content / reference_sources untouched. 
- await db.sql` - TRUNCATE - behavior_notes, - xsd_enums, - xsd_inheritance_edges, - xsd_group_edges, - xsd_attr_edges, - xsd_child_edges, - xsd_compositors, - xsd_symbol_profiles, - xsd_symbols, - xsd_namespaces, - xsd_profiles - RESTART IDENTITY CASCADE - `; + await db.sql.unsafe(TRUNCATE_SQL); }); // expect(promise).rejects.toThrow() doesn't trigger the postgres library's lazy diff --git a/tests/ingest-xsd/fixtures/main.xsd b/tests/ingest-xsd/fixtures/main.xsd new file mode 100644 index 0000000..181209f --- /dev/null +++ b/tests/ingest-xsd/fixtures/main.xsd @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/ingest-xsd/fixtures/shared.xsd b/tests/ingest-xsd/fixtures/shared.xsd new file mode 100644 index 0000000..12f113b --- /dev/null +++ b/tests/ingest-xsd/fixtures/shared.xsd @@ -0,0 +1,11 @@ + + + + + + + + + diff --git a/tests/ingest-xsd/parse-schema.test.ts b/tests/ingest-xsd/parse-schema.test.ts new file mode 100644 index 0000000..92c13be --- /dev/null +++ b/tests/ingest-xsd/parse-schema.test.ts @@ -0,0 +1,156 @@ +/** + * Phase 3b: parser scaffolding tests. + * + * Primary tests use tiny fixture XSDs to keep the suite fast and independent + * of the local cache. One optional smoke test runs against the real + * data/xsd-cache/ecma-376-transitional/ if present. 
+ */ + +import { existsSync } from "node:fs"; +import { join } from "node:path"; +import { expect, test } from "bun:test"; +import { parseSchemaSet } from "../../scripts/ingest-xsd/parse-schema.ts"; +import { declarationQNameKey, resolveQNameAttr } from "../../scripts/ingest-xsd/qname.ts"; +import type { Declaration, DeclarationKind } from "../../scripts/ingest-xsd/types.ts"; + +const FIXTURES_DIR = join(import.meta.dir, "fixtures"); +const REAL_CACHE_DIR = "./data/xsd-cache/ecma-376-transitional"; +const realCacheReady = existsSync(join(REAL_CACHE_DIR, "wml.xsd")); + +const WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; +const SHARED_TYPES_NS = "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes"; +const XSD_NS = "http://www.w3.org/2001/XMLSchema"; + +function countByKind(decls: Map): Record { + const out: Record = { + element: 0, + complexType: 0, + simpleType: 0, + group: 0, + attributeGroup: 0, + attribute: 0, + }; + for (const arr of decls.values()) { + for (const d of arr) out[d.kind]++; + } + return out; +} + +test("parseSchemaSet loads fixtures and follows imports transitively", async () => { + const set = await parseSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + }); + + expect(set.documents.size).toBe(2); + expect(set.documents.has("main.xsd")).toBe(true); + expect(set.documents.has("shared.xsd")).toBe(true); + + const main = set.documents.get("main.xsd"); + expect(main?.targetNamespace).toBe(WML_NS); + expect(main?.vocabularyId).toBe("wml-main"); + + const shared = set.documents.get("shared.xsd"); + expect(shared?.targetNamespace).toBe(SHARED_TYPES_NS); + expect(shared?.vocabularyId).toBe("shared-types"); +}); + +test("namespaceByPrefix is per-document and captures default + named prefixes", async () => { + const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] }); + const mainPrefixes = set.namespaceByPrefix.get("main.xsd"); + 
expect(mainPrefixes?.get("")).toBe(WML_NS); + expect(mainPrefixes?.get("s")).toBe(SHARED_TYPES_NS); + expect(mainPrefixes?.get("xsd")).toBe(XSD_NS); + + // shared.xsd has its own prefix map. + const sharedPrefixes = set.namespaceByPrefix.get("shared.xsd"); + expect(sharedPrefixes?.get("")).toBe(SHARED_TYPES_NS); + expect(sharedPrefixes?.has("s")).toBe(false); +}); + +test("importGraph resolves schemaLocation to relative target paths", async () => { + const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] }); + const mainImports = set.importGraph.get("main.xsd"); + expect(mainImports).toHaveLength(1); + expect(mainImports?.[0]).toMatchObject({ + namespace: SHARED_TYPES_NS, + schemaLocation: "shared.xsd", + target: "shared.xsd", + }); + + expect(set.importGraph.get("shared.xsd")).toEqual([]); +}); + +test("declarationsByQName indexes all top-level declarations across documents", async () => { + const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] }); + + const counts = countByKind(set.declarationsByQName); + // main.xsd: 1 element, 2 complexType, 1 simpleType, 1 group, 1 attributeGroup + // shared.xsd: 2 simpleType + expect(counts.element).toBe(1); + expect(counts.complexType).toBe(2); + expect(counts.simpleType).toBe(3); + expect(counts.group).toBe(1); + expect(counts.attributeGroup).toBe(1); + expect(counts.attribute).toBe(0); + + // Specific decl lookup by canonical key. 
+ const ctPara = set.declarationsByQName.get(declarationQNameKey(WML_NS, "complexType", "CT_Para")); + expect(ctPara).toHaveLength(1); + expect(ctPara?.[0].vocabularyId).toBe("wml-main"); + + const stOnOff = set.declarationsByQName.get( + declarationQNameKey(SHARED_TYPES_NS, "simpleType", "ST_OnOff"), + ); + expect(stOnOff).toHaveLength(1); + expect(stOnOff?.[0].documentPath).toBe("shared.xsd"); +}); + +test("resolveQNameAttr: prefixed, unprefixed, and unresolved", async () => { + const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] }); + const prefixes = set.namespaceByPrefix.get("main.xsd"); + if (!prefixes) throw new Error("missing prefix map for fixture"); + + const r1 = resolveQNameAttr("s:ST_OnOff", prefixes, WML_NS); + expect(r1.resolved).toBe(true); + if (r1.resolved) { + expect(r1.qname.namespace).toBe(SHARED_TYPES_NS); + expect(r1.qname.localName).toBe("ST_OnOff"); + expect(r1.qname.vocabularyId).toBe("shared-types"); + } + + const r2 = resolveQNameAttr("CT_Para", prefixes, WML_NS); + expect(r2.resolved).toBe(true); + if (r2.resolved) expect(r2.qname.namespace).toBe(WML_NS); + + const r3 = resolveQNameAttr("zzz:Whatever", prefixes, WML_NS); + expect(r3.resolved).toBe(false); + if (!r3.resolved) expect(r3.qname.reason).toBe("unknown-prefix"); +}); + +test.skipIf(!realCacheReady)( + "smoke: parses real wml.xsd from cache, counts declarations", + async () => { + const set = await parseSchemaSet({ + schemaDir: REAL_CACHE_DIR, + entrypoints: ["wml.xsd"], + }); + + expect(set.documents.size).toBeGreaterThan(5); + const wml = set.documents.get("wml.xsd"); + expect(wml?.vocabularyId).toBe("wml-main"); + expect(wml?.targetNamespace).toBe(WML_NS); + + // wml.xsd imports 5 schemas with schemaLocation + 1 (xml) without. + const wmlImports = set.importGraph.get("wml.xsd"); + expect(wmlImports).toHaveLength(6); + + const counts = countByKind(set.declarationsByQName); + // Sanity floors against the WML+imports working set. 
Real counts (5th ed): + // complexType=820, simpleType=389, group=67, element=47, attribute=14, attributeGroup=8. + expect(counts.complexType).toBeGreaterThan(500); + expect(counts.simpleType).toBeGreaterThan(200); + expect(counts.group).toBeGreaterThan(40); + expect(counts.element).toBeGreaterThan(40); + }, +); From a1d7cba43282f79522c23ab8fad0a944b2cbce32 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 11:28:46 -0300 Subject: [PATCH 08/24] feat(xsd): symbol + inheritance ingest (Phase 3c) ingestSchemaSet wraps parseSchemaSet and writes: - xsd_profiles (bootstrap target profile) - xsd_namespaces (one per unique URI) - xsd_symbols (canonical (vocabulary_id, local_name, kind), upsert) - xsd_symbol_profiles (membership for the target profile, with source_id) - xsd_inheritance_edges (extension/restriction from complexContent/simpleContent and simpleType/restriction) The whole ingest runs in one transaction. Re-runs are no-ops via UNIQUE + ON CONFLICT DO NOTHING; stale-row cleanup is deferred per PLAN.md's edition-flip open item. QName base resolution uses the document's prefix map. Built-in xsd:* bases are auto-created on demand as kind=simpleType in vocabulary xsd-builtin so the FK on xsd_inheritance_edges.base_symbol_id holds. Phase 3c does not touch compositors, child edges, attributes, group refs, or enums (those are 3d/3e). Tests: fixture-driven happy path, idempotency check, plus an optional real-cache smoke test against the WML closure (12 docs, ~1359 symbols, ~389 inheritance edges, all bases resolved). Fixture main.xsd gains CT_Extended (extends CT_Empty) and CT_Restricted (restricts CT_Para) so the inheritance walker is exercised on both forms; existing parser test counts adjusted to match. 
--- package.json | 1 + scripts/ingest-xsd/ingest.ts | 376 ++++++++++++++++++++++++++ scripts/ingest-xsd/qname.ts | 10 +- scripts/ingest-xsd/smoke.ts | 4 +- scripts/ingest-xsd/vocabulary.ts | 9 +- tests/ingest-xsd/fixtures/main.xsd | 12 + tests/ingest-xsd/ingest.test.ts | 208 ++++++++++++++ tests/ingest-xsd/parse-schema.test.ts | 4 +- 8 files changed, 610 insertions(+), 14 deletions(-) create mode 100644 scripts/ingest-xsd/ingest.ts create mode 100644 tests/ingest-xsd/ingest.test.ts diff --git a/package.json b/package.json index a9633ae..2bf9e9a 100644 --- a/package.json +++ b/package.json @@ -23,6 +23,7 @@ "db:sync-sources": "bun scripts/sync-sources.ts", "xsd:fetch": "bun scripts/fetch-xsd.ts", "xsd:smoke": "bun scripts/ingest-xsd/smoke.ts", + "xsd:ingest": "bun scripts/ingest-xsd/ingest.ts", "test": "bun test tests/", "ingest": "bun scripts/ingest/pipeline.ts", "ingest:chunk": "bun scripts/ingest/chunk.ts", diff --git a/scripts/ingest-xsd/ingest.ts b/scripts/ingest-xsd/ingest.ts new file mode 100644 index 0000000..3ad3354 --- /dev/null +++ b/scripts/ingest-xsd/ingest.ts @@ -0,0 +1,376 @@ +/** + * Phase 3c: ingest top-level symbols and inheritance edges. + * + * Walks parseSchemaSet output and writes: + * - xsd_profiles (bootstrap target profile, idempotent) + * - xsd_namespaces (one row per unique URI seen across documents) + * - xsd_symbols (canonical (vocabulary_id, local_name, kind), upsert by natural key) + * - xsd_symbol_profiles (membership for the target profile, with source_id) + * - xsd_inheritance_edges (extension/restriction from complexContent/simpleContent + * and simpleType/restriction) + * + * NOT touched here (Phases 3d/3e): + * - xsd_compositors, xsd_child_edges (content models) + * - xsd_attr_edges (attributes) + * - xsd_group_edges (group/attributeGroup refs) + * - xsd_enums (simpleType enumerations) + * + * Idempotency: the entire ingest runs in a single transaction. 
Re-running + * against the same source produces no new rows (UNIQUE + ON CONFLICT DO NOTHING). + * Stale-row cleanup (when symbols vanish in a future edition) is deferred, + * see PLAN.md "Edition flip and behavior_notes" open item. + * + * Usage as a library: + * await ingestSchemaSet({ schemaDir, entrypoints, profileName, sourceName, sql }) + * + * Usage as a CLI: + * bun scripts/ingest-xsd/ingest.ts + * bun scripts/ingest-xsd/ingest.ts --schema-dir --entrypoint wml.xsd \ + * --profile transitional --source ecma-376-transitional + */ + +import { createDbClient, type DbClient } from "../../packages/shared/src/db/index.ts"; +import { nodeAttrs } from "./ast.ts"; +import { parseSchemaSet } from "./parse-schema.ts"; +import { resolveQNameAttr } from "./qname.ts"; +import type { + Declaration, + ParsedSchemaSet, + PreserveOrderNode, +} from "./types.ts"; +import { vocabularyForNamespace } from "./vocabulary.ts"; + +// biome-ignore lint/suspicious/noExplicitAny: postgres library typing is intricate; helpers stay generic. +type Sql = any; + +export interface IngestSchemaSetOptions { + schemaDir: string; + entrypoints: string[]; + /** Profile name to attach symbols to (e.g. "transitional"). Bootstrap if missing. */ + profileName: string; + /** Source name in reference_sources; used for source_id on xsd_symbol_profiles. */ + sourceName: string; + /** Existing DbClient. The ingest opens its own transaction inside. 
*/ + db: DbClient; +} + +export interface IngestStats { + documents: number; + symbolsInserted: number; + symbolsExisting: number; + namespacesEnsured: number; + profileMembershipsInserted: number; + inheritanceEdgesInserted: number; + inheritanceUnresolved: number; +} + +export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise { + const parseResult = await parseSchemaSet({ + schemaDir: opts.schemaDir, + entrypoints: opts.entrypoints, + }); + + const stats: IngestStats = { + documents: parseResult.documents.size, + symbolsInserted: 0, + symbolsExisting: 0, + namespacesEnsured: 0, + profileMembershipsInserted: 0, + inheritanceEdgesInserted: 0, + inheritanceUnresolved: 0, + }; + + await opts.db.sql.begin(async (sql: Sql) => { + const profileId = await ensureProfile(sql, opts.profileName); + const sourceId = await lookupSourceId(sql, opts.sourceName); + + // Pass 1: namespaces, symbols, profile memberships. + const namespaceIds = new Map(); + const symbolIds = new Map(); // canonical (vocab|local|kind) -> id + + for (const doc of parseResult.documents.values()) { + if (!namespaceIds.has(doc.targetNamespace)) { + const id = await ensureNamespace(sql, doc.targetNamespace); + namespaceIds.set(doc.targetNamespace, id); + stats.namespacesEnsured++; + } + } + + for (const decls of parseResult.declarationsByQName.values()) { + for (const decl of decls) { + const key = symbolKey(decl.vocabularyId, decl.localName, decl.kind); + if (symbolIds.has(key)) continue; + const { id, inserted } = await upsertSymbol( + sql, + decl.vocabularyId, + decl.localName, + decl.kind, + ); + symbolIds.set(key, id); + if (inserted) stats.symbolsInserted++; + else stats.symbolsExisting++; + + const nsId = namespaceIds.get(decl.namespace); + if (!nsId) { + throw new Error( + `Internal: missing namespace id for ${decl.namespace} (decl ${decl.localName})`, + ); + } + const linked = await linkSymbolToProfile(sql, id, profileId, nsId, sourceId); + if (linked) 
stats.profileMembershipsInserted++; + } + } + + // Pass 2: inheritance edges. Resolve base qname through the document's + // prefix map; ensure built-in xsd:* placeholders exist on demand. + for (const decls of parseResult.declarationsByQName.values()) { + for (const decl of decls) { + const inherit = findInheritance(decl); + if (!inherit) continue; + + const prefixMap = parseResult.namespaceByPrefix.get(decl.documentPath); + if (!prefixMap) continue; + const resolved = resolveQNameAttr(inherit.baseQName, prefixMap, decl.namespace); + if (!resolved.resolved) { + stats.inheritanceUnresolved++; + continue; + } + const baseQ = resolved.qname; + if (!baseQ.vocabularyId) { + stats.inheritanceUnresolved++; + continue; + } + + // Look up existing symbol; for xsd-builtin, ensure on demand. + let baseId: number | null = null; + const candidateKinds: Array = [ + "complexType", + "simpleType", + "element", + "group", + "attributeGroup", + "attribute", + ]; + for (const k of candidateKinds) { + const id = symbolIds.get(symbolKey(baseQ.vocabularyId, baseQ.localName, k)); + if (id != null) { + baseId = id; + break; + } + } + if (baseId == null && baseQ.vocabularyId === "xsd-builtin") { + const { id, inserted } = await upsertSymbol( + sql, + "xsd-builtin", + baseQ.localName, + "simpleType", + ); + symbolIds.set(symbolKey("xsd-builtin", baseQ.localName, "simpleType"), id); + baseId = id; + if (inserted) stats.symbolsInserted++; + else stats.symbolsExisting++; + } + if (baseId == null) { + stats.inheritanceUnresolved++; + continue; + } + + const childId = symbolIds.get(symbolKey(decl.vocabularyId, decl.localName, decl.kind)); + if (childId == null) continue; + + const inserted = await insertInheritance(sql, childId, baseId, profileId, inherit.relation); + if (inserted) stats.inheritanceEdgesInserted++; + } + } + }); + + return stats; +} + +// --- DB helpers ---------------------------------------------------------- + +async function ensureProfile(sql: Sql, name: string): Promise { 
+ const [row] = await sql` + INSERT INTO xsd_profiles (name) VALUES (${name}) + ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name + RETURNING id + `; + return row.id; +} + +async function lookupSourceId(sql: Sql, name: string): Promise { + const [row] = await sql`SELECT id FROM reference_sources WHERE name = ${name} LIMIT 1`; + if (!row) throw new Error(`reference_sources row not found for name='${name}'. Run db:sync-sources first.`); + return row.id; +} + +async function ensureNamespace(sql: Sql, uri: string): Promise { + const [row] = await sql` + INSERT INTO xsd_namespaces (uri) VALUES (${uri}) + ON CONFLICT (uri) DO UPDATE SET uri = EXCLUDED.uri + RETURNING id + `; + return row.id; +} + +async function upsertSymbol( + sql: Sql, + vocabularyId: string, + localName: string, + kind: string, +): Promise<{ id: number; inserted: boolean }> { + const [row] = await sql` + INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) + VALUES (${vocabularyId}, ${localName}, ${kind}) + ON CONFLICT (vocabulary_id, local_name, kind) DO UPDATE SET kind = EXCLUDED.kind + RETURNING id, (xmax = 0) AS inserted + `; + return { id: row.id, inserted: row.inserted }; +} + +async function linkSymbolToProfile( + sql: Sql, + symbolId: number, + profileId: number, + namespaceId: number, + sourceId: number, +): Promise { + const rows = await sql` + INSERT INTO xsd_symbol_profiles (symbol_id, profile_id, namespace_id, source_id) + VALUES (${symbolId}, ${profileId}, ${namespaceId}, ${sourceId}) + ON CONFLICT (symbol_id, profile_id) DO NOTHING + RETURNING id + `; + return rows.length > 0; +} + +async function insertInheritance( + sql: Sql, + symbolId: number, + baseSymbolId: number, + profileId: number, + relation: "extension" | "restriction", +): Promise { + const rows = await sql` + INSERT INTO xsd_inheritance_edges (symbol_id, base_symbol_id, profile_id, relation) + VALUES (${symbolId}, ${baseSymbolId}, ${profileId}, ${relation}) + ON CONFLICT (symbol_id, profile_id) DO NOTHING + 
RETURNING id + `; + return rows.length > 0; +} + +// --- Inheritance discovery from AST ------------------------------------- + +interface InheritanceFinding { + baseQName: string; + relation: "extension" | "restriction"; +} + +function findInheritance(decl: Declaration): InheritanceFinding | null { + if (decl.kind === "complexType") { + for (const child of nodeChildrenLocal(decl.node)) { + const tag = stripPrefixLocal(nodeTagLocal(child)); + if (tag !== "complexContent" && tag !== "simpleContent") continue; + for (const inner of nodeChildrenLocal(child)) { + const innerTag = stripPrefixLocal(nodeTagLocal(inner)); + if (innerTag !== "extension" && innerTag !== "restriction") continue; + const base = nodeAttrs(inner).base; + if (base) return { baseQName: base, relation: innerTag }; + } + } + return null; + } + if (decl.kind === "simpleType") { + for (const child of nodeChildrenLocal(decl.node)) { + const tag = stripPrefixLocal(nodeTagLocal(child)); + if (tag !== "restriction") continue; + const base = nodeAttrs(child).base; + if (base) return { baseQName: base, relation: "restriction" }; + } + } + return null; +} + +function nodeTagLocal(node: PreserveOrderNode): string | null { + for (const k of Object.keys(node)) if (k !== ":@") return k; + return null; +} +function nodeChildrenLocal(node: PreserveOrderNode): PreserveOrderNode[] { + const tag = nodeTagLocal(node); + if (!tag) return []; + const v = node[tag]; + return Array.isArray(v) ? (v as PreserveOrderNode[]) : []; +} +function stripPrefixLocal(tag: string | null): string | null { + if (!tag) return null; + const colon = tag.indexOf(":"); + return colon < 0 ? 
tag : tag.slice(colon + 1); +} + +function symbolKey(vocab: string, local: string, kind: string): string { + return `${vocab}|${local}|${kind}`; +} + +// --- CLI ----------------------------------------------------------------- + +interface CliArgs { + schemaDir: string; + entrypoints: string[]; + profileName: string; + sourceName: string; +} + +function parseCliArgs(): CliArgs { + const argv = process.argv.slice(2); + let schemaDir = "./data/xsd-cache/ecma-376-transitional"; + const entrypoints: string[] = []; + let profileName = "transitional"; + let sourceName = "ecma-376-transitional"; + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + if (a === "--schema-dir") schemaDir = argv[++i] ?? schemaDir; + else if (a === "--entrypoint") entrypoints.push(argv[++i] ?? ""); + else if (a === "--profile") profileName = argv[++i] ?? profileName; + else if (a === "--source") sourceName = argv[++i] ?? sourceName; + } + if (entrypoints.length === 0) entrypoints.push("wml.xsd"); + return { schemaDir, entrypoints, profileName, sourceName }; +} + +async function main() { + const args = parseCliArgs(); + const databaseUrl = process.env.DATABASE_URL; + if (!databaseUrl) { + console.error("Missing DATABASE_URL"); + process.exit(1); + } + const db = createDbClient(databaseUrl); + + const t0 = Date.now(); + try { + const stats = await ingestSchemaSet({ ...args, db }); + const ms = Date.now() - t0; + console.log(`schemaDir: ${args.schemaDir}`); + console.log(`entrypoints: ${args.entrypoints.join(", ")}`); + console.log(`profile: ${args.profileName}`); + console.log(`source: ${args.sourceName}`); + console.log(`documents: ${stats.documents}`); + console.log(`symbols inserted: ${stats.symbolsInserted}`); + console.log(`symbols existing: ${stats.symbolsExisting}`); + console.log(`namespaces ensured: ${stats.namespacesEnsured}`); + console.log(`profile memberships: ${stats.profileMembershipsInserted}`); + console.log(`inheritance edges: ${stats.inheritanceEdgesInserted}`); + 
console.log(`inheritance unres.: ${stats.inheritanceUnresolved}`); + console.log(`elapsed: ${ms}ms`); + } finally { + await db.close(); + } +} + +if (import.meta.path === Bun.main) { + main().catch((err) => { + console.error("ingest failed:", err); + process.exit(1); + }); +} diff --git a/scripts/ingest-xsd/qname.ts b/scripts/ingest-xsd/qname.ts index fa884d3..008e2ca 100644 --- a/scripts/ingest-xsd/qname.ts +++ b/scripts/ingest-xsd/qname.ts @@ -29,18 +29,16 @@ export interface UnresolvedQName { reason: "unknown-prefix" | "unknown-namespace"; } -export type QNameResult = { resolved: true; qname: ResolvedQName } | { resolved: false; qname: UnresolvedQName }; +export type QNameResult = + | { resolved: true; qname: ResolvedQName } + | { resolved: false; qname: UnresolvedQName }; /** * Canonical key for the declarationsByQName map. * Clark-style namespace prefix plus the kind, e.g.: * {http://schemas.openxmlformats.org/wordprocessingml/2006/main}complexType:CT_Tbl */ -export function declarationQNameKey( - namespace: string, - kind: string, - localName: string, -): string { +export function declarationQNameKey(namespace: string, kind: string, localName: string): string { return `{${namespace}}${kind}:${localName}`; } diff --git a/scripts/ingest-xsd/smoke.ts b/scripts/ingest-xsd/smoke.ts index 210cab4..316fb10 100644 --- a/scripts/ingest-xsd/smoke.ts +++ b/scripts/ingest-xsd/smoke.ts @@ -50,9 +50,7 @@ async function main() { const imports = set.importGraph.get(ep) ?? []; console.log(` imports (${imports.length}):`); for (const imp of imports) { - console.log( - ` ${imp.namespace} → ${imp.target ?? "(no schemaLocation)"}`, - ); + console.log(` ${imp.namespace} → ${imp.target ?? 
"(no schemaLocation)"}`); } console.log(); } diff --git a/scripts/ingest-xsd/vocabulary.ts b/scripts/ingest-xsd/vocabulary.ts index 3a9ae9a..c3846a2 100644 --- a/scripts/ingest-xsd/vocabulary.ts +++ b/scripts/ingest-xsd/vocabulary.ts @@ -43,9 +43,12 @@ export const NAMESPACE_TO_VOCABULARY: Record = { "http://schemas.openxmlformats.org/officeDocument/2006/relationships": "shared-relationships", "http://schemas.openxmlformats.org/officeDocument/2006/customXml": "shared-customXml", "http://schemas.openxmlformats.org/officeDocument/2006/bibliography": "shared-bibliography", - "http://schemas.openxmlformats.org/officeDocument/2006/characteristics": "shared-additionalCharacteristics", - "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties": "shared-extendedProperties", - "http://schemas.openxmlformats.org/officeDocument/2006/custom-properties": "shared-customProperties", + "http://schemas.openxmlformats.org/officeDocument/2006/characteristics": + "shared-additionalCharacteristics", + "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties": + "shared-extendedProperties", + "http://schemas.openxmlformats.org/officeDocument/2006/custom-properties": + "shared-customProperties", "http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes": "shared-docPropsVTypes", // Schema library (XML schema references) diff --git a/tests/ingest-xsd/fixtures/main.xsd b/tests/ingest-xsd/fixtures/main.xsd index 181209f..d7fffb1 100644 --- a/tests/ingest-xsd/fixtures/main.xsd +++ b/tests/ingest-xsd/fixtures/main.xsd @@ -12,6 +12,18 @@ + + + + + + + + + + + + diff --git a/tests/ingest-xsd/ingest.test.ts b/tests/ingest-xsd/ingest.test.ts new file mode 100644 index 0000000..ffce574 --- /dev/null +++ b/tests/ingest-xsd/ingest.test.ts @@ -0,0 +1,208 @@ +/** + * Phase 3c: ingest pass tests. + * + * Each test starts with empty xsd_* / behavior_notes tables (afterEach TRUNCATE) + * and a known reference_sources row. Uses fixture XSDs. 
+ */ + +import { existsSync } from "node:fs"; +import { join } from "node:path"; +import { afterAll, afterEach, beforeAll, beforeEach, expect, test } from "bun:test"; +import { ingestSchemaSet } from "../../scripts/ingest-xsd/ingest.ts"; +import { createDbClient, type DbClient } from "../../packages/shared/src/db/index.ts"; + +const FIXTURES_DIR = join(import.meta.dir, "fixtures"); +const REAL_CACHE_DIR = "./data/xsd-cache/ecma-376-transitional"; +const realCacheReady = existsSync(join(REAL_CACHE_DIR, "wml.xsd")); + +const databaseUrl = process.env.DATABASE_URL; +if (!databaseUrl) { + throw new Error("Missing DATABASE_URL for integration tests"); +} + +let db: DbClient; + +const TRUNCATE_SQL = ` + TRUNCATE + behavior_notes, + xsd_enums, + xsd_inheritance_edges, + xsd_group_edges, + xsd_attr_edges, + xsd_child_edges, + xsd_compositors, + xsd_symbol_profiles, + xsd_symbols, + xsd_namespaces, + xsd_profiles + RESTART IDENTITY CASCADE +`; + +beforeAll(async () => { + db = createDbClient(databaseUrl); + // Make sure ecma-376-transitional row exists; the ingest looks it up by name. + await db.sql` + INSERT INTO reference_sources (name, kind) + VALUES ('ecma-376-transitional', 'xsd') + ON CONFLICT (name) DO NOTHING + `; +}); + +afterAll(async () => { + await db.sql.unsafe(TRUNCATE_SQL); + await db.close(); +}); + +beforeEach(async () => { + await db.sql.unsafe(TRUNCATE_SQL); +}); + +afterEach(async () => { + await db.sql.unsafe(TRUNCATE_SQL); +}); + +const WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; +const SHARED_TYPES_NS = "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes"; + +test("ingest writes symbols, namespaces, memberships, and the transitional profile", async () => { + const stats = await ingestSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + + expect(stats.documents).toBe(2); + + // Profile bootstrapped. 
+ const [profile] = await db.sql`SELECT id, name FROM xsd_profiles WHERE name = 'transitional'`; + expect(profile?.name).toBe("transitional"); + + // Both fixture target namespaces present. + const namespaces = await db.sql`SELECT uri FROM xsd_namespaces ORDER BY uri`; + const uris = namespaces.map((r: { uri: string }) => r.uri); + expect(uris).toContain(WML_NS); + expect(uris).toContain(SHARED_TYPES_NS); + + // Symbol count matches fixture: 1 element + 4 complexType + 3 simpleType + + // 1 group + 1 attributeGroup = 10 (plus 1 xsd-builtin auto-created for restrictions). + const [symbolCount] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_symbols`; + expect(symbolCount.c).toBeGreaterThanOrEqual(10); + + // CT_Para is in wml-main / transitional. + const [ctPara] = await db.sql` + SELECT s.id, s.vocabulary_id, s.kind, sp.profile_id, sp.namespace_id + FROM xsd_symbols s + JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id + WHERE s.local_name = 'CT_Para' AND s.kind = 'complexType' + `; + expect(ctPara?.vocabulary_id).toBe("wml-main"); + + // ST_OnOff is in shared-types via the imported schema. + const [stOnOff] = await db.sql` + SELECT s.vocabulary_id FROM xsd_symbols s + WHERE s.local_name = 'ST_OnOff' AND s.kind = 'simpleType' + `; + expect(stOnOff?.vocabulary_id).toBe("shared-types"); +}); + +test("ingest writes inheritance edges for extension and restriction", async () => { + const stats = await ingestSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + + // Fixture inheritance: + // CT_Extended extends CT_Empty (complexContent) + // CT_Restricted restricts CT_Para (complexContent) + // ST_Jc restricts xsd:string (simpleType) + // ST_OnOff restricts xsd:boolean + // ST_String restricts xsd:string + expect(stats.inheritanceEdgesInserted).toBe(5); + expect(stats.inheritanceUnresolved).toBe(0); + + // Verify the CT_Extended → CT_Empty extension edge. 
+ const [ext] = await db.sql` + SELECT base.local_name AS base_name, e.relation + FROM xsd_inheritance_edges e + JOIN xsd_symbols child ON child.id = e.symbol_id + JOIN xsd_symbols base ON base.id = e.base_symbol_id + WHERE child.local_name = 'CT_Extended' + `; + expect(ext?.base_name).toBe("CT_Empty"); + expect(ext?.relation).toBe("extension"); + + // Verify CT_Restricted → CT_Para restriction. + const [restr] = await db.sql` + SELECT base.local_name AS base_name, e.relation + FROM xsd_inheritance_edges e + JOIN xsd_symbols child ON child.id = e.symbol_id + JOIN xsd_symbols base ON base.id = e.base_symbol_id + WHERE child.local_name = 'CT_Restricted' + `; + expect(restr?.base_name).toBe("CT_Para"); + expect(restr?.relation).toBe("restriction"); + + // xsd-builtin placeholder symbol auto-created for the simpleType restrictions. + const [builtin] = await db.sql` + SELECT COUNT(*)::int AS c FROM xsd_symbols WHERE vocabulary_id = 'xsd-builtin' + `; + expect(builtin.c).toBeGreaterThan(0); +}); + +test("ingest is idempotent: re-running adds no new symbols/edges", async () => { + const first = await ingestSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + + const second = await ingestSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + + expect(second.symbolsInserted).toBe(0); + expect(second.symbolsExisting).toBeGreaterThan(0); + expect(second.profileMembershipsInserted).toBe(0); + expect(second.inheritanceEdgesInserted).toBe(0); + + // Row counts unchanged between first and second runs. 
+ const [c1] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_symbols`; + const [c2] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_symbol_profiles`; + const [c3] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_inheritance_edges`; + expect(c1.c).toBe(first.symbolsInserted); + // One membership per symbol per profile. + expect(c2.c).toBe(first.profileMembershipsInserted); + expect(c3.c).toBe(first.inheritanceEdgesInserted); +}); + +test.skipIf(!realCacheReady)( + "smoke: ingest WML closure into the dev DB and verify counts", + async () => { + const stats = await ingestSchemaSet({ + schemaDir: REAL_CACHE_DIR, + entrypoints: ["wml.xsd"], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + + // Real WML closure has 12 documents (wml + closure). + expect(stats.documents).toBe(12); + // At least the parser-level totals should land as symbols. + // Real counts: 820 CT, 47 elem, 389 ST, 67 grp, 8 attrGrp, 14 attr = 1345 (+ a few xsd-builtins). + expect(stats.symbolsInserted).toBeGreaterThan(1300); + // Most types have an explicit base (extension or restriction); expect many edges. 
+ expect(stats.inheritanceEdgesInserted).toBeGreaterThan(300); + }, +); diff --git a/tests/ingest-xsd/parse-schema.test.ts b/tests/ingest-xsd/parse-schema.test.ts index 92c13be..b3fba01 100644 --- a/tests/ingest-xsd/parse-schema.test.ts +++ b/tests/ingest-xsd/parse-schema.test.ts @@ -85,10 +85,10 @@ test("declarationsByQName indexes all top-level declarations across documents", const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] }); const counts = countByKind(set.declarationsByQName); - // main.xsd: 1 element, 2 complexType, 1 simpleType, 1 group, 1 attributeGroup + // main.xsd: 1 element, 4 complexType, 1 simpleType, 1 group, 1 attributeGroup // shared.xsd: 2 simpleType expect(counts.element).toBe(1); - expect(counts.complexType).toBe(2); + expect(counts.complexType).toBe(4); expect(counts.simpleType).toBe(3); expect(counts.group).toBe(1); expect(counts.attributeGroup).toBe(1); From 6cb04acf5a80445cd1038358805fc82abd7e5066 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 11:35:06 -0300 Subject: [PATCH 09/24] feat(xsd): content model ingest (Phase 3d) Pass 3 of ingestSchemaSet walks every complexType and group declaration and writes xsd_compositors, xsd_child_edges, and xsd_group_edges. Compositor handling: - sequence/choice/all under a complexType (or under complexContent/extension|restriction) become top-level compositors with parent_symbol_id set. - Nested compositors (sequence inside choice etc.) recurse with parent_compositor_id set; the XOR check guarantees exactly one parent dimension is populated. - simpleContent contributes attributes only and is skipped here. Element handling inside compositors: - ref="..." resolves through the document's prefix map to a top-level symbol; child_edge points at it. - name="..." (local) creates / reuses a symbol under the owner vocabulary (vocab, name, kind=element). Cross-CT name reuse collapses; that is a known imprecision until we need to disambiguate. 
Group refs become xsd_group_edges with resolved=false; future passes can expand them. attributeGroup refs are still Phase 3e (attributes). WML closure ingest stats: - 2737 symbols (1345 declarations + 14 builtins + 1378 local elements) - 585 compositors - 2098 child edges (0 unresolved) - 161 group refs (0 unresolved) - 389 inheritance edges (0 unresolved) - elapsed ~2s Fixture main.xsd gains CT_Body to exercise nested compositors, ref-vs-name elements, and group refs in one test path. --- scripts/ingest-xsd/ingest.ts | 303 +++++++++++++++++++++++++- tests/ingest-xsd/fixtures/main.xsd | 9 + tests/ingest-xsd/ingest.test.ts | 129 ++++++++++- tests/ingest-xsd/parse-schema.test.ts | 4 +- 4 files changed, 433 insertions(+), 12 deletions(-) diff --git a/scripts/ingest-xsd/ingest.ts b/scripts/ingest-xsd/ingest.ts index 3ad3354..5abcef5 100644 --- a/scripts/ingest-xsd/ingest.ts +++ b/scripts/ingest-xsd/ingest.ts @@ -33,11 +33,7 @@ import { createDbClient, type DbClient } from "../../packages/shared/src/db/inde import { nodeAttrs } from "./ast.ts"; import { parseSchemaSet } from "./parse-schema.ts"; import { resolveQNameAttr } from "./qname.ts"; -import type { - Declaration, - ParsedSchemaSet, - PreserveOrderNode, -} from "./types.ts"; +import type { Declaration, ParsedSchemaSet, PreserveOrderNode } from "./types.ts"; import { vocabularyForNamespace } from "./vocabulary.ts"; // biome-ignore lint/suspicious/noExplicitAny: postgres library typing is intricate; helpers stay generic. 
@@ -62,6 +58,12 @@ export interface IngestStats { profileMembershipsInserted: number; inheritanceEdgesInserted: number; inheritanceUnresolved: number; + compositorsInserted: number; + childEdgesInserted: number; + childEdgesUnresolved: number; + groupRefsInserted: number; + groupRefsUnresolved: number; + localElementsCreated: number; } export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise { @@ -78,6 +80,12 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise { @@ -182,11 +190,231 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise; + symbolIds: Map; + stats: IngestStats; +} + +/** + * For a complexType: yield the node(s) whose direct children are particles + * (sequence/choice/all/group). That's the complexType itself, OR (for derived + * types) the inner xsd:extension or xsd:restriction beneath complexContent. + * + * For a group definition: yield the group node itself. + * + * simpleContent has no element particles; not yielded. + */ +function findContentModelParents(decl: Declaration): PreserveOrderNode[] { + if (decl.kind === "group") return [decl.node]; + + if (decl.kind !== "complexType") return []; + + const out: PreserveOrderNode[] = []; + let sawComplexContent = false; + for (const child of nodeChildrenLocal(decl.node)) { + const tag = stripPrefixLocal(nodeTagLocal(child)); + if (tag === "complexContent") { + sawComplexContent = true; + for (const inner of nodeChildrenLocal(child)) { + const innerTag = stripPrefixLocal(nodeTagLocal(inner)); + if (innerTag === "extension" || innerTag === "restriction") out.push(inner); + } + } + } + if (sawComplexContent) return out; + // No complexContent wrapper: particles live directly under complexType. 
+ return [decl.node]; +} + +async function walkCompositor( + node: PreserveOrderNode, + kind: "sequence" | "choice" | "all", + parentCompositorId: number | null, + orderIndex: number, + ctx: WalkCtx, +): Promise { + const a = nodeAttrs(node); + const compositorId = await insertCompositor( + ctx.sql, + parentCompositorId === null ? ctx.ownerSymbolId : null, + parentCompositorId, + ctx.profileId, + kind, + parseMinOccurs(a.minOccurs), + parseMaxOccurs(a.maxOccurs), + orderIndex, + ); + ctx.stats.compositorsInserted++; + + let childOrder = 0; + for (const child of nodeChildrenLocal(node)) { + const tag = stripPrefixLocal(nodeTagLocal(child)); + if (tag === "element") { + await handleElement(child, compositorId, childOrder, ctx); + childOrder++; + } else if (tag === "sequence" || tag === "choice" || tag === "all") { + await walkCompositor(child, tag, compositorId, childOrder, ctx); + childOrder++; + } else if (tag === "group") { + await handleGroupRef(child, childOrder, ctx, compositorId); + childOrder++; + } + // xsd:any: skipped for now. 
+ } +} + +async function handleElement( + node: PreserveOrderNode, + compositorId: number, + orderIndex: number, + ctx: WalkCtx, +): Promise { + const a = nodeAttrs(node); + let childSymbolId: number | null = null; + + if (a.ref) { + const resolved = resolveQNameAttr(a.ref, ctx.prefixMap, ctx.ownerDecl.namespace); + if (!resolved.resolved || !resolved.qname.vocabularyId) { + ctx.stats.childEdgesUnresolved++; + return; + } + const id = ctx.symbolIds.get( + symbolKey(resolved.qname.vocabularyId, resolved.qname.localName, "element"), + ); + if (id == null) { + ctx.stats.childEdgesUnresolved++; + return; + } + childSymbolId = id; + } else if (a.name) { + const key = symbolKey(ctx.ownerDecl.vocabularyId, a.name, "element"); + let id = ctx.symbolIds.get(key); + if (id == null) { + const res = await upsertSymbol(ctx.sql, ctx.ownerDecl.vocabularyId, a.name, "element"); + ctx.symbolIds.set(key, res.id); + if (res.inserted) { + ctx.stats.symbolsInserted++; + ctx.stats.localElementsCreated++; + } else { + ctx.stats.symbolsExisting++; + } + id = res.id; + } + childSymbolId = id; + } + + if (childSymbolId == null) return; + + await insertChildEdge( + ctx.sql, + ctx.ownerSymbolId, + compositorId, + childSymbolId, + ctx.profileId, + parseMinOccurs(a.minOccurs), + parseMaxOccurs(a.maxOccurs), + orderIndex, + ); + ctx.stats.childEdgesInserted++; +} + +async function handleGroupRef( + node: PreserveOrderNode, + orderIndex: number, + ctx: WalkCtx, + _compositorId: number | null = null, +): Promise { + void _compositorId; // group_edges aren't compositor-scoped in our schema; refs hang off the parent symbol. 
+ const a = nodeAttrs(node); + if (!a.ref) return; + const resolved = resolveQNameAttr(a.ref, ctx.prefixMap, ctx.ownerDecl.namespace); + if (!resolved.resolved || !resolved.qname.vocabularyId) { + ctx.stats.groupRefsUnresolved++; + return; + } + const groupSymbolId = ctx.symbolIds.get( + symbolKey(resolved.qname.vocabularyId, resolved.qname.localName, "group"), + ); + if (groupSymbolId == null) { + ctx.stats.groupRefsUnresolved++; + return; + } + await insertGroupEdge( + ctx.sql, + ctx.ownerSymbolId, + groupSymbolId, + ctx.profileId, + "group", + orderIndex, + ); + ctx.stats.groupRefsInserted++; +} + +function parseMinOccurs(raw: string | undefined): number { + if (raw === undefined) return 1; + const n = parseInt(raw, 10); + return Number.isFinite(n) ? n : 1; +} + +function parseMaxOccurs(raw: string | undefined): number | null { + if (raw === undefined) return 1; + if (raw === "unbounded") return null; + const n = parseInt(raw, 10); + return Number.isFinite(n) ? n : 1; +} + // --- DB helpers ---------------------------------------------------------- async function ensureProfile(sql: Sql, name: string): Promise { @@ -200,7 +428,10 @@ async function ensureProfile(sql: Sql, name: string): Promise { async function lookupSourceId(sql: Sql, name: string): Promise { const [row] = await sql`SELECT id FROM reference_sources WHERE name = ${name} LIMIT 1`; - if (!row) throw new Error(`reference_sources row not found for name='${name}'. Run db:sync-sources first.`); + if (!row) + throw new Error( + `reference_sources row not found for name='${name}'. 
Run db:sync-sources first.`, + ); return row.id; } @@ -260,6 +491,60 @@ async function insertInheritance( return rows.length > 0; } +async function insertCompositor( + sql: Sql, + parentSymbolId: number | null, + parentCompositorId: number | null, + profileId: number, + kind: "sequence" | "choice" | "all", + minOccurs: number, + maxOccurs: number | null, + orderIndex: number, +): Promise { + const [row] = await sql` + INSERT INTO xsd_compositors + (parent_symbol_id, parent_compositor_id, profile_id, kind, min_occurs, max_occurs, order_index) + VALUES + (${parentSymbolId}, ${parentCompositorId}, ${profileId}, ${kind}, ${minOccurs}, ${maxOccurs}, ${orderIndex}) + RETURNING id + `; + return row.id; +} + +async function insertChildEdge( + sql: Sql, + parentSymbolId: number, + compositorId: number, + childSymbolId: number, + profileId: number, + minOccurs: number, + maxOccurs: number | null, + orderIndex: number, +): Promise { + await sql` + INSERT INTO xsd_child_edges + (parent_symbol_id, compositor_id, child_symbol_id, profile_id, min_occurs, max_occurs, order_index) + VALUES + (${parentSymbolId}, ${compositorId}, ${childSymbolId}, ${profileId}, ${minOccurs}, ${maxOccurs}, ${orderIndex}) + `; +} + +async function insertGroupEdge( + sql: Sql, + parentSymbolId: number, + groupSymbolId: number, + profileId: number, + refKind: "group" | "attributeGroup", + orderIndex: number, +): Promise { + await sql` + INSERT INTO xsd_group_edges + (parent_symbol_id, group_symbol_id, profile_id, ref_kind, order_index) + VALUES + (${parentSymbolId}, ${groupSymbolId}, ${profileId}, ${refKind}, ${orderIndex}) + `; +} + // --- Inheritance discovery from AST ------------------------------------- interface InheritanceFinding { @@ -362,6 +647,12 @@ async function main() { console.log(`profile memberships: ${stats.profileMembershipsInserted}`); console.log(`inheritance edges: ${stats.inheritanceEdgesInserted}`); console.log(`inheritance unres.: ${stats.inheritanceUnresolved}`); + 
console.log(`compositors: ${stats.compositorsInserted}`); + console.log(`child edges: ${stats.childEdgesInserted}`); + console.log(`child edges unres.: ${stats.childEdgesUnresolved}`); + console.log(`group refs: ${stats.groupRefsInserted}`); + console.log(`group refs unres.: ${stats.groupRefsUnresolved}`); + console.log(`local elements: ${stats.localElementsCreated}`); console.log(`elapsed: ${ms}ms`); } finally { await db.close(); diff --git a/tests/ingest-xsd/fixtures/main.xsd b/tests/ingest-xsd/fixtures/main.xsd index d7fffb1..b8122e2 100644 --- a/tests/ingest-xsd/fixtures/main.xsd +++ b/tests/ingest-xsd/fixtures/main.xsd @@ -24,6 +24,15 @@ + + + + + + + + + diff --git a/tests/ingest-xsd/ingest.test.ts b/tests/ingest-xsd/ingest.test.ts index ffce574..5b3152b 100644 --- a/tests/ingest-xsd/ingest.test.ts +++ b/tests/ingest-xsd/ingest.test.ts @@ -186,6 +186,115 @@ test("ingest is idempotent: re-running adds no new symbols/edges", async () => { expect(c3.c).toBe(first.inheritanceEdgesInserted); }); +test("ingest writes compositors and child edges for nested content models", async () => { + const stats = await ingestSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + + // Fixture content models: + // CT_Para: sequence -> element name="text" + // CT_Body: sequence -> [ element ref="document", + // choice (minOccurs=0, maxOccurs=unbounded) -> [ + // group ref="EG_PContent", + // element name="break" + // ]] + // EG_PContent: choice -> element name="r" + // Compositors total: CT_Para(1) + CT_Body(2) + EG_PContent(1) = 4 + expect(stats.compositorsInserted).toBe(4); + expect(stats.groupRefsInserted).toBe(1); + expect(stats.localElementsCreated).toBe(3); // text, break, r + expect(stats.childEdgesUnresolved).toBe(0); + expect(stats.groupRefsUnresolved).toBe(0); + + // CT_Para: one sequence with one child edge to local element "text". 
+ const ctParaChildren = await db.sql` + SELECT s.local_name, e.min_occurs, e.max_occurs, e.order_index, c.kind AS compositor_kind + FROM xsd_child_edges e + JOIN xsd_symbols s ON s.id = e.child_symbol_id + JOIN xsd_compositors c ON c.id = e.compositor_id + JOIN xsd_symbols parent ON parent.id = e.parent_symbol_id + WHERE parent.local_name = 'CT_Para' AND parent.kind = 'complexType' + ORDER BY e.order_index + `; + expect(ctParaChildren).toHaveLength(1); + expect(ctParaChildren[0]).toMatchObject({ + local_name: "text", + min_occurs: 1, + max_occurs: 1, + order_index: 0, + compositor_kind: "sequence", + }); + + // CT_Body: top sequence + nested choice. Two compositors for CT_Body. + const ctBodyCompositors = await db.sql` + SELECT c.kind, c.parent_symbol_id, c.parent_compositor_id, c.min_occurs, c.max_occurs, c.order_index + FROM xsd_compositors c + JOIN xsd_symbols s ON s.id = c.parent_symbol_id + WHERE s.local_name = 'CT_Body' AND s.kind = 'complexType' + ORDER BY c.order_index + `; + // Only the TOP-level compositor has parent_symbol_id set; nested has parent_compositor_id. + expect(ctBodyCompositors).toHaveLength(1); + expect(ctBodyCompositors[0]).toMatchObject({ kind: "sequence", min_occurs: 1, max_occurs: 1 }); + const topId: number = ctBodyCompositors[0].id ?? null; + void topId; + + const nestedCompositors = await db.sql` + SELECT c.kind, c.min_occurs, c.max_occurs, c.parent_compositor_id + FROM xsd_compositors c + JOIN xsd_compositors parent ON parent.id = c.parent_compositor_id + JOIN xsd_symbols owner ON owner.id = parent.parent_symbol_id + WHERE owner.local_name = 'CT_Body' + `; + expect(nestedCompositors).toHaveLength(1); + expect(nestedCompositors[0]).toMatchObject({ + kind: "choice", + min_occurs: 0, + max_occurs: null, // unbounded + }); + + // CT_Body's top sequence has 1 child edge (ref="document"). The break element is + // inside the nested choice, not the top sequence. 
+ const ctBodyTopChildren = await db.sql` + SELECT s.local_name, e.order_index + FROM xsd_child_edges e + JOIN xsd_symbols s ON s.id = e.child_symbol_id + JOIN xsd_compositors c ON c.id = e.compositor_id + JOIN xsd_symbols parent ON parent.id = c.parent_symbol_id + WHERE parent.local_name = 'CT_Body' AND c.kind = 'sequence' + ORDER BY e.order_index + `; + expect(ctBodyTopChildren).toHaveLength(1); + expect(ctBodyTopChildren[0].local_name).toBe("document"); + + // CT_Body's nested choice has 1 child edge (local element "break"); the group ref + // goes to xsd_group_edges, not child_edges. + const ctBodyNestedChildren = await db.sql` + SELECT s.local_name + FROM xsd_child_edges e + JOIN xsd_symbols s ON s.id = e.child_symbol_id + JOIN xsd_compositors c ON c.id = e.compositor_id + WHERE c.kind = 'choice' AND c.parent_compositor_id IS NOT NULL + `; + const names = ctBodyNestedChildren.map((r: { local_name: string }) => r.local_name); + expect(names).toContain("break"); + + // Group ref for EG_PContent under CT_Body. + const groupEdges = await db.sql` + SELECT g.local_name AS group_name, ref_kind + FROM xsd_group_edges ge + JOIN xsd_symbols parent ON parent.id = ge.parent_symbol_id + JOIN xsd_symbols g ON g.id = ge.group_symbol_id + WHERE parent.local_name = 'CT_Body' + `; + expect(groupEdges).toHaveLength(1); + expect(groupEdges[0]).toMatchObject({ group_name: "EG_PContent", ref_kind: "group" }); +}); + test.skipIf(!realCacheReady)( "smoke: ingest WML closure into the dev DB and verify counts", async () => { @@ -197,12 +306,24 @@ test.skipIf(!realCacheReady)( db, }); - // Real WML closure has 12 documents (wml + closure). + // Real WML closure has 12 documents. expect(stats.documents).toBe(12); - // At least the parser-level totals should land as symbols. - // Real counts: 820 CT, 47 elem, 389 ST, 67 grp, 8 attrGrp, 14 attr = 1345 (+ a few xsd-builtins). 
expect(stats.symbolsInserted).toBeGreaterThan(1300); - // Most types have an explicit base (extension or restriction); expect many edges. expect(stats.inheritanceEdgesInserted).toBeGreaterThan(300); + expect(stats.compositorsInserted).toBeGreaterThan(500); + expect(stats.childEdgesInserted).toBeGreaterThan(1000); + expect(stats.groupRefsInserted).toBeGreaterThan(20); + expect(stats.childEdgesUnresolved).toBe(0); + expect(stats.groupRefsUnresolved).toBe(0); + + // w:tbl is the global element; its content type is CT_Tbl. Verify CT_Tbl has children. + const ctTblChildren = await db.sql` + SELECT s.local_name FROM xsd_child_edges e + JOIN xsd_symbols s ON s.id = e.child_symbol_id + JOIN xsd_symbols parent ON parent.id = e.parent_symbol_id + WHERE parent.local_name = 'CT_Tbl' AND parent.vocabulary_id = 'wml-main' + ORDER BY e.order_index + `; + expect(ctTblChildren.length).toBeGreaterThan(0); }, ); diff --git a/tests/ingest-xsd/parse-schema.test.ts b/tests/ingest-xsd/parse-schema.test.ts index b3fba01..5a5d9d3 100644 --- a/tests/ingest-xsd/parse-schema.test.ts +++ b/tests/ingest-xsd/parse-schema.test.ts @@ -85,10 +85,10 @@ test("declarationsByQName indexes all top-level declarations across documents", const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] }); const counts = countByKind(set.declarationsByQName); - // main.xsd: 1 element, 4 complexType, 1 simpleType, 1 group, 1 attributeGroup + // main.xsd: 1 element, 5 complexType, 1 simpleType, 1 group, 1 attributeGroup // shared.xsd: 2 simpleType expect(counts.element).toBe(1); - expect(counts.complexType).toBe(4); + expect(counts.complexType).toBe(5); expect(counts.simpleType).toBe(3); expect(counts.group).toBe(1); expect(counts.attributeGroup).toBe(1); From 280e76f047c9e1577bb25162bf895825b7460d46 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 11:51:58 -0300 Subject: [PATCH 10/24] fix(xsd): make Phase 3d content-model ingest idempotent Compositors / child_edges / 
group_edges have no natural unique key (a complexType can hold sibling sequences/choices), so the prior pass unconditionally inserted on every run, doubling rows on the second ingest. CT_Tbl content lookups against a re-ingested DB returned 0 rows because the order_index ranges no longer matched what queries expected. Switching to delete-and-rewrite per profile at the start of pass 3: DELETE FROM xsd_compositors WHERE profile_id = ? DELETE FROM xsd_group_edges WHERE profile_id = ? xsd_child_edges cleans up automatically via FK CASCADE on compositor_id. Inheritance / symbols / memberships stay upsert-only since they have natural keys. Idempotency test now also asserts compositor / child-edge / group-ref counts in the DB match the first-run insert counts after a second run. Verified: two consecutive `bun run xsd:ingest` against the WML closure both produce 585 compositors / 2098 child edges / 161 group refs and the DB ends at exactly those counts. --- scripts/ingest-xsd/ingest.ts | 10 ++++++++++ tests/ingest-xsd/ingest.test.ts | 12 +++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/scripts/ingest-xsd/ingest.ts b/scripts/ingest-xsd/ingest.ts index 5abcef5..31df0b7 100644 --- a/scripts/ingest-xsd/ingest.ts +++ b/scripts/ingest-xsd/ingest.ts @@ -195,6 +195,16 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise { expect(second.symbolsExisting).toBeGreaterThan(0); expect(second.profileMembershipsInserted).toBe(0); expect(second.inheritanceEdgesInserted).toBe(0); + // Content-model passes use delete-and-rewrite, so insert counts equal + // the first run on every re-run; DB row counts stay stable. + expect(second.compositorsInserted).toBe(first.compositorsInserted); + expect(second.childEdgesInserted).toBe(first.childEdgesInserted); + expect(second.groupRefsInserted).toBe(first.groupRefsInserted); // Row counts unchanged between first and second runs. 
const [c1] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_symbols`; const [c2] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_symbol_profiles`; const [c3] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_inheritance_edges`; + const [c4] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_compositors`; + const [c5] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_child_edges`; + const [c6] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_group_edges`; expect(c1.c).toBe(first.symbolsInserted); - // One membership per symbol per profile. expect(c2.c).toBe(first.profileMembershipsInserted); expect(c3.c).toBe(first.inheritanceEdgesInserted); + expect(c4.c).toBe(first.compositorsInserted); + expect(c5.c).toBe(first.childEdgesInserted); + expect(c6.c).toBe(first.groupRefsInserted); }); test("ingest writes compositors and child edges for nested content models", async () => { From 33072f5227b1aea26e7dc321b236526f29685406 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 12:02:17 -0300 Subject: [PATCH 11/24] feat(xsd): attributes, attributeGroup refs, and enums (Phase 3e) Pass 4 of ingestSchemaSet walks every complexType and attributeGroup declaration and writes: - xsd_attr_edges one row per direct or extension/restriction attribute. attr_use enum locked to required / optional / prohibited; default 'optional'. type_ref stores the Clark-style {namespace}localName so Phase 4 lookups can join across vocabularies, with the raw qname as a fallback when unresolvable. - xsd_group_edges additional rows with ref_kind='attributeGroup' for every on a complexType or another attributeGroup body. - xsd_enums one row per beneath a simpleType restriction; order_index preserved. Idempotency: same delete-and-rewrite-per-profile pattern as Pass 3. xsd_group_edges already gets cleared by Pass 3 so attributeGroup ref inserts here run on a fresh slate. 
attribute parents handled: - complexType direct (no wrapper) - complexContent / extension|restriction - simpleContent / extension|restriction - attributeGroup body (top-level) WML closure ingest stats: - 1114 attr edges (2 unresolved: xml:space / xml:lang) - 17 attributeGroup refs (0 unresolved) - 2189 enum values - elapsed ~3s - all unresolved counters elsewhere still 0 Real-data sanity: top attribute-heavy types match expectation (CT_ElemPropSet 28, CT_TextBodyProperties 19, ...). type_ref distribution shows xsd:boolean, ST_OnOff, ST_DecimalNumber, etc resolved to the right namespaces. Fixture main.xsd gains CT_TableUser to exercise an attributeGroup ref + a required attribute, alongside the existing direct, extension, and attributeGroup-body attribute paths and the ST_Jc enum. --- scripts/ingest-xsd/ingest.ts | 225 ++++++++++++++++++++++++++ tests/ingest-xsd/fixtures/main.xsd | 4 + tests/ingest-xsd/ingest.test.ts | 101 +++++++++++- tests/ingest-xsd/parse-schema.test.ts | 4 +- 4 files changed, 329 insertions(+), 5 deletions(-) diff --git a/scripts/ingest-xsd/ingest.ts b/scripts/ingest-xsd/ingest.ts index 31df0b7..5427a8c 100644 --- a/scripts/ingest-xsd/ingest.ts +++ b/scripts/ingest-xsd/ingest.ts @@ -64,6 +64,11 @@ export interface IngestStats { groupRefsInserted: number; groupRefsUnresolved: number; localElementsCreated: number; + attrEdgesInserted: number; + attrEdgesUnresolved: number; + attrGroupRefsInserted: number; + attrGroupRefsUnresolved: number; + enumsInserted: number; } export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise { @@ -86,6 +91,11 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise { @@ -242,6 +252,79 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise { + await sql` + INSERT INTO xsd_attr_edges + (symbol_id, attr_symbol_id, local_name, profile_id, attr_use, default_value, fixed_value, type_ref, order_index) + VALUES + (${symbolId}, ${attrSymbolId}, ${localName}, 
${profileId}, ${attrUse}, ${defaultValue}, ${fixedValue}, ${typeRef}, ${orderIndex}) + `; +} + +async function insertEnum( + sql: Sql, + symbolId: number, + profileId: number, + value: string, + orderIndex: number, +): Promise { + await sql` + INSERT INTO xsd_enums (symbol_id, profile_id, value, order_index) + VALUES (${symbolId}, ${profileId}, ${value}, ${orderIndex}) + `; +} + +/** + * Locate the nodes whose direct children are xsd:attribute / xsd:attributeGroup. + * For complexType: the type itself when there's no complexContent/simpleContent + * wrapper, otherwise the inner extension/restriction nodes. + * For attributeGroup: the group node itself. + */ +function findAttributeParents(decl: Declaration): PreserveOrderNode[] { + if (decl.kind === "attributeGroup") return [decl.node]; + if (decl.kind !== "complexType") return []; + + const out: PreserveOrderNode[] = []; + let sawWrapper = false; + for (const child of nodeChildrenLocal(decl.node)) { + const tag = stripPrefixLocal(nodeTagLocal(child)); + if (tag === "complexContent" || tag === "simpleContent") { + sawWrapper = true; + for (const inner of nodeChildrenLocal(child)) { + const innerTag = stripPrefixLocal(nodeTagLocal(inner)); + if (innerTag === "extension" || innerTag === "restriction") out.push(inner); + } + } + } + if (!sawWrapper) out.push(decl.node); + return out; +} + +/** xsd:simpleType > xsd:restriction > xsd:enumeration values, in order. 
*/ +function findEnumValues(decl: Declaration): string[] { + const values: string[] = []; + for (const child of nodeChildrenLocal(decl.node)) { + const tag = stripPrefixLocal(nodeTagLocal(child)); + if (tag !== "restriction") continue; + for (const e of nodeChildrenLocal(child)) { + const eTag = stripPrefixLocal(nodeTagLocal(e)); + if (eTag !== "enumeration") continue; + const a = nodeAttrs(e); + if (a.value !== undefined) values.push(a.value); + } + } + return values; +} + +async function handleAttribute( + sql: Sql, + node: PreserveOrderNode, + ownerSymbolId: number, + profileId: number, + prefixMap: Map, + defaultNamespace: string, + symbolIds: Map, + orderIndex: number, + stats: IngestStats, +): Promise { + const a = nodeAttrs(node); + let localName: string | null = null; + let attrSymbolId: number | null = null; + let typeRef: string | null = null; + + if (a.ref) { + const resolved = resolveQNameAttr(a.ref, prefixMap, defaultNamespace); + if (!resolved.resolved || !resolved.qname.vocabularyId) { + stats.attrEdgesUnresolved++; + return; + } + localName = resolved.qname.localName; + const id = symbolIds.get( + symbolKey(resolved.qname.vocabularyId, resolved.qname.localName, "attribute"), + ); + if (id != null) attrSymbolId = id; + } else if (a.name) { + localName = a.name; + if (a.type) { + const resolved = resolveQNameAttr(a.type, prefixMap, defaultNamespace); + if (resolved.resolved) { + typeRef = `{${resolved.qname.namespace}}${resolved.qname.localName}`; + } else { + typeRef = a.type; // store raw if unresolvable; never lose info + } + } + } + + if (!localName) return; + + const rawUse = a.use; + const attrUse: "required" | "optional" | "prohibited" = + rawUse === "required" || rawUse === "optional" || rawUse === "prohibited" + ? rawUse + : "optional"; + + await insertAttrEdge( + sql, + ownerSymbolId, + attrSymbolId, + localName, + profileId, + attrUse, + a.default ?? null, + a.fixed ?? 
null, + typeRef, + orderIndex, + ); + stats.attrEdgesInserted++; +} + // --- Inheritance discovery from AST ------------------------------------- interface InheritanceFinding { @@ -663,6 +883,11 @@ async function main() { console.log(`group refs: ${stats.groupRefsInserted}`); console.log(`group refs unres.: ${stats.groupRefsUnresolved}`); console.log(`local elements: ${stats.localElementsCreated}`); + console.log(`attr edges: ${stats.attrEdgesInserted}`); + console.log(`attr edges unres.: ${stats.attrEdgesUnresolved}`); + console.log(`attrGroup refs: ${stats.attrGroupRefsInserted}`); + console.log(`attrGroup refs unr.: ${stats.attrGroupRefsUnresolved}`); + console.log(`enums: ${stats.enumsInserted}`); console.log(`elapsed: ${ms}ms`); } finally { await db.close(); diff --git a/tests/ingest-xsd/fixtures/main.xsd b/tests/ingest-xsd/fixtures/main.xsd index b8122e2..fae8d56 100644 --- a/tests/ingest-xsd/fixtures/main.xsd +++ b/tests/ingest-xsd/fixtures/main.xsd @@ -48,5 +48,9 @@ + + + + diff --git a/tests/ingest-xsd/ingest.test.ts b/tests/ingest-xsd/ingest.test.ts index b610d3b..e3e1890 100644 --- a/tests/ingest-xsd/ingest.test.ts +++ b/tests/ingest-xsd/ingest.test.ts @@ -175,11 +175,14 @@ test("ingest is idempotent: re-running adds no new symbols/edges", async () => { expect(second.symbolsExisting).toBeGreaterThan(0); expect(second.profileMembershipsInserted).toBe(0); expect(second.inheritanceEdgesInserted).toBe(0); - // Content-model passes use delete-and-rewrite, so insert counts equal - // the first run on every re-run; DB row counts stay stable. + // Content-model + attribute passes use delete-and-rewrite, so insert counts + // equal the first run on every re-run; DB row counts stay stable. 
expect(second.compositorsInserted).toBe(first.compositorsInserted); expect(second.childEdgesInserted).toBe(first.childEdgesInserted); expect(second.groupRefsInserted).toBe(first.groupRefsInserted); + expect(second.attrEdgesInserted).toBe(first.attrEdgesInserted); + expect(second.attrGroupRefsInserted).toBe(first.attrGroupRefsInserted); + expect(second.enumsInserted).toBe(first.enumsInserted); // Row counts unchanged between first and second runs. const [c1] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_symbols`; @@ -188,12 +191,17 @@ test("ingest is idempotent: re-running adds no new symbols/edges", async () => { const [c4] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_compositors`; const [c5] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_child_edges`; const [c6] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_group_edges`; + const [c7] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_attr_edges`; + const [c8] = await db.sql`SELECT COUNT(*)::int AS c FROM xsd_enums`; expect(c1.c).toBe(first.symbolsInserted); expect(c2.c).toBe(first.profileMembershipsInserted); expect(c3.c).toBe(first.inheritanceEdgesInserted); expect(c4.c).toBe(first.compositorsInserted); expect(c5.c).toBe(first.childEdgesInserted); - expect(c6.c).toBe(first.groupRefsInserted); + // xsd_group_edges holds both ref_kind='group' and ref_kind='attributeGroup'. 
+ expect(c6.c).toBe(first.groupRefsInserted + first.attrGroupRefsInserted); + expect(c7.c).toBe(first.attrEdgesInserted); + expect(c8.c).toBe(first.enumsInserted); }); test("ingest writes compositors and child edges for nested content models", async () => { @@ -305,6 +313,84 @@ test("ingest writes compositors and child edges for nested content models", asyn expect(groupEdges[0]).toMatchObject({ group_name: "EG_PContent", ref_kind: "group" }); }); +test("ingest writes attributes, attributeGroup refs, and enum values", async () => { + const stats = await ingestSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + + // Fixture attributes: + // CT_Para/bold (optional, type s:ST_OnOff) + // CT_Extended/extra (optional, type xsd:string, under complexContent/extension) + // AG_TableProps/cols (optional, type xsd:int) + // CT_TableUser/caption (required, type xsd:string) + expect(stats.attrEdgesInserted).toBe(4); + expect(stats.attrEdgesUnresolved).toBe(0); + + // Fixture attributeGroup refs: + // CT_TableUser -> AG_TableProps + expect(stats.attrGroupRefsInserted).toBe(1); + expect(stats.attrGroupRefsUnresolved).toBe(0); + + // Fixture enums: ST_Jc has 3 values; ST_OnOff and ST_String have base restrictions + // without xsd:enumeration children, so 0 enum values from those. + expect(stats.enumsInserted).toBe(3); + + // CT_Para/bold attribute resolves to s:ST_OnOff in shared-types namespace. + const [bold] = await db.sql` + SELECT a.local_name, a.attr_use, a.type_ref + FROM xsd_attr_edges a + JOIN xsd_symbols s ON s.id = a.symbol_id + WHERE s.local_name = 'CT_Para' AND a.local_name = 'bold' + `; + expect(bold?.attr_use).toBe("optional"); + expect(bold?.type_ref).toBe( + "{http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes}ST_OnOff", + ); + + // CT_Extended/extra is on complexContent/extension. 
+ const [extra] = await db.sql` + SELECT a.local_name, a.attr_use + FROM xsd_attr_edges a + JOIN xsd_symbols s ON s.id = a.symbol_id + WHERE s.local_name = 'CT_Extended' AND a.local_name = 'extra' + `; + expect(extra?.attr_use).toBe("optional"); + + // CT_TableUser/caption is required. + const [caption] = await db.sql` + SELECT a.local_name, a.attr_use + FROM xsd_attr_edges a + JOIN xsd_symbols s ON s.id = a.symbol_id + WHERE s.local_name = 'CT_TableUser' AND a.local_name = 'caption' + `; + expect(caption?.attr_use).toBe("required"); + + // CT_TableUser has an attributeGroup ref to AG_TableProps. + const agRefs = await db.sql` + SELECT g.local_name AS group_name + FROM xsd_group_edges ge + JOIN xsd_symbols parent ON parent.id = ge.parent_symbol_id + JOIN xsd_symbols g ON g.id = ge.group_symbol_id + WHERE parent.local_name = 'CT_TableUser' AND ge.ref_kind = 'attributeGroup' + `; + expect(agRefs).toHaveLength(1); + expect(agRefs[0].group_name).toBe("AG_TableProps"); + + // ST_Jc enum values, in declared order. + const enumValues = await db.sql` + SELECT e.value, e.order_index + FROM xsd_enums e + JOIN xsd_symbols s ON s.id = e.symbol_id + WHERE s.local_name = 'ST_Jc' AND s.kind = 'simpleType' + ORDER BY e.order_index + `; + expect(enumValues.map((r: { value: string }) => r.value)).toEqual(["left", "center", "right"]); +}); + test.skipIf(!realCacheReady)( "smoke: ingest WML closure into the dev DB and verify counts", async () => { @@ -325,6 +411,15 @@ test.skipIf(!realCacheReady)( expect(stats.groupRefsInserted).toBeGreaterThan(20); expect(stats.childEdgesUnresolved).toBe(0); expect(stats.groupRefsUnresolved).toBe(0); + // Phase 3e additions: + expect(stats.attrEdgesInserted).toBeGreaterThan(500); + expect(stats.attrGroupRefsInserted).toBeGreaterThan(10); + expect(stats.enumsInserted).toBeGreaterThan(200); + // A handful of attribute refs target namespaces with no schemaLocation + // (notably xml:space / xml:lang). 
They resolve to the xml namespace but + // have no symbol because we don't load XSD's xml namespace schema. + expect(stats.attrEdgesUnresolved).toBeLessThan(10); + expect(stats.attrGroupRefsUnresolved).toBe(0); // w:tbl is the global element; its content type is CT_Tbl. Verify CT_Tbl has children. const ctTblChildren = await db.sql` diff --git a/tests/ingest-xsd/parse-schema.test.ts b/tests/ingest-xsd/parse-schema.test.ts index 5a5d9d3..406c18c 100644 --- a/tests/ingest-xsd/parse-schema.test.ts +++ b/tests/ingest-xsd/parse-schema.test.ts @@ -85,10 +85,10 @@ test("declarationsByQName indexes all top-level declarations across documents", const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] }); const counts = countByKind(set.declarationsByQName); - // main.xsd: 1 element, 5 complexType, 1 simpleType, 1 group, 1 attributeGroup + // main.xsd: 1 element, 6 complexType, 1 simpleType, 1 group, 1 attributeGroup // shared.xsd: 2 simpleType expect(counts.element).toBe(1); - expect(counts.complexType).toBe(5); + expect(counts.complexType).toBe(6); expect(counts.simpleType).toBe(3); expect(counts.group).toBe(1); expect(counts.attributeGroup).toBe(1); From c742f430f537c854830a1465700dd7adb48e35e1 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 13:33:23 -0300 Subject: [PATCH 12/24] fix(xsd): preserve element/attr type and group-ref compositor metadata Three correctness gaps surfaced before Phase 4: P1 - Local elements lost type and profile membership. WML uses inside groups; before this change the local element symbol carried no @type and was never linked to xsd_symbol_profiles, so ooxml_lookup_element/ooxml_children would not find it in the transitional profile or follow it to CT_P. P2 - Group refs in nested compositors lost context. inside a nested sequence/choice was inserted with parent_symbol_id and order_index only. 
The compositor it lives inside and the ref's own minOccurs/maxOccurs were dropped, so later expansion could not preserve ordering or cardinality relative to siblings. P2 - Referenced attributes lost type/default/fixed. An xsd:attribute ref set attr_symbol_id only; the type and default declared on the top-level xsd:attribute declaration were not recovered into the edge. Migration 0003_phase3_metadata adds: - xsd_symbols.type_ref TEXT (Clark-style {namespace}localName for elements and attributes that declare @type; NULL for the rest). - xsd_group_edges.compositor_id INT (FK with ON DELETE CASCADE), plus min_occurs / max_occurs. ingest.ts: - upsertSymbol now accepts typeRef; ON CONFLICT preserves the existing value via COALESCE so a re-run never blanks it out. - Pass 1 captures @type for top-level element/attribute decls. - Pass 3 captures @type and links local elements to xsd_symbol_profiles. - Pass 3 group refs thread compositor_id and parse min/max occurs. - Pass 4 attribute refs copy type_ref / default / fixed from the top-level declaration; attr_use stays from the ref site (XSD lets refs override use only). Real WML ingest after fix: - profile memberships: 1345 -> 2723 (1345 top-level + 1378 local elements now visible to ooxml_lookup_element). - 148 / 161 group refs carry compositor_id (rest are top-level). - Sample r:id attribute refs now expose type_ref={...relationships}ST_RelationshipId. Fixtures gain a top-level xsd:attribute (space) in shared.xsd and a CT_RefTest in main.xsd that refs it; the new test checks all three fixes. 
--- db/migrations/0003_phase3_metadata.sql | 10 ++ db/schema.sql | 11 +++ scripts/ingest-xsd/ingest.ts | 127 +++++++++++++++++++++---- tests/ingest-xsd/fixtures/main.xsd | 3 + tests/ingest-xsd/fixtures/shared.xsd | 1 + tests/ingest-xsd/ingest.test.ts | 76 ++++++++++++++- tests/ingest-xsd/parse-schema.test.ts | 8 +- 7 files changed, 212 insertions(+), 24 deletions(-) create mode 100644 db/migrations/0003_phase3_metadata.sql diff --git a/db/migrations/0003_phase3_metadata.sql b/db/migrations/0003_phase3_metadata.sql new file mode 100644 index 0000000..7093e35 --- /dev/null +++ b/db/migrations/0003_phase3_metadata.sql @@ -0,0 +1,10 @@ +-- Phase 3 review fix: preserve element/attribute @type and group-ref compositor context. +-- Idempotent. + +ALTER TABLE xsd_symbols + ADD COLUMN IF NOT EXISTS type_ref TEXT; + +ALTER TABLE xsd_group_edges + ADD COLUMN IF NOT EXISTS compositor_id INT REFERENCES xsd_compositors(id) ON DELETE CASCADE, + ADD COLUMN IF NOT EXISTS min_occurs INT DEFAULT 1, + ADD COLUMN IF NOT EXISTS max_occurs INT; diff --git a/db/schema.sql b/db/schema.sql index 3f2e1e6..69aaf54 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -64,11 +64,15 @@ CREATE TABLE xsd_namespaces ( created_at TIMESTAMPTZ DEFAULT NOW() ); +-- type_ref holds the Clark-style {namespace}localName for elements and attributes +-- that declare a @type. NULL for complexType/simpleType/group/attributeGroup. +-- Phase 4 lookups follow type_ref to resolve element -> type when reading children. 
CREATE TABLE xsd_symbols ( id SERIAL PRIMARY KEY, vocabulary_id TEXT NOT NULL, local_name TEXT NOT NULL, kind TEXT NOT NULL, + type_ref TEXT, payload JSONB DEFAULT '{}'::jsonb, created_at TIMESTAMPTZ DEFAULT NOW(), UNIQUE (vocabulary_id, local_name, kind) @@ -121,13 +125,20 @@ CREATE TABLE xsd_attr_edges ( order_index INT DEFAULT 0 ); +-- compositor_id is the enclosing compositor when a appears inside +-- a sequence/choice/all (NULL for refs at the type's top level or for +-- attributeGroup refs which don't live in a compositor). +-- min/max_occurs capture the ref site's own cardinality. CREATE TABLE xsd_group_edges ( id SERIAL PRIMARY KEY, parent_symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE, + compositor_id INT REFERENCES xsd_compositors(id) ON DELETE CASCADE, group_symbol_id INT NOT NULL REFERENCES xsd_symbols(id), profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE, ref_kind TEXT NOT NULL CHECK (ref_kind IN ('group', 'attributeGroup')), resolved BOOLEAN DEFAULT FALSE, + min_occurs INT DEFAULT 1, + max_occurs INT, order_index INT DEFAULT 0 ); diff --git a/scripts/ingest-xsd/ingest.ts b/scripts/ingest-xsd/ingest.ts index 5427a8c..072d32e 100644 --- a/scripts/ingest-xsd/ingest.ts +++ b/scripts/ingest-xsd/ingest.ts @@ -118,11 +118,16 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise; symbolIds: Map; + namespaceIds: Map; + parseResult: ParsedSchemaSet; stats: IngestStats; } +/** + * Resolve a declaration's @type qname (for top-level element/attribute decls) + * to Clark-style {namespace}localName, or null if the declaration has no @type. 
+ */ +function resolveDeclTypeRef(decl: Declaration, parseResult: ParsedSchemaSet): string | null { + if (decl.kind !== "element" && decl.kind !== "attribute") return null; + const a = nodeAttrs(decl.node); + if (!a.type) return null; + const prefixMap = parseResult.namespaceByPrefix.get(decl.documentPath); + if (!prefixMap) return a.type; + const r = resolveQNameAttr(a.type, prefixMap, decl.namespace); + return r.resolved ? `{${r.qname.namespace}}${r.qname.localName}` : a.type; +} + /** * For a complexType: yield the node(s) whose direct children are particles * (sequence/choice/all/group). That's the complexType itself, OR (for derived @@ -401,7 +436,7 @@ async function walkCompositor( await walkCompositor(child, tag, compositorId, childOrder, ctx); childOrder++; } else if (tag === "group") { - await handleGroupRef(child, childOrder, ctx, compositorId); + await handleGroupRef(child, compositorId, childOrder, ctx); childOrder++; } // xsd:any: skipped for now. @@ -432,10 +467,22 @@ async function handleElement( } childSymbolId = id; } else if (a.name) { + // Resolve @type so ooxml_lookup_element / ooxml_children can follow it. + let typeRef: string | null = null; + if (a.type) { + const r = resolveQNameAttr(a.type, ctx.prefixMap, ctx.ownerDecl.namespace); + typeRef = r.resolved ? `{${r.qname.namespace}}${r.qname.localName}` : a.type; + } const key = symbolKey(ctx.ownerDecl.vocabularyId, a.name, "element"); let id = ctx.symbolIds.get(key); if (id == null) { - const res = await upsertSymbol(ctx.sql, ctx.ownerDecl.vocabularyId, a.name, "element"); + const res = await upsertSymbol( + ctx.sql, + ctx.ownerDecl.vocabularyId, + a.name, + "element", + typeRef, + ); ctx.symbolIds.set(key, res.id); if (res.inserted) { ctx.stats.symbolsInserted++; @@ -443,7 +490,26 @@ async function handleElement( } else { ctx.stats.symbolsExisting++; } + // Local elements need profile membership too, otherwise + // ooxml_lookup_element won't find them in the transitional profile. 
+ const nsId = ctx.namespaceIds.get(ctx.ownerDecl.namespace); + if (nsId != null) { + const linked = await linkSymbolToProfile( + ctx.sql, + res.id, + ctx.profileId, + nsId, + ctx.sourceId, + ); + if (linked) ctx.stats.profileMembershipsInserted++; + } id = res.id; + } else if (typeRef) { + // Existing symbol; ensure type_ref is set if we have one. + await ctx.sql` + UPDATE xsd_symbols SET type_ref = ${typeRef} + WHERE id = ${id} AND type_ref IS NULL + `; } childSymbolId = id; } @@ -465,11 +531,10 @@ async function handleElement( async function handleGroupRef( node: PreserveOrderNode, + compositorId: number | null, orderIndex: number, ctx: WalkCtx, - _compositorId: number | null = null, ): Promise { - void _compositorId; // group_edges aren't compositor-scoped in our schema; refs hang off the parent symbol. const a = nodeAttrs(node); if (!a.ref) return; const resolved = resolveQNameAttr(a.ref, ctx.prefixMap, ctx.ownerDecl.namespace); @@ -487,9 +552,12 @@ async function handleGroupRef( await insertGroupEdge( ctx.sql, ctx.ownerSymbolId, + compositorId, groupSymbolId, ctx.profileId, "group", + parseMinOccurs(a.minOccurs), + parseMaxOccurs(a.maxOccurs), orderIndex, ); ctx.stats.groupRefsInserted++; @@ -542,11 +610,13 @@ async function upsertSymbol( vocabularyId: string, localName: string, kind: string, + typeRef: string | null = null, ): Promise<{ id: number; inserted: boolean }> { const [row] = await sql` - INSERT INTO xsd_symbols (vocabulary_id, local_name, kind) - VALUES (${vocabularyId}, ${localName}, ${kind}) - ON CONFLICT (vocabulary_id, local_name, kind) DO UPDATE SET kind = EXCLUDED.kind + INSERT INTO xsd_symbols (vocabulary_id, local_name, kind, type_ref) + VALUES (${vocabularyId}, ${localName}, ${kind}, ${typeRef}) + ON CONFLICT (vocabulary_id, local_name, kind) DO UPDATE + SET type_ref = COALESCE(EXCLUDED.type_ref, xsd_symbols.type_ref) RETURNING id, (xmax = 0) AS inserted `; return { id: row.id, inserted: row.inserted }; @@ -625,16 +695,19 @@ async function 
insertChildEdge( async function insertGroupEdge( sql: Sql, parentSymbolId: number, + compositorId: number | null, groupSymbolId: number, profileId: number, refKind: "group" | "attributeGroup", + minOccurs: number, + maxOccurs: number | null, orderIndex: number, ): Promise { await sql` INSERT INTO xsd_group_edges - (parent_symbol_id, group_symbol_id, profile_id, ref_kind, order_index) + (parent_symbol_id, compositor_id, group_symbol_id, profile_id, ref_kind, min_occurs, max_occurs, order_index) VALUES - (${parentSymbolId}, ${groupSymbolId}, ${profileId}, ${refKind}, ${orderIndex}) + (${parentSymbolId}, ${compositorId}, ${groupSymbolId}, ${profileId}, ${refKind}, ${minOccurs}, ${maxOccurs}, ${orderIndex}) `; } @@ -721,6 +794,7 @@ async function handleAttribute( prefixMap: Map, defaultNamespace: string, symbolIds: Map, + parseResult: ParsedSchemaSet, orderIndex: number, stats: IngestStats, ): Promise { @@ -728,6 +802,8 @@ async function handleAttribute( let localName: string | null = null; let attrSymbolId: number | null = null; let typeRef: string | null = null; + let defaultValue: string | null = a.default ?? null; + let fixedValue: string | null = a.fixed ?? null; if (a.ref) { const resolved = resolveQNameAttr(a.ref, prefixMap, defaultNamespace); @@ -740,6 +816,27 @@ async function handleAttribute( symbolKey(resolved.qname.vocabularyId, resolved.qname.localName, "attribute"), ); if (id != null) attrSymbolId = id; + + // Carry type/default/fixed from the top-level declaration. + // XSD allows these only on the declaration, not the ref site, so look them up there. 
+ const declKey = `{${resolved.qname.namespace}}attribute:${resolved.qname.localName}`; + const topDecl = parseResult.declarationsByQName.get(declKey)?.[0]; + if (topDecl) { + const declAttrs = nodeAttrs(topDecl.node); + if (declAttrs.type) { + const declPrefixMap = parseResult.namespaceByPrefix.get(topDecl.documentPath); + if (declPrefixMap) { + const t = resolveQNameAttr(declAttrs.type, declPrefixMap, topDecl.namespace); + typeRef = t.resolved + ? `{${t.qname.namespace}}${t.qname.localName}` + : declAttrs.type; + } else { + typeRef = declAttrs.type; + } + } + if (defaultValue == null) defaultValue = declAttrs.default ?? null; + if (fixedValue == null) fixedValue = declAttrs.fixed ?? null; + } } else if (a.name) { localName = a.name; if (a.type) { @@ -756,9 +853,7 @@ async function handleAttribute( const rawUse = a.use; const attrUse: "required" | "optional" | "prohibited" = - rawUse === "required" || rawUse === "optional" || rawUse === "prohibited" - ? rawUse - : "optional"; + rawUse === "required" || rawUse === "optional" || rawUse === "prohibited" ? rawUse : "optional"; await insertAttrEdge( sql, @@ -767,8 +862,8 @@ async function handleAttribute( localName, profileId, attrUse, - a.default ?? null, - a.fixed ?? 
null, + defaultValue, + fixedValue, typeRef, orderIndex, ); diff --git a/tests/ingest-xsd/fixtures/main.xsd b/tests/ingest-xsd/fixtures/main.xsd index fae8d56..2c44a6a 100644 --- a/tests/ingest-xsd/fixtures/main.xsd +++ b/tests/ingest-xsd/fixtures/main.xsd @@ -52,5 +52,8 @@ + + + diff --git a/tests/ingest-xsd/fixtures/shared.xsd b/tests/ingest-xsd/fixtures/shared.xsd index 12f113b..fa59f3a 100644 --- a/tests/ingest-xsd/fixtures/shared.xsd +++ b/tests/ingest-xsd/fixtures/shared.xsd @@ -8,4 +8,5 @@ + diff --git a/tests/ingest-xsd/ingest.test.ts b/tests/ingest-xsd/ingest.test.ts index e3e1890..8367feb 100644 --- a/tests/ingest-xsd/ingest.test.ts +++ b/tests/ingest-xsd/ingest.test.ts @@ -323,11 +323,12 @@ test("ingest writes attributes, attributeGroup refs, and enum values", async () }); // Fixture attributes: - // CT_Para/bold (optional, type s:ST_OnOff) - // CT_Extended/extra (optional, type xsd:string, under complexContent/extension) - // AG_TableProps/cols (optional, type xsd:int) + // CT_Para/bold (optional, type s:ST_OnOff) + // CT_Extended/extra (optional, type xsd:string, under complexContent/extension) + // AG_TableProps/cols (optional, type xsd:int) // CT_TableUser/caption (required, type xsd:string) - expect(stats.attrEdgesInserted).toBe(4); + // CT_RefTest/space (required, ref="s:space"; type/default copied from decl) + expect(stats.attrEdgesInserted).toBe(5); expect(stats.attrEdgesUnresolved).toBe(0); // Fixture attributeGroup refs: @@ -391,6 +392,73 @@ test("ingest writes attributes, attributeGroup refs, and enum values", async () expect(enumValues.map((r: { value: string }) => r.value)).toEqual(["left", "center", "right"]); }); +test("ingest preserves element/attribute @type, local-element profile membership, and group-ref compositor context", async () => { + await ingestSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + + // Top-level element: + // 
type_ref must point at CT_Empty in wml-main. + const [docSym] = await db.sql` + SELECT type_ref FROM xsd_symbols + WHERE local_name = 'document' AND kind = 'element' AND vocabulary_id = 'wml-main' + `; + expect(docSym?.type_ref).toBe( + "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}CT_Empty", + ); + + // Local element: inside CT_Para. + // Should have type_ref AND profile membership so ooxml_lookup_element finds it. + const [textSym] = await db.sql` + SELECT s.id, s.type_ref FROM xsd_symbols s + WHERE s.local_name = 'text' AND s.kind = 'element' AND s.vocabulary_id = 'wml-main' + `; + expect(textSym?.type_ref).toBe("{http://www.w3.org/2001/XMLSchema}string"); + + const [textMembership] = await db.sql` + SELECT sp.id FROM xsd_symbol_profiles sp + JOIN xsd_profiles p ON p.id = sp.profile_id + WHERE sp.symbol_id = ${textSym.id} AND p.name = 'transitional' + `; + expect(textMembership?.id).toBeDefined(); + + // Group ref inside a nested choice (CT_Body's choice contains ). + // compositor_id must point at the choice, not be null. Min/max occurs default to 1 + // since the ref itself has no minOccurs/maxOccurs in our fixture. + const [groupRef] = await db.sql` + SELECT ge.compositor_id, ge.min_occurs, ge.max_occurs, c.kind AS compositor_kind, + c.parent_compositor_id IS NOT NULL AS is_nested + FROM xsd_group_edges ge + JOIN xsd_compositors c ON c.id = ge.compositor_id + JOIN xsd_symbols g ON g.id = ge.group_symbol_id + JOIN xsd_symbols parent ON parent.id = ge.parent_symbol_id + WHERE parent.local_name = 'CT_Body' AND g.local_name = 'EG_PContent' + `; + expect(groupRef?.compositor_id).toBeDefined(); + expect(groupRef?.compositor_kind).toBe("choice"); + expect(groupRef?.is_nested).toBe(true); + + // Attribute ref: inside CT_RefTest. + // type_ref and default_value must be recovered from the top-level . + // attr_use must come from the ref site (required, not the declaration's optional default). 
+ const [refAttr] = await db.sql` + SELECT a.local_name, a.attr_use, a.default_value, a.type_ref, + a.attr_symbol_id IS NOT NULL AS has_attr_sym + FROM xsd_attr_edges a + JOIN xsd_symbols s ON s.id = a.symbol_id + WHERE s.local_name = 'CT_RefTest' AND s.kind = 'complexType' + `; + expect(refAttr?.local_name).toBe("space"); + expect(refAttr?.attr_use).toBe("required"); + expect(refAttr?.default_value).toBe("preserve"); + expect(refAttr?.type_ref).toBe("{http://www.w3.org/2001/XMLSchema}string"); + expect(refAttr?.has_attr_sym).toBe(true); +}); + test.skipIf(!realCacheReady)( "smoke: ingest WML closure into the dev DB and verify counts", async () => { diff --git a/tests/ingest-xsd/parse-schema.test.ts b/tests/ingest-xsd/parse-schema.test.ts index 406c18c..46942a8 100644 --- a/tests/ingest-xsd/parse-schema.test.ts +++ b/tests/ingest-xsd/parse-schema.test.ts @@ -85,14 +85,14 @@ test("declarationsByQName indexes all top-level declarations across documents", const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] }); const counts = countByKind(set.declarationsByQName); - // main.xsd: 1 element, 6 complexType, 1 simpleType, 1 group, 1 attributeGroup - // shared.xsd: 2 simpleType + // main.xsd: 1 element, 7 complexType, 1 simpleType, 1 group, 1 attributeGroup + // shared.xsd: 2 simpleType, 1 attribute expect(counts.element).toBe(1); - expect(counts.complexType).toBe(6); + expect(counts.complexType).toBe(7); expect(counts.simpleType).toBe(3); expect(counts.group).toBe(1); expect(counts.attributeGroup).toBe(1); - expect(counts.attribute).toBe(0); + expect(counts.attribute).toBe(1); // Specific decl lookup by canonical key. 
const ctPara = set.declarationsByQName.get(declarationQNameKey(WML_NS, "complexType", "CT_Para")); From e1c5cb024c7750eb105de2b4e82c32889fb23a5d Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 13:41:39 -0300 Subject: [PATCH 13/24] feat(mcp): add read-only OOXML structural tools (Phase 4) Six new MCP tools, gated by the ENABLE_OOXML_TOOLS env var. tools/list filters them out and tools/call returns method-not-found until the flag is set, so api.ooxml.dev/mcp's existing surface (search_ecma_spec / get_section / list_parts) is unaffected. Tools: ooxml_lookup_element qname (w:tbl, {ns}local, or bare) -> symbol info ooxml_lookup_type qname -> complexType or simpleType symbol ooxml_children element/type/group qname -> ordered child + group ref list ooxml_attributes element/type qname -> attrs unfolded through inheritance and attributeGroup refs ooxml_enum simpleType qname -> enumeration values in declared order ooxml_namespace_info uri -> profiles + symbol counts per profile Query layer (apps/mcp-server/src/ooxml-queries.ts): - parseQName accepts known OOXML prefixes (w/r/s/m/a/wp/pic/c/dgm/xsd), Clark form, or bare local names (defaults to wml-main). - lookupElement / lookupType / lookupSymbolByTypeRef walk xsd_symbol_profiles for profile-scoped hits. - getChildren walks the xsd_inheritance_edges chain via a recursive CTE and unions self + base xsd_child_edges and xsd_group_edges (group refs) in document order. Each entry carries its compositor kind and the type that contributed it. - getAttributes does the same and additionally recurses through attributeGroup refs; each entry carries 'self' / 'inherited' / 'attributeGroup' provenance with the owning name. - getEnums and getNamespaceInfo are direct profile-scoped lookups. 
Tool dispatch (apps/mcp-server/src/ooxml-tools.ts): - For element qnames passed to ooxml_children / ooxml_attributes the handler looks up the element, follows type_ref to its complexType, then reads from there (per the Phase 4 caveat in PLAN.md). - ooxml_children also falls back to looking up groups by name so users can call it on EG_PContent etc. - Unknown qnames produce a 'Not found' card listing alternative formats and the searched profile. - Default profile is literal 'transitional' until Phase 6. Response shape per PLAN.md: canonical symbol, namespace, type_ref where relevant, source, and a behavior-notes placeholder hooked up to nothing yet (Phase 5 fills it). Tests: 15 query-layer tests against a fresh ingest of the existing fixtures; passes alongside 21 ingest tests for a 36 / 0 total. Worker bundle dry-runs at 263 KiB (67 KiB gzip). --- apps/mcp-server/src/index.ts | 6 + apps/mcp-server/src/mcp.ts | 34 +- apps/mcp-server/src/ooxml-queries.ts | 440 +++++++++++++++++++++++++ apps/mcp-server/src/ooxml-tools.ts | 427 ++++++++++++++++++++++++ scripts/ingest-xsd/ingest.ts | 4 +- tests/mcp-server/ooxml-queries.test.ts | 222 +++++++++++++ 6 files changed, 1125 insertions(+), 8 deletions(-) create mode 100644 apps/mcp-server/src/ooxml-queries.ts create mode 100644 apps/mcp-server/src/ooxml-tools.ts create mode 100644 tests/mcp-server/ooxml-queries.test.ts diff --git a/apps/mcp-server/src/index.ts b/apps/mcp-server/src/index.ts index f50d025..17e1a21 100644 --- a/apps/mcp-server/src/index.ts +++ b/apps/mcp-server/src/index.ts @@ -16,6 +16,12 @@ import { handleMcpRequest } from "./mcp"; export interface Env { DATABASE_URL: string; VOYAGE_API_KEY: string; + /** + * Phase 4 feature flag. Set to "true" to expose ooxml_lookup_element / + * ooxml_lookup_type / ooxml_children / ooxml_attributes / ooxml_enum / + * ooxml_namespace_info via tools/list and tools/call. Default off. 
+ */ + ENABLE_OOXML_TOOLS?: string; } // Part descriptions diff --git a/apps/mcp-server/src/mcp.ts b/apps/mcp-server/src/mcp.ts index 9bf7f22..6837a28 100644 --- a/apps/mcp-server/src/mcp.ts +++ b/apps/mcp-server/src/mcp.ts @@ -7,6 +7,12 @@ import { createDb } from "./db"; import { embedQuery } from "./embeddings"; import type { Env } from "./index"; +import { + OOXML_TOOL_DEFS, + callOoxmlTool, + isOoxmlTool, + ooxmlToolsEnabled, +} from "./ooxml-tools"; // JSON-RPC types interface JsonRpcRequest { @@ -132,13 +138,12 @@ function handleInitialize(id: number | string | null): JsonRpcResponse { }; } -function handleToolsList(id: number | string | null): JsonRpcResponse { +function handleToolsList(id: number | string | null, env: Env): JsonRpcResponse { + const tools = ooxmlToolsEnabled(env) ? [...TOOLS, ...OOXML_TOOL_DEFS] : TOOLS; return { jsonrpc: "2.0", id, - result: { - tools: TOOLS, - }, + result: { tools }, }; } @@ -162,6 +167,25 @@ async function handleToolsCall( try { let resultText: string; + // Phase 4 OOXML tools, feature-flagged. tools/list also gates on the same flag, + // so callers should not see these tool names unless the flag is on. Defensive + // check here in case a caller hand-crafts a request. + if (isOoxmlTool(name)) { + if (!ooxmlToolsEnabled(env)) { + return { + jsonrpc: "2.0", + id, + error: { code: METHOD_NOT_FOUND, message: `Unknown tool: ${name}` }, + }; + } + resultText = await callOoxmlTool(name, args ?? {}, env); + return { + jsonrpc: "2.0", + id, + result: { content: [{ type: "text", text: resultText }] }, + }; + } + switch (name) { case "search_ecma_spec": { const query = args?.query as string; @@ -374,7 +398,7 @@ export async function handleMcpRequest(request: Request, env: Env): Promise namespace map for parsing user qnames like "w:tbl". + * Documents may use other bindings; for those, callers can pass Clark form + * `{namespace}localName` or just `localName` and accept the WML default. 
/**
 * Common OOXML prefix -> namespace map used to parse user qnames such as "w:tbl".
 * Prefixes that are not listed here are rejected by parseQName; callers can
 * instead pass Clark form `{namespace}localName`, or a bare `localName` to get
 * the WML main namespace.
 */
const COMMON_PREFIXES: Record<string, string> = {
  w: "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
  r: "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
  s: "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
  m: "http://schemas.openxmlformats.org/officeDocument/2006/math",
  a: "http://schemas.openxmlformats.org/drawingml/2006/main",
  wp: "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
  pic: "http://schemas.openxmlformats.org/drawingml/2006/picture",
  c: "http://schemas.openxmlformats.org/drawingml/2006/chart",
  dgm: "http://schemas.openxmlformats.org/drawingml/2006/diagram",
  xsd: "http://www.w3.org/2001/XMLSchema",
  xs: "http://www.w3.org/2001/XMLSchema",
  xml: "http://www.w3.org/XML/1998/namespace",
};

/** Namespace assumed for bare local names (the `w` entry above). */
const DEFAULT_NAMESPACE = COMMON_PREFIXES.w;

/** A qname split into its namespace URI and local name. */
export interface ParsedQName {
  namespace: string;
  localName: string;
  /** Prefix the caller typed, or null for Clark-form and bare names. */
  rawPrefix: string | null;
}

/** Outcome of parseQName: the parsed qname, or a human-readable rejection reason. */
export type QNameParseResult =
  | { ok: true; qname: ParsedQName }
  | { ok: false; reason: string };

/**
 * Parse a user-supplied qname. Accepts:
 *  - `prefix:localName` for the prefixes listed in COMMON_PREFIXES
 *  - `{namespace}localName` Clark form
 *  - bare `localName` (assumes the WML main namespace)
 *
 * Surrounding whitespace is ignored. Never throws; malformed input is reported
 * through `{ ok: false, reason }`.
 *
 * @param raw The qname exactly as the caller supplied it.
 * @returns The parsed qname, or the reason it was rejected.
 */
export function parseQName(raw: string): QNameParseResult {
  const text = raw ? raw.trim() : "";
  if (!text) return { ok: false, reason: "empty qname" };

  if (text.startsWith("{")) {
    const close = text.indexOf("}");
    if (close < 0) return { ok: false, reason: "malformed Clark qname (missing })" };
    const localName = text.slice(close + 1);
    if (!localName) return { ok: false, reason: "missing local name in Clark qname" };
    return { ok: true, qname: { namespace: text.slice(1, close), localName, rawPrefix: null } };
  }

  const colon = text.indexOf(":");
  if (colon < 0) {
    return {
      ok: true,
      qname: { namespace: DEFAULT_NAMESPACE, localName: text, rawPrefix: null },
    };
  }

  const prefix = text.slice(0, colon);
  const localName = text.slice(colon + 1);
  // Own-property check: a plain index lookup would also match inherited keys
  // such as "constructor" or "toString" and hand back a function as namespace.
  const namespace = Object.prototype.hasOwnProperty.call(COMMON_PREFIXES, prefix)
    ? COMMON_PREFIXES[prefix]
    : undefined;
  if (namespace === undefined) {
    // The hint is derived from the table so it cannot drift from what is accepted.
    const known = Object.keys(COMMON_PREFIXES).join(", ");
    return {
      ok: false,
      reason: `unknown prefix '${prefix}'. Use a known prefix (${known}), or Clark form {namespace}localName.`,
    };
  }
  // Mirror the Clark-form rule: a qname without a local name is not usable.
  if (!localName) {
    return { ok: false, reason: `missing local name after prefix '${prefix}'` };
  }
  return { ok: true, qname: { namespace, localName, rawPrefix: prefix } };
}

/** One symbol row as resolved inside a single profile. */
export interface SymbolHit {
  id: number;
  vocabularyId: string;
  localName: string;
  kind: string;
  typeRef: string | null;
  namespaceUri: string;
  profileName: string;
  sourceName: string | null;
}
/**
 * Look up a symbol by namespace + localName + kind in a given profile.
 *
 * The match goes through xsd_symbol_profiles, so a symbol that exists but is
 * not a member of `profile` is reported as missing.
 *
 * @param sql       Tagged-template SQL client.
 * @param namespace Namespace URI, compared with xsd_namespaces.uri.
 * @param localName Local name, compared with xsd_symbols.local_name.
 * @param kind      Symbol kind, compared with xsd_symbols.kind.
 * @param profile   Profile name, compared with xsd_profiles.name.
 * @returns The matching symbol, or null when no row matches.
 */
export async function lookupSymbol(
  sql: Sql,
  namespace: string,
  localName: string,
  kind: string,
  profile: string,
): Promise<SymbolHit | null> {
  // ORDER BY makes the LIMIT 1 pick deterministic should several rows match.
  const rows = await sql`
    SELECT s.id, s.vocabulary_id, s.local_name, s.kind, s.type_ref,
           ns.uri AS namespace_uri, p.name AS profile_name, src.name AS source_name
    FROM xsd_symbols s
    JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id
    JOIN xsd_namespaces ns ON ns.id = sp.namespace_id
    JOIN xsd_profiles p ON p.id = sp.profile_id
    LEFT JOIN reference_sources src ON src.id = sp.source_id
    WHERE s.local_name = ${localName}
      AND s.kind = ${kind}
      AND ns.uri = ${namespace}
      AND p.name = ${profile}
    ORDER BY s.id
    LIMIT 1
  `;
  const r = rows[0];
  if (!r) return null;
  return {
    id: r.id as number,
    vocabularyId: r.vocabulary_id as string,
    localName: r.local_name as string,
    kind: r.kind as string,
    typeRef: r.type_ref as string | null,
    namespaceUri: r.namespace_uri as string,
    profileName: r.profile_name as string,
    sourceName: r.source_name as string | null,
  };
}

/** Look up an element by qname in a profile. */
export function lookupElement(
  sql: Sql,
  namespace: string,
  localName: string,
  profile: string,
): Promise<SymbolHit | null> {
  return lookupSymbol(sql, namespace, localName, "element", profile);
}

/**
 * Look up a type symbol (complexType OR simpleType) by qname.
 * Tries complexType first and only queries for a simpleType when that misses.
 *
 * @returns The complexType if one matches, otherwise the simpleType, otherwise null.
 */
export async function lookupType(
  sql: Sql,
  namespace: string,
  localName: string,
  profile: string,
): Promise<SymbolHit | null> {
  const complexHit = await lookupSymbol(sql, namespace, localName, "complexType", profile);
  if (complexHit) return complexHit;
  return lookupSymbol(sql, namespace, localName, "simpleType", profile);
}
/**
 * Resolve a Clark-style type_ref (e.g. `{ns}local`) to the type symbol it points at.
 *
 * @returns null when `typeRef` is not in Clark form, has no local name, or no
 *          complexType/simpleType with that qname is a member of `profile`.
 */
export async function lookupSymbolByTypeRef(
  sql: Sql,
  typeRef: string,
  profile: string,
): Promise<SymbolHit | null> {
  if (!typeRef.startsWith("{")) return null;
  const close = typeRef.indexOf("}");
  if (close < 0) return null;
  const namespace = typeRef.slice(1, close);
  const localName = typeRef.slice(close + 1);
  // Nothing can match an empty local name; skip the two lookups.
  if (!localName) return null;
  return lookupType(sql, namespace, localName, profile);
}

/** One entry of a type's content model: a child element or a group reference. */
export interface ChildEdge {
  kind: "element" | "group";
  localName: string;
  vocabularyId: string;
  namespaceUri: string | null;
  minOccurs: number;
  /** null is rendered as "unbounded" by the report formatter. */
  maxOccurs: number | null;
  orderIndex: number;
  compositorKind: string | null;
  compositorId: number | null;
  parentCompositorId: number | null;
  /** "self" when declared on the queried type, "inherited" when it comes from a base. */
  source: "self" | "inherited";
  owningTypeName: string;
}

/**
 * Upper bound on inheritance hops followed by collectInheritance. It exists
 * only to stop the recursive query if the edge table ever contains a cycle.
 */
const MAX_INHERITANCE_DEPTH = 64;

/**
 * Collect inheritance ancestors of a type symbol (self first, then bases).
 * Each entry is the symbol id and its name for surfacing in responses.
 *
 * @param sql          Tagged-template SQL client.
 * @param rootSymbolId Symbol to start from; returned as the first entry.
 * @param profile      Only inheritance edges recorded for this profile are followed.
 * @returns The chain ordered by distance from the root; empty if the root id is unknown.
 */
async function collectInheritance(
  sql: Sql,
  rootSymbolId: number,
  profile: string,
): Promise<Array<{ id: number; localName: string; vocabularyId: string }>> {
  const rows = await sql`
    WITH RECURSIVE chain AS (
      SELECT s.id, s.local_name, s.vocabulary_id, 0 AS depth
      FROM xsd_symbols s
      WHERE s.id = ${rootSymbolId}
      UNION ALL
      SELECT base.id, base.local_name, base.vocabulary_id, c.depth + 1
      FROM chain c
      JOIN xsd_inheritance_edges e ON e.symbol_id = c.id
      JOIN xsd_profiles p ON p.id = e.profile_id
      JOIN xsd_symbols base ON base.id = e.base_symbol_id
      WHERE p.name = ${profile}
        AND c.depth < ${MAX_INHERITANCE_DEPTH}
    )
    SELECT id, local_name, vocabulary_id FROM chain ORDER BY depth
  `;
  return rows.map((r: Record<string, unknown>) => ({
    id: r.id as number,
    localName: r.local_name as string,
    vocabularyId: r.vocabulary_id as string,
  }));
}
/**
 * Children of a type symbol, walking inheritance to union the bases' content.
 * Returns elements (from xsd_child_edges) and group refs (from xsd_group_edges
 * with ref_kind='group'). Group refs are returned as-is; callers who want them
 * flattened can call getChildren on the referenced group.
 *
 * Ordering: entries are grouped per type in chain order (the queried type
 * first, then each base), and sorted by order_index within each type.
 *
 * The per-type queries are independent of each other, so they are issued
 * concurrently; results are stitched back together in chain order.
 *
 * @param sql          Tagged-template SQL client.
 * @param rootSymbolId Symbol whose content model is wanted.
 * @param profile      Profile name; only edges recorded for it are returned.
 * @returns The child entries, or [] when the root symbol is unknown.
 */
export async function getChildren(
  sql: Sql,
  rootSymbolId: number,
  profile: string,
): Promise<ChildEdge[]> {
  const chain = await collectInheritance(sql, rootSymbolId, profile);
  if (chain.length === 0) return [];

  const perAncestor = await Promise.all(
    chain.map(async (ancestor): Promise<ChildEdge[]> => {
      const [elemRows, groupRows] = await Promise.all([
        sql`
          SELECT s.local_name, s.vocabulary_id, ns.uri AS namespace_uri,
                 e.min_occurs, e.max_occurs, e.order_index,
                 c.kind AS compositor_kind, c.id AS compositor_id, c.parent_compositor_id
          FROM xsd_child_edges e
          JOIN xsd_symbols s ON s.id = e.child_symbol_id
          LEFT JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id AND sp.profile_id = e.profile_id
          LEFT JOIN xsd_namespaces ns ON ns.id = sp.namespace_id
          JOIN xsd_compositors c ON c.id = e.compositor_id
          JOIN xsd_profiles p ON p.id = e.profile_id
          WHERE e.parent_symbol_id = ${ancestor.id} AND p.name = ${profile}
          ORDER BY e.order_index
        `,
        sql`
          SELECT g.local_name, g.vocabulary_id,
                 ge.min_occurs, ge.max_occurs, ge.order_index,
                 c.kind AS compositor_kind, ge.compositor_id, c.parent_compositor_id
          FROM xsd_group_edges ge
          JOIN xsd_symbols g ON g.id = ge.group_symbol_id
          LEFT JOIN xsd_compositors c ON c.id = ge.compositor_id
          JOIN xsd_profiles p ON p.id = ge.profile_id
          WHERE ge.parent_symbol_id = ${ancestor.id}
            AND ge.ref_kind = 'group'
            AND p.name = ${profile}
          ORDER BY ge.order_index
        `,
      ]);

      const source: ChildEdge["source"] = ancestor.id === rootSymbolId ? "self" : "inherited";
      const entries: ChildEdge[] = [];
      for (const r of elemRows) {
        entries.push({
          kind: "element",
          localName: r.local_name as string,
          vocabularyId: r.vocabulary_id as string,
          namespaceUri: (r.namespace_uri as string | null) ?? null,
          minOccurs: r.min_occurs as number,
          maxOccurs: r.max_occurs as number | null,
          orderIndex: r.order_index as number,
          compositorKind: r.compositor_kind as string | null,
          compositorId: r.compositor_id as number | null,
          parentCompositorId: r.parent_compositor_id as number | null,
          source,
          owningTypeName: ancestor.localName,
        });
      }
      for (const r of groupRows) {
        entries.push({
          kind: "group",
          localName: r.local_name as string,
          vocabularyId: r.vocabulary_id as string,
          // The group query does not select a namespace.
          namespaceUri: null,
          minOccurs: r.min_occurs as number,
          maxOccurs: r.max_occurs as number | null,
          orderIndex: r.order_index as number,
          compositorKind: r.compositor_kind as string | null,
          compositorId: r.compositor_id as number | null,
          parentCompositorId: r.parent_compositor_id as number | null,
          source,
          owningTypeName: ancestor.localName,
        });
      }
      // Interleave elements and group refs by their position in the declaring type.
      return entries.sort((a, b) => a.orderIndex - b.orderIndex);
    }),
  );
  return perAncestor.flat();
}

/** One attribute as seen on a type, after inheritance and attributeGroup unfolding. */
export interface AttrEntry {
  localName: string;
  attrUse: "required" | "optional" | "prohibited";
  defaultValue: string | null;
  fixedValue: string | null;
  typeRef: string | null;
  /** Where the attribute was declared relative to the queried type. */
  source: "self" | "inherited" | "attributeGroup";
  /** Name of the type or attributeGroup that contributed the attribute. */
  owningName: string;
}
/** Shared state for one getAttributes walk. */
interface AttrWalk {
  sql: Sql;
  profile: string;
  /** Local names already emitted; the first declaration seen wins. */
  seenAttrs: Set<string>;
  /** attributeGroup symbol ids already unfolded; also breaks reference cycles. */
  visitedGroups: Set<number>;
  out: AttrEntry[];
}

/** Append the attributes declared directly on `symbolId`, skipping names already seen. */
async function appendOwnAttrs(
  walk: AttrWalk,
  symbolId: number,
  source: AttrEntry["source"],
  owningName: string,
): Promise<void> {
  const { sql, profile } = walk;
  const rows = await sql`
    SELECT a.local_name, a.attr_use, a.default_value, a.fixed_value, a.type_ref, a.order_index
    FROM xsd_attr_edges a
    JOIN xsd_profiles p ON p.id = a.profile_id
    WHERE a.symbol_id = ${symbolId} AND p.name = ${profile}
    ORDER BY a.order_index
  `;
  for (const r of rows) {
    const name = r.local_name as string;
    if (walk.seenAttrs.has(name)) continue;
    walk.seenAttrs.add(name);
    walk.out.push({
      localName: name,
      attrUse: r.attr_use as "required" | "optional" | "prohibited",
      defaultValue: r.default_value as string | null,
      fixedValue: r.fixed_value as string | null,
      typeRef: r.type_ref as string | null,
      source,
      owningName,
    });
  }
}

/**
 * Append the attributes contributed by every attributeGroup referenced from
 * `parentSymbolId`, following references nested inside those groups as well.
 */
async function appendAttributeGroupAttrs(walk: AttrWalk, parentSymbolId: number): Promise<void> {
  const { sql, profile } = walk;
  const refs = await sql`
    SELECT ge.group_symbol_id, g.local_name AS group_name
    FROM xsd_group_edges ge
    JOIN xsd_symbols g ON g.id = ge.group_symbol_id
    JOIN xsd_profiles p ON p.id = ge.profile_id
    WHERE ge.parent_symbol_id = ${parentSymbolId}
      AND ge.ref_kind = 'attributeGroup'
      AND p.name = ${profile}
    ORDER BY ge.order_index
  `;
  for (const ref of refs) {
    const groupId = ref.group_symbol_id as number;
    // A group that was already unfolded cannot add anything new (its names are
    // all in seenAttrs); skipping it also terminates reference cycles.
    if (walk.visitedGroups.has(groupId)) continue;
    walk.visitedGroups.add(groupId);

    const groupName = ref.group_name as string;
    const innerChain = await collectInheritance(sql, groupId, profile);
    for (const inner of innerChain) {
      await appendOwnAttrs(walk, inner.id, "attributeGroup", groupName);
      await appendAttributeGroupAttrs(walk, inner.id);
    }
  }
}

/**
 * Attributes on a type symbol, including those from base types (inheritance)
 * and from attributeGroup refs (recursively, including groups referenced from
 * inside other groups).
 *
 * Entries are de-duplicated by local name and the first declaration wins.
 * Because the walk visits the queried type before its bases, a derived
 * declaration overrides the base one.
 *
 * @param sql          Tagged-template SQL client.
 * @param rootSymbolId Symbol whose attributes are wanted.
 * @param profile      Profile name; only edges recorded for it are read.
 * @returns Attribute entries in discovery order; [] when the root symbol is unknown.
 */
export async function getAttributes(
  sql: Sql,
  rootSymbolId: number,
  profile: string,
): Promise<AttrEntry[]> {
  const chain = await collectInheritance(sql, rootSymbolId, profile);
  const walk: AttrWalk = {
    sql,
    profile,
    seenAttrs: new Set<string>(),
    visitedGroups: new Set<number>(),
    out: [],
  };

  // Sequential on purpose: the de-duplication depends on visiting order.
  for (const ancestor of chain) {
    const source = ancestor.id === rootSymbolId ? "self" : "inherited";
    await appendOwnAttrs(walk, ancestor.id, source, ancestor.localName);
    await appendAttributeGroupAttrs(walk, ancestor.id);
  }
  return walk.out;
}

/** One enumeration value of a simpleType. */
export interface EnumEntry {
  value: string;
  orderIndex: number;
}

/**
 * Enumeration values recorded for a symbol in a profile, ordered by order_index.
 *
 * @returns The values, or [] when none are recorded for that symbol and profile.
 */
export async function getEnums(
  sql: Sql,
  symbolId: number,
  profile: string,
): Promise<EnumEntry[]> {
  const rows = await sql`
    SELECT e.value, e.order_index
    FROM xsd_enums e
    JOIN xsd_profiles p ON p.id = e.profile_id
    WHERE e.symbol_id = ${symbolId} AND p.name = ${profile}
    ORDER BY e.order_index
  `;
  return rows.map((r: Record<string, unknown>) => ({
    value: r.value as string,
    orderIndex: r.order_index as number,
  }));
}

/** Summary of what is known about one namespace URI. */
export interface NamespaceInfo {
  uri: string;
  /** Distinct vocabulary ids of the symbols in this namespace, sorted. */
  vocabularies: string[];
  /** Per-profile symbol counts, ordered by profile name. */
  profiles: Array<{ name: string; symbolCount: number }>;
}

/**
 * Describe a namespace URI: which profiles contain symbols in it, how many,
 * and which vocabularies those symbols belong to.
 *
 * @returns null when the URI has no row in xsd_namespaces.
 */
export async function getNamespaceInfo(sql: Sql, uri: string): Promise<NamespaceInfo | null> {
  const nsRows = await sql`SELECT id FROM xsd_namespaces WHERE uri = ${uri} LIMIT 1`;
  if (nsRows.length === 0) return null;
  const nsId = nsRows[0].id as number;

  const profileRows = await sql`
    SELECT p.name AS profile_name, COUNT(*)::int AS symbol_count,
           array_agg(DISTINCT s.vocabulary_id) AS vocabularies
    FROM xsd_symbol_profiles sp
    JOIN xsd_profiles p ON p.id = sp.profile_id
    JOIN xsd_symbols s ON s.id = sp.symbol_id
    WHERE sp.namespace_id = ${nsId}
    GROUP BY p.name
    ORDER BY p.name
  `;

  const vocabSet = new Set<string>();
  const profiles: NamespaceInfo["profiles"] = [];
  for (const r of profileRows) {
    profiles.push({
      name: r.profile_name as string,
      symbolCount: r.symbol_count as number,
    });
    for (const v of (r.vocabularies as string[]) ?? []) vocabSet.add(v);
  }
  return { uri, vocabularies: [...vocabSet].sort(), profiles };
}
/**
 * Phase 4 read-only structural MCP tools. Behind the ENABLE_OOXML_TOOLS env
 * flag, which gates both tools/list discovery and tools/call dispatch so the
 * public surface stays unchanged until the feature is intentionally enabled.
 *
 * Tools:
 *   ooxml_lookup_element, ooxml_lookup_type, ooxml_children,
 *   ooxml_attributes, ooxml_enum, ooxml_namespace_info.
 *
 * Every tool returns a markdown report. Lookups that find nothing return a
 * "Not found" report instead of throwing.
 */

import { neon } from "@neondatabase/serverless";
import {
  type AttrEntry,
  type ChildEdge,
  type EnumEntry,
  type NamespaceInfo,
  type SymbolHit,
  getAttributes,
  getChildren,
  getEnums,
  getNamespaceInfo,
  lookupElement,
  lookupSymbol,
  lookupSymbolByTypeRef,
  lookupType,
  parseQName,
} from "./ooxml-queries";

/** Profile used when a tool call does not name one. */
export const DEFAULT_PROFILE = "transitional";

/** The subset of the worker environment these tools read. */
export interface OoxmlEnv {
  DATABASE_URL: string;
  ENABLE_OOXML_TOOLS?: string;
}

/** True when the feature flag is set to "true" or "1"; any other value leaves the tools off. */
export function ooxmlToolsEnabled(env: OoxmlEnv): boolean {
  const flag = env.ENABLE_OOXML_TOOLS;
  return flag === "true" || flag === "1";
}

/** MCP tool definitions advertised through tools/list when the flag is on. */
export const OOXML_TOOL_DEFS = [
  {
    name: "ooxml_lookup_element",
    description:
      "Look up an OOXML element by qname in a profile. Returns canonical symbol info (vocabulary, namespace, declared @type, profile membership, source). Accepts 'w:tbl', '{namespace}localName' (Clark form), or bare 'localName' (defaults to wml-main).",
    inputSchema: {
      type: "object" as const,
      properties: {
        qname: { type: "string", description: "Element qname, e.g. 'w:tbl' or '{...}tbl'." },
        profile: {
          type: "string",
          description: "Profile name (default: 'transitional').",
        },
      },
      required: ["qname"],
    },
  },
  {
    name: "ooxml_lookup_type",
    description:
      "Look up a complexType or simpleType by qname in a profile. Tries complexType first, then simpleType.",
    inputSchema: {
      type: "object" as const,
      properties: {
        qname: { type: "string", description: "Type qname, e.g. 'w:CT_Tbl' or 'CT_Tbl'." },
        profile: { type: "string", description: "Profile name (default: 'transitional')." },
      },
      required: ["qname"],
    },
  },
  {
    name: "ooxml_children",
    description:
      "List the legal children of an element or complexType in document order. For an element, follows @type to its complexType first. Walks inheritance to union content from base types. Group refs are surfaced as-is; resolve them by calling ooxml_children on the group qname.",
    inputSchema: {
      type: "object" as const,
      properties: {
        qname: {
          type: "string",
          description: "Element, complexType, or group qname (e.g. 'w:tbl', 'CT_Tbl', 'EG_PContent').",
        },
        profile: { type: "string", description: "Profile name (default: 'transitional')." },
      },
      required: ["qname"],
    },
  },
  {
    name: "ooxml_attributes",
    description:
      "List the attributes of an element or complexType. For an element, follows @type to its complexType first. Walks inheritance and unfolds attributeGroup refs recursively. Each entry includes use (required/optional/prohibited), default, fixed, and type_ref.",
    inputSchema: {
      type: "object" as const,
      properties: {
        qname: { type: "string", description: "Element or complexType qname." },
        profile: { type: "string", description: "Profile name (default: 'transitional')." },
      },
      required: ["qname"],
    },
  },
  {
    name: "ooxml_enum",
    description:
      "List enumeration values for a simpleType. Pass the simpleType qname (e.g. 'w:ST_Jc' or 'ST_Jc') and get back the values in declaration order.",
    inputSchema: {
      type: "object" as const,
      properties: {
        qname: { type: "string", description: "simpleType qname." },
        profile: { type: "string", description: "Profile name (default: 'transitional')." },
      },
      required: ["qname"],
    },
  },
  {
    name: "ooxml_namespace_info",
    description:
      "Show what's known about a namespace URI: vocabularies, profiles that include it, and how many symbols each profile contributes.",
    inputSchema: {
      type: "object" as const,
      properties: {
        uri: { type: "string", description: "Namespace URI." },
      },
      required: ["uri"],
    },
  },
];

/** Names of the tools this module can dispatch. */
export type OoxmlToolName =
  | "ooxml_lookup_element"
  | "ooxml_lookup_type"
  | "ooxml_children"
  | "ooxml_attributes"
  | "ooxml_enum"
  | "ooxml_namespace_info";

const OOXML_TOOL_NAMES: ReadonlySet<string> = new Set(OOXML_TOOL_DEFS.map((t) => t.name));

/** Type guard: is `name` one of the tools defined in OOXML_TOOL_DEFS? */
export function isOoxmlTool(name: string): name is OoxmlToolName {
  return OOXML_TOOL_NAMES.has(name);
}

// biome-ignore lint/suspicious/noExplicitAny: neon's tagged template is loosely typed.
type Sql = any;

/**
 * Pick the profile for a call. Anything that is not a non-blank string falls
 * back to DEFAULT_PROFILE, so malformed arguments never reach a SQL parameter.
 */
function resolveProfile(raw: unknown): string {
  if (typeof raw !== "string") return DEFAULT_PROFILE;
  const trimmed = raw.trim();
  return trimmed === "" ? DEFAULT_PROFILE : trimmed;
}

/**
 * Resolve a qname to the symbol whose content should be listed.
 *
 * Resolution order: a type with that name; otherwise an element with that name
 * (following its type_ref when it has one); otherwise, when `allowGroup` is
 * set, a group with that name.
 *
 * @returns `typeSym` is the symbol to read from (null when unresolved);
 *          `elementSym` is set when the qname named an element.
 */
async function resolveContentOwner(
  sql: Sql,
  namespace: string,
  localName: string,
  profile: string,
  allowGroup: boolean,
): Promise<{ typeSym: SymbolHit | null; elementSym: SymbolHit | null }> {
  const directType = await lookupType(sql, namespace, localName, profile);
  if (directType) return { typeSym: directType, elementSym: null };

  const elementSym = await lookupElement(sql, namespace, localName, profile);
  if (elementSym) {
    const typeSym = elementSym.typeRef
      ? await lookupSymbolByTypeRef(sql, elementSym.typeRef, profile)
      : null;
    return { typeSym, elementSym };
  }

  // Fall back to looking for a group with this name (so EG_PContent works).
  const groupSym = allowGroup
    ? await lookupSymbol(sql, namespace, localName, "group", profile)
    : null;
  return { typeSym: groupSym, elementSym: null };
}

/**
 * Execute one OOXML tool call and render its result as markdown.
 *
 * @param name Tool to run; callers are expected to have checked isOoxmlTool.
 * @param args Raw tool arguments (`qname` or `uri`, optional `profile`).
 * @param env  Supplies DATABASE_URL for the query client.
 * @returns A markdown report, or a "Not found" report when the lookup misses
 *          or the qname cannot be parsed.
 * @throws Error when `name` is not a handled tool name.
 */
export async function callOoxmlTool(
  name: OoxmlToolName,
  args: Record<string, unknown>,
  env: OoxmlEnv,
): Promise<string> {
  const sql: Sql = neon(env.DATABASE_URL);
  const profile = resolveProfile(args.profile);

  switch (name) {
    case "ooxml_lookup_element": {
      const q = parseQName(String(args.qname ?? ""));
      if (!q.ok) return formatNotFound(`could not parse qname: ${q.reason}`);
      const hit = await lookupElement(sql, q.qname.namespace, q.qname.localName, profile);
      if (!hit) {
        return formatNotFound(
          `element ${q.qname.localName} in namespace ${q.qname.namespace}`,
          profile,
        );
      }
      return formatSymbolReport("Element", hit, profile);
    }

    case "ooxml_lookup_type": {
      const q = parseQName(String(args.qname ?? ""));
      if (!q.ok) return formatNotFound(`could not parse qname: ${q.reason}`);
      const hit = await lookupType(sql, q.qname.namespace, q.qname.localName, profile);
      if (!hit) {
        return formatNotFound(
          `type ${q.qname.localName} in namespace ${q.qname.namespace}`,
          profile,
        );
      }
      return formatSymbolReport(
        hit.kind === "simpleType" ? "SimpleType" : "ComplexType",
        hit,
        profile,
      );
    }

    case "ooxml_children": {
      const q = parseQName(String(args.qname ?? ""));
      if (!q.ok) return formatNotFound(`could not parse qname: ${q.reason}`);
      const { typeSym, elementSym } = await resolveContentOwner(
        sql,
        q.qname.namespace,
        q.qname.localName,
        profile,
        true,
      );
      if (!typeSym) {
        return formatNotFound(
          `children for ${q.qname.localName} in namespace ${q.qname.namespace}`,
          profile,
        );
      }
      const children = await getChildren(sql, typeSym.id, profile);
      return formatChildrenReport(elementSym, typeSym, children, profile);
    }

    case "ooxml_attributes": {
      const q = parseQName(String(args.qname ?? ""));
      if (!q.ok) return formatNotFound(`could not parse qname: ${q.reason}`);
      const { typeSym, elementSym } = await resolveContentOwner(
        sql,
        q.qname.namespace,
        q.qname.localName,
        profile,
        false,
      );
      if (!typeSym) {
        return formatNotFound(
          `attributes for ${q.qname.localName} in namespace ${q.qname.namespace}`,
          profile,
        );
      }
      const attrs = await getAttributes(sql, typeSym.id, profile);
      return formatAttributesReport(elementSym, typeSym, attrs, profile);
    }

    case "ooxml_enum": {
      const q = parseQName(String(args.qname ?? ""));
      if (!q.ok) return formatNotFound(`could not parse qname: ${q.reason}`);
      const sym = await lookupType(sql, q.qname.namespace, q.qname.localName, profile);
      if (!sym || sym.kind !== "simpleType") {
        return formatNotFound(
          `simpleType ${q.qname.localName} in namespace ${q.qname.namespace}`,
          profile,
        );
      }
      const enums = await getEnums(sql, sym.id, profile);
      return formatEnumReport(sym, enums, profile);
    }

    case "ooxml_namespace_info": {
      const uri = String(args.uri ?? "");
      if (!uri) return formatNotFound("namespace URI not provided");
      const info = await getNamespaceInfo(sql, uri);
      if (!info) return formatNotFound(`namespace URI '${uri}' not present in any profile`);
      return formatNamespaceReport(info);
    }

    default: {
      const _exhaustive: never = name;
      throw new Error(`Unhandled OOXML tool: ${_exhaustive}`);
    }
  }
}

// --- Formatting --------------------------------------------------------

/**
 * Make a value safe for a markdown table cell: a raw `|` would start a new
 * column and a newline would end the row.
 */
function escapeCell(value: string): string {
  return value.replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
}

/** Report for a single element or type symbol. */
function formatSymbolReport(label: string, hit: SymbolHit, profile: string): string {
  const lines: string[] = [];
  lines.push(`## ${label}: ${hit.localName}`);
  lines.push("");
  lines.push(`- profile: ${profile}`);
  lines.push(`- canonical: (vocabulary=${hit.vocabularyId}, kind=${hit.kind}, name=${hit.localName})`);
  lines.push(`- namespace: ${hit.namespaceUri}`);
  if (hit.typeRef) lines.push(`- type_ref: ${hit.typeRef}`);
  if (hit.sourceName) lines.push(`- source: ${hit.sourceName}`);
  lines.push("");
  lines.push("_behavior notes: none yet (Phase 5)._");
  return lines.join("\n");
}

/** Report listing child elements and group refs as a markdown table. */
function formatChildrenReport(
  element: SymbolHit | null,
  type: SymbolHit,
  children: ChildEdge[],
  profile: string,
): string {
  const lines: string[] = [];
  const heading = element
    ? `Children of ${element.localName} (via type ${type.localName})`
    : `Children of ${type.localName}`;
  lines.push(`## ${heading}`);
  lines.push("");
  lines.push(`- profile: ${profile}`);
  lines.push(`- type vocabulary: ${type.vocabularyId}`);
  lines.push(`- type namespace: ${type.namespaceUri}`);
  if (type.sourceName) lines.push(`- source: ${type.sourceName}`);
  lines.push("");

  if (children.length === 0) {
    lines.push("_no direct or inherited children._");
    lines.push("");
    lines.push("_behavior notes: none yet (Phase 5)._");
    return lines.join("\n");
  }

  lines.push("| order | kind | name | min | max | compositor | from |");
  lines.push("| --- | --- | --- | --- | --- | --- | --- |");
  for (const c of children) {
    const max = c.maxOccurs === null ? "unbounded" : String(c.maxOccurs);
    const comp = c.compositorKind ?? "-";
    const from = c.source === "self" ? "self" : `inherited (${escapeCell(c.owningTypeName)})`;
    lines.push(
      `| ${c.orderIndex} | ${c.kind} | ${escapeCell(c.localName)} | ${c.minOccurs} | ${max} | ${comp} | ${from} |`,
    );
  }
  lines.push("");
  lines.push(
    "_group entries are returned as-is; call `ooxml_children` on the group qname to expand them._",
  );
  lines.push("");
  lines.push("_behavior notes: none yet (Phase 5)._");
  return lines.join("\n");
}

/** Report listing attributes as a markdown table. */
function formatAttributesReport(
  element: SymbolHit | null,
  type: SymbolHit,
  attrs: AttrEntry[],
  profile: string,
): string {
  const lines: string[] = [];
  const heading = element
    ? `Attributes of ${element.localName} (via type ${type.localName})`
    : `Attributes of ${type.localName}`;
  lines.push(`## ${heading}`);
  lines.push("");
  lines.push(`- profile: ${profile}`);
  lines.push(`- type vocabulary: ${type.vocabularyId}`);
  if (type.sourceName) lines.push(`- source: ${type.sourceName}`);
  lines.push("");

  if (attrs.length === 0) {
    lines.push("_no attributes._");
    lines.push("");
    lines.push("_behavior notes: none yet (Phase 5)._");
    return lines.join("\n");
  }

  lines.push("| name | use | type | default | fixed | from |");
  lines.push("| --- | --- | --- | --- | --- | --- |");
  for (const a of attrs) {
    const owner = escapeCell(a.owningName);
    const from =
      a.source === "self"
        ? "self"
        : a.source === "inherited"
          ? `inherited (${owner})`
          : `attributeGroup (${owner})`;
    const typeCell = escapeCell(a.typeRef ?? "-");
    const defaultCell = escapeCell(a.defaultValue ?? "-");
    const fixedCell = escapeCell(a.fixedValue ?? "-");
    lines.push(
      `| ${escapeCell(a.localName)} | ${a.attrUse} | ${typeCell} | ${defaultCell} | ${fixedCell} | ${from} |`,
    );
  }
  lines.push("");
  lines.push("_behavior notes: none yet (Phase 5)._");
  return lines.join("\n");
}

/** Report listing the enumeration values of a simpleType. */
function formatEnumReport(sym: SymbolHit, enums: EnumEntry[], profile: string): string {
  const lines: string[] = [];
  lines.push(`## Enum values for ${sym.localName}`);
  lines.push("");
  lines.push(`- profile: ${profile}`);
  lines.push(`- vocabulary: ${sym.vocabularyId}`);
  lines.push(`- namespace: ${sym.namespaceUri}`);
  if (sym.sourceName) lines.push(`- source: ${sym.sourceName}`);
  lines.push("");
  if (enums.length === 0) {
    lines.push("_no enum values; this simpleType is constrained by base type or pattern only._");
  } else {
    for (const e of enums) lines.push(`- ${e.value}`);
  }
  lines.push("");
  lines.push("_behavior notes: none yet (Phase 5)._");
  return lines.join("\n");
}

/** Report summarizing a namespace: vocabularies and per-profile symbol counts. */
function formatNamespaceReport(info: NamespaceInfo): string {
  const lines: string[] = [];
  lines.push(`## Namespace ${info.uri}`);
  lines.push("");
  lines.push(`- vocabularies: ${info.vocabularies.join(", ") || "(none)"}`);
  if (info.profiles.length === 0) {
    lines.push("- profiles: (no symbols in any profile)");
  } else {
    lines.push("- profiles:");
    for (const p of info.profiles) lines.push(`  - ${p.name}: ${p.symbolCount} symbols`);
  }
  return lines.join("\n");
}

/**
 * "Not found" card with hints on alternative qname formats.
 *
 * @param what    Description of what was searched for (or why parsing failed).
 * @param profile When given, the card names the profile that was searched.
 */
function formatNotFound(what: string, profile?: string): string {
  const lines: string[] = [];
  lines.push(`## Not found: ${what}`);
  if (profile) lines.push(`Searched in profile '${profile}'.`);
  lines.push("");
  lines.push("Try one of:");
  lines.push("- a known prefix qname like `w:tbl`, `r:id`, `s:ST_OnOff`, `m:oMath`, `a:blip`");
  lines.push("- Clark form `{namespace-uri}localName`");
  lines.push("- a different profile (currently only `transitional` is populated)");
  return lines.join("\n");
}
+ */ + +import { join } from "node:path"; +import { afterAll, beforeAll, expect, test } from "bun:test"; +import { createDbClient, type DbClient } from "../../packages/shared/src/db/index.ts"; +import { ingestSchemaSet } from "../../scripts/ingest-xsd/ingest.ts"; +import { + getAttributes, + getChildren, + getEnums, + getNamespaceInfo, + lookupElement, + lookupSymbolByTypeRef, + lookupType, + parseQName, +} from "../../apps/mcp-server/src/ooxml-queries.ts"; + +const FIXTURES_DIR = join(import.meta.dir, "..", "ingest-xsd", "fixtures"); +const databaseUrl = process.env.DATABASE_URL; +if (!databaseUrl) throw new Error("Missing DATABASE_URL for integration tests"); + +let db: DbClient; + +const TRUNCATE_SQL = ` + TRUNCATE + behavior_notes, + xsd_enums, + xsd_inheritance_edges, + xsd_group_edges, + xsd_attr_edges, + xsd_child_edges, + xsd_compositors, + xsd_symbol_profiles, + xsd_symbols, + xsd_namespaces, + xsd_profiles + RESTART IDENTITY CASCADE +`; + +beforeAll(async () => { + db = createDbClient(databaseUrl); + await db.sql` + INSERT INTO reference_sources (name, kind) + VALUES ('ecma-376-transitional', 'xsd') + ON CONFLICT (name) DO NOTHING + `; + await db.sql.unsafe(TRUNCATE_SQL); + await ingestSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); +}); + +afterAll(async () => { + await db.sql.unsafe(TRUNCATE_SQL); + await db.close(); +}); + +const WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; +const SHARED_NS = "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes"; + +test("parseQName: prefixed, Clark, bare", () => { + const a = parseQName("w:tbl"); + expect(a.ok).toBe(true); + if (a.ok) { + expect(a.qname.namespace).toBe(WML_NS); + expect(a.qname.localName).toBe("tbl"); + } + + const b = parseQName("{http://example.com}foo"); + expect(b.ok).toBe(true); + if (b.ok) { + expect(b.qname.namespace).toBe("http://example.com"); + 
expect(b.qname.localName).toBe("foo"); + } + + const c = parseQName("CT_Tbl"); + expect(c.ok).toBe(true); + if (c.ok) expect(c.qname.namespace).toBe(WML_NS); // bare default + + const d = parseQName("zzz:something"); + expect(d.ok).toBe(false); +}); + +test("lookupElement: top-level element with type_ref", async () => { + const hit = await lookupElement(db.sql, WML_NS, "document", "transitional"); + expect(hit?.localName).toBe("document"); + expect(hit?.kind).toBe("element"); + expect(hit?.typeRef).toBe(`{${WML_NS}}CT_Empty`); + expect(hit?.profileName).toBe("transitional"); + expect(hit?.namespaceUri).toBe(WML_NS); +}); + +test("lookupElement: local element (text inside CT_Para) is in the profile", async () => { + const hit = await lookupElement(db.sql, WML_NS, "text", "transitional"); + expect(hit).not.toBeNull(); + expect(hit?.typeRef).toBe("{http://www.w3.org/2001/XMLSchema}string"); +}); + +test("lookupType: complexType vs simpleType disambiguation", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_Para", "transitional"); + expect(ct?.kind).toBe("complexType"); + + const st = await lookupType(db.sql, WML_NS, "ST_Jc", "transitional"); + expect(st?.kind).toBe("simpleType"); + + const sharedSt = await lookupType(db.sql, SHARED_NS, "ST_OnOff", "transitional"); + expect(sharedSt?.vocabularyId).toBe("shared-types"); +}); + +test("lookupSymbolByTypeRef resolves Clark form", async () => { + const hit = await lookupSymbolByTypeRef(db.sql, `{${WML_NS}}CT_Empty`, "transitional"); + expect(hit?.localName).toBe("CT_Empty"); + expect(hit?.kind).toBe("complexType"); +}); + +test("getChildren: CT_Para has the local 'text' element via its sequence", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_Para", "transitional"); + if (!ct) throw new Error("CT_Para not found"); + const children = await getChildren(db.sql, ct.id, "transitional"); + expect(children).toHaveLength(1); + expect(children[0].localName).toBe("text"); + 
expect(children[0].compositorKind).toBe("sequence"); + expect(children[0].source).toBe("self"); +}); + +test("getChildren: CT_Body returns ordered mix of elements + group ref", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_Body", "transitional"); + if (!ct) throw new Error("CT_Body not found"); + const children = await getChildren(db.sql, ct.id, "transitional"); + // CT_Body content (top sequence): element ref="document", choice(group EG_PContent, element name="break") + // getChildren returns the top sequence's edges; the nested choice's content is reachable via compositorId + // pivot but not flattened automatically. + const localNames = children.map((c) => c.localName).sort(); + expect(localNames).toContain("document"); + expect(localNames).toContain("EG_PContent"); + expect(localNames).toContain("break"); +}); + +test("getChildren: inheritance is unioned (CT_Extended inherits from CT_Empty)", async () => { + // CT_Extended extends CT_Empty (which has no content); CT_Extended itself has no + // content model either, so children should be empty. 
+ const ct = await lookupType(db.sql, WML_NS, "CT_Extended", "transitional"); + if (!ct) throw new Error("CT_Extended not found"); + const children = await getChildren(db.sql, ct.id, "transitional"); + expect(children).toHaveLength(0); +}); + +test("getAttributes: CT_Para has 'bold' with type_ref to ST_OnOff", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_Para", "transitional"); + if (!ct) throw new Error("CT_Para not found"); + const attrs = await getAttributes(db.sql, ct.id, "transitional"); + const bold = attrs.find((a) => a.localName === "bold"); + expect(bold?.attrUse).toBe("optional"); + expect(bold?.typeRef).toBe(`{${SHARED_NS}}ST_OnOff`); +}); + +test("getAttributes: CT_TableUser unfolds AG_TableProps via attributeGroup ref", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_TableUser", "transitional"); + if (!ct) throw new Error("CT_TableUser not found"); + const attrs = await getAttributes(db.sql, ct.id, "transitional"); + const names = attrs.map((a) => a.localName).sort(); + // caption is direct, cols comes from AG_TableProps. 
+ expect(names).toContain("caption"); + expect(names).toContain("cols"); + + const cols = attrs.find((a) => a.localName === "cols"); + expect(cols?.source).toBe("attributeGroup"); + expect(cols?.owningName).toBe("AG_TableProps"); + + const caption = attrs.find((a) => a.localName === "caption"); + expect(caption?.attrUse).toBe("required"); +}); + +test("getAttributes: CT_Extended inherits 'extra' (declared on the extension)", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_Extended", "transitional"); + if (!ct) throw new Error("CT_Extended not found"); + const attrs = await getAttributes(db.sql, ct.id, "transitional"); + const extra = attrs.find((a) => a.localName === "extra"); + expect(extra?.attrUse).toBe("optional"); + expect(extra?.typeRef).toBe("{http://www.w3.org/2001/XMLSchema}string"); +}); + +test("getEnums: ST_Jc returns left/center/right in order", async () => { + const st = await lookupType(db.sql, WML_NS, "ST_Jc", "transitional"); + if (!st) throw new Error("ST_Jc not found"); + const enums = await getEnums(db.sql, st.id, "transitional"); + expect(enums.map((e) => e.value)).toEqual(["left", "center", "right"]); +}); + +test("getNamespaceInfo: reports profile membership and vocabularies", async () => { + const info = await getNamespaceInfo(db.sql, WML_NS); + expect(info?.uri).toBe(WML_NS); + expect(info?.vocabularies).toContain("wml-main"); + expect(info?.profiles.find((p) => p.name === "transitional")?.symbolCount).toBeGreaterThan(0); + + // Unknown URI → null + const none = await getNamespaceInfo(db.sql, "http://example.com/does-not-exist"); + expect(none).toBeNull(); +}); + +test("lookupElement: returns null for unknown qname", async () => { + const hit = await lookupElement(db.sql, WML_NS, "doesNotExist", "transitional"); + expect(hit).toBeNull(); +}); + +test("element-to-type chain: lookup w-style element, follow type_ref, fetch children", async () => { + // document → CT_Empty (no content) ⇒ children empty. 
+ const elem = await lookupElement(db.sql, WML_NS, "document", "transitional"); + expect(elem).not.toBeNull(); + if (!elem?.typeRef) throw new Error("expected type_ref"); + const type = await lookupSymbolByTypeRef(db.sql, elem.typeRef, "transitional"); + expect(type?.localName).toBe("CT_Empty"); + const children = await getChildren(db.sql, type!.id, "transitional"); + expect(children).toHaveLength(0); +}); From 7b0898cb2a1ee4f80c0e41cf5d50174ffc06af4e Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 14:41:56 -0300 Subject: [PATCH 14/24] chore(mcp): split ooxml dispatch + add local e2e harness The deployed Worker uses @neondatabase/serverless (HTTP-only) which can't talk to local Postgres, so callOoxmlTool is now a thin Neon-creating wrapper around runOoxmlTool, which takes any tagged-template sql function. Tests and the new CLI use postgres.js against local Docker; the Worker keeps Neon. scripts/ooxml-call.ts dispatches the same code path the Worker uses. Five PLAN.md acceptance queries verified against the real WML closure: ooxml_children("w:tbl") -> EG_RangeMarkupElements (group, 0..unbounded), tblPr (1..1), tblGrid (1..1), EG_ContentRowContent (group, 0..unbounded) ooxml_lookup_element("w:tblGrid") -> type_ref={...wml-main}CT_TblGridBase; in CT_Tbl context min/max=1 (required, per Q1) ooxml_attributes("w:jc") -> single attr 'val' (required), type_ref to ST_Jc ooxml_enum("w:ST_Jc") -> 12 values incl. 
start/end (Strict) and left/right (Transitional) ooxml_lookup_element("w:notARealElement") -> 'Not found' card with profile and recovery hints --- apps/mcp-server/src/ooxml-tools.ts | 37 ++++++++++++++++---- package.json | 1 + scripts/ooxml-call.ts | 56 ++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 6 deletions(-) create mode 100644 scripts/ooxml-call.ts diff --git a/apps/mcp-server/src/ooxml-tools.ts b/apps/mcp-server/src/ooxml-tools.ts index 7d558ff..44f2947 100644 --- a/apps/mcp-server/src/ooxml-tools.ts +++ b/apps/mcp-server/src/ooxml-tools.ts @@ -16,8 +16,6 @@ import { type AttrEntry, type ChildEdge, type EnumEntry, - type NamespaceInfo, - type SymbolHit, getAttributes, getChildren, getEnums, @@ -25,7 +23,9 @@ import { lookupElement, lookupSymbolByTypeRef, lookupType, + type NamespaceInfo, parseQName, + type SymbolHit, } from "./ooxml-queries"; export const DEFAULT_PROFILE = "transitional"; @@ -79,7 +79,8 @@ export const OOXML_TOOL_DEFS = [ properties: { qname: { type: "string", - description: "Element, complexType, or group qname (e.g. 'w:tbl', 'CT_Tbl', 'EG_PContent').", + description: + "Element, complexType, or group qname (e.g. 'w:tbl', 'CT_Tbl', 'EG_PContent').", }, profile: { type: "string", description: "Profile name (default: 'transitional')." }, }, @@ -143,12 +144,30 @@ export function isOoxmlTool(name: string): name is OoxmlToolName { // biome-ignore lint/suspicious/noExplicitAny: neon's tagged template is loosely typed. type Sql = any; +/** + * Worker-side entry point: constructs a Neon HTTP client from env and dispatches. + * Local CLIs and tests should call `runOoxmlTool` directly with their own sql + * (e.g. postgres.js against a local Postgres) to avoid the Neon HTTP path. 
+ */ export async function callOoxmlTool( name: OoxmlToolName, args: Record, env: OoxmlEnv, ): Promise { - const sql: Sql = neon(env.DATABASE_URL); + const sql = neon(env.DATABASE_URL); + return runOoxmlTool(name, args, sql); +} + +/** + * Driver-agnostic dispatch. `sql` is any tagged-template SQL function whose + * shape matches `(strings, ...values) => Promise` (Neon and postgres.js + * both qualify). + */ +export async function runOoxmlTool( + name: OoxmlToolName, + args: Record, + sql: Sql, +): Promise { const profile = (args.profile as string | undefined) ?? DEFAULT_PROFILE; switch (name) { @@ -175,7 +194,11 @@ export async function callOoxmlTool( profile, ); } - return formatSymbolReport(hit.kind === "simpleType" ? "SimpleType" : "ComplexType", hit, profile); + return formatSymbolReport( + hit.kind === "simpleType" ? "SimpleType" : "ComplexType", + hit, + profile, + ); } case "ooxml_children": { @@ -286,7 +309,9 @@ function formatSymbolReport(label: string, hit: SymbolHit, profile: string): str lines.push(`## ${label}: ${hit.localName}`); lines.push(""); lines.push(`- profile: ${profile}`); - lines.push(`- canonical: (vocabulary=${hit.vocabularyId}, kind=${hit.kind}, name=${hit.localName})`); + lines.push( + `- canonical: (vocabulary=${hit.vocabularyId}, kind=${hit.kind}, name=${hit.localName})`, + ); lines.push(`- namespace: ${hit.namespaceUri}`); if (hit.typeRef) lines.push(`- type_ref: ${hit.typeRef}`); if (hit.sourceName) lines.push(`- source: ${hit.sourceName}`); diff --git a/package.json b/package.json index 2bf9e9a..3c431ab 100644 --- a/package.json +++ b/package.json @@ -24,6 +24,7 @@ "xsd:fetch": "bun scripts/fetch-xsd.ts", "xsd:smoke": "bun scripts/ingest-xsd/smoke.ts", "xsd:ingest": "bun scripts/ingest-xsd/ingest.ts", + "ooxml:call": "bun scripts/ooxml-call.ts", "test": "bun test tests/", "ingest": "bun scripts/ingest/pipeline.ts", "ingest:chunk": "bun scripts/ingest/chunk.ts", diff --git a/scripts/ooxml-call.ts b/scripts/ooxml-call.ts new file 
mode 100644 index 0000000..107e38d --- /dev/null +++ b/scripts/ooxml-call.ts @@ -0,0 +1,56 @@ +/** + * Local end-to-end harness for the Phase 4 OOXML tools. + * + * The deployed Worker uses @neondatabase/serverless (HTTP-only), which can't + * talk to local Postgres. This CLI bypasses the Worker and dispatches through + * `runOoxmlTool` directly with a postgres.js-backed sql function, so the same + * code path that the Worker exercises runs end-to-end against the dev DB. + * + * Usage: + * bun scripts/ooxml-call.ts + * bun scripts/ooxml-call.ts ooxml_children '{"qname":"w:tbl"}' + * bun scripts/ooxml-call.ts ooxml_attributes '{"qname":"w:pBdr"}' + * bun scripts/ooxml-call.ts ooxml_enum '{"qname":"w:ST_Jc"}' + * + * Environment: + * DATABASE_URL - postgres connection string (defaults to local docker) + */ + +import { createDbClient } from "../packages/shared/src/db/index.ts"; +import { + type OoxmlToolName, + isOoxmlTool, + runOoxmlTool, +} from "../apps/mcp-server/src/ooxml-tools.ts"; + +async function main() { + const [, , toolArg, argsArg] = process.argv; + if (!toolArg) { + console.error("Usage: bun scripts/ooxml-call.ts [jsonArgs]"); + console.error("Tools: ooxml_lookup_element, ooxml_lookup_type, ooxml_children,"); + console.error(" ooxml_attributes, ooxml_enum, ooxml_namespace_info"); + process.exit(1); + } + if (!isOoxmlTool(toolArg)) { + console.error(`Unknown tool: ${toolArg}`); + process.exit(1); + } + + const args: Record = argsArg ? JSON.parse(argsArg) : {}; + + const databaseUrl = + process.env.DATABASE_URL ?? 
"postgresql://postgres:postgres@localhost:5432/ecma_spec"; + const db = createDbClient(databaseUrl); + + try { + const text = await runOoxmlTool(toolArg as OoxmlToolName, args, db.sql); + console.log(text); + } finally { + await db.close(); + } +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); From 99d149f3d91d61c86b76eb8c7d45a2415f503171 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 15:23:21 -0300 Subject: [PATCH 15/24] fix(mcp): correct inheritance order, compositor flattening, and nested attributeGroup walk Three Phase 4 query bugs surfaced by review against real WML schemas. Each one would have produced wrong structural answers from the new tools before the dogfood window even opened. P2 - Inheritance ordering for complexContent/extension. XSD says base content comes before the extension's own content (e.g. CT_PPr extends CT_PPrBase: pStyle and friends from base, then rPr/sectPr from the extension). The old code walked the chain self- first, so ooxml_children("w:pPr") would have surfaced the extension's rPr/sectPr ahead of inherited pStyle. complexContent/restriction is now also handled correctly: derived REPLACES base content, so the base is no longer included for restriction relations. P2 - Compositor flattening across nested particles. order_index is local to each compositor. The old query joined child_edges + group_edges across ALL compositors of a type and sorted by order_index alone, so a nested choice's children (which restart at order 0) sorted before later siblings of the outer sequence. WML's CT_Object would have reported the inner choice's first child before drawing. Fixed with a recursive walkCompositor that does DFS through parent_compositor_id, emitting children in true document order. Each ChildEdge now carries a compositorPath like ["sequence(1..1)", "choice(0..unbounded)"] for downstream rendering. P2 - Recursive attributeGroup refs. 
The previous code only fetched direct xsd_attr_edges from a referenced group, not the group's own xsd_group_edges with ref_kind='attributeGroup'. VML's AG_AllCoreAttributes -> AG_CoreAttributes -> AG_Id/AG_Style chain would have lost most attributes. Now collectAttrsFromAttributeGroup recurses with a visited-set guard against cycles, so nested attributeGroup chains unfold completely. Tests: - 3 new query-layer tests cover each fix path against fixtures: CT_DerivedExtended verifies extension order, CT_NestedOrder verifies nested compositor flattening, CT_NestedAttrUser verifies the nested attributeGroup chain. - Fixture main.xsd grows: CT_BaseWithChildren / CT_DerivedExtended / CT_NestedOrder / AG_Inner / AG_Outer / CT_NestedAttrUser. Existing ingest counts updated to match. Test infra: bun's default 5s timeout was tight for the WML smoke ingest on a busy DB; bumped that test to 30s. The test runner now sequences the three test directories so the WML smoke and the fixture-ingest tests do not race for the same connection pool. Result: 39 pass / 0 fail across db / ingest / mcp-server. 
--- apps/mcp-server/src/mcp.ts | 7 +- apps/mcp-server/src/ooxml-queries.ts | 477 ++++++++++++++++--------- package.json | 2 +- scripts/ooxml-call.ts | 4 +- tests/ingest-xsd/fixtures/main.xsd | 40 +++ tests/ingest-xsd/ingest.test.ts | 44 ++- tests/ingest-xsd/parse-schema.test.ts | 6 +- tests/mcp-server/ooxml-queries.test.ts | 50 +++ 8 files changed, 441 insertions(+), 189 deletions(-) diff --git a/apps/mcp-server/src/mcp.ts b/apps/mcp-server/src/mcp.ts index 6837a28..96a4fa9 100644 --- a/apps/mcp-server/src/mcp.ts +++ b/apps/mcp-server/src/mcp.ts @@ -7,12 +7,7 @@ import { createDb } from "./db"; import { embedQuery } from "./embeddings"; import type { Env } from "./index"; -import { - OOXML_TOOL_DEFS, - callOoxmlTool, - isOoxmlTool, - ooxmlToolsEnabled, -} from "./ooxml-tools"; +import { callOoxmlTool, isOoxmlTool, OOXML_TOOL_DEFS, ooxmlToolsEnabled } from "./ooxml-tools"; // JSON-RPC types interface JsonRpcRequest { diff --git a/apps/mcp-server/src/ooxml-queries.ts b/apps/mcp-server/src/ooxml-queries.ts index f827fcd..e155dc5 100644 --- a/apps/mcp-server/src/ooxml-queries.ts +++ b/apps/mcp-server/src/ooxml-queries.ts @@ -39,9 +39,7 @@ export interface ParsedQName { rawPrefix: string | null; } -export type QNameParseResult = - | { ok: true; qname: ParsedQName } - | { ok: false; reason: string }; +export type QNameParseResult = { ok: true; qname: ParsedQName } | { ok: false; reason: string }; /** * Parse a user-supplied qname. Accepts: @@ -177,123 +175,222 @@ export interface ChildEdge { compositorKind: string | null; compositorId: number | null; parentCompositorId: number | null; + /** Compositor stack from outermost to direct parent, e.g. ["sequence", "choice(0..unbounded)"]. */ + compositorPath: string[]; source: "self" | "inherited"; owningTypeName: string; } -/** - * Collect inheritance ancestors of a type symbol (self first, then bases). - * Each entry is the symbol id and its name for surfacing in responses. 
- */ -async function collectInheritance( +interface InheritanceEdgeRow { + baseId: number; + relation: "extension" | "restriction"; +} + +async function getInheritanceEdge( sql: Sql, - rootSymbolId: number, + symbolId: number, profile: string, -): Promise> { +): Promise { const rows = await sql` - WITH RECURSIVE chain AS ( - SELECT s.id, s.local_name, s.vocabulary_id, 0 AS depth - FROM xsd_symbols s - WHERE s.id = ${rootSymbolId} - UNION ALL - SELECT base.id, base.local_name, base.vocabulary_id, c.depth + 1 - FROM chain c - JOIN xsd_inheritance_edges e ON e.symbol_id = c.id - JOIN xsd_profiles p ON p.id = e.profile_id - JOIN xsd_symbols base ON base.id = e.base_symbol_id - WHERE p.name = ${profile} - ) - SELECT id, local_name, vocabulary_id FROM chain ORDER BY depth + SELECT e.base_symbol_id, e.relation + FROM xsd_inheritance_edges e + JOIN xsd_profiles p ON p.id = e.profile_id + WHERE e.symbol_id = ${symbolId} AND p.name = ${profile} + LIMIT 1 `; - return rows.map((r: Record) => ({ - id: r.id as number, - localName: r.local_name as string, - vocabularyId: r.vocabulary_id as string, - })); + if (rows.length === 0) return null; + return { baseId: rows[0].base_symbol_id as number, relation: rows[0].relation as InheritanceEdgeRow["relation"] }; +} + +async function getSymbolName(sql: Sql, symbolId: number): Promise { + const rows = await sql`SELECT local_name FROM xsd_symbols WHERE id = ${symbolId} LIMIT 1`; + return (rows[0]?.local_name as string | undefined) ?? "(unknown)"; +} + +interface CompositorRow { + id: number; + kind: "sequence" | "choice" | "all"; + minOccurs: number; + maxOccurs: number | null; + orderIndex: number; +} + +function formatOccurs(min: number, max: number | null): string { + const maxStr = max === null ? "unbounded" : String(max); + if (min === 1 && max === 1) return "1..1"; + return `${min}..${maxStr}`; } /** - * Children of a type symbol, walking inheritance to union the bases' content. 
- * Returns elements (from xsd_child_edges) and group refs (from xsd_group_edges - * with ref_kind='group') in document order. Group refs are returned as-is; - * callers who want them flattened can call getChildren on the referenced group. + * Walk a single compositor's content tree in document order, descending into + * nested compositors. Each emitted child carries the full compositor path so + * callers can reconstruct nesting. */ -export async function getChildren( +async function walkCompositor( sql: Sql, - rootSymbolId: number, + compositor: CompositorRow, profile: string, + pathSoFar: string[], + source: ChildEdge["source"], + owningTypeName: string, ): Promise { - const chain = await collectInheritance(sql, rootSymbolId, profile); - if (chain.length === 0) return []; + const path = [...pathSoFar, `${compositor.kind}(${formatOccurs(compositor.minOccurs, compositor.maxOccurs)})`]; + + const elemRows = await sql` + SELECT 'element' AS entry_kind, s.local_name, s.vocabulary_id, ns.uri AS namespace_uri, + e.min_occurs, e.max_occurs, e.order_index, NULL::int AS nested_compositor_id + FROM xsd_child_edges e + JOIN xsd_symbols s ON s.id = e.child_symbol_id + LEFT JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id AND sp.profile_id = e.profile_id + LEFT JOIN xsd_namespaces ns ON ns.id = sp.namespace_id + JOIN xsd_profiles p ON p.id = e.profile_id + WHERE e.compositor_id = ${compositor.id} AND p.name = ${profile} + `; + const groupRows = await sql` + SELECT 'group' AS entry_kind, g.local_name, g.vocabulary_id, NULL AS namespace_uri, + ge.min_occurs, ge.max_occurs, ge.order_index, NULL::int AS nested_compositor_id + FROM xsd_group_edges ge + JOIN xsd_symbols g ON g.id = ge.group_symbol_id + JOIN xsd_profiles p ON p.id = ge.profile_id + WHERE ge.compositor_id = ${compositor.id} AND ge.ref_kind = 'group' AND p.name = ${profile} + `; + const nestedRows = await sql` + SELECT 'compositor' AS entry_kind, NULL AS local_name, NULL AS vocabulary_id, NULL AS namespace_uri, + 
c.min_occurs, c.max_occurs, c.order_index, c.id AS nested_compositor_id, c.kind + FROM xsd_compositors c + JOIN xsd_profiles p ON p.id = c.profile_id + WHERE c.parent_compositor_id = ${compositor.id} AND p.name = ${profile} + `; + + const all = [...elemRows, ...groupRows, ...nestedRows]; + all.sort((a, b) => (a.order_index as number) - (b.order_index as number)); const out: ChildEdge[] = []; - for (const ancestor of chain) { - const elemRows = await sql` - SELECT s.local_name, s.vocabulary_id, ns.uri AS namespace_uri, - e.min_occurs, e.max_occurs, e.order_index, - c.kind AS compositor_kind, c.id AS compositor_id, c.parent_compositor_id - FROM xsd_child_edges e - JOIN xsd_symbols s ON s.id = e.child_symbol_id - LEFT JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id AND sp.profile_id = e.profile_id - LEFT JOIN xsd_namespaces ns ON ns.id = sp.namespace_id - JOIN xsd_compositors c ON c.id = e.compositor_id - JOIN xsd_profiles p ON p.id = e.profile_id - WHERE e.parent_symbol_id = ${ancestor.id} AND p.name = ${profile} - ORDER BY e.order_index - `; - const groupRows = await sql` - SELECT g.local_name, g.vocabulary_id, - ge.min_occurs, ge.max_occurs, ge.order_index, - c.kind AS compositor_kind, ge.compositor_id, c.parent_compositor_id - FROM xsd_group_edges ge - JOIN xsd_symbols g ON g.id = ge.group_symbol_id - LEFT JOIN xsd_compositors c ON c.id = ge.compositor_id - JOIN xsd_profiles p ON p.id = ge.profile_id - WHERE ge.parent_symbol_id = ${ancestor.id} - AND ge.ref_kind = 'group' - AND p.name = ${profile} - ORDER BY ge.order_index - `; - - const ancestorEntries: ChildEdge[] = []; - for (const r of elemRows) { - ancestorEntries.push({ - kind: "element", - localName: r.local_name as string, - vocabularyId: r.vocabulary_id as string, - namespaceUri: (r.namespace_uri as string | null) ?? 
null, + for (const r of all) { + if (r.entry_kind === "compositor") { + const nested: CompositorRow = { + id: r.nested_compositor_id as number, + kind: r.kind as CompositorRow["kind"], minOccurs: r.min_occurs as number, maxOccurs: r.max_occurs as number | null, orderIndex: r.order_index as number, - compositorKind: r.compositor_kind as string | null, - compositorId: r.compositor_id as number | null, - parentCompositorId: r.parent_compositor_id as number | null, - source: ancestor.id === rootSymbolId ? "self" : "inherited", - owningTypeName: ancestor.localName, - }); - } - for (const r of groupRows) { - ancestorEntries.push({ - kind: "group", + }; + const inner = await walkCompositor(sql, nested, profile, path, source, owningTypeName); + out.push(...inner); + } else { + out.push({ + kind: r.entry_kind as "element" | "group", localName: r.local_name as string, vocabularyId: r.vocabulary_id as string, - namespaceUri: null, + namespaceUri: (r.namespace_uri as string | null) ?? null, minOccurs: r.min_occurs as number, maxOccurs: r.max_occurs as number | null, orderIndex: r.order_index as number, - compositorKind: r.compositor_kind as string | null, - compositorId: r.compositor_id as number | null, - parentCompositorId: r.parent_compositor_id as number | null, - source: ancestor.id === rootSymbolId ? "self" : "inherited", - owningTypeName: ancestor.localName, + compositorKind: compositor.kind, + compositorId: compositor.id, + parentCompositorId: null, + compositorPath: path, + source, + owningTypeName, }); } - ancestorEntries.sort((a, b) => a.orderIndex - b.orderIndex); - out.push(...ancestorEntries); } return out; } +/** + * Children of a type symbol in correct document order. Walks inheritance per + * XSD semantics: complexContent/extension prepends the base's effective content + * before the derived type's; complexContent/restriction REPLACES the base's + * content (we don't include the base). 
Within a type, walks the compositor + * tree DFS so nested sequences/choices flatten in document order. + * + * Group refs are returned as edges; resolve them by calling getChildren on the + * group symbol. + */ +export async function getChildren( + sql: Sql, + rootSymbolId: number, + profile: string, +): Promise { + return getChildrenRecursive(sql, rootSymbolId, profile, true); +} + +async function getChildrenRecursive( + sql: Sql, + symbolId: number, + profile: string, + isRoot: boolean, +): Promise { + const out: ChildEdge[] = []; + + // Inheritance: extension prepends base content; restriction replaces it. + const inherit = await getInheritanceEdge(sql, symbolId, profile); + if (inherit && inherit.relation === "extension") { + const base = await getChildrenRecursive(sql, inherit.baseId, profile, false); + // Already-emitted entries in `base` already carry their owning type name; + // flip their source to "inherited" relative to the root request. + for (const c of base) { + if (isRoot) c.source = "inherited"; + out.push(c); + } + } + + // Walk this type's own top-level compositors. + const topCompositors = await sql` + SELECT c.id, c.kind, c.min_occurs, c.max_occurs, c.order_index + FROM xsd_compositors c + JOIN xsd_profiles p ON p.id = c.profile_id + WHERE c.parent_symbol_id = ${symbolId} AND p.name = ${profile} + ORDER BY c.order_index + `; + const ownName = await getSymbolName(sql, symbolId); + const source: ChildEdge["source"] = isRoot ? "self" : "inherited"; + for (const r of topCompositors) { + const c: CompositorRow = { + id: r.id as number, + kind: r.kind as CompositorRow["kind"], + minOccurs: r.min_occurs as number, + maxOccurs: r.max_occurs as number | null, + orderIndex: r.order_index as number, + }; + const inner = await walkCompositor(sql, c, profile, [], source, ownName); + out.push(...inner); + } + + // Top-level group refs that hang directly off the type (compositor_id IS NULL). 
+ const topLevelGroups = await sql` + SELECT g.local_name, g.vocabulary_id, ge.min_occurs, ge.max_occurs, ge.order_index + FROM xsd_group_edges ge + JOIN xsd_symbols g ON g.id = ge.group_symbol_id + JOIN xsd_profiles p ON p.id = ge.profile_id + WHERE ge.parent_symbol_id = ${symbolId} + AND ge.ref_kind = 'group' + AND ge.compositor_id IS NULL + AND p.name = ${profile} + ORDER BY ge.order_index + `; + for (const r of topLevelGroups) { + out.push({ + kind: "group", + localName: r.local_name as string, + vocabularyId: r.vocabulary_id as string, + namespaceUri: null, + minOccurs: r.min_occurs as number, + maxOccurs: r.max_occurs as number | null, + orderIndex: r.order_index as number, + compositorKind: null, + compositorId: null, + parentCompositorId: null, + compositorPath: [], + source, + owningTypeName: ownName, + }); + } + + return out; +} + export interface AttrEntry { localName: string; attrUse: "required" | "optional" | "prohibited"; @@ -305,81 +402,145 @@ export interface AttrEntry { } /** - * Attributes on a type symbol, including those from base types (inheritance) - * and from attributeGroup refs (recursively). + * Attributes on a type symbol. Walks inheritance per XSD semantics + * (extension prepends base attrs; restriction replaces them) and recurses + * through attributeGroup refs, including refs nested inside other + * attributeGroups. Cycles are guarded by a visited-set. + * + * Names are de-duplicated: a derived type's redeclaration of an inherited + * attribute wins, so the first occurrence in walk order is what surfaces. 
*/ export async function getAttributes( sql: Sql, rootSymbolId: number, profile: string, ): Promise { - const chain = await collectInheritance(sql, rootSymbolId, profile); const out: AttrEntry[] = []; - const seenAttrs = new Set(); // dedupe by local name; derived overrides base - - for (const ancestor of chain) { - const directAttrs = await sql` - SELECT a.local_name, a.attr_use, a.default_value, a.fixed_value, a.type_ref, a.order_index - FROM xsd_attr_edges a - JOIN xsd_profiles p ON p.id = a.profile_id - WHERE a.symbol_id = ${ancestor.id} AND p.name = ${profile} - ORDER BY a.order_index - `; - for (const r of directAttrs) { - const name = r.local_name as string; - if (seenAttrs.has(name)) continue; - seenAttrs.add(name); - out.push({ - localName: name, - attrUse: r.attr_use as "required" | "optional" | "prohibited", - defaultValue: r.default_value as string | null, - fixedValue: r.fixed_value as string | null, - typeRef: r.type_ref as string | null, - source: ancestor.id === rootSymbolId ? 
"self" : "inherited", - owningName: ancestor.localName, - }); - } + const seenAttrs = new Set(); + const visitedGroups = new Set(); + await collectAttrsForType(sql, rootSymbolId, profile, true, out, seenAttrs, visitedGroups); + return out; +} - // attributeGroup refs (resolve recursively) - const agRefs = await sql` - SELECT ge.group_symbol_id, g.local_name AS group_name - FROM xsd_group_edges ge - JOIN xsd_symbols g ON g.id = ge.group_symbol_id - JOIN xsd_profiles p ON p.id = ge.profile_id - WHERE ge.parent_symbol_id = ${ancestor.id} - AND ge.ref_kind = 'attributeGroup' - AND p.name = ${profile} - ORDER BY ge.order_index - `; - for (const ag of agRefs) { - const groupName = ag.group_name as string; - const innerChain = await collectInheritance(sql, ag.group_symbol_id as number, profile); - for (const inner of innerChain) { - const innerAttrs = await sql` - SELECT a.local_name, a.attr_use, a.default_value, a.fixed_value, a.type_ref, a.order_index - FROM xsd_attr_edges a - JOIN xsd_profiles p ON p.id = a.profile_id - WHERE a.symbol_id = ${inner.id} AND p.name = ${profile} - ORDER BY a.order_index - `; - for (const r of innerAttrs) { - const name = r.local_name as string; - if (seenAttrs.has(name)) continue; - seenAttrs.add(name); - out.push({ - localName: name, - attrUse: r.attr_use as "required" | "optional" | "prohibited", - defaultValue: r.default_value as string | null, - fixedValue: r.fixed_value as string | null, - typeRef: r.type_ref as string | null, - source: "attributeGroup", - owningName: groupName, - }); - } - } - } +async function collectAttrsForType( + sql: Sql, + symbolId: number, + profile: string, + isRoot: boolean, + out: AttrEntry[], + seenAttrs: Set, + visitedGroups: Set, +): Promise { + // Per XSD: extension prepends base; restriction replaces. We always emit base + // first when extending so derived declarations correctly override later. 
+ const inherit = await getInheritanceEdge(sql, symbolId, profile); + if (inherit && inherit.relation === "extension") { + await collectAttrsForType(sql, inherit.baseId, profile, false, out, seenAttrs, visitedGroups); + } + + const ownName = await getSymbolName(sql, symbolId); + + // Direct attribute declarations on this symbol (whether complexType or + // attributeGroup; both can carry xsd:attribute children). + const directAttrs = await sql` + SELECT a.local_name, a.attr_use, a.default_value, a.fixed_value, a.type_ref, a.order_index + FROM xsd_attr_edges a + JOIN xsd_profiles p ON p.id = a.profile_id + WHERE a.symbol_id = ${symbolId} AND p.name = ${profile} + ORDER BY a.order_index + `; + for (const r of directAttrs) { + const name = r.local_name as string; + if (seenAttrs.has(name)) continue; + seenAttrs.add(name); + out.push({ + localName: name, + attrUse: r.attr_use as "required" | "optional" | "prohibited", + defaultValue: r.default_value as string | null, + fixedValue: r.fixed_value as string | null, + typeRef: r.type_ref as string | null, + source: isRoot ? "self" : "inherited", + owningName: ownName, + }); + } + + // attributeGroup refs hanging off this symbol; recurse into each. 
+ const agRefs = await sql` + SELECT ge.group_symbol_id + FROM xsd_group_edges ge + JOIN xsd_profiles p ON p.id = ge.profile_id + WHERE ge.parent_symbol_id = ${symbolId} + AND ge.ref_kind = 'attributeGroup' + AND p.name = ${profile} + ORDER BY ge.order_index + `; + for (const ag of agRefs) { + await collectAttrsFromAttributeGroup( + sql, + ag.group_symbol_id as number, + profile, + out, + seenAttrs, + visitedGroups, + ); + } +} + +async function collectAttrsFromAttributeGroup( + sql: Sql, + groupSymbolId: number, + profile: string, + out: AttrEntry[], + seenAttrs: Set, + visitedGroups: Set, +): Promise { + if (visitedGroups.has(groupSymbolId)) return; + visitedGroups.add(groupSymbolId); + + const groupName = await getSymbolName(sql, groupSymbolId); + + const directAttrs = await sql` + SELECT a.local_name, a.attr_use, a.default_value, a.fixed_value, a.type_ref, a.order_index + FROM xsd_attr_edges a + JOIN xsd_profiles p ON p.id = a.profile_id + WHERE a.symbol_id = ${groupSymbolId} AND p.name = ${profile} + ORDER BY a.order_index + `; + for (const r of directAttrs) { + const name = r.local_name as string; + if (seenAttrs.has(name)) continue; + seenAttrs.add(name); + out.push({ + localName: name, + attrUse: r.attr_use as "required" | "optional" | "prohibited", + defaultValue: r.default_value as string | null, + fixedValue: r.fixed_value as string | null, + typeRef: r.type_ref as string | null, + source: "attributeGroup", + owningName: groupName, + }); + } + + // Nested attributeGroup refs inside this group. 
+ const innerRefs = await sql` + SELECT ge.group_symbol_id + FROM xsd_group_edges ge + JOIN xsd_profiles p ON p.id = ge.profile_id + WHERE ge.parent_symbol_id = ${groupSymbolId} + AND ge.ref_kind = 'attributeGroup' + AND p.name = ${profile} + ORDER BY ge.order_index + `; + for (const ref of innerRefs) { + await collectAttrsFromAttributeGroup( + sql, + ref.group_symbol_id as number, + profile, + out, + seenAttrs, + visitedGroups, + ); } - return out; } export interface EnumEntry { @@ -387,11 +548,7 @@ export interface EnumEntry { orderIndex: number; } -export async function getEnums( - sql: Sql, - symbolId: number, - profile: string, -): Promise { +export async function getEnums(sql: Sql, symbolId: number, profile: string): Promise { const rows = await sql` SELECT e.value, e.order_index FROM xsd_enums e diff --git a/package.json b/package.json index 3c431ab..cd061c2 100644 --- a/package.json +++ b/package.json @@ -25,7 +25,7 @@ "xsd:smoke": "bun scripts/ingest-xsd/smoke.ts", "xsd:ingest": "bun scripts/ingest-xsd/ingest.ts", "ooxml:call": "bun scripts/ooxml-call.ts", - "test": "bun test tests/", + "test": "bun test tests/db/ && bun test tests/ingest-xsd/ && bun test tests/mcp-server/", "ingest": "bun scripts/ingest/pipeline.ts", "ingest:chunk": "bun scripts/ingest/chunk.ts", "ingest:embed": "bun scripts/ingest/embed.ts", diff --git a/scripts/ooxml-call.ts b/scripts/ooxml-call.ts index 107e38d..e404089 100644 --- a/scripts/ooxml-call.ts +++ b/scripts/ooxml-call.ts @@ -16,12 +16,12 @@ * DATABASE_URL - postgres connection string (defaults to local docker) */ -import { createDbClient } from "../packages/shared/src/db/index.ts"; import { - type OoxmlToolName, isOoxmlTool, + type OoxmlToolName, runOoxmlTool, } from "../apps/mcp-server/src/ooxml-tools.ts"; +import { createDbClient } from "../packages/shared/src/db/index.ts"; async function main() { const [, , toolArg, argsArg] = process.argv; diff --git a/tests/ingest-xsd/fixtures/main.xsd 
b/tests/ingest-xsd/fixtures/main.xsd index 2c44a6a..1937e08 100644 --- a/tests/ingest-xsd/fixtures/main.xsd +++ b/tests/ingest-xsd/fixtures/main.xsd @@ -55,5 +55,45 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/ingest-xsd/ingest.test.ts b/tests/ingest-xsd/ingest.test.ts index 8367feb..72d1d52 100644 --- a/tests/ingest-xsd/ingest.test.ts +++ b/tests/ingest-xsd/ingest.test.ts @@ -122,7 +122,7 @@ test("ingest writes inheritance edges for extension and restriction", async () = // ST_Jc restricts xsd:string (simpleType) // ST_OnOff restricts xsd:boolean // ST_String restricts xsd:string - expect(stats.inheritanceEdgesInserted).toBe(5); + expect(stats.inheritanceEdgesInserted).toBe(6); expect(stats.inheritanceUnresolved).toBe(0); // Verify the CT_Extended → CT_Empty extension edge. @@ -214,17 +214,21 @@ test("ingest writes compositors and child edges for nested content models", asyn }); // Fixture content models: - // CT_Para: sequence -> element name="text" - // CT_Body: sequence -> [ element ref="document", - // choice (minOccurs=0, maxOccurs=unbounded) -> [ - // group ref="EG_PContent", - // element name="break" - // ]] - // EG_PContent: choice -> element name="r" - // Compositors total: CT_Para(1) + CT_Body(2) + EG_PContent(1) = 4 - expect(stats.compositorsInserted).toBe(4); + // CT_Para: sequence -> element name="text" + // CT_Body: sequence -> [ element ref="document", + // choice (0..unbounded) -> [ + // group ref="EG_PContent", + // element name="break" ]] + // EG_PContent: choice -> element name="r" + // CT_BaseWithChildren: sequence -> [ alpha, beta ] + // CT_DerivedExtended: complexContent/extension -> sequence -> [ gamma ] + // CT_NestedOrder: sequence -> [ head, choice -> [ branchA, branchB ], tail ] + // Compositors: CT_Para(1) + CT_Body(2) + EG_PContent(1) + Base(1) + Derived(1) + Nested(2) = 8 + expect(stats.compositorsInserted).toBe(8); expect(stats.groupRefsInserted).toBe(1); - 
expect(stats.localElementsCreated).toBe(3); // text, break, r + // Local element names (deduped per vocab): text, break, r, alpha, beta, gamma, + // head, branchA, branchB, tail = 10. + expect(stats.localElementsCreated).toBe(10); expect(stats.childEdgesUnresolved).toBe(0); expect(stats.groupRefsUnresolved).toBe(0); @@ -323,17 +327,21 @@ test("ingest writes attributes, attributeGroup refs, and enum values", async () }); // Fixture attributes: - // CT_Para/bold (optional, type s:ST_OnOff) - // CT_Extended/extra (optional, type xsd:string, under complexContent/extension) - // AG_TableProps/cols (optional, type xsd:int) + // CT_Para/bold (optional, type s:ST_OnOff) + // CT_Extended/extra (optional, type xsd:string, under complexContent/extension) + // AG_TableProps/cols (optional, type xsd:int) // CT_TableUser/caption (required, type xsd:string) - // CT_RefTest/space (required, ref="s:space"; type/default copied from decl) - expect(stats.attrEdgesInserted).toBe(5); + // CT_RefTest/space (required, ref="s:space"; type/default copied from decl) + // AG_Inner/innerAttr (optional, type xsd:string) + // AG_Outer/outerAttr (optional, type xsd:string) + expect(stats.attrEdgesInserted).toBe(7); expect(stats.attrEdgesUnresolved).toBe(0); // Fixture attributeGroup refs: // CT_TableUser -> AG_TableProps - expect(stats.attrGroupRefsInserted).toBe(1); + // AG_Outer -> AG_Inner (nested attributeGroup ref) + // CT_NestedAttrUser -> AG_Outer + expect(stats.attrGroupRefsInserted).toBe(3); expect(stats.attrGroupRefsUnresolved).toBe(0); // Fixture enums: ST_Jc has 3 values; ST_OnOff and ST_String have base restrictions @@ -462,6 +470,7 @@ test("ingest preserves element/attribute @type, local-element profile membership test.skipIf(!realCacheReady)( "smoke: ingest WML closure into the dev DB and verify counts", async () => { + // Real WML ingest writes thousands of rows; bump timeout from default 5s. 
const stats = await ingestSchemaSet({ schemaDir: REAL_CACHE_DIR, entrypoints: ["wml.xsd"], @@ -499,4 +508,5 @@ test.skipIf(!realCacheReady)( `; expect(ctTblChildren.length).toBeGreaterThan(0); }, + 30_000, ); diff --git a/tests/ingest-xsd/parse-schema.test.ts b/tests/ingest-xsd/parse-schema.test.ts index 46942a8..d40f2fe 100644 --- a/tests/ingest-xsd/parse-schema.test.ts +++ b/tests/ingest-xsd/parse-schema.test.ts @@ -85,13 +85,13 @@ test("declarationsByQName indexes all top-level declarations across documents", const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] }); const counts = countByKind(set.declarationsByQName); - // main.xsd: 1 element, 7 complexType, 1 simpleType, 1 group, 1 attributeGroup + // main.xsd: 1 element, 11 complexType, 1 simpleType, 1 group, 3 attributeGroup // shared.xsd: 2 simpleType, 1 attribute expect(counts.element).toBe(1); - expect(counts.complexType).toBe(7); + expect(counts.complexType).toBe(11); expect(counts.simpleType).toBe(3); expect(counts.group).toBe(1); - expect(counts.attributeGroup).toBe(1); + expect(counts.attributeGroup).toBe(3); expect(counts.attribute).toBe(1); // Specific decl lookup by canonical key. diff --git a/tests/mcp-server/ooxml-queries.test.ts b/tests/mcp-server/ooxml-queries.test.ts index 5554972..0292380 100644 --- a/tests/mcp-server/ooxml-queries.test.ts +++ b/tests/mcp-server/ooxml-queries.test.ts @@ -210,6 +210,56 @@ test("lookupElement: returns null for unknown qname", async () => { expect(hit).toBeNull(); }); +test("getChildren: extension prepends base content (CT_DerivedExtended -> alpha, beta, gamma)", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_DerivedExtended", "transitional"); + if (!ct) throw new Error("CT_DerivedExtended not found"); + const children = await getChildren(db.sql, ct.id, "transitional"); + const names = children.map((c) => c.localName); + // XSD extension semantics: base content first, then derived. 
+ expect(names).toEqual(["alpha", "beta", "gamma"]); + // Provenance distinguishes base-derived from self-derived. + expect(children[0].source).toBe("inherited"); + expect(children[0].owningTypeName).toBe("CT_BaseWithChildren"); + expect(children[2].source).toBe("self"); + expect(children[2].owningTypeName).toBe("CT_DerivedExtended"); +}); + +test("getChildren: nested compositor flatten preserves document order (CT_NestedOrder)", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_NestedOrder", "transitional"); + if (!ct) throw new Error("CT_NestedOrder not found"); + const children = await getChildren(db.sql, ct.id, "transitional"); + // Top sequence: head, choice(branchA, branchB), tail. + // Document order should be head, branchA, branchB, tail (NOT branchA first because + // its order_index=0 inside the choice). + const names = children.map((c) => c.localName); + expect(names).toEqual(["head", "branchA", "branchB", "tail"]); + + // Compositor path makes the nesting visible. + const head = children.find((c) => c.localName === "head"); + expect(head?.compositorPath).toEqual(["sequence(1..1)"]); + + const branchA = children.find((c) => c.localName === "branchA"); + expect(branchA?.compositorPath).toEqual(["sequence(1..1)", "choice(0..unbounded)"]); +}); + +test("getAttributes: nested attributeGroup chain unfolds (CT_NestedAttrUser -> innerAttr + outerAttr)", async () => { + const ct = await lookupType(db.sql, WML_NS, "CT_NestedAttrUser", "transitional"); + if (!ct) throw new Error("CT_NestedAttrUser not found"); + const attrs = await getAttributes(db.sql, ct.id, "transitional"); + const names = attrs.map((a) => a.localName).sort(); + // CT_NestedAttrUser refs AG_Outer; AG_Outer refs AG_Inner. + // Both attributes must surface. 
+ expect(names).toEqual(["innerAttr", "outerAttr"]); + + const inner = attrs.find((a) => a.localName === "innerAttr"); + expect(inner?.source).toBe("attributeGroup"); + expect(inner?.owningName).toBe("AG_Inner"); + + const outer = attrs.find((a) => a.localName === "outerAttr"); + expect(outer?.source).toBe("attributeGroup"); + expect(outer?.owningName).toBe("AG_Outer"); +}); + test("element-to-type chain: lookup w-style element, follow type_ref, fetch children", async () => { // document → CT_Empty (no content) ⇒ children empty. const elem = await lookupElement(db.sql, WML_NS, "document", "transitional"); From f1b32234857dcb3e2be976923887d884ed39dec1 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 15:47:19 -0300 Subject: [PATCH 16/24] chore(mcp): biome line wrapping --- apps/mcp-server/src/ooxml-queries.ts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/apps/mcp-server/src/ooxml-queries.ts b/apps/mcp-server/src/ooxml-queries.ts index e155dc5..4e8c831 100644 --- a/apps/mcp-server/src/ooxml-queries.ts +++ b/apps/mcp-server/src/ooxml-queries.ts @@ -199,7 +199,10 @@ async function getInheritanceEdge( LIMIT 1 `; if (rows.length === 0) return null; - return { baseId: rows[0].base_symbol_id as number, relation: rows[0].relation as InheritanceEdgeRow["relation"] }; + return { + baseId: rows[0].base_symbol_id as number, + relation: rows[0].relation as InheritanceEdgeRow["relation"], + }; } async function getSymbolName(sql: Sql, symbolId: number): Promise { @@ -234,7 +237,10 @@ async function walkCompositor( source: ChildEdge["source"], owningTypeName: string, ): Promise { - const path = [...pathSoFar, `${compositor.kind}(${formatOccurs(compositor.minOccurs, compositor.maxOccurs)})`]; + const path = [ + ...pathSoFar, + `${compositor.kind}(${formatOccurs(compositor.minOccurs, compositor.maxOccurs)})`, + ]; const elemRows = await sql` SELECT 'element' AS entry_kind, s.local_name, s.vocabulary_id, ns.uri AS namespace_uri, From 
cb3e16d78d91193c15e2b21b12c309d1d02b2dbb Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 16:05:24 -0300 Subject: [PATCH 17/24] fix(xsd): scope local elements per-owner; link xsd-builtin symbols to profile; drop em dash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three review-flagged correctness gaps before flipping ENABLE_OOXML_TOOLS. P1 - Local element symbols collapsed across complexTypes. An inline xsd:element with the same name declared inside two different complexTypes was deduped under (vocabulary, name, kind), keeping only the first-seen type_ref. Real WML hits this on tblGrid alone: declared as CT_TblGridBase inside CT_TblGridChange and as CT_TblGrid inside CT_Tbl. ooxml_children("w:tblGrid") would have followed CT_TblGridBase and missed the CT_TblGrid children. Migration 0004 adds xsd_symbols.parent_symbol_id (nullable) and replaces the 3-tuple unique with a 4-tuple UNIQUE NULLS NOT DISTINCT (vocab, local_name, kind, parent_symbol_id). Top-level decls keep parent NULL and still collide on name; local decls are scoped to their owner. ingest.ts passes the owning symbol id when upserting local element symbols. P2 - xsd-builtin symbols had no profile membership. The on-demand inheritance pass created xsd:string / xsd:boolean / etc. via upsertSymbol but never called linkSymbolToProfile, so lookupSymbol (which JOINs xsd_symbol_profiles) returned null. Following an element's type_ref into a W3C built-in silently failed. Now also ensure the xs/xsd namespace exists and link the built-in symbol into the target profile. P3 - Em dash in code comment. scripts/ingest-xsd/qname.ts line 12 used "—". Replaced with "-". Tests: - New "local element symbols are scoped per-owner (no cross-CT collapse)" against a CT_OuterA / CT_OuterB fixture mirroring the WML tblGrid pattern: each `shared` element resolves to its own symbol with the correct per-owner type_ref. 
- New "xsd-builtin symbols have profile membership" verifies lookupSymbolByTypeRef succeeds for {...XMLSchema}string. - Existing fixture and WML smoke counts adjusted. 41 / 0 across db / ingest / mcp-server. --- db/migrations/0004_local_element_scoping.sql | 49 ++++++++++++++++++++ db/schema.sql | 10 +++- scripts/ingest-xsd/ingest.ts | 32 ++++++++++--- scripts/ingest-xsd/qname.ts | 2 +- tests/ingest-xsd/fixtures/main.xsd | 13 ++++++ tests/ingest-xsd/ingest.test.ts | 15 ++++-- tests/ingest-xsd/parse-schema.test.ts | 4 +- tests/mcp-server/ooxml-queries.test.ts | 45 ++++++++++++++++++ 8 files changed, 155 insertions(+), 15 deletions(-) create mode 100644 db/migrations/0004_local_element_scoping.sql diff --git a/db/migrations/0004_local_element_scoping.sql b/db/migrations/0004_local_element_scoping.sql new file mode 100644 index 0000000..89be098 --- /dev/null +++ b/db/migrations/0004_local_element_scoping.sql @@ -0,0 +1,49 @@ +-- Phase 4 review fix: scope local element symbols by their owner. +-- +-- Before this migration, an inline declared +-- inside two different complexTypes/groups collapsed to a single symbol keyed +-- on (vocabulary_id, local_name, kind). The first-seen type_ref won and the +-- later one was silently dropped, so e.g. WML's `tblGrid` (CT_TblGridBase +-- inside CT_TblGridChange vs CT_TblGrid inside CT_Tbl) gave a wrong answer +-- for ooxml_children("w:tblGrid"). +-- +-- Fix: add parent_symbol_id to xsd_symbols and include it in the canonical key +-- with NULLS NOT DISTINCT so two top-level declarations (parent NULL) still +-- collide while local declarations are scoped per-owner. +-- +-- Idempotent. + +ALTER TABLE xsd_symbols + ADD COLUMN IF NOT EXISTS parent_symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE; + +DO $$ +DECLARE cname TEXT; +BEGIN + -- Drop the auto-named 3-tuple unique constraint, regardless of what postgres + -- ended up calling it. 
+ SELECT conname INTO cname + FROM pg_constraint + WHERE conrelid = 'xsd_symbols'::regclass + AND contype = 'u' + AND conkey = ( + SELECT array_agg(attnum ORDER BY attnum) + FROM pg_attribute + WHERE attrelid = 'xsd_symbols'::regclass + AND attname IN ('vocabulary_id', 'local_name', 'kind') + ); + IF cname IS NOT NULL THEN + EXECUTE 'ALTER TABLE xsd_symbols DROP CONSTRAINT ' || quote_ident(cname); + END IF; + + IF NOT EXISTS ( + SELECT 1 FROM pg_constraint + WHERE conname = 'xsd_symbols_canonical_key' + AND conrelid = 'xsd_symbols'::regclass + ) THEN + ALTER TABLE xsd_symbols + ADD CONSTRAINT xsd_symbols_canonical_key + UNIQUE NULLS NOT DISTINCT (vocabulary_id, local_name, kind, parent_symbol_id); + END IF; +END $$; + +CREATE INDEX IF NOT EXISTS idx_xsd_symbols_parent ON xsd_symbols(parent_symbol_id); diff --git a/db/schema.sql b/db/schema.sql index 69aaf54..a1deef2 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -67,15 +67,22 @@ CREATE TABLE xsd_namespaces ( -- type_ref holds the Clark-style {namespace}localName for elements and attributes -- that declare a @type. NULL for complexType/simpleType/group/attributeGroup. -- Phase 4 lookups follow type_ref to resolve element -> type when reading children. +-- +-- parent_symbol_id is NULL for top-level declarations and set to the owning +-- type/group symbol for inline (local) element declarations. The canonical +-- key is 4-tuple with NULLS NOT DISTINCT so top-level decls still collide on +-- name while local decls remain scoped per-owner. 
CREATE TABLE xsd_symbols ( id SERIAL PRIMARY KEY, vocabulary_id TEXT NOT NULL, local_name TEXT NOT NULL, kind TEXT NOT NULL, type_ref TEXT, + parent_symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE, payload JSONB DEFAULT '{}'::jsonb, created_at TIMESTAMPTZ DEFAULT NOW(), - UNIQUE (vocabulary_id, local_name, kind) + CONSTRAINT xsd_symbols_canonical_key + UNIQUE NULLS NOT DISTINCT (vocabulary_id, local_name, kind, parent_symbol_id) ); CREATE TABLE xsd_symbol_profiles ( @@ -181,6 +188,7 @@ CREATE TABLE behavior_notes ( ); CREATE INDEX idx_xsd_symbols_lookup ON xsd_symbols(vocabulary_id, local_name, kind); +CREATE INDEX idx_xsd_symbols_parent ON xsd_symbols(parent_symbol_id); CREATE INDEX idx_xsd_child_edges_parent ON xsd_child_edges(parent_symbol_id); CREATE INDEX idx_xsd_child_edges_compositor ON xsd_child_edges(compositor_id); CREATE INDEX idx_xsd_attr_edges_symbol ON xsd_attr_edges(symbol_id); diff --git a/scripts/ingest-xsd/ingest.ts b/scripts/ingest-xsd/ingest.ts index 1e40e14..d018093 100644 --- a/scripts/ingest-xsd/ingest.ts +++ b/scripts/ingest-xsd/ingest.ts @@ -192,6 +192,16 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise { const [row] = await sql` - INSERT INTO xsd_symbols (vocabulary_id, local_name, kind, type_ref) - VALUES (${vocabularyId}, ${localName}, ${kind}, ${typeRef}) - ON CONFLICT (vocabulary_id, local_name, kind) DO UPDATE + INSERT INTO xsd_symbols (vocabulary_id, local_name, kind, type_ref, parent_symbol_id) + VALUES (${vocabularyId}, ${localName}, ${kind}, ${typeRef}, ${parentSymbolId}) + ON CONFLICT ON CONSTRAINT xsd_symbols_canonical_key DO UPDATE SET type_ref = COALESCE(EXCLUDED.type_ref, xsd_symbols.type_ref) RETURNING id, (xmax = 0) AS inserted `; @@ -916,8 +936,8 @@ function stripPrefixLocal(tag: string | null): string | null { return colon < 0 ? 
tag : tag.slice(colon + 1); } -function symbolKey(vocab: string, local: string, kind: string): string { - return `${vocab}|${local}|${kind}`; +function symbolKey(vocab: string, local: string, kind: string, parentId: number | null = null): string { + return `${vocab}|${local}|${kind}|${parentId ?? ""}`; } // --- CLI ----------------------------------------------------------------- diff --git a/scripts/ingest-xsd/qname.ts b/scripts/ingest-xsd/qname.ts index 008e2ca..413a2aa 100644 --- a/scripts/ingest-xsd/qname.ts +++ b/scripts/ingest-xsd/qname.ts @@ -9,7 +9,7 @@ * * 2. QName-valued attributes (ref, type, base, substitutionGroup, etc.) hold a * "prefix:localName" string. Resolution uses the document's xmlns:* declarations. - * `resolveQNameAttr` returns either a resolved tuple or "unresolved" — we never + * `resolveQNameAttr` returns either a resolved tuple or "unresolved" - we never * invent a namespace for an unknown prefix. */ diff --git a/tests/ingest-xsd/fixtures/main.xsd b/tests/ingest-xsd/fixtures/main.xsd index 1937e08..4454d3f 100644 --- a/tests/ingest-xsd/fixtures/main.xsd +++ b/tests/ingest-xsd/fixtures/main.xsd @@ -95,5 +95,18 @@ + + + + + + + + + + + diff --git a/tests/ingest-xsd/ingest.test.ts b/tests/ingest-xsd/ingest.test.ts index 72d1d52..3aaaea4 100644 --- a/tests/ingest-xsd/ingest.test.ts +++ b/tests/ingest-xsd/ingest.test.ts @@ -223,12 +223,17 @@ test("ingest writes compositors and child edges for nested content models", asyn // CT_BaseWithChildren: sequence -> [ alpha, beta ] // CT_DerivedExtended: complexContent/extension -> sequence -> [ gamma ] // CT_NestedOrder: sequence -> [ head, choice -> [ branchA, branchB ], tail ] - // Compositors: CT_Para(1) + CT_Body(2) + EG_PContent(1) + Base(1) + Derived(1) + Nested(2) = 8 - expect(stats.compositorsInserted).toBe(8); + // Compositors: CT_Para(1) + CT_Body(2) + EG_PContent(1) + Base(1) + Derived(1) + + // Nested(2) + OuterA(1) + OuterB(1) = 10 + expect(stats.compositorsInserted).toBe(10); 
expect(stats.groupRefsInserted).toBe(1); - // Local element names (deduped per vocab): text, break, r, alpha, beta, gamma, - // head, branchA, branchB, tail = 10. - expect(stats.localElementsCreated).toBe(10); + // Local element symbols are scoped per-owner now, so the two `shared` decls + // in CT_OuterA and CT_OuterB count separately. + // Per-owner locals: text(CT_Para), break(CT_Body), r(EG_PContent), + // alpha+beta(CT_BaseWithChildren), gamma(CT_DerivedExtended), + // head+branchA+branchB+tail(CT_NestedOrder), shared(CT_OuterA), + // shared(CT_OuterB) = 12. + expect(stats.localElementsCreated).toBe(12); expect(stats.childEdgesUnresolved).toBe(0); expect(stats.groupRefsUnresolved).toBe(0); diff --git a/tests/ingest-xsd/parse-schema.test.ts b/tests/ingest-xsd/parse-schema.test.ts index d40f2fe..6c7aeb4 100644 --- a/tests/ingest-xsd/parse-schema.test.ts +++ b/tests/ingest-xsd/parse-schema.test.ts @@ -85,10 +85,10 @@ test("declarationsByQName indexes all top-level declarations across documents", const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] }); const counts = countByKind(set.declarationsByQName); - // main.xsd: 1 element, 11 complexType, 1 simpleType, 1 group, 3 attributeGroup + // main.xsd: 1 element, 13 complexType, 1 simpleType, 1 group, 3 attributeGroup // shared.xsd: 2 simpleType, 1 attribute expect(counts.element).toBe(1); - expect(counts.complexType).toBe(11); + expect(counts.complexType).toBe(13); expect(counts.simpleType).toBe(3); expect(counts.group).toBe(1); expect(counts.attributeGroup).toBe(3); diff --git a/tests/mcp-server/ooxml-queries.test.ts b/tests/mcp-server/ooxml-queries.test.ts index 0292380..76e62bf 100644 --- a/tests/mcp-server/ooxml-queries.test.ts +++ b/tests/mcp-server/ooxml-queries.test.ts @@ -242,6 +242,51 @@ test("getChildren: nested compositor flatten preserves document order (CT_Nested expect(branchA?.compositorPath).toEqual(["sequence(1..1)", "choice(0..unbounded)"]); }); +test("local element 
symbols are scoped per-owner (no cross-CT collapse)", async () => { + // Mirrors the WML tblGrid case (CT_TblGridBase inside CT_TblGridChange vs + // CT_TblGrid inside CT_Tbl). CT_OuterA / CT_OuterB both declare an inline + // 'shared' element but with different @type. They must produce distinct + // per-parent symbols, each carrying its own type_ref. + const ctA = await lookupType(db.sql, WML_NS, "CT_OuterA", "transitional"); + const ctB = await lookupType(db.sql, WML_NS, "CT_OuterB", "transitional"); + if (!ctA || !ctB) throw new Error("CT_OuterA / CT_OuterB not found"); + + const aChildren = await getChildren(db.sql, ctA.id, "transitional"); + const bChildren = await getChildren(db.sql, ctB.id, "transitional"); + expect(aChildren).toHaveLength(1); + expect(bChildren).toHaveLength(1); + expect(aChildren[0].localName).toBe("shared"); + expect(bChildren[0].localName).toBe("shared"); + + // The two `shared` symbols carry different type_refs. + const sharedSymbols = await db.sql` + SELECT s.id, s.type_ref, s.parent_symbol_id, parent.local_name AS parent_name + FROM xsd_symbols s + JOIN xsd_symbols parent ON parent.id = s.parent_symbol_id + WHERE s.local_name = 'shared' AND s.kind = 'element' + ORDER BY parent.local_name + `; + expect(sharedSymbols).toHaveLength(2); + expect(sharedSymbols[0].parent_name).toBe("CT_OuterA"); + expect(sharedSymbols[0].type_ref).toBe(`{${WML_NS}}ST_Jc`); + expect(sharedSymbols[1].parent_name).toBe("CT_OuterB"); + expect(sharedSymbols[1].type_ref).toBe("{http://www.w3.org/2001/XMLSchema}string"); +}); + +test("xsd-builtin symbols have profile membership (lookupSymbolByTypeRef can follow xsd:string)", async () => { + // Built-ins like xsd:string are auto-created during inheritance resolution and + // must be linked to xsd_symbol_profiles, otherwise ooxml_lookup_type for + // 'xsd:string' and lookupSymbolByTypeRef for {...XMLSchema}string return null. 
+ const t = await lookupSymbolByTypeRef( + db.sql, + "{http://www.w3.org/2001/XMLSchema}string", + "transitional", + ); + expect(t).not.toBeNull(); + expect(t?.localName).toBe("string"); + expect(t?.vocabularyId).toBe("xsd-builtin"); +}); + test("getAttributes: nested attributeGroup chain unfolds (CT_NestedAttrUser -> innerAttr + outerAttr)", async () => { const ct = await lookupType(db.sql, WML_NS, "CT_NestedAttrUser", "transitional"); if (!ct) throw new Error("CT_NestedAttrUser not found"); From 5013eecc18d3d1669a127e7ae39b560c987e2a76 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 16:35:56 -0300 Subject: [PATCH 18/24] chore: clean up internal phase markers and reorganize scripts The repo had grown two ingestion pipelines (PDF prose corpus and XSD schema graph) without making the duality obvious. Header comments referenced internal "Phase N" planning vocabulary that doesn't help public readers, and a tool-output line shipped a forward reference to a future phase to every MCP caller. Reorganization: scripts/ingest/ -> scripts/ingest-pdf/ (was ambiguous) scripts/ingest-xsd/ stays scripts/fetch-xsd.ts -> scripts/ingest-xsd/fetch.ts (sibling layout) scripts/sync-sources.ts -> scripts/sources-sync.ts (verb-style name) scripts/ingest-pdf/extract-pdf.py -> extract.py db/migrations/0003_phase3_metadata.sql -> 0003_xsd_metadata.sql scripts/ingest-xsd/smoke.ts removed (debug-only, low value) Renamed npm scripts to match the new directory layout: ingest -> pdf:ingest ingest:chunk -> pdf:chunk ingest:embed -> pdf:embed ingest:upload -> pdf:upload ingest:setup -> pdf:setup db:sync-sources -> sources:sync xsd:smoke removed Strip "Phase N" markers from migration headers, source-file headers, test-file headers, and inline comments. None of those references were load-bearing; they were artifacts of the planning doc. Drop the user-facing "_behavior notes: none yet (Phase 5)._" line that shipped in every children/attributes/enum tool response. 
The line gave no information when notes are absent and exposed an internal phase label to the public. Replace the lone PLAN.md reference in scripts/ingest-xsd/ingest.ts with self-contained context. PLAN.md is gitignored; pointing at it was a broken link for anyone reading the public repo. Add scripts/ingest-pdf/README.md and scripts/ingest-xsd/README.md so each pipeline is documented at the level that contributors land at, and refresh CLAUDE.md to make the two corpora explicit and surface both flavors of MCP tools. 41 / 0 across db / ingest / mcp-server. --- CLAUDE.md | 59 +++++++++++--- apps/mcp-server/src/index.ts | 7 +- apps/mcp-server/src/mcp.ts | 6 +- apps/mcp-server/src/ooxml-queries.ts | 2 +- apps/mcp-server/src/ooxml-tools.ts | 22 ++---- data/sources.json | 2 +- db/migrations/0001_reference_sources.sql | 3 +- db/migrations/0002_xsd_schema.sql | 7 +- ...se3_metadata.sql => 0003_xsd_metadata.sql} | 4 +- db/migrations/0004_local_element_scoping.sql | 2 +- db/migrations/README.md | 2 +- db/schema.sql | 9 ++- package.json | 17 ++-- scripts/ingest-pdf/README.md | 53 +++++++++++++ scripts/{ingest => ingest-pdf}/chunk.ts | 8 +- scripts/{ingest => ingest-pdf}/embed.ts | 8 +- .../extract-pdf.py => ingest-pdf/extract.py} | 0 .../fix-page-numbers.py | 0 scripts/{ingest => ingest-pdf}/pipeline.ts | 16 ++-- scripts/{ingest => ingest-pdf}/upload.ts | 8 +- scripts/ingest-xsd/README.md | 73 +++++++++++++++++ scripts/{fetch-xsd.ts => ingest-xsd/fetch.ts} | 6 +- scripts/ingest-xsd/ingest.ts | 59 +++++++------- scripts/ingest-xsd/smoke.ts | 78 ------------------- scripts/ooxml-call.ts | 10 +-- scripts/{sync-sources.ts => sources-sync.ts} | 2 +- tests/db/xsd-schema.test.ts | 6 +- tests/ingest-xsd/ingest.test.ts | 4 +- tests/ingest-xsd/parse-schema.test.ts | 2 +- tests/mcp-server/ooxml-queries.test.ts | 2 +- 30 files changed, 275 insertions(+), 202 deletions(-) rename db/migrations/{0003_phase3_metadata.sql => 0003_xsd_metadata.sql} (61%) create mode 100644 
scripts/ingest-pdf/README.md rename scripts/{ingest => ingest-pdf}/chunk.ts (93%) rename scripts/{ingest => ingest-pdf}/embed.ts (90%) rename scripts/{ingest/extract-pdf.py => ingest-pdf/extract.py} (100%) rename scripts/{ingest => ingest-pdf}/fix-page-numbers.py (100%) rename scripts/{ingest => ingest-pdf}/pipeline.ts (85%) rename scripts/{ingest => ingest-pdf}/upload.ts (88%) create mode 100644 scripts/ingest-xsd/README.md rename scripts/{fetch-xsd.ts => ingest-xsd/fetch.ts} (96%) delete mode 100644 scripts/ingest-xsd/smoke.ts rename scripts/{sync-sources.ts => sources-sync.ts} (98%) diff --git a/CLAUDE.md b/CLAUDE.md index fefba79..27a439f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -29,15 +29,23 @@ apps/ src/data/docs.ts ← All doc pages live here (single source of truth) src/components/ UI components (Sidebar, SuperDocPreview, etc.) src/pages/ Route pages (Home, Docs, SpecExplorer, Mcp) - mcp-server/ Cloudflare Worker — MCP server for AI spec search + mcp-server/ Cloudflare Worker - MCP server (semantic + structural tools) packages/ shared/ Database client, embedding client, types scripts/ - ingest/ PDF → chunks → embeddings → database pipeline + ingest-pdf/ ECMA PDF -> spec_content (semantic search corpus) + ingest-xsd/ ECMA XSDs -> schema graph (structural query corpus) + sources-sync.ts data/sources.json -> reference_sources + db-migrate.ts Apply db/migrations/*.sql in order + ooxml-call.ts Local CLI harness for the structural MCP tools db/ - schema.sql PostgreSQL + pgvector schema + schema.sql PostgreSQL + pgvector + XSD schema graph + migrations/ Numbered, idempotent SQL migrations +data/ + sources.json Source manifest (artifact URLs, sha256, license notes) + xsd-cache/ Local-only XSD download cache (gitignored) dev/ - data/ Extracted/chunked/embedded spec content + data/ Extracted/chunked/embedded PDF content ``` ## Commands @@ -97,23 +105,52 @@ The XML you provide is wrapped in a minimal `w:document > w:body` structure auto ## MCP Server -Cloudflare Worker 
exposing three MCP tools for semantic spec search: +Cloudflare Worker exposing two flavors of MCP tools backed by the same database: -- `search_ecma_spec` — semantic vector search across 18,000+ spec chunks -- `get_section` — fetch a specific section by ID (e.g., "17.3.1.24") -- `list_parts` — browse the spec structure +Always-on (semantic search over the spec PDF): + +- `search_ecma_spec` - semantic vector search across 18,000+ spec chunks +- `get_section` - fetch a specific section by ID (e.g., "17.3.1.24") +- `list_parts` - browse the spec structure + +Behind `ENABLE_OOXML_TOOLS` (structural queries over the XSD schema graph): + +- `ooxml_lookup_element` / `ooxml_lookup_type` - canonical symbol info +- `ooxml_children` - legal children of an element/type/group, in document order +- `ooxml_attributes` - attributes including those inherited and unfolded from attributeGroup refs +- `ooxml_enum` - simpleType enumeration values +- `ooxml_namespace_info` - vocabularies and per-profile symbol counts for a namespace URI Uses PostgreSQL with pgvector (Neon serverless in production, Docker locally). -## Data Pipeline +## Data Pipelines + +Two ingest paths feed the same database. Both are reproducible from `data/sources.json`. -Ingests ECMA-376 PDFs into the vector database: +**PDF (semantic corpus, into `spec_content`)**: ``` PDF → extract (Python) → chunk (6KB) → embed (Voyage) → upload (PostgreSQL) ``` -Run the full pipeline: `bun scripts/ingest/pipeline.ts` +```bash +bun run pdf:ingest 1 ./pdfs/ECMA-376-Part1.pdf # full pipeline for one part +``` + +See `scripts/ingest-pdf/README.md`. + +**XSD (structural corpus, into `xsd_*` tables)**: + +``` +ECMA Part 4 zip → fetch+verify (sha256) → parse → ingest (single transaction) +``` + +```bash +bun run xsd:fetch --url --expected-sha256 +bun run xsd:ingest +``` + +See `scripts/ingest-xsd/README.md`. 
## Database diff --git a/apps/mcp-server/src/index.ts b/apps/mcp-server/src/index.ts index 17e1a21..6c37d5c 100644 --- a/apps/mcp-server/src/index.ts +++ b/apps/mcp-server/src/index.ts @@ -17,9 +17,10 @@ export interface Env { DATABASE_URL: string; VOYAGE_API_KEY: string; /** - * Phase 4 feature flag. Set to "true" to expose ooxml_lookup_element / - * ooxml_lookup_type / ooxml_children / ooxml_attributes / ooxml_enum / - * ooxml_namespace_info via tools/list and tools/call. Default off. + * Feature flag for the OOXML structural tools. Set to "true" to expose + * ooxml_lookup_element / ooxml_lookup_type / ooxml_children / + * ooxml_attributes / ooxml_enum / ooxml_namespace_info via tools/list + * and tools/call. Default off. */ ENABLE_OOXML_TOOLS?: string; } diff --git a/apps/mcp-server/src/mcp.ts b/apps/mcp-server/src/mcp.ts index 96a4fa9..5059151 100644 --- a/apps/mcp-server/src/mcp.ts +++ b/apps/mcp-server/src/mcp.ts @@ -162,9 +162,9 @@ async function handleToolsCall( try { let resultText: string; - // Phase 4 OOXML tools, feature-flagged. tools/list also gates on the same flag, - // so callers should not see these tool names unless the flag is on. Defensive - // check here in case a caller hand-crafts a request. + // OOXML tools are feature-flagged; tools/list filters them out when the flag + // is off, so callers should not see these tool names. Defensive check here in + // case a caller hand-crafts a request. if (isOoxmlTool(name)) { if (!ooxmlToolsEnabled(env)) { return { diff --git a/apps/mcp-server/src/ooxml-queries.ts b/apps/mcp-server/src/ooxml-queries.ts index 4e8c831..5746bf6 100644 --- a/apps/mcp-server/src/ooxml-queries.ts +++ b/apps/mcp-server/src/ooxml-queries.ts @@ -1,5 +1,5 @@ /** - * Read-only schema-graph queries powering the Phase 4 MCP tools: + * Read-only schema-graph queries powering the OOXML MCP tools: * ooxml_lookup_element, ooxml_lookup_type, ooxml_children, * ooxml_attributes, ooxml_enum, ooxml_namespace_info. 
* diff --git a/apps/mcp-server/src/ooxml-tools.ts b/apps/mcp-server/src/ooxml-tools.ts index 44f2947..c4dc156 100644 --- a/apps/mcp-server/src/ooxml-tools.ts +++ b/apps/mcp-server/src/ooxml-tools.ts @@ -1,14 +1,14 @@ /** - * Phase 4 read-only structural MCP tools. Behind ENABLE_OOXML_TOOLS env flag, - * which gates both tools/list discovery and tools/call dispatch so the public - * surface stays unchanged until the feature is intentionally enabled. + * Read-only structural MCP tools backed by the OOXML schema graph. Gated by + * ENABLE_OOXML_TOOLS, which filters both tools/list discovery and tools/call + * dispatch so the public surface stays unchanged until the flag is set. * * Tools: * ooxml_lookup_element, ooxml_lookup_type, ooxml_children, * ooxml_attributes, ooxml_enum, ooxml_namespace_info. * - * Default profile is `transitional` until word-compatible-docx is composed - * in Phase 6. + * Default profile is `transitional`. Future profiles (e.g. word-compatible-docx) + * will compose Transitional with Office extension schemas. 
*/ import { neon } from "@neondatabase/serverless"; @@ -315,8 +315,6 @@ function formatSymbolReport(label: string, hit: SymbolHit, profile: string): str lines.push(`- namespace: ${hit.namespaceUri}`); if (hit.typeRef) lines.push(`- type_ref: ${hit.typeRef}`); if (hit.sourceName) lines.push(`- source: ${hit.sourceName}`); - lines.push(""); - lines.push("_behavior notes: none yet (Phase 5)._"); return lines.join("\n"); } @@ -340,8 +338,6 @@ function formatChildrenReport( if (children.length === 0) { lines.push("_no direct or inherited children._"); - lines.push(""); - lines.push("_behavior notes: none yet (Phase 5)._"); return lines.join("\n"); } @@ -359,8 +355,6 @@ function formatChildrenReport( lines.push( "_group entries are returned as-is; call `ooxml_children` on the group qname to expand them._", ); - lines.push(""); - lines.push("_behavior notes: none yet (Phase 5)._"); return lines.join("\n"); } @@ -383,8 +377,6 @@ function formatAttributesReport( if (attrs.length === 0) { lines.push("_no attributes._"); - lines.push(""); - lines.push("_behavior notes: none yet (Phase 5)._"); return lines.join("\n"); } @@ -401,8 +393,6 @@ function formatAttributesReport( `| ${a.localName} | ${a.attrUse} | ${a.typeRef ?? "-"} | ${a.defaultValue ?? "-"} | ${a.fixedValue ?? "-"} | ${from} |`, ); } - lines.push(""); - lines.push("_behavior notes: none yet (Phase 5)._"); return lines.join("\n"); } @@ -420,8 +410,6 @@ function formatEnumReport(sym: SymbolHit, enums: EnumEntry[], profile: string): } else { for (const e of enums) lines.push(`- ${e.value}`); } - lines.push(""); - lines.push("_behavior notes: none yet (Phase 5)._"); return lines.join("\n"); } diff --git a/data/sources.json b/data/sources.json index 642bd50..7a77b27 100644 --- a/data/sources.json +++ b/data/sources.json @@ -1,5 +1,5 @@ { - "$comment": "Source manifest. Human-edited; scripts/sync-sources.ts upserts these rows into reference_sources.", + "$comment": "Source manifest. 
Human-edited; scripts/sources-sync.ts upserts these rows into reference_sources.", "sources": [ { "name": "ecma-376", diff --git a/db/migrations/0001_reference_sources.sql b/db/migrations/0001_reference_sources.sql index 0c8263d..483c4a2 100644 --- a/db/migrations/0001_reference_sources.sql +++ b/db/migrations/0001_reference_sources.sql @@ -1,5 +1,4 @@ --- Phase 1: Provenance foundation --- Adds reference_sources and source_id FK on spec_content. +-- Provenance foundation: reference_sources catalog + source_id FK on spec_content. -- Idempotent: safe to run against fresh installs (matches db/schema.sql) or existing DBs. CREATE EXTENSION IF NOT EXISTS vector; diff --git a/db/migrations/0002_xsd_schema.sql b/db/migrations/0002_xsd_schema.sql index 2e40aa6..6a8282d 100644 --- a/db/migrations/0002_xsd_schema.sql +++ b/db/migrations/0002_xsd_schema.sql @@ -1,5 +1,5 @@ --- Phase 2: XSD schema tables (empty) --- Profile-scoped symbol graph. All tables empty after this migration; data lands in Phase 3+. +-- Profile-scoped XSD schema graph. All tables empty after this migration; data +-- lands when scripts/ingest-xsd/ingest.ts runs against a populated cache. -- Idempotent: safe to run against fresh installs (matches db/schema.sql) or existing DBs. CREATE TABLE IF NOT EXISTS xsd_profiles ( @@ -116,7 +116,8 @@ CREATE TABLE IF NOT EXISTS xsd_enums ( ); -- Curated Word/Office behavior claims keyed to symbols. --- claim_type enum is locked now (Phase 5 will populate). +-- claim_type enum is locked now; the table stays empty until curated behavior +-- notes start landing. 
CREATE TABLE IF NOT EXISTS behavior_notes ( id SERIAL PRIMARY KEY, symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE, diff --git a/db/migrations/0003_phase3_metadata.sql b/db/migrations/0003_xsd_metadata.sql similarity index 61% rename from db/migrations/0003_phase3_metadata.sql rename to db/migrations/0003_xsd_metadata.sql index 7093e35..725a97b 100644 --- a/db/migrations/0003_phase3_metadata.sql +++ b/db/migrations/0003_xsd_metadata.sql @@ -1,4 +1,6 @@ --- Phase 3 review fix: preserve element/attribute @type and group-ref compositor context. +-- Preserve element/attribute @type and group-ref compositor context so the +-- structural lookup tools can resolve element-to-type chains and attach refs +-- to their enclosing compositor. -- Idempotent. ALTER TABLE xsd_symbols diff --git a/db/migrations/0004_local_element_scoping.sql b/db/migrations/0004_local_element_scoping.sql index 89be098..54774bc 100644 --- a/db/migrations/0004_local_element_scoping.sql +++ b/db/migrations/0004_local_element_scoping.sql @@ -1,4 +1,4 @@ --- Phase 4 review fix: scope local element symbols by their owner. +-- Scope local element symbols by their owner. -- -- Before this migration, an inline declared -- inside two different complexTypes/groups collapsed to a single symbol keyed diff --git a/db/migrations/README.md b/db/migrations/README.md index e9a8062..de32d12 100644 --- a/db/migrations/README.md +++ b/db/migrations/README.md @@ -25,4 +25,4 @@ A small runner script can be added later if/when phases need it. 1. Pick the next number (`0002`, `0003`, ...). 2. Write idempotent SQL. 3. Update `db/schema.sql` to match the new full state. -4. If the migration introduces curated data (e.g., source rows), let a script populate it (e.g., `scripts/sync-sources.ts`), not the SQL file. +4. If the migration introduces curated data (e.g., source rows), let a script populate it (e.g., `scripts/sources-sync.ts`), not the SQL file. 
diff --git a/db/schema.sql b/db/schema.sql index a1deef2..d7b1ef5 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -44,11 +44,12 @@ CREATE INDEX idx_content_section ON spec_content(section_id); CREATE INDEX idx_content_source ON spec_content(source_id); -- ---------------------------------------------------------------------------- --- XSD schema graph (Phase 2) +-- XSD schema graph -- -- Profile-scoped symbol graph for OOXML schemas. Canonical symbol identity is --- (vocabulary_id, local_name, kind); namespace URIs are profile aliases. --- Profile membership lives on edges/profile join tables, not duplicated symbols. +-- (vocabulary_id, local_name, kind, parent_symbol_id); namespace URIs are +-- profile aliases. Profile membership lives on edges/profile join tables, not +-- duplicated symbols. -- ---------------------------------------------------------------------------- CREATE TABLE xsd_profiles ( @@ -66,7 +67,7 @@ CREATE TABLE xsd_namespaces ( -- type_ref holds the Clark-style {namespace}localName for elements and attributes -- that declare a @type. NULL for complexType/simpleType/group/attributeGroup. --- Phase 4 lookups follow type_ref to resolve element -> type when reading children. +-- The lookup tools follow type_ref to resolve element -> type when reading children. -- -- parent_symbol_id is NULL for top-level declarations and set to the owning -- type/group symbol for inline (local) element declarations. 
The canonical diff --git a/package.json b/package.json index cd061c2..ae55196 100644 --- a/package.json +++ b/package.json @@ -20,17 +20,16 @@ "db:reset": "docker compose down -v && docker compose up -d", "db:shell": "docker compose exec db psql -U postgres -d ecma_spec", "db:migrate": "bun scripts/db-migrate.ts", - "db:sync-sources": "bun scripts/sync-sources.ts", - "xsd:fetch": "bun scripts/fetch-xsd.ts", - "xsd:smoke": "bun scripts/ingest-xsd/smoke.ts", + "sources:sync": "bun scripts/sources-sync.ts", + "pdf:ingest": "bun scripts/ingest-pdf/pipeline.ts", + "pdf:chunk": "bun scripts/ingest-pdf/chunk.ts", + "pdf:embed": "bun scripts/ingest-pdf/embed.ts", + "pdf:upload": "bun scripts/ingest-pdf/upload.ts", + "pdf:setup": "pip install -r scripts/requirements.txt", + "xsd:fetch": "bun scripts/ingest-xsd/fetch.ts", "xsd:ingest": "bun scripts/ingest-xsd/ingest.ts", "ooxml:call": "bun scripts/ooxml-call.ts", - "test": "bun test tests/db/ && bun test tests/ingest-xsd/ && bun test tests/mcp-server/", - "ingest": "bun scripts/ingest/pipeline.ts", - "ingest:chunk": "bun scripts/ingest/chunk.ts", - "ingest:embed": "bun scripts/ingest/embed.ts", - "ingest:upload": "bun scripts/ingest/upload.ts", - "ingest:setup": "pip install -r scripts/requirements.txt" + "test": "bun test tests/db/ && bun test tests/ingest-xsd/ && bun test tests/mcp-server/" }, "devDependencies": { "@biomejs/biome": "^2.3.13", diff --git a/scripts/ingest-pdf/README.md b/scripts/ingest-pdf/README.md new file mode 100644 index 0000000..8e051c0 --- /dev/null +++ b/scripts/ingest-pdf/README.md @@ -0,0 +1,53 @@ +# PDF ingest (ECMA-376 prose corpus) + +Builds the semantic-search corpus that powers `search_ecma_spec` / +`get_section` / `list_parts`. Each ECMA-376 part PDF is extracted into +section-aware markdown, chunked at ~6 KB boundaries, embedded with the +configured provider, and uploaded into `spec_content`. 
+ +``` +PDF -> extract (Python) -> chunk (6KB, section-aware) -> embed -> upload +``` + +## Prerequisites + +- Python with `pymupdf4llm`: `bun run pdf:setup` +- `DATABASE_URL` pointed at a Postgres with `db/schema.sql` applied +- An embedding provider key (one of): + - `OPENAI_API_KEY` (default) + - `VOYAGE_API_KEY` + - `GOOGLE_API_KEY` + - `COHERE_API_KEY` + +## Run the full pipeline + +```bash +bun run pdf:ingest 1 ./pdfs/ECMA-376-Part1.pdf +bun run pdf:ingest 2 ./pdfs/ECMA-376-Part2.pdf +bun run pdf:ingest 3 ./pdfs/ECMA-376-Part3.pdf +bun run pdf:ingest 4 ./pdfs/ECMA-376-Part4.pdf +``` + +Each run extracts to `dev/data/extracted/partN/`, chunks to +`dev/data/chunks/partN-chunks.json`, embeds to +`dev/data/embedded/partN-embedded.json`, then uploads. + +## Run individual stages + +```bash +bun run pdf:chunk ./extracted/part1 ./chunks/part1.json +bun run pdf:embed ./chunks/part1.json ./embedded/part1.json +bun run pdf:upload 1 ./embedded/part1.json +``` + +Useful when iterating on chunking or trying a different embedding provider +without re-extracting. + +## Files + +- `pipeline.ts` - orchestrator (extract -> chunk -> embed -> upload) +- `extract.py` - PDF -> section-aware markdown via pymupdf4llm +- `fix-page-numbers.py` - PDF prelude-aware page-number alignment +- `chunk.ts` - markdown -> 6 KB chunks with section IDs +- `embed.ts` - chunks -> chunks + 1024-dim embeddings +- `upload.ts` - bulk insert into `spec_content` diff --git a/scripts/ingest/chunk.ts b/scripts/ingest-pdf/chunk.ts similarity index 93% rename from scripts/ingest/chunk.ts rename to scripts/ingest-pdf/chunk.ts index c3d2d15..4fe0c87 100644 --- a/scripts/ingest/chunk.ts +++ b/scripts/ingest-pdf/chunk.ts @@ -5,10 +5,10 @@ * Respects section boundaries and handles XML examples specially. 
* * Usage: - * bun scripts/ingest/chunk.ts + * bun scripts/ingest-pdf/chunk.ts * * Example: - * bun scripts/ingest/chunk.ts ./extracted/part1 ./chunks/part1-chunks.json + * bun scripts/ingest-pdf/chunk.ts ./extracted/part1 ./chunks/part1-chunks.json */ interface ExtractedSection { @@ -151,10 +151,10 @@ async function main() { const args = process.argv.slice(2); if (args.length < 2) { - console.log("Usage: bun scripts/ingest/chunk.ts "); + console.log("Usage: bun scripts/ingest-pdf/chunk.ts "); console.log(""); console.log("Example:"); - console.log(" bun scripts/ingest/chunk.ts ./extracted/part1 ./chunks/part1-chunks.json"); + console.log(" bun scripts/ingest-pdf/chunk.ts ./extracted/part1 ./chunks/part1-chunks.json"); process.exit(1); } diff --git a/scripts/ingest/embed.ts b/scripts/ingest-pdf/embed.ts similarity index 90% rename from scripts/ingest/embed.ts rename to scripts/ingest-pdf/embed.ts index cacde79..8ab7e53 100644 --- a/scripts/ingest/embed.ts +++ b/scripts/ingest-pdf/embed.ts @@ -4,14 +4,14 @@ * Generates embeddings for chunks using the configured provider. * * Usage: - * bun scripts/ingest/embed.ts + * bun scripts/ingest-pdf/embed.ts * * Environment variables: * EMBEDDING_PROVIDER - openai, google, voyage, or cohere (default: openai) * OPENAI_API_KEY / GOOGLE_API_KEY / etc. 
* * Example: - * EMBEDDING_PROVIDER=openai bun scripts/ingest/embed.ts ./chunks/part1-chunks.json ./embedded/part1-embedded.json + * EMBEDDING_PROVIDER=openai bun scripts/ingest-pdf/embed.ts ./chunks/part1-chunks.json ./embedded/part1-embedded.json */ import { @@ -93,7 +93,7 @@ async function main() { const args = process.argv.slice(2); if (args.length < 2) { - console.log("Usage: bun scripts/ingest/embed.ts "); + console.log("Usage: bun scripts/ingest-pdf/embed.ts "); console.log(""); console.log("Environment variables:"); console.log(" EMBEDDING_PROVIDER - openai, google, voyage, or cohere (default: openai)"); @@ -101,7 +101,7 @@ async function main() { console.log(""); console.log("Example:"); console.log( - " EMBEDDING_PROVIDER=openai bun scripts/ingest/embed.ts ./chunks/part1.json ./embedded/part1.json", + " EMBEDDING_PROVIDER=openai bun scripts/ingest-pdf/embed.ts ./chunks/part1.json ./embedded/part1.json", ); process.exit(1); } diff --git a/scripts/ingest/extract-pdf.py b/scripts/ingest-pdf/extract.py similarity index 100% rename from scripts/ingest/extract-pdf.py rename to scripts/ingest-pdf/extract.py diff --git a/scripts/ingest/fix-page-numbers.py b/scripts/ingest-pdf/fix-page-numbers.py similarity index 100% rename from scripts/ingest/fix-page-numbers.py rename to scripts/ingest-pdf/fix-page-numbers.py diff --git a/scripts/ingest/pipeline.ts b/scripts/ingest-pdf/pipeline.ts similarity index 85% rename from scripts/ingest/pipeline.ts rename to scripts/ingest-pdf/pipeline.ts index 660090a..dacc32a 100644 --- a/scripts/ingest/pipeline.ts +++ b/scripts/ingest-pdf/pipeline.ts @@ -4,7 +4,7 @@ * Runs the complete ingestion process: extract -> chunk -> embed -> upload * * Usage: - * bun scripts/ingest/pipeline.ts + * bun scripts/ingest-pdf/pipeline.ts * * Environment variables: * DATABASE_URL - PostgreSQL connection string @@ -12,7 +12,7 @@ * OPENAI_API_KEY / GOOGLE_API_KEY / etc. 
* * Example: - * bun scripts/ingest/pipeline.ts 1 ./pdfs/ECMA-376-Part1.pdf + * bun scripts/ingest-pdf/pipeline.ts 1 ./pdfs/ECMA-376-Part1.pdf */ import { $ } from "bun"; @@ -21,7 +21,7 @@ async function main() { const args = process.argv.slice(2); if (args.length < 2) { - console.log("Usage: bun scripts/ingest/pipeline.ts "); + console.log("Usage: bun scripts/ingest-pdf/pipeline.ts "); console.log(""); console.log("Environment variables:"); console.log(" DATABASE_URL - PostgreSQL connection string"); @@ -29,7 +29,7 @@ async function main() { console.log(" OPENAI_API_KEY / GOOGLE_API_KEY / etc."); console.log(""); console.log("Example:"); - console.log(" bun scripts/ingest/pipeline.ts 1 ./pdfs/ECMA-376-Part1.pdf"); + console.log(" bun scripts/ingest-pdf/pipeline.ts 1 ./pdfs/ECMA-376-Part1.pdf"); process.exit(1); } @@ -92,7 +92,7 @@ async function main() { try { await $`${pythonPath} -c "import pymupdf4llm" 2>/dev/null`; console.log(`Using Python: ${pythonPath}`); - await $`${pythonPath} scripts/ingest/extract-pdf.py ${pdfPath} ${extractedDir}`; + await $`${pythonPath} scripts/ingest-pdf/extract.py ${pdfPath} ${extractedDir}`; extractSuccess = true; break; } catch { @@ -110,17 +110,17 @@ async function main() { // Step 2: Chunk console.log("\n[2/4] Chunking content..."); console.log("-".repeat(40)); - await $`bun scripts/ingest/chunk.ts ${extractedDir} ${chunksFile}`; + await $`bun scripts/ingest-pdf/chunk.ts ${extractedDir} ${chunksFile}`; // Step 3: Embed console.log("\n[3/4] Generating embeddings..."); console.log("-".repeat(40)); - await $`bun scripts/ingest/embed.ts ${chunksFile} ${embeddedFile}`; + await $`bun scripts/ingest-pdf/embed.ts ${chunksFile} ${embeddedFile}`; // Step 4: Upload console.log("\n[4/4] Uploading to database..."); console.log("-".repeat(40)); - await $`bun scripts/ingest/upload.ts ${partNumber} ${embeddedFile}`; + await $`bun scripts/ingest-pdf/upload.ts ${partNumber} ${embeddedFile}`; console.log(`\n${"=".repeat(60)}`); 
console.log("Pipeline complete!"); diff --git a/scripts/ingest/upload.ts b/scripts/ingest-pdf/upload.ts similarity index 88% rename from scripts/ingest/upload.ts rename to scripts/ingest-pdf/upload.ts index db19b41..c17cffc 100644 --- a/scripts/ingest/upload.ts +++ b/scripts/ingest-pdf/upload.ts @@ -4,13 +4,13 @@ * Uploads embedded chunks to the database. * * Usage: - * bun scripts/ingest/upload.ts + * bun scripts/ingest-pdf/upload.ts * * Environment variables: * DATABASE_URL - PostgreSQL connection string * * Example: - * bun scripts/ingest/upload.ts 1 ./embedded/part1-embedded.json + * bun scripts/ingest-pdf/upload.ts 1 ./embedded/part1-embedded.json */ import { createDbClient } from "../../packages/shared/src/db/index.ts"; @@ -30,13 +30,13 @@ async function main() { const args = process.argv.slice(2); if (args.length < 2) { - console.log("Usage: bun scripts/ingest/upload.ts "); + console.log("Usage: bun scripts/ingest-pdf/upload.ts "); console.log(""); console.log("Environment variables:"); console.log(" DATABASE_URL - PostgreSQL connection string"); console.log(""); console.log("Example:"); - console.log(" bun scripts/ingest/upload.ts 1 ./embedded/part1-embedded.json"); + console.log(" bun scripts/ingest-pdf/upload.ts 1 ./embedded/part1-embedded.json"); process.exit(1); } diff --git a/scripts/ingest-xsd/README.md b/scripts/ingest-xsd/README.md new file mode 100644 index 0000000..464421b --- /dev/null +++ b/scripts/ingest-xsd/README.md @@ -0,0 +1,73 @@ +# XSD ingest (ECMA-376 schema graph) + +Builds the structural-query corpus that powers `ooxml_lookup_element`, +`ooxml_children`, `ooxml_attributes`, etc. The XSDs published by Ecma +International for ECMA-376 Transitional are parsed and persisted as a +profile-scoped relational graph. 
+ +``` +ECMA Part 4 zip -> fetch + verify (sha256) -> parse (preserveOrder) + -> ingest (single transaction) -> 11 tables in Postgres +``` + +## Prerequisites + +- `DATABASE_URL` pointed at a Postgres with `db/schema.sql` applied +- A row in `reference_sources` named `ecma-376-transitional`. Run + `bun run sources:sync` after editing `data/sources.json`. + +## Fetch the schemas + +The Part 4 zip is published on the ECMA-376 publications page. It contains +`OfficeOpenXML-XMLSchema-Transitional.zip`, which contains the 26 +Transitional XSDs (`wml.xsd`, `dml-main.xsd`, `sml.xsd`, `pml.xsd`, +`shared-*.xsd`, ...). + +```bash +bun run xsd:fetch \ + --url 'https://ecma-international.org/wp-content/uploads/ECMA-376-4_5th_edition_december_2016.zip' \ + --expected-sha256 'bd25da1109f73762356596918bf5ff8b74a1331642dba5f1c1d1dfc6bed34ecd' +``` + +The script verifies the outer-zip sha256, extracts the inner zip, and lands +the XSDs in `data/xsd-cache/ecma-376-transitional/`. The cache is gitignored; +nothing binary lands in the repo. + +## Ingest + +```bash +bun run xsd:ingest +``` + +By default it walks `wml.xsd` plus its import closure (12 documents) and +populates: `xsd_profiles`, `xsd_namespaces`, `xsd_symbols`, +`xsd_symbol_profiles`, `xsd_inheritance_edges`, `xsd_compositors`, +`xsd_child_edges`, `xsd_group_edges`, `xsd_attr_edges`, `xsd_enums`. Wraps +the whole thing in a single transaction; idempotent across runs. 
+ +To ingest a different working set: + +```bash +bun run xsd:ingest --entrypoint dml-main.xsd +bun run xsd:ingest --schema-dir <dir> --entrypoint <file> \ + --profile <profile> --source <source> +``` + +## Files + +- `fetch.ts` - download Part 4 zip, verify sha256, extract XSDs +- `parse-schema.ts` - load XSDs into an in-memory schema set with ordered + AST + namespace map + import graph + qname declaration index +- `vocabulary.ts` - canonical namespace URI -> vocabulary id map +- `qname.ts` - canonical-key + qname-attribute resolution +- `ast.ts` - helpers for walking fast-xml-parser preserveOrder output +- `types.ts` - shared types +- `ingest.ts` - parser output -> 11 DB tables, single transaction + +## Smoke-test the result + +```bash +bun run ooxml:call ooxml_children '{"qname":"w:tbl"}' +bun run ooxml:call ooxml_attributes '{"qname":"w:pBdr"}' +bun run ooxml:call ooxml_enum '{"qname":"w:ST_Jc"}' +``` diff --git a/scripts/fetch-xsd.ts b/scripts/ingest-xsd/fetch.ts similarity index 96% rename from scripts/fetch-xsd.ts rename to scripts/ingest-xsd/fetch.ts index 82a3c21..4f6bd43 100644 --- a/scripts/fetch-xsd.ts +++ b/scripts/ingest-xsd/fetch.ts @@ -12,11 +12,11 @@ * ecma-376-transitional/ (final XSDs land here) * * Usage: - * bun scripts/fetch-xsd.ts --url <url> - * bun scripts/fetch-xsd.ts --url <url> --expected-sha256 <sha256> + * bun scripts/ingest-xsd/fetch.ts --url <url> + * bun scripts/ingest-xsd/fetch.ts --url <url> --expected-sha256 <sha256> * * Or via env: - * XSD_PART4_URL=<url> bun scripts/fetch-xsd.ts + * XSD_PART4_URL=<url> bun scripts/ingest-xsd/fetch.ts * * After a successful fetch the script prints the outer-zip sha256; * paste it into data/sources.json under the ecma-376-transitional entry diff --git a/scripts/ingest-xsd/ingest.ts b/scripts/ingest-xsd/ingest.ts index d018093..49c495f 100644 --- a/scripts/ingest-xsd/ingest.ts +++ b/scripts/ingest-xsd/ingest.ts @@ -1,32 +1,29 @@ /** - * Phase 3c: ingest top-level symbols and inheritance edges. + * Ingest the OOXML schema graph from parseSchemaSet output. 
Runs in a single + * transaction and writes: * - * Walks parseSchemaSet output and writes: - * - xsd_profiles (bootstrap target profile, idempotent) - * - xsd_namespaces (one row per unique URI seen across documents) - * - xsd_symbols (canonical (vocabulary_id, local_name, kind), upsert by natural key) - * - xsd_symbol_profiles (membership for the target profile, with source_id) - * - xsd_inheritance_edges (extension/restriction from complexContent/simpleContent - * and simpleType/restriction) + * - xsd_profiles, xsd_namespaces, xsd_symbols, xsd_symbol_profiles + * (bootstrap + per-symbol membership; symbol/inheritance passes use + * ON CONFLICT for natural-key idempotency) + * - xsd_inheritance_edges (complexContent/simpleContent extension/restriction + * and simpleType restriction) + * - xsd_compositors, xsd_child_edges, xsd_group_edges (content models; + * content-model rows have no natural unique key, so this pass uses + * delete-and-rewrite per profile) + * - xsd_attr_edges, xsd_enums (attributes, attributeGroup refs, and + * simpleType enumeration values; same delete-and-rewrite pattern) * - * NOT touched here (Phases 3d/3e): - * - xsd_compositors, xsd_child_edges (content models) - * - xsd_attr_edges (attributes) - * - xsd_group_edges (group/attributeGroup refs) - * - xsd_enums (simpleType enumerations) + * Re-running against the same source is idempotent: identical row counts on + * every run. Stale-row cleanup (when symbols vanish in a future edition) is + * deferred until needed. * - * Idempotency: the entire ingest runs in a single transaction. Re-running - * against the same source produces no new rows (UNIQUE + ON CONFLICT DO NOTHING). - * Stale-row cleanup (when symbols vanish in a future edition) is deferred, - * see PLAN.md "Edition flip and behavior_notes" open item. 
+ * Library usage: + * await ingestSchemaSet({ schemaDir, entrypoints, profileName, sourceName, db }) * - * Usage as a library: - * await ingestSchemaSet({ schemaDir, entrypoints, profileName, sourceName, sql }) - * - * Usage as a CLI: - * bun scripts/ingest-xsd/ingest.ts - * bun scripts/ingest-xsd/ingest.ts --schema-dir <dir> --entrypoint wml.xsd \ - * --profile transitional --source ecma-376-transitional + * CLI usage: + * bun run xsd:ingest + * bun run xsd:ingest --schema-dir <dir> --entrypoint wml.xsd \ + * --profile transitional --source ecma-376-transitional */ import { createDbClient, type DbClient } from "../../packages/shared/src/db/index.ts"; @@ -486,12 +483,7 @@ async function handleElement( // Local elements are scoped per-owner: the same name in two different // complexTypes is not the same symbol (e.g. WML's tblGrid is // CT_TblGridBase inside CT_TblGridChange but CT_TblGrid inside CT_Tbl). - const key = symbolKey( - ctx.ownerDecl.vocabularyId, - a.name, - "element", - ctx.ownerSymbolId, - ); + const key = symbolKey(ctx.ownerDecl.vocabularyId, a.name, "element", ctx.ownerSymbolId); let id = ctx.symbolIds.get(key); if (id == null) { const res = await upsertSymbol( @@ -936,7 +928,12 @@ function stripPrefixLocal(tag: string | null): string | null { return colon < 0 ? tag : tag.slice(colon + 1); } -function symbolKey(vocab: string, local: string, kind: string, parentId: number | null = null): string { +function symbolKey( + vocab: string, + local: string, + kind: string, + parentId: number | null = null, +): string { return `${vocab}|${local}|${kind}|${parentId ?? ""}`; } diff --git a/scripts/ingest-xsd/smoke.ts b/scripts/ingest-xsd/smoke.ts deleted file mode 100644 index 316fb10..0000000 --- a/scripts/ingest-xsd/smoke.ts +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Phase 3b smoke: parse the real WML working set and print a summary. - * - * Verifies the parser end-to-end against the live cache before Phase 3c - * starts writing symbols/edges to the DB. 
- * - * Usage: - * bun scripts/ingest-xsd/smoke.ts - * bun scripts/ingest-xsd/smoke.ts --schema-dir ./some/dir --entrypoint wml.xsd - */ - -import { parseSchemaSet } from "./parse-schema.ts"; -import type { DeclarationKind } from "./types.ts"; - -interface Args { - schemaDir: string; - entrypoints: string[]; -} - -function parseArgs(): Args { - const argv = process.argv.slice(2); - let schemaDir = "./data/xsd-cache/ecma-376-transitional"; - const entrypoints: string[] = []; - for (let i = 0; i < argv.length; i++) { - const a = argv[i]; - if (a === "--schema-dir") schemaDir = argv[++i] ?? schemaDir; - else if (a === "--entrypoint") entrypoints.push(argv[++i] ?? ""); - } - if (entrypoints.length === 0) entrypoints.push("wml.xsd"); - return { schemaDir, entrypoints }; -} - -async function main() { - const args = parseArgs(); - const set = await parseSchemaSet({ - schemaDir: args.schemaDir, - entrypoints: args.entrypoints, - }); - - console.log(`schemaDir: ${args.schemaDir}`); - console.log(`entrypoints: ${args.entrypoints.join(", ")}`); - console.log(`documents loaded: ${set.documents.size}\n`); - - for (const ep of args.entrypoints) { - const doc = set.documents.get(ep); - if (!doc) continue; - console.log(`${ep}`); - console.log(` targetNamespace: ${doc.targetNamespace}`); - console.log(` vocabularyId: ${doc.vocabularyId}`); - const imports = set.importGraph.get(ep) ?? []; - console.log(` imports (${imports.length}):`); - for (const imp of imports) { - console.log(` ${imp.namespace} → ${imp.target ?? 
"(no schemaLocation)"}`); - } - console.log(); - } - - const counts: Record = { - element: 0, - complexType: 0, - simpleType: 0, - group: 0, - attributeGroup: 0, - attribute: 0, - }; - for (const arr of set.declarationsByQName.values()) { - for (const d of arr) counts[d.kind]++; - } - console.log("declaration counts (across all loaded documents):"); - for (const k of Object.keys(counts).sort() as DeclarationKind[]) { - console.log(` ${k.padEnd(16)} ${counts[k]}`); - } -} - -main().catch((err) => { - console.error("smoke failed:", err); - process.exit(1); -}); diff --git a/scripts/ooxml-call.ts b/scripts/ooxml-call.ts index e404089..2594b24 100644 --- a/scripts/ooxml-call.ts +++ b/scripts/ooxml-call.ts @@ -1,5 +1,5 @@ /** - * Local end-to-end harness for the Phase 4 OOXML tools. + * Local end-to-end harness for the OOXML structural tools. * * The deployed Worker uses @neondatabase/serverless (HTTP-only), which can't * talk to local Postgres. This CLI bypasses the Worker and dispatches through @@ -7,10 +7,10 @@ * code path that the Worker exercises runs end-to-end against the dev DB. * * Usage: - * bun scripts/ooxml-call.ts - * bun scripts/ooxml-call.ts ooxml_children '{"qname":"w:tbl"}' - * bun scripts/ooxml-call.ts ooxml_attributes '{"qname":"w:pBdr"}' - * bun scripts/ooxml-call.ts ooxml_enum '{"qname":"w:ST_Jc"}' + * bun run ooxml:call + * bun run ooxml:call ooxml_children '{"qname":"w:tbl"}' + * bun run ooxml:call ooxml_attributes '{"qname":"w:pBdr"}' + * bun run ooxml:call ooxml_enum '{"qname":"w:ST_Jc"}' * * Environment: * DATABASE_URL - postgres connection string (defaults to local docker) diff --git a/scripts/sync-sources.ts b/scripts/sources-sync.ts similarity index 98% rename from scripts/sync-sources.ts rename to scripts/sources-sync.ts index 74c4418..ecf48b5 100644 --- a/scripts/sync-sources.ts +++ b/scripts/sources-sync.ts @@ -6,7 +6,7 @@ * The backfill is a one-time concern; once all rows have source_id it is a no-op. 
* * Usage: - * bun scripts/sync-sources.ts + * bun scripts/sources-sync.ts * * Environment: * DATABASE_URL - PostgreSQL connection string diff --git a/tests/db/xsd-schema.test.ts b/tests/db/xsd-schema.test.ts index 926d902..9ef717a 100644 --- a/tests/db/xsd-schema.test.ts +++ b/tests/db/xsd-schema.test.ts @@ -1,9 +1,9 @@ /** - * Phase 2 acceptance tests: XSD schema integrity. + * XSD schema integrity tests. * * Each test starts with an empty xsd_* / behavior_notes set. spec_content and - * reference_sources are left alone. The XSD tables are empty by design in Phase 2; - * once Phase 3 ingests data, tests should move to a separate TEST_DATABASE_URL. + * reference_sources are left alone. Once a real ingest populates the dev DB, + * tests should move to a separate TEST_DATABASE_URL. * * Usage: * DATABASE_URL=postgresql://... bun test tests/db/xsd-schema.test.ts diff --git a/tests/ingest-xsd/ingest.test.ts b/tests/ingest-xsd/ingest.test.ts index 3aaaea4..48e7800 100644 --- a/tests/ingest-xsd/ingest.test.ts +++ b/tests/ingest-xsd/ingest.test.ts @@ -1,5 +1,5 @@ /** - * Phase 3c: ingest pass tests. + * Ingest pass tests. * * Each test starts with empty xsd_* / behavior_notes tables (afterEach TRUNCATE) * and a known reference_sources row. Uses fixture XSDs. @@ -493,7 +493,7 @@ test.skipIf(!realCacheReady)( expect(stats.groupRefsInserted).toBeGreaterThan(20); expect(stats.childEdgesUnresolved).toBe(0); expect(stats.groupRefsUnresolved).toBe(0); - // Phase 3e additions: + // Attribute / attributeGroup / enum coverage: expect(stats.attrEdgesInserted).toBeGreaterThan(500); expect(stats.attrGroupRefsInserted).toBeGreaterThan(10); expect(stats.enumsInserted).toBeGreaterThan(200); diff --git a/tests/ingest-xsd/parse-schema.test.ts b/tests/ingest-xsd/parse-schema.test.ts index 6c7aeb4..9feba7d 100644 --- a/tests/ingest-xsd/parse-schema.test.ts +++ b/tests/ingest-xsd/parse-schema.test.ts @@ -1,5 +1,5 @@ /** - * Phase 3b: parser scaffolding tests. + * Parser scaffolding tests. 
* * Primary tests use tiny fixture XSDs to keep the suite fast and independent * of the local cache. One optional smoke test runs against the real diff --git a/tests/mcp-server/ooxml-queries.test.ts b/tests/mcp-server/ooxml-queries.test.ts index 76e62bf..824c8e0 100644 --- a/tests/mcp-server/ooxml-queries.test.ts +++ b/tests/mcp-server/ooxml-queries.test.ts @@ -1,5 +1,5 @@ /** - * Phase 4 query layer tests. Ingests the same fixture XSDs the ingest tests use, + * Query layer tests. Ingests the same fixture XSDs the ingest tests use, * then exercises each MCP-tool query function against the populated DB. */ From 99ec4eb08df9c749941db4d58b16ce74f8783608 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 16:48:17 -0300 Subject: [PATCH 19/24] fix(mcp): address review correctness gaps + tighten test isolation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five real issues from review against the WML schema graph; one cheap DX win folded in. Issue 1 - lookupSymbol returned local-only symbols by qname. After per-owner scoping landed, lookupElement("w:tblGrid") could return either CT_TblGridBase or CT_TblGrid depending on which row postgres picked first. Fixed: lookupSymbol now filters parent_symbol_id IS NULL, so it only returns top-level symbols addressable by qname. Local elements are reachable through getChildren on their owning type. Issues 3 + 4 - getAttributes mishandled inheritance. complexContent/restriction inherits attribute uses per XSD §3.4.2.2; only `use="prohibited"` drops them. The previous code walked the base only for `extension`, so every WML *Change type built on a restriction reported zero base attrs (id, author, date silently missing). The walk order also had base attrs emitted first, making the older docstring claim about "derived wins" wrong in practice. 
Fixed: derived attrs (and their attributeGroup refs) emit first, then the base is walked for both extension AND restriction; seenAttrs dedup makes derived redeclarations win. Two new tests pin both behaviors with a CT_TrackedBase / CT_TrackedRestricted / CT_OverrideDerived fixture. Issue 5 - stale unscoped local symbols on dev DBs. The migration that introduced parent_symbol_id never purged the pre-migration parent-NULL collapsed rows, so re-ingest left them alongside the new per-owner symbols. Fixed: ingest now purges everything this source previously wrote at the start of the transaction, then rewrites. Non-cascading FKs (xsd_inheritance_edges.base_symbol_id and friends) are cleaned explicitly first. Idempotency test updated to reflect the new semantics: every stat equals first.X across re-runs and DB row counts stay stable. Issue 2 - tests TRUNCATE through DATABASE_URL. A developer with DATABASE_URL pointed at Neon could wipe their schema graph by running `bun test`. Fixed: tests now require TEST_DATABASE_URL (no fallback to DATABASE_URL) and refuse to run unless the hostname is local. Shared guard at tests/test-db.ts; package.json test script defaults TEST_DATABASE_URL to local Postgres. DX - ooxml_children's group fallback inlined a 28-line copy of lookupSymbol. Replaced with a 4-line lookupSymbol("group", ...) call, and dropped a dead-code branch in getChildrenRecursive that re-set source="inherited" on entries the recursive call had already labeled. Test results: 44 pass / 0 fail across db / ingest / mcp-server.
--- apps/mcp-server/src/ooxml-queries.ts | 56 ++++++++++++++++---------- apps/mcp-server/src/ooxml-tools.ts | 38 +++++------------ package.json | 2 +- scripts/ingest-xsd/ingest.ts | 34 ++++++++++++++++ tests/db/xsd-schema.test.ts | 7 ++-- tests/ingest-xsd/fixtures/main.xsd | 21 ++++++++++ tests/ingest-xsd/ingest.test.ts | 44 +++++++++++--------- tests/ingest-xsd/parse-schema.test.ts | 4 +- tests/mcp-server/ooxml-queries.test.ts | 54 ++++++++++++++++++++++--- tests/test-db.ts | 38 +++++++++++++++++ 10 files changed, 217 insertions(+), 81 deletions(-) create mode 100644 tests/test-db.ts diff --git a/apps/mcp-server/src/ooxml-queries.ts b/apps/mcp-server/src/ooxml-queries.ts index 5746bf6..7950974 100644 --- a/apps/mcp-server/src/ooxml-queries.ts +++ b/apps/mcp-server/src/ooxml-queries.ts @@ -87,7 +87,14 @@ export interface SymbolHit { sourceName: string | null; } -/** Look up a symbol by namespace + localName + kind in a given profile. */ +/** + * Look up a top-level symbol by namespace + localName + kind in a given profile. + * + * Local element symbols (parent_symbol_id IS NOT NULL) are intentionally excluded: + * an inline `<xsd:element>` declared in two different + * complexTypes is two distinct symbols whose identity depends on context. Reach + * those through `getChildren(parentTypeSymbolId, profile)` instead. + */ export async function lookupSymbol( sql: Sql, namespace: string, @@ -105,6 +112,7 @@ export async function lookupSymbol( LEFT JOIN reference_sources src ON src.id = sp.source_id WHERE s.local_name = ${localName} AND s.kind = ${kind} + AND s.parent_symbol_id IS NULL AND ns.uri = ${namespace} AND p.name = ${profile} LIMIT 1 @@ -331,15 +339,12 @@ async function getChildrenRecursive( const out: ChildEdge[] = []; // Inheritance: extension prepends base content; restriction replaces it. + // Recursing with isRoot=false sets source="inherited" inside the base call, + // so we just push the entries through.
const inherit = await getInheritanceEdge(sql, symbolId, profile); if (inherit && inherit.relation === "extension") { const base = await getChildrenRecursive(sql, inherit.baseId, profile, false); - // Already-emitted entries in `base` already carry their owning type name; - // flip their source to "inherited" relative to the root request. - for (const c of base) { - if (isRoot) c.source = "inherited"; - out.push(c); - } + out.push(...base); } // Walk this type's own top-level compositors. @@ -408,13 +413,17 @@ export interface AttrEntry { } /** - * Attributes on a type symbol. Walks inheritance per XSD semantics - * (extension prepends base attrs; restriction replaces them) and recurses - * through attributeGroup refs, including refs nested inside other - * attributeGroups. Cycles are guarded by a visited-set. + * Attributes on a type symbol, applying XSD §3.4.2.2 inheritance: + * - extension: derived's own attribute uses are unioned with the base's. + * - restriction: derived's attribute uses also union with the base's, with + * the derived narrowing or prohibiting individual entries. Restriction + * CANNOT silently drop a base attribute; only `use="prohibited"` does. * - * Names are de-duplicated: a derived type's redeclaration of an inherited - * attribute wins, so the first occurrence in walk order is what surfaces. + * Walk order emits the derived type's own attributes first, then attributeGroup + * refs the derived holds, then recurses into the base. Names are de-duplicated + * by first occurrence, so a derived redeclaration wins and base attrs only + * surface when the derived didn't override them. attributeGroup nesting is + * walked recursively with a visited-set against cycles. */ export async function getAttributes( sql: Sql, @@ -437,17 +446,11 @@ async function collectAttrsForType( seenAttrs: Set, visitedGroups: Set, ): Promise { - // Per XSD: extension prepends base; restriction replaces. 
We always emit base - // first when extending so derived declarations correctly override later. - const inherit = await getInheritanceEdge(sql, symbolId, profile); - if (inherit && inherit.relation === "extension") { - await collectAttrsForType(sql, inherit.baseId, profile, false, out, seenAttrs, visitedGroups); - } - const ownName = await getSymbolName(sql, symbolId); // Direct attribute declarations on this symbol (whether complexType or - // attributeGroup; both can carry xsd:attribute children). + // attributeGroup; both can carry xsd:attribute children). Emit first so + // derived redeclarations override base attrs found below. const directAttrs = await sql` SELECT a.local_name, a.attr_use, a.default_value, a.fixed_value, a.type_ref, a.order_index FROM xsd_attr_edges a @@ -470,7 +473,8 @@ async function collectAttrsForType( }); } - // attributeGroup refs hanging off this symbol; recurse into each. + // attributeGroup refs the derived itself holds; recurse into each before + // touching the base so a derived's group-bundled attr also wins. const agRefs = await sql` SELECT ge.group_symbol_id FROM xsd_group_edges ge @@ -490,6 +494,14 @@ async function collectAttrsForType( visitedGroups, ); } + + // Inherited base attrs. Both extension and restriction inherit attribute uses + // per XSD §3.4.2.2; restriction can override or prohibit but cannot drop + // silently. Dedup by seenAttrs so the derived's redeclarations win. 
+ const inherit = await getInheritanceEdge(sql, symbolId, profile); + if (inherit) { + await collectAttrsForType(sql, inherit.baseId, profile, false, out, seenAttrs, visitedGroups); + } } async function collectAttrsFromAttributeGroup( diff --git a/apps/mcp-server/src/ooxml-tools.ts b/apps/mcp-server/src/ooxml-tools.ts index c4dc156..d183e3a 100644 --- a/apps/mcp-server/src/ooxml-tools.ts +++ b/apps/mcp-server/src/ooxml-tools.ts @@ -21,6 +21,7 @@ import { getEnums, getNamespaceInfo, lookupElement, + lookupSymbol, lookupSymbolByTypeRef, lookupType, type NamespaceInfo, @@ -212,34 +213,15 @@ export async function runOoxmlTool( if (elementSym?.typeRef) { typeSym = await lookupSymbolByTypeRef(sql, elementSym.typeRef, profile); } else if (!elementSym) { - // Fall back to looking for a group with this name (so EG_PContent works). - const grp = await sql` - SELECT s.id, s.local_name, s.kind, s.vocabulary_id, s.type_ref, - ns.uri AS namespace_uri, p.name AS profile_name, src.name AS source_name - FROM xsd_symbols s - JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id - JOIN xsd_namespaces ns ON ns.id = sp.namespace_id - JOIN xsd_profiles p ON p.id = sp.profile_id - LEFT JOIN reference_sources src ON src.id = sp.source_id - WHERE s.local_name = ${q.qname.localName} - AND s.kind = 'group' - AND ns.uri = ${q.qname.namespace} - AND p.name = ${profile} - LIMIT 1 - `; - const r = grp[0]; - if (r) { - typeSym = { - id: r.id as number, - vocabularyId: r.vocabulary_id as string, - localName: r.local_name as string, - kind: r.kind as string, - typeRef: r.type_ref as string | null, - namespaceUri: r.namespace_uri as string, - profileName: r.profile_name as string, - sourceName: r.source_name as string | null, - }; - } + // Fall back to looking for a named xsd:group with this qname (so + // EG_PContent and friends are reachable directly). 
+ typeSym = await lookupSymbol( + sql, + q.qname.namespace, + q.qname.localName, + "group", + profile, + ); } } if (!typeSym) { diff --git a/package.json b/package.json index ae55196..0116521 100644 --- a/package.json +++ b/package.json @@ -29,7 +29,7 @@ "xsd:fetch": "bun scripts/ingest-xsd/fetch.ts", "xsd:ingest": "bun scripts/ingest-xsd/ingest.ts", "ooxml:call": "bun scripts/ooxml-call.ts", - "test": "bun test tests/db/ && bun test tests/ingest-xsd/ && bun test tests/mcp-server/" + "test": "export TEST_DATABASE_URL=${TEST_DATABASE_URL:-postgresql://postgres:postgres@localhost:5432/ecma_spec} && bun test tests/db/ && bun test tests/ingest-xsd/ && bun test tests/mcp-server/" }, "devDependencies": { "@biomejs/biome": "^2.3.13", diff --git a/scripts/ingest-xsd/ingest.ts b/scripts/ingest-xsd/ingest.ts index 49c495f..71a6054 100644 --- a/scripts/ingest-xsd/ingest.ts +++ b/scripts/ingest-xsd/ingest.ts @@ -99,6 +99,40 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise(); const symbolIds = new Map(); // canonical (vocab|local|kind) -> id diff --git a/tests/db/xsd-schema.test.ts b/tests/db/xsd-schema.test.ts index 9ef717a..d44f93d 100644 --- a/tests/db/xsd-schema.test.ts +++ b/tests/db/xsd-schema.test.ts @@ -12,10 +12,9 @@ import { afterAll, beforeAll, beforeEach, expect, test } from "bun:test"; import { createDbClient, type DbClient } from "../../packages/shared/src/db/index.ts"; -const databaseUrl = process.env.DATABASE_URL; -if (!databaseUrl) { - throw new Error("Missing DATABASE_URL for integration tests"); -} +import { getTestDatabaseUrl } from "../test-db.ts"; + +const databaseUrl = getTestDatabaseUrl(); let db: DbClient; diff --git a/tests/ingest-xsd/fixtures/main.xsd b/tests/ingest-xsd/fixtures/main.xsd index 4454d3f..56a3547 100644 --- a/tests/ingest-xsd/fixtures/main.xsd +++ b/tests/ingest-xsd/fixtures/main.xsd @@ -108,5 +108,26 @@ + + + + + + + + + + + + + + + + + + diff --git a/tests/ingest-xsd/ingest.test.ts 
b/tests/ingest-xsd/ingest.test.ts index 48e7800..36b80f2 100644 --- a/tests/ingest-xsd/ingest.test.ts +++ b/tests/ingest-xsd/ingest.test.ts @@ -15,10 +15,9 @@ const FIXTURES_DIR = join(import.meta.dir, "fixtures"); const REAL_CACHE_DIR = "./data/xsd-cache/ecma-376-transitional"; const realCacheReady = existsSync(join(REAL_CACHE_DIR, "wml.xsd")); -const databaseUrl = process.env.DATABASE_URL; -if (!databaseUrl) { - throw new Error("Missing DATABASE_URL for integration tests"); -} +import { getTestDatabaseUrl } from "../test-db.ts"; + +const databaseUrl = getTestDatabaseUrl(); let db: DbClient; @@ -122,7 +121,9 @@ test("ingest writes inheritance edges for extension and restriction", async () = // ST_Jc restricts xsd:string (simpleType) // ST_OnOff restricts xsd:boolean // ST_String restricts xsd:string - expect(stats.inheritanceEdgesInserted).toBe(6); + // 6 from the original fixture + 2 new restrictions (CT_TrackedRestricted, + // CT_OverrideDerived). + expect(stats.inheritanceEdgesInserted).toBe(8); expect(stats.inheritanceUnresolved).toBe(0); // Verify the CT_Extended → CT_Empty extension edge. @@ -171,12 +172,14 @@ test("ingest is idempotent: re-running adds no new symbols/edges", async () => { db, }); - expect(second.symbolsInserted).toBe(0); - expect(second.symbolsExisting).toBeGreaterThan(0); - expect(second.profileMembershipsInserted).toBe(0); - expect(second.inheritanceEdgesInserted).toBe(0); - // Content-model + attribute passes use delete-and-rewrite, so insert counts - // equal the first run on every re-run; DB row counts stay stable. + // Re-ingest purges everything this source previously wrote and re-creates + // it, so every stat equals the first run and symbolsExisting stays at 0. + // What matters for idempotency is that the DB row counts are stable across + // runs (asserted below). 
+ expect(second.symbolsInserted).toBe(first.symbolsInserted); + expect(second.symbolsExisting).toBe(0); + expect(second.profileMembershipsInserted).toBe(first.profileMembershipsInserted); + expect(second.inheritanceEdgesInserted).toBe(first.inheritanceEdgesInserted); expect(second.compositorsInserted).toBe(first.compositorsInserted); expect(second.childEdgesInserted).toBe(first.childEdgesInserted); expect(second.groupRefsInserted).toBe(first.groupRefsInserted); @@ -332,14 +335,17 @@ test("ingest writes attributes, attributeGroup refs, and enum values", async () }); // Fixture attributes: - // CT_Para/bold (optional, type s:ST_OnOff) - // CT_Extended/extra (optional, type xsd:string, under complexContent/extension) - // AG_TableProps/cols (optional, type xsd:int) - // CT_TableUser/caption (required, type xsd:string) - // CT_RefTest/space (required, ref="s:space"; type/default copied from decl) - // AG_Inner/innerAttr (optional, type xsd:string) - // AG_Outer/outerAttr (optional, type xsd:string) - expect(stats.attrEdgesInserted).toBe(7); + // CT_Para/bold (optional, type s:ST_OnOff) + // CT_Extended/extra (optional, type xsd:string, under complexContent/extension) + // AG_TableProps/cols (optional, type xsd:int) + // CT_TableUser/caption (required, type xsd:string) + // CT_RefTest/space (required, ref="s:space"; type/default copied from decl) + // AG_Inner/innerAttr (optional, type xsd:string) + // AG_Outer/outerAttr (optional, type xsd:string) + // CT_TrackedBase/id (required, type xsd:string) + // CT_TrackedBase/author (optional, type xsd:string) + // CT_OverrideDerived/id (optional override, type xsd:string) + expect(stats.attrEdgesInserted).toBe(10); expect(stats.attrEdgesUnresolved).toBe(0); // Fixture attributeGroup refs: diff --git a/tests/ingest-xsd/parse-schema.test.ts b/tests/ingest-xsd/parse-schema.test.ts index 9feba7d..0e4384d 100644 --- a/tests/ingest-xsd/parse-schema.test.ts +++ b/tests/ingest-xsd/parse-schema.test.ts @@ -85,10 +85,10 @@ 
test("declarationsByQName indexes all top-level declarations across documents", const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] }); const counts = countByKind(set.declarationsByQName); - // main.xsd: 1 element, 13 complexType, 1 simpleType, 1 group, 3 attributeGroup + // main.xsd: 1 element, 16 complexType, 1 simpleType, 1 group, 3 attributeGroup // shared.xsd: 2 simpleType, 1 attribute expect(counts.element).toBe(1); - expect(counts.complexType).toBe(13); + expect(counts.complexType).toBe(16); expect(counts.simpleType).toBe(3); expect(counts.group).toBe(1); expect(counts.attributeGroup).toBe(3); diff --git a/tests/mcp-server/ooxml-queries.test.ts b/tests/mcp-server/ooxml-queries.test.ts index 824c8e0..7d5c5f2 100644 --- a/tests/mcp-server/ooxml-queries.test.ts +++ b/tests/mcp-server/ooxml-queries.test.ts @@ -19,8 +19,9 @@ import { } from "../../apps/mcp-server/src/ooxml-queries.ts"; const FIXTURES_DIR = join(import.meta.dir, "..", "ingest-xsd", "fixtures"); -const databaseUrl = process.env.DATABASE_URL; -if (!databaseUrl) throw new Error("Missing DATABASE_URL for integration tests"); +import { getTestDatabaseUrl } from "../test-db.ts"; + +const databaseUrl = getTestDatabaseUrl(); let db: DbClient; @@ -97,10 +98,19 @@ test("lookupElement: top-level element with type_ref", async () => { expect(hit?.namespaceUri).toBe(WML_NS); }); -test("lookupElement: local element (text inside CT_Para) is in the profile", async () => { +test("lookupElement returns null for local-only names (no qname-addressable identity)", async () => { + // 'text' is declared inline in CT_Para and is not a top-level <xsd:element>. + // Per XSD it has no global qname; reach it via getChildren(CT_Para) instead.
const hit = await lookupElement(db.sql, WML_NS, "text", "transitional"); - expect(hit).not.toBeNull(); - expect(hit?.typeRef).toBe("{http://www.w3.org/2001/XMLSchema}string"); + expect(hit).toBeNull(); +}); + +test("lookupElement returns null for ambiguous local names (the tblGrid case)", async () => { + // 'shared' is declared inline in CT_OuterA (type ST_Jc) and in CT_OuterB + // (type xsd:string). Returning either would be wrong; lookupElement scopes + // by parent_symbol_id IS NULL and refuses to pick one. + const hit = await lookupElement(db.sql, WML_NS, "shared", "transitional"); + expect(hit).toBeNull(); }); test("lookupType: complexType vs simpleType disambiguation", async () => { @@ -287,6 +297,40 @@ test("xsd-builtin symbols have profile membership (lookupSymbolByTypeRef can fol expect(t?.vocabularyId).toBe("xsd-builtin"); }); +test("getAttributes: complexContent/restriction inherits base attributes", async () => { + // CT_TrackedRestricted restricts CT_TrackedBase but redeclares nothing. + // Per XSD §3.4.2.2 the base's attribute uses are inherited; restriction can + // narrow or prohibit but cannot drop silently. + const ct = await lookupType(db.sql, WML_NS, "CT_TrackedRestricted", "transitional"); + if (!ct) throw new Error("CT_TrackedRestricted not found"); + const attrs = await getAttributes(db.sql, ct.id, "transitional"); + const names = attrs.map((a) => a.localName).sort(); + expect(names).toEqual(["author", "id"]); + + const idAttr = attrs.find((a) => a.localName === "id"); + expect(idAttr?.attrUse).toBe("required"); + expect(idAttr?.source).toBe("inherited"); + expect(idAttr?.owningName).toBe("CT_TrackedBase"); +}); + +test("getAttributes: derived redeclaration wins over inherited base attribute", async () => { + // CT_OverrideDerived restricts CT_TrackedBase and overrides 'id' from + // required to optional. The derived's redeclaration must win; the base's + // 'author' should still be inherited unchanged. 
+ const ct = await lookupType(db.sql, WML_NS, "CT_OverrideDerived", "transitional"); + if (!ct) throw new Error("CT_OverrideDerived not found"); + const attrs = await getAttributes(db.sql, ct.id, "transitional"); + + const idAttr = attrs.find((a) => a.localName === "id"); + expect(idAttr?.attrUse).toBe("optional"); + expect(idAttr?.source).toBe("self"); + expect(idAttr?.owningName).toBe("CT_OverrideDerived"); + + const authorAttr = attrs.find((a) => a.localName === "author"); + expect(authorAttr?.attrUse).toBe("optional"); + expect(authorAttr?.source).toBe("inherited"); +}); + test("getAttributes: nested attributeGroup chain unfolds (CT_NestedAttrUser -> innerAttr + outerAttr)", async () => { const ct = await lookupType(db.sql, WML_NS, "CT_NestedAttrUser", "transitional"); if (!ct) throw new Error("CT_NestedAttrUser not found"); diff --git a/tests/test-db.ts b/tests/test-db.ts new file mode 100644 index 0000000..b313640 --- /dev/null +++ b/tests/test-db.ts @@ -0,0 +1,38 @@ +/** + * Shared database guard for integration tests. + * + * The test suites TRUNCATE xsd_* tables aggressively (TRUNCATE ... CASCADE) and + * delete from spec_content's foreign-key sphere. They MUST NOT run against any + * non-local Postgres - in particular, never against a Neon production URL. + * + * Rules: + * 1. TEST_DATABASE_URL must be set explicitly. There is no fallback to + * DATABASE_URL: a developer who accidentally has DATABASE_URL pointed at + * Neon would otherwise wipe their schema graph data on `bun test`. + * 2. The hostname in TEST_DATABASE_URL must be local + * (localhost / 127.0.0.1 / host.docker.internal). + * + * If either rule fails, throw and refuse to run. + */ + +const LOCAL_HOSTS = new Set(["localhost", "127.0.0.1", "host.docker.internal"]); + +export function getTestDatabaseUrl(): string { + const url = process.env.TEST_DATABASE_URL; + if (!url) { + throw new Error( + "TEST_DATABASE_URL is not set. 
Integration tests TRUNCATE xsd_* tables and refuse to run without an explicit test database URL. Example: TEST_DATABASE_URL=postgresql://postgres:postgres@localhost:5432/ecma_spec", + ); + } + + // Extract hostname from a postgres connection string. Avoid `new URL()` on + // `postgresql://` because some Node URL parsers reject the scheme. + const hostMatch = url.match(/@([^/:?]+)/); + const host = (hostMatch?.[1] ?? "").toLowerCase(); + if (!LOCAL_HOSTS.has(host)) { + throw new Error( + `TEST_DATABASE_URL hostname '${host}' is not a local host. Refusing to TRUNCATE against a non-local database. Allowed hosts: ${[...LOCAL_HOSTS].join(", ")}.`, + ); + } + return url; +} From 076f096424f2f6d70898f58d892b1f85f0dd9485 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 16:54:46 -0300 Subject: [PATCH 20/24] chore(mcp): remove ENABLE_OOXML_TOOLS feature flag The flag existed so the new structural tools could land without affecting api.ooxml.dev/mcp's surface until prod was ready. The operational plan is to populate the prod schema graph before merging, at which point the flag is just friction. Drops: - ENABLE_OOXML_TOOLS env var on Env / OoxmlEnv - ooxmlToolsEnabled() and the gating in tools/list and tools/call - The defensive 'method-not-found while flag is off' branch Both tools/list and tools/call now expose the OOXML tools unconditionally. Worker bundle builds clean; 44 / 0 tests still pass. Note: prod populate must run before this merges. The current per-row INSERT pattern is slow against Neon over public internet (~10-20 minutes for the WML closure); batching is the next step operationally. 
--- CLAUDE.md | 6 +++--- apps/mcp-server/src/index.ts | 7 ------- apps/mcp-server/src/mcp.ts | 21 ++++++--------------- apps/mcp-server/src/ooxml-tools.ts | 18 ++---------------- 4 files changed, 11 insertions(+), 41 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 27a439f..36c9535 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -105,15 +105,15 @@ The XML you provide is wrapped in a minimal `w:document > w:body` structure auto ## MCP Server -Cloudflare Worker exposing two flavors of MCP tools backed by the same database: +Cloudflare Worker exposing two flavors of MCP tools backed by the same database. -Always-on (semantic search over the spec PDF): +Semantic search over the spec PDF (powered by `spec_content`): - `search_ecma_spec` - semantic vector search across 18,000+ spec chunks - `get_section` - fetch a specific section by ID (e.g., "17.3.1.24") - `list_parts` - browse the spec structure -Behind `ENABLE_OOXML_TOOLS` (structural queries over the XSD schema graph): +Structural queries over the XSD schema graph (powered by `xsd_*` tables): - `ooxml_lookup_element` / `ooxml_lookup_type` - canonical symbol info - `ooxml_children` - legal children of an element/type/group, in document order diff --git a/apps/mcp-server/src/index.ts b/apps/mcp-server/src/index.ts index 6c37d5c..f50d025 100644 --- a/apps/mcp-server/src/index.ts +++ b/apps/mcp-server/src/index.ts @@ -16,13 +16,6 @@ import { handleMcpRequest } from "./mcp"; export interface Env { DATABASE_URL: string; VOYAGE_API_KEY: string; - /** - * Feature flag for the OOXML structural tools. Set to "true" to expose - * ooxml_lookup_element / ooxml_lookup_type / ooxml_children / - * ooxml_attributes / ooxml_enum / ooxml_namespace_info via tools/list - * and tools/call. Default off. 
- */ - ENABLE_OOXML_TOOLS?: string; } // Part descriptions diff --git a/apps/mcp-server/src/mcp.ts b/apps/mcp-server/src/mcp.ts index 5059151..6af618f 100644 --- a/apps/mcp-server/src/mcp.ts +++ b/apps/mcp-server/src/mcp.ts @@ -7,7 +7,7 @@ import { createDb } from "./db"; import { embedQuery } from "./embeddings"; import type { Env } from "./index"; -import { callOoxmlTool, isOoxmlTool, OOXML_TOOL_DEFS, ooxmlToolsEnabled } from "./ooxml-tools"; +import { callOoxmlTool, isOoxmlTool, OOXML_TOOL_DEFS } from "./ooxml-tools"; // JSON-RPC types interface JsonRpcRequest { @@ -133,12 +133,11 @@ function handleInitialize(id: number | string | null): JsonRpcResponse { }; } -function handleToolsList(id: number | string | null, env: Env): JsonRpcResponse { - const tools = ooxmlToolsEnabled(env) ? [...TOOLS, ...OOXML_TOOL_DEFS] : TOOLS; +function handleToolsList(id: number | string | null): JsonRpcResponse { return { jsonrpc: "2.0", id, - result: { tools }, + result: { tools: [...TOOLS, ...OOXML_TOOL_DEFS] }, }; } @@ -162,17 +161,9 @@ async function handleToolsCall( try { let resultText: string; - // OOXML tools are feature-flagged; tools/list filters them out when the flag - // is off, so callers should not see these tool names. Defensive check here in - // case a caller hand-crafts a request. + // Structural OOXML tools share the dispatch with the existing semantic + // tools below. if (isOoxmlTool(name)) { - if (!ooxmlToolsEnabled(env)) { - return { - jsonrpc: "2.0", - id, - error: { code: METHOD_NOT_FOUND, message: `Unknown tool: ${name}` }, - }; - } resultText = await callOoxmlTool(name, args ?? {}, env); return { jsonrpc: "2.0", @@ -393,7 +384,7 @@ export async function handleMcpRequest(request: Request, env: Env): Promise Date: Mon, 27 Apr 2026 16:59:00 -0300 Subject: [PATCH 21/24] chore: drop ooxml-call dev harness; add data/README; surface structural tools in README - Remove scripts/ooxml-call.ts: 23 query-layer tests cover the same dispatch path. 
The harness was load-bearing only while we were verifying e2e before tests existed. - Add data/README.md describing what each subpath under data/ is for (sources.json committed manifest, xsd-cache gitignored cache, behavior-notes future curated content). - Update README.md to list the structural tools alongside semantic search; both flavors share one MCP endpoint after the prod populate. - Update CLAUDE.md and scripts/ingest-xsd/README.md to drop ooxml:call references; smoke testing now points at tests/mcp-server/. 44/0 still. --- CLAUDE.md | 1 - README.md | 14 +++++---- data/README.md | 27 +++++++++++++++++ package.json | 1 - scripts/ingest-xsd/README.md | 13 +++++++-- scripts/ooxml-call.ts | 56 ------------------------------------ 6 files changed, 46 insertions(+), 66 deletions(-) create mode 100644 data/README.md delete mode 100644 scripts/ooxml-call.ts diff --git a/CLAUDE.md b/CLAUDE.md index 36c9535..367a785 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -37,7 +37,6 @@ scripts/ ingest-xsd/ ECMA XSDs -> schema graph (structural query corpus) sources-sync.ts data/sources.json -> reference_sources db-migrate.ts Apply db/migrations/*.sql in order - ooxml-call.ts Local CLI harness for the structural MCP tools db/ schema.sql PostgreSQL + pgvector + XSD schema graph migrations/ Numbered, idempotent SQL migrations diff --git a/README.md b/README.md index 2f0da7e..768affc 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,10 @@ The OOXML spec, explained by people who actually implemented it. An interactive reference for ECMA-376 (Office Open XML) built by the [SuperDoc — DOCX editing and tooling](https://superdoc.dev) team. Every page combines XML structure, live rendered previews, and implementation notes that tell you what the spec doesn't. -- **Live previews** — Edit XML and see it render in real-time. Every example is a working document. -- **Implementation notes** — Where Word diverges from the spec, what will break your code, and what to do about it. 
-- **Semantic spec search** — 18,000+ spec chunks searchable by meaning via MCP server. +- **Live previews** - Edit XML and see it render in real-time. Every example is a working document. +- **Implementation notes** - Where Word diverges from the spec, what will break your code, and what to do about it. +- **Semantic spec search** - 18,000+ spec chunks searchable by meaning via MCP server. +- **Structural schema lookup** - Element children, attributes, types, enums, namespaces. Same MCP server, deterministic answers from the parsed XSDs. ## Why? @@ -22,13 +23,16 @@ We faced this at SuperDoc — building a document engine on native OOXML with no ## MCP Server -Search the ECMA-376 spec with AI. Ask questions in natural language, get answers grounded in the actual specification. +Ask questions in natural language and get answers grounded in the spec, or query the schema graph for precise structural answers. ```bash claude mcp add --transport http ecma-spec https://api.ooxml.dev/mcp ``` -Works with Claude Code, Cursor, and any MCP-compatible client. Three tools: `search_ecma_spec` (semantic search), `get_section` (by ID), and `list_parts` (browse structure). +Works with Claude Code, Cursor, and any MCP-compatible client. Two flavors of tools share one server: + +- **Semantic** (over the spec PDF): `search_ecma_spec`, `get_section`, `list_parts` +- **Structural** (over the parsed XSDs): `ooxml_lookup_element`, `ooxml_lookup_type`, `ooxml_children`, `ooxml_attributes`, `ooxml_enum`, `ooxml_namespace_info` ## Development diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..31f829d --- /dev/null +++ b/data/README.md @@ -0,0 +1,27 @@ +# data/ + +Repository data root. Three categories live here: + +- **`sources.json`** (committed): canonical source manifest. One entry per + artifact (ECMA-376 PDFs, ECMA Part 4 XSD zip, future MS-OI29500, etc.) with + url, edition, sha256, and a license note. 
`bun run sources:sync` upserts these + rows into the `reference_sources` table. Edit by hand; the sync script reads + it. + +- **`xsd-cache/`** (gitignored): local XSD download cache. Populated by + `bun run xsd:fetch`. Contents are not load-bearing for the schema graph + itself - the graph lives in Postgres - they're just the source artifacts + the ingest reads. Safe to delete; regenerated on the next fetch. + +- **`behavior-notes/`** (committed when populated): curated YAML files + documenting how Microsoft Office actually behaves vs. the spec. A future + ingest will load these into the `behavior_notes` table so structural tool + responses can carry "what Word actually does" alongside the schema-level + answer. Empty until that workflow lands. + +What does NOT live here: + +- Generated build output: `dist/`, `dev/data/extracted/`, `dev/data/chunks/`, + `dev/data/embedded/` (all under `dev/`, gitignored). +- Database state: lives in Postgres; reproducible from the manifest + + ingest scripts. diff --git a/package.json b/package.json index 0116521..1d4317b 100644 --- a/package.json +++ b/package.json @@ -28,7 +28,6 @@ "pdf:setup": "pip install -r scripts/requirements.txt", "xsd:fetch": "bun scripts/ingest-xsd/fetch.ts", "xsd:ingest": "bun scripts/ingest-xsd/ingest.ts", - "ooxml:call": "bun scripts/ooxml-call.ts", "test": "export TEST_DATABASE_URL=${TEST_DATABASE_URL:-postgresql://postgres:postgres@localhost:5432/ecma_spec} && bun test tests/db/ && bun test tests/ingest-xsd/ && bun test tests/mcp-server/" }, "devDependencies": { diff --git a/scripts/ingest-xsd/README.md b/scripts/ingest-xsd/README.md index 464421b..72253ce 100644 --- a/scripts/ingest-xsd/README.md +++ b/scripts/ingest-xsd/README.md @@ -66,8 +66,15 @@ bun run xsd:ingest --schema-dir --entrypoint \ ## Smoke-test the result +The query layer is exercised by `tests/mcp-server/ooxml-queries.test.ts` +against the same fixtures the ingest tests use. 
Run with: + ```bash -bun run ooxml:call ooxml_children '{"qname":"w:tbl"}' -bun run ooxml:call ooxml_attributes '{"qname":"w:pBdr"}' -bun run ooxml:call ooxml_enum '{"qname":"w:ST_Jc"}' +bun test tests/mcp-server/ ``` + +To hit the live MCP, deploy the Worker and call the tools through any +MCP client. For local poking against the dev DB, write a small bun +script that imports `runOoxmlTool` from +`apps/mcp-server/src/ooxml-tools.ts` with a `postgres.js`-backed sql +function. diff --git a/scripts/ooxml-call.ts b/scripts/ooxml-call.ts deleted file mode 100644 index 2594b24..0000000 --- a/scripts/ooxml-call.ts +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Local end-to-end harness for the OOXML structural tools. - * - * The deployed Worker uses @neondatabase/serverless (HTTP-only), which can't - * talk to local Postgres. This CLI bypasses the Worker and dispatches through - * `runOoxmlTool` directly with a postgres.js-backed sql function, so the same - * code path that the Worker exercises runs end-to-end against the dev DB. - * - * Usage: - * bun run ooxml:call - * bun run ooxml:call ooxml_children '{"qname":"w:tbl"}' - * bun run ooxml:call ooxml_attributes '{"qname":"w:pBdr"}' - * bun run ooxml:call ooxml_enum '{"qname":"w:ST_Jc"}' - * - * Environment: - * DATABASE_URL - postgres connection string (defaults to local docker) - */ - -import { - isOoxmlTool, - type OoxmlToolName, - runOoxmlTool, -} from "../apps/mcp-server/src/ooxml-tools.ts"; -import { createDbClient } from "../packages/shared/src/db/index.ts"; - -async function main() { - const [, , toolArg, argsArg] = process.argv; - if (!toolArg) { - console.error("Usage: bun scripts/ooxml-call.ts [jsonArgs]"); - console.error("Tools: ooxml_lookup_element, ooxml_lookup_type, ooxml_children,"); - console.error(" ooxml_attributes, ooxml_enum, ooxml_namespace_info"); - process.exit(1); - } - if (!isOoxmlTool(toolArg)) { - console.error(`Unknown tool: ${toolArg}`); - process.exit(1); - } - - const args: Record = argsArg ? 
JSON.parse(argsArg) : {}; - - const databaseUrl = - process.env.DATABASE_URL ?? "postgresql://postgres:postgres@localhost:5432/ecma_spec"; - const db = createDbClient(databaseUrl); - - try { - const text = await runOoxmlTool(toolArg as OoxmlToolName, args, db.sql); - console.log(text); - } finally { - await db.close(); - } -} - -main().catch((err) => { - console.error(err); - process.exit(1); -}); From b313a9e802b0d270b0f4c0d69587fa8abc62c0a0 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 18:09:15 -0300 Subject: [PATCH 22/24] feat(xsd): default fetch URL + sha256 to data/sources.json `bun run xsd:fetch` no longer requires --url. The script reads the URL and expected sha256 from data/sources.json's ecma-376-transitional entry by default; CLI flags and XSD_PART4_URL still override for testing a new edition before pinning it. Common case becomes a single command: bun run xsd:fetch The manifest is the canonical pin (already used to upsert reference_sources via sources:sync), so making it the default for the fetch script keeps a single source of truth instead of asking contributors to remember a long URL or paste it into a .env. Docs (CLAUDE.md, scripts/ingest-xsd/README.md) updated to show the short form and explain how to override. 
--- CLAUDE.md | 2 +- scripts/ingest-xsd/README.md | 17 +++++++--- scripts/ingest-xsd/fetch.ts | 64 +++++++++++++++++++++++++++--------- 3 files changed, 62 insertions(+), 21 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 367a785..0ec3ed4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -145,7 +145,7 @@ ECMA Part 4 zip → fetch+verify (sha256) → parse → ingest (single transacti ``` ```bash -bun run xsd:fetch --url --expected-sha256 +bun run xsd:fetch # URL + sha256 from data/sources.json bun run xsd:ingest ``` diff --git a/scripts/ingest-xsd/README.md b/scripts/ingest-xsd/README.md index 72253ce..78cdba2 100644 --- a/scripts/ingest-xsd/README.md +++ b/scripts/ingest-xsd/README.md @@ -24,15 +24,22 @@ Transitional XSDs (`wml.xsd`, `dml-main.xsd`, `sml.xsd`, `pml.xsd`, `shared-*.xsd`, ...). ```bash -bun run xsd:fetch \ - --url 'https://ecma-international.org/wp-content/uploads/ECMA-376-4_5th_edition_december_2016.zip' \ - --expected-sha256 'bd25da1109f73762356596918bf5ff8b74a1331642dba5f1c1d1dfc6bed34ecd' +bun run xsd:fetch ``` -The script verifies the outer-zip sha256, extracts the inner zip, and lands -the XSDs in `data/xsd-cache/ecma-376-transitional/`. The cache is gitignored; +URL and sha256 are read from `data/sources.json`'s `ecma-376-transitional` +entry (currently pinned to ECMA-376 5th edition, December 2016). The script +verifies the outer-zip sha256, extracts the inner zip, and lands the XSDs +in `data/xsd-cache/ecma-376-transitional/`. The cache is gitignored; nothing binary lands in the repo. 
+To test a new edition before pinning it: + +```bash +bun run xsd:fetch -- --url <url> # override URL +bun run xsd:fetch -- --expected-sha256 <sha256> # override hash +``` + ## Ingest ```bash diff --git a/scripts/ingest-xsd/fetch.ts b/scripts/ingest-xsd/fetch.ts index 4f6bd43..1484601 100644 --- a/scripts/ingest-xsd/fetch.ts +++ b/scripts/ingest-xsd/fetch.ts @@ -6,21 +6,20 @@ * which in turn contains the 26 Transitional XSDs (wml.xsd, dml-main.xsd, * sml.xsd, pml.xsd, shared-*.xsd, and friends). * + * URL and sha256 are read from data/sources.json's ecma-376-transitional + * entry by default. CLI flags and env vars override; useful for testing a + * new edition before pinning it in the manifest. + * * Cache layout: * data/xsd-cache/ * _staging/ (outer + inner zip extraction scratch) * ecma-376-transitional/ (final XSDs land here) * * Usage: - * bun scripts/ingest-xsd/fetch.ts --url <url> - * bun scripts/ingest-xsd/fetch.ts --url <url> --expected-sha256 <sha256> - * - * Or via env: - * XSD_PART4_URL=<url> bun scripts/ingest-xsd/fetch.ts - * - * After a successful fetch the script prints the outer-zip sha256; - * paste it into data/sources.json under the ecma-376-transitional entry - * to pin reproducibility. 
+ * bun run xsd:fetch (manifest default) + * bun run xsd:fetch -- --url <url> (override URL) + * bun run xsd:fetch -- --expected-sha256 <sha256> (override hash) + * XSD_PART4_URL=<url> bun run xsd:fetch (override via env) */ import { createHash } from "node:crypto"; @@ -39,22 +38,57 @@ interface Args { innerZip: string; } -function parseArgs(): Args { +interface SourceManifestEntry { + name: string; + url?: string; + sha256?: string | null; +} + +interface SourceManifest { + sources: SourceManifestEntry[]; +} + +async function loadManifestDefault(): Promise<{ url: string | null; sha256: string | null }> { + try { + const raw = await Bun.file("./data/sources.json").text(); + const manifest = JSON.parse(raw) as SourceManifest; + const ecma = manifest.sources?.find((s) => s.name === "ecma-376-transitional"); + return { + url: ecma?.url ?? null, + sha256: ecma?.sha256 ?? null, + }; + } catch { + return { url: null, sha256: null }; + } +} + +async function parseArgs(): Promise<Args> { const argv = process.argv.slice(2); - let url = process.env.XSD_PART4_URL ?? ""; + let url: string | null = process.env.XSD_PART4_URL ?? null; let expectedSha256: string | null = null; let innerZip = DEFAULT_INNER_ZIP; for (let i = 0; i < argv.length; i++) { const a = argv[i]; - if (a === "--url") url = argv[++i] ?? ""; + if (a === "--url") url = argv[++i] ?? null; else if (a === "--expected-sha256") expectedSha256 = argv[++i] ?? null; else if (a === "--inner-zip") innerZip = argv[++i] ?? DEFAULT_INNER_ZIP; } + // Fall back to the manifest for any unset values. data/sources.json is + // the canonical pin; we treat it as the default config so the common case + // is just `bun run xsd:fetch`. 
+ if (!url || !expectedSha256) { + const fromManifest = await loadManifestDefault(); + if (!url) url = fromManifest.url; + if (!expectedSha256) expectedSha256 = fromManifest.sha256; + } + if (!url) { - console.error("Missing --url (or XSD_PART4_URL env var)."); - console.error("Pass the canonical ECMA-376 5th edition Part 4 zip URL."); + console.error( + "No URL configured. Set 'url' on the ecma-376-transitional entry in data/sources.json,", + ); + console.error("or pass --url / XSD_PART4_URL."); process.exit(1); } return { url, expectedSha256, innerZip }; @@ -100,7 +134,7 @@ function findFile(dir: string, name: string): string | null { } async function main() { - const args = parseArgs(); + const args = await parseArgs(); await rm(STAGING_DIR, { recursive: true, force: true }); await rm(FINAL_DIR, { recursive: true, force: true }); From aee4d87b000c7c3bff6d3b61598c5078b526d3b8 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 18:13:52 -0300 Subject: [PATCH 23/24] feat(sources): pin all four ECMA-376 parts in the manifest Replace the single `ecma-376` placeholder with four part-specific entries (`ecma-376-part1` through `ecma-376-part4`), each pinned with URL, edition (5th), publication date, and sha256. Part 2 is the 2021 revision; Part 3 is from 2015; Parts 1 and 4 are 2016 - reflected in each entry's version field. The Part 4 zip URL is shared with the existing ecma-376-transitional entry (the XSD zip is extracted from inside Part 4); both rows pin the same outer-zip sha256. scripts/sources-sync.ts now backfills spec_content.source_id by part_number, mapping each row to the matching ecma-376-partN source. The previous backfill targeted a single `ecma-376` source which no longer exists. Migration 0005 cleans up the legacy `ecma-376` placeholder row from reference_sources, but only if no spec_content row references it (safe for a developer who had already backfilled to the placeholder id; idempotent). 44 / 0 tests still passing. 
--- data/sources.json | 37 +++++++++++++++++++--- db/migrations/0005_drop_legacy_ecma376.sql | 17 ++++++++++ scripts/sources-sync.ts | 23 +++++++++----- 3 files changed, 64 insertions(+), 13 deletions(-) create mode 100644 db/migrations/0005_drop_legacy_ecma376.sql diff --git a/data/sources.json b/data/sources.json index 7a77b27..4503849 100644 --- a/data/sources.json +++ b/data/sources.json @@ -2,13 +2,40 @@ "$comment": "Source manifest. Human-edited; scripts/sources-sync.ts upserts these rows into reference_sources.", "sources": [ { - "name": "ecma-376", + "name": "ecma-376-part1", "kind": "spec_pdf", - "edition": "unknown", - "version": null, - "url": "https://ecma-international.org/publications-and-standards/standards/ecma-376/", + "edition": "5th", + "version": "2016-12", + "url": "https://ecma-international.org/wp-content/uploads/ECMA-376-1_5th_edition_december_2016.zip", + "license_note": "Published by Ecma International. See the ECMA-376 publications page for the current download and licensing terms before redistribution.", + "sha256": "9d0bcad9cf06054785b03762fcfadbf6bab7e54a5f9d69434e34b7fd464d4129" + }, + { + "name": "ecma-376-part2", + "kind": "spec_pdf", + "edition": "5th", + "version": "2021-12", + "url": "https://ecma-international.org/wp-content/uploads/ECMA-376-2_5th_edition_december_2021.zip", + "license_note": "Published by Ecma International. See the ECMA-376 publications page for the current download and licensing terms before redistribution.", + "sha256": "1d489dc491168ea1f9e9a59063acc8dd5f02b4ad1d21aa7ec19ba9a58d020c70" + }, + { + "name": "ecma-376-part3", + "kind": "spec_pdf", + "edition": "5th", + "version": "2015-12", + "url": "https://ecma-international.org/wp-content/uploads/ECMA-376-3_5th_edition_december_2015.zip", + "license_note": "Published by Ecma International. 
See the ECMA-376 publications page for the current download and licensing terms before redistribution.", + "sha256": "42294159fbbbe9393ccadac95b859d7729cc68d908898bcbe31034dda059daa8" + }, + { + "name": "ecma-376-part4", + "kind": "spec_pdf", + "edition": "5th", + "version": "2016-12", + "url": "https://ecma-international.org/wp-content/uploads/ECMA-376-4_5th_edition_december_2016.zip", "license_note": "Published by Ecma International. See the ECMA-376 publications page for the current download and licensing terms before redistribution.", - "sha256": null + "sha256": "bd25da1109f73762356596918bf5ff8b74a1331642dba5f1c1d1dfc6bed34ecd" }, { "name": "ecma-376-transitional", diff --git a/db/migrations/0005_drop_legacy_ecma376.sql b/db/migrations/0005_drop_legacy_ecma376.sql new file mode 100644 index 0000000..89afb4a --- /dev/null +++ b/db/migrations/0005_drop_legacy_ecma376.sql @@ -0,0 +1,17 @@ +-- Drop the legacy `ecma-376` placeholder source row. +-- +-- An earlier version of data/sources.json had a single placeholder +-- (`ecma-376`, edition=unknown, sha256=null) that stood in for the whole +-- spec corpus before per-part entries existed. The manifest now pins the +-- four ECMA-376 parts individually (`ecma-376-partN`), so the placeholder +-- is obsolete. +-- +-- This migration only deletes the row when nothing in spec_content +-- references it, so a developer who already backfilled source_id to the +-- legacy id stays safe. Idempotent. 
+ +DELETE FROM reference_sources +WHERE name = 'ecma-376' + AND NOT EXISTS ( + SELECT 1 FROM spec_content WHERE spec_content.source_id = reference_sources.id + ); diff --git a/scripts/sources-sync.ts b/scripts/sources-sync.ts index ecf48b5..b86e565 100644 --- a/scripts/sources-sync.ts +++ b/scripts/sources-sync.ts @@ -68,16 +68,23 @@ async function main() { ); } - const [ecma] = await sql<[{ id: number } | undefined]>` - SELECT id FROM reference_sources WHERE name = 'ecma-376' LIMIT 1 - `; - if (ecma) { + // Backfill spec_content.source_id by part_number to the matching + // ecma-376-partN row. Idempotent: only touches rows where source_id IS NULL. + for (let part = 1; part <= 4; part++) { + const sourceName = `ecma-376-part${part}`; + const [src] = await sql<[{ id: number } | undefined]>` + SELECT id FROM reference_sources WHERE name = ${sourceName} LIMIT 1 + `; + if (!src) continue; const result = await sql` - UPDATE spec_content SET source_id = ${ecma.id} WHERE source_id IS NULL + UPDATE spec_content SET source_id = ${src.id} + WHERE part_number = ${part} AND source_id IS NULL `; - console.log(`Backfilled ${result.count} spec_content row(s) -> source_id=${ecma.id}`); - } else { - console.warn("No ecma-376 source row found; skipped spec_content backfill."); + if (result.count > 0) { + console.log( + `Backfilled ${result.count} spec_content row(s) (part ${part}) -> source_id=${src.id}`, + ); + } } } finally { await db.close(); From 07f1086a98267e4756198ab12cb197a99b582e9a Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 27 Apr 2026 18:18:04 -0300 Subject: [PATCH 24/24] chore(xsd): fix stale db:sync-sources reference in ingest error message --- scripts/ingest-xsd/ingest.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ingest-xsd/ingest.ts b/scripts/ingest-xsd/ingest.ts index 71a6054..f8adc2d 100644 --- a/scripts/ingest-xsd/ingest.ts +++ b/scripts/ingest-xsd/ingest.ts @@ -636,7 +636,7 @@ async function lookupSourceId(sql: Sql, name: 
string): Promise<number> { const [row] = await sql`SELECT id FROM reference_sources WHERE name = ${name} LIMIT 1`; if (!row) throw new Error( - `reference_sources row not found for name='${name}'. Run db:sync-sources first.`, + `reference_sources row not found for name='${name}'. Run \`bun run sources:sync\` first.`, ); return row.id; }