diff --git a/ingest/document_importer.py b/ingest/document_importer.py index eaad9a5..25590e8 100644 --- a/ingest/document_importer.py +++ b/ingest/document_importer.py @@ -15,6 +15,60 @@ from ingest.utils.chunking import chunk_markdown_lines from psycopg.sql import SQL, Identifier +# Strict allow-list pattern for CREATE INDEX statements returned by +# pg_get_indexdef(). PostgreSQL renders these in a canonical form like: +# CREATE [UNIQUE] INDEX ON . [USING ] (...) +# We re-execute these strings verbatim in finalize_database(), so we must +# refuse anything that does not match the expected shape and target table. +# This guards against a second-order SQL injection where a malicious or +# unexpected index definition stored in the catalog (e.g. by a user with +# DDL access) would otherwise be executed with the importer's privileges. +_INDEX_DEF_RE = re.compile( + r""" + ^\s*CREATE\s+(?:UNIQUE\s+)?INDEX\s+ # CREATE [UNIQUE] INDEX + (?:IF\s+NOT\s+EXISTS\s+)? + "?[A-Za-z_][A-Za-z0-9_]*"?\s+ # index name (optionally quoted) + ON\s+ + (?:"?docs"?\.)?"?(?P
[A-Za-z_][A-Za-z0-9_]*)"? # [docs.]
+ \s+ + """, + re.IGNORECASE | re.VERBOSE, +) + + +def _validate_index_def(index_def: str, expected_table: str) -> str: + """Validate that ``index_def`` is a CREATE INDEX targeting ``expected_table``. + + Returns the original string on success; raises :class:`ValueError` + otherwise. Rejects stacked statements (semicolons outside the trailing + terminator) so a crafted catalog entry cannot smuggle additional SQL. + """ + if not isinstance(index_def, str): + raise ValueError(f"Unexpected index definition type: {type(index_def)!r}") + + # Strip a single trailing semicolon if present, then ensure no further + # statement separators remain. + stripped = index_def.strip().rstrip(";").strip() + if ";" in stripped: + raise ValueError( + f"Refusing to execute multi-statement index definition: {index_def!r}" + ) + + match = _INDEX_DEF_RE.match(stripped) + if not match: + raise ValueError( + f"Index definition does not match expected CREATE INDEX shape: {index_def!r}" + ) + + target_table = match.group("table") + if target_table != expected_table: + raise ValueError( + "Index definition targets unexpected table " + f"{target_table!r} (expected {expected_table!r}): {index_def!r}" + ) + + return stripped + class DocumentImporter(ABC): def __init__(self, version: str | int, pages_table: str, chunks_table: str): @@ -131,7 +185,10 @@ def init_database(self, conn: psycopg.Connection) -> None: print("Initializing database tables...") # Capture existing index definitions so we can recreate them after swap. - self._index_defs_to_create = [ + # Each definition is validated against a strict allow-list before being + # stored, so finalize_database() can never execute an unexpected + # statement even if the catalog is tampered with between the two phases. + raw_index_defs = [ row[0] for row in conn.execute( """ @@ -144,6 +201,9 @@ def init_database(self, conn: psycopg.Connection) -> None: [self.chunks_table], ).fetchall() ] + self._index_defs_to_create = [ + _validate_index_def(d, self.chunks_table) for d in raw_index_defs + ] conn.execute(f"DROP TABLE IF EXISTS docs.{self.chunks_table}_tmp CASCADE") conn.execute(f"DROP TABLE IF EXISTS docs.{self.pages_table}_tmp CASCADE") @@ -203,7 +263,11 @@ def finalize_database(self, conn: psycopg.Connection) -> None: ) for index_def in self._index_defs_to_create: - cur.execute(index_def) + # Re-validate just before execution as defence in depth: the + # list is only ever populated through _validate_index_def in + # init_database(), but we don't want to depend on that + # invariant if the class is subclassed or refactored. + cur.execute(_validate_index_def(index_def, self.chunks_table)) for table in [self.pages_table, self.chunks_table]: cur.execute( diff --git a/ingest/tests/__init__.py b/ingest/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ingest/tests/test_index_def_validation.py b/ingest/tests/test_index_def_validation.py new file mode 100644 index 0000000..534a28b --- /dev/null +++ b/ingest/tests/test_index_def_validation.py @@ -0,0 +1,57 @@ +"""Tests for CREATE INDEX statement validation in document_importer. + +These guard against second-order SQL injection: ``finalize_database`` re-runs +the ``indexdef`` strings captured from ``pg_indexes`` during ``init_database``. +A malicious or unexpected catalog entry must not be able to smuggle additional +SQL or target a table other than the importer's own chunks table. +""" + +from __future__ import annotations + +import pytest +from ingest.document_importer import _validate_index_def + + +@pytest.mark.parametrize( + "index_def", + [ + "CREATE UNIQUE INDEX chunks_pkey ON docs.chunks USING btree (id)", + "CREATE INDEX idx_chunks_page_id ON docs.chunks USING btree (page_id)", + ( + "CREATE INDEX chunks_bm25_idx ON docs.chunks " + "USING bm25 (id, content) WITH (key_field=id)" + ), + # Trailing semicolon is acceptable (some catalog renderings include it). + "CREATE INDEX idx_chunks_page_id ON docs.chunks USING btree (page_id);", + ], +) +def test_accepts_canonical_create_index(index_def: str) -> None: + result = _validate_index_def(index_def, "chunks") + assert result.lower().startswith("create") + assert "docs.chunks" in result.lower() or "chunks" in result.lower() + + +@pytest.mark.parametrize( + "index_def", + [ + # Stacked-statement injection. + "CREATE INDEX x ON docs.chunks (id); DROP TABLE docs.pages; --", + "CREATE INDEX x ON docs.chunks (id); CREATE INDEX y ON docs.chunks(id)", + # Wrong target table — refuses to touch anything other than chunks. + "CREATE INDEX x ON docs.other_table (id)", + "CREATE INDEX x ON docs.pg_class (id)", + # Outright non-CREATE-INDEX DDL. + "DROP TABLE docs.chunks", + "ALTER TABLE docs.chunks ADD COLUMN evil text", + "SELECT pg_sleep(10)", + "", + ], +) +def test_rejects_malicious_or_unexpected_definitions(index_def: str) -> None: + with pytest.raises(ValueError): + _validate_index_def(index_def, "chunks") + + +def test_rejects_non_string() -> None: + with pytest.raises(ValueError): + _validate_index_def(None, "chunks") # type: ignore[arg-type]