Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 66 additions & 2 deletions ingest/document_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,60 @@
from ingest.utils.chunking import chunk_markdown_lines
from psycopg.sql import SQL, Identifier

# Strict allow-list pattern for CREATE INDEX statements returned by
# pg_get_indexdef(). PostgreSQL renders these in a canonical form like:
# CREATE [UNIQUE] INDEX <name> ON <schema>.<table> [USING <method>] (...)
# We re-execute these strings verbatim in finalize_database(), so we must
# refuse anything that does not match the expected shape and target table.
# This guards against a second-order SQL injection where a malicious or
# unexpected index definition stored in the catalog (e.g. by a user with
# DDL access) would otherwise be executed with the importer's privileges.
_INDEX_DEF_RE = re.compile(
r"""
^\s*CREATE\s+(?:UNIQUE\s+)?INDEX\s+ # CREATE [UNIQUE] INDEX
(?:IF\s+NOT\s+EXISTS\s+)?
"?[A-Za-z_][A-Za-z0-9_]*"?\s+ # index name (optionally quoted)
ON\s+
(?:"?docs"?\.)?"?(?P<table>[A-Za-z_][A-Za-z0-9_]*)"? # [docs.]<table>
\s+
""",
re.IGNORECASE | re.VERBOSE,
)


def _validate_index_def(index_def: str, expected_table: str) -> str:
"""Validate that ``index_def`` is a CREATE INDEX targeting ``expected_table``.

Returns the original string on success; raises :class:`ValueError`
otherwise. Rejects stacked statements (semicolons outside the trailing
terminator) so a crafted catalog entry cannot smuggle additional SQL.
"""
if not isinstance(index_def, str):
raise ValueError(f"Unexpected index definition type: {type(index_def)!r}")

# Strip a single trailing semicolon if present, then ensure no further
# statement separators remain.
stripped = index_def.strip().rstrip(";").strip()
if ";" in stripped:
raise ValueError(
f"Refusing to execute multi-statement index definition: {index_def!r}"
)

match = _INDEX_DEF_RE.match(stripped)
if not match:
raise ValueError(
f"Index definition does not match expected CREATE INDEX shape: {index_def!r}"
)

target_table = match.group("table")
if target_table != expected_table:
raise ValueError(
"Index definition targets unexpected table "
f"{target_table!r} (expected {expected_table!r}): {index_def!r}"
)

return stripped


class DocumentImporter(ABC):
def __init__(self, version: str | int, pages_table: str, chunks_table: str):
Expand Down Expand Up @@ -131,7 +185,10 @@ def init_database(self, conn: psycopg.Connection) -> None:
print("Initializing database tables...")

# Capture existing index definitions so we can recreate them after swap.
self._index_defs_to_create = [
# Each definition is validated against a strict allow-list before being
# stored, so finalize_database() can never execute an unexpected
# statement even if the catalog is tampered with between the two phases.
raw_index_defs = [
row[0]
for row in conn.execute(
"""
Expand All @@ -144,6 +201,9 @@ def init_database(self, conn: psycopg.Connection) -> None:
[self.chunks_table],
).fetchall()
]
self._index_defs_to_create = [
_validate_index_def(d, self.chunks_table) for d in raw_index_defs
]

conn.execute(f"DROP TABLE IF EXISTS docs.{self.chunks_table}_tmp CASCADE")
conn.execute(f"DROP TABLE IF EXISTS docs.{self.pages_table}_tmp CASCADE")
Expand Down Expand Up @@ -203,7 +263,11 @@ def finalize_database(self, conn: psycopg.Connection) -> None:
)

for index_def in self._index_defs_to_create:
cur.execute(index_def)
# Re-validate just before execution as defence in depth: the
# list is only ever populated through _validate_index_def in
# init_database(), but we don't want to depend on that
# invariant if the class is subclassed or refactored.
cur.execute(_validate_index_def(index_def, self.chunks_table))

for table in [self.pages_table, self.chunks_table]:
cur.execute(
Expand Down
Empty file added ingest/tests/__init__.py
Empty file.
57 changes: 57 additions & 0 deletions ingest/tests/test_index_def_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Tests for CREATE INDEX statement validation in document_importer.

These guard against second-order SQL injection: ``finalize_database`` re-runs
the ``indexdef`` strings captured from ``pg_indexes`` during ``init_database``.
A malicious or unexpected catalog entry must not be able to smuggle additional
SQL or target a table other than the importer's own chunks table.
"""

from __future__ import annotations

import pytest
from ingest.document_importer import _validate_index_def


@pytest.mark.parametrize(
"index_def",
[
"CREATE UNIQUE INDEX chunks_pkey ON docs.chunks USING btree (id)",
"CREATE INDEX idx_chunks_page_id ON docs.chunks USING btree (page_id)",
(
"CREATE INDEX chunks_bm25_idx ON docs.chunks "
"USING bm25 (id, content) WITH (key_field=id)"
),
# Trailing semicolon is acceptable (some catalog renderings include it).
"CREATE INDEX idx_chunks_page_id ON docs.chunks USING btree (page_id);",
],
)
def test_accepts_canonical_create_index(index_def: str) -> None:
result = _validate_index_def(index_def, "chunks")
assert result.lower().startswith("create")
assert "docs.chunks" in result.lower() or "chunks" in result.lower()


@pytest.mark.parametrize(
"index_def",
[
# Stacked-statement injection.
"CREATE INDEX x ON docs.chunks (id); DROP TABLE docs.pages; --",
"CREATE INDEX x ON docs.chunks (id); CREATE INDEX y ON docs.chunks(id)",
# Wrong target table — refuses to touch anything other than chunks.
"CREATE INDEX x ON docs.other_table (id)",
"CREATE INDEX x ON docs.pg_class (id)",
# Outright non-CREATE-INDEX DDL.
"DROP TABLE docs.chunks",
"ALTER TABLE docs.chunks ADD COLUMN evil text",
"SELECT pg_sleep(10)",
"",
],
)
def test_rejects_malicious_or_unexpected_definitions(index_def: str) -> None:
with pytest.raises(ValueError):
_validate_index_def(index_def, "chunks")


def test_rejects_non_string() -> None:
with pytest.raises(ValueError):
_validate_index_def(None, "chunks") # type: ignore[arg-type]