Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -931,6 +931,14 @@
)
)

# Maximum uncompressed size (in bytes) for a single document member that the V2
# corpus-export reingest path will read into memory. Larger members fall back to
# the baked import path instead of risking Celery worker exhaustion. Set to 0 to
# disable the per-member guard.
MAX_CORPUS_REINGEST_SOURCE_BYTES = int(
env("MAX_CORPUS_REINGEST_SOURCE_BYTES", default=str(256 * 1024 * 1024))
)

# Chunked (resumable) upload limits
# ------------------------------------------------------------------------------
# Back the /api/imports/chunked/* endpoints, which slice a large file into
Expand Down
42 changes: 39 additions & 3 deletions opencontractserver/tasks/import_tasks_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import zipfile
from typing import IO, TYPE_CHECKING, Any, cast

from django.conf import settings
from django.contrib.auth import get_user_model
from django.db import IntegrityError, transaction
from django.utils import timezone
Expand Down Expand Up @@ -84,6 +85,42 @@
_NUL_SOURCE_PLACEHOLDER = b"\x00"


def _read_reingest_source_bytes(
import_zip: zipfile.ZipFile, doc_filename: str
) -> bytes | None:
"""Read a reingest source member only when its expanded size is safe.

User-facing corpus-export imports default to reingest mode, so a crafted ZIP
can otherwise force Celery workers to allocate memory proportional to an
uncompressed member via ``ZipExtFile.read()``. Return ``None`` to make the
caller use the baked-import fallback instead.
"""
max_source_bytes = getattr(settings, "MAX_CORPUS_REINGEST_SOURCE_BYTES", 0)
zip_info = import_zip.getinfo(doc_filename)
if max_source_bytes and zip_info.file_size > max_source_bytes:
logger.warning(
"Skipping reingest for %s: uncompressed ZIP member size %s exceeds "
"MAX_CORPUS_REINGEST_SOURCE_BYTES=%s; using baked import fallback.",
doc_filename,
zip_info.file_size,
max_source_bytes,
)
return None

with import_zip.open(zip_info) as fh:
source_bytes = fh.read(max_source_bytes + 1 if max_source_bytes else -1)
if max_source_bytes and len(source_bytes) > max_source_bytes:
logger.warning(
"Skipping reingest for %s: ZIP member expanded beyond "
"MAX_CORPUS_REINGEST_SOURCE_BYTES=%s while reading; using baked "
"import fallback.",
doc_filename,
max_source_bytes,
)
return None
return source_bytes


def import_corpus_v2_from_bytes(
zip_source: IO[bytes],
user_id: int,
Expand Down Expand Up @@ -311,9 +348,8 @@ def _import_document_with_annotations(
# relationship fan-in can resolve this doc's annotation ids.
reingest_fallback = False
if reingest_and_remap:
with import_zip.open(doc_filename) as fh:
source_bytes = fh.read()
if _source_is_reingestable(source_bytes):
source_bytes = _read_reingest_source_bytes(import_zip, doc_filename)
if source_bytes is not None and _source_is_reingestable(source_bytes):
return _reingest_document_with_deferred_remap(
doc_filename,
doc_data,
Expand Down
74 changes: 73 additions & 1 deletion opencontractserver/tests/test_import_v2_reingest_remap.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from django.contrib.auth import get_user_model
from django.core.files.base import ContentFile
from django.db import transaction
from django.test import TestCase, TransactionTestCase
from django.test import TestCase, TransactionTestCase, override_settings
from django.utils import timezone

from opencontractserver.annotations.models import (
Expand Down Expand Up @@ -136,6 +136,78 @@ def test_real_bytes_are_reingestable(self):
self.assertTrue(_source_is_reingestable(b"%PDF-1.4 ..."))
self.assertTrue(_source_is_reingestable(b"plain text body"))

@override_settings(MAX_CORPUS_REINGEST_SOURCE_BYTES=4)
def test_read_reingest_source_bytes_rejects_oversized_zip_member(self):
import io
import zipfile

from opencontractserver.tasks.import_tasks_v2 import (
_read_reingest_source_bytes,
)

buf = io.BytesIO()
with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
zf.writestr("documents/large.pdf", b"%PDF-1.4 large")
buf.seek(0)

with zipfile.ZipFile(buf) as zf:
self.assertIsNone(_read_reingest_source_bytes(zf, "documents/large.pdf"))

@override_settings(MAX_CORPUS_REINGEST_SOURCE_BYTES=32)
def test_read_reingest_source_bytes_allows_member_under_limit(self):
import io
import zipfile

from opencontractserver.tasks.import_tasks_v2 import (
_read_reingest_source_bytes,
)

buf = io.BytesIO()
with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
zf.writestr("documents/small.pdf", b"%PDF-1.4 small")
buf.seek(0)

with zipfile.ZipFile(buf) as zf:
self.assertEqual(
_read_reingest_source_bytes(zf, "documents/small.pdf"),
b"%PDF-1.4 small",
)

@override_settings(MAX_CORPUS_REINGEST_SOURCE_BYTES=4)
def test_read_reingest_source_bytes_rejects_metadata_lie_on_read(self):
"""Second guard fires when ZIP metadata under-reports file_size but the
actual read returns more than MAX_CORPUS_REINGEST_SOURCE_BYTES bytes.

A crafted ZIP can set a small file_size in the central directory while
storing larger data, bypassing the first (metadata) guard. The second
guard catches this by checking len(source_bytes) after the bounded read.
"""
import io
import zipfile

from opencontractserver.tasks.import_tasks_v2 import (
_read_reingest_source_bytes,
)

# 10-byte stored entry: compress_size == 10.
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_STORED) as zf:
zf.writestr("documents/liar.pdf", b"%PDF-1.4 x")
buf.seek(0)

with zipfile.ZipFile(buf) as real_zf:
# ZipFile.getinfo() returns the live ZipInfo from NameToInfo, so
# modifying it in-place is seen by the subsequent getinfo() call
# inside _read_reingest_source_bytes.
real_info = real_zf.getinfo("documents/liar.pdf")
# Lie: shrink reported file_size to 3 so the first guard (3 > 4)
# is skipped. compress_size stays at 10, so fh.read(5) still
# yields 5 bytes and the second guard (5 > 4) fires → None.
real_info.file_size = 3
result = _read_reingest_source_bytes(real_zf, "documents/liar.pdf")

self.assertIsNone(result)


class TestCorpusImportFanIn(TestCase):
"""Coordination-layer unit tests for the relationship fan-in."""
Expand Down
Loading