Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions opencontractserver/enrichment/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,14 @@
CRAWL_DEFAULT_TOKEN_BUDGET = (
2_000_000 # cumulative est. tokens (text len / 4) before stop
)
# Public analyzer safety caps. The crawl analyzer accepts user-provided bounds
# from GraphQL, so every such bound needs an upper limit. Each cap below starts
# out equal to the matching CRAWL_DEFAULT_* value, which means a caller can
# lower a bound but cannot raise it above the default. Operators who want to
# permit larger crawls must raise these constants deliberately.
CRAWL_MAX_DEPTH = CRAWL_DEFAULT_MAX_DEPTH
CRAWL_MAX_MIN_DEMAND = CRAWL_DEFAULT_MIN_DEMAND
CRAWL_MAX_AUTHORITIES = CRAWL_DEFAULT_MAX_AUTHORITIES
CRAWL_MAX_PER_JURISDICTION_CAP = CRAWL_DEFAULT_PER_JURISDICTION_CAP
CRAWL_MAX_TOKEN_BUDGET = CRAWL_DEFAULT_TOKEN_BUDGET
# Punctuation stripped from the tail of a captured defined term
# (e.g. (the "Notes," ...) -> "Notes").
TRAILING_PUNCT = ",.;:"
Expand Down
68 changes: 53 additions & 15 deletions opencontractserver/tasks/corpus_analysis_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,34 +79,54 @@ def corpus_reference_enrichment(
)


def _resolve_crawl_bound(
*,
name: str,
value: int | None,
default: int,
minimum: int,
maximum: int,
) -> int:
"""Resolve and clamp user-supplied public analyzer crawl bounds."""
if value is None:
return default
if isinstance(value, bool) or not isinstance(value, int):
raise ValueError(f"{name} must be an integer")
return min(max(value, minimum), maximum)


@corpus_analyzer_task(
input_schema={
"type": "object",
"properties": {
"max_depth": {
"type": "integer",
"minimum": 0,
"maximum": 5,
"maximum": C.CRAWL_MAX_DEPTH,
"default": C.CRAWL_DEFAULT_MAX_DEPTH,
},
"min_demand": {
"type": "integer",
"minimum": 0,
"maximum": C.CRAWL_MAX_MIN_DEMAND,
"default": C.CRAWL_DEFAULT_MIN_DEMAND,
},
"max_authorities": {
"type": "integer",
"minimum": 1,
"maximum": C.CRAWL_MAX_AUTHORITIES,
"default": C.CRAWL_DEFAULT_MAX_AUTHORITIES,
},
"per_jurisdiction_cap": {
"type": "integer",
"minimum": 1,
"maximum": C.CRAWL_MAX_PER_JURISDICTION_CAP,
"default": C.CRAWL_DEFAULT_PER_JURISDICTION_CAP,
},
"token_budget": {
"type": "integer",
"minimum": 0,
"minimum": 1,
"maximum": C.CRAWL_MAX_TOKEN_BUDGET,
"default": C.CRAWL_DEFAULT_TOKEN_BUDGET,
},
"make_public": {"type": "boolean", "default": True},
Expand Down Expand Up @@ -156,22 +176,40 @@ def crawl_authorities(
return CrawlAuthoritiesService.crawl(
creator_id=analysis.creator_id,
corpus_id=corpus_id,
max_depth=max_depth if max_depth is not None else C.CRAWL_DEFAULT_MAX_DEPTH,
min_demand=(
min_demand if min_demand is not None else C.CRAWL_DEFAULT_MIN_DEMAND
max_depth=_resolve_crawl_bound(
name="max_depth",
value=max_depth,
default=C.CRAWL_DEFAULT_MAX_DEPTH,
minimum=0,
maximum=C.CRAWL_MAX_DEPTH,
),
min_demand=_resolve_crawl_bound(
name="min_demand",
value=min_demand,
default=C.CRAWL_DEFAULT_MIN_DEMAND,
minimum=0,
maximum=C.CRAWL_MAX_MIN_DEMAND,
),
max_authorities=(
max_authorities
if max_authorities is not None
else C.CRAWL_DEFAULT_MAX_AUTHORITIES
max_authorities=_resolve_crawl_bound(
name="max_authorities",
value=max_authorities,
default=C.CRAWL_DEFAULT_MAX_AUTHORITIES,
minimum=1,
maximum=C.CRAWL_MAX_AUTHORITIES,
),
per_jurisdiction_cap=(
per_jurisdiction_cap
if per_jurisdiction_cap is not None
else C.CRAWL_DEFAULT_PER_JURISDICTION_CAP
per_jurisdiction_cap=_resolve_crawl_bound(
name="per_jurisdiction_cap",
value=per_jurisdiction_cap,
default=C.CRAWL_DEFAULT_PER_JURISDICTION_CAP,
minimum=1,
maximum=C.CRAWL_MAX_PER_JURISDICTION_CAP,
),
token_budget=(
token_budget if token_budget is not None else C.CRAWL_DEFAULT_TOKEN_BUDGET
token_budget=_resolve_crawl_bound(
name="token_budget",
value=token_budget,
default=C.CRAWL_DEFAULT_TOKEN_BUDGET,
minimum=1,
maximum=C.CRAWL_MAX_TOKEN_BUDGET,
),
make_public=make_public,
log=task_log,
Expand Down
66 changes: 66 additions & 0 deletions opencontractserver/tests/test_crawl_authorities.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,72 @@ def test_crawl_authorities_task_importable(self):
"crawl_authorities must be decorated with @corpus_analyzer_task",
)

def test_crawl_authorities_public_schema_caps_expensive_bounds(self):
    """Each crawl bound's schema ``maximum`` equals its ``CRAWL_MAX_*`` cap."""
    from opencontractserver.enrichment import constants as C
    from opencontractserver.tasks.corpus_analysis_tasks import crawl_authorities

    schema_props = crawl_authorities._oc_corpus_analyzer_input_schema["properties"]  # type: ignore[attr-defined]

    # (schema property, expected upper cap) — checked in declaration order.
    expected_caps = (
        ("max_depth", C.CRAWL_MAX_DEPTH),
        ("min_demand", C.CRAWL_MAX_MIN_DEMAND),
        ("max_authorities", C.CRAWL_MAX_AUTHORITIES),
        ("per_jurisdiction_cap", C.CRAWL_MAX_PER_JURISDICTION_CAP),
        ("token_budget", C.CRAWL_MAX_TOKEN_BUDGET),
    )
    for prop_name, cap in expected_caps:
        self.assertEqual(schema_props[prop_name]["maximum"], cap)

    # The schema also requires a token budget of at least 1.
    self.assertEqual(schema_props["token_budget"]["minimum"], 1)

def test_crawl_authorities_task_clamps_user_supplied_bounds(self):
    """``_resolve_crawl_bound`` clamps, falls back to defaults, and type-checks."""
    from opencontractserver.enrichment import constants as C
    from opencontractserver.tasks.corpus_analysis_tasks import _resolve_crawl_bound

    def resolve_max_depth(raw):
        # Shared call shape for the two ``max_depth`` scenarios below.
        return _resolve_crawl_bound(
            name="max_depth",
            value=raw,
            default=C.CRAWL_DEFAULT_MAX_DEPTH,
            minimum=0,
            maximum=C.CRAWL_MAX_DEPTH,
        )

    # A huge request is pulled down to the cap.
    capped_authorities = _resolve_crawl_bound(
        name="max_authorities",
        value=10**9,
        default=C.CRAWL_DEFAULT_MAX_AUTHORITIES,
        minimum=1,
        maximum=C.CRAWL_MAX_AUTHORITIES,
    )
    self.assertEqual(capped_authorities, C.CRAWL_MAX_AUTHORITIES)

    # A request below the floor is lifted to the floor.
    floored_budget = _resolve_crawl_bound(
        name="token_budget",
        value=0,
        default=C.CRAWL_DEFAULT_TOKEN_BUDGET,
        minimum=1,
        maximum=C.CRAWL_MAX_TOKEN_BUDGET,
    )
    self.assertEqual(floored_budget, 1)

    # ``None`` (the schema default is absent) falls back to the default.
    self.assertEqual(resolve_max_depth(None), C.CRAWL_DEFAULT_MAX_DEPTH)

    # Non-integers are rejected — including ``bool`` (an ``int`` subclass)
    # so a stray ``True`` cannot slip through as ``1``.
    for bad in ("5", 1.5, True):
        with self.assertRaises(ValueError):
            resolve_max_depth(bad)


class IdempotencyTests(TransactionTestCase):
"""Crawling the same authority twice must not create duplicate rows.
Expand Down
Loading