From 1cd4c1fede9ad776a6412761d77d3d37f8385256 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Mon, 20 Apr 2026 15:09:19 -0700 Subject: [PATCH 01/20] feat(detect): accept validator pool + chunked-validation config knobs Allows users to declare `entity_validator` as either a single alias (current behavior) or a list of aliases that form a validator pool. Scalars normalize to single-element lists so every consumer sees `list[str]`. Adds `validation_max_entities_per_call` (default 100) and `validation_excerpt_window_chars` (default 500) to Detect, which commit 2 will consume when it replaces the single validation LLM call with a chunked custom column that rotates across the pool. Behavior is unchanged in this commit: the workflow still issues one validation call per row using pool[0]. Adds `resolve_model_aliases()` helper for list-valued roles and updates `validate_model_alias_references` to flag unknown aliases in a pool by index. Made-with: Cursor --- src/anonymizer/config/anonymizer_config.py | 18 ++++ src/anonymizer/config/models.py | 36 ++++++- .../engine/detection/detection_workflow.py | 11 ++- src/anonymizer/engine/ndd/model_loader.py | 56 ++++++++--- src/anonymizer/interface/anonymizer.py | 2 +- tests/config/test_anonymizer_config.py | 28 ++++++ tests/engine/test_detection_workflow.py | 19 +++- tests/engine/test_model_loader.py | 94 ++++++++++++++++++- 8 files changed, 239 insertions(+), 25 deletions(-) diff --git a/src/anonymizer/config/anonymizer_config.py b/src/anonymizer/config/anonymizer_config.py index 2ecd072a..2c8c4d13 100644 --- a/src/anonymizer/config/anonymizer_config.py +++ b/src/anonymizer/config/anonymizer_config.py @@ -82,6 +82,24 @@ class Detect(BaseModel): gliner_threshold: float = Field( default=0.3, ge=0.0, le=1.0, description="GLiNER detection confidence threshold (0.0-1.0)." 
) + validation_max_entities_per_call: int = Field( + default=100, + gt=0, + description=( + "Maximum number of candidate entities included in a single validator LLM call. " + "When a row has more candidates than this, validation is split into chunks that " + "are dispatched (round-robin) across the validator pool." + ), + ) + validation_excerpt_window_chars: int = Field( + default=500, + gt=0, + description=( + "Number of characters to include before and after a chunk's entity span when " + "building the text excerpt sent to the validator. Bounds the prompt context the " + "validator sees per chunk; it is NOT the LLM's context window limit." + ), + ) @field_validator("entity_labels") @classmethod diff --git a/src/anonymizer/config/models.py b/src/anonymizer/config/models.py index f01983f1..108316f8 100644 --- a/src/anonymizer/config/models.py +++ b/src/anonymizer/config/models.py @@ -3,17 +3,47 @@ from __future__ import annotations -from pydantic import BaseModel +from typing import Any + +from pydantic import BaseModel, field_validator class DetectionModelSelection(BaseModel): - """Model aliases for the entity detection pipeline.""" + """Model aliases for the entity detection pipeline. + + ``entity_validator`` accepts either a single alias or a list of aliases. + A list forms a validator *pool*: chunked validation rotates calls + across the pool in round-robin order, which is useful for bypassing + per-alias TPM/RPM limits. A single scalar is normalized to a + one-element list. + """ entity_detector: str - entity_validator: str + entity_validator: list[str] entity_augmenter: str latent_detector: str + @field_validator("entity_validator", mode="before") + @classmethod + def normalize_entity_validator(cls, value: Any) -> list[str]: + """Accept either a scalar alias or a list, return a non-empty list. 
+ + Normalizing at parse time keeps every downstream consumer on the + same shape (``list[str]``) regardless of whether the user wrote + ``entity_validator: some-alias`` or + ``entity_validator: [alias-a, alias-b]``. + """ + if isinstance(value, str): + aliases: list[str] = [value] + elif isinstance(value, (list, tuple)): + aliases = [str(item) for item in value] + else: + raise TypeError(f"entity_validator must be a string or list of strings, got {type(value).__name__}") + cleaned = [alias.strip() for alias in aliases if alias.strip()] + if not cleaned: + raise ValueError("entity_validator must name at least one model alias.") + return cleaned + class ReplaceModelSelection(BaseModel): """Model aliases for the replacement pipeline.""" diff --git a/src/anonymizer/engine/detection/detection_workflow.py b/src/anonymizer/engine/detection/detection_workflow.py index d633041c..22134a9b 100644 --- a/src/anonymizer/engine/detection/detection_workflow.py +++ b/src/anonymizer/engine/detection/detection_workflow.py @@ -47,7 +47,7 @@ ) from anonymizer.engine.detection.postprocess import EntitySpan, group_entities_by_value from anonymizer.engine.ndd.adapter import FailedRecord, NddAdapter -from anonymizer.engine.ndd.model_loader import resolve_model_alias +from anonymizer.engine.ndd.model_loader import resolve_model_alias, resolve_model_aliases from anonymizer.engine.prompt_utils import substitute_placeholders from anonymizer.engine.schemas import ( AugmentedEntitiesSchema, @@ -99,12 +99,17 @@ def detect_and_validate_entities( ) detection_alias = resolve_model_alias("entity_detector", selected_models) - validator_alias = resolve_model_alias("entity_validator", selected_models) + validator_aliases = resolve_model_aliases("entity_validator", selected_models) augmenter_alias = resolve_model_alias("entity_augmenter", selected_models) + # Commit 2 will replace the LLMStructuredColumnConfig below with a chunked + # CustomColumnConfig that rotates across the full validator pool. 
Until then + # the first alias preserves current single-call semantics. + validator_alias = validator_aliases[0] logger.debug( - "detection aliases: detector=%s, validator=%s, augmenter=%s", + "detection aliases: detector=%s, validator=%s (pool=%s), augmenter=%s", detection_alias, validator_alias, + validator_aliases, augmenter_alias, ) diff --git a/src/anonymizer/engine/ndd/model_loader.py b/src/anonymizer/engine/ndd/model_loader.py index b5670025..ee556926 100644 --- a/src/anonymizer/engine/ndd/model_loader.py +++ b/src/anonymizer/engine/ndd/model_loader.py @@ -126,12 +126,34 @@ def resolve_model_alias( role: str, selection_model: DetectionModelSelection | ReplaceModelSelection | RewriteModelSelection, ) -> str: - """Read model alias directly from the selection model. + """Read a scalar model alias directly from the selection model. The selection model is already populated with defaults from YAML (via ``load_default_model_selection``) or user overrides. + + For list-valued roles (e.g. ``entity_validator``), use + ``resolve_model_aliases`` instead. + """ + value = getattr(selection_model, role) + if isinstance(value, list): + raise TypeError(f"Role {role!r} is list-valued; use resolve_model_aliases() to read it.") + return value + + +def resolve_model_aliases( + role: str, + selection_model: DetectionModelSelection | ReplaceModelSelection | RewriteModelSelection, +) -> list[str]: + """Read model aliases from the selection model as a list. + + Returns the stored list for list-valued roles (e.g. ``entity_validator``) + or a one-element list wrapping the scalar for scalar roles. Callers + that need to iterate a possible model pool should prefer this helper. 
""" - return getattr(selection_model, role) + value = getattr(selection_model, role) + if isinstance(value, list): + return list(value) + return [value] def _merge_selections(user_selections: dict[str, dict[str, str]] | None) -> ModelSelection: @@ -164,21 +186,16 @@ def validate_model_alias_references( known_aliases = {model_config.alias for model_config in model_configs} detection_roles = selected_models.detection.model_dump() - roles_to_check: dict[str, str] = { - f"detection.{role}": detection_roles[role] - for role in ("entity_detector", "entity_validator", "entity_augmenter") - } + roles_to_check: dict[str, str] = {} + for role in ("entity_detector", "entity_validator", "entity_augmenter"): + _collect_role(roles_to_check, f"detection.{role}", detection_roles[role]) if check_rewrite: - roles_to_check.update( - { - "detection.latent_detector": detection_roles["latent_detector"], - **{f"rewrite.{role}": alias for role, alias in selected_models.rewrite.model_dump().items()}, - } - ) + _collect_role(roles_to_check, "detection.latent_detector", detection_roles["latent_detector"]) + for role, alias in selected_models.rewrite.model_dump().items(): + _collect_role(roles_to_check, f"rewrite.{role}", alias) if check_substitute: - roles_to_check.update( - {f"replace.{role}": alias for role, alias in selected_models.replace.model_dump().items()} - ) + for role, alias in selected_models.replace.model_dump().items(): + _collect_role(roles_to_check, f"replace.{role}", alias) unknown = {path: alias for path, alias in roles_to_check.items() if alias not in known_aliases} if unknown: @@ -188,6 +205,15 @@ def validate_model_alias_references( ) +def _collect_role(bucket: dict[str, str], path: str, value: object) -> None: + """Flatten a role entry into ``bucket`` so list-valued roles produce one entry per alias.""" + if isinstance(value, list): + for idx, alias in enumerate(value): + bucket[f"{path}[{idx}]"] = str(alias) + else: + bucket[path] = str(value) + + def 
_validate_alias_references( models_config: dict[str, Any], selections: dict[str, str], diff --git a/src/anonymizer/interface/anonymizer.py b/src/anonymizer/interface/anonymizer.py index cca83f7e..7ece6cbc 100644 --- a/src/anonymizer/interface/anonymizer.py +++ b/src/anonymizer/interface/anonymizer.py @@ -95,7 +95,7 @@ def __init__( logger.info("🔧 Anonymizer initialized with %d model configs", len(self._model_configs)) det = self._selected_models.detection logger.info(LOG_INDENT + "🔎 detector: %s", det.entity_detector) - logger.info(LOG_INDENT + "✅ validator: %s", det.entity_validator) + logger.info(LOG_INDENT + "✅ validator: %s", ", ".join(det.entity_validator)) logger.info(LOG_INDENT + "🧩 augmenter: %s", det.entity_augmenter) if data_designer is not None: diff --git a/tests/config/test_anonymizer_config.py b/tests/config/test_anonymizer_config.py index 37d2b6e6..ee8c8594 100644 --- a/tests/config/test_anonymizer_config.py +++ b/tests/config/test_anonymizer_config.py @@ -117,3 +117,31 @@ def test_both_modes_set_exits() -> None: """Setting both replace and rewrite on AnonymizerConfig violates the model_validator.""" with pytest.raises(ValidationError): AnonymizerConfig(replace=Redact(), rewrite=Rewrite()) + + +def test_detect_chunked_validation_defaults() -> None: + config = AnonymizerConfig(replace=Redact()) + assert config.detect.validation_max_entities_per_call == 100 + assert config.detect.validation_excerpt_window_chars == 500 + + +def test_detect_chunked_validation_accepts_overrides() -> None: + config = AnonymizerConfig( + detect={ + "validation_max_entities_per_call": 25, + "validation_excerpt_window_chars": 1000, + }, + replace=Redact(), + ) + assert config.detect.validation_max_entities_per_call == 25 + assert config.detect.validation_excerpt_window_chars == 1000 + + +def test_detect_validation_max_entities_per_call_must_be_positive() -> None: + with pytest.raises(ValidationError): + AnonymizerConfig(detect={"validation_max_entities_per_call": 0}, 
replace=Redact()) + + +def test_detect_validation_excerpt_window_chars_must_be_positive() -> None: + with pytest.raises(ValidationError): + AnonymizerConfig(detect={"validation_excerpt_window_chars": 0}, replace=Redact()) diff --git a/tests/engine/test_detection_workflow.py b/tests/engine/test_detection_workflow.py index a6cb0c47..9b7b8d44 100644 --- a/tests/engine/test_detection_workflow.py +++ b/tests/engine/test_detection_workflow.py @@ -30,7 +30,11 @@ _resolve_detection_labels, ) from anonymizer.engine.ndd.adapter import FailedRecord, WorkflowRunResult -from anonymizer.engine.ndd.model_loader import load_default_model_selection, resolve_model_alias +from anonymizer.engine.ndd.model_loader import ( + load_default_model_selection, + resolve_model_alias, + resolve_model_aliases, +) from anonymizer.engine.schemas import EntitiesSchema @@ -326,7 +330,18 @@ def test_resolve_model_alias_reads_from_selection_model() -> None: defaults = load_default_model_selection().detection selection = defaults.model_copy(update={"entity_detector": "custom-model"}) assert resolve_model_alias("entity_detector", selection) == "custom-model" - assert resolve_model_alias("entity_validator", selection) == defaults.entity_validator + assert resolve_model_aliases("entity_validator", selection) == defaults.entity_validator + + +def test_resolve_model_alias_raises_for_list_valued_role() -> None: + selection = load_default_model_selection().detection + with pytest.raises(TypeError, match="list-valued"): + resolve_model_alias("entity_validator", selection) + + +def test_resolve_model_aliases_wraps_scalar_roles() -> None: + selection = load_default_model_selection().detection + assert resolve_model_aliases("entity_detector", selection) == [selection.entity_detector] def test_resolve_detection_labels_none_uses_defaults() -> None: diff --git a/tests/engine/test_model_loader.py b/tests/engine/test_model_loader.py index 29ff2998..82406384 100644 --- a/tests/engine/test_model_loader.py +++ 
b/tests/engine/test_model_loader.py @@ -54,7 +54,9 @@ def test_load_default_model_selection_populates_all_workflows() -> None: selection = load_default_model_selection() # Detection assert selection.detection.entity_detector - assert selection.detection.entity_validator + assert selection.detection.entity_validator # list[str] + assert isinstance(selection.detection.entity_validator, list) + assert all(isinstance(alias, str) and alias for alias in selection.detection.entity_validator) assert selection.detection.entity_augmenter assert selection.detection.latent_detector # Replace @@ -228,3 +230,93 @@ def test_validate_model_alias_references_skips_rewrite_alias_when_not_enabled( stub_known_model_configs, selected_models, ) + + +class TestEntityValidatorNormalization: + """``DetectionModelSelection.entity_validator`` accepts scalar or list input. + + Scalars normalize to single-item lists so every downstream consumer + sees ``list[str]``. + """ + + def test_scalar_normalizes_to_single_item_list(self) -> None: + selection = DetectionModelSelection( + entity_detector="d", + entity_validator="v", + entity_augmenter="a", + latent_detector="l", + ) + assert selection.entity_validator == ["v"] + + def test_list_preserved(self) -> None: + selection = DetectionModelSelection( + entity_detector="d", + entity_validator=["v1", "v2", "v3"], + entity_augmenter="a", + latent_detector="l", + ) + assert selection.entity_validator == ["v1", "v2", "v3"] + + def test_empty_list_rejected(self) -> None: + with pytest.raises(ValueError, match="at least one model alias"): + DetectionModelSelection( + entity_detector="d", + entity_validator=[], + entity_augmenter="a", + latent_detector="l", + ) + + def test_whitespace_only_rejected(self) -> None: + with pytest.raises(ValueError, match="at least one model alias"): + DetectionModelSelection( + entity_detector="d", + entity_validator=[" ", ""], + entity_augmenter="a", + latent_detector="l", + ) + + def test_non_string_non_list_rejected(self) 
-> None: + with pytest.raises((ValueError, TypeError)): + DetectionModelSelection( + entity_detector="d", + entity_validator=42, # type: ignore[arg-type] + entity_augmenter="a", + latent_detector="l", + ) + + +class TestValidateAliasReferencesHandlesValidatorPool: + """``validate_model_alias_references`` must expand list-valued roles to one check per alias.""" + + def test_accepts_all_pool_aliases_present( + self, + stub_slim_model_selection: ModelSelection, + ) -> None: + """Pool of aliases all present in the model pool — passes.""" + configs = [ + ModelConfig(alias="v1", model="test/v1"), + ModelConfig(alias="v2", model="test/v2"), + ModelConfig(alias="known", model="some/model"), + ] + selected_models = stub_slim_model_selection.model_copy( + update={ + "detection": stub_slim_model_selection.detection.model_copy(update={"entity_validator": ["v1", "v2"]}) + } + ) + validate_model_alias_references(configs, selected_models) + + def test_raises_on_any_pool_alias_missing( + self, + stub_known_model_configs: list[ModelConfig], + stub_slim_model_selection: ModelSelection, + ) -> None: + """If any alias in the validator pool is unknown, error names that alias by index.""" + selected_models = stub_slim_model_selection.model_copy( + update={ + "detection": stub_slim_model_selection.detection.model_copy( + update={"entity_validator": ["known", "missing-one"]} + ) + } + ) + with pytest.raises(ValueError, match=r"entity_validator\[1\].*missing-one"): + validate_model_alias_references(stub_known_model_configs, selected_models) From 2e7c97b77ccff98a30bb9a76ae186da90e92808a Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Mon, 20 Apr 2026 16:26:43 -0700 Subject: [PATCH 02/20] feat(detect): chunked validation across a validator pool Replaces the single per-row validation LLM call with a chunked custom column that partitions each row's validation candidates, dispatches each chunk to one alias from the configured `entity_validator` pool (round-robin by chunk index), and merges 
the decisions back into the shape `apply_validation_decisions` already expects. Single-alias pools behave identically to the prior path; multi-alias pools distribute calls to bypass per-alias TPM/RPM caps. Chunk size and the per-chunk excerpt window are driven by the Detect config knobs added in commit 1 (`validation_max_entities_per_call`, `validation_excerpt_window_chars`) and threaded through `Anonymizer -> EntityDetectionWorkflow.run`. Per-row tag notation is resolved once against the full text and forced onto every chunk so excerpts stay internally consistent; `build_tagged_text` gains an optional `notation=` override for that purpose. Implementation uses data_designer's `CustomColumnConfig` with `model_aliases=pool`, letting DD inject a `{alias: ModelFacade}` dict and giving us its retry/throttle behavior for free. A terminal LLM error in any chunk fails the whole row (surfaced as a `FailedRecord`); decisions for ids outside the candidate set are dropped to mirror the single-call path's `enrich_validation_decisions` filter. Emits a warning when the pool has more than one alias noting that `max_parallel_requests` is enforced per alias, so the pool multiplies total in-flight validator calls. 
Made-with: Cursor --- .../engine/detection/chunked_validation.py | 412 +++++++++++++ .../engine/detection/detection_workflow.py | 67 ++- .../engine/detection/postprocess.py | 27 +- src/anonymizer/interface/anonymizer.py | 2 + tests/engine/test_chunked_validation.py | 543 ++++++++++++++++++ tests/engine/test_detection_workflow.py | 175 +++++- 6 files changed, 1204 insertions(+), 22 deletions(-) create mode 100644 src/anonymizer/engine/detection/chunked_validation.py create mode 100644 tests/engine/test_chunked_validation.py diff --git a/src/anonymizer/engine/detection/chunked_validation.py b/src/anonymizer/engine/detection/chunked_validation.py new file mode 100644 index 00000000..02f28ac1 --- /dev/null +++ b/src/anonymizer/engine/detection/chunked_validation.py @@ -0,0 +1,412 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Chunked LLM validation for the entity detection pipeline. + +Validation is the step that asks an LLM to keep/reclass/drop each candidate +entity produced by the detector. Historically we ran one LLM call per row over +the full tagged text. For long documents with many candidates this hits +context-window and TPM/RPM limits. This module replaces that single call with +a fan-out: partition the row's candidates into chunks, build a small tagged +excerpt of text around each chunk, render the existing validation prompt per +chunk, and dispatch each chunk to an alias selected round-robin from a +configured validator pool. + +The per-chunk decisions are merged into a ``ValidationDecisionsSchema``-shaped +payload so the downstream ``enrich_validation_decisions`` column keeps +working unchanged. + +The public entry point for DataDesigner wiring is +:func:`make_chunked_validation_generator`, which produces a generator function +decorated with ``@custom_column_generator`` and bound to a concrete pool. +The pure helpers below are exposed for unit testing. 
+""" + +from __future__ import annotations + +import asyncio +import logging +from typing import Any + +from data_designer.config import custom_column_generator +from data_designer.engine.models.recipes.response_recipes import PydanticResponseRecipe +from jinja2 import BaseLoader, Environment, StrictUndefined +from pydantic import BaseModel, Field + +from anonymizer.engine.constants import ( + COL_MERGED_TAGGED_TEXT, + COL_SEED_ENTITIES, + COL_TAG_NOTATION, + COL_TEXT, + COL_VALIDATION_CANDIDATES, + COL_VALIDATION_DECISIONS, + COL_VALIDATION_SKELETON, +) +from anonymizer.engine.detection.postprocess import ( + EntitySpan, + TagNotation, + build_tagged_text, +) +from anonymizer.engine.schemas import ( + EntitiesSchema, + RawValidationDecisionsSchema, + ValidationCandidatesSchema, + ValidationDecisionsSchema, + ValidationSkeletonDecisionSchema, + ValidationSkeletonSchema, +) + +logger = logging.getLogger("anonymizer.detection.chunked_validation") + +# Jinja2 environment used to render the per-chunk validation prompt. +# The template mirrors the production prompt exactly: we substitute the same +# placeholders (``_merged_tagged_text``, ``_validation_skeleton``, +# ``_tag_notation``) but with per-chunk values. +_PROMPT_ENV = Environment( + loader=BaseLoader(), + autoescape=False, + undefined=StrictUndefined, + keep_trailing_newline=True, +) + + +class ChunkedValidationParams(BaseModel): + """Parameters supplied to :func:`chunked_validate_row` via DD's ``generator_params``. + + Attributes: + pool: Ordered list of validator model aliases. Chunk ``i`` is dispatched + to ``pool[i % len(pool)]``. Must be non-empty and every alias must + also be present in the decorator's ``model_aliases`` so DataDesigner + materialises the facade. + max_entities_per_call: Upper bound on candidates per chunk. + excerpt_window_chars: Chars of surrounding raw text included in each + chunk's excerpt on either side of the chunk span. 
+ prompt_template: Jinja2 source for the validation prompt (with + ``_merged_tagged_text``, ``_validation_skeleton``, ``_tag_notation`` + placeholders). Typically produced by ``_get_validation_prompt``. + system_prompt: Optional system prompt forwarded to each chunk call. + """ + + pool: list[str] = Field(min_length=1) + max_entities_per_call: int = Field(gt=0) + excerpt_window_chars: int = Field(gt=0) + prompt_template: str + system_prompt: str | None = None + + +# --------------------------------------------------------------------------- +# Pure helpers (no DataDesigner, no LLM). Tested directly. +# --------------------------------------------------------------------------- + + +def order_candidates_by_position( + candidates: ValidationCandidatesSchema, + seed_entities: list[EntitySpan], +) -> list[tuple[Any, EntitySpan]]: + """Pair each candidate with its matching seed entity and sort by text position. + + Every candidate id must resolve to a seed entity. A missing id indicates an + upstream bug in ``merge_and_build_candidates`` or ``prepare_validation_inputs`` + (both produce candidates whose ids come from ``EntitySpan.entity_id``). We + raise early with the offending id so the failure is easy to triage. + """ + seed_by_id = {span.entity_id: span for span in seed_entities} + paired: list[tuple[Any, EntitySpan]] = [] + for candidate in candidates.candidates: + seed = seed_by_id.get(candidate.id) + if seed is None: + raise ValueError( + f"Validation candidate id {candidate.id!r} has no matching seed entity. " + "Every candidate produced by merge_and_build_candidates or " + "prepare_validation_inputs must correspond to a seed entity with " + "start_position and end_position populated; this inconsistency " + "indicates a bug in one of those upstream generators." 
+ ) + paired.append((candidate, seed)) + paired.sort(key=lambda pair: (pair[1].start_position, pair[1].end_position, pair[1].entity_id)) + return paired + + +def chunk_candidates( + ordered: list[tuple[Any, EntitySpan]], + max_entities_per_call: int, +) -> list[list[tuple[Any, EntitySpan]]]: + """Partition the ordered (candidate, seed) pairs into chunks of at most ``max_entities_per_call``.""" + if max_entities_per_call <= 0: + raise ValueError(f"max_entities_per_call must be > 0, got {max_entities_per_call}.") + return [ordered[i : i + max_entities_per_call] for i in range(0, len(ordered), max_entities_per_call)] + + +def build_chunk_excerpt( + *, + text: str, + chunk_spans: list[EntitySpan], + all_spans: list[EntitySpan], + window_chars: int, + notation: TagNotation, +) -> str: + """Build a tagged text excerpt wide enough to give the LLM context around ``chunk_spans``. + + The excerpt spans ``[min(chunk.start) - window, max(chunk.end) + window]`` + clamped to the text bounds. Any entity from ``all_spans`` fully contained + in that window is re-tagged inside the excerpt so the surrounding context + matches the full-document view. The forced ``notation`` keeps tags stable + across chunks of the same row even when a local slice would otherwise + pick a different heuristic. 
+ """ + if not chunk_spans: + return "" + chunk_start = min(span.start_position for span in chunk_spans) + chunk_end = max(span.end_position for span in chunk_spans) + excerpt_start = max(0, chunk_start - window_chars) + excerpt_end = min(len(text), chunk_end + window_chars) + excerpt_raw = text[excerpt_start:excerpt_end] + in_window = [ + EntitySpan( + entity_id=span.entity_id, + value=span.value, + label=span.label, + start_position=span.start_position - excerpt_start, + end_position=span.end_position - excerpt_start, + score=span.score, + source=span.source, + ) + for span in all_spans + if span.start_position >= excerpt_start and span.end_position <= excerpt_end + ] + return build_tagged_text(excerpt_raw, in_window, notation=notation) + + +def build_chunk_skeleton(chunk_candidates_: list[Any]) -> dict[str, Any]: + """Build the validation skeleton (``ValidationSkeletonSchema``) for a chunk.""" + skeleton = ValidationSkeletonSchema( + decisions=[ + ValidationSkeletonDecisionSchema(id=c.id, value=c.value, label=c.label) for c in chunk_candidates_ + ] + ) + return skeleton.model_dump(mode="json") + + +def render_chunk_prompt( + *, + template: str, + excerpt: str, + skeleton: dict[str, Any], + notation: TagNotation, +) -> str: + """Render the validation prompt for a single chunk via Jinja2. + + The template and context match the production ``LLMStructuredColumnConfig`` + call: dicts are rendered with Python ``str()`` (Jinja2 default), which is + how the existing prompt has always served ``{{ _validation_skeleton }}``. + """ + compiled = _PROMPT_ENV.from_string(template) + return compiled.render( + **{ + COL_MERGED_TAGGED_TEXT: excerpt, + COL_VALIDATION_SKELETON: skeleton, + COL_TAG_NOTATION: notation.value, + } + ) + + +def merge_chunk_decisions( + chunk_results: list[RawValidationDecisionsSchema], + candidates: ValidationCandidatesSchema, +) -> dict[str, Any]: + """Flatten chunk decisions into a single ``ValidationDecisionsSchema`` payload. 
+ + Mirrors the single-call contract: + - Only decisions whose ids match a known candidate are retained. This is + consistent with ``enrich_validation_decisions``, which also filters to + valid ids; doing it here too keeps COL_VALIDATION_DECISIONS minimal. + - Duplicate ids across chunks keep the first occurrence. Duplicates + shouldn't happen because candidates partition cleanly, but deduping is + cheap defence-in-depth. + - Candidates with no decision at all flow through as absent; downstream + ``apply_validation_decisions`` treats them as ``keep`` with the original + label, which matches prior behaviour for a partially-answered response. + """ + candidate_lookup = {c.id: c for c in candidates.candidates} + valid_ids = set(candidate_lookup) + seen: set[str] = set() + merged_decisions: list[dict[str, Any]] = [] + for result in chunk_results: + for decision in result.decisions: + if decision.id not in valid_ids or decision.id in seen: + continue + cand = candidate_lookup[decision.id] + merged_decisions.append( + { + "id": decision.id, + "value": cand.value, + "label": cand.label, + "decision": decision.decision.value if decision.decision is not None else None, + "proposed_label": decision.proposed_label or "", + "reason": decision.reason, + } + ) + seen.add(decision.id) + # Validate the final shape so malformed output fails loud here rather than in a + # downstream parser. ``ValidationDecisionsSchema.decision`` is non-optional, so + # we drop rows where the LLM didn't supply a decision; the downstream + # "no decision -> keep" semantics rely on candidate-not-in-output, not on a + # null-decision entry. + filtered = [d for d in merged_decisions if d["decision"] is not None] + return ValidationDecisionsSchema.model_validate({"decisions": filtered}).model_dump(mode="json") + + +# --------------------------------------------------------------------------- +# Async dispatch. Testable by passing fake ``models``. 
+# ---------------------------------------------------------------------------
+
+
+async def _dispatch_chunk(
+    *,
+    facade: Any,
+    prompt: str,
+    system_prompt: str | None,
+    chunk_index: int,
+) -> RawValidationDecisionsSchema:
+    """Dispatch a single chunk via the configured ModelFacade.
+
+    We use ``PydanticResponseRecipe`` so the facade appends JSON task
+    instructions and parses the response into ``RawValidationDecisionsSchema``.
+    Parser failures and transient LLM errors are handled by
+    ``ModelFacade.agenerate`` (throttle, retries); any error that escapes is
+    terminal for the chunk. We let it propagate so the row fails as a whole
+    and DataDesigner records it as a ``FailedRecord``.
+    """
+    recipe = PydanticResponseRecipe(data_type=RawValidationDecisionsSchema)
+    final_prompt = recipe.apply_recipe_to_user_prompt(prompt)
+    final_system = recipe.apply_recipe_to_system_prompt(system_prompt)
+    output, _messages = await facade.agenerate(
+        prompt=final_prompt,
+        parser=recipe.parse,
+        system_prompt=final_system,
+        purpose=f"entity-validation-chunk-{chunk_index}",
+    )
+    return output
+
+
+async def chunked_validate_row(
+    row: dict[str, Any],
+    params: ChunkedValidationParams,
+    models: dict[str, Any],
+) -> dict[str, Any]:
+    """Run chunked validation for a single row and write ``COL_VALIDATION_DECISIONS``.
+
+    This is the async workhorse. Call it directly in tests with fake ``models``;
+    the DataDesigner-decorated wrapper produced by
+    :func:`make_chunked_validation_generator` just forwards to it.
+
+    Raises:
+        KeyError: if an alias in ``params.pool`` has no facade in ``models``.
+        Exception: the first terminal chunk failure, re-raised in chunk order
+            (all-or-nothing row contract).
+    """
+    missing_aliases = [alias for alias in params.pool if alias not in models]
+    if missing_aliases:
+        raise KeyError(
+            f"Validator pool aliases {missing_aliases} not present in models dict. "
+            f"Ensure make_chunked_validation_generator was invoked with the same pool "
+            f"passed in ChunkedValidationParams.pool."
+        )
+
+    text = str(row.get(COL_TEXT, ""))
+    candidates = ValidationCandidatesSchema.from_raw(row.get(COL_VALIDATION_CANDIDATES, {}))
+    seed_entities_schema = EntitiesSchema.from_raw(row.get(COL_SEED_ENTITIES, {}))
+    notation_raw = row.get(COL_TAG_NOTATION) or TagNotation.sentinel.value
+    notation = TagNotation(str(notation_raw))
+
+    # Short-circuit: a row with no candidates has no decisions to make.
+    if not candidates.candidates:
+        row[COL_VALIDATION_DECISIONS] = ValidationDecisionsSchema().model_dump(mode="json")
+        return row
+
+    all_spans = [
+        EntitySpan(
+            entity_id=entity.id,
+            value=entity.value,
+            label=entity.label,
+            start_position=entity.start_position,
+            end_position=entity.end_position,
+            score=entity.score,
+            source=entity.source,
+        )
+        for entity in seed_entities_schema.entities
+    ]
+
+    ordered = order_candidates_by_position(candidates, all_spans)
+    chunks = chunk_candidates(ordered, params.max_entities_per_call)
+
+    tasks: list[Any] = []
+    for chunk_index, chunk in enumerate(chunks):
+        chunk_candidates_ = [pair[0] for pair in chunk]
+        chunk_spans = [pair[1] for pair in chunk]
+        excerpt = build_chunk_excerpt(
+            text=text,
+            chunk_spans=chunk_spans,
+            all_spans=all_spans,
+            window_chars=params.excerpt_window_chars,
+            notation=notation,
+        )
+        skeleton = build_chunk_skeleton(chunk_candidates_)
+        prompt = render_chunk_prompt(
+            template=params.prompt_template,
+            excerpt=excerpt,
+            skeleton=skeleton,
+            notation=notation,
+        )
+        # Round-robin across the validator pool. ``ChunkedValidationParams``
+        # guarantees ``pool`` is non-empty; ``chunk_index`` comes from
+        # ``enumerate`` so it's non-negative by construction.
+        alias = params.pool[chunk_index % len(params.pool)]
+        facade = models[alias]
+        tasks.append(
+            _dispatch_chunk(
+                facade=facade,
+                prompt=prompt,
+                system_prompt=params.system_prompt,
+                chunk_index=chunk_index,
+            )
+        )
+
+    # All-or-nothing row contract: any terminal chunk failure fails the row.
+    # NOTE: plain gather(return_exceptions=False) would propagate the first
+    # exception while leaving sibling chunk tasks running unobserved (wasted
+    # validator LLM calls plus "exception was never retrieved" warnings) --
+    # gather does NOT cancel its siblings on failure. We therefore wait for
+    # every chunk to settle, then re-raise the first failure in chunk order so
+    # the row fails deterministically and DataDesigner records it as a
+    # ``FailedRecord``.
+    settled = await asyncio.gather(*tasks, return_exceptions=True)
+    for outcome in settled:
+        if isinstance(outcome, BaseException):
+            raise outcome
+    chunk_results = list(settled)
+
+    row[COL_VALIDATION_DECISIONS] = merge_chunk_decisions(chunk_results, candidates)
+    return row
+
+
+# ---------------------------------------------------------------------------
+# DataDesigner wiring factory.
+# ---------------------------------------------------------------------------
+
+
+def make_chunked_validation_generator(pool: list[str]) -> Any:
+    """Build a ``@custom_column_generator``-decorated async function bound to ``pool``.
+
+    ``model_aliases`` must be declared statically on the decorator so
+    DataDesigner knows which facades to materialise for the generator. Since
+    the pool is config-driven (per-run), we generate the function dynamically.
+    The required_columns are exhaustive for DataDesigner's DAG ordering: the
+    generator reads the raw text, seed entities (for positions), the candidate
+    list (what to decide), and the tag notation (for excerpt tagging).
+    """
+    if not pool:
+        raise ValueError("Cannot build chunked validation generator: pool is empty.")
+
+    @custom_column_generator(
+        required_columns=[
+            COL_TEXT,
+            COL_SEED_ENTITIES,
+            COL_VALIDATION_CANDIDATES,
+            COL_TAG_NOTATION,
+        ],
+        model_aliases=list(pool),
+    )
+    async def chunked_validate(
+        row: dict[str, Any],
+        generator_params: ChunkedValidationParams,
+        models: dict[str, Any],
+    ) -> dict[str, Any]:
+        return await chunked_validate_row(row, generator_params, models)
+
+    return chunked_validate
diff --git a/src/anonymizer/engine/detection/detection_workflow.py b/src/anonymizer/engine/detection/detection_workflow.py
index 22134a9b..24e4ef90 100644
--- a/src/anonymizer/engine/detection/detection_workflow.py
+++ b/src/anonymizer/engine/detection/detection_workflow.py
@@ -11,6 +11,7 @@
 from data_designer.config.column_configs import CustomColumnConfig, LLMStructuredColumnConfig
 from data_designer.config.models import ModelConfig
+from anonymizer.config.anonymizer_config import Detect as AnonymizerDetectConfig
 from anonymizer.config.models import DetectionModelSelection
 from anonymizer.config.rewrite import PrivacyGoal
 from anonymizer.engine.constants import (
@@ -36,10 +37,13 @@
     ENTITY_LABEL_EXAMPLES,
     _jinja,
 )
+from anonymizer.engine.detection.chunked_validation import (
+    ChunkedValidationParams,
+    make_chunked_validation_generator,
+)
 from anonymizer.engine.detection.custom_columns import (
     apply_validation_and_finalize,
     apply_validation_to_seed_entities,
-    build_validation_skeleton,
     enrich_validation_decisions,
     merge_and_build_candidates,
     parse_detected_entities,
@@ -54,11 +58,20 @@
     EntitiesByValueSchema,
     EntitiesSchema,
     LatentEntitiesSchema,
-    ValidationDecisionsSchema,
 )

 logger = logging.getLogger("anonymizer.detection")

+# Defaults for the two chunked-validation knobs.
Sourced from the Detect config +# so there is a single source of truth; the workflow method defaults exist so +# internal tests and ad-hoc callers do not have to wire plumbing by hand. +_DEFAULT_VALIDATION_MAX_ENTITIES_PER_CALL: int = AnonymizerDetectConfig.model_fields[ + "validation_max_entities_per_call" +].default +_DEFAULT_VALIDATION_EXCERPT_WINDOW_CHARS: int = AnonymizerDetectConfig.model_fields[ + "validation_excerpt_window_chars" +].default + @dataclass(frozen=True) class EntityDetectionResult: @@ -79,6 +92,8 @@ def detect_and_validate_entities( model_configs: list[ModelConfig], selected_models: DetectionModelSelection, gliner_detection_threshold: float, + validation_max_entities_per_call: int = _DEFAULT_VALIDATION_MAX_ENTITIES_PER_CALL, + validation_excerpt_window_chars: int = _DEFAULT_VALIDATION_EXCERPT_WINDOW_CHARS, entity_labels: list[str] | None = None, data_summary: str | None = None, preview_num_records: int | None = None, @@ -86,9 +101,10 @@ def detect_and_validate_entities( """Run the core detection pipeline: GLiNER NER, LLM validation, LLM augmentation, and finalization. This is the primary detection workflow. It detects entities via GLiNER, - validates/reclassifies them with an LLM, augments with additional - entities the detector may have missed, and produces final standoff - entity spans with overlap resolution. + validates/reclassifies them with an LLM (chunked across a pool of + validator aliases), augments with additional entities the detector may + have missed, and produces final standoff entity spans with overlap + resolution. 
""" labels = _resolve_detection_labels(entity_labels) workflow_model_configs = self._inject_detector_params( @@ -101,17 +117,34 @@ def detect_and_validate_entities( detection_alias = resolve_model_alias("entity_detector", selected_models) validator_aliases = resolve_model_aliases("entity_validator", selected_models) augmenter_alias = resolve_model_alias("entity_augmenter", selected_models) - # Commit 2 will replace the LLMStructuredColumnConfig below with a chunked - # CustomColumnConfig that rotates across the full validator pool. Until then - # the first alias preserves current single-call semantics. - validator_alias = validator_aliases[0] logger.debug( - "detection aliases: detector=%s, validator=%s (pool=%s), augmenter=%s", + "detection aliases: detector=%s, validator_pool=%s, augmenter=%s", detection_alias, - validator_alias, validator_aliases, augmenter_alias, ) + # ModelConfig.max_parallel_requests caps concurrency *per alias*. When + # the pool has multiple validators each gets its own cap, so total + # in-flight validator calls can reach sum(per-alias caps). Operators + # provisioning rate budgets for downstream providers should size each + # alias's cap accordingly. + if len(validator_aliases) > 1: + logger.warning( + "entity validation runs across a pool of %d aliases (%s). 
" + "max_parallel_requests is enforced per alias, so the pool " + "multiplies total in-flight validator calls; budget provider " + "TPM/RPM accordingly.", + len(validator_aliases), + validator_aliases, + ) + + validator_generator = make_chunked_validation_generator(validator_aliases) + validator_params = ChunkedValidationParams( + pool=list(validator_aliases), + max_entities_per_call=validation_max_entities_per_call, + excerpt_window_chars=validation_excerpt_window_chars, + prompt_template=_get_validation_prompt(data_summary=data_summary, labels=labels), + ) detection_result = self._adapter.run_workflow( dataframe, @@ -131,13 +164,9 @@ def detect_and_validate_entities( generator_function=prepare_validation_inputs, ), CustomColumnConfig( - name=COL_VALIDATION_SKELETON, generator_function=build_validation_skeleton, drop=True - ), - LLMStructuredColumnConfig( name=COL_VALIDATION_DECISIONS, - prompt=_get_validation_prompt(data_summary=data_summary, labels=labels), - model_alias=validator_alias, - output_format=ValidationDecisionsSchema, + generator_function=validator_generator, + generator_params=validator_params, drop=True, ), CustomColumnConfig( @@ -222,6 +251,8 @@ def run( model_configs: list[ModelConfig], selected_models: DetectionModelSelection, gliner_detection_threshold: float, + validation_max_entities_per_call: int = _DEFAULT_VALIDATION_MAX_ENTITIES_PER_CALL, + validation_excerpt_window_chars: int = _DEFAULT_VALIDATION_EXCERPT_WINDOW_CHARS, entity_labels: list[str] | None = None, privacy_goal: PrivacyGoal | None = None, data_summary: str | None = None, @@ -244,6 +275,8 @@ def run( model_configs=model_configs, selected_models=selected_models, gliner_detection_threshold=gliner_detection_threshold, + validation_max_entities_per_call=validation_max_entities_per_call, + validation_excerpt_window_chars=validation_excerpt_window_chars, entity_labels=entity_labels, data_summary=data_summary, preview_num_records=preview_num_records, diff --git 
a/src/anonymizer/engine/detection/postprocess.py b/src/anonymizer/engine/detection/postprocess.py index 14338083..9e3557e3 100644 --- a/src/anonymizer/engine/detection/postprocess.py +++ b/src/anonymizer/engine/detection/postprocess.py @@ -255,11 +255,30 @@ def resolve_overlaps(entities: list[EntitySpan]) -> list[EntitySpan]: return sorted(accepted, key=lambda item: (item.start_position, item.end_position, item.label)) -def build_tagged_text(text: str, entities: list[EntitySpan]) -> str: - """Render human-readable tagged text for downstream LLM prompts.""" +def build_tagged_text( + text: str, + entities: list[EntitySpan], + *, + notation: TagNotation | str | None = None, +) -> str: + """Render human-readable tagged text for downstream LLM prompts. + + Args: + text: Source text to annotate. + entities: Entities to tag within ``text``; positions are relative to ``text``. + notation: Optional override of the tag notation. When ``None`` the + notation is chosen heuristically from ``text`` (default behaviour). + Callers that tag a substring of a larger document should pass the + parent document's notation so tags remain stable across excerpts. 
+ """ if not entities: return text - notation = _choose_tag_notation(text) + if notation is None: + resolved_notation = _choose_tag_notation(text) + elif isinstance(notation, TagNotation): + resolved_notation = notation + else: + resolved_notation = TagNotation(notation) cursor = 0 parts: list[str] = [] for entity in sorted(entities, key=lambda item: (item.start_position, item.end_position)): @@ -270,7 +289,7 @@ def build_tagged_text(text: str, entities: list[EntitySpan]) -> str: _format_entity_tag( value=text[entity.start_position : entity.end_position], label=entity.label, - notation=notation, + notation=resolved_notation, ) ) cursor = entity.end_position diff --git a/src/anonymizer/interface/anonymizer.py b/src/anonymizer/interface/anonymizer.py index 7ece6cbc..8cdbb04f 100644 --- a/src/anonymizer/interface/anonymizer.py +++ b/src/anonymizer/interface/anonymizer.py @@ -210,6 +210,8 @@ def _run_internal( model_configs=self._model_configs, selected_models=self._selected_models.detection, gliner_detection_threshold=config.detect.gliner_threshold, + validation_max_entities_per_call=config.detect.validation_max_entities_per_call, + validation_excerpt_window_chars=config.detect.validation_excerpt_window_chars, entity_labels=config.detect.entity_labels, privacy_goal=config.rewrite.privacy_goal if config.rewrite else None, data_summary=data.data_summary, diff --git a/tests/engine/test_chunked_validation.py b/tests/engine/test_chunked_validation.py new file mode 100644 index 00000000..70fa4dad --- /dev/null +++ b/tests/engine/test_chunked_validation.py @@ -0,0 +1,543 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for chunked LLM validation. 
+ +The module is layered: pure helpers (ordering, chunking, excerpts, prompt +rendering, merging) are tested directly; the async dispatch is tested via a +fake ``ModelFacade`` that records calls and returns preconfigured responses. +""" + +from __future__ import annotations + +import asyncio +import json +from typing import Any, Callable + +import pytest + +from anonymizer.engine.constants import ( + COL_MERGED_TAGGED_TEXT, + COL_SEED_ENTITIES, + COL_TAG_NOTATION, + COL_TEXT, + COL_VALIDATION_CANDIDATES, + COL_VALIDATION_DECISIONS, + COL_VALIDATION_SKELETON, +) +from anonymizer.engine.detection.chunked_validation import ( + ChunkedValidationParams, + build_chunk_excerpt, + build_chunk_skeleton, + chunk_candidates, + chunked_validate_row, + make_chunked_validation_generator, + merge_chunk_decisions, + order_candidates_by_position, + render_chunk_prompt, +) +from anonymizer.engine.detection.postprocess import EntitySpan, TagNotation +from anonymizer.engine.schemas import ( + EntitiesSchema, + RawValidationDecisionsSchema, + ValidationCandidateSchema, + ValidationCandidatesSchema, +) + + +# --------------------------------------------------------------------------- +# Test fixtures +# --------------------------------------------------------------------------- + + +class FakeFacade: + """Test double for ``ModelFacade`` recording invocations and replaying canned responses. + + A canned response can be a ``dict`` (auto-wrapped in a ```json fence), a + raw string, or a callable that receives the prompt and returns either. + Set ``raise_on_call=True`` to simulate a terminal LLM failure. 
+ """ + + def __init__( + self, + alias: str, + response: dict | str | Callable[[str], dict | str] | None = None, + *, + raise_on_call: bool = False, + ) -> None: + self.alias = alias + self._response = response + self._raise = raise_on_call + self.calls: list[dict[str, Any]] = [] + + async def agenerate(self, *, prompt, parser, system_prompt=None, purpose=None, **kwargs): + self.calls.append( + { + "prompt": prompt, + "system_prompt": system_prompt, + "purpose": purpose, + "kwargs": kwargs, + } + ) + if self._raise: + raise RuntimeError(f"forced failure from {self.alias}") + response = self._response + if callable(response): + response = response(prompt) + raw = response if isinstance(response, str) else f"```json\n{json.dumps(response)}\n```" + return parser(raw), [] + + +def _entity_span(entity_id: str, value: str, label: str, start: int, end: int) -> EntitySpan: + return EntitySpan( + entity_id=entity_id, + value=value, + label=label, + start_position=start, + end_position=end, + score=1.0, + source="detector", + ) + + +def _candidates_schema(*candidates: tuple[str, str, str]) -> ValidationCandidatesSchema: + return ValidationCandidatesSchema( + candidates=[ValidationCandidateSchema(id=cid, value=val, label=lab) for cid, val, lab in candidates] + ) + + +# --------------------------------------------------------------------------- +# Pure helpers: ordering / chunking / excerpt / skeleton / prompt / merge +# --------------------------------------------------------------------------- + + +class TestOrderCandidatesByPosition: + def test_orders_by_start_then_end_then_id(self) -> None: + candidates = _candidates_schema( + ("a_10_13", "foo", "first_name"), + ("b_0_5", "bar", "email"), + ("c_10_12", "baz", "city"), + ) + spans = [ + _entity_span("a_10_13", "foo", "first_name", 10, 13), + _entity_span("b_0_5", "bar", "email", 0, 5), + _entity_span("c_10_12", "baz", "city", 10, 12), + ] + ordered = order_candidates_by_position(candidates, spans) + assert [pair[0].id for 
pair in ordered] == ["b_0_5", "c_10_12", "a_10_13"] + + def test_missing_seed_raises_with_triage_hint(self) -> None: + candidates = _candidates_schema(("missing", "x", "y")) + with pytest.raises(ValueError, match="merge_and_build_candidates or prepare_validation_inputs"): + order_candidates_by_position(candidates, []) + + +class TestChunkCandidates: + def test_splits_into_chunks_of_at_most_n(self) -> None: + ordered = [(i, None) for i in range(5)] # type: ignore[misc] + chunks = chunk_candidates(ordered, max_entities_per_call=2) + assert [len(c) for c in chunks] == [2, 2, 1] + + def test_empty_input_yields_no_chunks(self) -> None: + assert chunk_candidates([], max_entities_per_call=10) == [] + + def test_non_positive_limit_raises(self) -> None: + with pytest.raises(ValueError): + chunk_candidates([(1, None)], max_entities_per_call=0) + + +class TestBuildChunkExcerpt: + def test_includes_fully_contained_neighbors_and_drops_outside_spans(self) -> None: + text = "Alice met Bob at Acme HQ in Seattle yesterday." + spans = [ + _entity_span("alice_0_5", "Alice", "first_name", 0, 5), + _entity_span("bob_10_13", "Bob", "first_name", 10, 13), + _entity_span("acme_17_21", "Acme", "organization", 17, 21), + _entity_span("seattle_28_35", "Seattle", "city", 28, 35), + ] + # Window 8 around Bob (10..13) => excerpt [2, 21]; contains Acme fully (17..21) + # but truncates 'Alice' (0..5) and excludes 'Seattle' (28..35). + excerpt = build_chunk_excerpt( + text=text, + chunk_spans=[spans[1]], + all_spans=spans, + window_chars=8, + notation=TagNotation.xml, + ) + assert "Bob" in excerpt + assert "Acme" in excerpt + # Alice is partially in the window (end=5 inside, but start=0 before) → excluded + assert "first_name>Alice" not in excerpt + assert "Seattle" not in excerpt + + def test_partially_contained_neighbor_is_excluded(self) -> None: + """A neighbor that only partially overlaps the excerpt window must not be re-tagged. 
+
+        Tagging a truncated span would emit text that doesn't match the
+        entity's actual value, which is worse than omitting the tag entirely.
+        """
+        text = "PREFIX Alice SUFFIX"  # Alice at 7..12
+        spans = [_entity_span("a", "Alice", "first_name", 7, 12)]
+        # Excerpt window [8, 12] cuts off 'A' from Alice → partial, must be excluded.
+        excerpt = build_chunk_excerpt(
+            text=text,
+            chunk_spans=spans,
+            all_spans=spans,
+            window_chars=0,
+            notation=TagNotation.xml,
+        )
+        # chunk_spans IS Alice (7..12), so its own span is within. This test
+        # instead builds the partial case via a distinct chunk entity:
+        bob = _entity_span("b", "lice", "first_name", 8, 12)
+        chunk = [bob]
+        excerpt2 = build_chunk_excerpt(
+            text=text,
+            chunk_spans=chunk,
+            all_spans=[spans[0], bob],
+            window_chars=0,
+            notation=TagNotation.xml,
+        )
+        # 'Alice' at 7..12 starts before excerpt_start=8 → excluded.
+        # 'bob' (the chunk entity) has positions 8..12 which are within the window → included.
+        assert "lice" in excerpt2
+        assert "Alice" not in excerpt2
+        _ = excerpt  # suppress "unused" lint
+
+    def test_forces_requested_notation_over_heuristic(self) -> None:
+        text = "Alice met Bob at HQ"
+        spans = [_entity_span("alice_0_5", "Alice", "first_name", 0, 5)]
+        excerpt = build_chunk_excerpt(
+            text=text,
+            chunk_spans=spans,
+            all_spans=spans,
+            window_chars=100,
+            notation=TagNotation.paren,
+        )
+        assert "((SENSITIVE:first_name|Alice))" in excerpt
+        assert "<first_name>" not in excerpt
+
+    def test_empty_chunk_returns_empty_string(self) -> None:
+        assert build_chunk_excerpt(
+            text="x", chunk_spans=[], all_spans=[], window_chars=5, notation=TagNotation.xml
+        ) == ""
+
+
+class TestBuildChunkSkeleton:
+    def test_skeleton_matches_chunk_only(self) -> None:
+        candidates = _candidates_schema(("a", "Alice", "first_name"), ("b", "Bob", "first_name"))
+        skeleton = build_chunk_skeleton([candidates.candidates[0]])
+        assert skeleton == {
+            "decisions": [
+                {"id": "a", "value": "Alice", "label": "first_name", "decision": 
None, "proposed_label": None, "reason": None} + ] + } + + +class TestRenderChunkPrompt: + def test_substitutes_excerpt_skeleton_and_notation(self) -> None: + template = ( + "Input: {{ _merged_tagged_text }}\n" + "Skeleton: {{ _validation_skeleton }}\n" + '{%- if _tag_notation == "xml" -%}notation-is-xml{%- endif -%}' + ) + rendered = render_chunk_prompt( + template=template, + excerpt="hello Alice", + skeleton={"decisions": [{"id": "a"}]}, + notation=TagNotation.xml, + ) + assert "Input: hello Alice" in rendered + assert "notation-is-xml" in rendered + # Dict rendered via Python str(); this matches the existing production prompt path. + assert "'id': 'a'" in rendered + + +class TestMergeChunkDecisions: + def test_filters_unknown_ids_and_deduplicates(self) -> None: + candidates = _candidates_schema(("a", "Alice", "first_name"), ("b", "Bob", "first_name")) + chunk_one = RawValidationDecisionsSchema.model_validate( + {"decisions": [{"id": "a", "decision": "keep"}, {"id": "ghost", "decision": "drop"}]} + ) + chunk_two = RawValidationDecisionsSchema.model_validate( + {"decisions": [{"id": "b", "decision": "drop"}, {"id": "a", "decision": "reclass"}]} + ) + merged = merge_chunk_decisions([chunk_one, chunk_two], candidates) + ids = [d["id"] for d in merged["decisions"]] + assert ids == ["a", "b"] # 'ghost' dropped; duplicate 'a' from chunk_two ignored + by_id = {d["id"]: d for d in merged["decisions"]} + assert by_id["a"]["decision"] == "keep" + assert by_id["b"]["decision"] == "drop" + assert by_id["a"]["value"] == "Alice" # enriched from candidate + assert by_id["a"]["label"] == "first_name" + + def test_drops_decisions_without_verdict(self) -> None: + """A decision with ``decision=None`` is equivalent to 'no answer' and must not leak through. + + Downstream ``apply_validation_decisions`` interprets missing-id as + 'keep unchanged'. Emitting a null-decision entry would break that. 
+ """ + candidates = _candidates_schema(("a", "Alice", "first_name")) + chunk = RawValidationDecisionsSchema.model_validate({"decisions": [{"id": "a", "decision": None}]}) + merged = merge_chunk_decisions([chunk], candidates) + assert merged == {"decisions": []} + + +# --------------------------------------------------------------------------- +# Async dispatch: chunked_validate_row end-to-end with fake facades +# --------------------------------------------------------------------------- + + +def _build_row( + *, + text: str, + seed_entities: list[EntitySpan], + candidates: ValidationCandidatesSchema, + notation: TagNotation = TagNotation.xml, +) -> dict[str, Any]: + return { + COL_TEXT: text, + COL_SEED_ENTITIES: EntitiesSchema( + entities=[ + { + "id": span.entity_id, + "value": span.value, + "label": span.label, + "start_position": span.start_position, + "end_position": span.end_position, + "score": span.score, + "source": span.source, + } + for span in seed_entities + ] + ).model_dump(mode="json"), + COL_VALIDATION_CANDIDATES: candidates.model_dump(mode="json"), + COL_TAG_NOTATION: notation.value, + } + + +_MINIMAL_TEMPLATE = ( + "TAGGED:{{ _merged_tagged_text }}|" + "SKELETON:{{ _validation_skeleton }}|" + "NOTATION:{{ _tag_notation }}" +) + + +class TestChunkedValidateRowPoolOfOne: + def test_single_chunk_single_alias_dispatches_once_and_merges(self) -> None: + text = "Alice and Bob met." 
+ spans = [ + _entity_span("a", "Alice", "first_name", 0, 5), + _entity_span("b", "Bob", "first_name", 10, 13), + ] + candidates = _candidates_schema(("a", "Alice", "first_name"), ("b", "Bob", "first_name")) + row = _build_row(text=text, seed_entities=spans, candidates=candidates) + + facade = FakeFacade( + "v0", + response={ + "decisions": [ + {"id": "a", "decision": "keep", "proposed_label": "", "reason": "real"}, + {"id": "b", "decision": "drop", "proposed_label": "", "reason": "placeholder"}, + ] + }, + ) + params = ChunkedValidationParams( + pool=["v0"], + max_entities_per_call=10, + excerpt_window_chars=100, + prompt_template=_MINIMAL_TEMPLATE, + ) + + out = asyncio.run(chunked_validate_row(row, params, {"v0": facade})) + + assert len(facade.calls) == 1 + decisions = out[COL_VALIDATION_DECISIONS]["decisions"] + assert {d["id"]: d["decision"] for d in decisions} == {"a": "keep", "b": "drop"} + + def test_empty_candidates_short_circuits_without_calls(self) -> None: + row = _build_row(text="hello", seed_entities=[], candidates=_candidates_schema()) + facade = FakeFacade("v0", response={"decisions": []}) + params = ChunkedValidationParams( + pool=["v0"], max_entities_per_call=10, excerpt_window_chars=50, prompt_template=_MINIMAL_TEMPLATE + ) + + out = asyncio.run(chunked_validate_row(row, params, {"v0": facade})) + + assert facade.calls == [] + assert out[COL_VALIDATION_DECISIONS] == {"decisions": []} + + +class TestChunkedValidateRowPoolOfTwoRoundRobin: + def test_chunks_assigned_round_robin_across_pool(self) -> None: + text = "A B C D E F" + " " * 50 # pad so excerpts don't overlap + spans = [_entity_span(f"e{i}", chr(ord("A") + i), "first_name", i * 2, i * 2 + 1) for i in range(6)] + candidates = _candidates_schema(*[(f"e{i}", chr(ord("A") + i), "first_name") for i in range(6)]) + row = _build_row(text=text, seed_entities=spans, candidates=candidates) + + def make_response(chunk_size: int) -> Callable[[str], dict]: + def _respond(prompt: str) -> dict: + # 
Parse out which entity ids are in this chunk from the skeleton. + # We can't easily, so just return empty decisions — the dispatch + # order assertion is about which alias was called, not contents. + return {"decisions": []} + + return _respond + + v0 = FakeFacade("v0", response=make_response(2)) + v1 = FakeFacade("v1", response=make_response(2)) + params = ChunkedValidationParams( + pool=["v0", "v1"], + max_entities_per_call=2, + excerpt_window_chars=50, + prompt_template=_MINIMAL_TEMPLATE, + ) + asyncio.run(chunked_validate_row(row, params, {"v0": v0, "v1": v1})) + # 6 candidates / 2 per chunk = 3 chunks; round-robin → v0,v1,v0 + assert len(v0.calls) == 2 + assert len(v1.calls) == 1 + + +class TestChunkedValidateRowMultiChunkReassembly: + def test_decisions_merged_across_chunks(self) -> None: + text = "one two three four five" + spans = [ + _entity_span("c1", "one", "first_name", 0, 3), + _entity_span("c2", "two", "first_name", 4, 7), + _entity_span("c3", "three", "first_name", 8, 13), + _entity_span("c4", "four", "first_name", 14, 18), + ] + candidates = _candidates_schema( + ("c1", "one", "first_name"), + ("c2", "two", "first_name"), + ("c3", "three", "first_name"), + ("c4", "four", "first_name"), + ) + row = _build_row(text=text, seed_entities=spans, candidates=candidates) + + def responder_for(alias: str) -> Callable[[str], dict]: + def respond(prompt: str) -> dict: + # Return a decision for every id mentioned in the skeleton portion of this prompt. + # Use alias-encoded decisions so we can verify which chunk decided which id. 
+ ids_here = [cid for cid in ("c1", "c2", "c3", "c4") if f"'id': '{cid}'" in prompt] + return { + "decisions": [ + {"id": cid, "decision": "keep", "proposed_label": "", "reason": alias} for cid in ids_here + ] + } + + return respond + + v0 = FakeFacade("v0", response=responder_for("v0")) + v1 = FakeFacade("v1", response=responder_for("v1")) + params = ChunkedValidationParams( + pool=["v0", "v1"], + max_entities_per_call=2, + excerpt_window_chars=50, + prompt_template=_MINIMAL_TEMPLATE, + ) + out = asyncio.run(chunked_validate_row(row, params, {"v0": v0, "v1": v1})) + decisions = {d["id"]: d for d in out[COL_VALIDATION_DECISIONS]["decisions"]} + assert set(decisions) == {"c1", "c2", "c3", "c4"} + # Chunk 0 (c1,c2) → v0; chunk 1 (c3,c4) → v1. + assert decisions["c1"]["reason"] == "v0" + assert decisions["c2"]["reason"] == "v0" + assert decisions["c3"]["reason"] == "v1" + assert decisions["c4"]["reason"] == "v1" + + +class TestChunkedValidateRowFailurePropagation: + def test_one_chunk_terminal_error_fails_whole_row(self) -> None: + """A terminal LLM error in any chunk bubbles up; downstream DD reporting turns that into a FailedRecord.""" + spans = [ + _entity_span("a", "Alice", "first_name", 0, 5), + _entity_span("b", "Bob", "first_name", 10, 13), + ] + candidates = _candidates_schema(("a", "Alice", "first_name"), ("b", "Bob", "first_name")) + row = _build_row(text="Alice and Bob", seed_entities=spans, candidates=candidates) + + v0 = FakeFacade("v0", response={"decisions": [{"id": "a", "decision": "keep"}]}) + v1 = FakeFacade("v1", raise_on_call=True) + params = ChunkedValidationParams( + pool=["v0", "v1"], + max_entities_per_call=1, + excerpt_window_chars=50, + prompt_template=_MINIMAL_TEMPLATE, + ) + with pytest.raises(RuntimeError, match="forced failure from v1"): + asyncio.run(chunked_validate_row(row, params, {"v0": v0, "v1": v1})) + + +class TestChunkedValidateRowMissingIdMirrorsSingleCall: + def test_decision_for_non_candidate_id_is_dropped(self) -> None: + 
"""Matches single-call contract: ``enrich_validation_decisions`` filters to candidate ids.""" + spans = [_entity_span("a", "Alice", "first_name", 0, 5)] + candidates = _candidates_schema(("a", "Alice", "first_name")) + row = _build_row(text="Alice", seed_entities=spans, candidates=candidates) + facade = FakeFacade( + "v0", + response={ + "decisions": [ + {"id": "a", "decision": "keep"}, + {"id": "unknown", "decision": "drop"}, + ] + }, + ) + params = ChunkedValidationParams( + pool=["v0"], max_entities_per_call=5, excerpt_window_chars=20, prompt_template=_MINIMAL_TEMPLATE + ) + out = asyncio.run(chunked_validate_row(row, params, {"v0": facade})) + ids = [d["id"] for d in out[COL_VALIDATION_DECISIONS]["decisions"]] + assert ids == ["a"] + + +class TestChunkedValidateRowGuardsBadConfig: + def test_pool_alias_missing_from_models_raises_helpful_error(self) -> None: + spans = [_entity_span("a", "Alice", "first_name", 0, 5)] + candidates = _candidates_schema(("a", "Alice", "first_name")) + row = _build_row(text="Alice", seed_entities=spans, candidates=candidates) + params = ChunkedValidationParams( + pool=["missing_alias"], + max_entities_per_call=5, + excerpt_window_chars=20, + prompt_template=_MINIMAL_TEMPLATE, + ) + with pytest.raises(KeyError, match="missing_alias"): + asyncio.run(chunked_validate_row(row, params, {"v0": FakeFacade("v0", response={"decisions": []})})) + + +# --------------------------------------------------------------------------- +# Factory: make_chunked_validation_generator +# --------------------------------------------------------------------------- + + +class TestMakeChunkedValidationGenerator: + def test_decorator_metadata_encodes_pool_and_required_columns(self) -> None: + fn = make_chunked_validation_generator(["v0", "v1"]) + meta = fn.custom_column_metadata + assert meta["model_aliases"] == ["v0", "v1"] + assert set(meta["required_columns"]) == { + COL_TEXT, + COL_SEED_ENTITIES, + COL_VALIDATION_CANDIDATES, + COL_TAG_NOTATION, + } + # Must 
not declare columns we deliberately don't read; an over-broad + # required_columns would distort DAG ordering elsewhere. + assert COL_VALIDATION_SKELETON not in meta["required_columns"] + assert COL_MERGED_TAGGED_TEXT not in meta["required_columns"] + + def test_factory_rejects_empty_pool(self) -> None: + with pytest.raises(ValueError, match="pool is empty"): + make_chunked_validation_generator([]) + + def test_generator_forwards_to_chunked_validate_row(self) -> None: + spans = [_entity_span("a", "Alice", "first_name", 0, 5)] + candidates = _candidates_schema(("a", "Alice", "first_name")) + row = _build_row(text="Alice", seed_entities=spans, candidates=candidates) + fn = make_chunked_validation_generator(["v0"]) + facade = FakeFacade("v0", response={"decisions": [{"id": "a", "decision": "keep"}]}) + params = ChunkedValidationParams( + pool=["v0"], max_entities_per_call=5, excerpt_window_chars=20, prompt_template=_MINIMAL_TEMPLATE + ) + out = asyncio.run(fn(row, params, {"v0": facade})) + assert out[COL_VALIDATION_DECISIONS]["decisions"][0]["id"] == "a" diff --git a/tests/engine/test_detection_workflow.py b/tests/engine/test_detection_workflow.py index 9b7b8d44..4b7a4174 100644 --- a/tests/engine/test_detection_workflow.py +++ b/tests/engine/test_detection_workflow.py @@ -7,7 +7,7 @@ import pandas as pd import pytest -from data_designer.config.column_configs import LLMStructuredColumnConfig +from data_designer.config.column_configs import CustomColumnConfig, LLMStructuredColumnConfig from data_designer.config.models import ModelConfig from anonymizer.config.models import DetectionModelSelection @@ -17,10 +17,15 @@ COL_ENTITIES_BY_VALUE, COL_FINAL_ENTITIES, COL_LATENT_ENTITIES, + COL_SEED_ENTITIES, + COL_TAG_NOTATION, COL_TAGGED_TEXT, COL_TEXT, + COL_VALIDATION_CANDIDATES, + COL_VALIDATION_DECISIONS, DEFAULT_ENTITY_LABELS, ) +from anonymizer.engine.detection.chunked_validation import ChunkedValidationParams from anonymizer.engine.detection.detection_workflow import ( 
EntityDetectionWorkflow, _format_label_examples, @@ -473,3 +478,171 @@ def test_default_entity_labels_preserves_novel_augmented_entities( assert "server_name" in final_labels assert "hostname" in final_labels assert "ipv4" in final_labels + + +# --------------------------------------------------------------------------- +# Chunked validation wiring (Commit 2) +# --------------------------------------------------------------------------- + + +def _find_column(columns: list, name: str): + for col in columns: + if getattr(col, "name", None) == name: + return col + raise AssertionError(f"Column {name!r} not found in workflow columns: {[getattr(c, 'name', c) for c in columns]}") + + +def test_validation_column_is_custom_chunked_generator( + stub_detector_model_configs: list[ModelConfig], + stub_detection_model_selection: DetectionModelSelection, +) -> None: + """COL_VALIDATION_DECISIONS is now a CustomColumnConfig bound to the chunked generator, + not an LLMStructuredColumnConfig.""" + adapter = Mock() + adapter.run_workflow.return_value = WorkflowRunResult( + dataframe=pd.DataFrame( + { + COL_TEXT: ["Alice"], + COL_DETECTED_ENTITIES: [{"entities": [{"value": "Alice", "label": "first_name"}]}], + } + ), + failed_records=[], + ) + workflow = EntityDetectionWorkflow(adapter=adapter) + workflow.run( + pd.DataFrame({COL_TEXT: ["Alice"]}), + model_configs=stub_detector_model_configs, + selected_models=stub_detection_model_selection, + gliner_detection_threshold=0.5, + tag_latent_entities=False, + ) + columns = adapter.run_workflow.call_args.kwargs["columns"] + validation_col = _find_column(columns, COL_VALIDATION_DECISIONS) + assert isinstance(validation_col, CustomColumnConfig) + # Must NOT be the old structured-output LLM column. + assert not isinstance(validation_col, LLMStructuredColumnConfig) + assert validation_col.drop is True + # generator_params must match the Detect config defaults that flow through. 
+ assert isinstance(validation_col.generator_params, ChunkedValidationParams) + assert validation_col.generator_params.pool == stub_detection_model_selection.entity_validator + assert validation_col.generator_params.max_entities_per_call > 0 + assert validation_col.generator_params.excerpt_window_chars > 0 + # The decorated generator's metadata must expose the pool and the exact + # set of columns it reads, so DataDesigner resolves facades and DAG ordering. + metadata = validation_col.generator_function.custom_column_metadata + assert metadata["model_aliases"] == list(stub_detection_model_selection.entity_validator) + assert set(metadata["required_columns"]) == { + COL_TEXT, + COL_SEED_ENTITIES, + COL_VALIDATION_CANDIDATES, + COL_TAG_NOTATION, + } + + +def test_validator_pool_kwargs_thread_through_to_generator_params( + stub_detector_model_configs: list[ModelConfig], + stub_detection_model_selection: DetectionModelSelection, +) -> None: + """Explicit ``validation_max_entities_per_call`` and ``validation_excerpt_window_chars`` + propagate from ``run()`` all the way to ``ChunkedValidationParams``.""" + adapter = Mock() + adapter.run_workflow.return_value = WorkflowRunResult( + dataframe=pd.DataFrame( + { + COL_TEXT: ["Alice"], + COL_DETECTED_ENTITIES: [{"entities": [{"value": "Alice", "label": "first_name"}]}], + } + ), + failed_records=[], + ) + workflow = EntityDetectionWorkflow(adapter=adapter) + workflow.run( + pd.DataFrame({COL_TEXT: ["Alice"]}), + model_configs=stub_detector_model_configs, + selected_models=stub_detection_model_selection, + gliner_detection_threshold=0.5, + validation_max_entities_per_call=17, + validation_excerpt_window_chars=42, + tag_latent_entities=False, + ) + columns = adapter.run_workflow.call_args.kwargs["columns"] + params = _find_column(columns, COL_VALIDATION_DECISIONS).generator_params + assert params.max_entities_per_call == 17 + assert params.excerpt_window_chars == 42 + + +def test_pool_size_greater_than_one_emits_warning( + 
stub_detector_model_configs: list[ModelConfig], + stub_detection_model_selection: DetectionModelSelection, + caplog: pytest.LogCaptureFixture, +) -> None: + """Operators with multiple validator aliases must be alerted that + ``max_parallel_requests`` is enforced per alias (pool multiplies in-flight).""" + selection = stub_detection_model_selection.model_copy( + update={"entity_validator": [*stub_detection_model_selection.entity_validator, "extra-validator"]} + ) + adapter = Mock() + adapter.run_workflow.return_value = WorkflowRunResult( + dataframe=pd.DataFrame( + { + COL_TEXT: ["Alice"], + COL_DETECTED_ENTITIES: [{"entities": [{"value": "Alice", "label": "first_name"}]}], + } + ), + failed_records=[], + ) + workflow = EntityDetectionWorkflow(adapter=adapter) + with caplog.at_level("WARNING"): + workflow.run( + pd.DataFrame({COL_TEXT: ["Alice"]}), + model_configs=stub_detector_model_configs, + selected_models=selection, + gliner_detection_threshold=0.5, + tag_latent_entities=False, + ) + # caplog can attach handlers at both the target logger and root, so the + # same record may appear twice in ``records``; dedupe by identity. 
+ pool_warnings = { + id(r): r + for r in caplog.records + if r.name == "anonymizer.detection" and "pool of" in r.getMessage() and "aliases" in r.getMessage() + } + assert len(pool_warnings) == 1 + (only,) = pool_warnings.values() + assert "multiplies total in-flight" in only.getMessage() + + +def test_pool_size_one_does_not_emit_warning( + stub_detector_model_configs: list[ModelConfig], + stub_detection_model_selection: DetectionModelSelection, + caplog: pytest.LogCaptureFixture, +) -> None: + """Default single-alias configurations must not spam the warning: it's a pool caveat, not advice for everyone.""" + adapter = Mock() + adapter.run_workflow.return_value = WorkflowRunResult( + dataframe=pd.DataFrame( + { + COL_TEXT: ["Alice"], + COL_DETECTED_ENTITIES: [{"entities": [{"value": "Alice", "label": "first_name"}]}], + } + ), + failed_records=[], + ) + assert len(stub_detection_model_selection.entity_validator) == 1, ( + "baseline default must be a single validator for this test to be meaningful" + ) + workflow = EntityDetectionWorkflow(adapter=adapter) + with caplog.at_level("WARNING"): + workflow.run( + pd.DataFrame({COL_TEXT: ["Alice"]}), + model_configs=stub_detector_model_configs, + selected_models=stub_detection_model_selection, + gliner_detection_threshold=0.5, + tag_latent_entities=False, + ) + pool_warnings = [ + r + for r in caplog.records + if r.name == "anonymizer.detection" and "pool of" in r.getMessage() and "aliases" in r.getMessage() + ] + assert pool_warnings == [] From f767e4c88b8e5516287f54fcfb07300453c08a2b Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Mon, 20 Apr 2026 18:56:55 -0700 Subject: [PATCH 03/20] docs(detect): document chunked validation + validator pools Adds a "Chunked validation" section to the detection concept doc covering the two new Detect knobs (`validation_max_entities_per_call`, `validation_excerpt_window_chars`), when chunking kicks in, and how to tune in either direction. 
Adds a "Validator pools" subsection to the models doc demonstrating list-form `entity_validator:` YAML, with a warning admonition calling out that `max_parallel_requests` is enforced per alias (so a pool multiplies total in-flight calls) and that pool aliases should target equivalent models. Cross-links the two sections. Rate-limit jargon (TPM/RPM) is expanded inline on first use in each page since a reader can enter from either; the operator-facing log warning in detection_workflow still uses the acronyms. Also adds a behavioral regression test pinning single-chunk vs multi-chunk parity: same deterministic per-id decisions, pool-of-one, three chunks vs one chunk must produce byte-identical merged `COL_VALIDATION_DECISIONS` and byte-identical post-`apply_validation_decisions` entities, with the expected keep/reclass/drop/untouched tallies and the final entity list pinned. Made-with: Cursor --- docs/concepts/detection.md | 20 +++ docs/concepts/models.md | 24 ++++ tests/engine/test_chunked_validation.py | 168 +++++++++++++++++++++++- 3 files changed, 211 insertions(+), 1 deletion(-) diff --git a/docs/concepts/detection.md b/docs/concepts/detection.md index 9fdb7a56..158ba1ad 100644 --- a/docs/concepts/detection.md +++ b/docs/concepts/detection.md @@ -45,7 +45,27 @@ config = AnonymizerConfig( |-------|---------|-------------| | `entity_labels` | `None` (all defaults) | List of labels to detect. Leave unset (or pass `None`) to use the full default set. | | `gliner_threshold` | `0.3` | GLiNER confidence threshold (0.0--1.0). Lower values detect more entities but may increase false positives. | +| `validation_max_entities_per_call` | `100` | Maximum candidate entities per validator LLM call. Rows with more candidates are split into chunks. See [Chunked validation](#chunked-validation). | +| `validation_excerpt_window_chars` | `500` | Characters of context included before and after a chunk's entity spans in the validator prompt. 
Bounds per-chunk prompt size; not the model's context-window limit. | +--- + +## Chunked validation + +When a row yields many entity candidates, validating them in a single LLM call can exceed the model's context window or the provider's rate limits (the tokens-per-minute or requests-per-minute quota most hosted models enforce). Anonymizer automatically splits validation for such rows: candidates are grouped in position order into chunks of at most `validation_max_entities_per_call`, and each chunk is validated independently with its own bounded text excerpt (`validation_excerpt_window_chars` before and after the chunk's span). Decisions are merged back into a single per-row set. + +The chunked path is always on; if a row has fewer candidates than the limit, it runs as a single call and is exactly equivalent to the unchunked behavior. Tuning guidance: + +- **Raise `validation_max_entities_per_call`** if your validator has a large context window and you want fewer, larger calls. +- **Lower it** if you hit provider rate limits or want more uniform per-call latency. +- **Raise `validation_excerpt_window_chars`** when short windows hide the context needed to disambiguate entities (e.g., `"John"` as first name vs. last name depends on surrounding text). +- **Lower it** to reduce per-chunk prompt tokens, at the risk of lower validation quality on context-sensitive labels. + +### Validator pools + +`entity_validator` can be a single alias (the default) or a list of aliases — a **pool**. When multiple aliases are configured, each chunk in a row is dispatched to the next alias in round-robin order, which lets you work around per-alias rate limits by spreading requests across equivalent endpoints. + +See [Validator pools](models.md#validator-pools) for the YAML syntax and caveats. 
## Entity labels diff --git a/docs/concepts/models.md b/docs/concepts/models.md index 77a9439a..8cdaa8fb 100644 --- a/docs/concepts/models.md +++ b/docs/concepts/models.md @@ -109,6 +109,30 @@ Roles you don't override keep their default alias selections, but those aliases Use [`anonymizer.validate_config(config)`](../reference/anonymizer/interface/anonymizer.md) (or [`anonymizer validate`](../reference/anonymizer/interface/cli/main.md) from the CLI) after changing model configs to catch alias mismatches before processing data. +### Validator pools + +`entity_validator` accepts either a single alias (shown above) or a list of aliases. A list forms a **validator pool**: [chunked validation](detection.md#chunked-validation) dispatches each chunk to the next alias in round-robin order, which is useful for aggregating quota across equivalent endpoints when a single alias would hit the provider's rate limits (tokens-per-minute or requests-per-minute quotas). + +```yaml +selected_models: + detection: + entity_detector: gliner-pii-detector + entity_validator: + - gpt5-primary + - gpt5-secondary + entity_augmenter: gpt5-primary + latent_detector: claude-sonnet +``` + +Every alias in the pool must also appear in `model_configs`; `anonymizer validate` flags unknown aliases by index. A scalar value remains valid and is equivalent to a one-element list. + +!!! warning "`max_parallel_requests` is enforced per alias" + + A pool with N aliases effectively allows up to `sum(max_parallel_requests for alias in pool)` concurrent validator calls per row when chunks exist. Budget your provider rate limits accordingly — the whole point of pooling is to multiply in-flight requests, but the multiplication is real. + + Pool aliases should target **equivalent models** (same model family, similar quality). Mixing heterogeneous models produces inconsistent validation across chunks in the same row and is almost always a misconfiguration. 
+ + ### Choosing custom models For Anonymizer, the best overall leaderboard model is not always the best default for every role. diff --git a/tests/engine/test_chunked_validation.py b/tests/engine/test_chunked_validation.py index 70fa4dad..3c41f035 100644 --- a/tests/engine/test_chunked_validation.py +++ b/tests/engine/test_chunked_validation.py @@ -12,6 +12,7 @@ import asyncio import json +import re from typing import Any, Callable import pytest @@ -36,7 +37,7 @@ order_candidates_by_position, render_chunk_prompt, ) -from anonymizer.engine.detection.postprocess import EntitySpan, TagNotation +from anonymizer.engine.detection.postprocess import EntitySpan, TagNotation, apply_validation_decisions from anonymizer.engine.schemas import ( EntitiesSchema, RawValidationDecisionsSchema, @@ -541,3 +542,168 @@ def test_generator_forwards_to_chunked_validate_row(self) -> None: ) out = asyncio.run(fn(row, params, {"v0": facade})) assert out[COL_VALIDATION_DECISIONS]["decisions"][0]["id"] == "a" + + +# --------------------------------------------------------------------------- +# Behavioral regression: single-chunk vs multi-chunk parity (pool-of-one). +# --------------------------------------------------------------------------- + + +def _selective_facade(alias: str, decisions_by_id: dict[str, dict[str, Any]]) -> FakeFacade: + """Return a facade whose response depends on the ids embedded in each chunk's prompt. + + The prompt renders ``_validation_skeleton`` as Python ``str(dict)``; we + parse ``'id': 'X'`` tokens back out to select which decisions to return. + This makes the LLM behaviour a pure function of the *candidate ids in + the chunk*, not of chunk shape or sequencing -- which is exactly the + assumption a real deterministic validator would satisfy when re-run. 
+ """ + + def respond(prompt: str) -> dict[str, Any]: + ids = re.findall(r"'id':\s*'([^']+)'", prompt) + return {"decisions": [decisions_by_id[i] for i in ids if i in decisions_by_id]} + + return FakeFacade(alias, response=respond) + + +def _summarize(validated: list[EntitySpan]) -> list[tuple[str, str]]: + return [(e.entity_id, e.label) for e in validated] + + +def _tally(before: list[EntitySpan], after: list[EntitySpan], decisions: dict) -> dict[str, int]: + """Count keep/reclass/drop/untouched relative to the decisions the LLM returned.""" + before_labels = {e.entity_id: e.label for e in before} + after_ids = {e.entity_id for e in after} + decided_ids: dict[str, dict[str, str]] = {d["id"]: d for d in decisions["decisions"]} + counts = {"keep": 0, "reclass": 0, "drop": 0, "untouched": 0} + for entity_id, original_label in before_labels.items(): + decision = decided_ids.get(entity_id) + if decision is None: + counts["untouched"] += 1 + continue + verdict = decision.get("decision") + if verdict == "drop": + assert entity_id not in after_ids, f"drop decision for {entity_id} did not remove it" + counts["drop"] += 1 + elif verdict == "reclass": + counts["reclass"] += 1 + elif verdict == "keep": + counts["keep"] += 1 + return counts + + +def _normalize_decisions(doc: dict) -> list[tuple[str, str, str]]: + return sorted( + (d["id"], d.get("decision") or "", d.get("proposed_label") or "") for d in doc["decisions"] + ) + + +class TestChunkedValidationRegression: + """Partitioning must not change outcomes when the LLM is deterministic per candidate. + + Guards the most important property we promised when we switched from a + single ``LLMStructuredColumnConfig`` to chunked ``CustomColumnConfig`` + dispatch: given the same set of per-id decisions, chunk sizing is an + implementation detail of *how* we talk to the validator, not *what* + entities survive validation. + """ + + SCENARIO_TEXT = ( + # Positions referenced below are into this exact string. 
+ "Alice met Bob in Chicago at Acme HQ; Doe introduced Eve to the team later." + # 0 5 10 15 20 28 34 43 53 56 + ) + + @pytest.fixture + def scenario(self) -> tuple[list[EntitySpan], ValidationCandidatesSchema, dict[str, dict[str, Any]]]: + spans = [ + _entity_span("a", "Alice", "first_name", 0, 5), + _entity_span("b", "Bob", "first_name", 10, 13), + _entity_span("c", "Chicago", "city", 17, 24), + _entity_span("d", "Acme", "organization", 28, 32), + _entity_span("e", "Doe", "last_name", 37, 40), + _entity_span("f", "Eve", "first_name", 54, 57), + ] + candidates = _candidates_schema( + ("a", "Alice", "first_name"), + ("b", "Bob", "first_name"), + ("c", "Chicago", "city"), + ("d", "Acme", "organization"), + ("e", "Doe", "last_name"), + ("f", "Eve", "first_name"), + ) + # Deterministic per-id decisions covering all branches. + # ``f`` intentionally has no decision: downstream must keep it as-is, + # regardless of whether it lands in its own chunk or shares one. + decisions_by_id: dict[str, dict[str, Any]] = { + "a": {"id": "a", "decision": "keep"}, + "b": {"id": "b", "decision": "drop"}, + "c": {"id": "c", "decision": "reclass", "proposed_label": "location"}, + "d": {"id": "d", "decision": "keep"}, + "e": {"id": "e", "decision": "reclass", "proposed_label": "surname"}, + } + return spans, candidates, decisions_by_id + + def _run( + self, + *, + spans: list[EntitySpan], + candidates: ValidationCandidatesSchema, + decisions_by_id: dict[str, dict[str, Any]], + max_per_call: int, + ) -> tuple[dict, list[EntitySpan], int]: + row = _build_row(text=self.SCENARIO_TEXT, seed_entities=spans, candidates=candidates) + facade = _selective_facade("solo", decisions_by_id) + params = ChunkedValidationParams( + pool=["solo"], + max_entities_per_call=max_per_call, + excerpt_window_chars=200, + prompt_template=_MINIMAL_TEMPLATE, + ) + out = asyncio.run(chunked_validate_row(row, params, {"solo": facade})) + decisions_doc = out[COL_VALIDATION_DECISIONS] + validated = 
apply_validation_decisions(spans, decisions_doc) + return decisions_doc, validated, len(facade.calls) + + def test_multi_chunk_matches_single_chunk(self, scenario) -> None: + spans, candidates, decisions_by_id = scenario + + # 1 chunk -- stands in for the "legacy single-call" path: all + # candidates fit into one validator call, no partitioning. + single_doc, single_validated, single_calls = self._run( + spans=spans, candidates=candidates, decisions_by_id=decisions_by_id, max_per_call=10 + ) + # 3 chunks of size 2 -- pool-of-one with real partitioning. + multi_doc, multi_validated, multi_calls = self._run( + spans=spans, candidates=candidates, decisions_by_id=decisions_by_id, max_per_call=2 + ) + + # Sanity: the two configurations actually differ in *how* they call + # the validator. Without this the test is trivially satisfiable. + assert single_calls == 1 + assert multi_calls == 3 + + # Decisions merged back together are identical (order-insensitive). + assert _normalize_decisions(single_doc) == _normalize_decisions(multi_doc) + + # Final per-entity outcomes (surviving ids + their post-validation + # labels) are identical. This is the ``COL_DETECTED_ENTITIES`` parity + # claim: downstream stages cannot tell the two runs apart. + assert _summarize(single_validated) == _summarize(multi_validated) + + # Keep/reclass/drop/untouched tallies match the fixed decision set. + expected_tally = {"keep": 2, "reclass": 2, "drop": 1, "untouched": 1} + assert _tally(spans, single_validated, single_doc) == expected_tally + assert _tally(spans, multi_validated, multi_doc) == expected_tally + + # Concrete post-validation outcome, pinned so a regression in + # ``apply_validation_decisions`` or chunk merging is caught + # precisely, not just "something changed". 
+ assert _summarize(multi_validated) == [ + ("a", "first_name"), # keep + ("c", "location"), # reclass + ("d", "organization"), # keep + ("e", "surname"), # reclass + ("f", "first_name"), # untouched (no decision) + # "b" dropped + ] From ab42bd546b717bb1b6939a8c7897550b038f17c6 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Mon, 20 Apr 2026 19:07:04 -0700 Subject: [PATCH 04/20] fix: format --- .../engine/detection/chunked_validation.py | 4 +-- tests/engine/test_chunked_validation.py | 26 +++++++++---------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/anonymizer/engine/detection/chunked_validation.py b/src/anonymizer/engine/detection/chunked_validation.py index 02f28ac1..3f22101c 100644 --- a/src/anonymizer/engine/detection/chunked_validation.py +++ b/src/anonymizer/engine/detection/chunked_validation.py @@ -180,9 +180,7 @@ def build_chunk_excerpt( def build_chunk_skeleton(chunk_candidates_: list[Any]) -> dict[str, Any]: """Build the validation skeleton (``ValidationSkeletonSchema``) for a chunk.""" skeleton = ValidationSkeletonSchema( - decisions=[ - ValidationSkeletonDecisionSchema(id=c.id, value=c.value, label=c.label) for c in chunk_candidates_ - ] + decisions=[ValidationSkeletonDecisionSchema(id=c.id, value=c.value, label=c.label) for c in chunk_candidates_] ) return skeleton.model_dump(mode="json") diff --git a/tests/engine/test_chunked_validation.py b/tests/engine/test_chunked_validation.py index 3c41f035..04cc3efc 100644 --- a/tests/engine/test_chunked_validation.py +++ b/tests/engine/test_chunked_validation.py @@ -45,7 +45,6 @@ ValidationCandidatesSchema, ) - # --------------------------------------------------------------------------- # Test fixtures # --------------------------------------------------------------------------- @@ -218,9 +217,9 @@ def test_forces_requested_notation_over_heuristic(self) -> None: assert "" not in excerpt def test_empty_chunk_returns_empty_string(self) -> None: - assert build_chunk_excerpt( - 
text="x", chunk_spans=[], all_spans=[], window_chars=5, notation=TagNotation.xml - ) == "" + assert ( + build_chunk_excerpt(text="x", chunk_spans=[], all_spans=[], window_chars=5, notation=TagNotation.xml) == "" + ) class TestBuildChunkSkeleton: @@ -229,7 +228,14 @@ def test_skeleton_matches_chunk_only(self) -> None: skeleton = build_chunk_skeleton([candidates.candidates[0]]) assert skeleton == { "decisions": [ - {"id": "a", "value": "Alice", "label": "first_name", "decision": None, "proposed_label": None, "reason": None} + { + "id": "a", + "value": "Alice", + "label": "first_name", + "decision": None, + "proposed_label": None, + "reason": None, + } ] } @@ -316,11 +322,7 @@ def _build_row( } -_MINIMAL_TEMPLATE = ( - "TAGGED:{{ _merged_tagged_text }}|" - "SKELETON:{{ _validation_skeleton }}|" - "NOTATION:{{ _tag_notation }}" -) +_MINIMAL_TEMPLATE = "TAGGED:{{ _merged_tagged_text }}|SKELETON:{{ _validation_skeleton }}|NOTATION:{{ _tag_notation }}" class TestChunkedValidateRowPoolOfOne: @@ -593,9 +595,7 @@ def _tally(before: list[EntitySpan], after: list[EntitySpan], decisions: dict) - def _normalize_decisions(doc: dict) -> list[tuple[str, str, str]]: - return sorted( - (d["id"], d.get("decision") or "", d.get("proposed_label") or "") for d in doc["decisions"] - ) + return sorted((d["id"], d.get("decision") or "", d.get("proposed_label") or "") for d in doc["decisions"]) class TestChunkedValidationRegression: From c4196f2d9f25319c3a89d025aea3a72504f6d038 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Tue, 21 Apr 2026 10:33:24 -0700 Subject: [PATCH 05/20] feat(detect): cross-alias failover + DD engine compatibility --- docs/concepts/detection.md | 8 +- docs/concepts/models.md | 5 +- .../engine/detection/chunked_validation.py | 160 +++++++++++++++--- tests/engine/test_chunked_validation.py | 134 ++++++++++++++- 4 files changed, 272 insertions(+), 35 deletions(-) diff --git a/docs/concepts/detection.md b/docs/concepts/detection.md index 158ba1ad..163371b6 100644 
--- a/docs/concepts/detection.md +++ b/docs/concepts/detection.md @@ -52,7 +52,7 @@ config = AnonymizerConfig( ## Chunked validation -When a row yields many entity candidates, validating them in a single LLM call can exceed the model's context window or the provider's rate limits (the tokens-per-minute or requests-per-minute quota most hosted models enforce). Anonymizer automatically splits validation for such rows: candidates are grouped in position order into chunks of at most `validation_max_entities_per_call`, and each chunk is validated independently with its own bounded text excerpt (`validation_excerpt_window_chars` before and after the chunk's span). Decisions are merged back into a single per-row set. +When a row yields many entity candidates, validating them in a single LLM call can often exceed the model's context window or the provider's rate limits (tokens-per-minute or requests-per-minute quotas that many hosted models enforce). Anonymizer automatically splits validation for such rows: candidates are grouped in position order into chunks of at most `validation_max_entities_per_call`, and each chunk is validated independently with its own bounded text excerpt (`validation_excerpt_window_chars` before and after the chunk's span). Decisions are merged back into a single per-row set. The chunked path is always on; if a row has fewer candidates than the limit, it runs as a single call and is exactly equivalent to the unchunked behavior. Tuning guidance: @@ -65,6 +65,12 @@ The chunked path is always on; if a row has fewer candidates than the limit, it `entity_validator` can be a single alias (the default) or a list of aliases — a **pool**. When multiple aliases are configured, each chunk in a row is dispatched to the next alias in round-robin order, which lets you work around per-alias rate limits by spreading requests across equivalent endpoints. +Pools also act as **cross-alias failover**. 
If a chunk's primary alias raises a terminal exception (5xx after transport retries, connection error, or parser error), the chunk is re-dispatched against the remaining pool members in pool order before giving up. A chunk only fails when every pool member has raised for it. This makes pools a cheap way to harden validation against single-endpoint degradation, in addition to their load-spreading role. + +#### Failure contract + +When chunked validation cannot get an answer for a row — every pool member raised for at least one of its chunks — the row is **dropped** from the output DataFrame, not silently passed through with unscrubbed text. The dropped row is reported on `result.failed_records` with `step="detection"`, so a downstream reprocessing pass can identify misses by diffing input IDs against output IDs. + See [Validator pools](models.md#validator-pools) for the YAML syntax and caveats. diff --git a/docs/concepts/models.md b/docs/concepts/models.md index 8cdaa8fb..c7adc8d2 100644 --- a/docs/concepts/models.md +++ b/docs/concepts/models.md @@ -111,7 +111,10 @@ Roles you don't override keep their default alias selections, but those aliases ### Validator pools -`entity_validator` accepts either a single alias (shown above) or a list of aliases. A list forms a **validator pool**: [chunked validation](detection.md#chunked-validation) dispatches each chunk to the next alias in round-robin order, which is useful for aggregating quota across equivalent endpoints when a single alias would hit the provider's rate limits (tokens-per-minute or requests-per-minute quotas). +`entity_validator` accepts either a single alias (shown above) or a list of aliases. A list forms a **validator pool** with two jobs: + +1. 
**Load spreading.** [Chunked validation](detection.md#chunked-validation) dispatches each chunk to the next alias in round-robin order, aggregating quota across equivalent endpoints when a single alias would hit the provider's rate limits (tokens-per-minute or requests-per-minute quotas). +2. **Failover.** If a chunk's assigned alias can't complete the call (an unrecoverable rate limit, a 5xx that didn't clear on retry, a malformed response), the same chunk is automatically retried against the other aliases in your pool before the row is given up on. A row is only dropped when *every* alias in the pool has failed for the same chunk. Single-alias pools have nothing to fall back to, so they behave the same as not using a pool. ```yaml selected_models: diff --git a/src/anonymizer/engine/detection/chunked_validation.py b/src/anonymizer/engine/detection/chunked_validation.py index 3f22101c..860e7fa6 100644 --- a/src/anonymizer/engine/detection/chunked_validation.py +++ b/src/anonymizer/engine/detection/chunked_validation.py @@ -6,11 +6,22 @@ Validation is the step that asks an LLM to keep/reclass/drop each candidate entity produced by the detector. Historically we ran one LLM call per row over the full tagged text. For long documents with many candidates this hits -context-window and TPM/RPM limits. This module replaces that single call with -a fan-out: partition the row's candidates into chunks, build a small tagged -excerpt of text around each chunk, render the existing validation prompt per -chunk, and dispatch each chunk to an alias selected round-robin from a -configured validator pool. +context-window and provider rate limits. This module replaces that single +call with a fan-out: partition the row's candidates into chunks, build a +small tagged excerpt of text around each chunk, render the existing +validation prompt per chunk, and dispatch each chunk to an alias selected +round-robin from a configured validator pool. + +Failure contract. 
Each chunk attempts its round-robin-assigned alias first +and, on any terminal exception from that alias (5xx after transport retries, +connection errors, parser errors), fails over sequentially to the rest of +the pool. A chunk only fails when every pool member has raised. A failed +chunk cancels sibling chunks (``asyncio.gather`` semantics) and raises out +of the custom column generator, at which point DataDesigner drops the row +from the output DataFrame and ``NddAdapter._detect_missing_records`` +surfaces it as a ``FailedRecord``. This is the "best effort on first pass, +identify and reprocess misses" contract: raw text never silently leaks +through as unscrubbed output. The per-chunk decisions are merged into a ``ValidationDecisionsSchema``-shaped payload so the downstream ``enrich_validation_decisions`` column keeps @@ -20,6 +31,19 @@ :func:`make_chunked_validation_generator`, which produces a generator function decorated with ``@custom_column_generator`` and bound to a concrete pool. The pure helpers below are exposed for unit testing. + +Sync/async boundary with DataDesigner. The DD-exposed wrapper is sync +(DD's default thread-pool engine calls ``generator.generate(record)`` +directly; an async wrapper would return a coroutine and DD would reject it +with ``"must return a dict, got coroutine"``). The wrapper runs the +``chunked_validate_row`` coroutine on a fresh event loop per row via +``asyncio.run``; inside, each chunk dispatch calls the *sync* +``facade.generate()`` through ``asyncio.to_thread`` because the facades DD +hands us under the sync engine are sync-mode ``HttpModelClient``s whose +``agenerate()`` raises. DD#545 (shipped in ``data-designer>=0.5.7``) +bridges sync ``.generate()`` calls to ``agenerate`` transparently under +the async engine, so this same code path works both before and after +Anonymizer#119 turns the async engine on for detection. 
""" from __future__ import annotations @@ -75,9 +99,11 @@ class ChunkedValidationParams(BaseModel): Attributes: pool: Ordered list of validator model aliases. Chunk ``i`` is dispatched - to ``pool[i % len(pool)]``. Must be non-empty and every alias must - also be present in the decorator's ``model_aliases`` so DataDesigner - materialises the facade. + to ``pool[i % len(pool)]`` as its primary; on any terminal exception + from that alias the chunk fails over through the rest of the pool + (starting from the next position, wrapping around). Must be + non-empty and every alias must also be present in the decorator's + ``model_aliases`` so DataDesigner materialises the facade. max_entities_per_call: Upper bound on candidates per chunk. excerpt_window_chars: Chars of surrounding raw text included in each chunk's excerpt on either side of the chunk span. @@ -261,30 +287,92 @@ def merge_chunk_decisions( async def _dispatch_chunk( *, - facade: Any, + facades: list[tuple[str, Any]], prompt: str, system_prompt: str | None, chunk_index: int, ) -> RawValidationDecisionsSchema: - """Dispatch a single chunk via the configured ModelFacade. + """Dispatch a single chunk with cross-alias failover across the pool. + + ``facades`` is an ordered list of ``(alias, facade)`` pairs. The first + entry is the chunk's round-robin-assigned primary; subsequent entries + are the rest of the pool, tried in order on any terminal exception from + the primary. Each facade carries its own transport-level retry policy + (``RetryConfig.max_retries`` + exponential backoff on 5xx and connection + errors) and its own AIMD throttling on 429, so by the time an exception + escapes the facade call we consider that alias exhausted for this chunk. + + We call the *sync* ``facade.generate()`` inside ``asyncio.to_thread`` on + purpose. 
Under DataDesigner's default thread-pool engine the facades + DD hands us are sync-mode ``HttpModelClient``s, and calling + ``facade.agenerate()`` on them raises + ``"Async methods are not available on a sync-mode HttpModelClient"``. + The sync call works under both engines: DD#545 (shipped in + ``data-designer>=0.5.7``) bridges sync ``.generate()`` calls from + custom column generators to ``agenerate`` under the async engine, so + wrapping in ``asyncio.to_thread`` gives us per-row chunk concurrency + today and forward-compatibility after Anonymizer#119 lands. We use ``PydanticResponseRecipe`` so the facade appends JSON task instructions and parses the response into ``RawValidationDecisionsSchema``. - Parser failures and transient LLM errors are handled by - ``ModelFacade.agenerate`` (throttle, retries); any error that escapes is - terminal for the chunk. We let it propagate so the row fails as a whole - and DataDesigner records it as a ``FailedRecord``. + + Single-alias pools get no additional attempts — the loop runs exactly + once and re-raises the original exception, matching the pre-failover + contract. Multi-alias pools get ``len(pool)`` total attempts. If every + pool member raises, the *last* exception propagates so DataDesigner + records the row as a ``FailedRecord`` via ``NddAdapter._detect_missing_records``. + + Each failover attempt is logged at WARNING so operators can correlate + degraded pool members with run-level failure-rate spikes. 
""" recipe = PydanticResponseRecipe(data_type=RawValidationDecisionsSchema) final_prompt = recipe.apply_recipe_to_user_prompt(prompt) final_system = recipe.apply_recipe_to_system_prompt(system_prompt) - output, _messages = await facade.agenerate( - prompt=final_prompt, - parser=recipe.parse, - system_prompt=final_system, - purpose=f"entity-validation-chunk-{chunk_index}", - ) - return output + + last_exc: BaseException | None = None + for attempt_index, (alias, facade) in enumerate(facades): + try: + output, _messages = await asyncio.to_thread( + facade.generate, + prompt=final_prompt, + parser=recipe.parse, + system_prompt=final_system, + purpose=f"entity-validation-chunk-{chunk_index}-attempt-{attempt_index}", + ) + if attempt_index > 0: + logger.info( + "validator chunk %d: recovered on failover alias=%s (attempt %d of %d)", + chunk_index, + alias, + attempt_index + 1, + len(facades), + ) + return output + except Exception as exc: # noqa: BLE001 — we classify by failover position, not type + last_exc = exc + remaining = len(facades) - attempt_index - 1 + if remaining > 0: + logger.warning( + "validator chunk %d: alias=%s raised %s (%s); failing over to next pool member (%d remaining)", + chunk_index, + alias, + type(exc).__name__, + exc, + remaining, + ) + else: + logger.error( + "validator chunk %d: alias=%s raised %s (%s); pool exhausted — row will be dropped", + chunk_index, + alias, + type(exc).__name__, + exc, + ) + + # Defensive: facades is non-empty (caller guarantees), so the loop either + # returned a successful output or recorded last_exc and re-raises here. + assert last_exc is not None + raise last_exc async def chunked_validate_row( @@ -353,12 +441,15 @@ async def chunked_validate_row( ) # Round-robin across the validator pool. ``ChunkedValidationParams`` # guarantees ``pool`` is non-empty; ``chunk_index`` comes from - # ``enumerate`` so it's non-negative by construction. 
- alias = params.pool[chunk_index % len(params.pool)] - facade = models[alias] + # ``enumerate`` so it's non-negative by construction. The rotated + # order (primary first, then the rest of the pool) is what + # ``_dispatch_chunk`` walks on cross-alias failover. + start = chunk_index % len(params.pool) + rotated_aliases = [params.pool[(start + offset) % len(params.pool)] for offset in range(len(params.pool))] + chunk_facades = [(alias, models[alias]) for alias in rotated_aliases] tasks.append( _dispatch_chunk( - facade=facade, + facades=chunk_facades, prompt=prompt, system_prompt=params.system_prompt, chunk_index=chunk_index, @@ -379,7 +470,7 @@ async def chunked_validate_row( def make_chunked_validation_generator(pool: list[str]) -> Any: - """Build a ``@custom_column_generator``-decorated async function bound to ``pool``. + """Build a ``@custom_column_generator``-decorated sync function bound to ``pool``. ``model_aliases`` must be declared statically on the decorator so DataDesigner knows which facades to materialise for the generator. Since @@ -387,6 +478,19 @@ def make_chunked_validation_generator(pool: list[str]) -> Any: The required_columns are exhaustive for DataDesigner's DAG ordering: the generator reads the raw text, seed entities (for positions), the candidate list (what to decide), and the tag notation (for excerpt tagging). + + Why the outer wrapper is sync. DataDesigner routes cell-by-cell custom + generators through a ThreadPoolExecutor by default (the async engine is + opt-in via ``DATA_DESIGNER_ASYNC_ENGINE=1``). The thread-pool path calls + ``generator.generate(record)`` which synchronously invokes our function + and passes its return value straight into ``_postprocess_result``. If the + outer function were ``async``, the sync caller would receive a coroutine + object and DD would reject it with ``"must return a dict, got coroutine"``. 
+ The sync wrapper here runs the async row logic on a fresh event loop + (safe because each DD worker thread has no ambient loop) and returns the + resolved dict, so this works under both the default thread engine and + the opt-in async engine (which wraps sync generators in + ``asyncio.to_thread``). """ if not pool: raise ValueError("Cannot build chunked validation generator: pool is empty.") @@ -400,11 +504,11 @@ def make_chunked_validation_generator(pool: list[str]) -> Any: ], model_aliases=list(pool), ) - async def chunked_validate( + def chunked_validate( row: dict[str, Any], generator_params: ChunkedValidationParams, models: dict[str, Any], ) -> dict[str, Any]: - return await chunked_validate_row(row, generator_params, models) + return asyncio.run(chunked_validate_row(row, generator_params, models)) return chunked_validate diff --git a/tests/engine/test_chunked_validation.py b/tests/engine/test_chunked_validation.py index 04cc3efc..57f27187 100644 --- a/tests/engine/test_chunked_validation.py +++ b/tests/engine/test_chunked_validation.py @@ -53,6 +53,13 @@ class FakeFacade: """Test double for ``ModelFacade`` recording invocations and replaying canned responses. + Exposes a *sync* ``generate()`` method because ``_dispatch_chunk`` now + calls ``asyncio.to_thread(facade.generate, ...)`` (see the module + docstring for why the sync primitive is correct under both DD engines). + Tests still drive the pipeline through ``asyncio.run(chunked_validate_row(...))``, + which awaits the ``to_thread`` task that invokes this sync method in a + worker thread. + A canned response can be a ``dict`` (auto-wrapped in a ```json fence), a raw string, or a callable that receives the prompt and returns either. Set ``raise_on_call=True`` to simulate a terminal LLM failure. 
@@ -70,7 +77,7 @@ def __init__( self._raise = raise_on_call self.calls: list[dict[str, Any]] = [] - async def agenerate(self, *, prompt, parser, system_prompt=None, purpose=None, **kwargs): + def generate(self, *, prompt, parser, system_prompt=None, purpose=None, **kwargs): self.calls.append( { "prompt": prompt, @@ -449,8 +456,12 @@ def respond(prompt: str) -> dict: class TestChunkedValidateRowFailurePropagation: - def test_one_chunk_terminal_error_fails_whole_row(self) -> None: - """A terminal LLM error in any chunk bubbles up; downstream DD reporting turns that into a FailedRecord.""" + def test_row_fails_only_when_every_pool_member_raises_for_a_chunk(self) -> None: + """With failover enabled, a chunk only fails when *every* pool member raises. + + Downstream DD reporting then turns that row into a FailedRecord via + ``NddAdapter._detect_missing_records`` — no unscrubbed passthrough. + """ spans = [ _entity_span("a", "Alice", "first_name", 0, 5), _entity_span("b", "Bob", "first_name", 10, 13), @@ -458,7 +469,7 @@ def test_one_chunk_terminal_error_fails_whole_row(self) -> None: candidates = _candidates_schema(("a", "Alice", "first_name"), ("b", "Bob", "first_name")) row = _build_row(text="Alice and Bob", seed_entities=spans, candidates=candidates) - v0 = FakeFacade("v0", response={"decisions": [{"id": "a", "decision": "keep"}]}) + v0 = FakeFacade("v0", raise_on_call=True) v1 = FakeFacade("v1", raise_on_call=True) params = ChunkedValidationParams( pool=["v0", "v1"], @@ -466,10 +477,103 @@ def test_one_chunk_terminal_error_fails_whole_row(self) -> None: excerpt_window_chars=50, prompt_template=_MINIMAL_TEMPLATE, ) + # The last alias tried propagates; order is round-robin + wrap, so + # chunk 0 starts at v0 → fails → v1 (fails, last) → raises "from v1". 
with pytest.raises(RuntimeError, match="forced failure from v1"): asyncio.run(chunked_validate_row(row, params, {"v0": v0, "v1": v1})) +class TestChunkedValidateRowCrossAliasFailover: + def test_primary_alias_failure_falls_over_to_secondary(self) -> None: + """Primary raises, secondary in pool returns a valid response → chunk succeeds, row does not fail.""" + spans = [_entity_span("a", "Alice", "first_name", 0, 5)] + candidates = _candidates_schema(("a", "Alice", "first_name")) + row = _build_row(text="Alice", seed_entities=spans, candidates=candidates) + + v0 = FakeFacade("v0", raise_on_call=True) + v1 = FakeFacade("v1", response={"decisions": [{"id": "a", "decision": "keep"}]}) + params = ChunkedValidationParams( + pool=["v0", "v1"], + max_entities_per_call=5, + excerpt_window_chars=50, + prompt_template=_MINIMAL_TEMPLATE, + ) + out = asyncio.run(chunked_validate_row(row, params, {"v0": v0, "v1": v1})) + decisions = out[COL_VALIDATION_DECISIONS]["decisions"] + assert len(decisions) == 1 + assert decisions[0]["id"] == "a" + assert decisions[0]["decision"] == "keep" + # Each alias was tried exactly once for this single-chunk row. + assert len(v0.calls) == 1 + assert len(v1.calls) == 1 + + def test_single_alias_pool_does_not_failover(self) -> None: + """A one-alias pool has no fallback — one attempt, propagate immediately. + + This keeps the behavioural guarantee that pools of size 1 behave + exactly as the pre-failover dispatch did: no hidden extra attempts. 
+ """ + spans = [_entity_span("a", "Alice", "first_name", 0, 5)] + candidates = _candidates_schema(("a", "Alice", "first_name")) + row = _build_row(text="Alice", seed_entities=spans, candidates=candidates) + + v0 = FakeFacade("v0", raise_on_call=True) + params = ChunkedValidationParams( + pool=["v0"], + max_entities_per_call=5, + excerpt_window_chars=50, + prompt_template=_MINIMAL_TEMPLATE, + ) + with pytest.raises(RuntimeError, match="forced failure from v0"): + asyncio.run(chunked_validate_row(row, params, {"v0": v0})) + assert len(v0.calls) == 1 + + def test_failover_wraps_starting_from_round_robin_primary(self) -> None: + """With pool [v0,v1] and 3 chunks, chunk 1's primary is v1. + + If v1 fails, failover wraps to v0 (next position mod len(pool)). + We verify by ensuring chunk 1 succeeds on v0's response, while + chunk 0 and chunk 2 succeed on v0 (their primary) directly. + """ + spans = [ + _entity_span("a", "Alice", "first_name", 0, 5), + _entity_span("b", "Bob", "first_name", 10, 13), + _entity_span("c", "Carol", "first_name", 20, 25), + ] + candidates = _candidates_schema( + ("a", "Alice", "first_name"), + ("b", "Bob", "first_name"), + ("c", "Carol", "first_name"), + ) + row = _build_row(text="Alice and Bob and Carol", seed_entities=spans, candidates=candidates) + + def v0_response(prompt: str) -> dict: + # The chunk's skeleton is serialized into the prompt; pick the id + # from there. We can't use the raw text excerpt to distinguish + # chunks because the text is short enough that every chunk's + # excerpt window covers the whole string. 
+ for candidate_id in ("a", "b", "c"): + if f"'id': '{candidate_id}'" in prompt: + return {"decisions": [{"id": candidate_id, "decision": "keep"}]} + raise AssertionError(f"no known candidate id found in prompt: {prompt!r}") + + v0 = FakeFacade("v0", response=v0_response) + v1 = FakeFacade("v1", raise_on_call=True) + params = ChunkedValidationParams( + pool=["v0", "v1"], + max_entities_per_call=1, + excerpt_window_chars=50, + prompt_template=_MINIMAL_TEMPLATE, + ) + out = asyncio.run(chunked_validate_row(row, params, {"v0": v0, "v1": v1})) + decisions = {d["id"]: d["decision"] for d in out[COL_VALIDATION_DECISIONS]["decisions"]} + assert decisions == {"a": "keep", "b": "keep", "c": "keep"} + # v0 serviced all three chunks: chunk 0 + chunk 2 directly, chunk 1 via failover. + assert len(v0.calls) == 3 + # v1 saw exactly one call — chunk 1's primary attempt that raised. + assert len(v1.calls) == 1 + + class TestChunkedValidateRowMissingIdMirrorsSingleCall: def test_decision_for_non_candidate_id_is_dropped(self) -> None: """Matches single-call contract: ``enrich_validation_decisions`` filters to candidate ids.""" @@ -534,6 +638,10 @@ def test_factory_rejects_empty_pool(self) -> None: make_chunked_validation_generator([]) def test_generator_forwards_to_chunked_validate_row(self) -> None: + """The DD-exposed wrapper is *sync* (bridges DD's thread-pool fan-out + to our async row logic via asyncio.run), so this test calls it + directly and asserts it returns a dict, not a coroutine. 
+ """ spans = [_entity_span("a", "Alice", "first_name", 0, 5)] candidates = _candidates_schema(("a", "Alice", "first_name")) row = _build_row(text="Alice", seed_entities=spans, candidates=candidates) @@ -542,9 +650,25 @@ def test_generator_forwards_to_chunked_validate_row(self) -> None: params = ChunkedValidationParams( pool=["v0"], max_entities_per_call=5, excerpt_window_chars=20, prompt_template=_MINIMAL_TEMPLATE ) - out = asyncio.run(fn(row, params, {"v0": facade})) + out = fn(row, params, {"v0": facade}) + assert isinstance(out, dict) assert out[COL_VALIDATION_DECISIONS]["decisions"][0]["id"] == "a" + def test_generator_is_sync_callable_returning_dict(self) -> None: + """Regression: DD's default thread-pool engine calls the wrapper + synchronously and rejects coroutine returns with "must return a dict, + got coroutine". Guard against accidentally re-introducing an async + outer wrapper.""" + import asyncio as _asyncio + import inspect as _inspect + + fn = make_chunked_validation_generator(["v0"]) + # The decorator wraps the function; unwrap and check the inner target. + inner = _inspect.unwrap(fn) + assert not _asyncio.iscoroutinefunction(inner), ( + "DD-exposed generator must be sync; an async outer wrapper breaks the default thread-pool engine." + ) + # --------------------------------------------------------------------------- # Behavioral regression: single-chunk vs multi-chunk parity (pool-of-one). 
From fd4fe4d112ad31632f0793cf22302d80c1eb16f9 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Tue, 21 Apr 2026 10:44:36 -0700 Subject: [PATCH 06/20] docs: update wording --- docs/concepts/detection.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/concepts/detection.md b/docs/concepts/detection.md index 163371b6..f9f7e4e8 100644 --- a/docs/concepts/detection.md +++ b/docs/concepts/detection.md @@ -65,11 +65,13 @@ The chunked path is always on; if a row has fewer candidates than the limit, it `entity_validator` can be a single alias (the default) or a list of aliases — a **pool**. When multiple aliases are configured, each chunk in a row is dispatched to the next alias in round-robin order, which lets you work around per-alias rate limits by spreading requests across equivalent endpoints. -Pools also act as **cross-alias failover**. If a chunk's primary alias raises a terminal exception (5xx after transport retries, connection error, or parser error), the chunk is re-dispatched against the remaining pool members in pool order before giving up. A chunk only fails when every pool member has raised for it. This makes pools a cheap way to harden validation against single-endpoint degradation, in addition to their load-spreading role. +Pools also act as **failover**. If a chunk's assigned alias can't complete the call (an unrecoverable rate limit, a 5xx that didn't clear on retry, a malformed response), the same chunk is automatically retried against the other aliases in your pool before the row is given up on. A chunk only fails once every alias in the pool has failed for it. This is a cheap way to harden validation against any one endpoint having a bad day, on top of the load-spreading role. 
-#### Failure contract +#### What happens when a row can't be validated -When chunked validation cannot get an answer for a row — every pool member raised for at least one of its chunks — the row is **dropped** from the output DataFrame, not silently passed through with unscrubbed text. The dropped row is reported on `result.failed_records` with `step="detection"`, so a downstream reprocessing pass can identify misses by diffing input IDs against output IDs. +If validation can't get a complete answer for a row — every alias in the pool has failed on at least one of that row's chunks — the row is **dropped from the output** rather than passed through with some entities unvalidated. This is deliberate: the alternative would be writing the original text back out with those entities still un-scrubbed, which is exactly the outcome you're trying to avoid. + +Dropped rows show up on `result.failed_records` with `step="detection"`, so you can tell which inputs didn't make it through by comparing input IDs against output IDs and reprocess those on a follow-up pass. See [Validator pools](models.md#validator-pools) for the YAML syntax and caveats. From e3c1ec6872c47f885759f17ae812a86099d25e56 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Tue, 21 Apr 2026 11:22:54 -0700 Subject: [PATCH 07/20] refactor(detect): harden chunked-validation guards per review feedback Follow-ups from code review on #126. No user-facing behaviour change; the happy path is unchanged and all tests in the affected modules pass. Source hardening (chunked_validation.py): - Replace `assert last_exc is not None` in `_dispatch_chunk` with an explicit `if`+`RuntimeError` guard. `assert` is stripped under `python -O`, which would turn a caller-contract violation (empty `facades`) into a confusing `TypeError: exceptions must derive from BaseException` from `raise None`. The new guard survives `-O` and names the precondition being violated. 
- Drop the redundant `max_entities_per_call <= 0` check in `chunk_candidates`. Positivity is already enforced at two upstream layers (`AnonymizerDetectConfig.validation_max_entities_per_call` and `ChunkedValidationParams.max_entities_per_call`, both `Field(gt=0)`); the inner check was defence-in-depth at a layer below the validated boundary. - Widen `chunk_candidates` input from `list[...]` to `Sequence[...]` and normalize the return to `list(...)`, so the declared `list[list[tuple[...]]]` return type is honest regardless of whether a list, tuple, or other sequence is passed. Previously tuple input silently produced `list[tuple[tuple[...]]]`. Documentation (models.py): - Expand `normalize_entity_validator`'s docstring to explicitly document tuple acceptance (previously undocumented behaviour that matched Pydantic v2's default `list[str]` coercion but wasn't called out). Test coverage: - `test_tuple_input_returns_list_of_lists` replaces the now-obsolete `test_non_positive_limit_raises`; pins the sequence-input / list-output contract. - `test_tuple_coerced_to_list` on `DetectionModelSelection` pins the `entity_validator=(...)` path. - `test_system_prompt_is_forwarded_to_facade` and `test_system_prompt_default_none_is_forwarded_untouched` close the untested-forwarding gap on `ChunkedValidationParams.system_prompt`. The field is plumbed but not populated by any production caller today; tests pin the contract for when it's wired up (tracked in #127). 
Made-with: Cursor --- src/anonymizer/config/models.py | 10 ++- .../engine/detection/chunked_validation.py | 32 ++++++--- tests/engine/test_chunked_validation.py | 69 ++++++++++++++++++- tests/engine/test_model_loader.py | 15 ++++ 4 files changed, 113 insertions(+), 13 deletions(-) diff --git a/src/anonymizer/config/models.py b/src/anonymizer/config/models.py index 108316f8..88bc7ba0 100644 --- a/src/anonymizer/config/models.py +++ b/src/anonymizer/config/models.py @@ -26,12 +26,18 @@ class DetectionModelSelection(BaseModel): @field_validator("entity_validator", mode="before") @classmethod def normalize_entity_validator(cls, value: Any) -> list[str]: - """Accept either a scalar alias or a list, return a non-empty list. + """Accept a scalar alias, a list of aliases, or a tuple of aliases; return a non-empty list. Normalizing at parse time keeps every downstream consumer on the same shape (``list[str]``) regardless of whether the user wrote ``entity_validator: some-alias`` or - ``entity_validator: [alias-a, alias-b]``. + ``entity_validator: [alias-a, alias-b]``. Tuples are accepted for + parity with Pydantic v2's default coercion for ``list[str]`` fields, + which lets programmatic callers pass either + ``DetectionModelSelection(entity_validator=["a", "b"])`` or + ``DetectionModelSelection(entity_validator=("a", "b"))`` without + caring about the concrete sequence type. Any other input type + raises ``TypeError``. 
""" if isinstance(value, str): aliases: list[str] = [value] diff --git a/src/anonymizer/engine/detection/chunked_validation.py b/src/anonymizer/engine/detection/chunked_validation.py index 860e7fa6..1d0f6c67 100644 --- a/src/anonymizer/engine/detection/chunked_validation.py +++ b/src/anonymizer/engine/detection/chunked_validation.py @@ -50,6 +50,7 @@ import asyncio import logging +from collections.abc import Sequence from typing import Any from data_designer.config import custom_column_generator @@ -154,13 +155,17 @@ def order_candidates_by_position( def chunk_candidates( - ordered: list[tuple[Any, EntitySpan]], + ordered: Sequence[tuple[Any, EntitySpan]], max_entities_per_call: int, ) -> list[list[tuple[Any, EntitySpan]]]: - """Partition the ordered (candidate, seed) pairs into chunks of at most ``max_entities_per_call``.""" - if max_entities_per_call <= 0: - raise ValueError(f"max_entities_per_call must be > 0, got {max_entities_per_call}.") - return [ordered[i : i + max_entities_per_call] for i in range(0, len(ordered), max_entities_per_call)] + """Partition the ordered (candidate, seed) pairs into chunks of at most ``max_entities_per_call``. + + ``max_entities_per_call`` is assumed positive. It is validated upstream by + ``ChunkedValidationParams`` (``Field(gt=0)``) and by + ``AnonymizerDetectConfig.validation_max_entities_per_call`` (``gt=0``), so + we do not re-check it here. + """ + return [list(ordered[i : i + max_entities_per_call]) for i in range(0, len(ordered), max_entities_per_call)] def build_chunk_excerpt( @@ -369,9 +374,20 @@ async def _dispatch_chunk( exc, ) - # Defensive: facades is non-empty (caller guarantees), so the loop either - # returned a successful output or recorded last_exc and re-raises here. - assert last_exc is not None + # Defensive: ``facades`` is non-empty by caller contract + # (``chunked_validate_row`` builds it from the configured pool, which the + # config validator requires to be non-empty). 
After a non-empty pool fully + # exhausts, ``last_exc`` is guaranteed set and we re-raise it. If we ever + # reach this point with ``last_exc is None``, the precondition was + # violated — raise a loud ``RuntimeError`` rather than ``raise None`` + # (which would produce ``TypeError: exceptions must derive from + # BaseException``). Using a real check instead of ``assert`` so the guard + # survives ``python -O``. + if last_exc is None: + raise RuntimeError( + "_dispatch_chunk was called with an empty facades list; " + "this violates the caller contract (a non-empty validator pool)." + ) raise last_exc diff --git a/tests/engine/test_chunked_validation.py b/tests/engine/test_chunked_validation.py index 57f27187..fe3cfe3a 100644 --- a/tests/engine/test_chunked_validation.py +++ b/tests/engine/test_chunked_validation.py @@ -148,9 +148,14 @@ def test_splits_into_chunks_of_at_most_n(self) -> None: def test_empty_input_yields_no_chunks(self) -> None: assert chunk_candidates([], max_entities_per_call=10) == [] - def test_non_positive_limit_raises(self) -> None: - with pytest.raises(ValueError): - chunk_candidates([(1, None)], max_entities_per_call=0) + def test_tuple_input_returns_list_of_lists(self) -> None: + # Input-type tolerance: a sequence (not only a list) should work, and + # the declared ``list[list[...]]`` return contract must hold even + # when the caller hands in a tuple. + ordered: tuple[tuple[int, None], ...] 
= tuple((i, None) for i in range(3)) # type: ignore[misc] + chunks = chunk_candidates(ordered, max_entities_per_call=2) + assert chunks == [[(0, None), (1, None)], [(2, None)]] + assert all(isinstance(c, list) for c in chunks) class TestBuildChunkExcerpt: @@ -376,6 +381,64 @@ def test_empty_candidates_short_circuits_without_calls(self) -> None: assert facade.calls == [] assert out[COL_VALIDATION_DECISIONS] == {"decisions": []} + def test_system_prompt_is_forwarded_to_facade(self) -> None: + # ``ChunkedValidationParams.system_prompt`` must reach ``facade.generate``. + # The recipe appends JSON task instructions before dispatch, so we assert + # substring containment with a distinctive sentinel rather than equality. + text = "Alice spoke." + spans = [_entity_span("a", "Alice", "first_name", 0, 5)] + candidates = _candidates_schema(("a", "Alice", "first_name")) + row = _build_row(text=text, seed_entities=spans, candidates=candidates) + + facade = FakeFacade( + "v0", + response={"decisions": [{"id": "a", "decision": "keep", "proposed_label": "", "reason": "x"}]}, + ) + sentinel = "SYSPROMPT_SENTINEL_CHUNKED_VALIDATION_TEST" + params = ChunkedValidationParams( + pool=["v0"], + max_entities_per_call=10, + excerpt_window_chars=50, + prompt_template=_MINIMAL_TEMPLATE, + system_prompt=f"You are a validator. {sentinel}", + ) + + asyncio.run(chunked_validate_row(row, params, {"v0": facade})) + + assert len(facade.calls) == 1 + forwarded = facade.calls[0]["system_prompt"] + assert forwarded is not None + assert sentinel in forwarded + + def test_system_prompt_default_none_is_forwarded_untouched(self) -> None: + # When the caller leaves ``system_prompt`` unset, nothing upstream of + # the recipe should synthesize one. Whatever the recipe produces from + # ``None`` is what the facade sees (today: ``None``). This test pins + # that we don't accidentally inject a placeholder along the way. + text = "Alice spoke." 
+ spans = [_entity_span("a", "Alice", "first_name", 0, 5)] + candidates = _candidates_schema(("a", "Alice", "first_name")) + row = _build_row(text=text, seed_entities=spans, candidates=candidates) + + facade = FakeFacade( + "v0", + response={"decisions": [{"id": "a", "decision": "keep", "proposed_label": "", "reason": "x"}]}, + ) + params = ChunkedValidationParams( + pool=["v0"], + max_entities_per_call=10, + excerpt_window_chars=50, + prompt_template=_MINIMAL_TEMPLATE, + # system_prompt intentionally omitted (default None) + ) + + asyncio.run(chunked_validate_row(row, params, {"v0": facade})) + + assert len(facade.calls) == 1 + # The recipe maps ``None`` input to ``None`` output, so the facade + # should receive no system prompt. + assert facade.calls[0]["system_prompt"] is None + class TestChunkedValidateRowPoolOfTwoRoundRobin: def test_chunks_assigned_round_robin_across_pool(self) -> None: diff --git a/tests/engine/test_model_loader.py b/tests/engine/test_model_loader.py index 82406384..7a967100 100644 --- a/tests/engine/test_model_loader.py +++ b/tests/engine/test_model_loader.py @@ -257,6 +257,21 @@ def test_list_preserved(self) -> None: ) assert selection.entity_validator == ["v1", "v2", "v3"] + def test_tuple_coerced_to_list(self) -> None: + # Tuples are accepted for parity with Pydantic v2's default coercion + # for ``list[str]`` fields; programmatic callers should not need to + # care about the concrete sequence type. The normalizer must return + # a real ``list`` so downstream ``isinstance(value, list)`` branches + # (e.g. in ``resolve_model_alias``) behave consistently. 
+ selection = DetectionModelSelection( + entity_detector="d", + entity_validator=("v1", "v2"), # type: ignore[arg-type] + entity_augmenter="a", + latent_detector="l", + ) + assert selection.entity_validator == ["v1", "v2"] + assert isinstance(selection.entity_validator, list) + def test_empty_list_rejected(self) -> None: with pytest.raises(ValueError, match="at least one model alias"): DetectionModelSelection( From 134f4da06fcbec8481d2904e2c3e2ee41fc08134 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Tue, 21 Apr 2026 11:28:43 -0700 Subject: [PATCH 08/20] docs(detect): tidy review-history residue in chunked-validation prose Four inline prose touches in the chunked-validation module. No code or test behaviour changes; only doc/comment rewording so the source reads as a standalone spec rather than a narrated diff against prior versions. - `chunk_candidates` docstring: state the ``max_entities_per_call > 0`` precondition positively, point at the two upstream enforcement layers, drop the "so we do not re-check it here" coda. - `_dispatch_chunk` defensive-guard comment: keep the substantive reasons (empty-facades precondition, loud named error vs. ``raise None``, guard survives ``python -O``) and drop the assert-vs-if comparison that only makes sense in the context of the review exchange that introduced it. - `_dispatch_chunk` docstring: replace "matching the pre-failover contract" with a direct statement of the single-alias-pool behaviour ("no alternate alias to try"). - `test_system_prompt_default_none_is_forwarded_untouched` comment: tighten the "accidentally inject a placeholder" framing to just state the current recipe contract and what the test pins. 
Made-with: Cursor --- .../engine/detection/chunked_validation.py | 35 +++++++++---------- tests/engine/test_chunked_validation.py | 8 ++--- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/anonymizer/engine/detection/chunked_validation.py b/src/anonymizer/engine/detection/chunked_validation.py index 1d0f6c67..35759b4f 100644 --- a/src/anonymizer/engine/detection/chunked_validation.py +++ b/src/anonymizer/engine/detection/chunked_validation.py @@ -160,10 +160,10 @@ def chunk_candidates( ) -> list[list[tuple[Any, EntitySpan]]]: """Partition the ordered (candidate, seed) pairs into chunks of at most ``max_entities_per_call``. - ``max_entities_per_call`` is assumed positive. It is validated upstream by - ``ChunkedValidationParams`` (``Field(gt=0)``) and by - ``AnonymizerDetectConfig.validation_max_entities_per_call`` (``gt=0``), so - we do not re-check it here. + Assumes ``max_entities_per_call > 0``; positivity is enforced upstream at + ``ChunkedValidationParams.max_entities_per_call`` and + ``AnonymizerDetectConfig.validation_max_entities_per_call`` (both + ``Field(gt=0)``). """ return [list(ordered[i : i + max_entities_per_call]) for i in range(0, len(ordered), max_entities_per_call)] @@ -321,11 +321,11 @@ async def _dispatch_chunk( We use ``PydanticResponseRecipe`` so the facade appends JSON task instructions and parses the response into ``RawValidationDecisionsSchema``. - Single-alias pools get no additional attempts — the loop runs exactly - once and re-raises the original exception, matching the pre-failover - contract. Multi-alias pools get ``len(pool)`` total attempts. If every - pool member raises, the *last* exception propagates so DataDesigner - records the row as a ``FailedRecord`` via ``NddAdapter._detect_missing_records``. + Single-alias pools run the loop exactly once and re-raise the original + exception (no alternate alias to try). Multi-alias pools get + ``len(pool)`` total attempts. 
If every pool member raises, the *last* + exception propagates so DataDesigner records the row as a + ``FailedRecord`` via ``NddAdapter._detect_missing_records``. Each failover attempt is logged at WARNING so operators can correlate degraded pool members with run-level failure-rate spikes. @@ -374,15 +374,14 @@ async def _dispatch_chunk( exc, ) - # Defensive: ``facades`` is non-empty by caller contract - # (``chunked_validate_row`` builds it from the configured pool, which the - # config validator requires to be non-empty). After a non-empty pool fully - # exhausts, ``last_exc`` is guaranteed set and we re-raise it. If we ever - # reach this point with ``last_exc is None``, the precondition was - # violated — raise a loud ``RuntimeError`` rather than ``raise None`` - # (which would produce ``TypeError: exceptions must derive from - # BaseException``). Using a real check instead of ``assert`` so the guard - # survives ``python -O``. + # ``facades`` is non-empty by caller contract: ``chunked_validate_row`` + # builds it from the configured pool, and the config validator requires + # a non-empty pool. After the loop, ``last_exc`` is therefore set and + # we re-raise it. The ``None`` branch exists only to give a loud, + # named error if that precondition is ever violated (rather than + # ``raise None``, which would surface as ``TypeError: exceptions must + # derive from BaseException``) and to keep the guard live under + # ``python -O``, which strips ``assert``. 
if last_exc is None: raise RuntimeError( "_dispatch_chunk was called with an empty facades list; " diff --git a/tests/engine/test_chunked_validation.py b/tests/engine/test_chunked_validation.py index fe3cfe3a..0c76ff38 100644 --- a/tests/engine/test_chunked_validation.py +++ b/tests/engine/test_chunked_validation.py @@ -411,10 +411,10 @@ def test_system_prompt_is_forwarded_to_facade(self) -> None: assert sentinel in forwarded def test_system_prompt_default_none_is_forwarded_untouched(self) -> None: - # When the caller leaves ``system_prompt`` unset, nothing upstream of - # the recipe should synthesize one. Whatever the recipe produces from - # ``None`` is what the facade sees (today: ``None``). This test pins - # that we don't accidentally inject a placeholder along the way. + # The recipe maps ``None`` input to ``None`` output; this test pins + # that no intermediate layer replaces it with a placeholder string + # when ``ChunkedValidationParams.system_prompt`` is left at its + # default. text = "Alice spoke." spans = [_entity_span("a", "Alice", "first_name", 0, 5)] candidates = _candidates_schema(("a", "Alice", "first_name")) From 841899feb0667e0c8a901621df1eb721bb8f87f3 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Wed, 22 Apr 2026 15:36:52 -0700 Subject: [PATCH 09/20] refactor(config): dedupe validator pool and preserve native types from YAML Addresses two review findings on the validator-pool config path. normalize_entity_validator now collapses duplicate aliases to the first occurrence (order preserved) and logs a warning. A duplicate would burn a failover attempt on an already-exhausted endpoint -- almost certainly not what the user intended. Empty pools still raise. load_workflow_selections preserves list-valued roles as list[str] instead of stringifying them. Stringifying would silently collapse a pool YAML block into a single garbled alias repr, which Pydantic would then treat as one alias. 
get_model_alias now raises TypeError when called on a list-valued role with a pointer to resolve_model_aliases for the pool shape. Tests cover dedup, the no-dup-no-warning path, list-value preservation, and the get_model_alias type error. Made-with: Cursor --- src/anonymizer/config/models.py | 27 ++++++- src/anonymizer/engine/ndd/model_loader.py | 39 ++++++++-- tests/engine/test_model_loader.py | 86 ++++++++++++++++++++++- 3 files changed, 144 insertions(+), 8 deletions(-) diff --git a/src/anonymizer/config/models.py b/src/anonymizer/config/models.py index 88bc7ba0..59792196 100644 --- a/src/anonymizer/config/models.py +++ b/src/anonymizer/config/models.py @@ -3,10 +3,13 @@ from __future__ import annotations +import logging from typing import Any from pydantic import BaseModel, field_validator +logger = logging.getLogger(__name__) + class DetectionModelSelection(BaseModel): """Model aliases for the entity detection pipeline. @@ -26,7 +29,7 @@ class DetectionModelSelection(BaseModel): @field_validator("entity_validator", mode="before") @classmethod def normalize_entity_validator(cls, value: Any) -> list[str]: - """Accept a scalar alias, a list of aliases, or a tuple of aliases; return a non-empty list. + """Accept a scalar alias, a list of aliases, or a tuple of aliases; return a non-empty deduplicated list. Normalizing at parse time keeps every downstream consumer on the same shape (``list[str]``) regardless of whether the user wrote @@ -38,6 +41,11 @@ def normalize_entity_validator(cls, value: Any) -> list[str]: ``DetectionModelSelection(entity_validator=("a", "b"))`` without caring about the concrete sequence type. Any other input type raises ``TypeError``. + + Duplicate aliases are collapsed to the first occurrence (order + preserved) and a warning is logged. A duplicate in the pool would + burn a failover attempt on an already-exhausted endpoint, which + almost certainly isn't what the user wants. 
""" if isinstance(value, str): aliases: list[str] = [value] @@ -48,7 +56,22 @@ def normalize_entity_validator(cls, value: Any) -> list[str]: cleaned = [alias.strip() for alias in aliases if alias.strip()] if not cleaned: raise ValueError("entity_validator must name at least one model alias.") - return cleaned + seen: set[str] = set() + deduped: list[str] = [] + for alias in cleaned: + if alias in seen: + continue + seen.add(alias) + deduped.append(alias) + if len(deduped) != len(cleaned): + removed = [alias for alias in cleaned if cleaned.count(alias) > 1] + logger.warning( + "entity_validator pool contained duplicate aliases %s; collapsing to %s. " + "Duplicates burn a failover attempt on an already-exhausted endpoint.", + sorted(set(removed)), + deduped, + ) + return deduped class ReplaceModelSelection(BaseModel): diff --git a/src/anonymizer/engine/ndd/model_loader.py b/src/anonymizer/engine/ndd/model_loader.py index ee556926..e2d0d0bb 100644 --- a/src/anonymizer/engine/ndd/model_loader.py +++ b/src/anonymizer/engine/ndd/model_loader.py @@ -88,15 +88,32 @@ def load_models_config(config_dir: Path | None = None) -> dict[str, Any]: return _load_yaml_dict(resolved_dir / "models.yaml") -def load_workflow_selections(workflow_name: WorkflowName, config_dir: Path | None = None) -> dict[str, str]: - """Load selected model aliases for a workflow.""" +def load_workflow_selections( + workflow_name: WorkflowName, + config_dir: Path | None = None, +) -> dict[str, str | list[str]]: + """Load selected model aliases for a workflow. + + Scalar roles (e.g. ``entity_detector``) come back as strings and + list-valued roles (e.g. ``entity_validator``, which accepts a pool) come + back as ``list[str]``. Native types are preserved rather than stringified + so downstream Pydantic selection models see the shape the YAML actually + declared — stringifying a list would silently collapse the pool to a + single garbled alias. 
+ """ resolved_dir = config_dir or DEFAULT_CONFIG_DIR workflow_file = resolved_dir / f"{workflow_name.value}.yaml" workflow_config = _load_yaml_dict(workflow_file) selected_models = workflow_config.get("selected_models", {}) if not isinstance(selected_models, dict): raise ValueError(f"{workflow_file} must define a top-level 'selected_models' mapping.") - return {str(key): str(value) for key, value in selected_models.items()} + normalized: dict[str, str | list[str]] = {} + for key, value in selected_models.items(): + if isinstance(value, list): + normalized[str(key)] = [str(item) for item in value] + else: + normalized[str(key)] = str(value) + return normalized def load_workflow_config(workflow_name: WorkflowName, config_dir: Path | None = None) -> dict[str, Any]: @@ -114,12 +131,24 @@ def load_workflow_config(workflow_name: WorkflowName, config_dir: Path | None = def get_model_alias(workflow_name: WorkflowName, role: str, config_dir: Path | None = None) -> str: - """Return the model alias assigned to a workflow role.""" + """Return the scalar model alias assigned to a workflow role. + + Raises ``TypeError`` if the role is list-valued in the YAML (e.g. a + validator pool). Callers that need the full pool should read the + populated selection model via ``load_default_model_selection()`` and + ``resolve_model_aliases`` instead. + """ selected_models = load_workflow_selections(workflow_name=workflow_name, config_dir=config_dir) if role not in selected_models: available = ", ".join(sorted(selected_models.keys())) raise ValueError(f"Role '{role}' not found in workflow '{workflow_name.value}'. Available roles: {available}") - return selected_models[role] + value = selected_models[role] + if isinstance(value, list): + raise TypeError( + f"Role '{role}' in workflow '{workflow_name.value}' is list-valued (a pool); " + f"use resolve_model_aliases() on the populated selection model instead." 
+ ) + return value def resolve_model_alias( diff --git a/tests/engine/test_model_loader.py b/tests/engine/test_model_loader.py index 7a967100..6af6309d 100644 --- a/tests/engine/test_model_loader.py +++ b/tests/engine/test_model_loader.py @@ -37,6 +37,43 @@ def test_get_model_alias_reads_workflow_mapping() -> None: assert isinstance(alias, str) and alias +def test_load_workflow_selections_preserves_list_values(tmp_path) -> None: + """A YAML pool under ``selected_models`` must round-trip as ``list[str]``. + + Stringifying would silently collapse the pool to a single garbled alias + ("['v1', 'v2']"), and Pydantic's ``normalize_entity_validator`` would + then treat that repr as one alias. Pinning the native-type preservation + here keeps that trap closed. + """ + config_dir = tmp_path + (config_dir / "detection.yaml").write_text( + "selected_models:\n" + " entity_detector: d\n" + " entity_validator:\n" + " - v1\n" + " - v2\n" + " entity_augmenter: a\n" + " latent_detector: l\n" + ) + selections = load_workflow_selections(WorkflowName.detection, config_dir) + assert selections["entity_validator"] == ["v1", "v2"] + assert selections["entity_detector"] == "d" + + +def test_get_model_alias_rejects_list_valued_role(tmp_path) -> None: + """Calling the scalar accessor on a pool-valued role raises ``TypeError``.""" + config_dir = tmp_path + (config_dir / "detection.yaml").write_text( + "selected_models:\n" + " entity_detector: d\n" + " entity_validator: [v1, v2]\n" + " entity_augmenter: a\n" + " latent_detector: l\n" + ) + with pytest.raises(TypeError, match="list-valued"): + get_model_alias(WorkflowName.detection, "entity_validator", config_dir) + + WORKFLOW_YAMLS = [p.stem for p in DEFAULT_CONFIG_DIR.glob("*.yaml") if p.stem != "models"] @@ -46,7 +83,13 @@ def test_default_workflow_aliases_exist_in_models(workflow_name: str) -> None: models_config = load_models_config() known_aliases = {m["alias"] for m in models_config.get("model_configs", [])} selections = 
load_workflow_selections(WorkflowName(workflow_name)) - unknown = set(selections.values()) - known_aliases + referenced: set[str] = set() + for value in selections.values(): + if isinstance(value, list): + referenced.update(value) + else: + referenced.add(value) + unknown = referenced - known_aliases assert not unknown, f"Workflow '{workflow_name}' references unknown aliases: {unknown}. Known: {known_aliases}" @@ -299,6 +342,47 @@ def test_non_string_non_list_rejected(self) -> None: latent_detector="l", ) + def test_duplicate_aliases_are_deduped_with_warning( + self, + caplog: pytest.LogCaptureFixture, + ) -> None: + # A duplicate alias in the pool would burn a failover attempt on an + # already-exhausted endpoint. The normalizer collapses duplicates to + # the first occurrence (preserving order) and logs a warning so the + # user can see their config wasn't applied exactly as written. + with caplog.at_level("WARNING", logger="anonymizer.config.models"): + selection = DetectionModelSelection( + entity_detector="d", + entity_validator=["v1", "v2", "v1", "v3", "v2"], + entity_augmenter="a", + latent_detector="l", + ) + assert selection.entity_validator == ["v1", "v2", "v3"] + # caplog may double-capture when pytest-caplog and the root logger + # both propagate the record; dedupe on message content instead of + # asserting a raw count. 
+ dedupe_messages = { + r.getMessage() for r in caplog.records if r.levelname == "WARNING" and "duplicate aliases" in r.getMessage() + } + assert len(dedupe_messages) == 1 + + def test_no_warning_when_all_aliases_unique( + self, + caplog: pytest.LogCaptureFixture, + ) -> None: + with caplog.at_level("WARNING", logger="anonymizer.config.models"): + selection = DetectionModelSelection( + entity_detector="d", + entity_validator=["v1", "v2", "v3"], + entity_augmenter="a", + latent_detector="l", + ) + assert selection.entity_validator == ["v1", "v2", "v3"] + dedupe_warnings = [ + r for r in caplog.records if r.levelname == "WARNING" and "duplicate aliases" in r.getMessage() + ] + assert dedupe_warnings == [] + class TestValidateAliasReferencesHandlesValidatorPool: """``validate_model_alias_references`` must expand list-valued roles to one check per alias.""" From 2fe252e07df53f0e5474a15bc73d5f0fa915bc09 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Wed, 22 Apr 2026 15:36:58 -0700 Subject: [PATCH 10/20] chore(detect): drop unused build_validation_skeleton helper build_validation_skeleton has been dead since chunked validation moved skeleton construction inline into chunked_validate_row. No internal callers remain; its tests go with it. 
Made-with: Cursor --- .../engine/detection/custom_columns.py | 16 ------ tests/engine/test_detection_custom_columns.py | 53 ------------------- 2 files changed, 69 deletions(-) diff --git a/src/anonymizer/engine/detection/custom_columns.py b/src/anonymizer/engine/detection/custom_columns.py index bad41d72..059d82ac 100644 --- a/src/anonymizer/engine/detection/custom_columns.py +++ b/src/anonymizer/engine/detection/custom_columns.py @@ -34,7 +34,6 @@ COL_VALIDATED_SEED_ENTITIES, COL_VALIDATION_CANDIDATES, COL_VALIDATION_DECISIONS, - COL_VALIDATION_SKELETON, ) from anonymizer.engine.detection.postprocess import ( EntitySpan, @@ -52,8 +51,6 @@ ValidatedDecisionSchema, ValidatedDecisionsSchema, ValidationCandidatesSchema, - ValidationSkeletonDecisionSchema, - ValidationSkeletonSchema, ) @@ -135,19 +132,6 @@ def prepare_validation_inputs(row: dict[str, Any]) -> dict[str, Any]: return row -@custom_column_generator(required_columns=[COL_SEED_VALIDATION_CANDIDATES]) -def build_validation_skeleton(row: dict[str, Any]) -> dict[str, Any]: - """Pre-populate the decisions template with candidate IDs so the LLM only fills in decision/reason.""" - candidates = ValidationCandidatesSchema.from_raw(row.get(COL_SEED_VALIDATION_CANDIDATES, {})) - skeleton = ValidationSkeletonSchema( - decisions=[ - ValidationSkeletonDecisionSchema(id=c.id, value=c.value, label=c.label) for c in candidates.candidates - ] - ) - row[COL_VALIDATION_SKELETON] = skeleton.model_dump(mode="json") - return row - - @custom_column_generator( required_columns=[COL_VALIDATION_DECISIONS, COL_SEED_VALIDATION_CANDIDATES], ) diff --git a/tests/engine/test_detection_custom_columns.py b/tests/engine/test_detection_custom_columns.py index bd2aa6c5..1a42f226 100644 --- a/tests/engine/test_detection_custom_columns.py +++ b/tests/engine/test_detection_custom_columns.py @@ -26,12 +26,10 @@ COL_VALIDATED_SEED_ENTITIES, COL_VALIDATION_CANDIDATES, COL_VALIDATION_DECISIONS, - COL_VALIDATION_SKELETON, ) from 
anonymizer.engine.detection.custom_columns import ( _parse_entity_spans, apply_validation_and_finalize, - build_validation_skeleton, enrich_validation_decisions, merge_and_build_candidates, parse_detected_entities, @@ -144,57 +142,6 @@ def test_enrich_validation_decisions_ignores_non_dict_validation_payload() -> No assert result[COL_VALIDATED_ENTITIES] == {"decisions": []} -def test_build_validation_skeleton_produces_null_decisions() -> None: - row: dict[str, Any] = { - COL_SEED_VALIDATION_CANDIDATES: { - "candidates": [ - { - "id": "first_name_0_5", - "value": "Alice", - "label": "first_name", - "context_before": "", - "context_after": " works", - }, - { - "id": "org_15_19", - "value": "Acme", - "label": "organization", - "context_before": "at ", - "context_after": "", - }, - ] - }, - } - result = build_validation_skeleton(row) - skeleton = result[COL_VALIDATION_SKELETON] - assert len(skeleton["decisions"]) == 2 - assert skeleton["decisions"][0]["id"] == "first_name_0_5" - assert skeleton["decisions"][0]["value"] == "Alice" - assert skeleton["decisions"][0]["label"] == "first_name" - assert skeleton["decisions"][0]["decision"] is None - assert skeleton["decisions"][0]["proposed_label"] is None - assert skeleton["decisions"][1]["id"] == "org_15_19" - - -def test_build_validation_skeleton_handles_candidates_with_missing_keys() -> None: - row: dict[str, Any] = { - COL_SEED_VALIDATION_CANDIDATES: { - "candidates": [ - {"id": "x"}, - {"value": "Alice"}, - {}, - ] - }, - } - result = build_validation_skeleton(row) - skeleton = result[COL_VALIDATION_SKELETON] - assert len(skeleton["decisions"]) == 3 - assert skeleton["decisions"][0]["id"] == "x" - assert skeleton["decisions"][0]["value"] == "" - assert skeleton["decisions"][1]["id"] == "" - assert skeleton["decisions"][1]["value"] == "Alice" - - def test_apply_validation_and_finalize_handles_malformed_merged_entities() -> None: row: dict[str, Any] = { COL_TEXT: "Alice works at Acme.", From 
f093b9926d861a51f79fe2593d9d9846e2e9813b Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Wed, 22 Apr 2026 15:37:20 -0700 Subject: [PATCH 11/20] refactor(detect): replace asyncio fan-out with ThreadPoolExecutor; cache Jinja template Drops asyncio.run / asyncio.to_thread / asyncio.gather from the chunked validation path. _dispatch_chunk and chunked_validate_row become sync defs; chunked_validate_row dispatches chunks via a ThreadPoolExecutor and calls facade.generate() directly. Per-alias concurrency is still enforced downstream by each facade's ThrottledModelClient, so the pool exists purely to overlap this row's chunks. Under DataDesigner's async engine the sync calls are transparently bridged to agenerate by the DD runtime (DD#545), so the code path stays engine-agnostic. Test call sites lose their asyncio.run(...) wrappers. A TODO(async-native) comment in the module docstring flags the follow-up migration once the async engine becomes the DD default. Wraps _compile_template in functools.lru_cache(maxsize=4) so a row with N chunks parses the Jinja source once instead of N times. Folds in the post-#119 column rename for this module (COL_MERGED_TAGGED_TEXT -> COL_SEED_TAGGED_TEXT, COL_VALIDATION_CANDIDATES -> COL_SEED_VALIDATION_CANDIDATES, prompt placeholder _merged_tagged_text -> _seed_tagged_text) that the rebase resolved in neighbouring modules but never applied here. Module docstring gains two follow-up-deferred paragraphs explaining (a) why per-instance validation is intentional -- dedup by (value, label) was considered and rejected because it conflates surface form with meaning -- and (b) how peak prompt memory scales with chunk count, with pointers to the orthogonal cost levers to pull if pressure shows up in a real workload. Both are tracked as separate follow-up issues. 
Made-with: Cursor --- .../engine/detection/chunked_validation.py | 173 +++++++++++------- tests/engine/test_chunked_validation.py | 60 +++--- tests/engine/test_detection_workflow.py | 4 +- 3 files changed, 134 insertions(+), 103 deletions(-) diff --git a/src/anonymizer/engine/detection/chunked_validation.py b/src/anonymizer/engine/detection/chunked_validation.py index 35759b4f..2895e399 100644 --- a/src/anonymizer/engine/detection/chunked_validation.py +++ b/src/anonymizer/engine/detection/chunked_validation.py @@ -15,13 +15,12 @@ Failure contract. Each chunk attempts its round-robin-assigned alias first and, on any terminal exception from that alias (5xx after transport retries, connection errors, parser errors), fails over sequentially to the rest of -the pool. A chunk only fails when every pool member has raised. A failed -chunk cancels sibling chunks (``asyncio.gather`` semantics) and raises out -of the custom column generator, at which point DataDesigner drops the row -from the output DataFrame and ``NddAdapter._detect_missing_records`` -surfaces it as a ``FailedRecord``. This is the "best effort on first pass, -identify and reprocess misses" contract: raw text never silently leaks -through as unscrubbed output. +the pool. A chunk only fails when every pool member has raised. The first +failing chunk re-raises out of the custom column generator, at which point +DataDesigner drops the row from the output DataFrame and +``NddAdapter._detect_missing_records`` surfaces it as a ``FailedRecord``. +This is the "best effort on first pass, identify and reprocess misses" +contract: raw text never silently leaks through as unscrubbed output. The per-chunk decisions are merged into a ``ValidationDecisionsSchema``-shaped payload so the downstream ``enrich_validation_decisions`` column keeps @@ -32,25 +31,58 @@ decorated with ``@custom_column_generator`` and bound to a concrete pool. The pure helpers below are exposed for unit testing. 
-Sync/async boundary with DataDesigner. The DD-exposed wrapper is sync -(DD's default thread-pool engine calls ``generator.generate(record)`` -directly; an async wrapper would return a coroutine and DD would reject it -with ``"must return a dict, got coroutine"``). The wrapper runs the -``chunked_validate_row`` coroutine on a fresh event loop per row via -``asyncio.run``; inside, each chunk dispatch calls the *sync* -``facade.generate()`` through ``asyncio.to_thread`` because the facades DD -hands us under the sync engine are sync-mode ``HttpModelClient``s whose -``agenerate()`` raises. DD#545 (shipped in ``data-designer>=0.5.7``) -bridges sync ``.generate()`` calls to ``agenerate`` transparently under -the async engine, so this same code path works both before and after -Anonymizer#119 turns the async engine on for detection. +Per-instance validation is intentional. Every candidate goes to the LLM in +its own excerpt, even when the same ``(value, label)`` pair appears many +times in a row. Deduping by ``(value, label)`` and broadcasting one +canonical decision to every duplicate was considered (it is the single +largest cost lever on this path) and rejected: it conflates surface form +with meaning, produces silently-wrong answers when the detector's labels +are context-dependent, and gives us no signal when it does. See the +"Validator chunking" section of ``docs/concepts/detection.md`` and the +``context`` block on the C5 review thread in PR #126 for the full +reasoning. If cost becomes pressing before we have data on how often +duplicates genuinely are semantically equivalent, prefer orthogonal +levers: tighter excerpt windows, larger ``max_entities_per_call`` for +cheap validators, or token-budget-aware chunking (tracked as C7 in the +same review thread). + +Concurrency model. ``chunked_validate_row`` dispatches its chunks through a +``ThreadPoolExecutor``. 
Per-alias concurrency is already enforced downstream +by each facade's ``ThrottledModelClient`` (AIMD on 429), so we intentionally +do not impose a row-level cap here; the pool exists purely to overlap the +chunks for this single row. Under DataDesigner's opt-in async engine the +sync ``facade.generate()`` calls we make are transparently bridged to +``agenerate`` by the DD runtime, so this code path is agnostic to which +engine is active. + +TODO(async-native): once DataDesigner's async engine becomes the default +(tracking: DATA_DESIGNER_ASYNC_ENGINE opt-in flag flips off), drop the +``ThreadPoolExecutor`` + sync ``facade.generate()`` pattern here in favour +of ``async def`` functions calling ``facade.agenerate()`` directly, with +``asyncio.gather`` replacing the executor. That removes one thread hop per +chunk and lets DD's scheduler see per-chunk dispatch as first-class async +work. See the PR #126 review thread (step 2 of Andre's async +simplification suggestion) for context. + +Eager prompt construction. ``chunked_validate_row`` builds the excerpt, +skeleton, and rendered prompt for every chunk before submitting any worker. +Peak prompt memory is ``len(chunks) * (2 * excerpt_window_chars + skeleton ++ ~3KB template overhead)``, i.e. low-MB even at 1000 entities with +``max_entities_per_call`` in the single digits. At workloads where this +becomes observable (very small ``max_entities_per_call`` on multi-thousand- +entity rows, or large ``excerpt_window_chars``), the fix is to move prompt +construction inside each worker and pair it with a row-level concurrency +cap (otherwise all workers race the construction phase in parallel and the +bound is unchanged). Deferred because we have no evidence of memory +pressure at realistic scale; tracked as C6 in the PR #126 review thread. 
""" from __future__ import annotations -import asyncio +import functools import logging from collections.abc import Sequence +from concurrent.futures import ThreadPoolExecutor from typing import Any from data_designer.config import custom_column_generator @@ -59,11 +91,11 @@ from pydantic import BaseModel, Field from anonymizer.engine.constants import ( - COL_MERGED_TAGGED_TEXT, COL_SEED_ENTITIES, + COL_SEED_TAGGED_TEXT, + COL_SEED_VALIDATION_CANDIDATES, COL_TAG_NOTATION, COL_TEXT, - COL_VALIDATION_CANDIDATES, COL_VALIDATION_DECISIONS, COL_VALIDATION_SKELETON, ) @@ -85,7 +117,7 @@ # Jinja2 environment used to render the per-chunk validation prompt. # The template mirrors the production prompt exactly: we substitute the same -# placeholders (``_merged_tagged_text``, ``_validation_skeleton``, +# placeholders (``_seed_tagged_text``, ``_validation_skeleton``, # ``_tag_notation``) but with per-chunk values. _PROMPT_ENV = Environment( loader=BaseLoader(), @@ -95,6 +127,20 @@ ) +@functools.lru_cache(maxsize=4) +def _compile_template(template: str) -> Any: + """Return a compiled Jinja2 template, cached by source string. + + A row with ``N`` chunks would otherwise re-parse the same template ``N`` + times, which is wasteful for the exact workload this module targets + (high-entity-count rows). ``maxsize=4`` is deliberately tiny: in practice + there is one prompt string per ``EntityDetectionWorkflow`` instance, but + tests may instantiate a handful of distinct templates in a single + process so we keep a small LRU rather than an unbounded cache. + """ + return _PROMPT_ENV.from_string(template) + + class ChunkedValidationParams(BaseModel): """Parameters supplied to :func:`chunked_validate_row` via DD's ``generator_params``. @@ -109,7 +155,7 @@ class ChunkedValidationParams(BaseModel): excerpt_window_chars: Chars of surrounding raw text included in each chunk's excerpt on either side of the chunk span. 
prompt_template: Jinja2 source for the validation prompt (with - ``_merged_tagged_text``, ``_validation_skeleton``, ``_tag_notation`` + ``_seed_tagged_text``, ``_validation_skeleton``, ``_tag_notation`` placeholders). Typically produced by ``_get_validation_prompt``. system_prompt: Optional system prompt forwarded to each chunk call. """ @@ -229,10 +275,10 @@ def render_chunk_prompt( call: dicts are rendered with Python ``str()`` (Jinja2 default), which is how the existing prompt has always served ``{{ _validation_skeleton }}``. """ - compiled = _PROMPT_ENV.from_string(template) + compiled = _compile_template(template) return compiled.render( **{ - COL_MERGED_TAGGED_TEXT: excerpt, + COL_SEED_TAGGED_TEXT: excerpt, COL_VALIDATION_SKELETON: skeleton, COL_TAG_NOTATION: notation.value, } @@ -286,11 +332,11 @@ def merge_chunk_decisions( # --------------------------------------------------------------------------- -# Async dispatch. Testable by passing fake ``models``. +# Chunk dispatch. Testable by passing fake ``models``. # --------------------------------------------------------------------------- -async def _dispatch_chunk( +def _dispatch_chunk( *, facades: list[tuple[str, Any]], prompt: str, @@ -307,17 +353,6 @@ async def _dispatch_chunk( errors) and its own AIMD throttling on 429, so by the time an exception escapes the facade call we consider that alias exhausted for this chunk. - We call the *sync* ``facade.generate()`` inside ``asyncio.to_thread`` on - purpose. Under DataDesigner's default thread-pool engine the facades - DD hands us are sync-mode ``HttpModelClient``s, and calling - ``facade.agenerate()`` on them raises - ``"Async methods are not available on a sync-mode HttpModelClient"``. 
- The sync call works under both engines: DD#545 (shipped in - ``data-designer>=0.5.7``) bridges sync ``.generate()`` calls from - custom column generators to ``agenerate`` under the async engine, so - wrapping in ``asyncio.to_thread`` gives us per-row chunk concurrency - today and forward-compatibility after Anonymizer#119 lands. - We use ``PydanticResponseRecipe`` so the facade appends JSON task instructions and parses the response into ``RawValidationDecisionsSchema``. @@ -337,8 +372,7 @@ async def _dispatch_chunk( last_exc: BaseException | None = None for attempt_index, (alias, facade) in enumerate(facades): try: - output, _messages = await asyncio.to_thread( - facade.generate, + output, _messages = facade.generate( prompt=final_prompt, parser=recipe.parse, system_prompt=final_system, @@ -390,14 +424,14 @@ async def _dispatch_chunk( raise last_exc -async def chunked_validate_row( +def chunked_validate_row( row: dict[str, Any], params: ChunkedValidationParams, models: dict[str, Any], ) -> dict[str, Any]: """Run chunked validation for a single row and write ``COL_VALIDATION_DECISIONS``. - This is the async workhorse. Call it directly in tests with fake ``models``; + This is the workhorse. Call it directly in tests with fake ``models``; the DataDesigner-decorated wrapper produced by :func:`make_chunked_validation_generator` just forwards to it. 
""" @@ -410,7 +444,7 @@ async def chunked_validate_row( ) text = str(row.get(COL_TEXT, "")) - candidates = ValidationCandidatesSchema.from_raw(row.get(COL_VALIDATION_CANDIDATES, {})) + candidates = ValidationCandidatesSchema.from_raw(row.get(COL_SEED_VALIDATION_CANDIDATES, {})) seed_entities_schema = EntitiesSchema.from_raw(row.get(COL_SEED_ENTITIES, {})) notation_raw = row.get(COL_TAG_NOTATION) or TagNotation.sentinel.value notation = TagNotation(str(notation_raw)) @@ -436,7 +470,7 @@ async def chunked_validate_row( ordered = order_candidates_by_position(candidates, all_spans) chunks = chunk_candidates(ordered, params.max_entities_per_call) - tasks: list[Any] = [] + dispatch_kwargs_per_chunk: list[dict[str, Any]] = [] for chunk_index, chunk in enumerate(chunks): chunk_candidates_ = [pair[0] for pair in chunk] chunk_spans = [pair[1] for pair in chunk] @@ -462,18 +496,28 @@ async def chunked_validate_row( start = chunk_index % len(params.pool) rotated_aliases = [params.pool[(start + offset) % len(params.pool)] for offset in range(len(params.pool))] chunk_facades = [(alias, models[alias]) for alias in rotated_aliases] - tasks.append( - _dispatch_chunk( - facades=chunk_facades, - prompt=prompt, - system_prompt=params.system_prompt, - chunk_index=chunk_index, - ) + dispatch_kwargs_per_chunk.append( + { + "facades": chunk_facades, + "prompt": prompt, + "system_prompt": params.system_prompt, + "chunk_index": chunk_index, + } ) - # gather() propagates the first exception, cancelling siblings. That's the - # all-or-nothing row contract: a single terminal chunk failure fails the row. - chunk_results = await asyncio.gather(*tasks) + # Dispatch all chunks concurrently via a ThreadPoolExecutor. Per-alias + # concurrency is still capped downstream by each facade's + # ``ThrottledModelClient`` (AIMD on 429), so the pool's only job here is + # to overlap one row's chunks. 
``f.result()`` re-raises the first chunk + # exception, which is what we want: a single terminal chunk failure + # fails the row. Pending workers finish naturally as the ``with`` block + # exits — we just stop observing their results once we re-raise. + if not chunks: + chunk_results: list[RawValidationDecisionsSchema] = [] + else: + with ThreadPoolExecutor(max_workers=len(chunks)) as executor: + futures = [executor.submit(_dispatch_chunk, **kwargs) for kwargs in dispatch_kwargs_per_chunk] + chunk_results = [f.result() for f in futures] row[COL_VALIDATION_DECISIONS] = merge_chunk_decisions(chunk_results, candidates) return row @@ -485,7 +529,7 @@ async def chunked_validate_row( def make_chunked_validation_generator(pool: list[str]) -> Any: - """Build a ``@custom_column_generator``-decorated sync function bound to ``pool``. + """Build a ``@custom_column_generator``-decorated function bound to ``pool``. ``model_aliases`` must be declared statically on the decorator so DataDesigner knows which facades to materialise for the generator. Since @@ -493,19 +537,6 @@ def make_chunked_validation_generator(pool: list[str]) -> Any: The required_columns are exhaustive for DataDesigner's DAG ordering: the generator reads the raw text, seed entities (for positions), the candidate list (what to decide), and the tag notation (for excerpt tagging). - - Why the outer wrapper is sync. DataDesigner routes cell-by-cell custom - generators through a ThreadPoolExecutor by default (the async engine is - opt-in via ``DATA_DESIGNER_ASYNC_ENGINE=1``). The thread-pool path calls - ``generator.generate(record)`` which synchronously invokes our function - and passes its return value straight into ``_postprocess_result``. If the - outer function were ``async``, the sync caller would receive a coroutine - object and DD would reject it with ``"must return a dict, got coroutine"``. 
- The sync wrapper here runs the async row logic on a fresh event loop - (safe because each DD worker thread has no ambient loop) and returns the - resolved dict, so this works under both the default thread engine and - the opt-in async engine (which wraps sync generators in - ``asyncio.to_thread``). """ if not pool: raise ValueError("Cannot build chunked validation generator: pool is empty.") @@ -514,7 +545,7 @@ def make_chunked_validation_generator(pool: list[str]) -> Any: required_columns=[ COL_TEXT, COL_SEED_ENTITIES, - COL_VALIDATION_CANDIDATES, + COL_SEED_VALIDATION_CANDIDATES, COL_TAG_NOTATION, ], model_aliases=list(pool), @@ -524,6 +555,6 @@ def chunked_validate( generator_params: ChunkedValidationParams, models: dict[str, Any], ) -> dict[str, Any]: - return asyncio.run(chunked_validate_row(row, generator_params, models)) + return chunked_validate_row(row, generator_params, models) return chunked_validate diff --git a/tests/engine/test_chunked_validation.py b/tests/engine/test_chunked_validation.py index 0c76ff38..78b8fdca 100644 --- a/tests/engine/test_chunked_validation.py +++ b/tests/engine/test_chunked_validation.py @@ -4,13 +4,12 @@ """Tests for chunked LLM validation. The module is layered: pure helpers (ordering, chunking, excerpts, prompt -rendering, merging) are tested directly; the async dispatch is tested via a +rendering, merging) are tested directly; the chunk dispatch is tested via a fake ``ModelFacade`` that records calls and returns preconfigured responses. """ from __future__ import annotations -import asyncio import json import re from typing import Any, Callable @@ -20,6 +19,7 @@ from anonymizer.engine.constants import ( COL_MERGED_TAGGED_TEXT, COL_SEED_ENTITIES, + COL_SEED_VALIDATION_CANDIDATES, COL_TAG_NOTATION, COL_TEXT, COL_VALIDATION_CANDIDATES, @@ -53,12 +53,11 @@ class FakeFacade: """Test double for ``ModelFacade`` recording invocations and replaying canned responses. 
- Exposes a *sync* ``generate()`` method because ``_dispatch_chunk`` now - calls ``asyncio.to_thread(facade.generate, ...)`` (see the module - docstring for why the sync primitive is correct under both DD engines). - Tests still drive the pipeline through ``asyncio.run(chunked_validate_row(...))``, - which awaits the ``to_thread`` task that invokes this sync method in a - worker thread. + Exposes a ``generate()`` method because ``_dispatch_chunk`` calls the + sync facade primitive directly; per-row chunk concurrency comes from + the ``ThreadPoolExecutor`` inside ``chunked_validate_row``. Under DD's + async engine the sync ``.generate()`` call is transparently bridged to + ``agenerate`` by the DD runtime. A canned response can be a ``dict`` (auto-wrapped in a ```json fence), a raw string, or a callable that receives the prompt and returns either. @@ -255,7 +254,7 @@ def test_skeleton_matches_chunk_only(self) -> None: class TestRenderChunkPrompt: def test_substitutes_excerpt_skeleton_and_notation(self) -> None: template = ( - "Input: {{ _merged_tagged_text }}\n" + "Input: {{ _seed_tagged_text }}\n" "Skeleton: {{ _validation_skeleton }}\n" '{%- if _tag_notation == "xml" -%}notation-is-xml{%- endif -%}' ) @@ -329,12 +328,12 @@ def _build_row( for span in seed_entities ] ).model_dump(mode="json"), - COL_VALIDATION_CANDIDATES: candidates.model_dump(mode="json"), + COL_SEED_VALIDATION_CANDIDATES: candidates.model_dump(mode="json"), COL_TAG_NOTATION: notation.value, } -_MINIMAL_TEMPLATE = "TAGGED:{{ _merged_tagged_text }}|SKELETON:{{ _validation_skeleton }}|NOTATION:{{ _tag_notation }}" +_MINIMAL_TEMPLATE = "TAGGED:{{ _seed_tagged_text }}|SKELETON:{{ _validation_skeleton }}|NOTATION:{{ _tag_notation }}" class TestChunkedValidateRowPoolOfOne: @@ -363,7 +362,7 @@ def test_single_chunk_single_alias_dispatches_once_and_merges(self) -> None: prompt_template=_MINIMAL_TEMPLATE, ) - out = asyncio.run(chunked_validate_row(row, params, {"v0": facade})) + out = 
chunked_validate_row(row, params, {"v0": facade}) assert len(facade.calls) == 1 decisions = out[COL_VALIDATION_DECISIONS]["decisions"] @@ -376,7 +375,7 @@ def test_empty_candidates_short_circuits_without_calls(self) -> None: pool=["v0"], max_entities_per_call=10, excerpt_window_chars=50, prompt_template=_MINIMAL_TEMPLATE ) - out = asyncio.run(chunked_validate_row(row, params, {"v0": facade})) + out = chunked_validate_row(row, params, {"v0": facade}) assert facade.calls == [] assert out[COL_VALIDATION_DECISIONS] == {"decisions": []} @@ -403,7 +402,7 @@ def test_system_prompt_is_forwarded_to_facade(self) -> None: system_prompt=f"You are a validator. {sentinel}", ) - asyncio.run(chunked_validate_row(row, params, {"v0": facade})) + chunked_validate_row(row, params, {"v0": facade}) assert len(facade.calls) == 1 forwarded = facade.calls[0]["system_prompt"] @@ -432,7 +431,7 @@ def test_system_prompt_default_none_is_forwarded_untouched(self) -> None: # system_prompt intentionally omitted (default None) ) - asyncio.run(chunked_validate_row(row, params, {"v0": facade})) + chunked_validate_row(row, params, {"v0": facade}) assert len(facade.calls) == 1 # The recipe maps ``None`` input to ``None`` output, so the facade @@ -464,7 +463,7 @@ def _respond(prompt: str) -> dict: excerpt_window_chars=50, prompt_template=_MINIMAL_TEMPLATE, ) - asyncio.run(chunked_validate_row(row, params, {"v0": v0, "v1": v1})) + chunked_validate_row(row, params, {"v0": v0, "v1": v1}) # 6 candidates / 2 per chunk = 3 chunks; round-robin → v0,v1,v0 assert len(v0.calls) == 2 assert len(v1.calls) == 1 @@ -508,7 +507,7 @@ def respond(prompt: str) -> dict: excerpt_window_chars=50, prompt_template=_MINIMAL_TEMPLATE, ) - out = asyncio.run(chunked_validate_row(row, params, {"v0": v0, "v1": v1})) + out = chunked_validate_row(row, params, {"v0": v0, "v1": v1}) decisions = {d["id"]: d for d in out[COL_VALIDATION_DECISIONS]["decisions"]} assert set(decisions) == {"c1", "c2", "c3", "c4"} # Chunk 0 (c1,c2) → v0; 
chunk 1 (c3,c4) → v1. @@ -543,7 +542,7 @@ def test_row_fails_only_when_every_pool_member_raises_for_a_chunk(self) -> None: # The last alias tried propagates; order is round-robin + wrap, so # chunk 0 starts at v0 → fails → v1 (fails, last) → raises "from v1". with pytest.raises(RuntimeError, match="forced failure from v1"): - asyncio.run(chunked_validate_row(row, params, {"v0": v0, "v1": v1})) + chunked_validate_row(row, params, {"v0": v0, "v1": v1}) class TestChunkedValidateRowCrossAliasFailover: @@ -561,7 +560,7 @@ def test_primary_alias_failure_falls_over_to_secondary(self) -> None: excerpt_window_chars=50, prompt_template=_MINIMAL_TEMPLATE, ) - out = asyncio.run(chunked_validate_row(row, params, {"v0": v0, "v1": v1})) + out = chunked_validate_row(row, params, {"v0": v0, "v1": v1}) decisions = out[COL_VALIDATION_DECISIONS]["decisions"] assert len(decisions) == 1 assert decisions[0]["id"] == "a" @@ -588,7 +587,7 @@ def test_single_alias_pool_does_not_failover(self) -> None: prompt_template=_MINIMAL_TEMPLATE, ) with pytest.raises(RuntimeError, match="forced failure from v0"): - asyncio.run(chunked_validate_row(row, params, {"v0": v0})) + chunked_validate_row(row, params, {"v0": v0}) assert len(v0.calls) == 1 def test_failover_wraps_starting_from_round_robin_primary(self) -> None: @@ -628,7 +627,7 @@ def v0_response(prompt: str) -> dict: excerpt_window_chars=50, prompt_template=_MINIMAL_TEMPLATE, ) - out = asyncio.run(chunked_validate_row(row, params, {"v0": v0, "v1": v1})) + out = chunked_validate_row(row, params, {"v0": v0, "v1": v1}) decisions = {d["id"]: d["decision"] for d in out[COL_VALIDATION_DECISIONS]["decisions"]} assert decisions == {"a": "keep", "b": "keep", "c": "keep"} # v0 serviced all three chunks: chunk 0 + chunk 2 directly, chunk 1 via failover. 
@@ -655,7 +654,7 @@ def test_decision_for_non_candidate_id_is_dropped(self) -> None: params = ChunkedValidationParams( pool=["v0"], max_entities_per_call=5, excerpt_window_chars=20, prompt_template=_MINIMAL_TEMPLATE ) - out = asyncio.run(chunked_validate_row(row, params, {"v0": facade})) + out = chunked_validate_row(row, params, {"v0": facade}) ids = [d["id"] for d in out[COL_VALIDATION_DECISIONS]["decisions"]] assert ids == ["a"] @@ -672,7 +671,7 @@ def test_pool_alias_missing_from_models_raises_helpful_error(self) -> None: prompt_template=_MINIMAL_TEMPLATE, ) with pytest.raises(KeyError, match="missing_alias"): - asyncio.run(chunked_validate_row(row, params, {"v0": FakeFacade("v0", response={"decisions": []})})) + chunked_validate_row(row, params, {"v0": FakeFacade("v0", response={"decisions": []})}) # --------------------------------------------------------------------------- @@ -688,22 +687,24 @@ def test_decorator_metadata_encodes_pool_and_required_columns(self) -> None: assert set(meta["required_columns"]) == { COL_TEXT, COL_SEED_ENTITIES, - COL_VALIDATION_CANDIDATES, + COL_SEED_VALIDATION_CANDIDATES, COL_TAG_NOTATION, } # Must not declare columns we deliberately don't read; an over-broad - # required_columns would distort DAG ordering elsewhere. + # required_columns would distort DAG ordering elsewhere. In particular + # the post-augmentation views (COL_MERGED_TAGGED_TEXT, COL_VALIDATION_CANDIDATES) + # are downstream of this step and would create a cycle if declared. 
assert COL_VALIDATION_SKELETON not in meta["required_columns"] assert COL_MERGED_TAGGED_TEXT not in meta["required_columns"] + assert COL_VALIDATION_CANDIDATES not in meta["required_columns"] def test_factory_rejects_empty_pool(self) -> None: with pytest.raises(ValueError, match="pool is empty"): make_chunked_validation_generator([]) def test_generator_forwards_to_chunked_validate_row(self) -> None: - """The DD-exposed wrapper is *sync* (bridges DD's thread-pool fan-out - to our async row logic via asyncio.run), so this test calls it - directly and asserts it returns a dict, not a coroutine. + """The DD-exposed wrapper is sync; calling it directly must return a + dict populated by the row logic (not a coroutine). """ spans = [_entity_span("a", "Alice", "first_name", 0, 5)] candidates = _candidates_schema(("a", "Alice", "first_name")) @@ -726,7 +727,6 @@ def test_generator_is_sync_callable_returning_dict(self) -> None: import inspect as _inspect fn = make_chunked_validation_generator(["v0"]) - # The decorator wraps the function; unwrap and check the inner target. inner = _inspect.unwrap(fn) assert not _asyncio.iscoroutinefunction(inner), ( "DD-exposed generator must be sync; an async outer wrapper breaks the default thread-pool engine." 
@@ -847,7 +847,7 @@ def _run( excerpt_window_chars=200, prompt_template=_MINIMAL_TEMPLATE, ) - out = asyncio.run(chunked_validate_row(row, params, {"solo": facade})) + out = chunked_validate_row(row, params, {"solo": facade}) decisions_doc = out[COL_VALIDATION_DECISIONS] validated = apply_validation_decisions(spans, decisions_doc) return decisions_doc, validated, len(facade.calls) diff --git a/tests/engine/test_detection_workflow.py b/tests/engine/test_detection_workflow.py index 4b7a4174..3783dd39 100644 --- a/tests/engine/test_detection_workflow.py +++ b/tests/engine/test_detection_workflow.py @@ -18,10 +18,10 @@ COL_FINAL_ENTITIES, COL_LATENT_ENTITIES, COL_SEED_ENTITIES, + COL_SEED_VALIDATION_CANDIDATES, COL_TAG_NOTATION, COL_TAGGED_TEXT, COL_TEXT, - COL_VALIDATION_CANDIDATES, COL_VALIDATION_DECISIONS, DEFAULT_ENTITY_LABELS, ) @@ -534,7 +534,7 @@ def test_validation_column_is_custom_chunked_generator( assert set(metadata["required_columns"]) == { COL_TEXT, COL_SEED_ENTITIES, - COL_VALIDATION_CANDIDATES, + COL_SEED_VALIDATION_CANDIDATES, COL_TAG_NOTATION, } From 113074d95076d7ea7d54ac561e92355b40720368 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Wed, 22 Apr 2026 16:25:09 -0700 Subject: [PATCH 12/20] docs(detect): fix misleading C5 examples in chunked-validation docstring The rejected-dedup rationale previously used "Apple"/"May" examples, which are actually *different* (value, label) keys -- so dedup would not affect them and the examples didn't demonstrate the concern. The real failure case is same (value, label) carrying different correctness across a document, which happens for short names that collide with common English words: "Ira" / "irate" (substring false positive), "Mark" / "mark", "Bill" / "bill", "Hope" / "hope". Validator would keep one and drop the other; dedup broadcasts one decision across both. 
Made-with: Cursor --- .../engine/detection/chunked_validation.py | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/anonymizer/engine/detection/chunked_validation.py b/src/anonymizer/engine/detection/chunked_validation.py index 2895e399..3cb0e9e9 100644 --- a/src/anonymizer/engine/detection/chunked_validation.py +++ b/src/anonymizer/engine/detection/chunked_validation.py @@ -35,16 +35,26 @@ its own excerpt, even when the same ``(value, label)`` pair appears many times in a row. Deduping by ``(value, label)`` and broadcasting one canonical decision to every duplicate was considered (it is the single -largest cost lever on this path) and rejected: it conflates surface form -with meaning, produces silently-wrong answers when the detector's labels -are context-dependent, and gives us no signal when it does. See the -"Validator chunking" section of ``docs/concepts/detection.md`` and the -``context`` block on the C5 review thread in PR #126 for the full -reasoning. If cost becomes pressing before we have data on how often -duplicates genuinely are semantically equivalent, prefer orthogonal -levers: tighter excerpt windows, larger ``max_entities_per_call`` for -cheap validators, or token-budget-aware chunking (tracked as C7 in the -same review thread). +largest cost lever on this path) and rejected. + +The concern is that the same surface form + label can carry different +correctness across a document. Short names that collide with common +English words are the canonical case: the detector tags ``"Ira"`` as +``first_name`` in both "Ira said hello" (true positive) and "he grew +irate" (false positive from a substring span), producing identical +``(value, label)`` keys. Same for ``Mark`` / ``mark``, ``Bill`` / +``bill``, ``Hope`` / ``hope``, ``Art`` / ``art``. 
The validator is +precisely the component whose job is to resolve these; deduping erases +the question before the validator can answer it, and we have no signal +when it happens. See the "Validator chunking" section of +``docs/concepts/detection.md`` and the C5 review thread in PR #126 for +the full reasoning. + +If cost becomes pressing before we have data on how often duplicates +are genuinely semantically equivalent, prefer orthogonal levers: +tighter excerpt windows, larger ``max_entities_per_call`` for cheap +validators, or token-budget-aware chunking (tracked as C7 in the same +review thread). Concurrency model. ``chunked_validate_row`` dispatches its chunks through a ``ThreadPoolExecutor``. Per-alias concurrency is already enforced downstream From d5fd99238f04049b978d00beb4129b36b536f585 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Wed, 22 Apr 2026 16:34:17 -0700 Subject: [PATCH 13/20] docs: clean up docstrings --- .../engine/detection/chunked_validation.py | 114 ++++-------------- 1 file changed, 26 insertions(+), 88 deletions(-) diff --git a/src/anonymizer/engine/detection/chunked_validation.py b/src/anonymizer/engine/detection/chunked_validation.py index 3cb0e9e9..6c34e456 100644 --- a/src/anonymizer/engine/detection/chunked_validation.py +++ b/src/anonymizer/engine/detection/chunked_validation.py @@ -3,88 +3,34 @@ """Chunked LLM validation for the entity detection pipeline. -Validation is the step that asks an LLM to keep/reclass/drop each candidate -entity produced by the detector. Historically we ran one LLM call per row over -the full tagged text. For long documents with many candidates this hits -context-window and provider rate limits. This module replaces that single -call with a fan-out: partition the row's candidates into chunks, build a -small tagged excerpt of text around each chunk, render the existing -validation prompt per chunk, and dispatch each chunk to an alias selected -round-robin from a configured validator pool. 
- -Failure contract. Each chunk attempts its round-robin-assigned alias first -and, on any terminal exception from that alias (5xx after transport retries, -connection errors, parser errors), fails over sequentially to the rest of -the pool. A chunk only fails when every pool member has raised. The first -failing chunk re-raises out of the custom column generator, at which point -DataDesigner drops the row from the output DataFrame and +Partition a row's validation candidates into chunks, build a small tagged +excerpt around each chunk, render the validation prompt per chunk, and +dispatch each chunk to an alias selected round-robin from a configured +validator pool. The per-chunk decisions are merged into a +``ValidationDecisionsSchema``-shaped payload consumed by +``enrich_validation_decisions``. + +Public entry point: :func:`make_chunked_validation_generator`, which +produces a ``@custom_column_generator``-decorated function bound to a +concrete pool. The helpers below are exposed for unit testing. + +Failure contract. Each chunk attempts its round-robin primary first and +fails over sequentially to the rest of the pool; a chunk only fails when +every pool member has raised. The first failing chunk re-raises out of +the generator, DataDesigner drops the row, and ``NddAdapter._detect_missing_records`` surfaces it as a ``FailedRecord``. -This is the "best effort on first pass, identify and reprocess misses" -contract: raw text never silently leaks through as unscrubbed output. - -The per-chunk decisions are merged into a ``ValidationDecisionsSchema``-shaped -payload so the downstream ``enrich_validation_decisions`` column keeps -working unchanged. - -The public entry point for DataDesigner wiring is -:func:`make_chunked_validation_generator`, which produces a generator function -decorated with ``@custom_column_generator`` and bound to a concrete pool. -The pure helpers below are exposed for unit testing. - -Per-instance validation is intentional. 
Every candidate goes to the LLM in -its own excerpt, even when the same ``(value, label)`` pair appears many -times in a row. Deduping by ``(value, label)`` and broadcasting one -canonical decision to every duplicate was considered (it is the single -largest cost lever on this path) and rejected. - -The concern is that the same surface form + label can carry different -correctness across a document. Short names that collide with common -English words are the canonical case: the detector tags ``"Ira"`` as -``first_name`` in both "Ira said hello" (true positive) and "he grew -irate" (false positive from a substring span), producing identical -``(value, label)`` keys. Same for ``Mark`` / ``mark``, ``Bill`` / -``bill``, ``Hope`` / ``hope``, ``Art`` / ``art``. The validator is -precisely the component whose job is to resolve these; deduping erases -the question before the validator can answer it, and we have no signal -when it happens. See the "Validator chunking" section of -``docs/concepts/detection.md`` and the C5 review thread in PR #126 for -the full reasoning. - -If cost becomes pressing before we have data on how often duplicates -are genuinely semantically equivalent, prefer orthogonal levers: -tighter excerpt windows, larger ``max_entities_per_call`` for cheap -validators, or token-budget-aware chunking (tracked as C7 in the same -review thread). - -Concurrency model. ``chunked_validate_row`` dispatches its chunks through a -``ThreadPoolExecutor``. Per-alias concurrency is already enforced downstream -by each facade's ``ThrottledModelClient`` (AIMD on 429), so we intentionally -do not impose a row-level cap here; the pool exists purely to overlap the -chunks for this single row. Under DataDesigner's opt-in async engine the -sync ``facade.generate()`` calls we make are transparently bridged to -``agenerate`` by the DD runtime, so this code path is agnostic to which -engine is active. +Raw text never silently leaks through as unscrubbed output. + +Concurrency. 
Chunks dispatch through a ``ThreadPoolExecutor``. Per-alias +concurrency is already enforced downstream by each facade's +``ThrottledModelClient`` (AIMD on 429), so there is no row-level cap +here; the pool exists purely to overlap this row's chunks. TODO(async-native): once DataDesigner's async engine becomes the default -(tracking: DATA_DESIGNER_ASYNC_ENGINE opt-in flag flips off), drop the -``ThreadPoolExecutor`` + sync ``facade.generate()`` pattern here in favour -of ``async def`` functions calling ``facade.agenerate()`` directly, with -``asyncio.gather`` replacing the executor. That removes one thread hop per -chunk and lets DD's scheduler see per-chunk dispatch as first-class async -work. See the PR #126 review thread (step 2 of Andre's async -simplification suggestion) for context. - -Eager prompt construction. ``chunked_validate_row`` builds the excerpt, -skeleton, and rendered prompt for every chunk before submitting any worker. -Peak prompt memory is ``len(chunks) * (2 * excerpt_window_chars + skeleton -+ ~3KB template overhead)``, i.e. low-MB even at 1000 entities with -``max_entities_per_call`` in the single digits. At workloads where this -becomes observable (very small ``max_entities_per_call`` on multi-thousand- -entity rows, or large ``excerpt_window_chars``), the fix is to move prompt -construction inside each worker and pair it with a row-level concurrency -cap (otherwise all workers race the construction phase in parallel and the -bound is unchanged). Deferred because we have no evidence of memory -pressure at realistic scale; tracked as C6 in the PR #126 review thread. +(``DATA_DESIGNER_ASYNC_ENGINE`` flips off), replace the +``ThreadPoolExecutor`` + sync ``facade.generate()`` pattern with +``async def`` functions calling ``facade.agenerate()`` under +``asyncio.gather``. 
""" from __future__ import annotations @@ -139,15 +85,7 @@ @functools.lru_cache(maxsize=4) def _compile_template(template: str) -> Any: - """Return a compiled Jinja2 template, cached by source string. - - A row with ``N`` chunks would otherwise re-parse the same template ``N`` - times, which is wasteful for the exact workload this module targets - (high-entity-count rows). ``maxsize=4`` is deliberately tiny: in practice - there is one prompt string per ``EntityDetectionWorkflow`` instance, but - tests may instantiate a handful of distinct templates in a single - process so we keep a small LRU rather than an unbounded cache. - """ + """Return a compiled Jinja2 template, cached by source string.""" return _PROMPT_ENV.from_string(template) From 0757e887651fff6445a2485ad58f68598d03cce6 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Wed, 22 Apr 2026 16:48:41 -0700 Subject: [PATCH 14/20] fix(config): re-validate selected_models overrides instead of model_copy model_copy(update=...) in Pydantic v2 silently skips field validators, so user overrides to selected_models bypassed DetectionModelSelection.normalize_entity_validator entirely: - entity_validator: [] -> accepted as [] (should raise) - entity_validator: [a, a, b] -> not deduped, no warning - entity_validator: [" a ", ""] -> whitespace not stripped Fix routes overrides through type(section).model_validate(merged) in _merge_selections so every field validator re-runs at parse time. Applied uniformly to detection/replace/rewrite to avoid leaving the same landmine for any validators we add to Replace or Rewrite later. Regression tests pin all three failure modes against parse_model_configs itself, which is the entry point the bug was reproduced against. 
Made-with: Cursor --- src/anonymizer/engine/ndd/model_loader.py | 25 ++++++--- tests/engine/test_model_loader.py | 63 +++++++++++++++++++++++ 2 files changed, 82 insertions(+), 6 deletions(-) diff --git a/src/anonymizer/engine/ndd/model_loader.py b/src/anonymizer/engine/ndd/model_loader.py index e2d0d0bb..b90ec162 100644 --- a/src/anonymizer/engine/ndd/model_loader.py +++ b/src/anonymizer/engine/ndd/model_loader.py @@ -10,6 +10,7 @@ from data_designer.config.models import ModelConfig, load_model_configs from data_designer.config.utils.io_helpers import load_config_file +from pydantic import BaseModel from anonymizer.config.models import ( DetectionModelSelection, @@ -186,21 +187,33 @@ def resolve_model_aliases( def _merge_selections(user_selections: dict[str, dict[str, str]] | None) -> ModelSelection: - """Merge user-provided role selections onto YAML defaults.""" + """Merge user-provided role selections onto YAML defaults. + + Re-validates via ``type(section).model_validate(merged)`` rather than + ``model_copy(update=...)``. Pydantic v2's ``model_copy`` silently skips + field validators, which would let invalid pool configs (empty list, + duplicate aliases, whitespace-only entries) bypass + ``DetectionModelSelection.normalize_entity_validator`` at parse time + and surface as opaque runtime failures later. 
+ """ defaults = load_default_model_selection() if not user_selections or not isinstance(user_selections, dict): return defaults + def _merge(section: BaseModel, overrides: dict[str, Any]) -> BaseModel: + if not overrides: + return section + merged = {**section.model_dump(), **overrides} + return type(section).model_validate(merged) + detection_overrides = user_selections.get(WorkflowName.detection.value, {}) replace_overrides = user_selections.get(WorkflowName.replace.value, {}) rewrite_overrides = user_selections.get(WorkflowName.rewrite.value, {}) return ModelSelection( - detection=defaults.detection.model_copy(update=detection_overrides) - if detection_overrides - else defaults.detection, - replace=defaults.replace.model_copy(update=replace_overrides) if replace_overrides else defaults.replace, - rewrite=defaults.rewrite.model_copy(update=rewrite_overrides) if rewrite_overrides else defaults.rewrite, + detection=_merge(defaults.detection, detection_overrides), + replace=_merge(defaults.replace, replace_overrides), + rewrite=_merge(defaults.rewrite, rewrite_overrides), ) diff --git a/tests/engine/test_model_loader.py b/tests/engine/test_model_loader.py index 6af6309d..d99a59bf 100644 --- a/tests/engine/test_model_loader.py +++ b/tests/engine/test_model_loader.py @@ -149,6 +149,69 @@ def test_parse_model_configs_yaml_without_selections_uses_defaults() -> None: assert result.selected_models.detection.entity_detector == "gliner-pii-detector" +class TestParseModelConfigsRevalidatesOverrides: + """User overrides in ``selected_models`` must rerun field validators. + + ``model_copy(update=...)`` in Pydantic v2 silently skips validators, so + the ``DetectionModelSelection.normalize_entity_validator`` checks + (non-empty, deduped, whitespace-stripped) would be bypassed on override + unless ``_merge_selections`` re-validates. Each of the three failure + modes below is pinned as a regression test so future refactors of the + merge path preserve parse-time rejection. 
+ """ + + def test_empty_entity_validator_override_raises_at_parse_time(self) -> None: + yaml_str = """ +selected_models: + detection: + entity_validator: [] +model_configs: + - alias: gliner-pii-detector + model: test/gliner +""" + with pytest.raises(ValueError, match="at least one model alias"): + parse_model_configs(yaml_str) + + def test_duplicate_aliases_in_override_are_deduped_with_warning( + self, + caplog: pytest.LogCaptureFixture, + ) -> None: + yaml_str = """ +selected_models: + detection: + entity_validator: [v1, v2, v1, v3, v2] +model_configs: + - alias: gliner-pii-detector + model: test/gliner + - alias: v1 + model: test/v1 + - alias: v2 + model: test/v2 + - alias: v3 + model: test/v3 +""" + with caplog.at_level("WARNING", logger="anonymizer.config.models"): + result = parse_model_configs(yaml_str) + assert result.selected_models.detection.entity_validator == ["v1", "v2", "v3"] + assert any("duplicate aliases" in r.getMessage() for r in caplog.records) + + def test_whitespace_only_entry_in_override_is_stripped(self) -> None: + yaml_str = """ +selected_models: + detection: + entity_validator: [" v1 ", " ", "v2"] +model_configs: + - alias: gliner-pii-detector + model: test/gliner + - alias: v1 + model: test/v1 + - alias: v2 + model: test/v2 +""" + result = parse_model_configs(yaml_str) + assert result.selected_models.detection.entity_validator == ["v1", "v2"] + + def test_validate_model_alias_references_accepts_valid_detection_aliases( stub_known_model_configs: list[ModelConfig], stub_slim_model_selection: ModelSelection, From 6b613b0b767305c868d8b73f94ef5fbe7dac4632 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Wed, 22 Apr 2026 16:52:07 -0700 Subject: [PATCH 15/20] style(tests): flatten TestParseModelConfigsRevalidatesOverrides to module-level functions Matches the majority style of test_model_loader.py. Class-grouped shared-rationale docstring becomes a short block comment above the three functions. 
Made-with: Cursor --- tests/engine/test_model_loader.py | 51 +++++++++++++++---------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/tests/engine/test_model_loader.py b/tests/engine/test_model_loader.py index d99a59bf..2527d68e 100644 --- a/tests/engine/test_model_loader.py +++ b/tests/engine/test_model_loader.py @@ -149,19 +149,15 @@ def test_parse_model_configs_yaml_without_selections_uses_defaults() -> None: assert result.selected_models.detection.entity_detector == "gliner-pii-detector" -class TestParseModelConfigsRevalidatesOverrides: - """User overrides in ``selected_models`` must rerun field validators. - - ``model_copy(update=...)`` in Pydantic v2 silently skips validators, so - the ``DetectionModelSelection.normalize_entity_validator`` checks - (non-empty, deduped, whitespace-stripped) would be bypassed on override - unless ``_merge_selections`` re-validates. Each of the three failure - modes below is pinned as a regression test so future refactors of the - merge path preserve parse-time rejection. - """ +# parse_model_configs regression tests: user overrides in selected_models must +# rerun field validators. model_copy(update=...) in Pydantic v2 silently skips +# them, so the three DetectionModelSelection.normalize_entity_validator checks +# (non-empty, deduped, whitespace-stripped) would be bypassed on override +# unless _merge_selections re-validates. 
+ - def test_empty_entity_validator_override_raises_at_parse_time(self) -> None: - yaml_str = """ +def test_parse_model_configs_rejects_empty_entity_validator_override() -> None: + yaml_str = """ selected_models: detection: entity_validator: [] @@ -169,14 +165,14 @@ def test_empty_entity_validator_override_raises_at_parse_time(self) -> None: - alias: gliner-pii-detector model: test/gliner """ - with pytest.raises(ValueError, match="at least one model alias"): - parse_model_configs(yaml_str) + with pytest.raises(ValueError, match="at least one model alias"): + parse_model_configs(yaml_str) - def test_duplicate_aliases_in_override_are_deduped_with_warning( - self, - caplog: pytest.LogCaptureFixture, - ) -> None: - yaml_str = """ + +def test_parse_model_configs_dedupes_duplicate_override_aliases_with_warning( + caplog: pytest.LogCaptureFixture, +) -> None: + yaml_str = """ selected_models: detection: entity_validator: [v1, v2, v1, v3, v2] @@ -190,13 +186,14 @@ def test_duplicate_aliases_in_override_are_deduped_with_warning( - alias: v3 model: test/v3 """ - with caplog.at_level("WARNING", logger="anonymizer.config.models"): - result = parse_model_configs(yaml_str) - assert result.selected_models.detection.entity_validator == ["v1", "v2", "v3"] - assert any("duplicate aliases" in r.getMessage() for r in caplog.records) + with caplog.at_level("WARNING", logger="anonymizer.config.models"): + result = parse_model_configs(yaml_str) + assert result.selected_models.detection.entity_validator == ["v1", "v2", "v3"] + assert any("duplicate aliases" in r.getMessage() for r in caplog.records) - def test_whitespace_only_entry_in_override_is_stripped(self) -> None: - yaml_str = """ + +def test_parse_model_configs_strips_whitespace_only_override_entries() -> None: + yaml_str = """ selected_models: detection: entity_validator: [" v1 ", " ", "v2"] @@ -208,8 +205,8 @@ def test_whitespace_only_entry_in_override_is_stripped(self) -> None: - alias: v2 model: test/v2 """ - result = 
parse_model_configs(yaml_str) - assert result.selected_models.detection.entity_validator == ["v1", "v2"] + result = parse_model_configs(yaml_str) + assert result.selected_models.detection.entity_validator == ["v1", "v2"] def test_validate_model_alias_references_accepts_valid_detection_aliases( From ff80afcea46995a1dcdf23d0b0eb0b2556e99726 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Wed, 22 Apr 2026 17:31:34 -0700 Subject: [PATCH 16/20] make workflow yaml validation helper work with list valued pools --- src/anonymizer/engine/ndd/model_loader.py | 17 ++++++-- tests/engine/test_model_loader.py | 50 +++++++++++++++++++++++ 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/src/anonymizer/engine/ndd/model_loader.py b/src/anonymizer/engine/ndd/model_loader.py index b90ec162..d74cbac3 100644 --- a/src/anonymizer/engine/ndd/model_loader.py +++ b/src/anonymizer/engine/ndd/model_loader.py @@ -258,12 +258,23 @@ def _collect_role(bucket: dict[str, str], path: str, value: object) -> None: def _validate_alias_references( models_config: dict[str, Any], - selections: dict[str, str], + selections: dict[str, str | list[str]], workflow_name: str, ) -> None: - """Validate that bundled workflow YAMLs reference aliases defined in models.yaml.""" + """Validate that bundled workflow YAMLs reference aliases defined in models.yaml. + + Handles both scalar-valued roles and list-valued roles (e.g. a validator + pool). A plain ``set(selections.values())`` would raise + ``TypeError: unhashable type: 'list'`` once any role is list-valued. + """ known_aliases = {m["alias"] for m in models_config.get("model_configs", [])} - unknown = set(selections.values()) - known_aliases + referenced: set[str] = set() + for value in selections.values(): + if isinstance(value, list): + referenced.update(value) + else: + referenced.add(value) + unknown = referenced - known_aliases if unknown: raise ValueError( f"Workflow '{workflow_name}' references unknown model aliases: {unknown}. 
Known aliases: {known_aliases}" diff --git a/tests/engine/test_model_loader.py b/tests/engine/test_model_loader.py index 2527d68e..14e94710 100644 --- a/tests/engine/test_model_loader.py +++ b/tests/engine/test_model_loader.py @@ -32,6 +32,56 @@ def test_load_workflow_config_contains_selected_models() -> None: assert isinstance(entity_detector, str) and entity_detector +def test_load_workflow_config_accepts_list_valued_entity_validator(tmp_path) -> None: + """A list-valued role in detection.yaml must not blow up the alias check. + + Regression test for ``set(selections.values())`` raising + ``TypeError: unhashable type: 'list'`` once any role is list-valued. + """ + (tmp_path / "models.yaml").write_text( + "model_configs:\n" + " - alias: gliner\n" + " model: test/gliner\n" + " - alias: v1\n" + " model: test/v1\n" + " - alias: v2\n" + " model: test/v2\n" + " - alias: a\n" + " model: test/a\n" + ) + (tmp_path / "detection.yaml").write_text( + "selected_models:\n" + " entity_detector: gliner\n" + " entity_validator: [v1, v2]\n" + " entity_augmenter: a\n" + " latent_detector: gliner\n" + ) + config = load_workflow_config(WorkflowName.detection, tmp_path) + assert config["selected_models"]["entity_validator"] == ["v1", "v2"] + + +def test_load_workflow_config_raises_on_unknown_alias_in_validator_pool(tmp_path) -> None: + """An unknown alias inside a list-valued pool should surface by name.""" + (tmp_path / "models.yaml").write_text( + "model_configs:\n" + " - alias: gliner\n" + " model: test/gliner\n" + " - alias: v1\n" + " model: test/v1\n" + " - alias: a\n" + " model: test/a\n" + ) + (tmp_path / "detection.yaml").write_text( + "selected_models:\n" + " entity_detector: gliner\n" + " entity_validator: [v1, does-not-exist]\n" + " entity_augmenter: a\n" + " latent_detector: gliner\n" + ) + with pytest.raises(ValueError, match="does-not-exist"): + load_workflow_config(WorkflowName.detection, tmp_path) + + def test_get_model_alias_reads_workflow_mapping() -> None: alias = 
get_model_alias(workflow_name=WorkflowName.detection, role="entity_validator") assert isinstance(alias, str) and alias From 4eaee1dde1861332637ac1f79edaa4ed1773a139 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Wed, 22 Apr 2026 18:52:28 -0700 Subject: [PATCH 17/20] display: remove prompt template from logging of validation step --- .../engine/detection/chunked_validation.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/anonymizer/engine/detection/chunked_validation.py b/src/anonymizer/engine/detection/chunked_validation.py index 6c34e456..58706469 100644 --- a/src/anonymizer/engine/detection/chunked_validation.py +++ b/src/anonymizer/engine/detection/chunked_validation.py @@ -106,13 +106,21 @@ class ChunkedValidationParams(BaseModel): ``_seed_tagged_text``, ``_validation_skeleton``, ``_tag_notation`` placeholders). Typically produced by ``_get_validation_prompt``. system_prompt: Optional system prompt forwarded to each chunk call. + + ``prompt_template`` and ``system_prompt`` are marked ``repr=False`` because + DataDesigner's pre-generation logger f-strings this model + (``generator_params: {params}``) and our validation prompt is multi-kB of + entity rules; a non-trivial system prompt would compound that. Hiding + them from ``__str__``/``__repr__`` keeps setup logs readable without + touching serialization — ``model_dump()`` still carries both, so the + generator receives them unchanged. 
""" pool: list[str] = Field(min_length=1) max_entities_per_call: int = Field(gt=0) excerpt_window_chars: int = Field(gt=0) - prompt_template: str - system_prompt: str | None = None + prompt_template: str = Field(repr=False) + system_prompt: str | None = Field(default=None, repr=False) # --------------------------------------------------------------------------- From 025cbf760dd31baf0d26b98e370c3dd0866a4a8b Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Wed, 22 Apr 2026 18:59:32 -0700 Subject: [PATCH 18/20] update logging --- src/anonymizer/engine/detection/chunked_validation.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/anonymizer/engine/detection/chunked_validation.py b/src/anonymizer/engine/detection/chunked_validation.py index 58706469..5d30f278 100644 --- a/src/anonymizer/engine/detection/chunked_validation.py +++ b/src/anonymizer/engine/detection/chunked_validation.py @@ -426,6 +426,15 @@ def chunked_validate_row( ordered = order_candidates_by_position(candidates, all_spans) chunks = chunk_candidates(ordered, params.max_entities_per_call) + logger.debug( + "chunked validation: %d candidate(s) in %d chunk(s) (max=%d per chunk, window=%d chars), pool=%s", + len(ordered), + len(chunks), + params.max_entities_per_call, + params.excerpt_window_chars, + params.pool, + ) + dispatch_kwargs_per_chunk: list[dict[str, Any]] = [] for chunk_index, chunk in enumerate(chunks): chunk_candidates_ = [pair[0] for pair in chunk] From 743a1327136111cd6929e6e7bf5139b9648bb211 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Thu, 23 Apr 2026 10:18:55 -0700 Subject: [PATCH 19/20] apply fast path for single chunk --- .../engine/detection/chunked_validation.py | 48 +++++++++++----- tests/engine/test_chunked_validation.py | 56 +++++++++++++++++++ 2 files changed, 90 insertions(+), 14 deletions(-) diff --git a/src/anonymizer/engine/detection/chunked_validation.py b/src/anonymizer/engine/detection/chunked_validation.py index 5d30f278..3efaacdf 100644 --- 
a/src/anonymizer/engine/detection/chunked_validation.py +++ b/src/anonymizer/engine/detection/chunked_validation.py @@ -426,25 +426,45 @@ def chunked_validate_row( ordered = order_candidates_by_position(candidates, all_spans) chunks = chunk_candidates(ordered, params.max_entities_per_call) - logger.debug( - "chunked validation: %d candidate(s) in %d chunk(s) (max=%d per chunk, window=%d chars), pool=%s", - len(ordered), - len(chunks), - params.max_entities_per_call, - params.excerpt_window_chars, - params.pool, - ) + if len(chunks) == 1: + logger.debug( + "chunked validation: %d candidate(s) in 1 chunk (full-text excerpt), pool=%s", + len(ordered), + params.pool, + ) + else: + logger.debug( + "chunked validation: %d candidate(s) in %d chunks (max=%d per chunk, window=%d chars), pool=%s", + len(ordered), + len(chunks), + params.max_entities_per_call, + params.excerpt_window_chars, + params.pool, + ) + + # Single-chunk rows preserve parity with the pre-chunking + # ``LLMStructuredColumnConfig`` path by sending the fully tagged + # document. The excerpt window is strictly a cost-control lever for + # multi-chunk dispatch (it bounds per-chunk input tokens); when we're + # only making one call there's no cost reason to clip, and clipping + # would silently narrow the context the validator sees. Computed once + # here because ``len(chunks) == 1`` is loop-invariant. 
+ single_chunk_tagged_text = build_tagged_text(text, all_spans, notation=notation) if len(chunks) == 1 else None dispatch_kwargs_per_chunk: list[dict[str, Any]] = [] for chunk_index, chunk in enumerate(chunks): chunk_candidates_ = [pair[0] for pair in chunk] chunk_spans = [pair[1] for pair in chunk] - excerpt = build_chunk_excerpt( - text=text, - chunk_spans=chunk_spans, - all_spans=all_spans, - window_chars=params.excerpt_window_chars, - notation=notation, + excerpt = ( + single_chunk_tagged_text + if single_chunk_tagged_text is not None + else build_chunk_excerpt( + text=text, + chunk_spans=chunk_spans, + all_spans=all_spans, + window_chars=params.excerpt_window_chars, + notation=notation, + ) ) skeleton = build_chunk_skeleton(chunk_candidates_) prompt = render_chunk_prompt( diff --git a/tests/engine/test_chunked_validation.py b/tests/engine/test_chunked_validation.py index 78b8fdca..6baa23b7 100644 --- a/tests/engine/test_chunked_validation.py +++ b/tests/engine/test_chunked_validation.py @@ -368,6 +368,62 @@ def test_single_chunk_single_alias_dispatches_once_and_merges(self) -> None: decisions = out[COL_VALIDATION_DECISIONS]["decisions"] assert {d["id"]: d["decision"] for d in decisions} == {"a": "keep", "b": "drop"} + def test_single_chunk_sends_single_chunk_tagged_text_not_windowed_excerpt(self) -> None: + """Single-chunk rows must receive the fully tagged document, not a windowed excerpt. + + Regression: before this fix, every chunk — including a lone single + chunk — was routed through ``build_chunk_excerpt`` with the configured + window. That silently narrowed the validator's context relative to the + pre-chunking ``LLMStructuredColumnConfig`` path, which always served + the full ``_seed_tagged_text``. The excerpt window is a multi-chunk + cost-control lever; applying it to the single-call path is incorrect. + """ + prefix = "HEADER_MARKER_ALPHA " * 80 + suffix = " FOOTER_MARKER_OMEGA" * 80 + middle = "Alice met Bob." 
+ text = prefix + middle + suffix + alice_start = len(prefix) + bob_start = alice_start + 10 + + spans = [ + _entity_span("a", "Alice", "first_name", alice_start, alice_start + 5), + _entity_span("b", "Bob", "first_name", bob_start, bob_start + 3), + ] + candidates = _candidates_schema( + ("a", "Alice", "first_name"), + ("b", "Bob", "first_name"), + ) + row = _build_row(text=text, seed_entities=spans, candidates=candidates) + + facade = FakeFacade( + "v0", + response={ + "decisions": [ + {"id": "a", "decision": "keep"}, + {"id": "b", "decision": "keep"}, + ] + }, + ) + params = ChunkedValidationParams( + pool=["v0"], + max_entities_per_call=10, + excerpt_window_chars=50, + prompt_template=_MINIMAL_TEMPLATE, + ) + + chunked_validate_row(row, params, {"v0": facade}) + + assert len(facade.calls) == 1 + prompt = facade.calls[0]["prompt"] + assert "HEADER_MARKER_ALPHA" in prompt, ( + "single-chunk row should receive the full text prefix; a 50-char window " + "around Alice/Bob would clip the prefix entirely." + ) + assert "FOOTER_MARKER_OMEGA" in prompt, ( + "single-chunk row should receive the full text suffix; a 50-char window " + "around Alice/Bob would clip the suffix entirely." 
+ ) + def test_empty_candidates_short_circuits_without_calls(self) -> None: row = _build_row(text="hello", seed_entities=[], candidates=_candidates_schema()) facade = FakeFacade("v0", response={"decisions": []}) From 6cf1ce3f6ebf88e3dd1cf10087f95216520fe565 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Thu, 23 Apr 2026 10:24:14 -0700 Subject: [PATCH 20/20] treat none decisions as not present when chunking validation --- .../engine/detection/chunked_validation.py | 28 +++++++++---------- tests/engine/test_chunked_validation.py | 23 +++++++++++++++ 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/src/anonymizer/engine/detection/chunked_validation.py b/src/anonymizer/engine/detection/chunked_validation.py index 3efaacdf..50601cad 100644 --- a/src/anonymizer/engine/detection/chunked_validation.py +++ b/src/anonymizer/engine/detection/chunked_validation.py @@ -251,12 +251,14 @@ def merge_chunk_decisions( - Only decisions whose ids match a known candidate are retained. This is consistent with ``enrich_validation_decisions``, which also filters to valid ids; doing it here too keeps COL_VALIDATION_DECISIONS minimal. - - Duplicate ids across chunks keep the first occurrence. Duplicates - shouldn't happen because candidates partition cleanly, but deduping is - cheap defence-in-depth. - - Candidates with no decision at all flow through as absent; downstream - ``apply_validation_decisions`` treats them as ``keep`` with the original - label, which matches prior behaviour for a partially-answered response. + - Null-decision entries are treated as "no answer" and do NOT reserve + the id, so if a later chunk yields a real verdict for the same id, + that verdict wins. The null entry itself never leaks through: downstream + ``apply_validation_decisions`` relies on candidate-not-in-output to + mean "keep unchanged", which would break if we emitted ``decision=null``. 
+ - Among multiple real verdicts for the same id (shouldn't happen because + candidates partition cleanly, but kept as defence-in-depth), the first + wins. """ candidate_lookup = {c.id: c for c in candidates.candidates} valid_ids = set(candidate_lookup) @@ -266,25 +268,23 @@ def merge_chunk_decisions( for decision in result.decisions: if decision.id not in valid_ids or decision.id in seen: continue + # Skip null-decision entries without marking the id as seen, so a + # later chunk with a real verdict for the same id can still win. + if decision.decision is None: + continue cand = candidate_lookup[decision.id] merged_decisions.append( { "id": decision.id, "value": cand.value, "label": cand.label, - "decision": decision.decision.value if decision.decision is not None else None, + "decision": decision.decision.value, "proposed_label": decision.proposed_label or "", "reason": decision.reason, } ) seen.add(decision.id) - # Validate the final shape so malformed output fails loud here rather than in a - # downstream parser. ``ValidationDecisionsSchema.decision`` is non-optional, so - # we drop rows where the LLM didn't supply a decision; the downstream - # "no decision -> keep" semantics rely on candidate-not-in-output, not on a - # null-decision entry. 
- filtered = [d for d in merged_decisions if d["decision"] is not None] - return ValidationDecisionsSchema.model_validate({"decisions": filtered}).model_dump(mode="json") + return ValidationDecisionsSchema.model_validate({"decisions": merged_decisions}).model_dump(mode="json") # --------------------------------------------------------------------------- diff --git a/tests/engine/test_chunked_validation.py b/tests/engine/test_chunked_validation.py index 6baa23b7..f9b402a3 100644 --- a/tests/engine/test_chunked_validation.py +++ b/tests/engine/test_chunked_validation.py @@ -299,6 +299,29 @@ def test_drops_decisions_without_verdict(self) -> None: merged = merge_chunk_decisions([chunk], candidates) assert merged == {"decisions": []} + def test_later_real_verdict_wins_over_earlier_null_duplicate(self) -> None: + """A null-decision entry must not reserve the id against a later real verdict. + + If an earlier chunk returns ``decision=None`` for id 'a' and a later + chunk returns ``decision='keep'`` for 'a', the merged payload should + retain the real verdict. Otherwise a transient "no answer" in one + chunk would suppress a valid answer from another. + """ + candidates = _candidates_schema(("a", "Alice", "first_name")) + chunk_one = RawValidationDecisionsSchema.model_validate({"decisions": [{"id": "a", "decision": None}]}) + chunk_two = RawValidationDecisionsSchema.model_validate({"decisions": [{"id": "a", "decision": "keep"}]}) + merged = merge_chunk_decisions([chunk_one, chunk_two], candidates) + assert merged["decisions"] == [ + { + "id": "a", + "value": "Alice", + "label": "first_name", + "decision": "keep", + "proposed_label": "", + "reason": None, + } + ] + # --------------------------------------------------------------------------- # Async dispatch: chunked_validate_row end-to-end with fake facades