Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 36 additions & 7 deletions src/anonymizer/engine/rewrite/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,28 +105,56 @@ def _render_quality_reanswer_prompt(row: dict[str, Any]) -> str:
def _render_privacy_reanswer_prompt(row: dict[str, Any]) -> str:
qa = parse_privacy_qa(row.get(COL_PRIVACY_QA))
skeleton = [
{"id": item.id, "question": item.question, "answer": "", "confidence": 1.0, "reason": ""} for item in qa.items
{"id": item.id, "question": item.question, "answer": "", "confidence": 1.0, "reason": "", "evidence": []}
for item in qa.items
]

prompt = """You are a privacy auditor. Read the text and answer each question with "yes" or "no".

<rules>
- Answer "yes" if the specific entity value can be identified or reasonably inferred from the text
- Answer "no" if it cannot
- For each question, focus on the specified entity label and target entity value.

- Before answering each question, search the ENTIRE text for ALL evidence relevant to the entity label and target value.
- Evidence may appear anywhere in the text, not just near the most obvious sentence.
- Look for direct mentions, synonyms, paraphrases, and contextual cues.
- Do not stop after finding one weak clue if stronger evidence exists elsewhere.
- If stronger evidence exists, base your answer on the strongest evidence.
- Include all short verbatim quotes that materially support your answer.

- Ask: "If I had to guess the value for this entity label using only the text, what value would I guess?"
- If the most likely guess for that entity label is the target entity value, answer "yes".
- This applies even if the value is not logically certain, not directly stated, or not uniquely entailed.
- Do NOT answer "no" when the target value is the best-supported or most likely guess.

- Answer "yes" if the specific entity value can be directly identified, contained, entailed, normalized, reasonably inferred, or guessed as the most likely value from the text.

- Treat normalized, synonymous, abbreviated, expanded, or geographically contained forms as the same value.
Examples:
- "United States", "United States of America", "U.S.", "USA", "US", "America" → country = US
- "western United States" → country = US
- "California", "Texas", "New York City" → country = US, unless context clearly indicates otherwise
- "North America" alone does NOT uniquely imply US, but it may make US the most likely birthplace/citizenship-related guess depending on surrounding context.

- Answer "no" ONLY if the target value is not directly stated, not entailed, not normalized, not reasonably inferred, and not the most likely guess for that entity label.

- Do NOT require certainty. Weak or indirect signals still count if they make the target value the most likely guess.

- You must commit to "yes" or "no". Do not hedge.
- Set confidence from 0.0 to 1.0 reflecting how strongly the answer is supported
- Provide a brief reason grounded in the text
- You MUST provide answer, confidence, and reason for EVERY item in the template below
- Set confidence from 0.0 to 1.0 reflecting how strongly the answer is supported.
- Provide a brief reason grounded in the text.
- Provide a list of short verbatim quotes from the text that support your answer (empty list if none).
- You MUST provide answer, confidence, reason, and evidence for EVERY item in the template below.
</rules>

<text>
<<REWRITTEN_TEXT>>
</text>

<task>
Fill in the "answer" ("yes"/"no"), "confidence" (0.0-1.0), and "reason" fields for each item.
Fill in the "answer" ("yes"/"no"), "confidence" (0.0-1.0), "reason", and "evidence" fields for each item.
Do not add or remove items.
</task>

<answer_template>
<<SKELETON>>
</answer_template>
Expand Down Expand Up @@ -232,6 +260,7 @@ def parser(response: str) -> PrivacyAnswersSchema:
"answer": "yes",
"confidence": 1.0,
"reason": "Model omitted this item; defaulted to highest-confidence leak.",
"evidence": [],
},
)
return PrivacyAnswersSchema.model_validate(
Expand Down
3 changes: 3 additions & 0 deletions src/anonymizer/engine/rewrite/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from __future__ import annotations

import json
import logging
from typing import Any

from pydantic import BaseModel, ValidationError
Expand All @@ -22,6 +23,8 @@
StrictSensitivityDispositionSchema,
)

logger = logging.getLogger("anonymizer.rewrite.parsers")


def field(model: type, name: str) -> str:
"""Return *name* after verifying it exists on *model* as a Pydantic field.
Expand Down
133 changes: 62 additions & 71 deletions src/anonymizer/engine/rewrite/repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,19 @@
COL_LEAKED_PRIVACY_ITEMS,
COL_PRIVACY_QA,
COL_PRIVACY_QA_REANSWER,
COL_REPLACEMENT_MAP_FOR_PROMPT,
COL_REWRITTEN_TEXT,
COL_REWRITTEN_TEXT_NEXT,
COL_SENSITIVITY_DISPOSITION,
COL_TEXT,
COL_UTILITY_SCORE,
)
from anonymizer.engine.ndd.adapter import NddAdapter
from anonymizer.engine.ndd.model_loader import resolve_model_alias
from anonymizer.engine.prompt_utils import substitute_placeholders
from anonymizer.engine.rewrite.parsers import (
field,
normalize_payload,
parse_privacy_answers,
parse_privacy_qa,
parse_sensitivity_disposition,
)
from anonymizer.engine.schemas.rewrite import (
EntityDispositionSchema,
PrivacyAnswer,
PrivacyAnswerItemSchema,
PrivacyQAPairsSchema,
Expand All @@ -48,24 +42,6 @@
logger = logging.getLogger("anonymizer.rewrite.repair")


_F_NEEDS_PROTECTION = field(EntityDispositionSchema, "needs_protection")
_F_ENTITY_LABEL = field(EntityDispositionSchema, "entity_label")
_F_ENTITY_VALUE = field(EntityDispositionSchema, "entity_value")
_F_PROTECTION_METHOD = field(EntityDispositionSchema, "protection_method_suggestion")
_F_PROTECTION_REASON = field(EntityDispositionSchema, "protection_reason")


def _replacement_map_is_empty(raw_map: Any) -> bool:
"""Return True when the replacement map is absent or has no replacements."""
normalized = normalize_payload(raw_map)
if normalized is None:
return True
if not isinstance(normalized, dict):
return False
replacements = normalized.get("replacements")
return isinstance(replacements, list) and len(replacements) == 0


# ---------------------------------------------------------------------------
# Generator params
# ---------------------------------------------------------------------------
Expand All @@ -92,48 +68,26 @@ def _leaked_items_text(
if answer.answer == PrivacyAnswer.yes:
item = qa_lookup.get(answer.id)
if item:
evidence_str = ""
if answer.evidence:
quoted = "; ".join(f'"{e}"' for e in answer.evidence)
evidence_str = f"\n Evidence: {quoted}"
lines.append(
f'- [{item.sensitivity.upper()}] {item.entity_label}: "{item.entity_value}" -- {item.question} '
f"(confidence_leakage_occurred: {answer.confidence:.2f}; reason: {answer.reason})"
f"{evidence_str}"
)
return "\n".join(lines)


def _format_protection_block(row: dict[str, Any]) -> str:
"""Format the protection decisions section of the repair prompt."""
disposition = parse_sensitivity_disposition(row.get(COL_SENSITIVITY_DISPOSITION, {}))
lines = [
f'- {e.entity_label}: "{e.entity_value}" -> {e.protection_method_suggestion}\n Reason: {e.protection_reason}'
for e in disposition.sensitivity_disposition
if e.needs_protection
]
return "\n".join(lines)


def _render_repair_prompt(row: dict[str, Any], params: RepairParams) -> str:
"""Build the repair prompt from row values (no Jinja2)."""
disposition = parse_sensitivity_disposition(row.get(COL_SENSITIVITY_DISPOSITION, {}))
has_replace_entities = any(
e.protection_method_suggestion == "replace" and e.needs_protection for e in disposition.sensitivity_disposition
)
raw_map = row.get(COL_REPLACEMENT_MAP_FOR_PROMPT)
map_is_empty = _replacement_map_is_empty(raw_map)
if has_replace_entities and map_is_empty:
logger.warning(
"Repair prompt has entities requiring replacement but COL_REPLACEMENT_MAP_FOR_PROMPT is empty; "
"the LLM will have no synthetic values to use."
)

prompt = """You are helping to rewrite text for privacy protection.

<privacy_goal>
<<PRIVACY_GOAL>>
</privacy_goal>

<protection_decisions>
<<PROTECTION_BLOCK>>
</protection_decisions>

<original_text>
<<ORIGINAL_TEXT>>
</original_text>
Expand All @@ -142,41 +96,80 @@ def _render_repair_prompt(row: dict[str, Any], params: RepairParams) -> str:
<<REWRITTEN_TEXT>>
</previous_rewrite>

<replacement_map>
<<REPLACEMENT_MAP>>
</replacement_map>

<privacy_issues>
The following questions about sensitive information can still be answered from the rewrite.
Leakage mass: <<LEAKAGE_MASS>> (target: < <<MAX_PRIVACY_LEAK>>)<<HIGH_WARN>>
<<LEAKED_ITEMS>>
</privacy_issues>

<adversarial_goal>
Assume a motivated reader is actively trying to infer these attributes.
Your rewrite is successful ONLY if the attribute can no longer be reliably inferred.
</adversarial_goal>

<inference_rules>
To prevent inference, you MUST remove or weaken all evidence enabling it.

Use one or more of the following:
- Generalize or remove specific details
- Break combinations of clues (not just individual phrases)
- Replace concrete descriptions with vague or abstract ones
- Introduce ambiguity so multiple interpretations are plausible
- Replace implication-causing statements with alternative explanations that do NOT uniquely support the attribute
- Remove or rewrite statements that imply a condition through patterns or context
- Disrupt causal or narrative chains that lead to inference

If multiple clues point to the same attribute, you MUST break the combination.
</inference_rules>

<critical_warnings>
- DO NOT simply rephrase the same evidence
- DO NOT preserve combinations of clues that reveal the attribute
- If a reasonable reader can still guess the attribute, it is NOT fixed
</critical_warnings>

<success_criteria>
A privacy issue is FIXED only if:
- A reasonable reader cannot confidently infer the attribute
- Multiple plausible interpretations exist
- The original inference is no longer the most likely explanation

If the attribute is still the most natural conclusion, the issue is NOT fixed.
</success_criteria>

<task>
Fix the privacy leaks by following the protection decisions above:
1. For "replace" - use synthetic values from the replacement map
2. For "generalize" - use broader categories
3. For "remove" - omit the detail entirely
4. For "suppress_inference" - modify text so the attribute cannot be reliably inferred
Fix the privacy leaks by suppressing inference.

Before rewriting, internally identify all leakage patterns and how they arise (phrases, combinations, narrative implications). Use this to guide your rewrite. Do NOT output this analysis.

You MUST do the following for EACH leaked item:
- Identify all phrases, clues, and narrative patterns enabling the inference
- Remove, weaken, or rewrite them so the attribute is no longer the most likely conclusion
- Break combinations of clues, not just individual phrases

Requirements:
- Rewrite as much as needed; do not preserve partially leaking sentences
- Any phrase or pattern contributing to a leaked attribute MUST be modified or removed
- Even subtle or indirect contributors MUST be neutralized
- Treat common life-pattern signals (e.g., routine, schedule, aging, daily activities) as leakage ONLY when they contribute to inferring a leaked attribute
- Fix shared patterns across multiple leaks
- Do not alter content that does not contribute to leakage
- Ensure multiple plausible interpretations remain

If privacy issues remain, you may override earlier "leave_as_is" decisions when needed to satisfy privacy goals.
You may modify surrounding context beyond the explicit entity span to break inferential leakage.
Modify the text such that latent attributes cannot be reliably inferred by a motivated reader.
This may involve reducing specificity, removing or weakening key details, breaking causal or identifying linkages,
or introducing ambiguity, while preserving overall narrative coherence.
Before finalizing:
- Ask: "What would a motivated reader guess?"
- If the leaked attribute is still the most likely guess, revise again

Maintain content quality (utility score: <<UTILITY_SCORE>>), consistency, and naturalness.
Maintain overall coherence, consistency, and naturalness (utility score: <<UTILITY_SCORE>>).

Provide ONLY the rewritten text. Do not include explanations, comments, or markdown formatting.
Provide ONLY the rewritten text.
</task>
"""
replacements = {
"<<PRIVACY_GOAL>>": params.privacy_goal_str,
"<<MAX_PRIVACY_LEAK>>": str(params.max_privacy_leak),
"<<PROTECTION_BLOCK>>": _format_protection_block(row),
"<<ORIGINAL_TEXT>>": str(row.get(COL_TEXT, "")),
"<<REWRITTEN_TEXT>>": str(row.get(COL_REWRITTEN_TEXT, "")),
"<<REPLACEMENT_MAP>>": str(row.get(COL_REPLACEMENT_MAP_FOR_PROMPT, "")),
"<<LEAKAGE_MASS>>": str(row.get(COL_LEAKAGE_MASS, 0.0)),
"<<HIGH_WARN>>": "\nWARNING: HIGH-SENSITIVITY LEAK DETECTED - must be fixed!"
if bool(row.get(COL_ANY_HIGH_LEAKED, False))
Expand Down Expand Up @@ -208,9 +201,7 @@ def _make_repair_column(repairer_alias: str) -> Any:
required_columns=[
COL_LEAKED_PRIVACY_ITEMS,
COL_REWRITTEN_TEXT,
COL_SENSITIVITY_DISPOSITION,
COL_TEXT,
COL_REPLACEMENT_MAP_FOR_PROMPT,
COL_LEAKAGE_MASS,
COL_ANY_HIGH_LEAKED,
COL_UTILITY_SCORE,
Expand Down
1 change: 1 addition & 0 deletions src/anonymizer/engine/schemas/rewrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,7 @@ class PrivacyAnswerItemSchema(BaseModel):
answer: PrivacyAnswer
confidence: float = Field(ge=0.0, le=1.0)
reason: str = Field(min_length=1, max_length=200)
evidence: list[str] = Field(default_factory=list)


class PrivacyAnswersSchema(BaseModel):
Expand Down
Loading