From 861e6452bf48aeb60a121c393a58f36128cf81de Mon Sep 17 00:00:00 2001 From: asteier2026 Date: Mon, 11 May 2026 08:59:41 -0700 Subject: [PATCH 1/6] feature: add docs for sensitivity and protection method --- docs/concepts/rewrite.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/concepts/rewrite.md b/docs/concepts/rewrite.md index f200d1b7..372ecf7c 100644 --- a/docs/concepts/rewrite.md +++ b/docs/concepts/rewrite.md @@ -15,6 +15,14 @@ The text is then rewritten to reduce identifiability, applying targeted transfor --- +## Key concepts + +**Sensitivity** measures the intrinsic re-identification damage an entity causes if it appears in the output — independently of what else is retained. It is not the protection decision; it feeds the downstream leakage scoring system. + +**Protection method** describes how a sensitive entity is transformed. The choice reflects a holistic view of the document — what other entities are being protected and how shapes what each individual entity needs. The general defaults are: direct identifiers are replaced with plausible synthetic alternatives, quasi-identifiers are generalized to a broader form (e.g., an exact date becomes a quarter, a city becomes a region), and latent entities receive `suppress_inference`, meaning the surrounding text is rewritten to remove the cues that enable the inference rather than replacing a stated value. Entities that do not require protection are left as-is. Occasionally an entity is removed outright when neither replacement nor generalization can preserve meaning without retaining the identifying detail. 
+ +--- + ## Basic usage ```python From 1bd9a4bfdbf21ff94bbfc72bdb058f1733613107 Mon Sep 17 00:00:00 2001 From: asteier2026 Date: Wed, 13 May 2026 08:17:58 -0700 Subject: [PATCH 2/6] fix: address review feedback on sensitivity --- src/anonymizer/engine/schemas/rewrite.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/anonymizer/engine/schemas/rewrite.py b/src/anonymizer/engine/schemas/rewrite.py index f5d81c17..cc65d79d 100644 --- a/src/anonymizer/engine/schemas/rewrite.py +++ b/src/anonymizer/engine/schemas/rewrite.py @@ -151,6 +151,10 @@ def _validate_protection_consistency(self) -> EntityDispositionSchema: raise ValueError( f"Entity {self.id}: needs_protection=True cannot have protection_method_suggestion='leave_as_is'" ) + if self.combined_risk_level == CombinedRiskLevel.high and not self.needs_protection: + raise ValueError(f"Entity {self.id}: combined_risk_level='high' requires needs_protection=True") + if self.combined_risk_level == CombinedRiskLevel.low and self.needs_protection: + raise ValueError(f"Entity {self.id}: combined_risk_level='low' requires needs_protection=False") return self From 05101182c06585d60efc3ce6392974fa6e9a5825 Mon Sep 17 00:00:00 2001 From: asteier2026 Date: Wed, 13 May 2026 08:43:38 -0700 Subject: [PATCH 3/6] fix: undoing change I made to wrong PR --- src/anonymizer/engine/schemas/rewrite.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/anonymizer/engine/schemas/rewrite.py b/src/anonymizer/engine/schemas/rewrite.py index cc65d79d..f5d81c17 100644 --- a/src/anonymizer/engine/schemas/rewrite.py +++ b/src/anonymizer/engine/schemas/rewrite.py @@ -151,10 +151,6 @@ def _validate_protection_consistency(self) -> EntityDispositionSchema: raise ValueError( f"Entity {self.id}: needs_protection=True cannot have protection_method_suggestion='leave_as_is'" ) - if self.combined_risk_level == CombinedRiskLevel.high and not self.needs_protection: - raise ValueError(f"Entity {self.id}: combined_risk_level='high' requires 
needs_protection=True") - if self.combined_risk_level == CombinedRiskLevel.low and self.needs_protection: - raise ValueError(f"Entity {self.id}: combined_risk_level='low' requires needs_protection=False") return self From 1a8c2a066be6850fe8f9502c8ea327bb0de097f4 Mon Sep 17 00:00:00 2001 From: asteier2026 Date: Wed, 13 May 2026 08:57:45 -0700 Subject: [PATCH 4/6] Update docs/concepts/rewrite.md Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- docs/concepts/rewrite.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/concepts/rewrite.md b/docs/concepts/rewrite.md index 372ecf7c..58e82abe 100644 --- a/docs/concepts/rewrite.md +++ b/docs/concepts/rewrite.md @@ -19,7 +19,13 @@ The text is then rewritten to reduce identifiability, applying targeted transfor **Sensitivity** measures the intrinsic re-identification damage an entity causes if it appears in the output — independently of what else is retained. It is not the protection decision; it feeds the downstream leakage scoring system. -**Protection method** describes how a sensitive entity is transformed. The choice reflects a holistic view of the document — what other entities are being protected and how shapes what each individual entity needs. The general defaults are: direct identifiers are replaced with plausible synthetic alternatives, quasi-identifiers are generalized to a broader form (e.g., an exact date becomes a quarter, a city becomes a region), and latent entities receive `suppress_inference`, meaning the surrounding text is rewritten to remove the cues that enable the inference rather than replacing a stated value. Entities that do not require protection are left as-is. Occasionally an entity is removed outright when neither replacement nor generalization can preserve meaning without retaining the identifying detail. +**Protection method** describes how a sensitive entity is transformed. 
The choice reflects a holistic view of the document — what other entities are being protected and how shapes what each individual entity needs. The general defaults are: + +- **Direct identifiers** are replaced with plausible synthetic alternatives. +- **Quasi-identifiers** are generalized to a broader form (e.g., an exact date becomes a quarter, a city becomes a region). +- **Latent entities** receive `suppress_inference`: the surrounding text is rewritten to remove the cues that enable the inference rather than replacing a stated value. +- **Low-risk entities** that do not require protection are left as-is. +- Occasionally an entity is **removed outright** when neither replacement nor generalization can preserve meaning without retaining the identifying detail. --- From 1e9d449fbd40cf24d1d1ae9a0c7f2e30e89762ce Mon Sep 17 00:00:00 2001 From: asteier2026 Date: Wed, 13 May 2026 09:19:25 -0700 Subject: [PATCH 5/6] fix: add more detail and organization to definitions --- docs/concepts/rewrite.md | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/docs/concepts/rewrite.md b/docs/concepts/rewrite.md index 58e82abe..3462f277 100644 --- a/docs/concepts/rewrite.md +++ b/docs/concepts/rewrite.md @@ -19,13 +19,21 @@ The text is then rewritten to reduce identifiability, applying targeted transfor **Sensitivity** measures the intrinsic re-identification damage an entity causes if it appears in the output — independently of what else is retained. It is not the protection decision; it feeds the downstream leakage scoring system. -**Protection method** describes how a sensitive entity is transformed. The choice reflects a holistic view of the document — what other entities are being protected and how shapes what each individual entity needs. The general defaults are: - -- **Direct identifiers** are replaced with plausible synthetic alternatives. 
-- **Quasi-identifiers** are generalized to a broader form (e.g., an exact date becomes a quarter, a city becomes a region). -- **Latent entities** receive `suppress_inference`: the surrounding text is rewritten to remove the cues that enable the inference rather than replacing a stated value. -- **Low-risk entities** that do not require protection are left as-is. -- Occasionally an entity is **removed outright** when neither replacement nor generalization can preserve meaning without retaining the identifying detail. +| Level | Meaning | Examples | Leakage weight | +|-------|---------|---------|----------------| +| `high` | Exposure alone can identify a person | Names, ID numbers, contact details | 1.0 | +| `medium` | Meaningfully narrows the identity space | Location, occupation, age | 0.6 | +| `low` | Minimal standalone identifying power | Generic attributes, widely shared traits | 0.3 | + +**Protection method** describes how a sensitive entity is transformed. The choice reflects a holistic view of the document — which other entities are being protected, and how, shapes what each individual entity needs. 
+ +| Method | What it does | Typical use | +|--------|-------------|-------------| +| `replace` | Substitutes the entity with a plausible synthetic alternative | Direct identifiers (names, IDs, contact details) | +| `generalize` | Replaces the entity with a broader form | Quasi-identifiers (exact date → quarter, city → region) | +| `suppress_inference` | Rewrites the surrounding text to remove cues that enable the inference | Latent entities that are implied rather than stated | +| `remove` | Deletes the entity entirely | Cases where neither replacement nor generalization can preserve meaning without retaining the identifying detail | +| `leave_as_is` | Leaves the entity unchanged | Entities judged not to require protection in context | --- From 83f27646799e78a3855991e72a23973c562e46ca Mon Sep 17 00:00:00 2001 From: asteier2026 Date: Fri, 15 May 2026 10:44:41 -0700 Subject: [PATCH 6/6] Update docs/concepts/rewrite.md Co-authored-by: lipikaramaswamy <31832945+lipikaramaswamy@users.noreply.github.com> --- docs/concepts/rewrite.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/concepts/rewrite.md b/docs/concepts/rewrite.md index 3462f277..50a540ce 100644 --- a/docs/concepts/rewrite.md +++ b/docs/concepts/rewrite.md @@ -17,7 +17,7 @@ The text is then rewritten to reduce identifiability, applying targeted transfor ## Key concepts -**Sensitivity** measures the intrinsic re-identification damage an entity causes if it appears in the output — independently of what else is retained. It is not the protection decision; it feeds the downstream leakage scoring system. +**Sensitivity** measures the intrinsic re-identification damage an entity causes if it appears in the output — independently of what else is retained. It is not the protection decision itself; rather, it feeds the downstream leakage scoring system. | Level | Meaning | Examples | Leakage weight | |-------|---------|---------|----------------|