Skip to content

Commit ce9ecbb

Browse files
committed
feat: Implement anonymization operators and validation logic
- Added RecognizerResult class for handling analyzer results with score and index management. - Introduced RedactOperator and ReplaceOperator for text anonymization. - Created TextReplaceBuilder for managing text replacements while preserving original indices. - Developed Validators class for input validation mirroring Python validators. - Implemented unit tests for AnonymizerEngine, ConflictResolutionStrategy, and various operators. - Added support for custom operators and conflict resolution strategies in the anonymization process. - Enhanced error handling and validation for operator parameters.
1 parent 1e80158 commit ce9ecbb

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+2361
-130
lines changed

src/ManagedCode.Presidio.Analyzer/ItFiscalCodeRecognizer.cs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,5 +121,3 @@ public sealed class ItFiscalCodeRecognizer(
121121
return expected == control ? true : null;
122122
}
123123
}
124-
using System.Collections.Generic;
125-
using System.Linq;

src/ManagedCode.Presidio.Analyzer/ItVatCodeRecognizer.cs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,4 +72,3 @@ public sealed class ItVatCodeRecognizer(
7272
return c == sanitized[10] - '0';
7373
}
7474
}
75-
using System.Linq;

src/ManagedCode.Presidio.Analyzer/RemoteRecognizer.cs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
using ManagedCode.Presidio.Core;
2+
13
namespace ManagedCode.Presidio.Analyzer;
24

35
/// <summary>
@@ -20,7 +22,7 @@ protected abstract IReadOnlyCollection<RecognizerResult> AnalyzeRemote(
2022
IReadOnlyCollection<string> entities,
2123
NlpArtifacts artifacts);
2224

23-
public override IReadOnlyCollection<string> EnhanceUsingContext(
25+
public override IReadOnlyCollection<RecognizerResult> EnhanceUsingContext(
2426
string text,
2527
IReadOnlyCollection<RecognizerResult> ownResults,
2628
IReadOnlyCollection<RecognizerResult> otherResults,
@@ -40,4 +42,3 @@ protected override IReadOnlyCollection<RecognizerResult> AnalyzeCore(
4042
return AnalyzeRemote(text, entities, artifacts);
4143
}
4244
}
43-
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
using System.Security.Cryptography;
2+
using System.Text;
3+
4+
namespace ManagedCode.Presidio.Anonymizer;
5+
6+
/// <summary>
/// AES helper used for encrypting and decrypting text values.
/// Uses CBC mode with PKCS7 padding; the wire format is
/// <c>IV || ciphertext</c> encoded as unpadded URL-safe Base64.
/// </summary>
internal static class AesCipher
{
    /// <summary>
    /// Encrypts <paramref name="text"/> (UTF-8) under <paramref name="key"/> with a freshly generated IV.
    /// </summary>
    /// <param name="key">AES key; must be 128, 192 or 256 bits long.</param>
    /// <param name="text">Plain text; <c>null</c> is treated as an empty string.</param>
    /// <returns>URL-safe Base64 (no '=' padding) of the IV followed by the ciphertext.</returns>
    /// <exception cref="InvalidParamException">The key has an unsupported size.</exception>
    public static string Encrypt(byte[] key, string text)
    {
        using var aes = CreateAes(key);
        aes.GenerateIV();

        // The IV getter hands out a copy on every access, so read it exactly once.
        var iv = aes.IV;

        using var encryptor = aes.CreateEncryptor();
        var clearBytes = Encoding.UTF8.GetBytes(text ?? string.Empty);
        var cipherBytes = encryptor.TransformFinalBlock(clearBytes, 0, clearBytes.Length);

        // Prefix the ciphertext with the IV so Decrypt can recover it.
        var envelope = new byte[iv.Length + cipherBytes.Length];
        iv.CopyTo(envelope, 0);
        cipherBytes.CopyTo(envelope, iv.Length);

        return ToBase64Url(envelope);
    }

    /// <summary>
    /// Reverses <see cref="Encrypt"/>: decodes the URL-safe Base64 payload, splits off the
    /// leading IV and decrypts the remainder as UTF-8 text.
    /// </summary>
    /// <param name="key">AES key; must be 128, 192 or 256 bits long.</param>
    /// <param name="text">Payload produced by <see cref="Encrypt"/>; <c>null</c> is treated as empty.</param>
    /// <returns>The decrypted plain text.</returns>
    /// <exception cref="InvalidParamException">
    /// The key has an unsupported size, or the decoded payload is shorter than one IV.
    /// </exception>
    public static string Decrypt(byte[] key, string text)
    {
        // Key validation happens first, before the payload is even decoded.
        using var aes = CreateAes(key);
        var envelope = FromBase64Url(text ?? string.Empty);

        var ivLength = aes.BlockSize / 8;
        if (envelope.Length < ivLength)
        {
            throw new InvalidParamException("Ciphertext payload is too small");
        }

        aes.IV = envelope[..ivLength];
        var cipherBytes = envelope[ivLength..];

        using var decryptor = aes.CreateDecryptor();
        var clearBytes = decryptor.TransformFinalBlock(cipherBytes, 0, cipherBytes.Length);
        return Encoding.UTF8.GetString(clearBytes);
    }

    /// <summary>
    /// Tells whether <paramref name="key"/> is non-null and exactly 128, 192 or 256 bits long.
    /// </summary>
    public static bool IsValidKeySize(byte[] key)
    {
        return key is not null && (key.Length * 8) is 128 or 192 or 256;
    }

    /// <summary>
    /// Builds an AES instance (CBC, PKCS7) for <paramref name="key"/>; the caller owns and disposes it.
    /// </summary>
    /// <exception cref="InvalidParamException">The key has an unsupported size.</exception>
    private static Aes CreateAes(byte[] key)
    {
        if (IsValidKeySize(key))
        {
            var aes = Aes.Create();
            aes.Key = key;
            aes.Mode = CipherMode.CBC;
            aes.Padding = PaddingMode.PKCS7;
            return aes;
        }

        throw new InvalidParamException("Invalid input, key must be of length 128, 192 or 256 bits");
    }

    /// <summary>
    /// Standard Base64 with '+' and '/' swapped for '-' and '_' and the trailing '=' padding removed.
    /// </summary>
    private static string ToBase64Url(byte[] payload)
    {
        return Convert.ToBase64String(payload)
            .TrimEnd('=')
            .Replace('+', '-')
            .Replace('/', '_');
    }

    /// <summary>
    /// Inverse of <see cref="ToBase64Url"/>: restores the standard alphabet and re-adds the
    /// '=' padding needed to reach a multiple of four characters before decoding.
    /// </summary>
    private static byte[] FromBase64Url(string text)
    {
        var standard = text.Replace('-', '+').Replace('_', '/');

        var remainder = standard.Length % 4;
        if (remainder != 0)
        {
            standard += new string('=', 4 - remainder);
        }

        return Convert.FromBase64String(standard);
    }
}
Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
using System.Text.RegularExpressions;
2+
using CoreRecognizerResult = ManagedCode.Presidio.Core.RecognizerResult;
3+
4+
namespace ManagedCode.Presidio.Anonymizer;
5+
6+
/// <summary>
/// Anonymization entry point: validates analyzer findings against the text, resolves
/// overlapping findings, and passes the surviving entities to the operator pipeline
/// exposed by <see cref="EngineBase"/>.
/// </summary>
public sealed class AnonymizerEngine : EngineBase
{
    // Matches a string made of one or more literal space characters and nothing else.
    private static readonly Regex WhitespaceRegex = new Regex("^( )+$", RegexOptions.Compiled);

    /// <summary>
    /// Anonymizes <paramref name="text"/> according to the supplied analyzer findings.
    /// </summary>
    /// <param name="text">Text to anonymize; must not be <c>null</c>.</param>
    /// <param name="analyzerResults">Findings to act on; <c>null</c> is treated as "no findings".</param>
    /// <param name="operators">
    /// Operator configuration keyed by name. A "DEFAULT" entry using the "replace" operator
    /// is added when the caller did not provide one.
    /// </param>
    /// <param name="conflictResolution">How overlapping findings are reconciled.</param>
    /// <returns>The result produced by the base engine's <c>Operate</c> call.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidParamException">A finding's start or end lies outside the text.</exception>
    public EngineResult Anonymize(
        string text,
        IReadOnlyCollection<CoreRecognizerResult>? analyzerResults,
        IDictionary<string, OperatorConfig>? operators = null,
        ConflictResolutionStrategy conflictResolution = ConflictResolutionStrategy.MergeSimilarOrContained)
    {
        ArgumentNullException.ThrowIfNull(text);

        IReadOnlyCollection<CoreRecognizerResult> findings = analyzerResults ?? Array.Empty<CoreRecognizerResult>();
        ValidateIndices(text, findings);

        var working = CopyRecognizerResults(findings);
        var hasFindings = working.Count != 0;

        if (hasFindings)
        {
            // Sort, collapse conflicts, then join same-type neighbours separated only by spaces.
            working.Sort();
            working = RemoveConflictsAndGetTextManipulationData(working, conflictResolution);
            working = MergeEntitiesWithWhitespaceBetween(text, working);
        }

        var operatorMap = EnsureDefaultOperator(operators);
        PiiEntity[] entities = hasFindings
            ? working.Cast<PiiEntity>().ToArray()
            : Array.Empty<PiiEntity>();

        return Operate(text, entities, operatorMap, OperatorType.Anonymize);
    }

    /// <summary>Returns the keys of the anonymizers currently known to the operators factory.</summary>
    public IReadOnlyCollection<string> GetAnonymizers()
    {
        var registered = OperatorsFactory.GetAnonymizers();
        return registered.Keys.ToArray();
    }

    /// <summary>Registers the operator type <typeparamref name="T"/> as an anonymizer.</summary>
    public void AddAnonymizer<T>() where T : Operator, new()
    {
        OperatorsFactory.AddAnonymizeOperator(typeof(T));
    }

    /// <summary>Registers <paramref name="operatorType"/> as an anonymizer.</summary>
    public void AddAnonymizer(Type operatorType)
    {
        OperatorsFactory.AddAnonymizeOperator(operatorType);
    }

    /// <summary>Unregisters <paramref name="operatorType"/> from the anonymizers.</summary>
    public void RemoveAnonymizer(Type operatorType)
    {
        OperatorsFactory.RemoveAnonymizeOperator(operatorType);
    }

    /// <summary>
    /// Ensures every finding's start and end fall within <c>[0, text.Length]</c>.
    /// </summary>
    /// <exception cref="InvalidParamException">A finding points outside the text.</exception>
    private static void ValidateIndices(string text, IEnumerable<CoreRecognizerResult> analyzerResults)
    {
        foreach (var finding in analyzerResults)
        {
            var startInside = finding.Start >= 0 && finding.Start <= text.Length;
            var endInside = finding.End >= 0 && finding.End <= text.Length;
            if (startInside && endInside)
            {
                continue;
            }

            throw new InvalidParamException(
                $"Invalid analyzer result, start: {finding.Start} and end: {finding.End}, while text length is only {text.Length}.");
        }
    }

    /// <summary>
    /// Converts the analyzer's results into this assembly's own result type so the
    /// later steps can mutate spans and scores without touching the caller's objects.
    /// </summary>
    private static List<RecognizerResult> CopyRecognizerResults(IReadOnlyCollection<CoreRecognizerResult> analyzerResults)
    {
        var copies = new List<RecognizerResult>(analyzerResults.Count);
        foreach (var source in analyzerResults)
        {
            var copy = RecognizerResult.FromCore(source);
            copies.Add(copy);
        }

        return copies;
    }

    /// <summary>
    /// Reduces the findings to a conflict-free list in up to three passes:
    /// (1) intersecting findings of the same entity type are folded into one,
    /// (2) findings reported as conflicting by <c>HasConflict</c> are dropped,
    /// (3) for <see cref="ConflictResolutionStrategy.RemoveIntersections"/> only, remaining
    /// overlaps are trimmed or removed in favour of the higher score.
    /// </summary>
    private static List<RecognizerResult> RemoveConflictsAndGetTextManipulationData(
        List<RecognizerResult> analyzerResults,
        ConflictResolutionStrategy conflictResolution)
    {
        // Pass 1: fold each finding into the first same-type finding it intersects.
        var survivors = new List<RecognizerResult>();
        var pool = new List<RecognizerResult>(analyzerResults);

        foreach (var candidate in analyzerResults)
        {
            pool.Remove(candidate);

            var absorber = pool.FirstOrDefault(other =>
                string.Equals(candidate.EntityType, other.EntityType, StringComparison.Ordinal) &&
                candidate.Intersects(other) != 0);

            if (absorber is null)
            {
                pool.Add(candidate);
                survivors.Add(candidate);
                continue;
            }

            // The absorbing finding grows to cover both spans and keeps the better score.
            absorber.UpdateSpan(Math.Min(candidate.Start, absorber.Start), Math.Max(candidate.End, absorber.End));
            absorber.UpdateScore(Math.Max(candidate.Score, absorber.Score));
        }

        // Pass 2: keep only findings that do not conflict with the ones still in the pool.
        var unique = new List<RecognizerResult>();
        pool = new List<RecognizerResult>(survivors);

        foreach (var candidate in survivors)
        {
            pool.Remove(candidate);
            if (IsResultConflicted(pool, candidate))
            {
                continue;
            }

            pool.Add(candidate);
            unique.Add(candidate);
        }

        // Pass 3 (optional): cut away any remaining overlap between neighbours.
        return conflictResolution == ConflictResolutionStrategy.RemoveIntersections
            ? TrimIntersections(unique)
            : unique;

        static List<RecognizerResult> TrimIntersections(List<RecognizerResult> elements)
        {
            Comparison<RecognizerResult> byStart = (left, right) => left.Start.CompareTo(right.Start);

            elements.Sort(byStart);
            var cursor = 0;
            while (cursor < elements.Count - 1)
            {
                var current = elements[cursor];
                var next = elements[cursor + 1];

                // No overlap with the neighbour: move on.
                if (current.End <= next.Start)
                {
                    cursor += 1;
                    continue;
                }

                if (current.Score >= next.Score)
                {
                    // Neighbour is fully covered by the stronger finding: drop it.
                    if (current.End >= next.End)
                    {
                        elements.RemoveAt(cursor + 1);
                        continue;
                    }

                    next.UpdateSpan(current.End, next.End);
                }
                else
                {
                    // Current would be trimmed to nothing by the stronger neighbour: drop it.
                    if (next.Start <= current.Start)
                    {
                        elements.RemoveAt(cursor);
                        continue;
                    }

                    current.UpdateSpan(current.Start, next.Start);
                }

                // A span changed, so re-establish ordering and rescan from the beginning.
                elements.Sort(byStart);
                cursor = 0;
            }

            return elements
                .Where(element => element.Start <= element.End)
                .ToList();
        }
    }

    /// <summary>
    /// Joins consecutive findings of the same entity type when the text between them
    /// consists solely of space characters; the later finding is extended backwards and
    /// the earlier one is removed.
    /// </summary>
    private static List<RecognizerResult> MergeEntitiesWithWhitespaceBetween(
        string text,
        List<RecognizerResult> analyzerResults)
    {
        var merged = new List<RecognizerResult>();
        RecognizerResult? last = null;

        foreach (var current in analyzerResults)
        {
            if (last is { } prior &&
                string.Equals(prior.EntityType, current.EntityType, StringComparison.Ordinal) &&
                IsWhitespace(text, prior.End, current.Start))
            {
                merged.Remove(prior);
                current.UpdateSpan(prior.Start, current.End);
            }

            merged.Add(current);
            last = current;
        }

        return merged;
    }

    /// <summary>
    /// True when <c>text[start..end]</c> is a valid, non-empty range that matches
    /// <see cref="WhitespaceRegex"/>; false for empty, inverted or out-of-range bounds.
    /// </summary>
    private static bool IsWhitespace(string text, int start, int end)
    {
        var validRange = start >= 0 && end <= text.Length && end > start;
        return validRange && WhitespaceRegex.IsMatch(text[start..end]);
    }

    /// <summary>
    /// True when <paramref name="candidate"/> reports a conflict with at least one of <paramref name="others"/>.
    /// </summary>
    private static bool IsResultConflicted(IEnumerable<RecognizerResult> others, RecognizerResult candidate)
    {
        return others.Any(other => candidate.HasConflict(other));
    }

    /// <summary>
    /// Copies the caller's operator configuration into a new ordinal-keyed dictionary and
    /// guarantees a "DEFAULT" entry (the "replace" operator) is present.
    /// </summary>
    private static Dictionary<string, OperatorConfig> EnsureDefaultOperator(IDictionary<string, OperatorConfig>? operators)
    {
        var configured = new Dictionary<string, OperatorConfig>(StringComparer.Ordinal);

        foreach (var (name, config) in operators ?? Enumerable.Empty<KeyValuePair<string, OperatorConfig>>())
        {
            // Indexer assignment: a later duplicate key overwrites an earlier one.
            configured[name] = config;
        }

        if (!configured.ContainsKey("DEFAULT"))
        {
            configured["DEFAULT"] = new OperatorConfig("replace");
        }

        return configured;
    }
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
using System.Runtime.CompilerServices;
2+
3+
// Grants the ManagedCode.Presidio.Anonymizer.Tests assembly access to this assembly's internal members.
[assembly: InternalsVisibleTo("ManagedCode.Presidio.Anonymizer.Tests")]
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
namespace ManagedCode.Presidio.Anonymizer;
2+
3+
/// <summary>
/// Selects how overlapping analyzer findings are reconciled before anonymization.
/// </summary>
public enum ConflictResolutionStrategy
{
    /// <summary>
    /// First member (value 0), so it is also the enum's default value.
    /// </summary>
    MergeSimilarOrContained = 0,

    /// <summary>
    /// Second member (value 1). NOTE(review): presumably trims or drops findings that
    /// still overlap after merging — confirm against the engine that consumes this value.
    /// </summary>
    RemoveIntersections = 1,
}

0 commit comments

Comments
 (0)