Skip to content

Commit 1e80158

Browse files
committed
more work
1 parent e12354c commit 1e80158

94 files changed

Lines changed: 4644 additions & 268 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

AGENTS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@ Update guidelines:
3838
- for Presidio analyzer tests, NEVER add stubbed recognizer tests; port the Python scenarios to exercise the real analyzer pipeline end-to-end
3939
- for Presidio analyzer parity work, keep iterating without pausing for confirmation and focus solely on integration tests that validate real functionality
4040
- for Presidio migration tasks, do not stop to ask the user for clarification mid-task; follow the migration plan and deliver completed work
41+
- for Presidio migration tasks, when the user says "продовжити"/"continue", proceed through the target file step by step without asking for additional confirmation
42+
- for Presidio migration tasks, when you see a way to improve something, note the idea in the working file and then implement it without waiting for user approval
43+
- for Presidio migration tasks, default to continuing the migration workflow without waiting for "продовжити"/"continue"; halt only if the user explicitly redirects
44+
- for Presidio migration tasks, when the user specifies an execution order for follow-up work, honor that sequence without reconfirming and keep progressing task-by-task
45+
- for Presidio migration tasks, capture any important follow-up items directly in the working file as TODOs so they are not lost
4146
- for Presidio test work, ALWAYS include negative/error scenarios alongside positive cases to validate failure paths
4247
- for Presidio recognizer coverage, ensure EU social security numbers are handled alongside US SSN patterns
4348
- use enums and constants over magic strings and numbers

Directory.Packages.props

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,7 @@
1010
<PackageVersion Include="Microsoft.ML.OnnxRuntime" Version="1.19.2" />
1111
<PackageVersion Include="Microsoft.ML.Tokenizers" Version="1.0.3" />
1212
<PackageVersion Include="libphonenumber-csharp" Version="8.13.36" />
13+
<PackageVersion Include="YamlDotNet" Version="15.1.2" />
14+
<PackageVersion Include="Shouldly" Version="4.2.1" />
1315
</ItemGroup>
1416
</Project>

PLAN-MIGRATION.md

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ This document tracks parity work between `external/microsoft-presidio` (Python)
1515
| `EntityRecognizer` base | `presidio_analyzer/entity_recognizer.py` || Ported as `EntityRecognizer` (C#) |
1616
| `Pattern` helper | `predefined_recognizers/generic/pattern_recognizer.py` || Implemented as `Pattern` (C#) |
1717
| `PatternRecognizer` | same || Implemented with regex caching & validation hooks |
18-
| `LocalRecognizer` / remote base classes | `local_recognizer.py`, `remote_recognizer.py` | 🚧 | Not yet required; evaluate once we port remote recognizers |
19-
| `RecognizerRegistry` & provider | `recognizer_registry.py` | 🚧 | Basic functionality present; needs YAML loader + configuration parity |
18+
| `LocalRecognizer` / remote base classes | `local_recognizer.py`, `remote_recognizer.py` | | Ported as `LocalRecognizer`/`RemoteRecognizer` with unit coverage |
19+
| `RecognizerRegistry` & provider | `recognizer_registry.py` | | YAML-backed loader + reflective instantiation aligned with Python |
2020
| NLP engines (spaCy, transformers, etc.) | `nlp_engine/` | 🚧 | Only ONNX pipeline ported (`OnnxNlpEngine`) |
2121

2222
## Generic Recognizers
@@ -37,17 +37,17 @@ This document tracks parity work between `external/microsoft-presidio` (Python)
3737

3838
| Country | Python Class | Status |
3939
| --- | --- | --- |
40-
| Australia | `AuAbnRecognizer`, `AuAcnRecognizer`, `AuMedicareRecognizer`, `AuTfnRecognizer` | 🚧 |
40+
| Australia | `AuAbnRecognizer`, `AuAcnRecognizer`, `AuMedicareRecognizer`, `AuTfnRecognizer` | |
4141
| Finland | `FiPersonalIdentityCodeRecognizer` ||
42-
| India | `InAadhaarRecognizer`, `InGstinRecognizer`, `InPanRecognizer`, `InPassportRecognizer`, `InVehicleRegistrationRecognizer`, `InVoterRecognizer` | 🚧 |
43-
| Italy | `ItDriverLicenseRecognizer`, `ItFiscalCodeRecognizer`, `ItIdentityCardRecognizer`, `ItPassportRecognizer`, `ItVatCodeRecognizer` | 🚧 |
44-
| Korea | `KrRrnRecognizer` | 🚧 |
42+
| India | `InAadhaarRecognizer`, `InGstinRecognizer`, `InPanRecognizer`, `InPassportRecognizer`, `InVehicleRegistrationRecognizer`, `InVoterRecognizer` | |
43+
| Italy | `ItDriverLicenseRecognizer`, `ItFiscalCodeRecognizer`, `ItIdentityCardRecognizer`, `ItPassportRecognizer`, `ItVatCodeRecognizer` | |
44+
| Korea | `KrRrnRecognizer` | |
4545
| Poland | `PlPeselRecognizer` ||
46-
| Singapore | `SgFinRecognizer`, `SgUenRecognizer` | 🚧 |
47-
| Spain | `EsNieRecognizer`, `EsNifRecognizer` | 🚧 |
48-
| Thailand | `ThTninRecognizer` | 🚧 |
49-
| UK | `NhsRecognizer`, `UkNinoRecognizer` | 🚧 |
50-
| US | `MedicalLicenseRecognizer`, `UsBankRecognizer`, `UsLicenseRecognizer`, `UsItinRecognizer`, `UsPassportRecognizer` (🚧), `UsSsnRecognizer` (✅) | 🚧 |
46+
| Singapore | `SgFinRecognizer`, `SgUenRecognizer` | |
47+
| Spain | `EsNieRecognizer`, `EsNifRecognizer` | |
48+
| Thailand | `ThTninRecognizer` | |
49+
| UK | `NhsRecognizer`, `UkNinoRecognizer` | |
50+
| US | `MedicalLicenseRecognizer`, `UsBankRecognizer`, `UsLicenseRecognizer`, `UsItinRecognizer`, `UsPassportRecognizer`, `UsSsnRecognizer` | |
5151

5252
## NLP Engine Recognizers
5353

@@ -67,7 +67,6 @@ This document tracks parity work between `external/microsoft-presidio` (Python)
6767

6868
## Next Actions
6969

70-
- Begin porting country-specific recognizers (prioritize high-demand markets) now that the generic suite is complete in C#.
71-
- Triage country-specific recognizers based on customer demand.
72-
- Flesh out registry configuration loading (YAML) for parity with Python.
73-
- Plan for NLP engine parity (spaCy/Stanza/Transformers) or replacements.
70+
- Continue porting any remaining country-specific recognizers not yet covered (e.g., Australia-specific business identifiers beyond the current scope and additional EU IDs).
71+
- Prioritize the recognizer backlog based on customer demand, and add coverage tests alongside each port.
72+
- Implement .NET equivalents for spaCy/Stanza/Transformers NLP engines or design alternative pipelines that meet parity guarantees.

src/ManagedCode.Presidio.Analyzer/AnalyzerEngine.cs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,8 @@ public AnalyzerEngine(
3838

3939
if (registry is null)
4040
{
41-
var provider = new RecognizerRegistryProvider(
42-
new RecognizerRegistryConfiguration(_supportedLanguages));
43-
_registry = provider.CreateRecognizerRegistry();
44-
_registry.AddNlpRecognizer(_nlpEngine);
41+
var provider = new RecognizerRegistryProvider();
42+
_registry = provider.CreateRecognizerRegistry(_nlpEngine, _supportedLanguages);
4543
}
4644
else
4745
{
@@ -54,8 +52,6 @@ public AnalyzerEngine(
5452
_registry = registry;
5553
}
5654

57-
_registry.LoadPredefinedRecognizers(_nlpEngine, _supportedLanguages);
58-
5955
_contextAwareEnhancer = contextAwareEnhancer ?? new LemmaContextAwareEnhancer();
6056
}
6157

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
using System.Runtime.CompilerServices;
2+
3+
[assembly: InternalsVisibleTo("ManagedCode.Presidio.Analyzer.Tests")]
4+
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
namespace ManagedCode.Presidio.Analyzer;
2+
3+
/// <summary>
/// Recognizes Australian Business Numbers (ABN) using canonical patterns and checksum validation.
/// </summary>
/// <param name="patterns">Patterns to match; defaults to the spaced (2-3-3-3) and contiguous 11-digit forms.</param>
/// <param name="context">Context words handed to the base recognizer; defaults to "australian business number" and "abn".</param>
/// <param name="supportedLanguage">Language code this recognizer is registered for.</param>
/// <param name="supportedEntity">Entity name reported for matches.</param>
/// <param name="replacementPairs">Search/replacement pairs applied to a match before checksum validation; defaults to stripping hyphens and spaces.</param>
public sealed class AuAbnRecognizer(
    IEnumerable<Pattern>? patterns = null,
    IEnumerable<string>? context = null,
    string supportedLanguage = "en",
    string supportedEntity = "AU_ABN",
    IEnumerable<(string Search, string Replacement)>? replacementPairs = null) : PatternRecognizer(
        supportedEntity,
        patterns ?? DefaultPatterns,
        context: context ?? DefaultContext,
        supportedLanguage: supportedLanguage)
{
    /// <summary>Modulus the weighted digit sum must be divisible by.</summary>
    private const int ChecksumModulus = 89;

    private static readonly Pattern[] DefaultPatterns =
    {
        new("ABN (Medium)", @"\b\d{2}\s\d{3}\s\d{3}\s\d{3}\b", 0.1),
        new("ABN (Low)", @"\b\d{11}\b", 0.01),
    };

    private static readonly string[] DefaultContext =
    {
        "australian business number",
        "abn",
    };

    /// <summary>Per-position checksum weights; the array length is also the required digit count (11).</summary>
    private static readonly int[] Weights = { 10, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19 };

    private readonly IReadOnlyList<(string Search, string Replacement)> _replacementPairs =
        replacementPairs?.ToArray() ?? new[]
        {
            ("-", string.Empty),
            (" ", string.Empty),
        };

    /// <summary>
    /// Validates a matched candidate with the weighted ABN checksum.
    /// </summary>
    /// <param name="patternText">Raw text captured by one of the patterns.</param>
    /// <returns>
    /// <c>true</c> when the sanitized value consists of exactly 11 ASCII digits whose weighted sum
    /// (after adjusting the first digit) is divisible by 89; otherwise <c>false</c>.
    /// </returns>
    protected override bool? ValidateResult(string patternText)
    {
        if (string.IsNullOrWhiteSpace(patternText))
        {
            return false;
        }

        var sanitized = EntityRecognizer.SanitizeValue(patternText, _replacementPairs);
        if (sanitized.Length != Weights.Length)
        {
            return false;
        }

        var sum = 0;
        for (var i = 0; i < sanitized.Length; i++)
        {
            var ch = sanitized[i];

            // Only ASCII digits are accepted: char.IsDigit also returns true for other Unicode
            // decimal digits, for which (ch - '0') is not the digit's numeric value.
            if (ch is < '0' or > '9')
            {
                return false;
            }

            var digit = ch - '0';
            if (i == 0)
            {
                // The first digit is reduced by one before weighting; a leading 0 is mapped to 9
                // instead of going negative.
                digit = digit == 0 ? 9 : digit - 1;
            }

            sum += digit * Weights[i];
        }

        return sum % ChecksumModulus == 0;
    }
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
namespace ManagedCode.Presidio.Analyzer;
2+
3+
/// <summary>
/// Recognizes Australian Company Numbers (ACN) with checksum validation.
/// </summary>
/// <param name="patterns">Patterns to match; defaults to the spaced (3-3-3) and contiguous 9-digit forms.</param>
/// <param name="context">Context words handed to the base recognizer; defaults to "australian company number" and "acn".</param>
/// <param name="supportedLanguage">Language code this recognizer is registered for.</param>
/// <param name="supportedEntity">Entity name reported for matches.</param>
/// <param name="replacementPairs">Search/replacement pairs applied to a match before checksum validation; defaults to stripping hyphens and spaces.</param>
public sealed class AuAcnRecognizer(
    IEnumerable<Pattern>? patterns = null,
    IEnumerable<string>? context = null,
    string supportedLanguage = "en",
    string supportedEntity = "AU_ACN",
    IEnumerable<(string Search, string Replacement)>? replacementPairs = null) : PatternRecognizer(
        supportedEntity,
        patterns ?? DefaultPatterns,
        context: context ?? DefaultContext,
        supportedLanguage: supportedLanguage)
{
    /// <summary>Total number of digits: eight weighted digits followed by one check digit.</summary>
    private const int DigitCount = 9;

    private static readonly Pattern[] DefaultPatterns =
    {
        new("ACN (Medium)", @"\b\d{3}\s\d{3}\s\d{3}\b", 0.1),
        new("ACN (Low)", @"\b\d{9}\b", 0.01),
    };

    private static readonly string[] DefaultContext =
    {
        "australian company number",
        "acn",
    };

    /// <summary>Weights applied to the first eight digits.</summary>
    private static readonly int[] Weights = { 8, 7, 6, 5, 4, 3, 2, 1 };

    private readonly IReadOnlyList<(string Search, string Replacement)> _replacementPairs =
        replacementPairs?.ToArray() ?? new[]
        {
            ("-", string.Empty),
            (" ", string.Empty),
        };

    /// <summary>
    /// Validates a matched candidate by recomputing its check digit.
    /// </summary>
    /// <param name="patternText">Raw text captured by one of the patterns.</param>
    /// <returns>
    /// <c>true</c> when the sanitized value consists of exactly 9 ASCII digits and the last digit
    /// equals the check digit derived from the first eight; otherwise <c>false</c>.
    /// </returns>
    protected override bool? ValidateResult(string patternText)
    {
        if (string.IsNullOrWhiteSpace(patternText))
        {
            return false;
        }

        var sanitized = EntityRecognizer.SanitizeValue(patternText, _replacementPairs);
        if (sanitized.Length != DigitCount)
        {
            return false;
        }

        Span<int> digits = stackalloc int[DigitCount];
        for (var i = 0; i < sanitized.Length; i++)
        {
            var ch = sanitized[i];

            // Only ASCII digits are accepted: char.IsDigit also returns true for other Unicode
            // decimal digits, for which (ch - '0') is not the digit's numeric value.
            if (ch is < '0' or > '9')
            {
                return false;
            }

            digits[i] = ch - '0';
        }

        var sum = 0;
        for (var i = 0; i < Weights.Length; i++)
        {
            sum += digits[i] * Weights[i];
        }

        // The outer "% 10" turns a complement of 10 (weighted sum divisible by 10) into check digit 0.
        var checkDigit = (10 - (sum % 10)) % 10;
        return checkDigit == digits[^1];
    }
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
namespace ManagedCode.Presidio.Analyzer;
2+
3+
/// <summary>
/// Recognizes Australian Medicare numbers using checksum validation.
/// </summary>
/// <param name="patterns">Patterns to match; defaults to the spaced (4-5-1) and contiguous 10-digit forms, both starting with a digit from 2 to 6.</param>
/// <param name="context">Context words handed to the base recognizer; defaults to "medicare" and "australian medicare".</param>
/// <param name="supportedLanguage">Language code this recognizer is registered for.</param>
/// <param name="supportedEntity">Entity name reported for matches.</param>
/// <param name="replacementPairs">Search/replacement pairs applied to a match before checksum validation; defaults to stripping hyphens and spaces.</param>
public sealed class AuMedicareRecognizer(
    IEnumerable<Pattern>? patterns = null,
    IEnumerable<string>? context = null,
    string supportedLanguage = "en",
    string supportedEntity = "AU_MEDICARE",
    IEnumerable<(string Search, string Replacement)>? replacementPairs = null) : PatternRecognizer(
        supportedEntity,
        patterns ?? DefaultPatterns,
        context: context ?? DefaultContext,
        supportedLanguage: supportedLanguage)
{
    /// <summary>Total number of digits in a sanitized Medicare number.</summary>
    private const int DigitCount = 10;

    /// <summary>Zero-based position of the check digit (the ninth digit).</summary>
    private const int CheckDigitIndex = 8;

    // Medicare card numbers begin with a digit in the range 2-6, so the default patterns
    // restrict the first digit instead of accepting any \d.
    private static readonly Pattern[] DefaultPatterns =
    {
        new("Medicare (Medium)", @"\b[2-6]\d{3}\s\d{5}\s\d\b", 0.1),
        new("Medicare (Low)", @"\b[2-6]\d{9}\b", 0.01),
    };

    private static readonly string[] DefaultContext =
    {
        "medicare",
        "australian medicare",
    };

    /// <summary>Weights applied to the first eight digits.</summary>
    private static readonly int[] Weights = { 1, 3, 7, 9, 1, 3, 7, 9 };

    private readonly IReadOnlyList<(string Search, string Replacement)> _replacementPairs =
        replacementPairs?.ToArray() ?? new[]
        {
            ("-", string.Empty),
            (" ", string.Empty),
        };

    /// <summary>
    /// Validates a matched candidate by recomputing its check digit.
    /// </summary>
    /// <param name="patternText">Raw text captured by one of the patterns.</param>
    /// <returns>
    /// <c>true</c> when the sanitized value consists of exactly 10 ASCII digits and the ninth digit
    /// equals the weighted sum of the first eight modulo 10; otherwise <c>false</c>.
    /// The tenth digit takes no part in the checksum.
    /// </returns>
    protected override bool? ValidateResult(string patternText)
    {
        if (string.IsNullOrWhiteSpace(patternText))
        {
            return false;
        }

        var sanitized = EntityRecognizer.SanitizeValue(patternText, _replacementPairs);
        if (sanitized.Length != DigitCount)
        {
            return false;
        }

        Span<int> digits = stackalloc int[DigitCount];
        for (var i = 0; i < sanitized.Length; i++)
        {
            var ch = sanitized[i];

            // Only ASCII digits are accepted: char.IsDigit also returns true for other Unicode
            // decimal digits, for which (ch - '0') is not the digit's numeric value.
            if (ch is < '0' or > '9')
            {
                return false;
            }

            digits[i] = ch - '0';
        }

        var sum = 0;
        for (var i = 0; i < Weights.Length; i++)
        {
            sum += digits[i] * Weights[i];
        }

        return sum % 10 == digits[CheckDigitIndex];
    }
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
namespace ManagedCode.Presidio.Analyzer;
2+
3+
/// <summary>
/// Recognizes Australian Tax File Numbers (TFN) with checksum validation.
/// </summary>
/// <param name="patterns">Patterns to match; defaults to the spaced (3-3-3) and contiguous 9-digit forms.</param>
/// <param name="context">Context words handed to the base recognizer; defaults to "tax file number" and "tfn".</param>
/// <param name="supportedLanguage">Language code this recognizer is registered for.</param>
/// <param name="supportedEntity">Entity name reported for matches.</param>
/// <param name="replacementPairs">Search/replacement pairs applied to a match before checksum validation; defaults to stripping hyphens and spaces.</param>
public sealed class AuTfnRecognizer(
    IEnumerable<Pattern>? patterns = null,
    IEnumerable<string>? context = null,
    string supportedLanguage = "en",
    string supportedEntity = "AU_TFN",
    IEnumerable<(string Search, string Replacement)>? replacementPairs = null) : PatternRecognizer(
        supportedEntity,
        patterns ?? DefaultPatterns,
        context: context ?? DefaultContext,
        supportedLanguage: supportedLanguage)
{
    /// <summary>Modulus the weighted digit sum must be divisible by.</summary>
    private const int ChecksumModulus = 11;

    private static readonly Pattern[] DefaultPatterns =
    {
        new("TFN (Medium)", @"\b\d{3}\s\d{3}\s\d{3}\b", 0.1),
        new("TFN (Low)", @"\b\d{9}\b", 0.01),
    };

    private static readonly string[] DefaultContext =
    {
        "tax file number",
        "tfn",
    };

    /// <summary>Per-position checksum weights; the array length is also the required digit count (9).</summary>
    private static readonly int[] Weights = { 1, 4, 3, 7, 5, 8, 6, 9, 10 };

    private readonly IReadOnlyList<(string Search, string Replacement)> _replacementPairs =
        replacementPairs?.ToArray() ?? new[]
        {
            ("-", string.Empty),
            (" ", string.Empty),
        };

    /// <summary>
    /// Validates a matched candidate with the weighted TFN checksum.
    /// </summary>
    /// <param name="patternText">Raw text captured by one of the patterns.</param>
    /// <returns>
    /// <c>true</c> when the sanitized value consists of exactly 9 ASCII digits whose weighted sum
    /// is divisible by 11; otherwise <c>false</c>.
    /// </returns>
    protected override bool? ValidateResult(string patternText)
    {
        if (string.IsNullOrWhiteSpace(patternText))
        {
            return false;
        }

        var sanitized = EntityRecognizer.SanitizeValue(patternText, _replacementPairs);
        if (sanitized.Length != Weights.Length)
        {
            return false;
        }

        var sum = 0;
        for (var i = 0; i < sanitized.Length; i++)
        {
            var ch = sanitized[i];

            // Only ASCII digits are accepted: char.IsDigit also returns true for other Unicode
            // decimal digits, for which (ch - '0') is not the digit's numeric value.
            if (ch is < '0' or > '9')
            {
                return false;
            }

            sum += (ch - '0') * Weights[i];
        }

        return sum % ChecksumModulus == 0;
    }
}

0 commit comments

Comments
 (0)