33import threading
44from collections import defaultdict
55import logging
6- from typing import Dict , List , Optional , Union
6+ from typing import Dict , List , Optional , Sequence , Union
77from pathlib import Path
88
99import onnxruntime as ort
1010
1111from presidio_anonymizer import AnonymizerEngine
1212from presidio_anonymizer .entities import RecognizerResult
1313
14+ from typing import NamedTuple as _NamedTuple
15+
1416from sct .utils import constants
1517from sct .utils .anonymization_map import AnonymizationMap
1618from sct .utils .onnx_pipeline import load_onnx_ner_model
1719from sct .config import DEFAULT_NER_MODELS , DEFAULT_NER_ENSEMBLE , NER_ENSEMBLE_DEFAULT_KEYS , LANG_KEYS
1820
21+
class AnonymizeResult(_NamedTuple):
    """Result of text anonymization: a lightweight, immutable typed container.

    Every anonymization path in this module returns an instance of this class
    so callers can read ``.text`` regardless of the replacement mode used.

    ``anon_map`` is optional and defaults to ``None``, so existing
    ``AnonymizeResult(text=...)`` call sites keep working unchanged. It exists
    so a reversible anonymization result can carry its placeholder map, as the
    previous ad-hoc result object did.
    NOTE(review): the call sites visible in this file pass only ``text``;
    populate ``anon_map`` from the reversible path if callers read it.
    """

    # The anonymized text.
    text: str
    # Placeholder map for reversible anonymization; None when not supplied.
    anon_map: Optional['AnonymizationMap'] = None
25+
1926ort .set_default_logger_severity (3 ) # Silence ONNX Runtime warnings
2027
2128logger = logging .getLogger (__name__ )
@@ -53,7 +60,8 @@ def __init__(self, cache_dir: Optional[Path] = None, device: Optional[str] = Non
5360 ner_batch_size : int = 8 ,
5461 ensemble_models : Optional [Dict ] = None ,
5562 ensemble_default_keys : Optional [tuple ] = None ,
56- replacement_mode : str = 'placeholder' ):
63+ replacement_mode : str = 'placeholder' ,
64+ synthetic_replacer = None ):
5765 """Initialize NER processor.
5866
5967 Args:
@@ -67,11 +75,12 @@ def __init__(self, cache_dir: Optional[Path] = None, device: Optional[str] = Non
6775 Required when ner_backend involves GLiNER.
6876 torch_model_names: Language-keyed dict of PyTorch model repo IDs.
6977 Required when ner_backend involves torch.
78+ synthetic_replacer: Shared SyntheticReplacer instance (from TextCleaner).
7079 """
7180 self ._ner_backend = ner_backend
7281 self ._ner_batch_size = ner_batch_size
7382 self ._replacement_mode = replacement_mode
74- self ._synthetic_replacer = None # Lazy-loaded when replacement_mode='synthetic'
83+ self ._synthetic_replacer = synthetic_replacer
7584 self ._gliner_pipe = None
7685 self ._ensemble_models : Dict [str , tuple ] = ensemble_models if ensemble_models is not None else DEFAULT_NER_ENSEMBLE
7786 self ._ensemble_default_keys : tuple = (
@@ -195,11 +204,12 @@ def _init_presidio_gliner(self, gliner_config):
195204 try :
196205 from presidio_analyzer .predefined_recognizers import GLiNERRecognizer # noqa: S404
197206 gliner_recognizer = GLiNERRecognizer (
198- model_path = gliner_config ['model' ],
207+ model_name = gliner_config ['model' ],
199208 supported_entities = [
200209 label .upper ()
201210 for label in gliner_config .get ('labels' , ['person' , 'organization' , 'location' ])
202211 ],
212+ threshold = gliner_config .get ('threshold' , 0.4 ),
203213 )
204214 self ._analyzer .registry .add_recognizer (gliner_recognizer )
205215 except ImportError :
@@ -208,13 +218,6 @@ def _init_presidio_gliner(self, gliner_config):
208218 "Install with: pip install presidio-analyzer gliner"
209219 )
210220
211- def _get_synthetic_replacer (self ):
212- """Lazily initialize the SyntheticReplacer."""
213- if self ._synthetic_replacer is None :
214- from sct .utils .synthetic import SyntheticReplacer
215- self ._synthetic_replacer = SyntheticReplacer ()
216- return self ._synthetic_replacer
217-
218221 def _get_ensemble_keys (self , language : str ) -> tuple :
219222 """Return ordered model keys to run for the given language."""
220223 return self ._ensemble_models .get (language , self ._ensemble_default_keys )
@@ -313,9 +316,8 @@ def anonymize_text(self, text, filtered_data, replacement_mode='placeholder',
313316 return self ._anonymize_reversible (text , filtered_data , anon_map )
314317
315318 if replacement_mode == 'synthetic' :
316- replacer = self ._get_synthetic_replacer ()
317- result_text = replacer .generate_for_entities (text , filtered_data )
318- return type ('AnonymizeResult' , (), {'text' : result_text })()
319+ result_text = self ._synthetic_replacer .generate_for_entities (text , filtered_data )
320+ return AnonymizeResult (text = result_text )
319321
320322 has_custom = any (
321323 items ['entity_group' ] not in ENTITY_TYPE_MAP
@@ -328,7 +330,7 @@ def anonymize_text(self, text, filtered_data, replacement_mode='placeholder',
328330 for items in sorted_data :
329331 tag = ENTITY_TYPE_MAP .get (items ['entity_group' ], items ['entity_group' ])
330332 text = text [:items ['start' ]] + f"<{ tag } >" + text [items ['end' ]:]
331- return type ( ' AnonymizeResult' , (), { ' text' : text })( )
333+ return AnonymizeResult ( text = text )
332334 else :
333335 # Standard entities only: use Presidio (existing behavior)
334336 analyzer_result = []
@@ -348,13 +350,14 @@ def anonymize_text(self, text, filtered_data, replacement_mode='placeholder',
348350 if 0 <= entry .start < text_length and 0 < entry .end <= text_length
349351 ]
350352
351- return self .engine .anonymize (text = text , analyzer_results = analyzer_result )
353+ engine_result = self .engine .anonymize (text = text , analyzer_results = analyzer_result )
354+ return AnonymizeResult (text = engine_result .text )
352355
353356 def _anonymize_reversible (self , text , filtered_data , anon_map = None ):
354357 """Replace entities with indexed placeholders and populate the map.
355358
356359 Uses right-to-left replacement to preserve character offsets.
357- Returns an AnonymizeResult-like object with ``.text`` attribute .
360+ Returns an `` AnonymizeResult`` with the anonymized ``.text``.
358361 """
359362 if anon_map is None :
360363 anon_map = AnonymizationMap ()
@@ -373,7 +376,7 @@ def _anonymize_reversible(self, text, filtered_data, anon_map=None):
373376 )
374377 text = text [:item ['start' ]] + placeholder + text [item ['end' ]:]
375378
376- return type ( ' AnonymizeResult' , (), { ' text' : text , 'anon_map' : anon_map })( )
379+ return AnonymizeResult ( text = text )
377380
378381 def ner_ensemble (self , ner_results , t ):
379382 """Apply ensemble voting across multiple model results.
@@ -440,7 +443,7 @@ def _simple_chunk(self, text: str, max_tokens: int = 384) -> List[str]:
440443 def ner_process (
441444 self ,
442445 text : str ,
443- positional_tags : Optional [List [str ]] = None ,
446+ positional_tags : Optional [Sequence [str ]] = None ,
444447 ner_confidence_threshold : Optional [float ] = None ,
445448 language : Optional [str ] = None ,
446449 anon_map : Optional ['AnonymizationMap' ] = None ,
@@ -474,6 +477,16 @@ def ner_process(
474477 if not chunks :
475478 return text
476479
480+ # Pre-compute GLiNER-only tag set (constant across chunks)
481+ gliner_all_tags = None
482+ if self ._ner_backend == 'gliner' and self ._gliner_pipe :
483+ gliner_all_tags = set (positional_tags )
484+ gliner_all_tags .update (self ._gliner_pipe .label_map .values ())
485+ gliner_all_tags .update (
486+ label .upper () for label in self ._gliner_pipe .labels
487+ if label not in self ._gliner_pipe .label_map
488+ )
489+
477490 # --- Inference + ensemble per chunk ---
478491 ner_clean_text = []
479492 for chunk in chunks :
@@ -486,7 +499,7 @@ def ner_process(
486499 model_name = self ._model_names .get (key , key )
487500 model_lock = self ._get_lock (model_name )
488501 with model_lock :
489- batch = self ._get_pipeline ( key ) ([chunk ])
502+ batch = self ._pipelines [ key ] ([chunk ])
490503 ner_results .extend (self .ner_data (batch [0 ], positional_tags ))
491504
492505 # GLiNER backend
@@ -495,15 +508,8 @@ def ner_process(
495508 gliner_lock = self ._get_lock ('gliner' )
496509 with gliner_lock :
497510 gliner_batch = self ._gliner_pipe ([chunk ])
498- if self ._ner_backend == 'gliner' :
499- # GLiNER-only: include all mapped entity types
500- all_tags = set (positional_tags )
501- all_tags .update (self ._gliner_pipe .label_map .values ())
502- all_tags .update (
503- label .upper () for label in self ._gliner_pipe .labels
504- if label not in self ._gliner_pipe .label_map
505- )
506- ner_results .extend (self .ner_data (gliner_batch [0 ], all_tags ))
511+ if gliner_all_tags is not None :
512+ ner_results .extend (self .ner_data (gliner_batch [0 ], gliner_all_tags ))
507513 else :
508514 # Ensemble: filter to positional_tags only
509515 ner_results .extend (self .ner_data (gliner_batch [0 ], positional_tags ))
0 commit comments