rhnfzl
diff --git a/.gitignore b/.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/sct/__init__.py b/sct/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/sct/config.py b/sct/config.py
Lines changed: 2 additions & 3 deletions
diff --git a/sct/sct.py b/sct/sct.py
Lines changed: 5 additions & 14 deletions
diff --git a/sct/utils/gliclass_adapter.py b/sct/utils/gliclass_adapter.py
Lines changed: 16 additions & 33 deletions
diff --git a/sct/utils/gliner_adapter.py b/sct/utils/gliner_adapter.py
Lines changed: 13 additions & 4 deletions
diff --git a/sct/utils/ner.py b/sct/utils/ner.py
Lines changed: 35 additions & 29 deletions
@@ -173,6 +173,7 @@ SPECS.md
 docs/plans/
 docs/GLINER_GAP_ANALYSIS.md
 docs/V060_PLAN.md
+docs/FUNCTIONAL_TEST_REPORT.md
 squeakycleantext-explorer.html
 ralph-loop-prompt.md
 .claude/
 
@@ -5,7 +5,7 @@
 from sct.utils.anonymization_map import AnonymizationMap, MapEntry
 from sct.utils.process_result import ProcessResult
 
-__version__ = "0.6.0"
+__version__ = "0.6.1"
 __all__ = [
     "TextCleaner", "TextCleanerConfig",
     "PII_LABELS", "PII_LABEL_MAP",
 
@@ -25,6 +25,7 @@
 VALID_NER_BACKENDS = frozenset({
     'onnx', 'torch', 'gliner', 'ensemble_onnx', 'ensemble_torch', 'presidio_gliner',
 })
+GLINER_BACKENDS = frozenset({'gliner', 'ensemble_onnx', 'ensemble_torch', 'presidio_gliner'})
 
 DEFAULT_NER_MODELS: dict[str, str] = {
     'ENGLISH': 'rhnfzl/xlm-roberta-large-conll03-english-onnx',
@@ -320,9 +321,7 @@ def __post_init__(self):
             )
 
         # GLiNER fields required for gliner/ensemble backends
-        needs_gliner = self.ner_backend in (
-            'gliner', 'ensemble_onnx', 'ensemble_torch', 'presidio_gliner',
-        )
+        needs_gliner = self.ner_backend in GLINER_BACKENDS
         if needs_gliner:
             if not self.gliner_model:
                 raise ValueError(
 
@@ -7,7 +7,7 @@
 from concurrent.futures import ThreadPoolExecutor
 from typing import Any, Callable, Dict, List, Optional
 
-from sct.config import TextCleanerConfig, _config_from_module_globals
+from sct.config import TextCleanerConfig, GLINER_BACKENDS, _config_from_module_globals
 from sct.utils import constants, contact, datetime, ner, normtext, resources, special, stopwords
 from sct.utils.anonymization_map import AnonymizationMap
 from sct.utils.process_result import ProcessResult
@@ -71,9 +71,7 @@ def __init__(self, cfg: Optional[TextCleanerConfig] = None):
         if self.cfg.check_ner_process:
             # Build GLiNER config dict (if needed)
             gliner_config = None
-            needs_gliner = self.cfg.ner_backend in (
-                'gliner', 'ensemble_onnx', 'ensemble_torch', 'presidio_gliner',
-            )
+            needs_gliner = self.cfg.ner_backend in GLINER_BACKENDS
             if needs_gliner:
                 gliner_config = {
                     'model': self.cfg.gliner_model,
@@ -109,10 +107,8 @@ def __init__(self, cfg: Optional[TextCleanerConfig] = None):
                     else None
                 ),
                 replacement_mode=self.cfg.replacement_mode,
+                synthetic_replacer=self._synthetic_replacer,
             )
-        else:
-            pass  # self.GeneralNER already initialized to None above
-
         # GLiClass document-level pre-classification (optional, lazy-loaded)
         self._gliclass: Any = None
         if self.cfg.check_classify_document:
@@ -126,7 +122,6 @@ def __init__(self, cfg: Optional[TextCleanerConfig] = None):
                 onnx=self.cfg.gliclass_onnx,
             )
 
-        self.batch_size = 8
         self._pipeline: List[Callable[[str], str]] = []
         self._post_fuzzy_pipeline: List[Callable[[str], str]] = []
         self._init_pipeline()
@@ -227,9 +222,6 @@ def _process_single(self, text: str) -> ProcessResult:
         # Detect language (pure function, thread-safe)
         language = self._detect_language(text)
 
-        # Pass language explicitly through pipeline context dict
-        ctx = {"language": language}
-
         current_text = text
 
         # Pre-fuzzy pipeline steps (unicode fix → html → urls → emails → dates)
@@ -239,12 +231,11 @@ def _process_single(self, text: str) -> ProcessResult:
         # Fuzzy date replacement — requires language context, called explicitly
         # to avoid thread-local; positioned between replace_dates and replace_years.
         if self.cfg.check_fuzzy_replace_dates:
-            lang = ctx.get("language")
             current_text = self.ProcessDateTime.fuzzy_replace_dates(
                 current_text,
                 replace_with=self.cfg.replace_with_dates,
                 score_cutoff=self.cfg.fuzzy_date_score_cutoff,
-                language=lang,
+                language=language,
             )
 
         # Post-fuzzy pipeline steps (years → phones → numbers → symbols → whitespace)
@@ -255,7 +246,7 @@ def _process_single(self, text: str) -> ProcessResult:
         if self.cfg.check_ner_process and self.GeneralNER is not None:
             current_text = self.GeneralNER.ner_process(
                 current_text,
-                positional_tags=list(self.cfg.positional_tags),
+                positional_tags=self.cfg.positional_tags,
                 ner_confidence_threshold=self.cfg.ner_confidence_threshold,
                 language=language,
                 anon_map=anon_map,
 
@@ -32,18 +32,21 @@ def __init__(
         self._onnx = onnx
         self._pipeline = None
 
-        if onnx:
-            self._init_onnx(model_id)
-        else:
-            self._init_pytorch(model_id)
+        self._init_model(model_id)
 
         logger.info(
             "Loaded GLiClass model: %s (onnx=%s, labels=%d)",
             model_id, onnx, len(self.labels),
         )
 
-    def _init_pytorch(self, model_id: str) -> None:
-        """Load GLiClass model via PyTorch (gliclass package)."""
+    def _init_model(self, model_id: str) -> None:
+        """Load GLiClass model via gliclass package.
+
+        Note: gliclass does not yet expose a native ONNX loader, so the
+        ``onnx`` flag is recorded but ignored here: loading always goes through
+        the PyTorch-backed ``GLiClassModel.from_pretrained``.  When gliclass
+        adds ONNX support, this method should branch on ``self._onnx``.
+        """
         try:
             from gliclass import GLiClassModel, ZeroShotClassificationPipeline  # noqa: S404
             from transformers import AutoTokenizer
@@ -62,26 +65,6 @@ def _init_pytorch(self, model_id: str) -> None:
             device='cpu',
         )
 
-    def _init_onnx(self, model_id: str) -> None:
-        """Load GLiClass model via ONNX Runtime (torch-free)."""
-        try:
-            from gliclass import GLiClassModel, ZeroShotClassificationPipeline  # noqa: S404
-            from transformers import AutoTokenizer
-        except ImportError:
-            raise ImportError(
-                "gliclass + onnxruntime are required for ONNX GLiClass backend. "
-                "Install with: pip install squeakycleantext[classify] squeakycleantext[classify-onnx]"
-            )
-
-        model = GLiClassModel.from_pretrained(model_id)
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        self._pipeline = ZeroShotClassificationPipeline(
-            model=model,
-            tokenizer=tokenizer,
-            classification_type=self.classification_type,
-            device='cpu',
-        )
-
     def classify(self, text: str) -> List[Dict[str, float]]:
         """Classify text against configured labels.
 
@@ -94,16 +77,16 @@ def classify(self, text: str) -> List[Dict[str, float]]:
 
         result = self._pipeline(
             text,
-            candidate_labels=self.labels,
+            labels=self.labels,
         )
 
-        # Pipeline returns {"sequence": ..., "labels": [...], "scores": [...]}
+        # Pipeline returns list[list[dict]] — one list per input text,
+        # each containing {"label": str, "score": float} dicts.
         classifications = []
-        labels = result.get('labels', [])
-        scores = result.get('scores', [])
-        for label, score in zip(labels, scores):
-            if score >= self.threshold:
-                classifications.append({'label': label, 'score': score})
+        entries = result[0] if result else []
+        for entry in entries:
+            if entry['score'] >= self.threshold:
+                classifications.append({'label': entry['label'], 'score': entry['score']})
 
         classifications.sort(key=lambda x: x['score'], reverse=True)
         return classifications
 
@@ -48,10 +48,19 @@ def __init__(
                     "Install with: pip install squeakycleantext[gliner]"
                 )
             if onnx:
-                self.model = GLiNER.from_pretrained(
-                    model_id, load_onnx_model=True, load_tokenizer=True,
-                )
-                logger.info("Loaded GLiNER model in ONNX mode: %s", model_id)
+                try:
+                    self.model = GLiNER.from_pretrained(
+                        model_id, load_onnx_model=True, load_tokenizer=True,
+                    )
+                    logger.info("Loaded GLiNER model in ONNX mode: %s", model_id)
+                except FileNotFoundError:
+                    logger.warning(
+                        "ONNX model not found for %s (GLiNER issue #314: most models "
+                        "don't ship model.onnx at repo root). Falling back to PyTorch.",
+                        model_id,
+                    )
+                    self.model = GLiNER.from_pretrained(model_id)
+                    self._onnx = False
             else:
                 self.model = GLiNER.from_pretrained(model_id)
             if device == 'cuda' and not onnx:
 
@@ -3,19 +3,26 @@
 import threading
 from collections import defaultdict
 import logging
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Sequence, Union
 from pathlib import Path
 
 import onnxruntime as ort
 
 from presidio_anonymizer import AnonymizerEngine
 from presidio_anonymizer.entities import RecognizerResult
 
+from typing import NamedTuple as _NamedTuple
+
 from sct.utils import constants
 from sct.utils.anonymization_map import AnonymizationMap
 from sct.utils.onnx_pipeline import load_onnx_ner_model
 from sct.config import DEFAULT_NER_MODELS, DEFAULT_NER_ENSEMBLE, NER_ENSEMBLE_DEFAULT_KEYS, LANG_KEYS
 
+
+class AnonymizeResult(_NamedTuple):
+    """Result of text anonymization — lightweight typed container."""
+    text: str
+
 ort.set_default_logger_severity(3)  # Silence ONNX Runtime warnings
 
 logger = logging.getLogger(__name__)
@@ -53,7 +60,8 @@ def __init__(self, cache_dir: Optional[Path] = None, device: Optional[str] = Non
                  ner_batch_size: int = 8,
                  ensemble_models: Optional[Dict] = None,
                  ensemble_default_keys: Optional[tuple] = None,
-                 replacement_mode: str = 'placeholder'):
+                 replacement_mode: str = 'placeholder',
+                 synthetic_replacer=None):
         """Initialize NER processor.
 
         Args:
@@ -67,11 +75,12 @@ def __init__(self, cache_dir: Optional[Path] = None, device: Optional[str] = Non
                 Required when ner_backend involves GLiNER.
             torch_model_names: Language-keyed dict of PyTorch model repo IDs.
                 Required when ner_backend involves torch.
+            synthetic_replacer: Shared SyntheticReplacer from TextCleaner; required for replacement_mode='synthetic'.
         """
         self._ner_backend = ner_backend
         self._ner_batch_size = ner_batch_size
         self._replacement_mode = replacement_mode
-        self._synthetic_replacer = None  # Lazy-loaded when replacement_mode='synthetic'
+        self._synthetic_replacer = synthetic_replacer
         self._gliner_pipe = None
         self._ensemble_models: Dict[str, tuple] = ensemble_models if ensemble_models is not None else DEFAULT_NER_ENSEMBLE
         self._ensemble_default_keys: tuple = (
@@ -195,11 +204,12 @@ def _init_presidio_gliner(self, gliner_config):
         try:
             from presidio_analyzer.predefined_recognizers import GLiNERRecognizer  # noqa: S404
             gliner_recognizer = GLiNERRecognizer(
-                model_path=gliner_config['model'],
+                model_name=gliner_config['model'],
                 supported_entities=[
                     label.upper()
                     for label in gliner_config.get('labels', ['person', 'organization', 'location'])
                 ],
+                threshold=gliner_config.get('threshold', 0.4),
             )
             self._analyzer.registry.add_recognizer(gliner_recognizer)
         except ImportError:
@@ -208,13 +218,6 @@ def _init_presidio_gliner(self, gliner_config):
                 "Install with: pip install presidio-analyzer gliner"
             )
 
-    def _get_synthetic_replacer(self):
-        """Lazily initialize the SyntheticReplacer."""
-        if self._synthetic_replacer is None:
-            from sct.utils.synthetic import SyntheticReplacer
-            self._synthetic_replacer = SyntheticReplacer()
-        return self._synthetic_replacer
-
     def _get_ensemble_keys(self, language: str) -> tuple:
         """Return ordered model keys to run for the given language."""
         return self._ensemble_models.get(language, self._ensemble_default_keys)
@@ -313,9 +316,8 @@ def anonymize_text(self, text, filtered_data, replacement_mode='placeholder',
             return self._anonymize_reversible(text, filtered_data, anon_map)
 
         if replacement_mode == 'synthetic':
-            replacer = self._get_synthetic_replacer()
-            result_text = replacer.generate_for_entities(text, filtered_data)
-            return type('AnonymizeResult', (), {'text': result_text})()
+            result_text = self._synthetic_replacer.generate_for_entities(text, filtered_data)
+            return AnonymizeResult(text=result_text)
 
         has_custom = any(
             items['entity_group'] not in ENTITY_TYPE_MAP
@@ -328,7 +330,7 @@ def anonymize_text(self, text, filtered_data, replacement_mode='placeholder',
             for items in sorted_data:
                 tag = ENTITY_TYPE_MAP.get(items['entity_group'], items['entity_group'])
                 text = text[:items['start']] + f"<{tag}>" + text[items['end']:]
-            return type('AnonymizeResult', (), {'text': text})()
+            return AnonymizeResult(text=text)
         else:
             # Standard entities only: use Presidio (existing behavior)
             analyzer_result = []
@@ -348,13 +350,14 @@ def anonymize_text(self, text, filtered_data, replacement_mode='placeholder',
                 if 0 <= entry.start < text_length and 0 < entry.end <= text_length
             ]
 
-            return self.engine.anonymize(text=text, analyzer_results=analyzer_result)
+            engine_result = self.engine.anonymize(text=text, analyzer_results=analyzer_result)
+            return AnonymizeResult(text=engine_result.text)
 
     def _anonymize_reversible(self, text, filtered_data, anon_map=None):
         """Replace entities with indexed placeholders and populate the map.
 
         Uses right-to-left replacement to preserve character offsets.
-        Returns an AnonymizeResult-like object with ``.text`` attribute.
+        Returns an ``AnonymizeResult`` with the anonymized ``.text``.
         """
         if anon_map is None:
             anon_map = AnonymizationMap()
@@ -373,7 +376,7 @@ def _anonymize_reversible(self, text, filtered_data, anon_map=None):
             )
             text = text[:item['start']] + placeholder + text[item['end']:]
 
-        return type('AnonymizeResult', (), {'text': text, 'anon_map': anon_map})()
+        return AnonymizeResult(text=text)
 
     def ner_ensemble(self, ner_results, t):
         """Apply ensemble voting across multiple model results.
@@ -440,7 +443,7 @@ def _simple_chunk(self, text: str, max_tokens: int = 384) -> List[str]:
     def ner_process(
         self,
         text: str,
-        positional_tags: Optional[List[str]] = None,
+        positional_tags: Optional[Sequence[str]] = None,
         ner_confidence_threshold: Optional[float] = None,
         language: Optional[str] = None,
         anon_map: Optional['AnonymizationMap'] = None,
@@ -474,6 +477,16 @@ def ner_process(
         if not chunks:
             return text
 
+        # Pre-compute GLiNER-only tag set (constant across chunks)
+        gliner_all_tags = None
+        if self._ner_backend == 'gliner' and self._gliner_pipe:
+            gliner_all_tags = set(positional_tags)
+            gliner_all_tags.update(self._gliner_pipe.label_map.values())
+            gliner_all_tags.update(
+                label.upper() for label in self._gliner_pipe.labels
+                if label not in self._gliner_pipe.label_map
+            )
+
         # --- Inference + ensemble per chunk ---
         ner_clean_text = []
         for chunk in chunks:
@@ -486,7 +499,7 @@ def ner_process(
                     model_name = self._model_names.get(key, key)
                     model_lock = self._get_lock(model_name)
                     with model_lock:
-                        batch = self._get_pipeline(key)([chunk])
+                        batch = self._pipelines[key]([chunk])
                     ner_results.extend(self.ner_data(batch[0], positional_tags))
 
             # GLiNER backend
@@ -495,15 +508,8 @@ def ner_process(
                 gliner_lock = self._get_lock('gliner')
                 with gliner_lock:
                     gliner_batch = self._gliner_pipe([chunk])
-                if self._ner_backend == 'gliner':
-                    # GLiNER-only: include all mapped entity types
-                    all_tags = set(positional_tags)
-                    all_tags.update(self._gliner_pipe.label_map.values())
-                    all_tags.update(
-                        label.upper() for label in self._gliner_pipe.labels
-                        if label not in self._gliner_pipe.label_map
-                    )
-                    ner_results.extend(self.ner_data(gliner_batch[0], all_tags))
+                if gliner_all_tags is not None:
+                    ner_results.extend(self.ner_data(gliner_batch[0], gliner_all_tags))
                 else:
                     # Ensemble: filter to positional_tags only
                     ner_results.extend(self.ner_data(gliner_batch[0], positional_tags))