# Source: digital-brain/backend/orchestrator/search_normalization.py
# (repository githubdoramon/digital-brain, branch main)

from __future__ import annotations

import unicodedata
from collections.abc import Iterable


def _strip_accents(text: str) -> str:
    """Return *text* with non-spacing combining marks removed.

    The input is decomposed with NFD so that an accented letter becomes a
    base character followed by its combining marks; every character whose
    Unicode category is ``Mn`` is then dropped. The result is not
    recomposed afterwards.
    """
    decomposed = unicodedata.normalize("NFD", text)
    kept = [char for char in decomposed if unicodedata.category(char) != "Mn"]
    return "".join(kept)


def normalize_search_text(text: str) -> str:
    """Return a case- and accent-insensitive search key for *text*.

    ``None`` yields an empty string. Any other value is converted with
    ``str()``, passed through ``_strip_accents``, has every run of
    whitespace collapsed to a single space (leading and trailing
    whitespace is dropped), and is finally case-folded.
    """
    if text is None:
        return ""
    without_accents = _strip_accents(str(text))
    # split() with no separator discards empty fields, so re-joining the
    # words both trims the ends and collapses internal whitespace runs.
    words = without_accents.split()
    return " ".join(words).casefold()


def normalize_search_list(values: Iterable[str] | None) -> list[str]:
    """Normalize each entry of *values* and drop empties and duplicates.

    Every item is passed through ``normalize_search_text``. Items that
    normalize to an empty string are skipped, and only the first
    occurrence of each normalized value is kept, in input order. A falsy
    *values* (``None`` or an empty collection) yields an empty list.
    """
    candidates = (normalize_search_text(item) for item in (values or ()))
    # dict keys preserve first-insertion order, which gives an ordered
    # de-duplication without a separate "seen" set.
    return list(dict.fromkeys(key for key in candidates if key))