Merge pull request #16 from erinshek/main

erinshek · web-flow · commit e783bee3f51f · 2026-04-12T12:34:48.000+05:00
Add word syllabification for Karakalpak Latin and Cyrillic scripts
diff --git a/README.md b/README.md
@@ -25,6 +25,7 @@ print(cyrillic2latin("Ассалаўма әлейкум"))  # Assalawma áleykum
 |---|---|
 | **Script Conversion** | Bidirectional Latin ↔ Cyrillic conversion with multi-character mapping (`sh`→`ш`, `ch`→`ч`) and special Cyrillic rules (`ьи`→`yi`, `ьо`→`yo`, `ъе`→`ye`) |
 | **Number to Words** | Converts integers and floats to Karakalpak words in Latin or Cyrillic script. Supports range 0 to 10³⁰, negative numbers, and decimal fractions |
+| **Word Syllabification** | Splits Karakalpak words into syllables, works with both Latin and Cyrillic scripts, preserves letter case, and recognises digraphs like `sh`, `ch`, `yu`, `ya`, `aw`, `ew` |
 | **String Utilities** | Karakalpak-aware `upper()` / `lower()` that correctly handle the dotless `ı` ↔ `Í` character pair |
 | **CLI Tools** | `cyr2lat` and `lat2cyr` commands for converting text files from the terminal |
 
@@ -58,6 +59,26 @@ to_word(-42)                     # minus qırıq eki
 
 **Raises:** `NumberRangeError` if `number` exceeds 10³⁰.
 
+### Word Syllabification
+
+```python
+from kaalin.syllable import syllabify
+
+syllabify("qaraqalpaqstan")   # ['qa', 'ra', 'qal', 'paq', 'stan']
+syllabify("kompyuter")        # ['kom', 'pyu', 'ter']
+syllabify("Шарапат")          # ['Ша', 'ра', 'пат']
+syllabify("Adam")             # ['A', 'dam']
+
+"-".join(syllabify("úydegiler"))   # 'úy-de-gi-ler'
+```
+
+**Parameters:**
+- `word` (`str`) — the word to split. Accepts Latin or Cyrillic input.
+
+**Returns:** A `list[str]` of syllables in the same script as the input. Words with fewer than two vowels are returned as a single-element list unchanged; an empty or whitespace-only string returns `[]`.
+
+**Raises:** `TypeError` if `word` is not a string.
+
 ### String Utilities
 
 ```python
@@ -87,6 +108,7 @@ lat2cyr input.txt output.txt   # writes output.txt
 
 - Converting Karakalpak text between Latin and Cyrillic scripts
 - Displaying numbers as Karakalpak words (invoices, checks, education)
+- Splitting words into syllables for hyphenation, typesetting, or language learning
 - NLP preprocessing for Karakalpak text (script normalization)
 - Building Karakalpak-language applications that need locale-aware string operations
 - Batch-converting text files via CLI
diff --git a/kaalin/__init__.py b/kaalin/__init__.py
@@ -1,4 +1,5 @@
 from .number import NumberRangeError
 from .converter import latin2cyrillic, cyrillic2latin
+from .syllable import syllabify
 from . import string
 from . import cli
diff --git a/kaalin/syllable/__init__.py b/kaalin/syllable/__init__.py
@@ -0,0 +1 @@
+from .syllabifier import syllabify
diff --git a/kaalin/syllable/syllabifier.py b/kaalin/syllable/syllabifier.py
@@ -0,0 +1,192 @@
+"""Split Karakalpak words into syllables.
+
+Works with both Latin and Cyrillic Karakalpak text. Cyrillic input is
+transliterated to Latin for the pattern-matching pass and then
+converted back, so callers get syllables in the same script they
+passed in. Original letter case is preserved.
+
+The algorithm:
+
+1. Collapse multi-character letter sequences (``sh``, ``ch``, ``yu``,
+   ``ya``, ``aw``, ``ew``) into single placeholder characters so the
+   classifier can treat them atomically.
+2. Map each character to ``V`` (vowel), ``C`` (consonant) or ``?``
+   (unknown).
+3. Walk the resulting pattern with a fixed set of rules that reflect
+   Karakalpak phonotactics, producing a list of syllable lengths.
+4. Slice the collapsed word with those lengths, expand the placeholders
+   again, and cut the original word to match so each syllable keeps its case.
+"""
+
+from kaalin.converter import cyrillic2latin, latin2cyrillic
+
+
+_VOWELS = set("aáeioóuúíıÿŷåê")  # real vowels plus the vowel placeholders ÿ ŷ å ê
+_CONSONANTS = set("bcdfgǵhjklmnńpqrsştçvwxyz")  # real consonants plus the placeholders ş ç
+
+# (original multi-char sequence, single-char placeholder); applied in this order
+_AUTO_CORRECT_PAIRS = (
+    ("sh", "ş"),  # placeholder is listed in _CONSONANTS
+    ("ch", "ç"),  # placeholder is listed in _CONSONANTS
+    ("yu", "ŷ"),  # placeholder is listed in _VOWELS
+    ("ya", "ÿ"),  # placeholder is listed in _VOWELS
+    ("aw", "å"),  # placeholder is listed in _VOWELS
+    ("ew", "ê"),  # placeholder is listed in _VOWELS
+)
+
+
+def _auto_correct(text: str) -> str:  # collapse each multi-char sequence into its placeholder
+    for src, dst in _AUTO_CORRECT_PAIRS:
+        text = text.replace(src, dst)  # case-sensitive; syllabify() passes lowercased text
+    return text
+
+
+def _inverse_correct(text: str) -> str:  # expand placeholders back to their multi-char sequences
+    for src, dst in _AUTO_CORRECT_PAIRS:
+        text = text.replace(dst, src)  # also expands a placeholder char that was already in the input
+    return text
+
+
+def _classify(text: str) -> str:  # one symbol per input character, so lengths match
+    return "".join(
+        "V" if ch in _VOWELS else "C" if ch in _CONSONANTS else "?"  # "?" = in neither set
+        for ch in text
+    )
+
+
+def _create_map(pattern: str) -> list[int]:  # V/C/? pattern -> list of syllable lengths
+    syllable_map: list[int] = []
+    i = 0
+    n = len(pattern)
+
+    def p(offset: int = 0) -> str:  # symbol at i + offset, or "" past the end of the pattern
+        idx = i + offset
+        return pattern[idx] if idx < n else ""
+
+    while i < n:  # every branch appends one length and advances i by that same amount
+
+        # ── V-initial ──────────────────────────────────────────────
+        if p(0) == "V":
+            if p(1) in ("", "V"):  # V| at the end or before another vowel
+                syllable_map.append(1); i += 1
+            elif p(1) == "C" and p(2) == "V":  # V|CV
+                syllable_map.append(1); i += 1
+            elif p(1) == "C" and p(2) == "C" and p(3) == "V":  # VC|CV
+                syllable_map.append(2); i += 2
+            elif p(1) == "C" and p(2) == "C" and p(3) == "C" and p(4) == "V":  # VCC|CV
+                syllable_map.append(3); i += 3
+            elif p(1) == "C" and p(2) == "C":  # VCC| in every remaining case
+                syllable_map.append(3); i += 3
+            else:  # VC at the end or before "?", or V directly followed by "?"
+                syllable_map.append(2); i += 2
+
+        # ── C-initial ──────────────────────────────────────────────
+        elif p(0) == "C":
+
+            # CCV-initial
+            if p(1) == "C" and p(2) == "V":
+                if p(3) in ("", "V"):  # CCV| at the end or before another vowel
+                    syllable_map.append(3); i += 3
+                elif p(3) == "C" and p(4) == "V":  # CCV|CV
+                    syllable_map.append(3); i += 3
+                elif p(3) == "C" and p(4) == "C" and p(5) == "V":  # CCVC|CV
+                    syllable_map.append(4); i += 4
+                elif p(3) == "C" and p(4) == "C":  # CCVCC| in every remaining case
+                    syllable_map.append(5); i += 5
+                else:  # CCVC at the end or before "?", or CCV directly followed by "?"
+                    syllable_map.append(4); i += 4
+
+            # CV-initial
+            elif p(1) == "V":
+                if p(2) in ("", "V"):  # CV| at the end or before another vowel
+                    syllable_map.append(2); i += 2
+                elif p(2) == "C" and p(3) == "V":  # CV|CV
+                    syllable_map.append(2); i += 2
+                elif p(2) == "C" and p(3) == "C" and p(4) == "V":  # CVC|CV
+                    syllable_map.append(3); i += 3
+                elif p(2) == "C" and p(3) == "C" and p(4) == "C" and p(5) == "V":  # CVC|CCV
+                    syllable_map.append(3); i += 3
+                elif p(2) == "C" and p(3) == "C":  # CVCC| in every remaining case
+                    syllable_map.append(4); i += 4
+                else:  # CVC at the end or before "?", or CV directly followed by "?"
+                    syllable_map.append(3); i += 3
+
+            else:  # consonant that starts neither CV nor CCV: emitted as its own entry
+                syllable_map.append(1); i += 1
+        else:  # any other symbol (e.g. "?"): emitted as its own entry
+            syllable_map.append(1); i += 1
+
+    return syllable_map
+
+
+def _count_vowels(text: str) -> int:  # number of characters of text that are in _VOWELS
+    return sum(1 for ch in text if ch in _VOWELS)
+
+
+def _is_cyrillic(text: str) -> bool:  # True if at least one character is in U+0400–U+04FF
+    return any("\u0400" <= ch <= "\u04FF" for ch in text)
+
+
+def syllabify(word: str) -> list[str]:
+    """Split a single Karakalpak word into syllables.
+
+    Accepts Latin or Cyrillic Karakalpak input and returns the
+    syllables in the same script as the input, with the original
+    letter case preserved.
+
+    Args:
+        word: The word to split. Leading and trailing whitespace is
+            ignored.
+
+    Returns:
+        A list of syllables. An empty input yields an empty list.
+        Words with fewer than two vowels are returned as a single
+        syllable unchanged.
+
+    Raises:
+        TypeError: If ``word`` is not a string.
+    """
+    if not isinstance(word, str):
+        raise TypeError("word must be a string")
+
+    stripped = word.strip()
+    if not stripped:
+        return []
+
+    was_cyrillic = _is_cyrillic(stripped)  # True if any character is Cyrillic
+    latin_word = cyrillic2latin(stripped) if was_cyrillic else stripped
+
+    lower_word = latin_word.lower()  # rules match lowercase; case is re-read from latin_word below
+    corrected = _auto_correct(lower_word)
+
+    if _count_vowels(corrected) < 2:  # a collapsed sequence such as "aw" counts as one vowel
+        return [stripped]  # returned in the caller's script, whitespace stripped
+
+    pattern = _classify(corrected)
+    syllable_map = _create_map(pattern)
+
+    # Slice the corrected (placeholder) string so we know how much of
+    # it belongs to each syllable.
+    rem = corrected
+    corrected_parts: list[str] = []
+    for length in syllable_map:
+        corrected_parts.append(rem[:length])
+        rem = rem[length:]
+    if rem:  # leftover characters, if any, join the last syllable
+        corrected_parts[-1] += rem
+
+    # Restore multi-char letters so each part is as long as its slice of
+    # latin_word (assumes lower() kept the length and no placeholder input).
+    restored = [_inverse_correct(part) for part in corrected_parts]
+
+    # Index into the original Latin word to keep the caller's case.
+    cased: list[str] = []
+    idx = 0
+    for part in restored:
+        step = len(part)
+        cased.append(latin_word[idx : idx + step])
+        idx += step
+
+    if was_cyrillic:  # each syllable is converted back to Cyrillic on its own
+        return [latin2cyrillic(part) for part in cased]
+    return cased
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "kaalin"
-version = "3.3.2"
+version = "3.3.3b1"
 description = "Karakalpak language toolkit for Python — Latin/Cyrillic script conversion, number-to-words, and string utilities"
 readme = "README.md"
 license = "MIT"
diff --git a/test/test_kaalin_syllable.py b/test/test_kaalin_syllable.py
@@ -0,0 +1,99 @@
+import unittest
+
+from kaalin.syllable import syllabify
+
+
+class TestKaalinSyllable(unittest.TestCase):
+
+    def test_latin_basic(self):
+        self.assertEqual(syllabify("adam"), ["a", "dam"])
+        self.assertEqual(syllabify("adamlar"), ["a", "dam", "lar"])
+        self.assertEqual(syllabify("áke"), ["á", "ke"])
+        self.assertEqual(syllabify("ata"), ["a", "ta"])
+        self.assertEqual(syllabify("balalar"), ["ba", "la", "lar"])
+        self.assertEqual(syllabify("úydegiler"), ["úy", "de", "gi", "ler"])
+        self.assertEqual(syllabify("usıdan"), ["u", "sı", "dan"])
+        self.assertEqual(syllabify("házir"), ["há", "zir"])
+        self.assertEqual(syllabify("futbol"), ["fut", "bol"])
+        self.assertEqual(syllabify("yanvar"), ["yan", "var"])
+        self.assertEqual(syllabify("mart"), ["mart"])
+        self.assertEqual(syllabify("úlke"), ["úl", "ke"])
+        self.assertEqual(syllabify("telefon"), ["te", "le", "fon"])
+        self.assertEqual(syllabify("meniń"), ["me", "niń"])
+        self.assertEqual(syllabify("seniń"), ["se", "niń"])
+        
+
+    def test_latin_complex_clusters(self):
+        self.assertEqual(
+            syllabify("qaraqalpaqstan"),
+            ["qa", "ra", "qal", "paq", "stan"],
+        )
+        self.assertEqual(syllabify("ózbekstan"), ["óz", "bek", "stan"])
+
+    def test_latin_digraphs(self):
+        # "yu" is a single vocalic sound: yu → ŷ
+        self.assertEqual(syllabify("kompyuter"), ["kom", "pyu", "ter"])
+        # "aw" collapses into one vowel unit: aw → å
+        self.assertEqual(syllabify("awıl"), ["aw", "ıl"])
+        # "sh" collapses into one consonant unit: sh → ş
+        self.assertEqual(syllabify("sharapat"), ["sha", "ra", "pat"])
+        # "ch" collapses into one consonant unit: ch → ç
+        self.assertEqual(syllabify("chempion"), ["chem", "pi", "on"])
+
+    def test_latin_single_vowel_returns_whole_word(self):
+        self.assertEqual(syllabify("hám"), ["hám"])
+        self.assertEqual(syllabify("top"), ["top"])
+        self.assertEqual(syllabify("zor"), ["zor"])
+        self.assertEqual(syllabify("men"), ["men"])
+
+    def test_latin_preserves_case(self):
+        self.assertEqual(syllabify("Adam"), ["A", "dam"])
+        self.assertEqual(syllabify("Sharapat"), ["Sha", "ra", "pat"])
+        self.assertEqual(syllabify("ADAMLAR"), ["A", "DAM", "LAR"])  # all-caps input stays all-caps
+
+    def test_cyrillic_basic(self):
+        self.assertEqual(syllabify("адам"), ["а", "дам"])
+        self.assertEqual(syllabify("баллалар"), ["бал", "ла", "лар"])  # NOTE(review): doubled "л" — possibly a typo for "балалар"; confirm
+        self.assertEqual(syllabify("әке"), ["ә", "ке"])
+        self.assertEqual(syllabify("ата"), ["а", "та"])
+
+    def test_cyrillic_complex(self):
+        self.assertEqual(
+            syllabify("қарақалпақстан"),
+            ["қа", "ра", "қал", "пақ", "стан"],
+        )
+        self.assertEqual(syllabify("өзбекстан"), ["өз", "бек", "стан"])
+        self.assertEqual(syllabify("шарапат"), ["ша", "ра", "пат"])
+
+    def test_cyrillic_preserves_case(self):
+        self.assertEqual(syllabify("Адам"), ["А", "дам"])
+        self.assertEqual(syllabify("Шарапат"), ["Ша", "ра", "пат"])
+
+    def test_cyrillic_single_vowel(self):
+        self.assertEqual(syllabify("ҳәм"), ["ҳәм"])
+        self.assertEqual(syllabify("топ"), ["топ"])
+        self.assertEqual(syllabify("зор"), ["зор"])
+
+    def test_whitespace_is_stripped(self):
+        self.assertEqual(syllabify("  adam  "), ["a", "dam"])
+
+    def test_empty_string(self):
+        self.assertEqual(syllabify(""), [])
+        self.assertEqual(syllabify("   "), [])  # whitespace-only input is treated as empty
+
+    def test_non_string_raises_type_error(self):
+        with self.assertRaises(TypeError):
+            syllabify(42)
+        with self.assertRaises(TypeError):
+            syllabify(None)
+        with self.assertRaises(TypeError):
+            syllabify(["adam"])
+
+    def test_exposed_at_package_root(self):
+        import kaalin
+
+        self.assertIs(kaalin.syllabify, syllabify)  # the package root re-exports the same function object
+
+
+if __name__ == "__main__":  # run the tests when this file is executed directly
+    unittest.main()