Skip to content

Commit e783bee

Browse files
authored
Merge pull request #16 from erinshek/main
Add word syllabification for Karakalpak Latin and Cyrillic scripts
2 parents 39e01ce + c49a12a commit e783bee

6 files changed

Lines changed: 316 additions & 1 deletion

File tree

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ print(cyrillic2latin("Ассалаўма әлейкум")) # Assalawma áleykum
2525
|---|---|
2626
| **Script Conversion** | Bidirectional Latin ↔ Cyrillic conversion with multi-character mapping (`sh` ↔ `ш`, `ch` ↔ `ч`) and special Cyrillic rules (`ьи` → `yi`, `ьо` → `yo`, `ъе` → `ye`) |
2727
| **Number to Words** | Converts integers and floats to Karakalpak words in Latin or Cyrillic script. Supports range 0 to 10³⁰, negative numbers, and decimal fractions |
28+
| **Word Syllabification** | Splits Karakalpak words into syllables, works with both Latin and Cyrillic scripts, preserves letter case, and recognises digraphs like `sh`, `ch`, `yu`, `ya`, `aw`, `ew` |
2829
| **String Utilities** | Karakalpak-aware `upper()` / `lower()` that correctly handle the dotless `ı` ↔ `Í` character pair |
2930
| **CLI Tools** | `cyr2lat` and `lat2cyr` commands for converting text files from the terminal |
3031

@@ -58,6 +59,26 @@ to_word(-42) # minus qırıq eki
5859

5960
**Raises:** `NumberRangeError` if `number` exceeds 10³⁰.
6061

62+
### Word Syllabification
63+
64+
```python
65+
from kaalin.syllable import syllabify
66+
67+
syllabify("qaraqalpaqstan") # ['qa', 'ra', 'qal', 'paq', 'stan']
68+
syllabify("kompyuter") # ['kom', 'pyu', 'ter']
69+
syllabify("Шарапат") # ['Ша', 'ра', 'пат']
70+
syllabify("Adam") # ['A', 'dam']
71+
72+
"-".join(syllabify("úydegiler")) # 'úy-de-gi-ler'
73+
```
74+
75+
**Parameters:**
76+
- `word` (`str`) — the word to split. Accepts Latin or Cyrillic input.
77+
78+
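**Returns:** A `list[str]` of syllables in the same script as the input. Leading and trailing whitespace is ignored, and an empty or whitespace-only string yields an empty list. Words with fewer than two vowels are returned as a single-element list, otherwise unchanged.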
79+
80+
**Raises:** `TypeError` if `word` is not a string.
81+
6182
### String Utilities
6283

6384
```python
@@ -87,6 +108,7 @@ lat2cyr input.txt output.txt # writes output.txt
87108

88109
- Converting Karakalpak text between Latin and Cyrillic scripts
89110
- Displaying numbers as Karakalpak words (invoices, checks, education)
111+
- Splitting words into syllables for hyphenation, typesetting, or language learning
90112
- NLP preprocessing for Karakalpak text (script normalization)
91113
- Building Karakalpak-language applications that need locale-aware string operations
92114
- Batch-converting text files via CLI

kaalin/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from .number import NumberRangeError
22
from .converter import latin2cyrillic, cyrillic2latin
3+
from .syllable import syllabify
34
from . import string
45
from . import cli

kaalin/syllable/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .syllabifier import syllabify

kaalin/syllable/syllabifier.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
"""Split Karakalpak words into syllables.
2+
3+
Works with both Latin and Cyrillic Karakalpak text. Cyrillic input is
4+
transliterated to Latin for the pattern-matching pass and then
5+
converted back, so callers get syllables in the same script they
6+
passed in. Original letter case is preserved.
7+
8+
The algorithm:
9+
10+
1. Collapse multi-character letter sequences (``sh``, ``ch``, ``yu``,
11+
``ya``, ``aw``, ``ew``) into single placeholder characters so the
12+
classifier can treat them atomically.
13+
2. Map each character to ``V`` (vowel), ``C`` (consonant) or ``?``
14+
(unknown).
15+
3. Walk the resulting pattern with a fixed set of rules that reflect
16+
Karakalpak phonotactics, producing a list of syllable lengths.
17+
4. Slice the original word with those lengths so each syllable keeps
18+
its own characters and case.
19+
"""
20+
21+
from kaalin.converter import cyrillic2latin, latin2cyrillic
22+
23+
24+
# Character classes used by the classifier. Each set is the union of the
# letters that occur in lowercased Latin text and the single-character
# placeholders that stand in for collapsed digraphs (see
# ``_AUTO_CORRECT_PAIRS`` below).
_VOWELS = set("aáeioóuúíı") | set("ÿŷåê")
_CONSONANTS = set("bcdfgǵhjklmnńpqrstvwxyz") | set("şç")

# Each entry is (original multi-char sequence, single-char placeholder).
# The placeholders for "sh"/"ch" are classified as consonants; those for
# "yu"/"ya"/"aw"/"ew" are classified as vowels.
_AUTO_CORRECT_PAIRS = (
    ("sh", "ş"),
    ("ch", "ç"),
    ("yu", "ŷ"),
    ("ya", "ÿ"),
    ("aw", "å"),
    ("ew", "ê"),
)
36+
37+
38+
def _auto_correct(text: str) -> str:
    """Collapse every known digraph in ``text`` into its placeholder.

    The pairs in ``_AUTO_CORRECT_PAIRS`` are applied one after another, in
    the order they are listed, each replacing all occurrences of its
    multi-character sequence with the single placeholder character.
    """
    collapsed = text
    for digraph, placeholder in _AUTO_CORRECT_PAIRS:
        collapsed = collapsed.replace(digraph, placeholder)
    return collapsed
42+
43+
44+
def _inverse_correct(text: str) -> str:
    """Expand every placeholder in ``text`` back into its digraph.

    This is the reverse substitution of ``_auto_correct``: each placeholder
    character listed in ``_AUTO_CORRECT_PAIRS`` is replaced by the
    multi-character sequence it stands for.
    """
    expanded = text
    for digraph, placeholder in _AUTO_CORRECT_PAIRS:
        # Splitting on the placeholder and re-joining with the digraph is
        # the same as replacing every occurrence of the placeholder.
        expanded = digraph.join(expanded.split(placeholder))
    return expanded
48+
49+
50+
def _classify(text: str) -> str:
    """Return the vowel/consonant pattern of ``text``.

    Each character is mapped to ``"V"`` if it is in ``_VOWELS``, to ``"C"``
    if it is in ``_CONSONANTS``, and to ``"?"`` otherwise, so the result has
    the same length as ``text``.
    """
    labels: list[str] = []
    for ch in text:
        if ch in _VOWELS:
            labels.append("V")
        elif ch in _CONSONANTS:
            labels.append("C")
        else:
            labels.append("?")
    return "".join(labels)
55+
56+
57+
def _create_map(pattern: str) -> list[int]:
    """Turn a ``V``/``C``/``?`` pattern into a list of syllable lengths.

    The pattern is consumed left to right. At each position the length of
    the next syllable is chosen from the characters that follow, and the
    cursor advances by that length, so the returned lengths always add up
    to ``len(pattern)``.

    Args:
        pattern: String of ``"V"`` (vowel), ``"C"`` (consonant) and ``"?"``
            (unknown) markers. Any character other than ``"V"`` or ``"C"``
            is handled like ``"?"``.

    Returns:
        The syllable lengths in order; an empty list for an empty pattern.
    """

    def unit_at(pos: int) -> str:
        # Marker at ``pos``, or "" once ``pos`` is past the end.
        return pattern[pos : pos + 1]

    def syllable_length(start: int) -> int:
        def begins(prefix: str) -> bool:
            return pattern.startswith(prefix, start)

        first = unit_at(start)

        # ── V-initial ──────────────────────────────────────────────
        if first == "V":
            if unit_at(start + 1) in ("", "V"):
                return 1
            if begins("VCV"):
                return 1
            if begins("VCCV"):
                return 2
            if begins("VCC"):
                # Covers both "VCCCV…" and any other "VCC…" continuation.
                return 3
            return 2

        # Anything that is neither a vowel nor a consonant stands alone.
        if first != "C":
            return 1

        # ── CCV-initial ────────────────────────────────────────────
        if begins("CCV"):
            if unit_at(start + 3) in ("", "V"):
                return 3
            if begins("CCVCV"):
                return 3
            if begins("CCVCCV"):
                return 4
            if begins("CCVCC"):
                return 5
            return 4

        # ── CV-initial ─────────────────────────────────────────────
        if begins("CV"):
            if unit_at(start + 2) in ("", "V"):
                return 2
            if begins("CVCV"):
                return 2
            if begins("CVCCV"):
                return 3
            if begins("CVCCCV"):
                return 3
            if begins("CVCC"):
                return 4
            return 3

        # A consonant that does not start a CV or CCV group stands alone.
        return 1

    lengths: list[int] = []
    cursor = 0
    total = len(pattern)
    while cursor < total:
        step = syllable_length(cursor)
        lengths.append(step)
        cursor += step
    return lengths
120+
121+
122+
def _count_vowels(text: str) -> int:
    """Return how many characters of ``text`` are members of ``_VOWELS``."""
    total = 0
    for ch in text:
        if ch in _VOWELS:
            total += 1
    return total
124+
125+
126+
def _is_cyrillic(text: str) -> bool:
    """Return ``True`` if any character of ``text`` is in U+0400–U+04FF.

    A single character in that range is enough; an empty string yields
    ``False``.
    """
    for ch in text:
        if "\u0400" <= ch <= "\u04FF":
            return True
    return False
128+
129+
130+
def syllabify(word: str) -> list[str]:
    """Split a single Karakalpak word into syllables.

    Accepts Latin or Cyrillic Karakalpak input and returns the
    syllables in the same script as the input, with the original
    letter case preserved.

    Args:
        word: The word to split. Leading and trailing whitespace is
            ignored.

    Returns:
        A list of syllables. An empty (or whitespace-only) input yields
        an empty list. Words with fewer than two vowels are returned as
        a single syllable; vowels are counted after digraphs have been
        collapsed, so e.g. ``yu`` counts as one vowel.

    Raises:
        TypeError: If ``word`` is not a string.
    """
    if not isinstance(word, str):
        raise TypeError("word must be a string")

    stripped = word.strip()
    if not stripped:
        return []

    was_cyrillic = _is_cyrillic(stripped)
    latin_word = cyrillic2latin(stripped) if was_cyrillic else stripped

    # Lowercase one character at a time and keep the original character
    # whenever lowercasing would change the length (e.g. "İ"), so that
    # every index in ``lower_word`` refers to the same letter in
    # ``latin_word``.
    lower_chars: list[str] = []
    for ch in latin_word:
        low = ch.lower()
        lower_chars.append(low if len(low) == 1 else ch)
    lower_word = "".join(lower_chars)

    corrected = _auto_correct(lower_word)

    if _count_vowels(corrected) < 2:
        return [stripped]

    syllable_map = _create_map(_classify(corrected))

    # Number of original characters each placeholder stands for.
    digraph_width = {dst: len(src) for src, dst in _AUTO_CORRECT_PAIRS}

    # Walk ``corrected`` and ``lower_word`` side by side. A unit of
    # ``corrected`` that equals the current character of ``lower_word`` is
    # that character itself (width 1), even when it happens to be one of
    # the placeholder letters typed literally by the caller; otherwise it
    # is a placeholder produced by ``_auto_correct`` and covers a whole
    # digraph. Slicing ``latin_word`` with these widths keeps the caller's
    # letter case.
    cased: list[str] = []
    src_pos = 0  # position in latin_word / lower_word
    unit_pos = 0  # position in corrected
    for length in syllable_map:
        start = src_pos
        for unit in corrected[unit_pos : unit_pos + length]:
            if lower_word[src_pos : src_pos + 1] == unit:
                src_pos += 1
            else:
                src_pos += digraph_width.get(unit, 1)
        cased.append(latin_word[start:src_pos])
        unit_pos += length

    # Never drop characters: anything left over joins the last syllable.
    if src_pos < len(latin_word):
        cased[-1] += latin_word[src_pos:]

    if was_cyrillic:
        # NOTE(review): each syllable is converted back on its own; whether
        # this reproduces the original Cyrillic spelling for every word
        # depends on kaalin.converter — confirm for letter sequences that
        # transliterate to a Latin digraph (e.g. "с" + "х").
        return [latin2cyrillic(part) for part in cased]
    return cased

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "kaalin"
7-
version = "3.3.2"
7+
version = "3.3.3b1"
88
description = "Karakalpak language toolkit for Python — Latin/Cyrillic script conversion, number-to-words, and string utilities"
99
readme = "README.md"
1010
license = "MIT"

test/test_kaalin_syllable.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import unittest
2+
3+
from kaalin.syllable import syllabify
4+
5+
6+
class TestKaalinSyllable(unittest.TestCase):
    """Unit tests for ``kaalin.syllable.syllabify``."""

    def test_latin_basic(self):
        """Common Latin-script words split into the expected syllables."""
        self.assertEqual(syllabify("adam"), ["a", "dam"])
        self.assertEqual(syllabify("adamlar"), ["a", "dam", "lar"])
        self.assertEqual(syllabify("áke"), ["á", "ke"])
        self.assertEqual(syllabify("ata"), ["a", "ta"])
        self.assertEqual(syllabify("balalar"), ["ba", "la", "lar"])
        self.assertEqual(syllabify("úydegiler"), ["úy", "de", "gi", "ler"])
        self.assertEqual(syllabify("usıdan"), ["u", "sı", "dan"])
        self.assertEqual(syllabify("házir"), ["há", "zir"])
        self.assertEqual(syllabify("futbol"), ["fut", "bol"])
        self.assertEqual(syllabify("yanvar"), ["yan", "var"])
        self.assertEqual(syllabify("mart"), ["mart"])
        self.assertEqual(syllabify("úlke"), ["úl", "ke"])
        self.assertEqual(syllabify("telefon"), ["te", "le", "fon"])
        self.assertEqual(syllabify("meniń"), ["me", "niń"])
        self.assertEqual(syllabify("seniń"), ["se", "niń"])

    def test_latin_complex_clusters(self):
        """Three-consonant clusters split so the next syllable starts with CCV."""
        self.assertEqual(
            syllabify("qaraqalpaqstan"),
            ["qa", "ra", "qal", "paq", "stan"],
        )
        self.assertEqual(syllabify("ózbekstan"), ["óz", "bek", "stan"])

    def test_latin_digraphs(self):
        """Digraphs are kept together inside a single syllable."""
        # "yu" is a single vocalic sound: yu → ŷ
        self.assertEqual(syllabify("kompyuter"), ["kom", "pyu", "ter"])
        # "aw" collapses into one vowel unit: aw → å
        self.assertEqual(syllabify("awıl"), ["aw", "ıl"])
        # "sh" collapses into one consonant unit: sh → ş
        self.assertEqual(syllabify("sharapat"), ["sha", "ra", "pat"])
        # "ch" collapses into one consonant unit: ch → ç
        self.assertEqual(syllabify("chempion"), ["chem", "pi", "on"])

    def test_latin_single_vowel_returns_whole_word(self):
        """Words with only one vowel come back as a single-element list."""
        self.assertEqual(syllabify("hám"), ["hám"])
        self.assertEqual(syllabify("top"), ["top"])
        self.assertEqual(syllabify("zor"), ["zor"])
        self.assertEqual(syllabify("men"), ["men"])

    def test_latin_preserves_case(self):
        """Upper-case letters in Latin input survive in the output."""
        self.assertEqual(syllabify("Adam"), ["A", "dam"])
        self.assertEqual(syllabify("Sharapat"), ["Sha", "ra", "pat"])
        self.assertEqual(syllabify("ADAMLAR"), ["A", "DAM", "LAR"])

    def test_cyrillic_basic(self):
        """Common Cyrillic-script words split into the expected syllables."""
        self.assertEqual(syllabify("адам"), ["а", "дам"])
        # NOTE(review): "баллалар" has a doubled "л" — possibly meant to be
        # "балалар"; the expected value matches the input as written.
        self.assertEqual(syllabify("баллалар"), ["бал", "ла", "лар"])
        self.assertEqual(syllabify("әке"), ["ә", "ке"])
        self.assertEqual(syllabify("ата"), ["а", "та"])

    def test_cyrillic_complex(self):
        """Cyrillic counterparts of the cluster and digraph cases."""
        self.assertEqual(
            syllabify("қарақалпақстан"),
            ["қа", "ра", "қал", "пақ", "стан"],
        )
        self.assertEqual(syllabify("өзбекстан"), ["өз", "бек", "стан"])
        self.assertEqual(syllabify("шарапат"), ["ша", "ра", "пат"])

    def test_cyrillic_preserves_case(self):
        """Upper-case letters in Cyrillic input survive in the output."""
        self.assertEqual(syllabify("Адам"), ["А", "дам"])
        self.assertEqual(syllabify("Шарапат"), ["Ша", "ра", "пат"])

    def test_cyrillic_single_vowel(self):
        """Single-vowel Cyrillic words come back as a single-element list."""
        self.assertEqual(syllabify("ҳәм"), ["ҳәм"])
        self.assertEqual(syllabify("топ"), ["топ"])
        self.assertEqual(syllabify("зор"), ["зор"])

    def test_whitespace_is_stripped(self):
        """Surrounding whitespace does not appear in any syllable."""
        self.assertEqual(syllabify(" adam "), ["a", "dam"])

    def test_empty_string(self):
        """Empty and whitespace-only input yield an empty list."""
        self.assertEqual(syllabify(""), [])
        self.assertEqual(syllabify(" "), [])

    def test_non_string_raises_type_error(self):
        """Non-string arguments are rejected with ``TypeError``."""
        with self.assertRaises(TypeError):
            syllabify(42)
        with self.assertRaises(TypeError):
            syllabify(None)
        with self.assertRaises(TypeError):
            syllabify(["adam"])

    def test_exposed_at_package_root(self):
        """``kaalin.syllabify`` is the same object as the submodule export."""
        import kaalin

        self.assertIs(kaalin.syllabify, syllabify)
96+
97+
98+
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)