Skip to content

Commit df60e53

Browse files
ambv and claude
committed
Add pure Python implementation of unicodedata.iter_graphemes()
New module Lib/_py_grapheme.py implements the full Unicode TR29 Extended Grapheme Cluster algorithm in pure Python, using the unicodedata.grapheme_cluster_break(), extended_pictographic(), and indic_conjunct_break() property accessors. Refactored GraphemeBreakTest into a BaseGraphemeBreakTest mixin so that both C and pure Python implementations share the same test suite, including the TR29 conformance test against GraphemeBreakTest.txt. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent dfeb160 commit df60e53

File tree

2 files changed

+318
-13
lines changed

2 files changed

+318
-13
lines changed

Lib/_py_grapheme.py

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
"""Pure Python implementation of unicodedata.iter_graphemes().
2+
3+
Uses the extended grapheme cluster rules from Unicode TR29.
4+
"""
5+
6+
import sys
7+
import unicodedata
8+
9+
10+
class Segment:
    """A grapheme cluster, described as a slice of the string it came from.

    ``start`` and ``end`` are the slice bounds of the cluster inside the
    source string; ``str(segment)`` returns the text of the cluster.
    """

    __slots__ = ('_string', 'start', 'end')

    def __init__(self, string, start, end):
        self._string, self.start, self.end = string, start, end

    def __str__(self):
        # The substring is only built on demand.
        bounds = slice(self.start, self.end)
        return self._string[bounds]

    def __repr__(self):
        return "<Segment {}:{}>".format(self.start, self.end)
25+
26+
27+
# Grapheme_Cluster_Break property values.  These are the property value
# names that the break rules compare against the result of
# unicodedata.grapheme_cluster_break().
_GCB_Other = "Other"
_GCB_Prepend = "Prepend"
_GCB_CR = "CR"
_GCB_LF = "LF"
_GCB_Control = "Control"
_GCB_Extend = "Extend"
_GCB_Regional_Indicator = "Regional_Indicator"
_GCB_SpacingMark = "SpacingMark"
_GCB_L = "L"
_GCB_V = "V"
_GCB_T = "T"
_GCB_LV = "LV"
_GCB_LVT = "LVT"
_GCB_ZWJ = "ZWJ"

# Indic_Conjunct_Break property values, compared against the result of
# unicodedata.indic_conjunct_break().
_InCB_None = "None"
_InCB_Linker = "Linker"
_InCB_Consonant = "Consonant"
_InCB_Extend = "Extend"

# States of the Extended_Pictographic state machine (rule GB11).
_EP_INIT, _EP_STARTED, _EP_ZWJ, _EP_MATCHED = range(4)

# States of the Indic_Conjunct_Break state machine (rule GB9c).
_INCB_INIT, _INCB_STARTED, _INCB_LINKER, _INCB_MATCHED = range(4)
60+
61+
62+
def _update_ext_pict_state(state, gcb, ext_pict):
    """Advance the GB11 (emoji ZWJ sequence) state machine by one character.

    *state* is the state before the character, *gcb* is the character's
    Grapheme_Cluster_Break value and *ext_pict* tells whether it is
    Extended_Pictographic.  Returns the new state; _EP_MATCHED means the
    character completes ``ExtPict Extend* ZWJ ExtPict``.
    """
    if ext_pict:
        if state == _EP_ZWJ:
            return _EP_MATCHED
        return _EP_STARTED

    # Extend and ZWJ only continue a sequence that a pictographic
    # character has already opened.
    if state in (_EP_STARTED, _EP_MATCHED):
        if gcb == _GCB_ZWJ:
            return _EP_ZWJ
        if gcb == _GCB_Extend:
            return _EP_STARTED

    return _EP_INIT
71+
72+
73+
def _update_incb_state(state, incb):
    """Advance the GB9c (Indic conjunct) state machine by one character.

    *state* is the state before the character and *incb* is the character's
    Indic_Conjunct_Break value.  Returns the new state; _INCB_MATCHED means
    the character is a consonant that follows a consonant through a run of
    Extend/Linker characters containing at least one Linker.
    """
    if incb == _InCB_Consonant:
        if state == _INCB_LINKER:
            return _INCB_MATCHED
        return _INCB_STARTED

    # Without a preceding consonant nothing else can start a sequence.
    if state == _INCB_INIT:
        return _INCB_INIT

    if incb == _InCB_Linker:
        return _INCB_LINKER
    if incb == _InCB_Extend:
        # An Extend does not cancel a Linker that was already seen.
        if state == _INCB_LINKER:
            return _INCB_LINKER
        return _INCB_STARTED

    return _INCB_INIT
82+
83+
84+
def _grapheme_break(prev_gcb, curr_gcb, ep_state, ri_flag, incb_state):
    """Tell whether a grapheme cluster boundary lies between two characters.

    *prev_gcb* and *curr_gcb* are the Grapheme_Cluster_Break values of the
    characters before and after the candidate boundary.  *ep_state*,
    *ri_flag* and *incb_state* are the GB11, GB12/GB13 and GB9c states
    after the current character has been taken into account.

    Returns True for a break and False otherwise.  The rules are tried in
    TR29 order and the first one that applies decides.
    """
    controls = (_GCB_CR, _GCB_LF, _GCB_Control)

    # GB3: CR LF stays together.
    if prev_gcb == _GCB_CR and curr_gcb == _GCB_LF:
        return False

    # GB4 / GB5: otherwise always break after and before controls.
    if prev_gcb in controls or curr_gcb in controls:
        return True

    # GB6, GB7, GB8: Hangul syllable sequences.  The previous character
    # selects which jamo/syllable types may follow without a break.
    if prev_gcb == _GCB_L:
        hangul_followers = (_GCB_L, _GCB_V, _GCB_LV, _GCB_LVT)
    elif prev_gcb in (_GCB_LV, _GCB_V):
        hangul_followers = (_GCB_V, _GCB_T)
    elif prev_gcb in (_GCB_LVT, _GCB_T):
        hangul_followers = (_GCB_T,)
    else:
        hangul_followers = ()
    if curr_gcb in hangul_followers:
        return False

    # GB9 / GB9a: no break before Extend, ZWJ or SpacingMark.
    if curr_gcb in (_GCB_Extend, _GCB_ZWJ, _GCB_SpacingMark):
        return False

    # GB9b: no break after Prepend.
    if prev_gcb == _GCB_Prepend:
        return False

    # GB9c / GB11: no break inside an Indic conjunct cluster or an emoji
    # ZWJ sequence.
    if incb_state == _INCB_MATCHED or ep_state == _EP_MATCHED:
        return False

    # GB12 / GB13: regional indicators pair up into flags; the flag state
    # decides whether this pair boundary is a break.
    if (prev_gcb == _GCB_Regional_Indicator
            and curr_gcb == _GCB_Regional_Indicator):
        return ri_flag

    # GB999: break everywhere else.
    return True
136+
137+
138+
def iter_graphemes(string, start=0, end=sys.maxsize):
    """Iterate over grapheme clusters in a string.

    Uses extended grapheme cluster rules from TR29.

    *string* is the str to segment.  *start* and *end* delimit the region
    to segment: a negative value counts from the end of the string (and is
    clamped to 0), and an *end* beyond the string is clamped to its length.
    Nothing is yielded when the adjusted *start* is not below *end*.

    Returns an iterator yielding Segment objects with start/end attributes
    and str() support.

    Raises TypeError if *string* is not a str, or if *start* or *end*
    cannot be used as an integer index.
    """
    # Function-scope import: the module header does not import operator.
    from operator import index

    if not isinstance(string, str):
        raise TypeError(
            "iter_graphemes() argument 1 must be str, not "
            f"{type(string).__name__}"
        )

    # Validate the bounds here, while the caller is still on the stack.
    # Otherwise a float bound survives the adjustment below and only fails
    # later, during iteration or inside str(segment).
    start = index(start)
    end = index(end)

    length = len(string)
    if end > length:
        end = length
    elif end < 0:
        end = max(end + length, 0)
    if start < 0:
        start = max(start + length, 0)

    return _iter_grapheme_clusters(string, start, end)
166+
167+
168+
def _iter_grapheme_clusters(string, start, end):
    """Generate a Segment for every grapheme cluster in string[start:end].

    *start* and *end* are used as given; iter_graphemes() adjusts them
    before calling.  All rule state is reset at *start*, so the characters
    before it do not influence where the breaks fall.
    """
    prev_gcb = _GCB_Other
    pict_state = _EP_INIT
    conjunct_state = _INCB_INIT
    # True while an odd number of consecutive Regional_Indicator
    # characters ends at the current character.
    odd_ri = False
    seg_start = start

    for pos in range(start, end):
        char = string[pos]
        curr_gcb = unicodedata.grapheme_cluster_break(char)
        is_pict = unicodedata.extended_pictographic(char)
        conjunct = unicodedata.indic_conjunct_break(char)

        # Feed the current character to the three state machines before
        # deciding on the boundary in front of it.
        pict_state = _update_ext_pict_state(pict_state, curr_gcb, is_pict)
        if curr_gcb == _GCB_Regional_Indicator:
            odd_ri = not odd_ri
        else:
            odd_ri = False
        conjunct_state = _update_incb_state(conjunct_state, conjunct)

        # There is never a boundary to report in front of the first
        # character of the pending cluster.
        if pos != seg_start and _grapheme_break(
            prev_gcb, curr_gcb, pict_state, odd_ri, conjunct_state
        ):
            yield Segment(string, seg_start, pos)
            seg_start = pos

        prev_gcb = curr_gcb

    # Flush the last cluster, if the region was not empty.
    if seg_start < end:
        yield Segment(string, seg_start, end)

Lib/test/test_unicodedata.py

Lines changed: 120 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1300,16 +1300,103 @@ class MyStr(str):
13001300
self.assertIs(type(normalize(form, MyStr(input_str))), str)
13011301

13021302

1303-
class GraphemeBreakTest(unittest.TestCase):
1303+
class BaseGraphemeBreakTest:
1304+
iter_graphemes = staticmethod(unicodedata.iter_graphemes)
1305+
1306+
def test_grapheme_break_types(self):
1307+
self.assertRaises(TypeError, self.iter_graphemes)
1308+
self.assertRaises(TypeError, self.iter_graphemes, b'x')
1309+
1310+
def test_grapheme_break_empty(self):
1311+
graphemes = self._graphemes
1312+
self.assertEqual(graphemes(''), [])
1313+
1314+
def test_grapheme_break_simple(self):
1315+
graphemes = self._graphemes
1316+
self.assertEqual(graphemes('abcd'), ['a', 'b', 'c', 'd'])
1317+
self.assertEqual(graphemes('abcd', 1), ['b', 'c', 'd'])
1318+
self.assertEqual(graphemes('abcd', 1, 3), ['b', 'c'])
1319+
self.assertEqual(graphemes('abcd', -3), ['b', 'c', 'd'])
1320+
self.assertEqual(graphemes('abcd', 1, -1), ['b', 'c'])
1321+
self.assertEqual(graphemes('abcd', 3, 1), [])
1322+
self.assertEqual(graphemes('abcd', 5), [])
1323+
self.assertEqual(graphemes('abcd', 0, 5), ['a', 'b', 'c', 'd'])
1324+
self.assertEqual(graphemes('abcd', -5), ['a', 'b', 'c', 'd'])
1325+
self.assertEqual(graphemes('abcd', 0, -5), [])
1326+
1327+
def test_grapheme_break_rules(self):
1328+
graphemes = self._graphemes
1329+
# GB3
1330+
self.assertEqual(graphemes('\r\n'), ['\r\n'])
1331+
# GB4
1332+
self.assertEqual(graphemes('\r\u0308'), ['\r', '\u0308'])
1333+
self.assertEqual(graphemes('\n\u0308'), ['\n', '\u0308'])
1334+
self.assertEqual(graphemes('\0\u0308'), ['\0', '\u0308'])
1335+
# GB5
1336+
self.assertEqual(graphemes('\u06dd\r'), ['\u06dd', '\r'])
1337+
self.assertEqual(graphemes('\u06dd\n'), ['\u06dd', '\n'])
1338+
self.assertEqual(graphemes('\u06dd\0'), ['\u06dd', '\0'])
1339+
# GB6
1340+
self.assertEqual(graphemes('\u1100\u1160'), ['\u1100\u1160'])
1341+
self.assertEqual(graphemes('\u1100\uAC00'), ['\u1100\uAC00'])
1342+
self.assertEqual(graphemes('\u1100\uAC01'), ['\u1100\uAC01'])
1343+
# GB7
1344+
self.assertEqual(graphemes('\uAC00\u1160'), ['\uAC00\u1160'])
1345+
self.assertEqual(graphemes('\uAC00\u11A8'), ['\uAC00\u11A8'])
1346+
self.assertEqual(graphemes('\u1160\u1160'), ['\u1160\u1160'])
1347+
self.assertEqual(graphemes('\u1160\u11A8'), ['\u1160\u11A8'])
1348+
# GB8
1349+
self.assertEqual(graphemes('\uAC01\u11A8'), ['\uAC01\u11A8'])
1350+
self.assertEqual(graphemes('\u11A8\u11A8'), ['\u11A8\u11A8'])
1351+
# GB9
1352+
self.assertEqual(graphemes('a\u0300'), ['a\u0300'])
1353+
self.assertEqual(graphemes('a\u200D'), ['a\u200D'])
1354+
# GB9a
1355+
self.assertEqual(graphemes('\u0905\u0903'), ['\u0905\u0903'])
1356+
# GB9b
1357+
self.assertEqual(graphemes('\u06dd\u0661'), ['\u06dd\u0661'])
1358+
# GB9c
1359+
self.assertEqual(graphemes('\u0915\u094d\u0924'),
1360+
['\u0915\u094d\u0924'])
1361+
self.assertEqual(graphemes('\u0915\u094D\u094D\u0924'),
1362+
['\u0915\u094D\u094D\u0924'])
1363+
self.assertEqual(graphemes('\u0915\u094D\u0924\u094D\u092F'),
1364+
['\u0915\u094D\u0924\u094D\u092F'])
1365+
# GB11
1366+
self.assertEqual(graphemes(
1367+
'\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F'
1368+
'\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC'),
1369+
['\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F'
1370+
'\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC'])
1371+
# GB12
1372+
self.assertEqual(graphemes(
1373+
'\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'),
1374+
['\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'])
1375+
# GB13
1376+
self.assertEqual(graphemes(
1377+
'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'),
1378+
['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'])
1379+
1380+
def test_segment_object(self):
1381+
segments = list(self.iter_graphemes('spa\u0300m'))
1382+
self.assertEqual(len(segments), 4, segments)
1383+
segment = segments[2]
1384+
self.assertEqual(segment.start, 2)
1385+
self.assertEqual(segment.end, 4)
1386+
self.assertEqual(str(segment), 'a\u0300')
1387+
1388+
def _graphemes(self, *args):
1389+
return list(map(str, self.iter_graphemes(*args)))
1390+
13041391
@requires_resource('network')
1305-
def test_grapheme_break(self):
1392+
def test_tr29_conformance(self):
13061393
TESTDATAFILE = "GraphemeBreakTest.txt"
13071394
testdata = download_test_data_file(TESTDATAFILE)
13081395

13091396
with testdata:
1310-
self.run_grapheme_break_tests(testdata)
1397+
self._run_grapheme_break_tests(testdata)
13111398

1312-
def run_grapheme_break_tests(self, testdata):
1399+
def _run_grapheme_break_tests(self, testdata):
13131400
for line in testdata:
13141401
line, _, comment = line.partition('#')
13151402
line = line.strip()
@@ -1330,19 +1417,32 @@ def run_grapheme_break_tests(self, testdata):
13301417
self.assertEqual(chunks.pop(), '', line)
13311418
input = ''.join(chunks)
13321419
with self.subTest(line):
1333-
result = list(unicodedata.iter_graphemes(input))
1420+
result = list(self.iter_graphemes(input))
13341421
self.assertEqual(list(map(str, result)), chunks, comment)
1335-
self.assertEqual([x.start for x in result], breaks[:-1], comment)
1336-
self.assertEqual([x.end for x in result], breaks[1:], comment)
1422+
self.assertEqual([x.start for x in result],
1423+
breaks[:-1], comment)
1424+
self.assertEqual([x.end for x in result],
1425+
breaks[1:], comment)
13371426
for i in range(1, len(breaks) - 1):
1338-
result = list(unicodedata.iter_graphemes(input, breaks[i]))
1339-
self.assertEqual(list(map(str, result)), chunks[i:], comment)
1340-
self.assertEqual([x.start for x in result], breaks[i:-1], comment)
1341-
self.assertEqual([x.end for x in result], breaks[i+1:], comment)
1427+
result = list(self.iter_graphemes(input, breaks[i]))
1428+
self.assertEqual(list(map(str, result)),
1429+
chunks[i:], comment)
1430+
self.assertEqual([x.start for x in result],
1431+
breaks[i:-1], comment)
1432+
self.assertEqual([x.end for x in result],
1433+
breaks[i+1:], comment)
1434+
1435+
1436+
class GraphemeBreakTest(unittest.TestCase, BaseGraphemeBreakTest):
1437+
iter_graphemes = staticmethod(unicodedata.iter_graphemes)
1438+
1439+
def test_segment_repr(self):
1440+
segment = list(unicodedata.iter_graphemes('spa\u0300m'))[2]
1441+
self.assertEqual(repr(segment), '<Segment 2:4>')
1442+
self.assertRaises(TypeError, iter, segment)
1443+
self.assertRaises(TypeError, len, segment)
13421444

13431445
def test_reference_loops(self):
1344-
# Test that reference loops involving GraphemeBreakIterator or
1345-
# Segment can be broken by the garbage collector.
13461446
class S(str):
13471447
pass
13481448

@@ -1363,5 +1463,12 @@ class S(str):
13631463
self.assertIsNone(wr())
13641464

13651465

1466+
class PyGraphemeBreakTest(unittest.TestCase, BaseGraphemeBreakTest):
    """Run the shared grapheme break tests against _py_grapheme."""

    @classmethod
    def setUpClass(cls):
        # Imported here rather than at module level, so _py_grapheme is
        # only loaded when this test class is set up.
        import _py_grapheme
        cls.iter_graphemes = staticmethod(_py_grapheme.iter_graphemes)
1471+
1472+
13661473
if __name__ == "__main__":
13671474
unittest.main()

0 commit comments

Comments
 (0)