Add pure Python implementation of unicodedata.iter_graphemes()

ambv · claude · ambv · commit df60e5384be3 · 2026-04-07T15:58:03.000+02:00
New module Lib/_py_grapheme.py implements the full Unicode TR29
Extended Grapheme Cluster algorithm in pure Python, using the
unicodedata.grapheme_cluster_break(), extended_pictographic(), and
indic_conjunct_break() property accessors.

Refactored GraphemeBreakTest into a BaseGraphemeBreakTest mixin so
that both C and pure Python implementations share the same test suite,
including the TR29 conformance test against GraphemeBreakTest.txt.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/Lib/_py_grapheme.py b/Lib/_py_grapheme.py
@@ -0,0 +1,198 @@
+"""Pure Python implementation of unicodedata.iter_graphemes().
+
+Uses the extended grapheme cluster rules from Unicode TR29.
+"""
+
+import operator
+import sys
+import unicodedata
+
+
+class Segment:
+    """A grapheme cluster: the span [start, end) of a source string.
+
+    The ``start`` and ``end`` attributes are indices into the string the
+    segment was created from; ``str(segment)`` returns the text of that
+    span.
+    """
+
+    __slots__ = ('_string', 'start', 'end')
+
+    def __init__(self, string, start, end):
+        # Only a reference to the source string is stored; the cluster
+        # text is sliced out on demand in __str__().
+        self._string, self.start, self.end = string, start, end
+
+    def __str__(self):
+        span = slice(self.start, self.end)
+        return self._string[span]
+
+    def __repr__(self):
+        # Shows the span only, not the text.
+        return f"<Segment {self.start}:{self.end}>"
+
+
+# Grapheme_Cluster_Break property values (matching C #defines).
+# These strings are compared with the value that
+# unicodedata.grapheme_cluster_break() returns for each character.
+_GCB_Other = "Other"
+_GCB_Prepend = "Prepend"
+_GCB_CR = "CR"
+_GCB_LF = "LF"
+_GCB_Control = "Control"
+_GCB_Extend = "Extend"
+_GCB_Regional_Indicator = "Regional_Indicator"
+_GCB_SpacingMark = "SpacingMark"
+_GCB_L = "L"
+_GCB_V = "V"
+_GCB_T = "T"
+_GCB_LV = "LV"
+_GCB_LVT = "LVT"
+_GCB_ZWJ = "ZWJ"
+
+# Indic_Conjunct_Break property values.
+# These strings are compared with the value that
+# unicodedata.indic_conjunct_break() returns for each character.
+_InCB_None = "None"
+_InCB_Linker = "Linker"
+_InCB_Consonant = "Consonant"
+_InCB_Extend = "Extend"
+
+# Extended Pictographic FSM states (for GB11).
+# Transitions are implemented by _update_ext_pict_state(); the FSM follows
+# the pattern  ExtPict Extend* ZWJ ExtPict.
+_EP_INIT = 0     # not inside a candidate sequence
+_EP_STARTED = 1  # seen ExtPict, optionally followed by Extend characters
+_EP_ZWJ = 2      # ... followed by a ZWJ
+_EP_MATCHED = 3  # ... followed by ExtPict: no break before this character
+
+# Indic Conjunct Break FSM states (for GB9c).
+# Transitions are implemented by _update_incb_state(); the FSM follows the
+# pattern  Consonant {Extend|Linker}* Linker {Extend|Linker}* Consonant.
+_INCB_INIT = 0     # not inside a candidate conjunct
+_INCB_STARTED = 1  # seen a Consonant, but no Linker after it yet
+_INCB_LINKER = 2   # seen at least one Linker since the Consonant
+_INCB_MATCHED = 3  # Consonant after the Linker: no break before it
+
+
+def _update_ext_pict_state(state, gcb, ext_pict):
+    """Advance the GB11 (emoji ZWJ sequence) state machine by one character.
+
+    *state* is the current _EP_* state, *gcb* the character's
+    Grapheme_Cluster_Break value and *ext_pict* is true if the character
+    is Extended_Pictographic.  Returns the new _EP_* state.
+    """
+    if ext_pict:
+        # A pictograph directly after a ZWJ completes the pattern; anywhere
+        # else it starts a new candidate sequence.
+        if state == _EP_ZWJ:
+            return _EP_MATCHED
+        return _EP_STARTED
+
+    if state in (_EP_STARTED, _EP_MATCHED):
+        # After a pictograph only Extend or ZWJ keep the sequence alive.
+        if gcb == _GCB_ZWJ:
+            return _EP_ZWJ
+        if gcb == _GCB_Extend:
+            return _EP_STARTED
+
+    # Anything else (including any non-pictograph after the ZWJ) resets.
+    return _EP_INIT
+
+
+def _update_incb_state(state, incb):
+    """Advance the GB9c (Indic conjunct) state machine by one character.
+
+    *state* is the current _INCB_* state and *incb* the character's
+    Indic_Conjunct_Break value.  Returns the new _INCB_* state.
+    """
+    if incb == _InCB_Consonant:
+        # A consonant completes the conjunct if a Linker has been seen,
+        # otherwise it starts a new candidate.
+        if state == _INCB_LINKER:
+            return _INCB_MATCHED
+        return _INCB_STARTED
+
+    if state == _INCB_INIT:
+        # Extend and Linker mean nothing without a preceding consonant.
+        return _INCB_INIT
+
+    if incb == _InCB_Linker:
+        return _INCB_LINKER
+    if incb == _InCB_Extend:
+        # Extend never loses a Linker that was already seen.
+        return _INCB_LINKER if state == _INCB_LINKER else _INCB_STARTED
+    return _INCB_INIT
+
+
+def _grapheme_break(prev_gcb, curr_gcb, ep_state, ri_flag, incb_state):
+    """Return True if a grapheme cluster break occurs between two characters.
+
+    *prev_gcb* and *curr_gcb* are the Grapheme_Cluster_Break values of the
+    characters before and after the candidate boundary.  *ep_state* and
+    *incb_state* are the GB11 and GB9c state machine states; they are
+    compared with their MATCHED values, so they must already account for
+    the current character.  *ri_flag* is the answer returned when both
+    characters are Regional_Indicator.
+
+    The rules are tried in TR29 order; the first one that applies decides.
+    """
+    controls = (_GCB_CR, _GCB_LF, _GCB_Control)
+
+    # GB3: Do not break between a CR and LF.
+    if (prev_gcb, curr_gcb) == (_GCB_CR, _GCB_LF):
+        return False
+
+    # GB4, GB5: Break after and before controls.
+    if prev_gcb in controls or curr_gcb in controls:
+        return True
+
+    # GB6, GB7, GB8: Do not break Hangul syllable sequences.  Which classes
+    # may continue the syllable depends on the preceding character.
+    if prev_gcb == _GCB_L:
+        continuations = (_GCB_L, _GCB_V, _GCB_LV, _GCB_LVT)
+    elif prev_gcb in (_GCB_LV, _GCB_V):
+        continuations = (_GCB_V, _GCB_T)
+    elif prev_gcb in (_GCB_LVT, _GCB_T):
+        continuations = (_GCB_T,)
+    else:
+        continuations = ()
+    if curr_gcb in continuations:
+        return False
+
+    # GB9, GB9a: Do not break before extending characters, ZWJ or
+    # SpacingMarks.
+    if curr_gcb in (_GCB_Extend, _GCB_ZWJ, _GCB_SpacingMark):
+        return False
+
+    # GB9b: Do not break after Prepend characters.
+    if prev_gcb == _GCB_Prepend:
+        return False
+
+    # GB9c, GB11: Do not break within Indic conjunct clusters or emoji ZWJ
+    # sequences.
+    if incb_state == _INCB_MATCHED or ep_state == _EP_MATCHED:
+        return False
+
+    # GB12/GB13: Do not break within emoji flag sequences.
+    if prev_gcb == curr_gcb == _GCB_Regional_Indicator:
+        return ri_flag
+
+    # GB999: Otherwise, break everywhere.
+    return True
+
+
+def iter_graphemes(string, start=0, end=sys.maxsize):
+    """Iterate over grapheme clusters in a string.
+
+    Uses extended grapheme cluster rules from TR29.
+
+    *start* and *end* select the part of *string* to segment.  Negative
+    values count from the end of the string, an *end* beyond the string is
+    treated as its length, and an empty or inverted range yields nothing.
+
+    Returns an iterator yielding Segment objects with start/end attributes
+    and str() support.
+
+    Raises TypeError if *string* is not a str, or if *start* or *end* is
+    not an integer (an object implementing __index__()).
+    """
+    if not isinstance(string, str):
+        # The argument is a whole string, not a single character.
+        raise TypeError(
+            "iter_graphemes() argument 1 must be str, not "
+            f"{type(string).__name__}"
+        )
+
+    # Validate the indices eagerly.  Otherwise a float *end* survives the
+    # clamping below, drives the scan loop and ends up in Segment.end,
+    # where it only fails once str() is called on the segment.
+    start = operator.index(start)
+    end = operator.index(end)
+
+    length = len(string)
+    # Adjust indices (matching CPython's ADJUST_INDICES macro)
+    if end > length:
+        end = length
+    if end < 0:
+        end += length
+        if end < 0:
+            end = 0
+    if start < 0:
+        start += length
+        if start < 0:
+            start = 0
+
+    return _iter_grapheme_clusters(string, start, end)
+
+
+def _iter_grapheme_clusters(string, start, end):
+    """Generate a Segment for each grapheme cluster in string[start:end].
+
+    *start* and *end* are expected to be already adjusted, non-negative
+    indices.  Scanning begins in the initial state at *start*; characters
+    before *start* are never looked at.
+    """
+    prev_gcb = _GCB_Other
+    ep_state = _EP_INIT
+    incb_state = _INCB_INIT
+    ri_flag = False
+
+    boundary = index = start
+    while index < end:
+        char = string[index]
+        curr_gcb = unicodedata.grapheme_cluster_break(char)
+        is_ext_pict = unicodedata.extended_pictographic(char)
+        incb = unicodedata.indic_conjunct_break(char)
+
+        # Bring the per-rule state up to date *before* asking whether there
+        # is a break in front of this character: _grapheme_break() checks
+        # states that already include it.
+        ep_state = _update_ext_pict_state(ep_state, curr_gcb, is_ext_pict)
+        # Toggles on each consecutive Regional_Indicator, resets otherwise.
+        ri_flag = curr_gcb == _GCB_Regional_Indicator and not ri_flag
+        incb_state = _update_incb_state(incb_state, incb)
+
+        # No break is ever reported in front of the first character.
+        if index != boundary and _grapheme_break(
+            prev_gcb, curr_gcb, ep_state, ri_flag, incb_state
+        ):
+            yield Segment(string, boundary, index)
+            boundary = index
+
+        prev_gcb = curr_gcb
+        index += 1
+
+    # Emit the final cluster, unless the range was empty.
+    if boundary < end:
+        yield Segment(string, boundary, end)
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
@@ -1300,16 +1300,103 @@ class MyStr(str):
                     self.assertIs(type(normalize(form, MyStr(input_str))), str)
 
 
-class GraphemeBreakTest(unittest.TestCase):
+class BaseGraphemeBreakTest:
+    iter_graphemes = staticmethod(unicodedata.iter_graphemes)
+
+    def test_grapheme_break_types(self):
+        # The string argument is required ...
+        with self.assertRaises(TypeError):
+            self.iter_graphemes()
+        # ... and must be a str, not bytes.
+        with self.assertRaises(TypeError):
+            self.iter_graphemes(b'x')
+
+    def test_grapheme_break_empty(self):
+        # An empty string contains no grapheme clusters.
+        self.assertEqual(self._graphemes(''), [])
+
+    def test_grapheme_break_simple(self):
+        # In plain ASCII text every character is its own cluster, so the
+        # results below only depend on how start/end are interpreted.
+        def check(expected, *bounds):
+            self.assertEqual(self._graphemes('abcd', *bounds), expected)
+
+        check(['a', 'b', 'c', 'd'])
+        check(['b', 'c', 'd'], 1)
+        check(['b', 'c'], 1, 3)
+        # Negative indices count from the end of the string.
+        check(['b', 'c', 'd'], -3)
+        check(['b', 'c'], 1, -1)
+        # An inverted range or a start past the string yields nothing.
+        check([], 3, 1)
+        check([], 5)
+        # Out-of-range indices are clamped.
+        check(['a', 'b', 'c', 'd'], 0, 5)
+        check(['a', 'b', 'c', 'd'], -5)
+        check([], 0, -5)
+
+    def test_grapheme_break_rules(self):
+        # One or more hand-picked examples for each TR29 grapheme cluster
+        # boundary rule.  Each expected list is the input split at the
+        # boundaries the rule requires.
+        graphemes = self._graphemes
+        # GB3: Do not break between a CR and LF.
+        self.assertEqual(graphemes('\r\n'), ['\r\n'])
+        # GB4: Break after controls (even before a combining mark).
+        self.assertEqual(graphemes('\r\u0308'), ['\r', '\u0308'])
+        self.assertEqual(graphemes('\n\u0308'), ['\n', '\u0308'])
+        self.assertEqual(graphemes('\0\u0308'), ['\0', '\u0308'])
+        # GB5: Break before controls (even after a Prepend character).
+        self.assertEqual(graphemes('\u06dd\r'), ['\u06dd', '\r'])
+        self.assertEqual(graphemes('\u06dd\n'), ['\u06dd', '\n'])
+        self.assertEqual(graphemes('\u06dd\0'), ['\u06dd', '\0'])
+        # GB6: Do not break Hangul syllable sequences (after L).
+        self.assertEqual(graphemes('\u1100\u1160'), ['\u1100\u1160'])
+        self.assertEqual(graphemes('\u1100\uAC00'), ['\u1100\uAC00'])
+        self.assertEqual(graphemes('\u1100\uAC01'), ['\u1100\uAC01'])
+        # GB7: Do not break Hangul syllable sequences (after LV or V).
+        self.assertEqual(graphemes('\uAC00\u1160'), ['\uAC00\u1160'])
+        self.assertEqual(graphemes('\uAC00\u11A8'), ['\uAC00\u11A8'])
+        self.assertEqual(graphemes('\u1160\u1160'), ['\u1160\u1160'])
+        self.assertEqual(graphemes('\u1160\u11A8'), ['\u1160\u11A8'])
+        # GB8: Do not break Hangul syllable sequences (after LVT or T).
+        self.assertEqual(graphemes('\uAC01\u11A8'), ['\uAC01\u11A8'])
+        self.assertEqual(graphemes('\u11A8\u11A8'), ['\u11A8\u11A8'])
+        # GB9: Do not break before extending characters or ZWJ.
+        self.assertEqual(graphemes('a\u0300'), ['a\u0300'])
+        self.assertEqual(graphemes('a\u200D'), ['a\u200D'])
+        # GB9a: Do not break before SpacingMarks.
+        self.assertEqual(graphemes('\u0905\u0903'), ['\u0905\u0903'])
+        # GB9b: Do not break after Prepend characters.
+        self.assertEqual(graphemes('\u06dd\u0661'), ['\u06dd\u0661'])
+        # GB9c: Do not break within Indic conjunct clusters: a single
+        # linker, a doubled linker, and a chain of three consonants.
+        self.assertEqual(graphemes('\u0915\u094d\u0924'),
+                         ['\u0915\u094d\u0924'])
+        self.assertEqual(graphemes('\u0915\u094D\u094D\u0924'),
+                         ['\u0915\u094D\u094D\u0924'])
+        self.assertEqual(graphemes('\u0915\u094D\u0924\u094D\u092F'),
+                         ['\u0915\u094D\u0924\u094D\u092F'])
+        # GB11: Do not break within emoji ZWJ sequences.  The whole input
+        # must come out as one cluster.
+        self.assertEqual(graphemes(
+                '\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F'
+                '\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC'),
+                ['\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F'
+                '\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC'])
+        # GB12: Regional indicators pair up into flags from the start of
+        # the text.
+        self.assertEqual(graphemes(
+            '\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'),
+            ['\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'])
+        # GB13: The same pairing applies after a non-RI character.
+        self.assertEqual(graphemes(
+            'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'),
+            ['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'])
+
+    def test_segment_object(self):
+        # 's', 'p', 'a' + U+0300, 'm': the third cluster spans two code
+        # points, so its start/end and text can be told apart from the
+        # single-character clusters around it.
+        clusters = list(self.iter_graphemes('spa\u0300m'))
+        self.assertEqual(len(clusters), 4, clusters)
+        combined = clusters[2]
+        self.assertEqual(combined.start, 2)
+        self.assertEqual(combined.end, 4)
+        self.assertEqual(str(combined), 'a\u0300')
+
+    def _graphemes(self, *args):
+        # Helper: run iter_graphemes(*args) and return the text of every
+        # segment, so results can be compared with plain lists of str.
+        return [str(segment) for segment in self.iter_graphemes(*args)]
+
     @requires_resource('network')
-    def test_grapheme_break(self):
+    def test_tr29_conformance(self):
         TESTDATAFILE = "GraphemeBreakTest.txt"
         testdata = download_test_data_file(TESTDATAFILE)
 
         with testdata:
-            self.run_grapheme_break_tests(testdata)
+            self._run_grapheme_break_tests(testdata)
 
-    def run_grapheme_break_tests(self, testdata):
+    def _run_grapheme_break_tests(self, testdata):
         for line in testdata:
             line, _, comment = line.partition('#')
             line = line.strip()
@@ -1330,19 +1417,32 @@ def run_grapheme_break_tests(self, testdata):
             self.assertEqual(chunks.pop(), '', line)
             input = ''.join(chunks)
             with self.subTest(line):
-                result = list(unicodedata.iter_graphemes(input))
+                result = list(self.iter_graphemes(input))
                 self.assertEqual(list(map(str, result)), chunks, comment)
-                self.assertEqual([x.start for x in result], breaks[:-1], comment)
-                self.assertEqual([x.end for x in result], breaks[1:], comment)
+                self.assertEqual([x.start for x in result],
+                                 breaks[:-1], comment)
+                self.assertEqual([x.end for x in result],
+                                 breaks[1:], comment)
                 for i in range(1, len(breaks) - 1):
-                    result = list(unicodedata.iter_graphemes(input, breaks[i]))
-                    self.assertEqual(list(map(str, result)), chunks[i:], comment)
-                    self.assertEqual([x.start for x in result], breaks[i:-1], comment)
-                    self.assertEqual([x.end for x in result], breaks[i+1:], comment)
+                    result = list(self.iter_graphemes(input, breaks[i]))
+                    self.assertEqual(list(map(str, result)),
+                                     chunks[i:], comment)
+                    self.assertEqual([x.start for x in result],
+                                     breaks[i:-1], comment)
+                    self.assertEqual([x.end for x in result],
+                                     breaks[i+1:], comment)
+
+
+class GraphemeBreakTest(unittest.TestCase, BaseGraphemeBreakTest):
+    iter_graphemes = staticmethod(unicodedata.iter_graphemes)
+
+    def test_segment_repr(self):
+        segments = list(unicodedata.iter_graphemes('spa\u0300m'))
+        segment = segments[2]
+        self.assertEqual(repr(segment), '<Segment 2:4>')
+        # A Segment supports neither iter() nor len().
+        with self.assertRaises(TypeError):
+            iter(segment)
+        with self.assertRaises(TypeError):
+            len(segment)
 
     def test_reference_loops(self):
-        # Test that reference loops involving GraphemeBreakIterator or
-        # Segment can be broken by the garbage collector.
         class S(str):
             pass
 
@@ -1363,5 +1463,12 @@ class S(str):
         self.assertIsNone(wr())
 
 
+class PyGraphemeBreakTest(unittest.TestCase, BaseGraphemeBreakTest):
+    # Runs the shared BaseGraphemeBreakTest tests against the pure Python
+    # iter_graphemes() from _py_grapheme.
+
+    @classmethod
+    def setUpClass(cls):
+        from _py_grapheme import iter_graphemes as py_iter_graphemes
+        # staticmethod() keeps the function from being bound to the test
+        # instance when accessed as self.iter_graphemes.
+        cls.iter_graphemes = staticmethod(py_iter_graphemes)
+
+
 if __name__ == "__main__":
     unittest.main()