11"""Pure Python implementation of unicodedata.iter_graphemes().
22
33Uses the extended grapheme cluster rules from Unicode TR29.
4+
5+ Property tables are in _py_grapheme_db.py, generated by
6+ Tools/unicode/makeunicodedata.py from the Unicode Character Database.
47"""
58
69import sys
7- import unicodedata
10+ from bisect import bisect_right
11+
12+ from _py_grapheme_db import (
13+ GCB_RANGES ,
14+ GCB_Other , GCB_Prepend , GCB_CR , GCB_LF , GCB_Control ,
15+ GCB_Extend , GCB_Regional_Indicator , GCB_SpacingMark ,
16+ GCB_L , GCB_V , GCB_T , GCB_LV , GCB_LVT , GCB_ZWJ ,
17+ EXT_PICT_RANGES ,
18+ INCB_RANGES ,
19+ InCB_None , InCB_Linker , InCB_Consonant , InCB_Extend ,
20+ )
821
922
1023class Segment :
@@ -24,28 +37,6 @@ def __repr__(self):
2437 return f"<Segment { self .start } :{ self .end } >"
2538
2639
27- # Grapheme_Cluster_Break property values (matching C #defines)
28- _GCB_Other = "Other"
29- _GCB_Prepend = "Prepend"
30- _GCB_CR = "CR"
31- _GCB_LF = "LF"
32- _GCB_Control = "Control"
33- _GCB_Extend = "Extend"
34- _GCB_Regional_Indicator = "Regional_Indicator"
35- _GCB_SpacingMark = "SpacingMark"
36- _GCB_L = "L"
37- _GCB_V = "V"
38- _GCB_T = "T"
39- _GCB_LV = "LV"
40- _GCB_LVT = "LVT"
41- _GCB_ZWJ = "ZWJ"
42-
43- # Indic_Conjunct_Break property values
44- _InCB_None = "None"
45- _InCB_Linker = "Linker"
46- _InCB_Consonant = "Consonant"
47- _InCB_Extend = "Extend"
48-
4940# Extended Pictographic FSM states (for GB11)
5041_EP_INIT = 0
5142_EP_STARTED = 1
@@ -58,65 +49,97 @@ def __repr__(self):
5849_INCB_LINKER = 2
5950_INCB_MATCHED = 3
6051
# Arithmetic description of the precomposed Hangul syllable block, used to
# derive LV/LVT without storing one table entry per syllable.
_HANGUL_S_BASE = 0xAC00    # first precomposed syllable
_HANGUL_S_COUNT = 11172    # number of precomposed syllables
_HANGUL_T_COUNT = 28       # syllables per LV group (trailing-consonant slots)

# First codepoint of every range in each property table, extracted once at
# import time so lookups can bisect over a flat tuple of ints.
_GCB_STARTS = tuple(row[0] for row in GCB_RANGES)
_EXT_PICT_STARTS = tuple(row[0] for row in EXT_PICT_RANGES)
_INCB_STARTS = tuple(row[0] for row in INCB_RANGES)
61+
6162
62- def _update_ext_pict_state (state , gcb , ext_pict ):
63- if ext_pict :
64- return _EP_MATCHED if state == _EP_ZWJ else _EP_STARTED
65- if state == _EP_STARTED or state == _EP_MATCHED :
66- if gcb == _GCB_Extend :
67- return _EP_STARTED
68- if gcb == _GCB_ZWJ :
69- return _EP_ZWJ
70- return _EP_INIT
63+ # ---------------------------------------------------------------------------
64+ # Property lookup functions
65+ # ---------------------------------------------------------------------------
7166
def _get_gcb(cp):
    """Look up the Grapheme_Cluster_Break value of codepoint *cp*.

    The range table is consulted first.  A codepoint that no table range
    covers is then checked against the Hangul syllable block, where the
    value is computed arithmetically; anything else is ``GCB_Other``.
    """
    # bisect_right yields how many range starts are <= cp; zero means cp
    # lies before the first range, so there is no candidate to inspect.
    preceding = bisect_right(_GCB_STARTS, cp)
    if preceding:
        row = GCB_RANGES[preceding - 1]
        if row[1] >= cp:
            return row[2]

    # Hangul syllables: every _HANGUL_T_COUNT-th syllable (offset divisible
    # by T_COUNT) is LV, the rest are LVT.
    offset = cp - _HANGUL_S_BASE
    if 0 <= offset < _HANGUL_S_COUNT:
        return GCB_LVT if offset % _HANGUL_T_COUNT else GCB_LV

    return GCB_Other
7280
73- def _update_incb_state (state , incb ):
74- if incb == _InCB_Consonant :
75- return _INCB_MATCHED if state == _INCB_LINKER else _INCB_STARTED
76- if state != _INCB_INIT :
77- if incb == _InCB_Extend :
78- return _INCB_LINKER if state == _INCB_LINKER else _INCB_STARTED
79- if incb == _InCB_Linker :
80- return _INCB_LINKER
81- return _INCB_INIT
8281
def _get_ext_pict(cp):
    """Tell whether codepoint *cp* is Extended_Pictographic.

    Returns True when *cp* falls inside one of the EXT_PICT_RANGES
    entries, False otherwise.
    """
    # Number of range starts <= cp; zero means cp precedes every range.
    preceding = bisect_right(_EXT_PICT_STARTS, cp)
    if not preceding:
        return False
    # The only range that can contain cp is the last one starting at or
    # before it; cp is a member iff it does not exceed that range's end.
    return EXT_PICT_RANGES[preceding - 1][1] >= cp
86+
87+
def _get_incb(cp):
    """Look up the Indic_Conjunct_Break value of codepoint *cp*.

    Returns the value stored with the INCB_RANGES entry containing *cp*,
    or ``InCB_None`` when no entry contains it.
    """
    # Number of range starts <= cp; zero means cp precedes every range.
    preceding = bisect_right(_INCB_STARTS, cp)
    if not preceding:
        return InCB_None
    row = INCB_RANGES[preceding - 1]
    # cp is at or after this range's start; it belongs to the range only
    # if it is also at or before the range's end.
    return row[2] if row[1] >= cp else InCB_None
96+
97+
98+ # ---------------------------------------------------------------------------
99+ # Grapheme break algorithm (TR29)
100+ # ---------------------------------------------------------------------------
83101
84102def _grapheme_break (prev_gcb , curr_gcb , ep_state , ri_flag , incb_state ):
85103 """Return True if a grapheme cluster break occurs between two characters."""
86104 # GB3: Do not break between a CR and LF.
87- if prev_gcb == _GCB_CR and curr_gcb == _GCB_LF :
105+ if prev_gcb == GCB_CR and curr_gcb == GCB_LF :
88106 return False
89107
90108 # GB4: Break after controls.
91- if prev_gcb in ( _GCB_CR , _GCB_LF , _GCB_Control ) :
109+ if prev_gcb == GCB_CR or prev_gcb == GCB_LF or prev_gcb == GCB_Control :
92110 return True
93111
94112 # GB5: Break before controls.
95- if curr_gcb in ( _GCB_CR , _GCB_LF , _GCB_Control ) :
113+ if curr_gcb == GCB_CR or curr_gcb == GCB_LF or curr_gcb == GCB_Control :
96114 return True
97115
98116 # GB6: Do not break Hangul syllable sequences (L).
99- if prev_gcb == _GCB_L and curr_gcb in (_GCB_L , _GCB_V , _GCB_LV , _GCB_LVT ):
117+ if prev_gcb == GCB_L and (
118+ curr_gcb == GCB_L or curr_gcb == GCB_V
119+ or curr_gcb == GCB_LV or curr_gcb == GCB_LVT
120+ ):
100121 return False
101122
102123 # GB7: Do not break Hangul syllable sequences (LV, V).
103- if prev_gcb in (_GCB_LV , _GCB_V ) and curr_gcb in (_GCB_V , _GCB_T ):
124+ if (prev_gcb == GCB_LV or prev_gcb == GCB_V ) and (
125+ curr_gcb == GCB_V or curr_gcb == GCB_T
126+ ):
104127 return False
105128
106129 # GB8: Do not break Hangul syllable sequences (LVT, T).
107- if prev_gcb in ( _GCB_LVT , _GCB_T ) and curr_gcb == _GCB_T :
130+ if ( prev_gcb == GCB_LVT or prev_gcb == GCB_T ) and curr_gcb == GCB_T :
108131 return False
109132
110133 # GB9: Do not break before extending characters or ZWJ.
111- if curr_gcb in ( _GCB_Extend , _GCB_ZWJ ) :
134+ if curr_gcb == GCB_Extend or curr_gcb == GCB_ZWJ :
112135 return False
113136
114137 # GB9a: Do not break before SpacingMarks.
115- if curr_gcb == _GCB_SpacingMark :
138+ if curr_gcb == GCB_SpacingMark :
116139 return False
117140
118141 # GB9b: Do not break after Prepend characters.
119- if prev_gcb == _GCB_Prepend :
142+ if prev_gcb == GCB_Prepend :
120143 return False
121144
122145 # GB9c: Do not break within Indic conjunct clusters.
@@ -128,13 +151,17 @@ def _grapheme_break(prev_gcb, curr_gcb, ep_state, ri_flag, incb_state):
128151 return False
129152
130153 # GB12/GB13: Do not break within emoji flag sequences.
131- if prev_gcb == _GCB_Regional_Indicator and curr_gcb == _GCB_Regional_Indicator :
154+ if prev_gcb == GCB_Regional_Indicator and curr_gcb == GCB_Regional_Indicator :
132155 return ri_flag
133156
134157 # GB999: Otherwise, break everywhere.
135158 return True
136159
137160
161+ # ---------------------------------------------------------------------------
162+ # Public API
163+ # ---------------------------------------------------------------------------
164+
138165def iter_graphemes (string , / , start = 0 , end = sys .maxsize ):
139166 """Iterate over grapheme clusters in a string.
140167
@@ -150,7 +177,6 @@ def iter_graphemes(string, /, start=0, end=sys.maxsize):
150177 )
151178
152179 length = len (string )
153- # Adjust indices (matching CPython's ADJUST_INDICES macro)
154180 if end > length :
155181 end = length
156182 if end < 0 :
@@ -166,22 +192,51 @@ def iter_graphemes(string, /, start=0, end=sys.maxsize):
166192
167193
168194def _iter_grapheme_clusters (string , start , end ):
169- gcb = _GCB_Other
195+ gcb = GCB_Other
170196 ep_state = _EP_INIT
171197 incb_state = _INCB_INIT
172198 ri_flag = False
173199
174200 cluster_start = start
175201 pos = start
176202 while pos < end :
177- ch = string [pos ]
178- curr_gcb = unicodedata .grapheme_cluster_break (ch )
179- ext_pict = unicodedata .extended_pictographic (ch )
180- incb = unicodedata .indic_conjunct_break (ch )
181-
182- ep_state = _update_ext_pict_state (ep_state , curr_gcb , ext_pict )
183- ri_flag = (not ri_flag ) if curr_gcb == _GCB_Regional_Indicator else False
184- incb_state = _update_incb_state (incb_state , incb )
203+ cp = ord (string [pos ])
204+ curr_gcb = _get_gcb (cp )
205+
206+ # Update Extended Pictographic FSM (GB11)
207+ ext_pict = _get_ext_pict (cp )
208+ if ext_pict :
209+ ep_state = _EP_MATCHED if ep_state == _EP_ZWJ else _EP_STARTED
210+ elif ep_state == _EP_STARTED or ep_state == _EP_MATCHED :
211+ if curr_gcb == GCB_Extend :
212+ ep_state = _EP_STARTED
213+ elif curr_gcb == GCB_ZWJ :
214+ ep_state = _EP_ZWJ
215+ else :
216+ ep_state = _EP_INIT
217+ else :
218+ ep_state = _EP_INIT
219+
220+ # Update Regional Indicator flag (GB12/GB13)
221+ ri_flag = not ri_flag if curr_gcb == GCB_Regional_Indicator else False
222+
223+ # Update Indic Conjunct Break FSM (GB9c)
224+ curr_incb = _get_incb (cp )
225+ if curr_incb == InCB_Consonant :
226+ incb_state = (
227+ _INCB_MATCHED if incb_state == _INCB_LINKER else _INCB_STARTED
228+ )
229+ elif incb_state != _INCB_INIT :
230+ if curr_incb == InCB_Extend :
231+ incb_state = (
232+ _INCB_LINKER if incb_state == _INCB_LINKER else _INCB_STARTED
233+ )
234+ elif curr_incb == InCB_Linker :
235+ incb_state = _INCB_LINKER
236+ else :
237+ incb_state = _INCB_INIT
238+ else :
239+ incb_state = _INCB_INIT
185240
186241 prev_gcb = gcb
187242 gcb = curr_gcb
0 commit comments