Skip to content

Commit 70bdb56

Browse files
ambv and claude committed
Make _py_grapheme standalone by generating property tables
Add makegraphemedata() to Tools/unicode/makeunicodedata.py that generates Lib/_py_grapheme_db.py from the Unicode data files (GraphemeBreakProperty.txt, emoji-data.txt, DerivedCoreProperties.txt). _py_grapheme.py now imports property tables from _py_grapheme_db and uses bisect for lookups instead of calling unicodedata functions added in 3.15. This makes the module usable on Python 3.13 and 3.14 by regenerating the tables for the appropriate Unicode version. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 38db422 commit 70bdb56

File tree

4 files changed

+590
-61
lines changed

4 files changed

+590
-61
lines changed

Lib/_py_grapheme.py

Lines changed: 116 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,23 @@
11
"""Pure Python implementation of unicodedata.iter_graphemes().
22
33
Uses the extended grapheme cluster rules from Unicode TR29.
4+
5+
Property tables are in _py_grapheme_db.py, generated by
6+
Tools/unicode/makeunicodedata.py from the Unicode Character Database.
47
"""
58

69
import sys
7-
import unicodedata
10+
from bisect import bisect_right
11+
12+
from _py_grapheme_db import (
13+
GCB_RANGES,
14+
GCB_Other, GCB_Prepend, GCB_CR, GCB_LF, GCB_Control,
15+
GCB_Extend, GCB_Regional_Indicator, GCB_SpacingMark,
16+
GCB_L, GCB_V, GCB_T, GCB_LV, GCB_LVT, GCB_ZWJ,
17+
EXT_PICT_RANGES,
18+
INCB_RANGES,
19+
InCB_None, InCB_Linker, InCB_Consonant, InCB_Extend,
20+
)
821

922

1023
class Segment:
@@ -24,28 +37,6 @@ def __repr__(self):
2437
return f"<Segment {self.start}:{self.end}>"
2538

2639

27-
# Grapheme_Cluster_Break property values (matching C #defines)
28-
_GCB_Other = "Other"
29-
_GCB_Prepend = "Prepend"
30-
_GCB_CR = "CR"
31-
_GCB_LF = "LF"
32-
_GCB_Control = "Control"
33-
_GCB_Extend = "Extend"
34-
_GCB_Regional_Indicator = "Regional_Indicator"
35-
_GCB_SpacingMark = "SpacingMark"
36-
_GCB_L = "L"
37-
_GCB_V = "V"
38-
_GCB_T = "T"
39-
_GCB_LV = "LV"
40-
_GCB_LVT = "LVT"
41-
_GCB_ZWJ = "ZWJ"
42-
43-
# Indic_Conjunct_Break property values
44-
_InCB_None = "None"
45-
_InCB_Linker = "Linker"
46-
_InCB_Consonant = "Consonant"
47-
_InCB_Extend = "Extend"
48-
4940
# Extended Pictographic FSM states (for GB11)
5041
_EP_INIT = 0
5142
_EP_STARTED = 1
@@ -58,65 +49,97 @@ def __repr__(self):
5849
_INCB_LINKER = 2
5950
_INCB_MATCHED = 3
6051

52+
# Hangul syllable arithmetic used by _get_gcb(): code points in
# [S_BASE, S_BASE + S_COUNT) that miss the GCB table are classified as LV
# when their offset from S_BASE is a multiple of T_COUNT, and LVT otherwise.
_HANGUL_S_BASE = 0xAC00
_HANGUL_S_COUNT = 11172
_HANGUL_T_COUNT = 28

# First column of each range table, pulled out into flat tuples so the
# lookup functions can hand them straight to bisect_right().
# NOTE(review): bisect needs these in ascending order -- assumes the
# generated tables in _py_grapheme_db are emitted sorted by range start.
_GCB_STARTS = tuple(entry[0] for entry in GCB_RANGES)
_EXT_PICT_STARTS = tuple(entry[0] for entry in EXT_PICT_RANGES)
_INCB_STARTS = tuple(entry[0] for entry in INCB_RANGES)
61+
6162

62-
def _update_ext_pict_state(state, gcb, ext_pict):
63-
if ext_pict:
64-
return _EP_MATCHED if state == _EP_ZWJ else _EP_STARTED
65-
if state == _EP_STARTED or state == _EP_MATCHED:
66-
if gcb == _GCB_Extend:
67-
return _EP_STARTED
68-
if gcb == _GCB_ZWJ:
69-
return _EP_ZWJ
70-
return _EP_INIT
63+
# ---------------------------------------------------------------------------
64+
# Property lookup functions
65+
# ---------------------------------------------------------------------------
7166

67+
def _get_gcb(cp):
    """Look up the Grapheme_Cluster_Break value of the code point *cp*.

    *cp* is an integer code point.  The range table is searched first; each
    row is indexed as ``[0]`` range start, ``[1]`` inclusive range end and
    ``[2]`` property value.  A code point that misses the table but lies in
    the Hangul syllable range is classified arithmetically as ``GCB_LV`` or
    ``GCB_LVT``.  Everything else is ``GCB_Other``.
    """
    # Number of rows whose start is <= cp; zero means cp precedes every row.
    row_count = bisect_right(_GCB_STARTS, cp)
    if row_count:
        candidate = GCB_RANGES[row_count - 1]
        if candidate[1] >= cp:
            return candidate[2]

    # Offset into the Hangul syllable range; out-of-range offsets (including
    # negative ones) mean cp is not a precomposed syllable.
    syllable_offset = cp - _HANGUL_S_BASE
    if not 0 <= syllable_offset < _HANGUL_S_COUNT:
        return GCB_Other

    # An offset that is a whole multiple of T_COUNT is LV, otherwise LVT.
    return GCB_LVT if syllable_offset % _HANGUL_T_COUNT else GCB_LV
7280

73-
def _update_incb_state(state, incb):
74-
if incb == _InCB_Consonant:
75-
return _INCB_MATCHED if state == _INCB_LINKER else _INCB_STARTED
76-
if state != _INCB_INIT:
77-
if incb == _InCB_Extend:
78-
return _INCB_LINKER if state == _INCB_LINKER else _INCB_STARTED
79-
if incb == _InCB_Linker:
80-
return _INCB_LINKER
81-
return _INCB_INIT
8281

82+
def _get_ext_pict(cp):
    """Tell whether the code point *cp* is Extended_Pictographic.

    Returns True when *cp* falls inside one of the ``EXT_PICT_RANGES`` rows
    (``[0]`` range start, ``[1]`` inclusive range end), False otherwise.
    """
    # Number of rows whose start is <= cp; zero means cp precedes every row.
    row_count = bisect_right(_EXT_PICT_STARTS, cp)
    if row_count == 0:
        return False
    return EXT_PICT_RANGES[row_count - 1][1] >= cp
86+
87+
88+
def _get_incb(cp):
    """Look up the Indic_Conjunct_Break value of the code point *cp*.

    Searches ``INCB_RANGES`` (rows indexed as ``[0]`` range start, ``[1]``
    inclusive range end, ``[2]`` property value) and falls back to
    ``InCB_None`` when no row covers *cp*.
    """
    # Number of rows whose start is <= cp; zero means cp precedes every row.
    row_count = bisect_right(_INCB_STARTS, cp)
    if row_count == 0:
        return InCB_None

    candidate = INCB_RANGES[row_count - 1]
    if candidate[1] < cp:
        # cp sits in the gap after the nearest preceding range.
        return InCB_None
    return candidate[2]
96+
97+
98+
# ---------------------------------------------------------------------------
99+
# Grapheme break algorithm (TR29)
100+
# ---------------------------------------------------------------------------
83101

84102
def _grapheme_break(prev_gcb, curr_gcb, ep_state, ri_flag, incb_state):
85103
"""Return True if a grapheme cluster break occurs between two characters."""
86104
# GB3: Do not break between a CR and LF.
87-
if prev_gcb == _GCB_CR and curr_gcb == _GCB_LF:
105+
if prev_gcb == GCB_CR and curr_gcb == GCB_LF:
88106
return False
89107

90108
# GB4: Break after controls.
91-
if prev_gcb in (_GCB_CR, _GCB_LF, _GCB_Control):
109+
if prev_gcb == GCB_CR or prev_gcb == GCB_LF or prev_gcb == GCB_Control:
92110
return True
93111

94112
# GB5: Break before controls.
95-
if curr_gcb in (_GCB_CR, _GCB_LF, _GCB_Control):
113+
if curr_gcb == GCB_CR or curr_gcb == GCB_LF or curr_gcb == GCB_Control:
96114
return True
97115

98116
# GB6: Do not break Hangul syllable sequences (L).
99-
if prev_gcb == _GCB_L and curr_gcb in (_GCB_L, _GCB_V, _GCB_LV, _GCB_LVT):
117+
if prev_gcb == GCB_L and (
118+
curr_gcb == GCB_L or curr_gcb == GCB_V
119+
or curr_gcb == GCB_LV or curr_gcb == GCB_LVT
120+
):
100121
return False
101122

102123
# GB7: Do not break Hangul syllable sequences (LV, V).
103-
if prev_gcb in (_GCB_LV, _GCB_V) and curr_gcb in (_GCB_V, _GCB_T):
124+
if (prev_gcb == GCB_LV or prev_gcb == GCB_V) and (
125+
curr_gcb == GCB_V or curr_gcb == GCB_T
126+
):
104127
return False
105128

106129
# GB8: Do not break Hangul syllable sequences (LVT, T).
107-
if prev_gcb in (_GCB_LVT, _GCB_T) and curr_gcb == _GCB_T:
130+
if (prev_gcb == GCB_LVT or prev_gcb == GCB_T) and curr_gcb == GCB_T:
108131
return False
109132

110133
# GB9: Do not break before extending characters or ZWJ.
111-
if curr_gcb in (_GCB_Extend, _GCB_ZWJ):
134+
if curr_gcb == GCB_Extend or curr_gcb == GCB_ZWJ:
112135
return False
113136

114137
# GB9a: Do not break before SpacingMarks.
115-
if curr_gcb == _GCB_SpacingMark:
138+
if curr_gcb == GCB_SpacingMark:
116139
return False
117140

118141
# GB9b: Do not break after Prepend characters.
119-
if prev_gcb == _GCB_Prepend:
142+
if prev_gcb == GCB_Prepend:
120143
return False
121144

122145
# GB9c: Do not break within Indic conjunct clusters.
@@ -128,13 +151,17 @@ def _grapheme_break(prev_gcb, curr_gcb, ep_state, ri_flag, incb_state):
128151
return False
129152

130153
# GB12/GB13: Do not break within emoji flag sequences.
131-
if prev_gcb == _GCB_Regional_Indicator and curr_gcb == _GCB_Regional_Indicator:
154+
if prev_gcb == GCB_Regional_Indicator and curr_gcb == GCB_Regional_Indicator:
132155
return ri_flag
133156

134157
# GB999: Otherwise, break everywhere.
135158
return True
136159

137160

161+
# ---------------------------------------------------------------------------
162+
# Public API
163+
# ---------------------------------------------------------------------------
164+
138165
def iter_graphemes(string, /, start=0, end=sys.maxsize):
139166
"""Iterate over grapheme clusters in a string.
140167
@@ -150,7 +177,6 @@ def iter_graphemes(string, /, start=0, end=sys.maxsize):
150177
)
151178

152179
length = len(string)
153-
# Adjust indices (matching CPython's ADJUST_INDICES macro)
154180
if end > length:
155181
end = length
156182
if end < 0:
@@ -166,22 +192,51 @@ def iter_graphemes(string, /, start=0, end=sys.maxsize):
166192

167193

168194
def _iter_grapheme_clusters(string, start, end):
169-
gcb = _GCB_Other
195+
gcb = GCB_Other
170196
ep_state = _EP_INIT
171197
incb_state = _INCB_INIT
172198
ri_flag = False
173199

174200
cluster_start = start
175201
pos = start
176202
while pos < end:
177-
ch = string[pos]
178-
curr_gcb = unicodedata.grapheme_cluster_break(ch)
179-
ext_pict = unicodedata.extended_pictographic(ch)
180-
incb = unicodedata.indic_conjunct_break(ch)
181-
182-
ep_state = _update_ext_pict_state(ep_state, curr_gcb, ext_pict)
183-
ri_flag = (not ri_flag) if curr_gcb == _GCB_Regional_Indicator else False
184-
incb_state = _update_incb_state(incb_state, incb)
203+
cp = ord(string[pos])
204+
curr_gcb = _get_gcb(cp)
205+
206+
# Update Extended Pictographic FSM (GB11)
207+
ext_pict = _get_ext_pict(cp)
208+
if ext_pict:
209+
ep_state = _EP_MATCHED if ep_state == _EP_ZWJ else _EP_STARTED
210+
elif ep_state == _EP_STARTED or ep_state == _EP_MATCHED:
211+
if curr_gcb == GCB_Extend:
212+
ep_state = _EP_STARTED
213+
elif curr_gcb == GCB_ZWJ:
214+
ep_state = _EP_ZWJ
215+
else:
216+
ep_state = _EP_INIT
217+
else:
218+
ep_state = _EP_INIT
219+
220+
# Update Regional Indicator flag (GB12/GB13)
221+
ri_flag = not ri_flag if curr_gcb == GCB_Regional_Indicator else False
222+
223+
# Update Indic Conjunct Break FSM (GB9c)
224+
curr_incb = _get_incb(cp)
225+
if curr_incb == InCB_Consonant:
226+
incb_state = (
227+
_INCB_MATCHED if incb_state == _INCB_LINKER else _INCB_STARTED
228+
)
229+
elif incb_state != _INCB_INIT:
230+
if curr_incb == InCB_Extend:
231+
incb_state = (
232+
_INCB_LINKER if incb_state == _INCB_LINKER else _INCB_STARTED
233+
)
234+
elif curr_incb == InCB_Linker:
235+
incb_state = _INCB_LINKER
236+
else:
237+
incb_state = _INCB_INIT
238+
else:
239+
incb_state = _INCB_INIT
185240

186241
prev_gcb = gcb
187242
gcb = curr_gcb

0 commit comments

Comments
 (0)