@@ -30,63 +30,63 @@ maturin develop --release
3030## Usage
3131
3232``` python
33- import unicode_segmentation_py as us
33+ import unicode_segmentation_rs
3434
3535# Grapheme clusters (user-perceived characters)
3636text = " Hello 👨👩👧👦 World"
37- clusters = us .graphemes(text, is_extended = True )
37+ clusters = unicode_segmentation_rs .graphemes(text, is_extended = True )
3838print (clusters) # ['H', 'e', 'l', 'l', 'o', ' ', '👨👩👧👦', ' ', 'W', 'o', 'r', 'l', 'd']
3939
4040# Get grapheme clusters with their byte indices
41- indices = us .grapheme_indices(text, is_extended = True )
41+ indices = unicode_segmentation_rs .grapheme_indices(text, is_extended = True )
4242print (indices) # [(0, 'H'), (1, 'e'), ...]
4343
4444# Word boundaries (includes punctuation and whitespace)
4545text = " Hello, world!"
46- words = us .split_word_bounds(text)
46+ words = unicode_segmentation_rs .split_word_bounds(text)
4747print (words) # ['Hello', ',', ' ', 'world', '!']
4848
4949# Unicode words (excludes punctuation and whitespace)
50- words = us .unicode_words(text)
50+ words = unicode_segmentation_rs .unicode_words(text)
5151print (words) # ['Hello', 'world']
5252
5353# Word indices
54- indices = us .split_word_bound_indices(text)
54+ indices = unicode_segmentation_rs .split_word_bound_indices(text)
5555print (indices) # [(0, 'Hello'), (5, ','), ...]
5656
5757# Sentence segmentation
5858text = " Hello world. How are you? I'm fine."
59- sentences = us .unicode_sentences(text)
59+ sentences = unicode_segmentation_rs .unicode_sentences(text)
6060print (sentences) # ['Hello world. ', 'How are you? ', "I'm fine."]
6161
6262# Display width calculation
6363text = " Hello 世界"
64- width = us .text_width(text)
64+ width = unicode_segmentation_rs .text_width(text)
6565print (width) # 10 (Hello=5, space=1, 世=2, 界=2, but depends on terminal)
6666
6767# Character width
68- print (us .text_width(' A' )) # Some(1)
69- print (us .text_width(' 世' )) # Some(2)
70- print (us .text_width(' \t ' )) # None (control character)
68+ print (unicode_segmentation_rs .text_width(' A' )) # Some(1)
69+ print (unicode_segmentation_rs .text_width(' 世' )) # Some(2)
70+ print (unicode_segmentation_rs .text_width(' \t ' )) # None (control character)
7171```
7272
7373## Examples
7474
7575### Grapheme Cluster Segmentation
7676
7777``` python
78- import unicode_segmentation_py as us
78+ import unicode_segmentation_rs
7979
8080# Complex emojis and combining characters
8181text = " Hello 👨👩👧👦 नमस्ते"
8282print (f " Text: { text} " )
83- print (f " Graphemes: { us .graphemes(text, is_extended = True )} " )
84- print (f " Length (graphemes): { len (us .graphemes(text, is_extended = True ))} " )
83+ print (f " Graphemes: { unicode_segmentation_rs .graphemes(text, is_extended = True )} " )
84+ print (f " Length (graphemes): { len (unicode_segmentation_rs .graphemes(text, is_extended = True ))} " )
8585print (f " Length (chars): { len (text)} " )
8686
8787# With indices
8888print (" Grapheme indices:" )
89- for idx, cluster in us .grapheme_indices(text, is_extended = True ):
89+ for idx, cluster in unicode_segmentation_rs .grapheme_indices(text, is_extended = True ):
9090 print (f " { idx:3d } : { cluster!r } " )
9191```
9292
@@ -95,12 +95,12 @@ for idx, cluster in us.grapheme_indices(text, is_extended=True):
9595``` python
9696text = " Hello, world! How are you?"
9797print (f " Text: { text} " )
98- print (f " Word bounds: { us .split_word_bounds(text)} " )
99- print (f " Unicode words: { us .unicode_words(text)} " )
98+ print (f " Word bounds: { unicode_segmentation_rs .split_word_bounds(text)} " )
99+ print (f " Unicode words: { unicode_segmentation_rs .unicode_words(text)} " )
100100
101101# With indices
102102print (" Word boundary indices:" )
103- for idx, word in us .split_word_bound_indices(text):
103+ for idx, word in unicode_segmentation_rs .split_word_bound_indices(text):
104104 print (f " { idx:3d } : { word!r } " )
105105```
106106
@@ -109,7 +109,7 @@ for idx, word in us.split_word_bound_indices(text):
109109``` python
110110text = " Hello world. How are you? I'm fine, thanks! What about you?"
111111print (f " Text: { text} " )
112- sentences = us .unicode_sentences(text)
112+ sentences = unicode_segmentation_rs .unicode_sentences(text)
113113print (" Sentences:" )
114114for i, sentence in enumerate (sentences, 1 ):
115115 print (f " { i} . { sentence!r } " )
@@ -121,17 +121,17 @@ for i, sentence in enumerate(sentences, 1):
121121# Arabic
122122arabic = " مرحبا بك. كيف حالك؟"
123123print (f " Arabic: { arabic} " )
124- print (f " Sentences: { us .unicode_sentences(arabic)} " )
124+ print (f " Sentences: { unicode_segmentation_rs .unicode_sentences(arabic)} " )
125125
126126# Japanese
127127japanese = " こんにちは。お元気ですか?"
128128print (f " Japanese: { japanese} " )
129- print (f " Sentences: { us .unicode_sentences(japanese)} " )
129+ print (f " Sentences: { unicode_segmentation_rs .unicode_sentences(japanese)} " )
130130
131131# Mixed languages
132132mixed = " Hello世界! This is a test文章."
133133print (f " Mixed: { mixed} " )
134- print (f " Words: { us .unicode_words(mixed)} " )
134+ print (f " Words: { unicode_segmentation_rs .unicode_words(mixed)} " )
135135```
136136
137137### Display Width Calculation
@@ -147,15 +147,15 @@ examples = [
147147]
148148
149149for text in examples:
150- width = us .text_width(text)
151- width_cjk = us .text_width_cjk(text)
150+ width = unicode_segmentation_rs .text_width(text)
151+ width_cjk = unicode_segmentation_rs .text_width_cjk(text)
152152 print (f " Text: { text!r:20 } Width: { width:2 } CJK: { width_cjk:2 } Chars: { len (text):2 } " )
153153
154154# Character widths
155155chars = [' a' , ' A' , ' 1' , ' ' , ' 世' , ' 界' , ' あ' , ' 🎉' , ' \t ' , ' \n ' ]
156156for c in chars:
157- w = us .text_width(c)
158- w_cjk = us .text_width_cjk(c)
157+ w = unicode_segmentation_rs .text_width(c)
158+ w_cjk = unicode_segmentation_rs .text_width_cjk(c)
159159 w_str = str (w) if w is not None else " None"
160160 w_cjk_str = str (w_cjk) if w_cjk is not None else " None"
161161 print (f " { c!r:6 } width: { w_str:4 } cjk: { w_cjk_str:4 } " )
0 commit comments