Skip to content

Commit a244ba5

Browse files
neutrinoceros and jquast
authored and committed
Add grapheme clustering support for cursor movement
**Problem** Test sequence (copy and paste into any REPL/edit area):: 👨‍👩‍👧 👩‍❤️‍👨 👩‍💻👋🏿 ❤️⭐ 🇯🇵🇩🇪 café niño Åộ 中文!. Moving the cursor over and around emojis gets strange, and insertions become chaotic. The cursor position becomes indeterminate (even negative!), and the input result becomes increasingly corrupted, and the user more confused, as draws become corrupted. This is briefly described in #274 by @jonathanslenders: > Notice that it still requires multiple cursor movements (left/right arrow) to move across these characters. **Solution**: Close #274 "Handle decomposed unicode characters" (2018) through careful integration of new functions, [wcwidth.iter_graphemes](https://wcwidth.readthedocs.io/en/latest/intro.html#iter-graphemes) and [wcwidth.grapheme_boundary_before](https://wcwidth.readthedocs.io/en/latest/api.html#wcwidth.grapheme_boundary_before).
1 parent 8dad396 commit a244ba5

File tree

9 files changed

+227
-64
lines changed

9 files changed

+227
-64
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ classifiers = [
2222
]
2323
requires-python = ">=3.8"
2424
dependencies = [
25-
"wcwidth",
25+
"wcwidth>=0.5.0",
2626
]
2727

2828
[project.urls]

src/prompt_toolkit/document.py

Lines changed: 42 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import weakref
1111
from typing import Callable, Dict, Iterable, List, NoReturn, Pattern, cast
1212

13+
import wcwidth
14+
1315
from .clipboard import ClipboardData
1416
from .filters import vi_mode
1517
from .selection import PasteMode, SelectionState, SelectionType
@@ -158,13 +160,22 @@ def selection(self) -> SelectionState | None:
158160

159161
@property
160162
def current_char(self) -> str:
161-
"""Return character under cursor or an empty string."""
162-
return self._get_char_relative_to_cursor(0) or ""
163+
"""Return grapheme cluster under cursor or an empty string."""
164+
text_after = self.text_after_cursor
165+
if not text_after:
166+
return ""
167+
for grapheme in wcwidth.iter_graphemes(text_after):
168+
return grapheme
169+
return ""
163170

164171
@property
165172
def char_before_cursor(self) -> str:
166-
"""Return character before the cursor or an empty string."""
167-
return self._get_char_relative_to_cursor(-1) or ""
173+
"""Return grapheme cluster before the cursor or an empty string."""
174+
text_before = self.text_before_cursor
175+
if not text_before:
176+
return ""
177+
boundary = wcwidth.grapheme_boundary_before(text_before, len(text_before))
178+
return text_before[boundary:]
168179

169180
@property
170181
def text_before_cursor(self) -> str:
@@ -251,15 +262,6 @@ def leading_whitespace_in_current_line(self) -> str:
251262
length = len(current_line) - len(current_line.lstrip())
252263
return current_line[:length]
253264

254-
def _get_char_relative_to_cursor(self, offset: int = 0) -> str:
255-
"""
256-
Return character relative to cursor position, or empty string
257-
"""
258-
try:
259-
return self.text[self.cursor_position + offset]
260-
except IndexError:
261-
return ""
262-
263265
@property
264266
def on_first_line(self) -> bool:
265267
"""
@@ -692,21 +694,44 @@ def find_previous_matching_line(
692694

693695
def get_cursor_left_position(self, count: int = 1) -> int:
694696
"""
695-
Relative position for cursor left.
697+
Relative position for cursor left (grapheme cluster aware).
696698
"""
697699
if count < 0:
698700
return self.get_cursor_right_position(-count)
699701

700-
return -min(self.cursor_position_col, count)
702+
line_before = self.current_line_before_cursor
703+
if not line_before:
704+
return 0
705+
706+
pos = len(line_before)
707+
for _ in range(count):
708+
if pos <= 0:
709+
break
710+
new_pos = wcwidth.grapheme_boundary_before(line_before, pos)
711+
if new_pos == pos:
712+
break
713+
pos = new_pos
714+
715+
return pos - len(line_before)
701716

702717
def get_cursor_right_position(self, count: int = 1) -> int:
703718
"""
704-
Relative position for cursor_right.
719+
Relative position for cursor right (grapheme cluster aware).
705720
"""
706721
if count < 0:
707722
return self.get_cursor_left_position(-count)
708723

709-
return min(count, len(self.current_line_after_cursor))
724+
line_after = self.current_line_after_cursor
725+
if not line_after:
726+
return 0
727+
728+
pos = 0
729+
for i, grapheme in enumerate(wcwidth.iter_graphemes(line_after)):
730+
if i >= count:
731+
break
732+
pos += len(grapheme)
733+
734+
return pos
710735

711736
def get_cursor_up_position(
712737
self, count: int = 1, preferred_column: int | None = None

src/prompt_toolkit/formatted_text/utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from typing import Iterable, cast
1111

12-
from prompt_toolkit.utils import get_cwidth
12+
import wcwidth
1313

1414
from .base import (
1515
AnyFormattedText,
@@ -48,17 +48,17 @@ def fragment_list_len(fragments: StyleAndTextTuples) -> int:
4848
def fragment_list_width(fragments: StyleAndTextTuples) -> int:
4949
"""
5050
Return the character width of this text fragment list.
51-
(Take double width characters into account.)
51+
(Take double width characters and grapheme clusters into account.)
5252
5353
:param fragments: List of ``(style_str, text)`` or
5454
``(style_str, text, mouse_handler)`` tuples.
5555
"""
56-
ZeroWidthEscape = "[ZeroWidthEscape]"
56+
# control codes are skipped by '[ZeroWidthEscape]' escape marker, and
57+
# so fastest integration of width() is by using control_codes='ignore'
5758
return sum(
58-
get_cwidth(c)
59+
wcwidth.width(item[1], control_codes="ignore")
5960
for item in fragments
60-
for c in item[1]
61-
if ZeroWidthEscape not in item[0]
61+
if "[ZeroWidthEscape]" not in item[0]
6262
)
6363

6464

src/prompt_toolkit/layout/containers.py

Lines changed: 4 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
from functools import partial
1111
from typing import TYPE_CHECKING, Callable, Sequence, Union, cast
1212

13+
import wcwidth
14+
1315
from prompt_toolkit.application.current import get_app
1416
from prompt_toolkit.cache import SimpleCache
1517
from prompt_toolkit.data_structures import Point
@@ -2014,7 +2016,7 @@ def copy_line(
20142016
new_screen.zero_width_escapes[y + ypos][x + xpos] += text
20152017
continue
20162018

2017-
for c in text:
2019+
for c in wcwidth.iter_graphemes(text):
20182020
char = _CHAR_CACHE[c, style]
20192021
char_width = char.width
20202022

@@ -2052,26 +2054,7 @@ def copy_line(
20522054
for i in range(1, char_width):
20532055
new_buffer_row[x + xpos + i] = empty_char
20542056

2055-
# If this is a zero width characters, then it's
2056-
# probably part of a decomposed unicode character.
2057-
# See: https://en.wikipedia.org/wiki/Unicode_equivalence
2058-
# Merge it in the previous cell.
2059-
elif char_width == 0:
2060-
# Handle all character widths. If the previous
2061-
# character is a multiwidth character, then
2062-
# merge it two positions back.
2063-
for pw in [2, 1]: # Previous character width.
2064-
if (
2065-
x - pw >= 0
2066-
and new_buffer_row[x + xpos - pw].width == pw
2067-
):
2068-
prev_char = new_buffer_row[x + xpos - pw]
2069-
char2 = _CHAR_CACHE[
2070-
prev_char.char + c, prev_char.style
2071-
]
2072-
new_buffer_row[x + xpos - pw] = char2
2073-
2074-
# Keep track of write position for each character.
2057+
# Keep track of write position for each grapheme.
20752058
current_rowcol_to_yx[lineno, col + skipped] = (
20762059
y + ypos,
20772060
x + xpos,

src/prompt_toolkit/layout/controls.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from abc import ABCMeta, abstractmethod
99
from typing import TYPE_CHECKING, Callable, Hashable, Iterable, NamedTuple
1010

11+
import wcwidth
12+
1113
from prompt_toolkit.application.current import get_app
1214
from prompt_toolkit.buffer import Buffer
1315
from prompt_toolkit.cache import SimpleCache
@@ -674,12 +676,26 @@ def transform(
674676
) -> _ProcessedLine:
675677
"Transform the fragments for a given line number."
676678

677-
# Get cursor position at this line.
679+
# Build mapping from code point index to grapheme index.
680+
# This is needed because cursor_position_col is in code points,
681+
# but after grapheme-aware explosion, fragments are indexed by
682+
# grapheme clusters.
683+
line_text = fragment_list_to_text(fragments)
684+
codepoint_to_grapheme: dict[int, int] = {}
685+
grapheme_idx = 0
686+
codepoint_idx = 0
687+
for grapheme in wcwidth.iter_graphemes(line_text):
688+
for _ in grapheme: # Each code point in the grapheme
689+
codepoint_to_grapheme[codepoint_idx] = grapheme_idx
690+
codepoint_idx += 1
691+
grapheme_idx += 1
692+
678693
def source_to_display(i: int) -> int:
679-
"""X position from the buffer to the x position in the
680-
processed fragment list. By default, we start from the 'identity'
681-
operation."""
682-
return i
694+
"""Map code point index to grapheme index."""
695+
# Handle positions at or beyond end of line
696+
if i >= codepoint_idx:
697+
return grapheme_idx + (i - codepoint_idx)
698+
return codepoint_to_grapheme.get(i, grapheme_idx)
683699

684700
transformation = merged_processor.apply_transformation(
685701
TransformationInput(

src/prompt_toolkit/layout/utils.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
from typing import TYPE_CHECKING, Iterable, List, TypeVar, cast, overload
44

5+
import wcwidth
6+
57
from prompt_toolkit.formatted_text.base import OneStyleAndTextTuple
68

79
if TYPE_CHECKING:
@@ -60,7 +62,7 @@ def __setitem__(
6062
def explode_text_fragments(fragments: Iterable[_T]) -> _ExplodedList[_T]:
6163
"""
6264
Turn a list of (style_str, text) tuples into another list where each string is
63-
exactly one character.
65+
exactly one grapheme cluster.
6466
6567
It should be fine to call this function several times. Calling this on a
6668
list that is already exploded, is a null operation.
@@ -74,7 +76,7 @@ def explode_text_fragments(fragments: Iterable[_T]) -> _ExplodedList[_T]:
7476
result: list[_T] = []
7577

7678
for style, string, *rest in fragments:
77-
for c in string:
78-
result.append((style, c, *rest)) # type: ignore
79+
for grapheme in wcwidth.iter_graphemes(string):
80+
result.append((style, grapheme, *rest)) # type: ignore
7981

8082
return _ExplodedList(result)

src/prompt_toolkit/utils.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,19 @@
1111
Dict,
1212
Generator,
1313
Generic,
14+
Iterator,
1415
TypeVar,
1516
Union,
1617
)
1718

18-
from wcwidth import wcwidth
19+
import wcwidth
1920

2021
__all__ = [
2122
"Event",
2223
"DummyContext",
2324
"get_cwidth",
25+
"iter_grapheme_clusters",
26+
"grapheme_cluster_count",
2427
"suspend_to_background_supported",
2528
"is_conemu_ansi",
2629
"is_windows",
@@ -138,15 +141,7 @@ def __init__(self) -> None:
138141
self._long_strings: deque[str] = deque()
139142

140143
def __missing__(self, string: str) -> int:
141-
# Note: We use the `max(0, ...` because some non printable control
142-
# characters, like e.g. Ctrl-underscore get a -1 wcwidth value.
143-
# It can be possible that these characters end up in the input
144-
# text.
145-
result: int
146-
if len(string) == 1:
147-
result = max(0, wcwidth(string))
148-
else:
149-
result = sum(self[c] for c in string)
144+
result = wcwidth.width(string, control_codes="ignore")
150145

151146
# Store in cache.
152147
self[string] = result
@@ -175,6 +170,20 @@ def get_cwidth(string: str) -> int:
175170
return _CHAR_SIZES_CACHE[string]
176171

177172

173+
def iter_grapheme_clusters(text: str) -> Iterator[str]:
174+
"""
175+
Iterate over grapheme clusters in text. Wrapper around ``wcwidth.iter_graphemes``.
176+
"""
177+
return wcwidth.iter_graphemes(text)
178+
179+
180+
def grapheme_cluster_count(text: str) -> int:
181+
"""
182+
Return the number of grapheme clusters in text.
183+
"""
184+
return sum(1 for _ in wcwidth.iter_graphemes(text))
185+
186+
178187
def suspend_to_background_supported() -> bool:
179188
"""
180189
Returns `True` when the Python implementation supports

tests/test_formatted_text.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
merge_formatted_text,
1010
to_formatted_text,
1111
)
12-
from prompt_toolkit.formatted_text.utils import split_lines
12+
from prompt_toolkit.formatted_text.utils import fragment_list_width, split_lines
1313

1414

1515
def test_basic_html():
@@ -336,3 +336,15 @@ def test_split_lines_4():
336336
[("class:a", "line1")],
337337
[("class:a", "")],
338338
]
339+
340+
341+
def test_fragment_list_width():
342+
family = "\U0001F468\u200D\U0001F469\u200D\U0001F467" # ZWJ sequence
343+
heart = "\u2764\uFE0F" # VS-16 emoji
344+
assert fragment_list_width([("", "hello")]) == 5
345+
assert fragment_list_width([("", family)]) == 2
346+
assert fragment_list_width([("", heart)]) == 2
347+
348+
349+
def test_fragment_list_width_zero_width_escape():
350+
assert fragment_list_width([("[ZeroWidthEscape]", "arbitrary")]) == 0

0 commit comments

Comments
(0)