Add grapheme clustering support for cursor movement

neutrinoceros · jquast · commit a244ba5c7b87 · 2026-01-27T13:23:29.000-05:00
**Problem**: Test sequence (copy and paste into any REPL/edit area): 👨‍👩‍👧 👩‍❤‍👨 👩‍💻👋🏿 ❤️⭐ 🇯🇵🇩🇪 café niño Åộ 中文! Moving the cursor over and around emojis behaves strangely, and insertions become chaotic. The cursor position becomes indeterminate (even negative!), and the input becomes increasingly corrupted, adding to user confusion, as redraws become corrupted. This is briefly described in #274 by @jonathanslenders: > Notice that it still requires multiple cursor movements (left/right arrow) to move across these characters. **Solution**: Close #274 "Handle decomposed unicode characters" (2018) through careful integration of two new functions, [wcwidth.iter_graphemes](https://wcwidth.readthedocs.io/en/latest/intro.html#iter-graphemes) and [wcwidth.grapheme_boundary_before](https://wcwidth.readthedocs.io/en/latest/api.html#wcwidth.grapheme_boundary_before).
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,7 +22,7 @@ classifiers = [
 ]
 requires-python = ">=3.8"
 dependencies = [
-    "wcwidth",
+    "wcwidth>=0.5.0",
 ]
 
 [project.urls]
diff --git a/src/prompt_toolkit/document.py b/src/prompt_toolkit/document.py
@@ -10,6 +10,8 @@
 import weakref
 from typing import Callable, Dict, Iterable, List, NoReturn, Pattern, cast
 
+import wcwidth
+
 from .clipboard import ClipboardData
 from .filters import vi_mode
 from .selection import PasteMode, SelectionState, SelectionType
@@ -158,13 +160,22 @@ def selection(self) -> SelectionState | None:
 
     @property
     def current_char(self) -> str:
-        """Return character under cursor or an empty string."""
-        return self._get_char_relative_to_cursor(0) or ""
+        """Return grapheme cluster under cursor or an empty string."""
+        text_after = self.text_after_cursor
+        if not text_after:
+            return ""
+        for grapheme in wcwidth.iter_graphemes(text_after):
+            return grapheme
+        return ""
 
     @property
     def char_before_cursor(self) -> str:
-        """Return character before the cursor or an empty string."""
-        return self._get_char_relative_to_cursor(-1) or ""
+        """Return grapheme cluster before the cursor or an empty string."""
+        text_before = self.text_before_cursor
+        if not text_before:
+            return ""
+        boundary = wcwidth.grapheme_boundary_before(text_before, len(text_before))
+        return text_before[boundary:]
 
     @property
     def text_before_cursor(self) -> str:
@@ -251,15 +262,6 @@ def leading_whitespace_in_current_line(self) -> str:
         length = len(current_line) - len(current_line.lstrip())
         return current_line[:length]
 
-    def _get_char_relative_to_cursor(self, offset: int = 0) -> str:
-        """
-        Return character relative to cursor position, or empty string
-        """
-        try:
-            return self.text[self.cursor_position + offset]
-        except IndexError:
-            return ""
-
     @property
     def on_first_line(self) -> bool:
         """
@@ -692,21 +694,44 @@ def find_previous_matching_line(
 
     def get_cursor_left_position(self, count: int = 1) -> int:
         """
-        Relative position for cursor left.
+        Relative position for cursor left (grapheme cluster aware).
         """
         if count < 0:
             return self.get_cursor_right_position(-count)
 
-        return -min(self.cursor_position_col, count)
+        line_before = self.current_line_before_cursor
+        if not line_before:
+            return 0
+
+        pos = len(line_before)
+        for _ in range(count):
+            if pos <= 0:
+                break
+            new_pos = wcwidth.grapheme_boundary_before(line_before, pos)
+            if new_pos == pos:
+                break
+            pos = new_pos
+
+        return pos - len(line_before)
 
     def get_cursor_right_position(self, count: int = 1) -> int:
         """
-        Relative position for cursor_right.
+        Relative position for cursor right (grapheme cluster aware).
         """
         if count < 0:
             return self.get_cursor_left_position(-count)
 
-        return min(count, len(self.current_line_after_cursor))
+        line_after = self.current_line_after_cursor
+        if not line_after:
+            return 0
+
+        pos = 0
+        for i, grapheme in enumerate(wcwidth.iter_graphemes(line_after)):
+            if i >= count:
+                break
+            pos += len(grapheme)
+
+        return pos
 
     def get_cursor_up_position(
         self, count: int = 1, preferred_column: int | None = None
diff --git a/src/prompt_toolkit/formatted_text/utils.py b/src/prompt_toolkit/formatted_text/utils.py
@@ -9,7 +9,7 @@
 
 from typing import Iterable, cast
 
-from prompt_toolkit.utils import get_cwidth
+import wcwidth
 
 from .base import (
     AnyFormattedText,
@@ -48,17 +48,17 @@ def fragment_list_len(fragments: StyleAndTextTuples) -> int:
 def fragment_list_width(fragments: StyleAndTextTuples) -> int:
     """
     Return the character width of this text fragment list.
-    (Take double width characters into account.)
+    (Take double width characters and grapheme clusters into account.)
 
     :param fragments: List of ``(style_str, text)`` or
         ``(style_str, text, mouse_handler)`` tuples.
     """
-    ZeroWidthEscape = "[ZeroWidthEscape]"
+    # Fragments marked '[ZeroWidthEscape]' are skipped entirely below, so the
+    # fastest way to integrate width() is to pass control_codes='ignore'.
     return sum(
-        get_cwidth(c)
+        wcwidth.width(item[1], control_codes="ignore")
         for item in fragments
-        for c in item[1]
-        if ZeroWidthEscape not in item[0]
+        if "[ZeroWidthEscape]" not in item[0]
     )
 
 
diff --git a/src/prompt_toolkit/layout/containers.py b/src/prompt_toolkit/layout/containers.py
@@ -10,6 +10,8 @@
 from functools import partial
 from typing import TYPE_CHECKING, Callable, Sequence, Union, cast
 
+import wcwidth
+
 from prompt_toolkit.application.current import get_app
 from prompt_toolkit.cache import SimpleCache
 from prompt_toolkit.data_structures import Point
@@ -2014,7 +2016,7 @@ def copy_line(
                     new_screen.zero_width_escapes[y + ypos][x + xpos] += text
                     continue
 
-                for c in text:
+                for c in wcwidth.iter_graphemes(text):
                     char = _CHAR_CACHE[c, style]
                     char_width = char.width
 
@@ -2052,26 +2054,7 @@ def copy_line(
                             for i in range(1, char_width):
                                 new_buffer_row[x + xpos + i] = empty_char
 
-                        # If this is a zero width characters, then it's
-                        # probably part of a decomposed unicode character.
-                        # See: https://en.wikipedia.org/wiki/Unicode_equivalence
-                        # Merge it in the previous cell.
-                        elif char_width == 0:
-                            # Handle all character widths. If the previous
-                            # character is a multiwidth character, then
-                            # merge it two positions back.
-                            for pw in [2, 1]:  # Previous character width.
-                                if (
-                                    x - pw >= 0
-                                    and new_buffer_row[x + xpos - pw].width == pw
-                                ):
-                                    prev_char = new_buffer_row[x + xpos - pw]
-                                    char2 = _CHAR_CACHE[
-                                        prev_char.char + c, prev_char.style
-                                    ]
-                                    new_buffer_row[x + xpos - pw] = char2
-
-                        # Keep track of write position for each character.
+                        # Keep track of write position for each grapheme.
                         current_rowcol_to_yx[lineno, col + skipped] = (
                             y + ypos,
                             x + xpos,
diff --git a/src/prompt_toolkit/layout/controls.py b/src/prompt_toolkit/layout/controls.py
@@ -8,6 +8,8 @@
 from abc import ABCMeta, abstractmethod
 from typing import TYPE_CHECKING, Callable, Hashable, Iterable, NamedTuple
 
+import wcwidth
+
 from prompt_toolkit.application.current import get_app
 from prompt_toolkit.buffer import Buffer
 from prompt_toolkit.cache import SimpleCache
@@ -674,12 +676,26 @@ def transform(
         ) -> _ProcessedLine:
             "Transform the fragments for a given line number."
 
-            # Get cursor position at this line.
+            # Build mapping from code point index to grapheme index.
+            # This is needed because cursor_position_col is in code points,
+            # but after grapheme-aware explosion, fragments are indexed by
+            # grapheme clusters.
+            line_text = fragment_list_to_text(fragments)
+            codepoint_to_grapheme: dict[int, int] = {}
+            grapheme_idx = 0
+            codepoint_idx = 0
+            for grapheme in wcwidth.iter_graphemes(line_text):
+                for _ in grapheme:  # Each code point in the grapheme
+                    codepoint_to_grapheme[codepoint_idx] = grapheme_idx
+                    codepoint_idx += 1
+                grapheme_idx += 1
+
             def source_to_display(i: int) -> int:
-                """X position from the buffer to the x position in the
-                processed fragment list. By default, we start from the 'identity'
-                operation."""
-                return i
+                """Map code point index to grapheme index."""
+                # Handle positions at or beyond end of line
+                if i >= codepoint_idx:
+                    return grapheme_idx + (i - codepoint_idx)
+                return codepoint_to_grapheme.get(i, grapheme_idx)
 
             transformation = merged_processor.apply_transformation(
                 TransformationInput(
diff --git a/src/prompt_toolkit/layout/utils.py b/src/prompt_toolkit/layout/utils.py
@@ -2,6 +2,8 @@
 
 from typing import TYPE_CHECKING, Iterable, List, TypeVar, cast, overload
 
+import wcwidth
+
 from prompt_toolkit.formatted_text.base import OneStyleAndTextTuple
 
 if TYPE_CHECKING:
@@ -60,7 +62,7 @@ def __setitem__(
 def explode_text_fragments(fragments: Iterable[_T]) -> _ExplodedList[_T]:
     """
     Turn a list of (style_str, text) tuples into another list where each string is
-    exactly one character.
+    exactly one grapheme cluster.
 
     It should be fine to call this function several times. Calling this on a
     list that is already exploded, is a null operation.
@@ -74,7 +76,7 @@ def explode_text_fragments(fragments: Iterable[_T]) -> _ExplodedList[_T]:
     result: list[_T] = []
 
     for style, string, *rest in fragments:
-        for c in string:
-            result.append((style, c, *rest))  # type: ignore
+        for grapheme in wcwidth.iter_graphemes(string):
+            result.append((style, grapheme, *rest))  # type: ignore
 
     return _ExplodedList(result)
diff --git a/src/prompt_toolkit/utils.py b/src/prompt_toolkit/utils.py
@@ -11,16 +11,19 @@
     Dict,
     Generator,
     Generic,
+    Iterator,
     TypeVar,
     Union,
 )
 
-from wcwidth import wcwidth
+import wcwidth
 
 __all__ = [
     "Event",
     "DummyContext",
     "get_cwidth",
+    "iter_grapheme_clusters",
+    "grapheme_cluster_count",
     "suspend_to_background_supported",
     "is_conemu_ansi",
     "is_windows",
@@ -138,15 +141,7 @@ def __init__(self) -> None:
         self._long_strings: deque[str] = deque()
 
     def __missing__(self, string: str) -> int:
-        # Note: We use the `max(0, ...` because some non printable control
-        #       characters, like e.g. Ctrl-underscore get a -1 wcwidth value.
-        #       It can be possible that these characters end up in the input
-        #       text.
-        result: int
-        if len(string) == 1:
-            result = max(0, wcwidth(string))
-        else:
-            result = sum(self[c] for c in string)
+        result = wcwidth.width(string, control_codes="ignore")
 
         # Store in cache.
         self[string] = result
@@ -175,6 +170,20 @@ def get_cwidth(string: str) -> int:
     return _CHAR_SIZES_CACHE[string]
 
 
+def iter_grapheme_clusters(text: str) -> Iterator[str]:
+    """
+    Iterate over grapheme clusters in text. Wrapper around ``wcwidth.iter_graphemes``.
+    """
+    return wcwidth.iter_graphemes(text)
+
+
+def grapheme_cluster_count(text: str) -> int:
+    """
+    Return the number of grapheme clusters in text.
+    """
+    return sum(1 for _ in wcwidth.iter_graphemes(text))
+
+
 def suspend_to_background_supported() -> bool:
     """
     Returns `True` when the Python implementation supports
diff --git a/tests/test_formatted_text.py b/tests/test_formatted_text.py
@@ -9,7 +9,7 @@
     merge_formatted_text,
     to_formatted_text,
 )
-from prompt_toolkit.formatted_text.utils import split_lines
+from prompt_toolkit.formatted_text.utils import fragment_list_width, split_lines
 
 
 def test_basic_html():
@@ -336,3 +336,15 @@ def test_split_lines_4():
         [("class:a", "line1")],
         [("class:a", "")],
     ]
+
+
+def test_fragment_list_width():
+    family = "\U0001F468\u200D\U0001F469\u200D\U0001F467"  # ZWJ sequence
+    heart = "\u2764\uFE0F"  # VS-16 emoji
+    assert fragment_list_width([("", "hello")]) == 5
+    assert fragment_list_width([("", family)]) == 2
+    assert fragment_list_width([("", heart)]) == 2
+
+
+def test_fragment_list_width_zero_width_escape():
+    assert fragment_list_width([("[ZeroWidthEscape]", "arbitrary")]) == 0
diff --git a/tests/test_wcwidth_integration.py b/tests/test_wcwidth_integration.py

Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@ classifiers = [`
`22`	`22`	`]`
`23`	`23`	`requires-python = ">=3.8"`
`24`	`24`	`dependencies = [`
`25`		`- "wcwidth",`
	`25`	`+ "wcwidth>=0.5.0",`
`26`	`26`	`]`
`27`	`27`
`28`	`28`	`[project.urls]`