From 669f98db504af8fcb710c5da91e44d32cff1696f Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Fri, 10 Apr 2026 12:23:25 -0400
Subject: [PATCH 1/8] fix: use UTF-16 offsets for Text operations (fixes #308)

Set OffsetKind::Utf16 on yrs Doc so the wire format uses UTF-16 code
unit offsets, matching JS yjs. Without this, pycrdt uses UTF-8 byte
offsets, causing findIndexSS "Unexpected case" crashes when JS yjs
clients apply incremental updates containing multi-byte characters.

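In the Python wrapper, convert character (code point) indices to
UTF-16 code unit indices before passing to yrs. This ensures
Text.insert(), insert_embed(), __setitem__, __delitem__, and format()
all work correctly with emoji and other non-BMP characters.

Text.__len__ now returns the number of Python characters (code points),
i.e. len(str(text)), instead of the length reported by yrs.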

Fixes: #308
Related: jupyter-ai-contrib/jupyter-server-documents#197
---
 python/pycrdt/_text.py | 77 ++++++++++++++++++++++++++++++++++--------
 src/doc.rs             |  8 ++++-
 2 files changed, 70 insertions(+), 15 deletions(-)

diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py
index 49356c0..418bab3 100644
--- a/python/pycrdt/_text.py
+++ b/python/pycrdt/_text.py
@@ -11,6 +11,36 @@
     from ._doc import Doc
 
 
+def _char_to_utf16(text: str, char_index: int) -> int:
+    """Convert a Python character (code point) index to a UTF-16 code unit index.
+
+    Characters outside the Basic Multilingual Plane (e.g. emoji) occupy 2
+    UTF-16 code units but only 1 Python character.  The underlying yrs library
+    uses UTF-16 offsets, so all indices passed to it must be converted.
+
+    For pure-ASCII / BMP text this is a no-op.  Zero and negative indices are
+    returned unchanged: a negative index is never shifted by surrogate pairs.
+    """
+    if char_index <= 0:
+        return char_index
+    prefix = text[:char_index]
+    # Count characters that need a surrogate pair (code point > 0xFFFF)
+    extra = sum(1 for ch in prefix if ord(ch) > 0xFFFF)
+    return char_index + extra
+
+
+def _utf16_to_char(text: str, utf16_index: int) -> int:
+    """Convert a UTF-16 code unit index back to a Python character index."""
+    char_idx = 0
+    utf16_idx = 0
+    for ch in text:
+        if utf16_idx >= utf16_index:
+            break
+        utf16_idx += 2 if ord(ch) > 0xFFFF else 1
+        char_idx += 1
+    return char_idx
+
+
 class Text(Sequence):
     """
     A shared data type used for collaborative text editing, similar to a Python `str`.
@@ -89,10 +119,10 @@ def __len__(self) -> int:
         ```
 
         Returns:
-            The length of the text.
+            The length of the text (in Python characters, not UTF-16 code units).
         """
-        with self.doc.transaction() as txn:
-            return self.integrated.len(txn._txn)
+        # Return Python character count, not yrs UTF-16 code unit count
+        return len(str(self))
 
     def __str__(self) -> str:
         """
@@ -169,13 +199,19 @@ def __delitem__(self, key: int | slice) -> None:
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
+            current = str(self)
             if isinstance(key, int):
-                self.integrated.remove_range(txn._txn, key, 1)
+                utf16_idx = _char_to_utf16(current, key)
+                char_at = current[key]
+                utf16_len = 2 if ord(char_at) > 0xFFFF else 1
+                self.integrated.remove_range(txn._txn, utf16_idx, utf16_len)
             elif isinstance(key, slice):
                 start, stop = self._check_slice(key)
                 length = stop - start
                 if length > 0:
-                    self.integrated.remove_range(txn._txn, start, length)
+                    utf16_start = _char_to_utf16(current, start)
+                    utf16_stop = _char_to_utf16(current, stop)
+                    self.integrated.remove_range(txn._txn, utf16_start, utf16_stop - utf16_start)
             else:
                 raise RuntimeError(f"Index not supported: {key}")
 
@@ -214,20 +250,26 @@ def __setitem__(self, key: int | slice, value: str) -> None:
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
+            current = str(self)
             if isinstance(key, int):
                 value_len = len(value)
                 if value_len != 1:
                     raise RuntimeError(
                         f"Single item assigned value must have a length of 1, not {value_len}"
                     )
-                del self[key]
-                self.integrated.insert(txn._txn, key, value)
+                utf16_idx = _char_to_utf16(current, key)
+                char_at = current[key]
+                utf16_len = 2 if ord(char_at) > 0xFFFF else 1
+                self.integrated.remove_range(txn._txn, utf16_idx, utf16_len)
+                self.integrated.insert(txn._txn, utf16_idx, value)
             elif isinstance(key, slice):
                 start, stop = self._check_slice(key)
-                length = stop - start
+                utf16_start = _char_to_utf16(current, start)
+                utf16_stop = _char_to_utf16(current, stop)
+                length = utf16_stop - utf16_start
                 if length > 0:
-                    self.integrated.remove_range(txn._txn, start, length)
-                self.integrated.insert(txn._txn, start, value)
+                    self.integrated.remove_range(txn._txn, utf16_start, length)
+                self.integrated.insert(txn._txn, utf16_start, value)
             else:
                 raise RuntimeError(f"Index not supported: {key}")
 
@@ -251,8 +293,10 @@ def insert(self, index: int, value: str, attrs: dict[str, Any] | None = None) ->
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
+            current = str(self)
+            utf16_index = _char_to_utf16(current, index)
             self.integrated.insert(
-                txn._txn, index, value, iter(attrs.items()) if attrs is not None else None
+                txn._txn, utf16_index, value, iter(attrs.items()) if attrs is not None else None
             )
 
     def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = None) -> None:
@@ -266,8 +310,10 @@ def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = No
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
+            current = str(self)
+            utf16_index = _char_to_utf16(current, index)
             self.integrated.insert_embed(
-                txn._txn, index, value, iter(attrs.items()) if attrs is not None else None
+                txn._txn, utf16_index, value, iter(attrs.items()) if attrs is not None else None
             )
 
     def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None:
@@ -282,9 +328,12 @@ def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None:
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
             start, stop = self._check_slice(slice(start, stop))
-            length = stop - start
+            current = str(self)
+            utf16_start = _char_to_utf16(current, start)
+            utf16_stop = _char_to_utf16(current, stop)
+            length = utf16_stop - utf16_start
             if length > 0:
-                self.integrated.format(txn._txn, start, length, iter(attrs.items()))
+                self.integrated.format(txn._txn, utf16_start, length, iter(attrs.items()))
 
     def diff(self) -> list[tuple[Any, dict[str, Any] | None]]:
         """
diff --git a/src/doc.rs b/src/doc.rs
index 61109cc..27e875d 100644
--- a/src/doc.rs
+++ b/src/doc.rs
@@ -3,7 +3,7 @@ use pyo3::IntoPyObjectExt;
 use pyo3::exceptions::{PyRuntimeError, PyValueError};
 use pyo3::types::{PyBool, PyBytes, PyDict, PyInt, PyList};
 use yrs::{
-    Doc as _Doc, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn
+    Doc as _Doc, OffsetKind, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn
 };
 use yrs::updates::encoder::{Encode, Encoder};
 use yrs::updates::decoder::Decode;
@@ -32,6 +32,7 @@ impl Doc {
         let mut options = yrs::Options::default();
         options.client_id = original.doc.client_id();
         options.skip_gc = original.doc.skip_gc();
+        options.offset_kind = OffsetKind::Utf16;
         if let Some(collection_id) = original.doc.collection_id() {
             options.collection_id = Some(collection_id);
         }
@@ -84,6 +85,11 @@ impl Doc {
                 .map_err(|_| PyValueError::new_err("skip_gc must be a valid bool"))?;
             options.skip_gc = _skip_gc;
         }
+        // Use UTF-16 offsets for compatibility with JS yjs clients.
+        // Without this, pycrdt uses UTF-8 byte offsets which causes
+        // findIndexSS crashes when JS yjs applies incremental updates
+        // containing multi-byte characters.
+        options.offset_kind = OffsetKind::Utf16;
         let doc = _Doc::with_options(options);
         Ok(Doc { doc })
     }

From f3a2e7e596fd2d7bf2b35ea4fdbf97eb7b5eb484 Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Fri, 10 Apr 2026 14:44:42 -0400
Subject: [PATCH 2/8] test: add 12 Unicode/emoji tests for Text operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cover insert, delete, setitem, slice, len, and cross-doc sync with:
- emoji (surrogate pairs: 📊 🎉)
- CJK (BMP: 价格 世界 特征工程)
- Cyrillic (мир)
- supplementary plane (𝒜 𠀀)
- mixed scripts in one text

These all fail on stock pycrdt 0.12.50 and pass with the OffsetKind::Utf16 fix.
---
 tests/test_text.py | 153 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)

diff --git a/tests/test_text.py b/tests/test_text.py
index ba913a0..ab4e507 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -228,6 +228,159 @@ def test_sticky_index(serialize: str):
     assert text1[new_idx] == "*"
 
 
+def test_unicode_emoji_insert():
+    """Text.insert() after emoji characters should use character positions, not byte offsets."""
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    text += "A📊B"
+    assert str(text) == "A📊B"
+    assert len(text) == 3
+
+    # Insert at position 2 = between 📊 and B
+    text.insert(2, "X")
+    assert str(text) == "A📊XB", f"Got {str(text)!r}, emoji insert position is wrong"
+
+
+def test_unicode_emoji_sequential_inserts():
+    """Sequential inserts after emoji should maintain correct positions."""
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    text += "# Analysis 📊\n"
+    text.insert(len(text), "model = fit()\n")
+    text.insert(len(text), "# 特征工程\n")
+    text.insert(len(text), 'print("done")\n')
+
+    expected = '# Analysis 📊\nmodel = fit()\n# 特征工程\nprint("done")\n'
+    assert str(text) == expected, f"Got {str(text)!r}"
+
+
+def test_unicode_emoji_len():
+    """len() should return Python character count, not byte count."""
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    text += "A📊B"
+    assert len(text) == 3  # 3 chars, not 6 bytes or 4 UTF-16 code units
+
+    text += "🎉"
+    assert len(text) == 4
+
+
+def test_unicode_emoji_delete():
+    """Deleting a character after an emoji should work correctly."""
+    doc = Doc()
+    doc["text"] = text = Text("A📊BC")
+
+    del text[2]  # delete B (after emoji)
+    assert str(text) == "A📊C", f"Got {str(text)!r}"
+
+
+def test_unicode_emoji_delete_emoji():
+    """Deleting an emoji character itself should work correctly."""
+    doc = Doc()
+    doc["text"] = text = Text("A📊B")
+
+    del text[1]  # delete 📊
+    assert str(text) == "AB", f"Got {str(text)!r}"
+
+
+def test_unicode_emoji_slice_delete():
+    """Slice deletion across emoji boundaries should work correctly."""
+    doc = Doc()
+    doc["text"] = text = Text("A📊B🎉C")
+
+    del text[1:4]  # delete 📊B🎉
+    assert str(text) == "AC", f"Got {str(text)!r}"
+
+
+def test_unicode_emoji_setitem():
+    """Replacing a character after an emoji should work correctly."""
+    doc = Doc()
+    doc["text"] = text = Text("A📊BC")
+
+    text[2] = "X"  # replace B (after emoji)
+    assert str(text) == "A📊XC", f"Got {str(text)!r}"
+
+
+def test_unicode_emoji_slice_setitem():
+    """Slice replacement spanning emoji should work correctly."""
+    doc = Doc()
+    doc["text"] = text = Text("A📊B🎉C")
+
+    text[1:4] = "XYZ"  # replace 📊B🎉 with XYZ
+    assert str(text) == "AXYZC", f"Got {str(text)!r}"
+
+
+def test_unicode_cjk():
+    """CJK characters (BMP, 1 UTF-16 code unit each) should work correctly."""
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    text += "价格"
+    text.insert(2, "X")
+    assert str(text) == "价格X", f"Got {str(text)!r}"
+    assert len(text) == 3
+
+
+def test_unicode_mixed_scripts():
+    """Mixed ASCII, CJK, Cyrillic, and emoji in one text."""
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    text += "Hello"
+    text.insert(5, " 世界")
+    text.insert(8, " 📊")
+    text.insert(10, " мир")  # 10 code points so far (11 UTF-16 code units)
+    text.insert(14, "!")  # 14 code points so far (15 UTF-16 code units)
+
+    expected = "Hello 世界 📊 мир!"
+    assert str(text) == expected, f"Got {str(text)!r}"
+    assert len(text) == 15
+
+
+def test_unicode_supplementary_plane():
+    """Characters outside BMP (require UTF-16 surrogate pairs)."""
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    # 𝒜 (U+1D49C) = Mathematical Script Capital A
+    # 𠀀 (U+20000) = CJK Unified Ideograph Extension B
+    text += "A𝒜B𠀀C"
+    assert len(text) == 5
+
+    text.insert(2, "X")  # between 𝒜 and B
+    assert str(text) == "A𝒜XB𠀀C", f"Got {str(text)!r}"
+
+    text.insert(5, "Y")  # between 𠀀 and C
+    assert str(text) == "A𝒜XB𠀀YC", f"Got {str(text)!r}"
+
+
+def test_unicode_cross_doc_sync():
+    """Updates with Unicode content should sync correctly between two pycrdt docs."""
+    doc1 = Doc()
+    doc1["text"] = text1 = Text()
+
+    # Capture updates from doc1
+    updates = []
+    doc1.observe(lambda event: updates.append(event.update))
+
+    text1 += "# Analysis 📊\n"
+    text1.insert(len(text1), "model = fit()\n")
+    text1.insert(len(text1), "# 特征工程\n")
+
+    # Apply to doc2
+    doc2 = Doc()
+    doc2["text"] = Text()
+    for update in updates:
+        doc2.apply_update(update)
+
+    assert str(doc2["text"]) == str(text1), (
+        f"Docs diverged: doc1={str(text1)!r} doc2={str(doc2['text'])!r}"
+    )
+
+
 def test_sticky_index_transaction():
     doc = Doc()
     text = doc.get("text", type=Text)

From 8264d95de14fb00435d338b76111dc81ab12b97a Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Fri, 10 Apr 2026 14:48:36 -0400
Subject: [PATCH 3/8] test: add granular diff tests from jupyter_ydoc#370

11 parametrized test cases adapted from jupyter-server/jupyter_ydoc#370
covering emoji swaps, flags, ZWJ family sequences, combining marks,
keycap sequences, RTL/LTR text, Japanese, and math operators.

These exercise Text insert/delete/replace via SequenceMatcher-based
diffing (the same pattern jupyter_ydoc.YUnicode.set() uses).
---
 tests/test_text.py | 116 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)

diff --git a/tests/test_text.py b/tests/test_text.py
index ab4e507..dd77df2 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -381,6 +381,122 @@ def test_unicode_cross_doc_sync():
     )
 
 
+# Test cases adapted from jupyter-server/jupyter_ydoc#370 (prior art for
+# the workaround at the jupyter_ydoc layer). These exercise pycrdt's Text
+# operations directly with the same Unicode edge cases. Each test sets
+# initial content, then applies a granular edit (using SequenceMatcher on
+# byte offsets, matching how jupyter_ydoc.YUnicode.set() works), and verifies
+# the result is correct.
+from difflib import SequenceMatcher
+
+
+def _apply_diff(text, old_value, new_value):
+    """Apply a granular diff from old_value to new_value using character-level
+    SequenceMatcher. With the UTF-16 offset fix, pycrdt Text indices are
+    character-based, so we diff on characters (not bytes)."""
+    matcher = SequenceMatcher(a=old_value, b=new_value)
+
+    offset = 0
+    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
+        if tag == "replace":
+            text[i1 + offset : i2 + offset] = new_value[j1:j2]
+            offset += (j2 - j1) - (i2 - i1)
+        elif tag == "delete":
+            del text[i1 + offset : i2 + offset]
+            offset -= i2 - i1
+        elif tag == "insert":
+            text.insert(i1 + offset, new_value[j1:j2])
+            offset += j2 - j1
+
+
+@pytest.mark.parametrize(
+    "initial, updated",
+    [
+        # emojis swapped
+        (
+            "I like security 🎨 but I really love painting 🔒",
+            "I like security 🔒 but I really love painting 🎨",
+        ),
+        # text changes, emojis stay in place
+        (
+            "Here is a rocket: ⭐ and a star: 🚀",
+            "Here is a star: ⭐ and a rocket: 🚀",
+        ),
+        # change of text and emojis
+        (
+            "Here are some happy faces: 😀😁😂",
+            "Here are some sad faces: 😞😢😭",
+        ),
+        # change of characters with combining marks
+        (
+            "Combining characters: á é í ó ú",
+            "Combining characters: ú ó í é á",
+        ),
+        # flags (regional indicator sequences)
+        (
+            "Flags: 🇺🇸🇬🇧🇨🇦",
+            "Flags: 🇨🇦🇬🇧🇺🇸",
+        ),
+        # Zero-width joiner sequences (family emoji)
+        (
+            "A family 👨\u200d👩\u200d👧\u200d👦 (with two children)",
+            "A family 👨\u200d👩\u200d👧 (with one child)",
+        ),
+        # Mixed RTL/LTR text
+        (
+            "Hello שלום world",
+            "Hello עולם world",
+        ),
+        # Keycap sequences
+        (
+            "Numbers: 1️⃣2️⃣3️⃣",
+            "Numbers: 3️⃣2️⃣1️⃣",
+        ),
+        # Emoji at boundaries
+        (
+            "👋 middle text 🎉",
+            "🎉 middle text 👋",
+        ),
+        # Japanese characters
+        (
+            "こんにちは世界",
+            "こんにちは地球",
+        ),
+        # Julia math operators
+        (
+            "x ∈ [1, 2, 3] && y ≥ 0",
+            "x ∉ [1, 2, 3] || y ≤ 0",
+        ),
+    ],
+    ids=[
+        "emoji_swap",
+        "text_change_emoji_stay",
+        "emoji_change",
+        "combining_marks",
+        "flags",
+        "zwj_family",
+        "rtl_ltr",
+        "keycap",
+        "emoji_boundaries",
+        "japanese",
+        "math_operators",
+    ],
+)
+def test_unicode_granular_diff(initial, updated):
+    """Granular text edits with multi-byte Unicode should produce correct results.
+
+    Test cases adapted from jupyter-server/jupyter_ydoc#370.
+    """
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    text += initial
+    assert str(text) == initial
+
+    _apply_diff(text, initial, updated)
+    assert str(text) == updated, f"Got {str(text)!r}, expected {updated!r}"
+
+
 def test_sticky_index_transaction():
     doc = Doc()
     text = doc.get("text", type=Text)

From 7cf5af44834ba1ae4f108ea99ed668b90530b723 Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Fri, 10 Apr 2026 14:52:11 -0400
Subject: [PATCH 4/8] fix: move SequenceMatcher import to top of file (ruff
 E402)

---
 tests/test_text.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_text.py b/tests/test_text.py
index dd77df2..5efdc1f 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -1,3 +1,5 @@
+from difflib import SequenceMatcher
+
 import pytest
 from anyio import TASK_STATUS_IGNORED, Event, create_task_group
 from anyio.abc import TaskStatus
@@ -387,7 +389,6 @@ def test_unicode_cross_doc_sync():
 # initial content, then applies a granular edit (using SequenceMatcher on
 # byte offsets, matching how jupyter_ydoc.YUnicode.set() works), and verifies
 # the result is correct.
-from difflib import SequenceMatcher
 
 
 def _apply_diff(text, old_value, new_value):

From b1ed6ae56be03f2555f4fc99527143b9a49bf526 Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Mon, 13 Apr 2026 20:06:41 -0400
Subject: [PATCH 5/8] test: add tests for _utf16_to_char helper

Addresses review feedback from @davidbrochart. Tests cover ASCII
(identity), BMP characters, supplementary plane (emoji), multiple
emoji, and roundtrip with _char_to_utf16.
---
 tests/test_text.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/tests/test_text.py b/tests/test_text.py
index 5efdc1f..d67f52e 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -4,6 +4,7 @@
 from anyio import TASK_STATUS_IGNORED, Event, create_task_group
 from anyio.abc import TaskStatus
 from pycrdt import Array, Assoc, Doc, Map, StickyIndex, Text
+from pycrdt._text import _char_to_utf16, _utf16_to_char
 
 pytestmark = pytest.mark.anyio
 
@@ -498,6 +499,62 @@ def test_unicode_granular_diff(initial, updated):
     assert str(text) == updated, f"Got {str(text)!r}, expected {updated!r}"
 
 
+def test_utf16_to_char_ascii():
+    """_utf16_to_char is identity for pure ASCII text."""
+    text = "Hello, World!"
+    for i in range(len(text) + 1):
+        assert _utf16_to_char(text, i) == i
+
+
+def test_utf16_to_char_bmp():
+    """BMP characters (CJK, Cyrillic) are 1 UTF-16 code unit each."""
+    text = "价格分析"  # 4 BMP CJK chars = 4 UTF-16 code units
+    assert _utf16_to_char(text, 0) == 0
+    assert _utf16_to_char(text, 1) == 1
+    assert _utf16_to_char(text, 2) == 2
+    assert _utf16_to_char(text, 4) == 4
+
+
+def test_utf16_to_char_supplementary():
+    """Supplementary plane chars (emoji) take 2 UTF-16 code units."""
+    text = "A📊B"  # UTF-16: A(1) 📊(2) B(1) = 4 code units, 3 chars
+    assert _utf16_to_char(text, 0) == 0  # before A
+    assert _utf16_to_char(text, 1) == 1  # before 📊
+    assert _utf16_to_char(text, 3) == 2  # before B (1 + 2 = 3)
+    assert _utf16_to_char(text, 4) == 3  # end
+
+
+def test_utf16_to_char_multiple_emoji():
+    """Multiple supplementary plane characters."""
+    text = "A📊B🎉C"  # UTF-16: A(1) 📊(2) B(1) 🎉(2) C(1) = 7 units, 5 chars
+    assert _utf16_to_char(text, 0) == 0  # before A
+    assert _utf16_to_char(text, 1) == 1  # before 📊
+    assert _utf16_to_char(text, 3) == 2  # before B
+    assert _utf16_to_char(text, 4) == 3  # before 🎉
+    assert _utf16_to_char(text, 6) == 4  # before C
+    assert _utf16_to_char(text, 7) == 5  # end
+
+
+def test_utf16_to_char_roundtrip():
+    """_char_to_utf16 and _utf16_to_char are inverses."""
+    texts = [
+        "Hello",
+        "A📊B",
+        "价格分析",
+        "# Analysis 📊\n",
+        "A𝒜B𠀀C",
+        "Hello 世界 📊 мир!",
+        "🎉📊🔒",
+    ]
+    for text in texts:
+        for char_idx in range(len(text) + 1):
+            utf16_idx = _char_to_utf16(text, char_idx)
+            assert _utf16_to_char(text, utf16_idx) == char_idx, (
+                f"Roundtrip failed for {text!r} at char_idx={char_idx}: "
+                f"utf16={utf16_idx}, back={_utf16_to_char(text, utf16_idx)}"
+            )
+
+
 def test_sticky_index_transaction():
     doc = Doc()
     text = doc.get("text", type=Text)

From f48610b1a49c2eb312aeb4f4e4e38c76d9489488 Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Fri, 17 Apr 2026 00:05:54 -0400
Subject: [PATCH 6/8] fix: convert UTF-16 offset in Text.__iadd__ and drop
 unused _utf16_to_char
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Text.__iadd__ passed len(self) (Python character count) to the yrs
insert, but yrs expects a UTF-16 code unit index — so `t += "X"` after
an emoji landed inside the surrogate pair. Convert the index through
_char_to_utf16, matching every other mutating method.

Also removes _utf16_to_char and its tests, which had no callers.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/pycrdt/_text.py | 16 ++--------
 tests/test_text.py     | 69 ++++++++----------------------------------
 2 files changed, 15 insertions(+), 70 deletions(-)

diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py
index 418bab3..069da86 100644
--- a/python/pycrdt/_text.py
+++ b/python/pycrdt/_text.py
@@ -29,18 +29,6 @@ def _char_to_utf16(text: str, char_index: int) -> int:
     return char_index + extra
 
 
-def _utf16_to_char(text: str, utf16_index: int) -> int:
-    """Convert a UTF-16 code unit index back to a Python character index."""
-    char_idx = 0
-    utf16_idx = 0
-    for ch in text:
-        if utf16_idx >= utf16_index:
-            break
-        utf16_idx += 2 if ord(ch) > 0xFFFF else 1
-        char_idx += 1
-    return char_idx
-
-
 class Text(Sequence):
     """
     A shared data type used for collaborative text editing, similar to a Python `str`.
@@ -158,7 +146,9 @@ def __iadd__(self, value: str) -> Text:
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
-            self.integrated.insert(txn._txn, len(self), value)
+            current = str(self)
+            utf16_index = _char_to_utf16(current, len(current))
+            self.integrated.insert(txn._txn, utf16_index, value)
             return self
 
     def _check_slice(self, key: slice) -> tuple[int, int]:
diff --git a/tests/test_text.py b/tests/test_text.py
index d67f52e..fc06f0a 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -4,7 +4,7 @@
 from anyio import TASK_STATUS_IGNORED, Event, create_task_group
 from anyio.abc import TaskStatus
 from pycrdt import Array, Assoc, Doc, Map, StickyIndex, Text
-from pycrdt._text import _char_to_utf16, _utf16_to_char
+from pycrdt._text import _char_to_utf16
 
 pytestmark = pytest.mark.anyio
 
@@ -259,6 +259,17 @@ def test_unicode_emoji_sequential_inserts():
     assert str(text) == expected, f"Got {str(text)!r}"
 
 
+def test_unicode_emoji_iadd():
+    """`+=` after emoji should append at the end (regression for UTF-16 offset bug)."""
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    text += "A📊B"
+    text += "X"
+
+    assert str(text) == "A📊BX"
+
+
 def test_unicode_emoji_len():
     """len() should return Python character count, not byte count."""
     doc = Doc()
@@ -499,62 +510,6 @@ def test_unicode_granular_diff(initial, updated):
     assert str(text) == updated, f"Got {str(text)!r}, expected {updated!r}"
 
 
-def test_utf16_to_char_ascii():
-    """_utf16_to_char is identity for pure ASCII text."""
-    text = "Hello, World!"
-    for i in range(len(text) + 1):
-        assert _utf16_to_char(text, i) == i
-
-
-def test_utf16_to_char_bmp():
-    """BMP characters (CJK, Cyrillic) are 1 UTF-16 code unit each."""
-    text = "价格分析"  # 4 BMP CJK chars = 4 UTF-16 code units
-    assert _utf16_to_char(text, 0) == 0
-    assert _utf16_to_char(text, 1) == 1
-    assert _utf16_to_char(text, 2) == 2
-    assert _utf16_to_char(text, 4) == 4
-
-
-def test_utf16_to_char_supplementary():
-    """Supplementary plane chars (emoji) take 2 UTF-16 code units."""
-    text = "A📊B"  # UTF-16: A(1) 📊(2) B(1) = 4 code units, 3 chars
-    assert _utf16_to_char(text, 0) == 0  # before A
-    assert _utf16_to_char(text, 1) == 1  # before 📊
-    assert _utf16_to_char(text, 3) == 2  # before B (1 + 2 = 3)
-    assert _utf16_to_char(text, 4) == 3  # end
-
-
-def test_utf16_to_char_multiple_emoji():
-    """Multiple supplementary plane characters."""
-    text = "A📊B🎉C"  # UTF-16: A(1) 📊(2) B(1) 🎉(2) C(1) = 7 units, 5 chars
-    assert _utf16_to_char(text, 0) == 0  # before A
-    assert _utf16_to_char(text, 1) == 1  # before 📊
-    assert _utf16_to_char(text, 3) == 2  # before B
-    assert _utf16_to_char(text, 4) == 3  # before 🎉
-    assert _utf16_to_char(text, 6) == 4  # before C
-    assert _utf16_to_char(text, 7) == 5  # end
-
-
-def test_utf16_to_char_roundtrip():
-    """_char_to_utf16 and _utf16_to_char are inverses."""
-    texts = [
-        "Hello",
-        "A📊B",
-        "价格分析",
-        "# Analysis 📊\n",
-        "A𝒜B𠀀C",
-        "Hello 世界 📊 мир!",
-        "🎉📊🔒",
-    ]
-    for text in texts:
-        for char_idx in range(len(text) + 1):
-            utf16_idx = _char_to_utf16(text, char_idx)
-            assert _utf16_to_char(text, utf16_idx) == char_idx, (
-                f"Roundtrip failed for {text!r} at char_idx={char_idx}: "
-                f"utf16={utf16_idx}, back={_utf16_to_char(text, utf16_idx)}"
-            )
-
-
 def test_sticky_index_transaction():
     doc = Doc()
     text = doc.get("text", type=Text)

From 780d8a877e6a830bc85afc9991b23e7d231e1de1 Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Fri, 17 Apr 2026 00:17:13 -0400
Subject: [PATCH 7/8] fix: use UTF-16 offsets for XmlText operations

XmlText mirrored the same bug that #379 fixed in Text: every mutating
method passed a raw Python character index to yrs, but yrs (with
OffsetKind::Utf16 set on the Doc) expects UTF-16 code unit offsets.
Non-BMP content (emoji, supplementary plane) landed at the wrong
position or split surrogate pairs.

Convert the index through _char_to_utf16 in insert, insert_embed,
format, and __delitem__ (handling surrogate-pair length for single
deletes). __iadd__ inherits the fix via self.insert.

Also change __len__ to return Python character count, matching Text.
Previously it returned yrs' UTF-16 length, which disagreed with the
string returned by str(self) for non-BMP characters.

Depends on #379 for the OffsetKind::Utf16 doc option.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/pycrdt/_xml.py | 39 +++++++++++++++++++--------
 tests/test_xml.py     | 63 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+), 11 deletions(-)

diff --git a/python/pycrdt/_xml.py b/python/pycrdt/_xml.py
index b9125a0..b7180c4 100644
--- a/python/pycrdt/_xml.py
+++ b/python/pycrdt/_xml.py
@@ -7,6 +7,7 @@
 from ._pycrdt import XmlEvent as _XmlEvent
 from ._pycrdt import XmlFragment as _XmlFragment
 from ._pycrdt import XmlText as _XmlText
+from ._text import _char_to_utf16
 
 if TYPE_CHECKING:
     from typing import Any, Iterable, Mapping, Sized, TypeVar
@@ -228,8 +229,9 @@ def _init(self, value: str | None) -> None:  # pragma: no cover
             self.integrated.insert(txn._txn, 0, value)
 
     def __len__(self) -> int:
-        with self.doc.transaction() as txn:
-            return self.integrated.len(txn._txn)
+        # Python character count, matching Text.__len__. yrs' internal len
+        # is in UTF-16 code units, which would misreport for non-BMP chars.
+        return len(str(self))
 
     def __iadd__(self, value: str) -> XmlText:
         with self.doc.transaction():
@@ -247,8 +249,12 @@ def insert(self, index: int, value: str, attrs: Mapping[str, Any] | None = None)
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
+            utf16_index = _char_to_utf16(str(self), index)
             self.integrated.insert(
-                txn._txn, index, value, iter(attrs.items()) if attrs is not None else iter([])
+                txn._txn,
+                utf16_index,
+                value,
+                iter(attrs.items()) if attrs is not None else iter([]),
             )
 
     def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = None) -> None:
@@ -263,13 +269,14 @@ def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = No
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
             _attrs = iter(attrs.items()) if attrs is not None else None
+            utf16_index = _char_to_utf16(str(self), index)
             if isinstance(value, BaseType):
                 # shared type
                 assert txn._txn is not None
-                self._do_and_integrate("insert", value, txn._txn, index, _attrs)
+                self._do_and_integrate("insert", value, txn._txn, utf16_index, _attrs)
             else:
                 # primitive type
-                self.integrated.insert_embed(txn._txn, index, value, _attrs)
+                self.integrated.insert_embed(txn._txn, utf16_index, value, _attrs)
 
     def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None:
         """
@@ -283,9 +290,12 @@ def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None:
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
             start, stop = _check_slice(self, slice(start, stop))
-            length = stop - start
+            current = str(self)
+            utf16_start = _char_to_utf16(current, start)
+            utf16_stop = _char_to_utf16(current, stop)
+            length = utf16_stop - utf16_start
             if length > 0:
-                self.integrated.format(txn._txn, start, length, iter(attrs.items()))
+                self.integrated.format(txn._txn, utf16_start, length, iter(attrs.items()))
 
     def diff(self) -> list[tuple[Any, dict[str, Any] | None]]:
         """
@@ -301,13 +311,20 @@ def diff(self) -> list[tuple[Any, dict[str, Any] | None]]:
     def __delitem__(self, key: int | slice) -> None:
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
+            current = str(self)
             if isinstance(key, int):
-                self.integrated.remove_range(txn._txn, key, 1)
+                utf16_idx = _char_to_utf16(current, key)
+                char_at = current[key]
+                utf16_len = 2 if ord(char_at) > 0xFFFF else 1
+                self.integrated.remove_range(txn._txn, utf16_idx, utf16_len)
             elif isinstance(key, slice):
                 start, stop = _check_slice(self, key)
-                length = stop - start
-                if length > 0:
-                    self.integrated.remove_range(txn._txn, start, length)
+                if stop - start > 0:
+                    utf16_start = _char_to_utf16(current, start)
+                    utf16_stop = _char_to_utf16(current, stop)
+                    self.integrated.remove_range(
+                        txn._txn, utf16_start, utf16_stop - utf16_start
+                    )
             else:
                 raise TypeError(f"Index not supported: {key}")
 
diff --git a/tests/test_xml.py b/tests/test_xml.py
index 058e2a0..60717aa 100644
--- a/tests/test_xml.py
+++ b/tests/test_xml.py
@@ -166,6 +166,69 @@ def test_text():
     doc["test2"] = XmlFragment([XmlText()])
 
 
+def test_xml_text_unicode_len():
+    """XmlText.__len__ returns Python character count, matching Text."""
+    doc = Doc()
+    doc["x"] = frag = XmlFragment([XmlText("A📊B")])
+    text = frag.children[0]
+
+    assert len(text) == 3  # 3 Python chars, not 4 UTF-16 code units
+
+
+def test_xml_text_unicode_iadd():
+    """`+=` after an emoji appends at the end (regression test for the UTF-16 offset bug)."""
+    doc = Doc()
+    doc["x"] = frag = XmlFragment([XmlText()])
+    text = frag.children[0]
+
+    text += "A📊B"
+    text += "X"
+
+    assert str(text) == "A📊BX"
+
+
+def test_xml_text_unicode_insert():
+    """insert() places text at the correct position after an emoji."""
+    doc = Doc()
+    doc["x"] = frag = XmlFragment([XmlText("A📊B")])
+    text = frag.children[0]
+
+    text.insert(2, "X")
+
+    assert str(text) == "A📊XB"
+
+
+def test_xml_text_unicode_delete():
+    """del by index and by slice removes the correct characters when emoji are present."""
+    doc = Doc()
+    doc["x"] = frag = XmlFragment([XmlText("A📊BC")])
+    text = frag.children[0]
+
+    del text[1]  # remove 📊
+    assert str(text) == "ABC"
+
+    doc["y"] = frag2 = XmlFragment([XmlText("A📊BC🎉D")])
+    text2 = frag2.children[0]
+
+    del text2[1:4]  # remove 📊BC
+    assert str(text2) == "A🎉D"
+
+
+def test_xml_text_unicode_format():
+    """format() takes character-index bounds and formats a non-BMP char as one unit."""
+    doc = Doc()
+    doc["x"] = frag = XmlFragment([XmlText("A📊B")])
+    text = frag.children[0]
+
+    text.format(1, 2, {"bold": True})  # format just the emoji
+
+    assert text.diff() == [
+        ("A", None),
+        ("📊", {"bold": True}),
+        ("B", None),
+    ]
+
+
 def test_element_with_any_attribute():
     doc = Doc()
 

From 952e09c1adbb02466f5b19f4930af09526122d96 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 17 Apr 2026 04:18:45 +0000
Subject: [PATCH 8/8] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 python/pycrdt/_xml.py | 4 +---
 tests/test_text.py    | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/python/pycrdt/_xml.py b/python/pycrdt/_xml.py
index b7180c4..0741729 100644
--- a/python/pycrdt/_xml.py
+++ b/python/pycrdt/_xml.py
@@ -322,9 +322,7 @@ def __delitem__(self, key: int | slice) -> None:
                 if stop - start > 0:
                     utf16_start = _char_to_utf16(current, start)
                     utf16_stop = _char_to_utf16(current, stop)
-                    self.integrated.remove_range(
-                        txn._txn, utf16_start, utf16_stop - utf16_start
-                    )
+                    self.integrated.remove_range(txn._txn, utf16_start, utf16_stop - utf16_start)
             else:
                 raise TypeError(f"Index not supported: {key}")
 
diff --git a/tests/test_text.py b/tests/test_text.py
index fc06f0a..f84b933 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -4,7 +4,6 @@
 from anyio import TASK_STATUS_IGNORED, Event, create_task_group
 from anyio.abc import TaskStatus
 from pycrdt import Array, Assoc, Doc, Map, StickyIndex, Text
-from pycrdt._text import _char_to_utf16
 
 pytestmark = pytest.mark.anyio