From 669f98db504af8fcb710c5da91e44d32cff1696f Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Fri, 10 Apr 2026 12:23:25 -0400 Subject: [PATCH 1/8] fix: use UTF-16 offsets for Text operations (fixes #308) Set OffsetKind::Utf16 on yrs Doc so the wire format uses UTF-16 code unit offsets, matching JS yjs. Without this, pycrdt uses UTF-8 byte offsets, causing findIndexSS "Unexpected case" crashes when JS yjs clients apply incremental updates containing multi-byte characters. In the Python wrapper, convert character (code point) indices to UTF-16 code unit indices before passing to yrs. This ensures Text.insert(), __setitem__, __delitem__, and format() all work correctly with emoji and other non-BMP characters. Fixes: #308 Related: jupyter-ai-contrib/jupyter-server-documents#197 --- python/pycrdt/_text.py | 77 ++++++++++++++++++++++++++++++++++-------- src/doc.rs | 8 ++++- 2 files changed, 70 insertions(+), 15 deletions(-) diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py index 49356c0..418bab3 100644 --- a/python/pycrdt/_text.py +++ b/python/pycrdt/_text.py @@ -11,6 +11,36 @@ from ._doc import Doc +def _char_to_utf16(text: str, char_index: int) -> int: + """Convert a Python character (code point) index to a UTF-16 code unit index. + + Characters outside the Basic Multilingual Plane (e.g. emoji) occupy 2 + UTF-16 code units but only 1 Python character. The underlying yrs library + uses UTF-16 offsets, so all indices passed to it must be converted. + + For pure-ASCII / BMP text this is a no-op (returns ``char_index`` + unchanged). + """ + if char_index == 0: + return 0 + prefix = text[:char_index] + # Count characters that need a surrogate pair (code point > 0xFFFF) + extra = sum(1 for ch in prefix if ord(ch) > 0xFFFF) + return char_index + extra + + +def _utf16_to_char(text: str, utf16_index: int) -> int: + """Convert a UTF-16 code unit index back to a Python character index.""" + char_idx = 0 + utf16_idx = 0 + for ch in text: + if utf16_idx >= utf16_index: + break + utf16_idx += 2 if ord(ch) > 0xFFFF else 1 + char_idx += 1 + return char_idx + + class Text(Sequence): """ A shared data type used for collaborative text editing, similar to a Python `str`. @@ -89,10 +119,10 @@ def __len__(self) -> int: ``` Returns: - The length of the text. + The length of the text (in Python characters, not UTF-16 code units). """ - with self.doc.transaction() as txn: - return self.integrated.len(txn._txn) + # Return Python character count, not yrs UTF-16 code unit count + return len(str(self)) def __str__(self) -> str: """ @@ -169,13 +199,19 @@ def __delitem__(self, key: int | slice) -> None: """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) + current = str(self) if isinstance(key, int): - self.integrated.remove_range(txn._txn, key, 1) + utf16_idx = _char_to_utf16(current, key) + char_at = current[key] + utf16_len = 2 if ord(char_at) > 0xFFFF else 1 + self.integrated.remove_range(txn._txn, utf16_idx, utf16_len) elif isinstance(key, slice): start, stop = self._check_slice(key) length = stop - start if length > 0: - self.integrated.remove_range(txn._txn, start, length) + utf16_start = _char_to_utf16(current, start) + utf16_stop = _char_to_utf16(current, stop) + self.integrated.remove_range(txn._txn, utf16_start, utf16_stop - utf16_start) else: raise RuntimeError(f"Index not supported: {key}") @@ -214,20 +250,26 @@ def __setitem__(self, key: int | slice, value: str) -> None: """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) + current = str(self) if isinstance(key, int): value_len = len(value) if value_len != 1: raise RuntimeError( f"Single item assigned value must have a length of 1, not {value_len}" ) - del self[key] - self.integrated.insert(txn._txn, key, value) + utf16_idx = _char_to_utf16(current, key) + char_at = current[key] + utf16_len = 2 if ord(char_at) > 0xFFFF else 1 + self.integrated.remove_range(txn._txn, utf16_idx, utf16_len) + self.integrated.insert(txn._txn, utf16_idx, value) elif isinstance(key, slice): start, stop = self._check_slice(key) - length = stop - start + utf16_start = _char_to_utf16(current, start) + utf16_stop = _char_to_utf16(current, stop) + length = utf16_stop - utf16_start if length > 0: - self.integrated.remove_range(txn._txn, start, length) - self.integrated.insert(txn._txn, start, value) + self.integrated.remove_range(txn._txn, utf16_start, length) + self.integrated.insert(txn._txn, utf16_start, value) else: raise RuntimeError(f"Index not supported: {key}") @@ -251,8 +293,10 @@ def insert(self, index: int, value: str, attrs: dict[str, Any] | None = None) -> """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) + current = str(self) + utf16_index = _char_to_utf16(current, index) self.integrated.insert( - txn._txn, index, value, iter(attrs.items()) if attrs is not None else None + txn._txn, utf16_index, value, iter(attrs.items()) if attrs is not None else None ) def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = None) -> None: @@ -266,8 +310,10 @@ def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = No """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) + current = str(self) + utf16_index = _char_to_utf16(current, index) self.integrated.insert_embed( - txn._txn, index, value, iter(attrs.items()) if attrs is not None else None + txn._txn, utf16_index, value, iter(attrs.items()) if attrs is not None else None ) def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None: @@ -282,9 +328,12 @@ def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None: with self.doc.transaction() as txn: self._forbid_read_transaction(txn) start, stop = self._check_slice(slice(start, stop)) - length = stop - start + current = str(self) + utf16_start = _char_to_utf16(current, start) + utf16_stop = _char_to_utf16(current, stop) + length = utf16_stop - utf16_start if length > 0: - self.integrated.format(txn._txn, start, length, iter(attrs.items())) + self.integrated.format(txn._txn, utf16_start, length, iter(attrs.items())) def diff(self) -> list[tuple[Any, dict[str, Any] | None]]: """ diff --git a/src/doc.rs b/src/doc.rs index 61109cc..27e875d 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -3,7 +3,7 @@ use pyo3::IntoPyObjectExt; use pyo3::exceptions::{PyRuntimeError, PyValueError}; use pyo3::types::{PyBool, PyBytes, PyDict, PyInt, PyList}; use yrs::{ - Doc as _Doc, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn + Doc as _Doc, OffsetKind, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn }; use yrs::updates::encoder::{Encode, Encoder}; use yrs::updates::decoder::Decode; @@ -32,6 +32,7 @@ impl Doc { let mut options = yrs::Options::default(); options.client_id = original.doc.client_id(); options.skip_gc = original.doc.skip_gc(); + options.offset_kind = OffsetKind::Utf16; if let Some(collection_id) = original.doc.collection_id() { options.collection_id = Some(collection_id); } @@ -84,6 +85,11 @@ impl Doc { .map_err(|_| PyValueError::new_err("skip_gc must be a valid bool"))?; options.skip_gc = _skip_gc; } + // Use UTF-16 offsets for compatibility with JS yjs clients. + // Without this, pycrdt uses UTF-8 byte offsets which causes + // findIndexSS crashes when JS yjs applies incremental updates + // containing multi-byte characters. + options.offset_kind = OffsetKind::Utf16; let doc = _Doc::with_options(options); Ok(Doc { doc }) } From f3a2e7e596fd2d7bf2b35ea4fdbf97eb7b5eb484 Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Fri, 10 Apr 2026 14:44:42 -0400 Subject: [PATCH 2/8] test: add 12 Unicode/emoji tests for Text operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cover insert, delete, setitem, slice, len, and cross-doc sync with: - emoji (surrogate pairs: πŸ“Š πŸŽ‰) - CJK (BMP: δ»·ζ Ό δΈ–η•Œ 特征ε·₯程) - Cyrillic (ΠΌΠΈΡ€) - supplementary plane (π’œ π €€) - mixed scripts in one text These all fail on stock pycrdt 0.12.50 and pass with the OffsetKind::Utf16 fix. --- tests/test_text.py | 153 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/tests/test_text.py b/tests/test_text.py index ba913a0..ab4e507 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -228,6 +228,159 @@ def test_sticky_index(serialize: str): assert text1[new_idx] == "*" +def test_unicode_emoji_insert(): + """Text.insert() after emoji characters should use character positions, not byte offsets.""" + doc = Doc() + doc["text"] = text = Text() + + text += "AπŸ“ŠB" + assert str(text) == "AπŸ“ŠB" + assert len(text) == 3 + + # Insert at position 2 = between πŸ“Š and B + text.insert(2, "X") + assert str(text) == "AπŸ“ŠXB", f"Got {str(text)!r}, emoji insert position is wrong" + + +def test_unicode_emoji_sequential_inserts(): + """Sequential inserts after emoji should maintain correct positions.""" + doc = Doc() + doc["text"] = text = Text() + + text += "# Analysis πŸ“Š\n" + text.insert(len(text), "model = fit()\n") + text.insert(len(text), "# 特征ε·₯程\n") + text.insert(len(text), 'print("done")\n') + + expected = '# Analysis πŸ“Š\nmodel = fit()\n# 特征ε·₯程\nprint("done")\n' + assert str(text) == expected, f"Got {str(text)!r}" + + +def test_unicode_emoji_len(): + """len() should return Python character count, not byte count.""" + doc = Doc() + doc["text"] = text = Text() + + text += "AπŸ“ŠB" + assert len(text) == 3 # 3 chars, not 6 bytes or 4 UTF-16 code units + + text += "πŸŽ‰" + assert len(text) == 4 + + +def test_unicode_emoji_delete(): + """Deleting a character after an emoji should work correctly.""" + doc = Doc() + doc["text"] = text = Text("AπŸ“ŠBC") + + del text[2] # delete B (after emoji) + assert str(text) == "AπŸ“ŠC", f"Got {str(text)!r}" + + +def test_unicode_emoji_delete_emoji(): + """Deleting an emoji character itself should work correctly.""" + doc = Doc() + doc["text"] = text = Text("AπŸ“ŠB") + + del text[1] # delete πŸ“Š + assert str(text) == "AB", f"Got {str(text)!r}" + + +def test_unicode_emoji_slice_delete(): + """Slice deletion across emoji boundaries should work correctly.""" + doc = Doc() + doc["text"] = text = Text("AπŸ“ŠBπŸŽ‰C") + + del text[1:4] # delete πŸ“ŠBπŸŽ‰ + assert str(text) == "AC", f"Got {str(text)!r}" + + +def test_unicode_emoji_setitem(): + """Replacing a character after an emoji should work correctly.""" + doc = Doc() + doc["text"] = text = Text("AπŸ“ŠBC") + + text[2] = "X" # replace B (after emoji) + assert str(text) == "AπŸ“ŠXC", f"Got {str(text)!r}" + + +def test_unicode_emoji_slice_setitem(): + """Slice replacement spanning emoji should work correctly.""" + doc = Doc() + doc["text"] = text = Text("AπŸ“ŠBπŸŽ‰C") + + text[1:4] = "XYZ" # replace πŸ“ŠBπŸŽ‰ with XYZ + assert str(text) == "AXYZC", f"Got {str(text)!r}" + + +def test_unicode_cjk(): + """CJK characters (BMP, 1 UTF-16 code unit each) should work correctly.""" + doc = Doc() + doc["text"] = text = Text() + + text += "δ»·ζ Ό" + text.insert(2, "X") + assert str(text) == "δ»·ζ ΌX", f"Got {str(text)!r}" + assert len(text) == 3 + + +def test_unicode_mixed_scripts(): + """Mixed ASCII, CJK, Cyrillic, and emoji in one text.""" + doc = Doc() + doc["text"] = text = Text() + + text += "Hello" + text.insert(5, " δΈ–η•Œ") + text.insert(8, " πŸ“Š") + text.insert(11, " ΠΌΠΈΡ€") + text.insert(15, "!") + + expected = "Hello δΈ–η•Œ πŸ“Š ΠΌΠΈΡ€!" + assert str(text) == expected, f"Got {str(text)!r}" + assert len(text) == 15 + + +def test_unicode_supplementary_plane(): + """Characters outside BMP (require UTF-16 surrogate pairs).""" + doc = Doc() + doc["text"] = text = Text() + + # π’œ (U+1D49C) = Mathematical Script Capital A + # π €€ (U+20000) = CJK Unified Ideograph Extension B + text += "Aπ’œBπ €€C" + assert len(text) == 5 + + text.insert(2, "X") # between π’œ and B + assert str(text) == "Aπ’œXBπ €€C", f"Got {str(text)!r}" + + text.insert(5, "Y") # between π €€ and C + assert str(text) == "Aπ’œXBπ €€YC", f"Got {str(text)!r}" + + +def test_unicode_cross_doc_sync(): + """Updates with Unicode content should sync correctly between two pycrdt docs.""" + doc1 = Doc() + doc1["text"] = text1 = Text() + + # Capture updates from doc1 + updates = [] + doc1.observe(lambda event: updates.append(event.update)) + + text1 += "# Analysis πŸ“Š\n" + text1.insert(len(text1), "model = fit()\n") + text1.insert(len(text1), "# 特征ε·₯程\n") + + # Apply to doc2 + doc2 = Doc() + doc2["text"] = Text() + for update in updates: + doc2.apply_update(update) + + assert str(doc2["text"]) == str(text1), ( + f"Docs diverged: doc1={str(text1)!r} doc2={str(doc2['text'])!r}" + ) + + def test_sticky_index_transaction(): doc = Doc() text = doc.get("text", type=Text) From 8264d95de14fb00435d338b76111dc81ab12b97a Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Fri, 10 Apr 2026 14:48:36 -0400 Subject: [PATCH 3/8] test: add granular diff tests from jupyter_ydoc#370 11 parametrized test cases adapted from jupyter-server/jupyter_ydoc#370 covering emoji swaps, flags, ZWJ family sequences, combining marks, keycap sequences, RTL/LTR text, Japanese, and math operators. These exercise Text insert/delete/replace via SequenceMatcher-based diffing (the same pattern jupyter_ydoc.YUnicode.set() uses). --- tests/test_text.py | 116 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/tests/test_text.py b/tests/test_text.py index ab4e507..dd77df2 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -381,6 +381,122 @@ def test_unicode_cross_doc_sync(): ) +# Test cases adapted from jupyter-server/jupyter_ydoc#370 (prior art for +# the workaround at the jupyter_ydoc layer). These exercise pycrdt's Text +# operations directly with the same Unicode edge cases. Each test sets +# initial content, then applies a granular edit (using SequenceMatcher on +# byte offsets, matching how jupyter_ydoc.YUnicode.set() works), and verifies +# the result is correct. +from difflib import SequenceMatcher + + +def _apply_diff(text, old_value, new_value): + """Apply a granular diff from old_value to new_value using character-level + SequenceMatcher. With the UTF-16 offset fix, pycrdt Text indices are + character-based, so we diff on characters (not bytes).""" + matcher = SequenceMatcher(a=old_value, b=new_value) + + offset = 0 + for tag, i1, i2, j1, j2 in matcher.get_opcodes(): + if tag == "replace": + text[i1 + offset : i2 + offset] = new_value[j1:j2] + offset += (j2 - j1) - (i2 - i1) + elif tag == "delete": + del text[i1 + offset : i2 + offset] + offset -= i2 - i1 + elif tag == "insert": + text.insert(i1 + offset, new_value[j1:j2]) + offset += j2 - j1 + + +@pytest.mark.parametrize( + "initial, updated", + [ + # emojis swapped + ( + "I like security 🎨 but I really love painting πŸ”’", + "I like security πŸ”’ but I really love painting 🎨", + ), + # text changes, emojis stay in place + ( + "Here is a rocket: ⭐ and a star: πŸš€", + "Here is a star: ⭐ and a rocket: πŸš€", + ), + # change of text and emojis + ( + "Here are some happy faces: πŸ˜€πŸ˜πŸ˜‚", + "Here are some sad faces: 😞😒😭", + ), + # change of characters with combining marks + ( + "Combining characters: Γ‘ Γ© Γ­ Γ³ ΓΊ", + "Combining characters: ΓΊ Γ³ Γ­ Γ© Γ‘", + ), + # flags (regional indicator sequences) + ( + "Flags: πŸ‡ΊπŸ‡ΈπŸ‡¬πŸ‡§πŸ‡¨πŸ‡¦", + "Flags: πŸ‡¨πŸ‡¦πŸ‡¬πŸ‡§πŸ‡ΊπŸ‡Έ", + ), + # Zero-width joiner sequences (family emoji) + ( + "A family πŸ‘¨\u200dπŸ‘©\u200dπŸ‘§\u200dπŸ‘¦ (with two children)", + "A family πŸ‘¨\u200dπŸ‘©\u200dπŸ‘§ (with one child)", + ), + # Mixed RTL/LTR text + ( + "Hello Χ©ΧœΧ•Χ world", + "Hello Χ’Χ•ΧœΧ world", + ), + # Keycap sequences + ( + "Numbers: 1️⃣2️⃣3️⃣", + "Numbers: 3️⃣2️⃣1️⃣", + ), + # Emoji at boundaries + ( + "πŸ‘‹ middle text πŸŽ‰", + "πŸŽ‰ middle text πŸ‘‹", + ), + # Japanese characters + ( + "γ“γ‚“γ«γ‘γ―δΈ–η•Œ", + "γ“γ‚“γ«γ‘γ―εœ°ηƒ", + ), + # Julia math operators + ( + "x ∈ [1, 2, 3] && y β‰₯ 0", + "x βˆ‰ [1, 2, 3] || y ≀ 0", + ), + ], + ids=[ + "emoji_swap", + "text_change_emoji_stay", + "emoji_change", + "combining_marks", + "flags", + "zwj_family", + "rtl_ltr", + "keycap", + "emoji_boundaries", + "japanese", + "math_operators", + ], +) +def test_unicode_granular_diff(initial, updated): + """Granular text edits with multi-byte Unicode should produce correct results. + + Test cases adapted from jupyter-server/jupyter_ydoc#370. + """ + doc = Doc() + doc["text"] = text = Text() + + text += initial + assert str(text) == initial + + _apply_diff(text, initial, updated) + assert str(text) == updated, f"Got {str(text)!r}, expected {updated!r}" + + def test_sticky_index_transaction(): doc = Doc() text = doc.get("text", type=Text) From 7cf5af44834ba1ae4f108ea99ed668b90530b723 Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Fri, 10 Apr 2026 14:52:11 -0400 Subject: [PATCH 4/8] fix: move SequenceMatcher import to top of file (ruff E402) --- tests/test_text.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_text.py b/tests/test_text.py index dd77df2..5efdc1f 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -1,3 +1,5 @@ +from difflib import SequenceMatcher + import pytest from anyio import TASK_STATUS_IGNORED, Event, create_task_group from anyio.abc import TaskStatus @@ -387,7 +389,6 @@ def test_unicode_cross_doc_sync(): # initial content, then applies a granular edit (using SequenceMatcher on # byte offsets, matching how jupyter_ydoc.YUnicode.set() works), and verifies # the result is correct. -from difflib import SequenceMatcher def _apply_diff(text, old_value, new_value): From b1ed6ae56be03f2555f4fc99527143b9a49bf526 Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Mon, 13 Apr 2026 20:06:41 -0400 Subject: [PATCH 5/8] test: add tests for _utf16_to_char helper Addresses review feedback from @davidbrochart. Tests cover ASCII (identity), BMP characters, supplementary plane (emoji), multiple emoji, and roundtrip with _char_to_utf16. --- tests/test_text.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/test_text.py b/tests/test_text.py index 5efdc1f..d67f52e 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -4,6 +4,7 @@ from anyio import TASK_STATUS_IGNORED, Event, create_task_group from anyio.abc import TaskStatus from pycrdt import Array, Assoc, Doc, Map, StickyIndex, Text +from pycrdt._text import _char_to_utf16, _utf16_to_char pytestmark = pytest.mark.anyio @@ -498,6 +499,62 @@ def test_unicode_granular_diff(initial, updated): assert str(text) == updated, f"Got {str(text)!r}, expected {updated!r}" +def test_utf16_to_char_ascii(): + """_utf16_to_char is identity for pure ASCII text.""" + text = "Hello, World!" + for i in range(len(text) + 1): + assert _utf16_to_char(text, i) == i + + +def test_utf16_to_char_bmp(): + """BMP characters (CJK, Cyrillic) are 1 UTF-16 code unit each.""" + text = "δ»·ζ Όεˆ†ζž" # 4 BMP CJK chars = 4 UTF-16 code units + assert _utf16_to_char(text, 0) == 0 + assert _utf16_to_char(text, 1) == 1 + assert _utf16_to_char(text, 2) == 2 + assert _utf16_to_char(text, 4) == 4 + + +def test_utf16_to_char_supplementary(): + """Supplementary plane chars (emoji) take 2 UTF-16 code units.""" + text = "AπŸ“ŠB" # UTF-16: A(1) πŸ“Š(2) B(1) = 4 code units, 3 chars + assert _utf16_to_char(text, 0) == 0 # before A + assert _utf16_to_char(text, 1) == 1 # before πŸ“Š + assert _utf16_to_char(text, 3) == 2 # before B (1 + 2 = 3) + assert _utf16_to_char(text, 4) == 3 # end + + +def test_utf16_to_char_multiple_emoji(): + """Multiple supplementary plane characters.""" + text = "AπŸ“ŠBπŸŽ‰C" # UTF-16: A(1) πŸ“Š(2) B(1) πŸŽ‰(2) C(1) = 7 units, 5 chars + assert _utf16_to_char(text, 0) == 0 # before A + assert _utf16_to_char(text, 1) == 1 # before πŸ“Š + assert _utf16_to_char(text, 3) == 2 # before B + assert _utf16_to_char(text, 4) == 3 # before πŸŽ‰ + assert _utf16_to_char(text, 6) == 4 # before C + assert _utf16_to_char(text, 7) == 5 # end + + +def test_utf16_to_char_roundtrip(): + """_char_to_utf16 and _utf16_to_char are inverses.""" + texts = [ + "Hello", + "AπŸ“ŠB", + "δ»·ζ Όεˆ†ζž", + "# Analysis πŸ“Š\n", + "Aπ’œBπ €€C", + "Hello δΈ–η•Œ πŸ“Š ΠΌΠΈΡ€!", + "πŸŽ‰πŸ“ŠπŸ”’", + ] + for text in texts: + for char_idx in range(len(text) + 1): + utf16_idx = _char_to_utf16(text, char_idx) + assert _utf16_to_char(text, utf16_idx) == char_idx, ( + f"Roundtrip failed for {text!r} at char_idx={char_idx}: " + f"utf16={utf16_idx}, back={_utf16_to_char(text, utf16_idx)}" + ) + + def test_sticky_index_transaction(): doc = Doc() text = doc.get("text", type=Text) From f48610b1a49c2eb312aeb4f4e4e38c76d9489488 Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Fri, 17 Apr 2026 00:05:54 -0400 Subject: [PATCH 6/8] fix: convert UTF-16 offset in Text.__iadd__ and drop unused _utf16_to_char MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Text.__iadd__ passed len(self) (Python character count) to the yrs insert, but yrs expects a UTF-16 code unit index β€” so `t += "X"` after an emoji landed inside the surrogate pair. Convert the index through _char_to_utf16, matching every other mutating method. Also removes _utf16_to_char and its tests, which had no callers. Co-Authored-By: Claude Opus 4.7 (1M context) --- python/pycrdt/_text.py | 16 ++-------- tests/test_text.py | 69 ++++++++---------------------------------- 2 files changed, 15 insertions(+), 70 deletions(-) diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py index 418bab3..069da86 100644 --- a/python/pycrdt/_text.py +++ b/python/pycrdt/_text.py @@ -29,18 +29,6 @@ def _char_to_utf16(text: str, char_index: int) -> int: return char_index + extra -def _utf16_to_char(text: str, utf16_index: int) -> int: - """Convert a UTF-16 code unit index back to a Python character index.""" - char_idx = 0 - utf16_idx = 0 - for ch in text: - if utf16_idx >= utf16_index: - break - utf16_idx += 2 if ord(ch) > 0xFFFF else 1 - char_idx += 1 - return char_idx - - class Text(Sequence): """ A shared data type used for collaborative text editing, similar to a Python `str`. @@ -158,7 +146,9 @@ def __iadd__(self, value: str) -> Text: """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) - self.integrated.insert(txn._txn, len(self), value) + current = str(self) + utf16_index = _char_to_utf16(current, len(current)) + self.integrated.insert(txn._txn, utf16_index, value) return self def _check_slice(self, key: slice) -> tuple[int, int]: diff --git a/tests/test_text.py b/tests/test_text.py index d67f52e..fc06f0a 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -4,7 +4,7 @@ from anyio import TASK_STATUS_IGNORED, Event, create_task_group from anyio.abc import TaskStatus from pycrdt import Array, Assoc, Doc, Map, StickyIndex, Text -from pycrdt._text import _char_to_utf16, _utf16_to_char +from pycrdt._text import _char_to_utf16 pytestmark = pytest.mark.anyio @@ -259,6 +259,17 @@ def test_unicode_emoji_sequential_inserts(): assert str(text) == expected, f"Got {str(text)!r}" +def test_unicode_emoji_iadd(): + """`+=` after emoji should append at the end (regression for UTF-16 offset bug).""" + doc = Doc() + doc["text"] = text = Text() + + text += "AπŸ“ŠB" + text += "X" + + assert str(text) == "AπŸ“ŠBX" + + def test_unicode_emoji_len(): """len() should return Python character count, not byte count.""" doc = Doc() @@ -499,62 +510,6 @@ def test_unicode_granular_diff(initial, updated): assert str(text) == updated, f"Got {str(text)!r}, expected {updated!r}" -def test_utf16_to_char_ascii(): - """_utf16_to_char is identity for pure ASCII text.""" - text = "Hello, World!" - for i in range(len(text) + 1): - assert _utf16_to_char(text, i) == i - - -def test_utf16_to_char_bmp(): - """BMP characters (CJK, Cyrillic) are 1 UTF-16 code unit each.""" - text = "δ»·ζ Όεˆ†ζž" # 4 BMP CJK chars = 4 UTF-16 code units - assert _utf16_to_char(text, 0) == 0 - assert _utf16_to_char(text, 1) == 1 - assert _utf16_to_char(text, 2) == 2 - assert _utf16_to_char(text, 4) == 4 - - -def test_utf16_to_char_supplementary(): - """Supplementary plane chars (emoji) take 2 UTF-16 code units.""" - text = "AπŸ“ŠB" # UTF-16: A(1) πŸ“Š(2) B(1) = 4 code units, 3 chars - assert _utf16_to_char(text, 0) == 0 # before A - assert _utf16_to_char(text, 1) == 1 # before πŸ“Š - assert _utf16_to_char(text, 3) == 2 # before B (1 + 2 = 3) - assert _utf16_to_char(text, 4) == 3 # end - - -def test_utf16_to_char_multiple_emoji(): - """Multiple supplementary plane characters.""" - text = "AπŸ“ŠBπŸŽ‰C" # UTF-16: A(1) πŸ“Š(2) B(1) πŸŽ‰(2) C(1) = 7 units, 5 chars - assert _utf16_to_char(text, 0) == 0 # before A - assert _utf16_to_char(text, 1) == 1 # before πŸ“Š - assert _utf16_to_char(text, 3) == 2 # before B - assert _utf16_to_char(text, 4) == 3 # before πŸŽ‰ - assert _utf16_to_char(text, 6) == 4 # before C - assert _utf16_to_char(text, 7) == 5 # end - - -def test_utf16_to_char_roundtrip(): - """_char_to_utf16 and _utf16_to_char are inverses.""" - texts = [ - "Hello", - "AπŸ“ŠB", - "δ»·ζ Όεˆ†ζž", - "# Analysis πŸ“Š\n", - "Aπ’œBπ €€C", - "Hello δΈ–η•Œ πŸ“Š ΠΌΠΈΡ€!", - "πŸŽ‰πŸ“ŠπŸ”’", - ] - for text in texts: - for char_idx in range(len(text) + 1): - utf16_idx = _char_to_utf16(text, char_idx) - assert _utf16_to_char(text, utf16_idx) == char_idx, ( - f"Roundtrip failed for {text!r} at char_idx={char_idx}: " - f"utf16={utf16_idx}, back={_utf16_to_char(text, utf16_idx)}" - ) - - def test_sticky_index_transaction(): doc = Doc() text = doc.get("text", type=Text) From 780d8a877e6a830bc85afc9991b23e7d231e1de1 Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Fri, 17 Apr 2026 00:17:13 -0400 Subject: [PATCH 7/8] fix: use UTF-16 offsets for XmlText operations XmlText mirrored the same bug that #379 fixed in Text: every mutating method passed a raw Python character index to yrs, but yrs (with OffsetKind::Utf16 set on the Doc) expects UTF-16 code unit offsets. Non-BMP content (emoji, supplementary plane) landed at the wrong position or split surrogate pairs. Convert the index through _char_to_utf16 in insert, insert_embed, format, and __delitem__ (handling surrogate-pair length for single deletes). __iadd__ inherits the fix via self.insert. Also change __len__ to return Python character count, matching Text. Previously it returned yrs' UTF-16 length, which disagreed with the string returned by str(self) for non-BMP characters. Depends on #379 for the OffsetKind::Utf16 doc option. Co-Authored-By: Claude Opus 4.7 (1M context) --- python/pycrdt/_xml.py | 39 +++++++++++++++++++-------- tests/test_xml.py | 63 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 11 deletions(-) diff --git a/python/pycrdt/_xml.py b/python/pycrdt/_xml.py index b9125a0..b7180c4 100644 --- a/python/pycrdt/_xml.py +++ b/python/pycrdt/_xml.py @@ -7,6 +7,7 @@ from ._pycrdt import XmlEvent as _XmlEvent from ._pycrdt import XmlFragment as _XmlFragment from ._pycrdt import XmlText as _XmlText +from ._text import _char_to_utf16 if TYPE_CHECKING: from typing import Any, Iterable, Mapping, Sized, TypeVar @@ -228,8 +229,9 @@ def _init(self, value: str | None) -> None: # pragma: no cover self.integrated.insert(txn._txn, 0, value) def __len__(self) -> int: - with self.doc.transaction() as txn: - return self.integrated.len(txn._txn) + # Python character count, matching Text.__len__. yrs' internal len + # is in UTF-16 code units, which would misreport for non-BMP chars. + return len(str(self)) def __iadd__(self, value: str) -> XmlText: with self.doc.transaction(): @@ -247,8 +249,12 @@ def insert(self, index: int, value: str, attrs: Mapping[str, Any] | None = None) """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) + utf16_index = _char_to_utf16(str(self), index) self.integrated.insert( - txn._txn, index, value, iter(attrs.items()) if attrs is not None else iter([]) + txn._txn, + utf16_index, + value, + iter(attrs.items()) if attrs is not None else iter([]), ) def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = None) -> None: @@ -263,13 +269,14 @@ def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = No with self.doc.transaction() as txn: self._forbid_read_transaction(txn) _attrs = iter(attrs.items()) if attrs is not None else None + utf16_index = _char_to_utf16(str(self), index) if isinstance(value, BaseType): # shared type assert txn._txn is not None - self._do_and_integrate("insert", value, txn._txn, index, _attrs) + self._do_and_integrate("insert", value, txn._txn, utf16_index, _attrs) else: # primitive type - self.integrated.insert_embed(txn._txn, index, value, _attrs) + self.integrated.insert_embed(txn._txn, utf16_index, value, _attrs) def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None: """ @@ -283,9 +290,12 @@ def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None: with self.doc.transaction() as txn: self._forbid_read_transaction(txn) start, stop = _check_slice(self, slice(start, stop)) - length = stop - start + current = str(self) + utf16_start = _char_to_utf16(current, start) + utf16_stop = _char_to_utf16(current, stop) + length = utf16_stop - utf16_start if length > 0: - self.integrated.format(txn._txn, start, length, iter(attrs.items())) + self.integrated.format(txn._txn, utf16_start, length, iter(attrs.items())) def diff(self) -> list[tuple[Any, dict[str, Any] | None]]: """ @@ -301,13 +311,20 @@ def diff(self) -> list[tuple[Any, dict[str, Any] | None]]: def __delitem__(self, key: int | slice) -> None: with self.doc.transaction() as txn: self._forbid_read_transaction(txn) + current = str(self) if isinstance(key, int): - self.integrated.remove_range(txn._txn, key, 1) + utf16_idx = _char_to_utf16(current, key) + char_at = current[key] + utf16_len = 2 if ord(char_at) > 0xFFFF else 1 + self.integrated.remove_range(txn._txn, utf16_idx, utf16_len) elif isinstance(key, slice): start, stop = _check_slice(self, key) - length = stop - start - if length > 0: - self.integrated.remove_range(txn._txn, start, length) + if stop - start > 0: + utf16_start = _char_to_utf16(current, start) + utf16_stop = _char_to_utf16(current, stop) + self.integrated.remove_range( + txn._txn, utf16_start, utf16_stop - utf16_start + ) else: raise TypeError(f"Index not supported: {key}") diff --git a/tests/test_xml.py b/tests/test_xml.py index 058e2a0..60717aa 100644 --- a/tests/test_xml.py +++ b/tests/test_xml.py @@ -166,6 +166,69 @@ def test_text(): doc["test2"] = XmlFragment([XmlText()]) +def test_xml_text_unicode_len(): + """XmlText.__len__ returns Python character count, matching Text.""" + doc = Doc() + doc["x"] = frag = XmlFragment([XmlText("AπŸ“ŠB")]) + text = frag.children[0] + + assert len(text) == 3 # 3 Python chars, not 4 UTF-16 code units + + +def test_xml_text_unicode_iadd(): + """`+=` after emoji appends at the end (regression for UTF-16 offset bug).""" + doc = Doc() + doc["x"] = frag = XmlFragment([XmlText()]) + text = frag.children[0] + + text += "AπŸ“ŠB" + text += "X" + + assert str(text) == "AπŸ“ŠBX" + + +def test_xml_text_unicode_insert(): + """insert() places text at the correct position after an emoji.""" + doc = Doc() + doc["x"] = frag = XmlFragment([XmlText("AπŸ“ŠB")]) + text = frag.children[0] + + text.insert(2, "X") + + assert str(text) == "AπŸ“ŠXB" + + +def test_xml_text_unicode_delete(): + """del by index and slice removes the correct character around an emoji.""" + doc = Doc() + doc["x"] = frag = XmlFragment([XmlText("AπŸ“ŠBC")]) + text = frag.children[0] + + del text[1] # remove πŸ“Š + assert str(text) == "ABC" + + doc["y"] = frag2 = XmlFragment([XmlText("AπŸ“ŠBCπŸŽ‰D")]) + text2 = frag2.children[0] + + del text2[1:4] # remove πŸ“ŠBC + assert str(text2) == "AπŸŽ‰D" + + +def test_xml_text_unicode_format(): + """format() uses character-index bounds and wraps non-BMP chars correctly.""" + doc = Doc() + doc["x"] = frag = XmlFragment([XmlText("AπŸ“ŠB")]) + text = frag.children[0] + + text.format(1, 2, {"bold": True}) # format just the emoji + + assert text.diff() == [ + ("A", None), + ("πŸ“Š", {"bold": True}), + ("B", None), + ] + + def test_element_with_any_attribute(): doc = Doc() From 952e09c1adbb02466f5b19f4930af09526122d96 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 04:18:45 +0000 Subject: [PATCH 8/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- python/pycrdt/_xml.py | 4 +--- tests/test_text.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/python/pycrdt/_xml.py b/python/pycrdt/_xml.py index b7180c4..0741729 100644 --- a/python/pycrdt/_xml.py +++ b/python/pycrdt/_xml.py @@ -322,9 +322,7 @@ def __delitem__(self, key: int | slice) -> None: if stop - start > 0: utf16_start = _char_to_utf16(current, start) utf16_stop = _char_to_utf16(current, stop) - self.integrated.remove_range( - txn._txn, utf16_start, utf16_stop - utf16_start - ) + self.integrated.remove_range(txn._txn, utf16_start, utf16_stop - utf16_start) else: raise TypeError(f"Index not supported: {key}") diff --git a/tests/test_text.py b/tests/test_text.py index fc06f0a..f84b933 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -4,7 +4,6 @@ from anyio import TASK_STATUS_IGNORED, Event, create_task_group from anyio.abc import TaskStatus from pycrdt import Array, Assoc, Doc, Map, StickyIndex, Text -from pycrdt._text import _char_to_utf16 pytestmark = pytest.mark.anyio