Skip to content

Commit acfff9e

Browse files
committed
fix(telegram): port MarkdownV2-safe truncation helpers (chat@4.27.0)
The previous truncation in ``TelegramAdapter.truncate_message`` / ``truncate_caption`` called ``_truncate_to_utf16`` with a literal ``...`` ellipsis, which Telegram's MarkdownV2 parser rejects: ``.`` is a reserved character; the slice can leave an orphan trailing ``\``; and the slice can cut through a paired entity (``*bold*``, `` `code` ``, ``[label](url)``) leaving it unclosed. Any MarkdownV2 message exceeding 4096 chars (or 1024 for captions) triggered ``Bad Request: can't parse entities``. Port of upstream b9a1961 (chat@4.27.0) plus the streaming-chunk safety trim from f46a6fb (chat#446): - ``truncate_for_telegram(text, limit, parse_mode)``: MarkdownV2 path uses an escaped ``\.\.\.`` ellipsis and walks back past unbalanced entity delimiters or orphan backslashes before appending. Plain text keeps the literal ``...``. - ``find_unescaped_positions(text, marker)``: scans for unescaped occurrences of an entity marker, accounting for arbitrary runs of escape backslashes. - ``ends_with_orphan_backslash(text)``: True when the trailing run of ``\`` has odd parity. - ``_find_unescaped_positions_outside_code``: skips occurrences inside fenced and inline code spans (Telegram treats markers there as literal text). - ``_trim_to_markdown_v2_safe_boundary``: best-effort backwards walk past unbalanced delimiters / orphan backslashes / unmatched ``[``. ``truncate_message`` / ``truncate_caption`` now accept an optional ``parse_mode``; ``post_message`` / ``edit_message`` / ``send_document`` plumb it through. Plain-mode behaviour is unchanged (still UTF-16 aware). Tests: 8 length-limit tests on the MarkdownV2 path (escaped ellipsis, orphan backslash, unclosed bold / code / open bracket, all-special input, balanced no-op, plain passthrough); 4 streaming-chunk safety trims; 4 ``find_unescaped_positions`` tests; 5 ``ends_with_orphan_backslash`` tests; 3 adapter-level integration tests verifying ``truncate_message`` / ``truncate_caption`` dispatch on parse_mode. 
Each docstring includes "What to fix if this fails:" pointing at the relevant helper. https://claude.ai/code/session_01FyMxQn2BEAzmwKS1GZczKj
1 parent 2d07c6d commit acfff9e

2 files changed

Lines changed: 456 additions & 8 deletions

File tree

src/chat_sdk/adapters/telegram/adapter.py

Lines changed: 200 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,183 @@ def _truncate_to_utf16(text: str, limit: int, ellipsis: str = "...") -> str:
164164
return text[:cut] + ellipsis
165165

166166

167+
# ---------------------------------------------------------------------------
168+
# MarkdownV2-safe truncation
169+
# ---------------------------------------------------------------------------
170+
#
171+
# Port of packages/adapter-telegram/src/markdown.ts (chat@4.27.0).
172+
#
173+
# Naive ``slice + "..."`` produces invalid MarkdownV2: ``.`` is a reserved
174+
# character (must be escaped as ``\.``); a slice can leave an orphan
175+
# trailing ``\`` that escapes the ellipsis or nothing; and a slice can cut
176+
# through a paired entity (``*bold*``, `` `code` ``, ``[label](url)``)
177+
# leaving it unclosed. Telegram rejects all three with
178+
# ``Bad Request: can't parse entities``.
179+
#
180+
# These helpers walk back past unbalanced delimiters and orphan backslashes
181+
# before appending an escaped ellipsis. They also run on
182+
# under-the-limit MarkdownV2 inputs (per upstream f46a6fb / chat#446) so
183+
# streamed chunks that arrive with a transiently unpaired opener are
184+
# trimmed back to a parseable boundary.
185+
186+
# Entity delimiters whose opener/closer pairing must be preserved when
187+
# truncating a rendered MarkdownV2 string.
188+
# The backtick is listed so that an unclosed code span is detected as well.
# Square brackets are intentionally absent: link openers are balanced
# separately by comparing the counts of ``[`` and ``]``.
_MARKDOWN_V2_ENTITY_MARKERS: tuple[str, ...] = ("*", "_", "~", "`")
189+
190+
# Ellipsis for MarkdownV2 output: each ``.`` is backslash-escaped because
# ``.`` is a reserved character there. The runtime value is six characters.
_MARKDOWN_V2_ELLIPSIS = "\\.\\.\\."
191+
# Literal ellipsis appended when the text is not sent as MarkdownV2.
_PLAIN_ELLIPSIS = "..."
192+
193+
194+
def find_unescaped_positions(text: str, marker: str) -> list[int]:
    """Locate every unescaped occurrence of the single character *marker*.

    An occurrence counts as escaped when the run of backslashes directly in
    front of it has odd length; an even-length run (including none at all)
    leaves it unescaped.

    Returns the matching indices into *text* in ascending order.
    """
    hits: list[int] = []
    # Length of the backslash run that immediately precedes the current char.
    preceding_run = 0
    for index, char in enumerate(text):
        if char == marker and preceding_run % 2 == 0:
            hits.append(index)
        preceding_run = preceding_run + 1 if char == "\\" else 0
    return hits
209+
210+
211+
def _find_unescaped_positions_outside_code(text: str, marker: str) -> list[int]:
212+
"""Like :func:`find_unescaped_positions` but skips occurrences inside
213+
fenced code blocks (```````) or inline code spans
214+
(`````). Inside those regions Telegram treats ``*``, ``_``, ``~``,
215+
``[``, ``]`` as literal text.
216+
217+
Port of upstream ``findUnescapedPositionsOutsideCode`` (chat#446).
218+
"""
219+
positions: list[int] = []
220+
in_fence = False
221+
in_inline = False
222+
backslashes = 0
223+
i = 0
224+
n = len(text)
225+
while i < n:
226+
ch = text[i]
227+
228+
if ch == "\\":
229+
backslashes += 1
230+
i += 1
231+
continue
232+
233+
escaped = backslashes % 2 == 1
234+
backslashes = 0
235+
236+
if ch == "`" and not escaped:
237+
is_triple = text[i + 1 : i + 2] == "`" and text[i + 2 : i + 3] == "`"
238+
if is_triple and not in_inline:
239+
in_fence = not in_fence
240+
i += 3
241+
continue
242+
if not in_fence:
243+
in_inline = not in_inline
244+
i += 1
245+
continue
246+
247+
if ch == marker and not escaped and not in_fence and not in_inline:
248+
positions.append(i)
249+
i += 1
250+
251+
return positions
252+
253+
254+
def ends_with_orphan_backslash(text: str) -> bool:
    """Tell whether the trailing run of ``\\`` in *text* has odd length.

    An odd-length run means the final backslash is not itself escaped, so it
    would escape whatever gets appended after it.
    """
    trailing_run = len(text) - len(text.rstrip("\\"))
    return trailing_run % 2 == 1
262+
263+
264+
def _trim_to_markdown_v2_safe_boundary(text: str) -> str:
    """Cut *text* back until its tail no longer breaks MarkdownV2 parsing.

    Each pass removes one of the following and then re-checks the result:

    - an orphan trailing ``\\`` (a trailing backslash run of odd length);
    - everything from the last occurrence of an entity delimiter (``*``,
      ``_``, ``~``, `` ` ``) that occurs an odd number of times;
    - everything from the last ``[`` when there are more ``[`` than ``]``.

    When several delimiters are unbalanced, the earliest of those cut points
    wins. Best-effort: this may drop more than strictly necessary.

    Returns the (possibly shortened) prefix of *text*.
    """
    remaining = text

    # Every pass that does not return shortens ``remaining`` by at least one
    # character, so ``len(text) + 1`` passes are always enough.
    for _ in range(len(text) + 1):
        if ends_with_orphan_backslash(remaining):
            remaining = remaining[:-1]
            continue

        cut_candidates: list[int] = []

        for marker in _MARKDOWN_V2_ENTITY_MARKERS:
            # Backticks are counted everywhere; the other markers only
            # count outside code.
            scan = (
                find_unescaped_positions
                if marker == "`"
                else _find_unescaped_positions_outside_code
            )
            hits = scan(remaining, marker)
            if len(hits) % 2 == 1:
                cut_candidates.append(hits[-1])

        opening = _find_unescaped_positions_outside_code(remaining, "[")
        closing = _find_unescaped_positions_outside_code(remaining, "]")
        if len(opening) > len(closing):
            cut_candidates.append(opening[-1])

        if not cut_candidates:
            return remaining

        remaining = remaining[: min(cut_candidates)]

    return remaining
309+
310+
311+
def truncate_for_telegram(text: str, limit: int, parse_mode: str | None) -> str:
312+
"""Truncate *text* to *limit* characters, appending an ellipsis.
313+
314+
For MarkdownV2 (``parse_mode == "MarkdownV2"``), uses an escaped
315+
ellipsis (``\\.\\.\\.``) and trims back past any unbalanced entity
316+
delimiter or orphan backslash before appending. Plain text gets a
317+
literal ``...``.
318+
319+
Even when *text* is under the limit, MarkdownV2 inputs go through
320+
:func:`_trim_to_markdown_v2_safe_boundary` so that streamed chunks
321+
with transiently unpaired entity markers don't trigger Telegram's
322+
``can't parse entities`` 400 (port of chat#446 / upstream f46a6fb).
323+
324+
Limit is interpreted in Python ``len()`` units (== UTF-16 code units
325+
for the BMP, == 1 for astral characters). Telegram's 4096 / 1024
326+
caps are documented in UTF-16 code units; for parity with upstream
327+
we follow upstream's `string.length` semantics here. Pre-existing
328+
UTF-16-aware truncation is reserved for non-MarkdownV2 paths.
329+
"""
330+
is_markdown_v2 = parse_mode == "MarkdownV2"
331+
332+
if len(text) <= limit:
333+
return _trim_to_markdown_v2_safe_boundary(text) if is_markdown_v2 else text
334+
335+
ellipsis = _MARKDOWN_V2_ELLIPSIS if is_markdown_v2 else _PLAIN_ELLIPSIS
336+
sliced = text[: limit - len(ellipsis)]
337+
338+
if is_markdown_v2:
339+
sliced = _trim_to_markdown_v2_safe_boundary(sliced)
340+
341+
return f"{sliced}{ellipsis}"
342+
343+
167344
def _trim_trailing_slashes(url: str) -> str:
168345
"""Remove trailing ``/`` characters from *url*."""
169346
end = len(url)
@@ -838,7 +1015,8 @@ async def post_message(
8381015
if card
8391016
else self._format_converter.render_postable(message),
8401017
"gchat",
841-
)
1018+
),
1019+
parse_mode,
8421020
)
8431021

8441022
files = extract_files(message)
@@ -920,7 +1098,8 @@ async def edit_message(
9201098
if card
9211099
else self._format_converter.render_postable(message),
9221100
"gchat",
923-
)
1101+
),
1102+
parse_mode,
9241103
)
9251104

9261105
if not text.strip():
@@ -1453,7 +1632,7 @@ async def send_document(
14531632
form_data.add_field("message_thread_id", str(thread.message_thread_id))
14541633

14551634
if text.strip():
1456-
form_data.add_field("caption", self.truncate_caption(text))
1635+
form_data.add_field("caption", self.truncate_caption(text, parse_mode))
14571636
if parse_mode:
14581637
form_data.add_field("parse_mode", parse_mode)
14591638

@@ -1736,12 +1915,26 @@ def resolve_parse_mode(
17361915

17371916
# -- Truncation ----------------------------------------------------------
17381917

1739-
def truncate_message(self, text: str) -> str:
1740-
"""Truncate message text to the Telegram limit (measured in UTF-16 code units)."""
1918+
def truncate_message(self, text: str, parse_mode: str | None = None) -> str:
    """Shorten message text so it fits ``TELEGRAM_MESSAGE_LIMIT``.

    Only ``parse_mode == "MarkdownV2"`` is routed through
    :func:`truncate_for_telegram`, which appends an escaped ellipsis and
    walks back past any unbalanced entity delimiter or orphan backslash.
    Every other parse mode (including ``None``) uses
    :func:`_truncate_to_utf16` with its default literal ``"..."``
    ellipsis.
    """
    if parse_mode != "MarkdownV2":
        return _truncate_to_utf16(text, TELEGRAM_MESSAGE_LIMIT)
    return truncate_for_telegram(text, TELEGRAM_MESSAGE_LIMIT, parse_mode)
17421930

1743-
def truncate_caption(self, text: str) -> str:
1744-
"""Truncate caption text to the Telegram caption limit (measured in UTF-16 code units)."""
1931+
def truncate_caption(self, text: str, parse_mode: str | None = None) -> str:
    """Shorten caption text so it fits ``TELEGRAM_CAPTION_LIMIT``.

    Only ``parse_mode == "MarkdownV2"`` is routed through
    :func:`truncate_for_telegram`; every other parse mode (including
    ``None``) uses :func:`_truncate_to_utf16` with its default literal
    ``"..."`` ellipsis.
    """
    if parse_mode != "MarkdownV2":
        return _truncate_to_utf16(text, TELEGRAM_CAPTION_LIMIT)
    return truncate_for_telegram(text, TELEGRAM_CAPTION_LIMIT, parse_mode)
17461939

17471940
# -- Emoji / reactions ---------------------------------------------------

0 commit comments

Comments
 (0)