Skip to content

Commit 5d06ec9

Browse files
authored
Merge pull request #2563 from Badiboy/master
Apply new apply_html_entities procedure
2 parents 7d11b89 + 5a47c6c commit 5d06ec9

File tree

2 files changed

+109
-90
lines changed

2 files changed

+109
-90
lines changed

telebot/formatting.py

Lines changed: 102 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
"""
22
Markdown & HTML formatting functions.
3-
4-
.. versionadded:: 4.5.1
53
"""
64

75
import re
@@ -45,6 +43,7 @@ def escape_html(content: str) -> str:
4543
return html.escape(content)
4644

4745

46+
# noinspection RegExpRedundantEscape
4847
def escape_markdown(content: str) -> str:
4948
"""
5049
Escapes Markdown characters in a string of Markdown.
@@ -269,6 +268,9 @@ def mcode(content: str, language: str="", escape: Optional[bool]=True) -> str:
269268
:param content: The string to code.
270269
:type content: :obj:`str`
271270
271+
:param language: The programming language of the code. Defaults to an empty string.
272+
:type language: :obj:`str`
273+
272274
:param escape: True if you need to escape special characters. Defaults to True.
273275
:type escape: :obj:`bool`
274276
@@ -304,6 +306,9 @@ def hpre(content: str, escape: Optional[bool]=True, language: str="") -> str:
304306
:param escape: True if you need to escape special characters. Defaults to True.
305307
:type escape: :obj:`bool`
306308
309+
:param language: The programming language of the code. Defaults to an empty string.
310+
:type language: :obj:`str`
311+
307312
:return: The formatted string.
308313
:rtype: :obj:`str`
309314
"""
@@ -368,31 +373,11 @@ def hcite(content: str, escape: Optional[bool] = True, expandable: Optional[bool
368373
)
369374

370375

371-
def apply_html_entities(text: str, entities: Optional[List], custom_subs: Optional[Dict[str, str]]) -> str:
376+
def apply_html_entities(text: str, entities: Optional[List]=None, custom_subs: Optional[Dict[str, str]]=None) -> str:
372377
"""
373-
Author: @sviat9440
374-
Updaters: @badiboy, @EgorKhabarov
375-
Message: "*Test* parse _formatting_, [url](https://example.com), [text_mention](tg://user?id=123456) and mention @username"
376-
377-
.. code-block:: python3
378-
:caption: Example:
379-
380-
apply_html_entities(text, entities)
381-
>> "<b>Test</b> parse <i>formatting</i>, <a href=\"https://example.com\">url</a>, <a href=\"tg://user?id=123456\">text_mention</a> and mention @username"
382-
383-
Custom subs:
384-
You can customize the substitutes. By default, there is no substitute for the entities: hashtag, bot_command, email. You can add or modify substitute an existing entity.
385-
.. code-block:: python3
386-
:caption: Example:
387-
388-
apply_html_entities(
389-
text,
390-
entities,
391-
{"bold": "<strong class=\"example\">{text}</strong>", "italic": "<i class=\"example\">{text}</i>", "mention": "<a href={url}>{text}</a>"},
392-
)
393-
>> "<strong class=\"example\">Test</strong> parse <i class=\"example\">formatting</i>, <a href=\"https://example.com\">url</a> and <a href=\"tg://user?id=123456\">text_mention</a> and mention <a href=\"https://t.me/username\">@username</a>"
378+
Apply HTML formatting to text based on provided entities.
379+
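Handles nested entities: entities are sorted by offset (longer first on ties) and rendered recursively as nested tags. Partially overlapping entities (neither nested nor disjoint) are not clipped to the enclosing entity, so their text may be emitted twice.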
394380
"""
395-
396381
if not entities:
397382
return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
398383

@@ -401,79 +386,109 @@ def apply_html_entities(text: str, entities: Optional[List], custom_subs: Option
401386
"italic": "<i>{text}</i>",
402387
"pre": "<pre>{text}</pre>",
403388
"code": "<code>{text}</code>",
404-
# "url": "<a href=\"{url}\">{text}</a>", # @badiboy plain URLs have no text and do not need tags
405389
"text_link": "<a href=\"{url}\">{text}</a>",
406390
"strikethrough": "<s>{text}</s>",
407391
"underline": "<u>{text}</u>",
408392
"spoiler": "<span class=\"tg-spoiler\">{text}</span>",
409393
"custom_emoji": "<tg-emoji emoji-id=\"{custom_emoji_id}\">{text}</tg-emoji>",
410394
"blockquote": "<blockquote>{text}</blockquote>",
411395
"expandable_blockquote": "<blockquote expandable>{text}</blockquote>",
412-
413396
}
414397

415398
if custom_subs:
416399
for key, value in custom_subs.items():
417400
_subs[key] = value
401+
402+
# Sort entities by offset (starting position), with longer entities first for equal offsets
403+
sorted_entities = sorted(entities, key=lambda e: (e.offset, -e.length))
404+
405+
# Encode the text as UTF-16-LE: entity offsets and lengths are counted in UTF-16 code units, so offset * 2 / length * 2 index directly into these bytes
418406
utf16_text = text.encode("utf-16-le")
419-
html_text = ""
420-
421-
def func(upd_text, subst_type=None, url=None, user=None, custom_emoji_id=None, language=None):
422-
upd_text = upd_text.decode("utf-16-le")
423-
if subst_type == "text_mention":
424-
subst_type = "text_link"
425-
url = "tg://user?id={0}".format(user.id)
426-
elif subst_type == "mention":
427-
url = "https://t.me/{0}".format(upd_text[1:])
428-
upd_text = upd_text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
429-
if not subst_type or not _subs.get(subst_type):
430-
return upd_text
431-
subs = _subs.get(subst_type)
432-
if subst_type == "custom_emoji":
433-
return subs.format(text=upd_text, custom_emoji_id=custom_emoji_id)
434-
elif (subst_type == "pre") and language:
435-
return "<pre><code class=\"language-{0}\">{1}</code></pre>".format(language, upd_text)
436-
return subs.format(text=upd_text, url=url)
437-
438-
offset = 0
439-
start_index = 0
440-
end_index = 0
441-
for entity in entities:
442-
if entity.offset > offset:
443-
# when the offset is not 0: for example, a __b__
444-
# we need to add the text before the entity to the html_text
445-
html_text += func(utf16_text[offset * 2: entity.offset * 2])
446-
offset = entity.offset
447-
448-
new_string = func(utf16_text[offset * 2: (offset + entity.length) * 2], subst_type=entity.type,
449-
url=entity.url, user=entity.user, custom_emoji_id=entity.custom_emoji_id,
450-
language=entity.language)
451-
start_index = len(html_text)
452-
html_text += new_string
453-
offset += entity.length
454-
end_index = len(html_text)
455-
elif entity.offset == offset:
456-
new_string = func(utf16_text[offset * 2: (offset + entity.length) * 2], subst_type=entity.type,
457-
url=entity.url, user=entity.user, custom_emoji_id=entity.custom_emoji_id,
458-
language=entity.language)
459-
start_index = len(html_text)
460-
html_text += new_string
461-
end_index = len(html_text)
462-
offset += entity.length
407+
408+
def escape_entity(text_part):
409+
"""Escape HTML special characters in a text part"""
410+
if isinstance(text_part, bytes):
411+
text_part = text_part.decode("utf-16-le")
412+
return text_part.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
413+
414+
def format_entity(entity, content):
415+
"""Apply entity formatting to the content"""
416+
entity_type = entity.type
417+
418+
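# Entity types that need data from the entity itself (user id, url, emoji id, language) are rendered with fixed markup here and do not consult _subs / custom_subs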
419+
if entity_type == "text_mention" and hasattr(entity, 'user'):
420+
return f"<a href=\"tg://user?id={entity.user.id}\">{content}</a>"
421+
# elif entity_type == "mention": # No need to do this, @username works fine
422+
# username = content[1:] # Remove @ symbol
423+
# return f"<a href=\"https://t.me/{username}\">{content}</a>"
424+
elif entity_type == "text_link" and hasattr(entity, 'url'):
425+
return f"<a href=\"{entity.url}\">{content}</a>"
426+
elif entity_type == "custom_emoji" and hasattr(entity, 'custom_emoji_id'):
427+
return f"<tg-emoji emoji-id=\"{entity.custom_emoji_id}\">{content}</tg-emoji>"
428+
elif entity_type == "pre" and hasattr(entity, 'language') and entity.language:
429+
return f"<pre><code class=\"language-{entity.language}\">{content}</code></pre>"
430+
elif entity_type in _subs:
431+
template = _subs[entity_type]
432+
return template.format(text=content)
433+
434+
# If no matching entity type, return text as is
435+
return content
436+
437+
def process_entities(byte_text, entity_list, start_pos=0, end_pos=None):
438+
if end_pos is None:
439+
end_pos = len(byte_text)
440+
441+
if not entity_list or start_pos >= end_pos:
442+
return escape_entity(byte_text[start_pos:end_pos])
443+
444+
current_entity = entity_list[0]
445+
current_start = current_entity.offset * 2
446+
current_end = current_start + current_entity.length * 2
447+
448+
if current_end <= start_pos or current_start >= end_pos:
449+
return escape_entity(byte_text[start_pos:end_pos])
450+
451+
result = []
452+
453+
if current_start > start_pos:
454+
result.append(escape_entity(byte_text[start_pos:current_start]))
455+
456+
nested_entities = []
457+
remaining_entities = []
458+
459+
for entity in entity_list[1:]:
460+
entity_start = entity.offset * 2
461+
# entity_end = entity_start + entity.length * 2
462+
463+
if current_start <= entity_start < current_end:
464+
nested_entities.append(entity)
465+
else:
466+
remaining_entities.append(entity)
467+
468+
if nested_entities:
469+
inner_content = process_entities(
470+
byte_text,
471+
nested_entities,
472+
current_start,
473+
current_end
474+
)
463475
else:
464-
# Here we are processing nested entities.
465-
# We shouldn't update offset, because they are the same as entity before.
466-
# And, here we are replacing previous string with a new html-rendered text(previous string is already html-rendered,
467-
# And we don't change it).
468-
entity_string = html_text[start_index: end_index].encode("utf-16-le")
469-
formatted_string = func(entity_string, subst_type=entity.type, url=entity.url, user=entity.user,
470-
custom_emoji_id=entity.custom_emoji_id,
471-
language=entity.language). \
472-
replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
473-
html_text = html_text[:start_index] + formatted_string + html_text[end_index:]
474-
end_index = len(html_text)
475-
476-
if offset * 2 < len(utf16_text):
477-
html_text += func(utf16_text[offset * 2:])
478-
479-
return html_text
476+
inner_content = escape_entity(byte_text[current_start:current_end])
477+
478+
result.append(format_entity(current_entity, inner_content))
479+
480+
if current_end < end_pos and remaining_entities:
481+
result.append(process_entities(
482+
byte_text,
483+
remaining_entities,
484+
current_end,
485+
end_pos
486+
))
487+
elif current_end < end_pos:
488+
result.append(escape_entity(byte_text[current_end:end_pos]))
489+
490+
return "".join(result)
491+
492+
html_result = process_entities(utf16_text, sorted_entities)
493+
494+
return html_result

tests/test_types.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ def test_message_entity():
278278
sample_string_1 = r'{"update_id":934522126,"message":{"message_id":1374510,"from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"chat":{"id":927266710,"first_name":">_run","username":"coder2020","type":"private"},"date":1682177590,"text":"b b b","entities":[{"offset":0,"length":2,"type":"bold"},{"offset":0,"length":1,"type":"italic"},{"offset":2,"length":2,"type":"bold"},{"offset":2,"length":1,"type":"italic"},{"offset":4,"length":1,"type":"bold"},{"offset":4,"length":1,"type":"italic"}]}}'
279279
update = types.Update.de_json(sample_string_1)
280280
message: types.Message = update.message
281-
assert message.html_text == "<i><b>b </b></i><i><b>b </b></i><i><b>b</b></i>"
281+
assert message.html_text == "<b><i>b</i> </b><b><i>b</i> </b><b><i>b</i></b>"
282282

283283
sample_string_2 = r'{"update_id":934522166,"message":{"message_id":1374526,"from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"chat":{"id":927266710,"first_name":">_run","username":"coder2020","type":"private"},"date":1682179716,"text":"b b b","entities":[{"offset":0,"length":1,"type":"bold"},{"offset":2,"length":1,"type":"bold"},{"offset":4,"length":1,"type":"italic"}]}}'
284284
message_2 = types.Update.de_json(sample_string_2).message
@@ -288,12 +288,16 @@ def test_message_entity():
288288

289289
sample_string_3 = r'{"update_id":934522172,"message":{"message_id":1374530,"from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"chat":{"id":927266710,"first_name":">_run","username":"coder2020","type":"private"},"date":1682179968,"text":"This is a bold text with a nested italic and bold text.","entities":[{"offset":10,"length":4,"type":"bold"},{"offset":27,"length":7,"type":"italic"},{"offset":34,"length":15,"type":"bold"},{"offset":34,"length":15,"type":"italic"}]}}'
290290
message_3 = types.Update.de_json(sample_string_3).message
291-
assert message_3.html_text == "This is a <b>bold</b> text with a <i>nested </i><i><b>italic and bold</b></i> text."
291+
assert \
292+
(message_3.html_text == "This is a <b>bold</b> text with a <i>nested </i><i><b>italic and bold</b></i> text.") or \
293+
(message_3.html_text == "This is a <b>bold</b> text with a <i>nested </i><b><i>italic and bold</i></b> text.")
292294

293295

294296
sample_string_4 = r'{"update_id":934522437,"message":{"message_id":1374619,"from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"chat":{"id":927266710,"first_name":">_run","username":"coder2020","type":"private"},"date":1682189507,"forward_from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"forward_date":1682189124,"text":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa😋😋","entities":[{"offset":0,"length":76,"type":"bold"},{"offset":0,"length":76,"type":"italic"},{"offset":0,"length":76,"type":"underline"},{"offset":0,"length":76,"type":"strikethrough"},{"offset":76,"length":2,"type":"custom_emoji","custom_emoji_id":"5456188142006575553"},{"offset":78,"length":2,"type":"custom_emoji","custom_emoji_id":"5456188142006575553"}]}}'
295297
message_4 = types.Update.de_json(sample_string_4).message
296-
assert message_4.html_text == '<s><u><i><b>aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa</b></i></u></s><tg-emoji emoji-id="5456188142006575553">😋</tg-emoji><tg-emoji emoji-id="5456188142006575553">😋</tg-emoji>'
298+
assert \
299+
(message_4.html_text == '<s><u><i><b>aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa</b></i></u></s><tg-emoji emoji-id="5456188142006575553">😋</tg-emoji><tg-emoji emoji-id="5456188142006575553">😋</tg-emoji>') or \
300+
(message_4.html_text == '<b><i><u><s>aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa</s></u></i></b><tg-emoji emoji-id="5456188142006575553">😋</tg-emoji><tg-emoji emoji-id="5456188142006575553">😋</tg-emoji>')
297301

298302

299303
sample_string_5 = r'{"update_id":934522166,"message":{"message_id":1374526,"from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"chat":{"id":927266710,"first_name":">_run","username":"coder2020","type":"private"},"date":1682179716,"text":"b <b>b</b> <i>i</i>","entities":[{"offset":0,"length":1,"type":"bold"}]}}'

0 commit comments

Comments
 (0)