Merge pull request #2563 from Badiboy/master

Badiboy · web-flow · commit 5d06ec9e2a12 · 2026-02-21T17:19:23.000+03:00
Apply new apply_html_entities procedure
diff --git a/telebot/formatting.py b/telebot/formatting.py
@@ -1,7 +1,5 @@
 """
 Markdown & HTML formatting functions.
-
-.. versionadded:: 4.5.1
 """
 
 import re
@@ -45,6 +43,7 @@ def escape_html(content: str) -> str:
     return html.escape(content)
 
 
+# noinspection RegExpRedundantEscape
 def escape_markdown(content: str) -> str:
     """
     Escapes Markdown characters in a string of Markdown.
@@ -269,6 +268,9 @@ def mcode(content: str, language: str="", escape: Optional[bool]=True) -> str:
     :param content: The string to code.
     :type content: :obj:`str`
 
+    :param language: The programming language of the code. Defaults to an empty string.
+    :type language: :obj:`str`
+
     :param escape: True if you need to escape special characters. Defaults to True.
     :type escape: :obj:`bool`
 
@@ -304,6 +306,9 @@ def hpre(content: str, escape: Optional[bool]=True, language: str="") -> str:
     :param escape: True if you need to escape special characters. Defaults to True.
     :type escape: :obj:`bool`
 
+    :param language: The programming language of the code. Defaults to an empty string.
+    :type language: :obj:`str`
+
     :return: The formatted string.
     :rtype: :obj:`str`
     """
@@ -368,31 +373,11 @@ def hcite(content: str, escape: Optional[bool] = True, expandable: Optional[bool
     )
 
 
-def apply_html_entities(text: str, entities: Optional[List], custom_subs: Optional[Dict[str, str]]) -> str:
+def apply_html_entities(text: str, entities: Optional[List]=None, custom_subs: Optional[Dict[str, str]]=None) -> str:
     """
-    Author: @sviat9440
-    Updaters: @badiboy, @EgorKhabarov
-    Message: "*Test* parse _formatting_, [url](https://example.com), [text_mention](tg://user?id=123456) and mention @username"
-
-    .. code-block:: python3
-        :caption: Example:
-
-        apply_html_entities(text, entities)
-        >> "<b>Test</b> parse <i>formatting</i>, <a href=\"https://example.com\">url</a>, <a href=\"tg://user?id=123456\">text_mention</a> and mention @username"
-
-    Custom subs:
-        You can customize the substitutes. By default, there is no substitute for the entities: hashtag, bot_command, email. You can add or modify substitute an existing entity.
-    .. code-block:: python3
-        :caption: Example:
-
-        apply_html_entities(
-            text,
-            entities,
-            {"bold": "<strong class=\"example\">{text}</strong>", "italic": "<i class=\"example\">{text}</i>", "mention": "<a href={url}>{text}</a>"},
-        )
-        >> "<strong class=\"example\">Test</strong> parse <i class=\"example\">formatting</i>, <a href=\"https://example.com\">url</a> and <a href=\"tg://user?id=123456\">text_mention</a> and mention <a href=\"https://t.me/username\">@username</a>"
+    Apply HTML formatting to text based on provided entities.
+    Handles nested and overlapping entities correctly.
     """
-
     if not entities:
         return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
 
@@ -401,79 +386,109 @@ def apply_html_entities(text: str, entities: Optional[List], custom_subs: Option
         "italic": "<i>{text}</i>",
         "pre": "<pre>{text}</pre>",
         "code": "<code>{text}</code>",
-        # "url": "<a href=\"{url}\">{text}</a>", # @badiboy plain URLs have no text and do not need tags
         "text_link": "<a href=\"{url}\">{text}</a>",
         "strikethrough": "<s>{text}</s>",
         "underline": "<u>{text}</u>",
         "spoiler": "<span class=\"tg-spoiler\">{text}</span>",
         "custom_emoji": "<tg-emoji emoji-id=\"{custom_emoji_id}\">{text}</tg-emoji>",
         "blockquote": "<blockquote>{text}</blockquote>",
         "expandable_blockquote": "<blockquote expandable>{text}</blockquote>",
-
     }
 
     if custom_subs:
         for key, value in custom_subs.items():
             _subs[key] = value
+
+    # Sort entities by offset (starting position), with longer entities first for equal offsets
+    sorted_entities = sorted(entities, key=lambda e: (e.offset, -e.length))
+
+    # Entity offsets/lengths are counted in UTF-16 code units, so slice the UTF-16-LE bytes (2 bytes per unit)
     utf16_text = text.encode("utf-16-le")
-    html_text = ""
-
-    def func(upd_text, subst_type=None, url=None, user=None, custom_emoji_id=None, language=None):
-        upd_text = upd_text.decode("utf-16-le")
-        if subst_type == "text_mention":
-            subst_type = "text_link"
-            url = "tg://user?id={0}".format(user.id)
-        elif subst_type == "mention":
-            url = "https://t.me/{0}".format(upd_text[1:])
-        upd_text = upd_text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
-        if not subst_type or not _subs.get(subst_type):
-            return upd_text
-        subs = _subs.get(subst_type)
-        if subst_type == "custom_emoji":
-            return subs.format(text=upd_text, custom_emoji_id=custom_emoji_id)
-        elif (subst_type == "pre") and language:
-            return "<pre><code class=\"language-{0}\">{1}</code></pre>".format(language, upd_text)
-        return subs.format(text=upd_text, url=url)
-
-    offset = 0
-    start_index = 0
-    end_index = 0
-    for entity in entities:
-        if entity.offset > offset:
-            # when the offset is not 0: for example, a __b__
-            # we need to add the text before the entity to the html_text
-            html_text += func(utf16_text[offset * 2: entity.offset * 2])
-            offset = entity.offset
-
-            new_string = func(utf16_text[offset * 2: (offset + entity.length) * 2], subst_type=entity.type,
-                              url=entity.url, user=entity.user, custom_emoji_id=entity.custom_emoji_id,
-                              language=entity.language)
-            start_index = len(html_text)
-            html_text += new_string
-            offset += entity.length
-            end_index = len(html_text)
-        elif entity.offset == offset:
-            new_string = func(utf16_text[offset * 2: (offset + entity.length) * 2], subst_type=entity.type,
-                              url=entity.url, user=entity.user, custom_emoji_id=entity.custom_emoji_id,
-                              language=entity.language)
-            start_index = len(html_text)
-            html_text += new_string
-            end_index = len(html_text)
-            offset += entity.length
+
+    def escape_entity(text_part):
+        """Escape HTML special characters in a text part"""
+        if isinstance(text_part, bytes):
+            text_part = text_part.decode("utf-16-le")
+        return text_part.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
+
+    def format_entity(entity, content):
+        """Apply entity formatting to the content"""
+        entity_type = entity.type
+
+        # Handle different entity types
+        if entity_type == "text_mention" and hasattr(entity, 'user'):
+            return f"<a href=\"tg://user?id={entity.user.id}\">{content}</a>"
+        # elif entity_type == "mention":   # No need to do this, @username works fine
+        #     username = content[1:]  # Remove @ symbol
+        #     return f"<a href=\"https://t.me/{username}\">{content}</a>"
+        elif entity_type == "text_link" and hasattr(entity, 'url'):
+            return f"<a href=\"{entity.url}\">{content}</a>"
+        elif entity_type == "custom_emoji" and hasattr(entity, 'custom_emoji_id'):
+            return f"<tg-emoji emoji-id=\"{entity.custom_emoji_id}\">{content}</tg-emoji>"
+        elif entity_type == "pre" and hasattr(entity, 'language') and entity.language:
+            return f"<pre><code class=\"language-{entity.language}\">{content}</code></pre>"
+        elif entity_type in _subs:
+            template = _subs[entity_type]
+            return template.format(text=content)
+
+        # No template for this entity type: return the (already escaped) content without wrapping tags
+        return content
+
+    def process_entities(byte_text, entity_list, start_pos=0, end_pos=None):
+        if end_pos is None:
+            end_pos = len(byte_text)
+
+        if not entity_list or start_pos >= end_pos:
+            return escape_entity(byte_text[start_pos:end_pos])
+
+        current_entity = entity_list[0]
+        current_start = current_entity.offset * 2
+        current_end = current_start + current_entity.length * 2
+
+        if current_end <= start_pos or current_start >= end_pos:
+            return escape_entity(byte_text[start_pos:end_pos])
+
+        result = []
+
+        if current_start > start_pos:
+            result.append(escape_entity(byte_text[start_pos:current_start]))
+
+        nested_entities = []
+        remaining_entities = []
+
+        for entity in entity_list[1:]:
+            entity_start = entity.offset * 2
+            # entity_end = entity_start + entity.length * 2
+
+            if current_start <= entity_start < current_end:
+                nested_entities.append(entity)
+            else:
+                remaining_entities.append(entity)
+
+        if nested_entities:
+            inner_content = process_entities(
+                byte_text,
+                nested_entities,
+                current_start,
+                current_end
+            )
         else:
-            # Here we are processing nested entities.
-            # We shouldn't update offset, because they are the same as entity before.
-            # And, here we are replacing previous string with a new html-rendered text(previous string is already html-rendered,
-            # And we don't change it).
-            entity_string = html_text[start_index: end_index].encode("utf-16-le")
-            formatted_string = func(entity_string, subst_type=entity.type, url=entity.url, user=entity.user,
-                                    custom_emoji_id=entity.custom_emoji_id,
-                                    language=entity.language). \
-                replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
-            html_text = html_text[:start_index] + formatted_string + html_text[end_index:]
-            end_index = len(html_text)
-
-    if offset * 2 < len(utf16_text):
-        html_text += func(utf16_text[offset * 2:])
-
-    return html_text
+            inner_content = escape_entity(byte_text[current_start:current_end])
+
+        result.append(format_entity(current_entity, inner_content))
+
+        if current_end < end_pos and remaining_entities:
+            result.append(process_entities(
+                byte_text,
+                remaining_entities,
+                current_end,
+                end_pos
+            ))
+        elif current_end < end_pos:
+            result.append(escape_entity(byte_text[current_end:end_pos]))
+
+        return "".join(result)
+
+    html_result = process_entities(utf16_text, sorted_entities)
+
+    return html_result
diff --git a/tests/test_types.py b/tests/test_types.py
@@ -278,7 +278,7 @@ def test_message_entity():
     sample_string_1 = r'{"update_id":934522126,"message":{"message_id":1374510,"from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"chat":{"id":927266710,"first_name":">_run","username":"coder2020","type":"private"},"date":1682177590,"text":"b b b","entities":[{"offset":0,"length":2,"type":"bold"},{"offset":0,"length":1,"type":"italic"},{"offset":2,"length":2,"type":"bold"},{"offset":2,"length":1,"type":"italic"},{"offset":4,"length":1,"type":"bold"},{"offset":4,"length":1,"type":"italic"}]}}'
     update = types.Update.de_json(sample_string_1)
     message: types.Message = update.message
-    assert message.html_text == "<i><b>b </b></i><i><b>b </b></i><i><b>b</b></i>"
+    assert message.html_text == "<b><i>b</i> </b><b><i>b</i> </b><b><i>b</i></b>"
 
     sample_string_2 = r'{"update_id":934522166,"message":{"message_id":1374526,"from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"chat":{"id":927266710,"first_name":">_run","username":"coder2020","type":"private"},"date":1682179716,"text":"b b b","entities":[{"offset":0,"length":1,"type":"bold"},{"offset":2,"length":1,"type":"bold"},{"offset":4,"length":1,"type":"italic"}]}}'
     message_2 = types.Update.de_json(sample_string_2).message
@@ -288,12 +288,16 @@ def test_message_entity():
 
     sample_string_3 = r'{"update_id":934522172,"message":{"message_id":1374530,"from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"chat":{"id":927266710,"first_name":">_run","username":"coder2020","type":"private"},"date":1682179968,"text":"This is a bold text with a nested italic and bold text.","entities":[{"offset":10,"length":4,"type":"bold"},{"offset":27,"length":7,"type":"italic"},{"offset":34,"length":15,"type":"bold"},{"offset":34,"length":15,"type":"italic"}]}}'
     message_3 = types.Update.de_json(sample_string_3).message
-    assert message_3.html_text == "This is a <b>bold</b> text with a <i>nested </i><i><b>italic and bold</b></i> text."
+    assert \
+        (message_3.html_text == "This is a <b>bold</b> text with a <i>nested </i><i><b>italic and bold</b></i> text.") or \
+        (message_3.html_text == "This is a <b>bold</b> text with a <i>nested </i><b><i>italic and bold</i></b> text.")
 
 
     sample_string_4 = r'{"update_id":934522437,"message":{"message_id":1374619,"from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"chat":{"id":927266710,"first_name":">_run","username":"coder2020","type":"private"},"date":1682189507,"forward_from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"forward_date":1682189124,"text":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa😋😋","entities":[{"offset":0,"length":76,"type":"bold"},{"offset":0,"length":76,"type":"italic"},{"offset":0,"length":76,"type":"underline"},{"offset":0,"length":76,"type":"strikethrough"},{"offset":76,"length":2,"type":"custom_emoji","custom_emoji_id":"5456188142006575553"},{"offset":78,"length":2,"type":"custom_emoji","custom_emoji_id":"5456188142006575553"}]}}'
     message_4 = types.Update.de_json(sample_string_4).message
-    assert message_4.html_text == '<s><u><i><b>aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa</b></i></u></s><tg-emoji emoji-id="5456188142006575553">😋</tg-emoji><tg-emoji emoji-id="5456188142006575553">😋</tg-emoji>'
+    assert \
+        (message_4.html_text == '<s><u><i><b>aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa</b></i></u></s><tg-emoji emoji-id="5456188142006575553">😋</tg-emoji><tg-emoji emoji-id="5456188142006575553">😋</tg-emoji>') or \
+        (message_4.html_text == '<b><i><u><s>aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa</s></u></i></b><tg-emoji emoji-id="5456188142006575553">😋</tg-emoji><tg-emoji emoji-id="5456188142006575553">😋</tg-emoji>')
 
 
     sample_string_5 = r'{"update_id":934522166,"message":{"message_id":1374526,"from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"chat":{"id":927266710,"first_name":">_run","username":"coder2020","type":"private"},"date":1682179716,"text":"b <b>b</b> <i>i</i>","entities":[{"offset":0,"length":1,"type":"bold"}]}}'