11"""
22Markdown & HTML formatting functions.
3-
4- .. versionadded:: 4.5.1
53"""
64
75import re
@@ -45,6 +43,7 @@ def escape_html(content: str) -> str:
4543 return html .escape (content )
4644
4745
46+ # noinspection RegExpRedundantEscape
4847def escape_markdown (content : str ) -> str :
4948 """
5049 Escapes Markdown characters in a string of Markdown.
@@ -269,6 +268,9 @@ def mcode(content: str, language: str="", escape: Optional[bool]=True) -> str:
269268 :param content: The string to code.
270269 :type content: :obj:`str`
271270
271+ :param language: The programming language of the code. Defaults to an empty string.
272+ :type language: :obj:`str`
273+
272274 :param escape: True if you need to escape special characters. Defaults to True.
273275 :type escape: :obj:`bool`
274276
@@ -304,6 +306,9 @@ def hpre(content: str, escape: Optional[bool]=True, language: str="") -> str:
304306 :param escape: True if you need to escape special characters. Defaults to True.
305307 :type escape: :obj:`bool`
306308
309+ :param language: The programming language of the code. Defaults to an empty string.
310+ :type language: :obj:`str`
311+
307312 :return: The formatted string.
308313 :rtype: :obj:`str`
309314 """
@@ -368,31 +373,11 @@ def hcite(content: str, escape: Optional[bool] = True, expandable: Optional[bool
368373 )
369374
370375
371- def apply_html_entities (text : str , entities : Optional [List ], custom_subs : Optional [Dict [str , str ]]) -> str :
376+ def apply_html_entities (text : str , entities : Optional [List ]= None , custom_subs : Optional [Dict [str , str ]]= None ) -> str :
372377 """
373- Author: @sviat9440
374- Updaters: @badiboy, @EgorKhabarov
375- Message: "*Test* parse _formatting_, [url](https://example.com), [text_mention](tg://user?id=123456) and mention @username"
376-
377- .. code-block:: python3
378- :caption: Example:
379-
380- apply_html_entities(text, entities)
381- >> "<b>Test</b> parse <i>formatting</i>, <a href=\" https://example.com\" >url</a>, <a href=\" tg://user?id=123456\" >text_mention</a> and mention @username"
382-
383- Custom subs:
384- You can customize the substitutes. By default, there is no substitute for the entities: hashtag, bot_command, email. You can add a substitute for a new entity or modify the substitute of an existing one.
385- .. code-block:: python3
386- :caption: Example:
387-
388- apply_html_entities(
389- text,
390- entities,
391- {"bold": "<strong class=\" example\" >{text}</strong>", "italic": "<i class=\" example\" >{text}</i>", "mention": "<a href={url}>{text}</a>"},
392- )
393- >> "<strong class=\" example\" >Test</strong> parse <i class=\" example\" >formatting</i>, <a href=\" https://example.com\" >url</a> and <a href=\" tg://user?id=123456\" >text_mention</a> and mention <a href=\" https://t.me/username\" >@username</a>"
378+ Apply HTML formatting to text based on provided entities.
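379+ Handles nested entities: an entity that starts inside another one is rendered inside that entity's tags (a child that extends past its parent is not clipped).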
394380 """
395-
396381 if not entities :
397382 return text .replace ("&" , "&" ).replace ("<" , "<" ).replace (">" , ">" )
398383
@@ -401,79 +386,109 @@ def apply_html_entities(text: str, entities: Optional[List], custom_subs: Option
401386 "italic" : "<i>{text}</i>" ,
402387 "pre" : "<pre>{text}</pre>" ,
403388 "code" : "<code>{text}</code>" ,
404- # "url": "<a href=\"{url}\">{text}</a>", # @badiboy plain URLs have no text and do not need tags
405389 "text_link" : "<a href=\" {url}\" >{text}</a>" ,
406390 "strikethrough" : "<s>{text}</s>" ,
407391 "underline" : "<u>{text}</u>" ,
408392 "spoiler" : "<span class=\" tg-spoiler\" >{text}</span>" ,
409393 "custom_emoji" : "<tg-emoji emoji-id=\" {custom_emoji_id}\" >{text}</tg-emoji>" ,
410394 "blockquote" : "<blockquote>{text}</blockquote>" ,
411395 "expandable_blockquote" : "<blockquote expandable>{text}</blockquote>" ,
412-
413396 }
414397
415398 if custom_subs :
416399 for key , value in custom_subs .items ():
417400 _subs [key ] = value
401+
402+ # Sort entities by offset (starting position), with longer entities first for equal offsets
403+ sorted_entities = sorted (entities , key = lambda e : (e .offset , - e .length ))
404+
405+ # Encode the text as UTF-16-LE: entity offsets and lengths are counted in UTF-16 code units (2 bytes each)
418406 utf16_text = text .encode ("utf-16-le" )
419- html_text = ""
420-
421- def func (upd_text , subst_type = None , url = None , user = None , custom_emoji_id = None , language = None ):
422- upd_text = upd_text .decode ("utf-16-le" )
423- if subst_type == "text_mention" :
424- subst_type = "text_link"
425- url = "tg://user?id={0}" .format (user .id )
426- elif subst_type == "mention" :
427- url = "https://t.me/{0}" .format (upd_text [1 :])
428- upd_text = upd_text .replace ("&" , "&" ).replace ("<" , "<" ).replace (">" , ">" )
429- if not subst_type or not _subs .get (subst_type ):
430- return upd_text
431- subs = _subs .get (subst_type )
432- if subst_type == "custom_emoji" :
433- return subs .format (text = upd_text , custom_emoji_id = custom_emoji_id )
434- elif (subst_type == "pre" ) and language :
435- return "<pre><code class=\" language-{0}\" >{1}</code></pre>" .format (language , upd_text )
436- return subs .format (text = upd_text , url = url )
437-
438- offset = 0
439- start_index = 0
440- end_index = 0
441- for entity in entities :
442- if entity .offset > offset :
443- # when the offset is not 0: for example, a __b__
444- # we need to add the text before the entity to the html_text
445- html_text += func (utf16_text [offset * 2 : entity .offset * 2 ])
446- offset = entity .offset
447-
448- new_string = func (utf16_text [offset * 2 : (offset + entity .length ) * 2 ], subst_type = entity .type ,
449- url = entity .url , user = entity .user , custom_emoji_id = entity .custom_emoji_id ,
450- language = entity .language )
451- start_index = len (html_text )
452- html_text += new_string
453- offset += entity .length
454- end_index = len (html_text )
455- elif entity .offset == offset :
456- new_string = func (utf16_text [offset * 2 : (offset + entity .length ) * 2 ], subst_type = entity .type ,
457- url = entity .url , user = entity .user , custom_emoji_id = entity .custom_emoji_id ,
458- language = entity .language )
459- start_index = len (html_text )
460- html_text += new_string
461- end_index = len (html_text )
462- offset += entity .length
407+
408+ def escape_entity (text_part ):
409+ """Escape HTML special characters in a text part"""
410+ if isinstance (text_part , bytes ):
411+ text_part = text_part .decode ("utf-16-le" )
412+ return text_part .replace ("&" , "&" ).replace ("<" , "<" ).replace (">" , ">" )
413+
414+ def format_entity (entity , content ):
415+ """Apply entity formatting to the content"""
416+ entity_type = entity .type
417+
418+ # Handle different entity types
419+ if entity_type == "text_mention" and hasattr (entity , 'user' ):
420+ return f"<a href=\" tg://user?id={ entity .user .id } \" >{ content } </a>"
421+ # elif entity_type == "mention": # No need to do this, @username works fine
422+ # username = content[1:] # Remove @ symbol
423+ # return f"<a href=\"https://t.me/{username}\">{content}</a>"
424+ elif entity_type == "text_link" and hasattr (entity , 'url' ):
425+ return f"<a href=\" { entity .url } \" >{ content } </a>"
426+ elif entity_type == "custom_emoji" and hasattr (entity , 'custom_emoji_id' ):
427+ return f"<tg-emoji emoji-id=\" { entity .custom_emoji_id } \" >{ content } </tg-emoji>"
428+ elif entity_type == "pre" and hasattr (entity , 'language' ) and entity .language :
429+ return f"<pre><code class=\" language-{ entity .language } \" >{ content } </code></pre>"
430+ elif entity_type in _subs :
431+ template = _subs [entity_type ]
432+ return template .format (text = content )
433+
434+ # If no matching entity type, return text as is
435+ return content
436+
437+ def process_entities (byte_text , entity_list , start_pos = 0 , end_pos = None ):
438+ if end_pos is None :
439+ end_pos = len (byte_text )
440+
441+ if not entity_list or start_pos >= end_pos :
442+ return escape_entity (byte_text [start_pos :end_pos ])
443+
444+ current_entity = entity_list [0 ]
445+ current_start = current_entity .offset * 2
446+ current_end = current_start + current_entity .length * 2
447+
448+ if current_end <= start_pos or current_start >= end_pos :
449+ return escape_entity (byte_text [start_pos :end_pos ])
450+
451+ result = []
452+
453+ if current_start > start_pos :
454+ result .append (escape_entity (byte_text [start_pos :current_start ]))
455+
456+ nested_entities = []
457+ remaining_entities = []
458+
459+ for entity in entity_list [1 :]:
460+ entity_start = entity .offset * 2
461+ # entity_end = entity_start + entity.length * 2
462+
463+ if current_start <= entity_start < current_end :
464+ nested_entities .append (entity )
465+ else :
466+ remaining_entities .append (entity )
467+
468+ if nested_entities :
469+ inner_content = process_entities (
470+ byte_text ,
471+ nested_entities ,
472+ current_start ,
473+ current_end
474+ )
463475 else :
464- # Here we are processing nested entities.
465- # We shouldn't update offset, because they are the same as entity before.
466- # And, here we are replacing previous string with a new html-rendered text(previous string is already html-rendered,
467- # And we don't change it).
468- entity_string = html_text [start_index : end_index ].encode ("utf-16-le" )
469- formatted_string = func (entity_string , subst_type = entity .type , url = entity .url , user = entity .user ,
470- custom_emoji_id = entity .custom_emoji_id ,
471- language = entity .language ). \
472- replace ("&" , "&" ).replace ("<" , "<" ).replace (">" , ">" )
473- html_text = html_text [:start_index ] + formatted_string + html_text [end_index :]
474- end_index = len (html_text )
475-
476- if offset * 2 < len (utf16_text ):
477- html_text += func (utf16_text [offset * 2 :])
478-
479- return html_text
476+ inner_content = escape_entity (byte_text [current_start :current_end ])
477+
478+ result .append (format_entity (current_entity , inner_content ))
479+
480+ if current_end < end_pos and remaining_entities :
481+ result .append (process_entities (
482+ byte_text ,
483+ remaining_entities ,
484+ current_end ,
485+ end_pos
486+ ))
487+ elif current_end < end_pos :
488+ result .append (escape_entity (byte_text [current_end :end_pos ]))
489+
490+ return "" .join (result )
491+
492+ html_result = process_entities (utf16_text , sorted_entities )
493+
494+ return html_result
0 commit comments