1414 "DocMarkdownConverter" ,
1515 "remove_codeblocks" ,
1616)
17- # taken from version 0.6.1 of markdownify
18- WHITESPACE_RE = re .compile (r"[\r\n\s\t ]+" )
1917
2018
2119CODE_BLOCK_RE = re .compile (
@@ -35,20 +33,18 @@ def remove_codeblocks(content: str) -> str:
3533class DocMarkdownConverter (MarkdownConverter ):
3634 """Subclass markdownify's MarkdownConverter to provide custom conversion methods."""
3735
38- def __init__ (self , * , page_url : str , ** options ) -> None :
39- super (). __init__ ( ** options )
40- self . page_url = page_url
36+ def __init__ (self , * , page_url : str , ** options ):
37+ # Reflow text to avoid unwanted line breaks.
38+ default_options = { "wrap" : True , "wrap_width" : None }
4139
42- # overwritten to use our regex from version 0.6.1
43- def process_text (self , text : str | None ) -> Any :
44- """Process the text, using our custom regex."""
45- return self .escape (WHITESPACE_RE .sub (" " , text or "" ))
40+ super ().__init__ (** default_options | options )
41+ self .page_url = page_url
4642
47- def convert_img (self , el : PageElement , text : str , convert_as_inline : bool ) -> str :
43+ def convert_img (self , el : PageElement , text : str , parent_tags : set [ str ] ) -> str :
4844 """Remove images from the parsed contents, we don't want them."""
4945 return ""
5046
51- def convert_li (self , el : Tag , text : str , convert_as_inline : bool ) -> str :
47+ def convert_li (self , el : Tag , text : str , parent_tags : set [ str ] ) -> str :
5248 """Fix markdownify's erroneous indexing in ol tags."""
5349 parent = el .parent
5450 if parent is not None and parent .name == "ol" :
@@ -65,40 +61,42 @@ def convert_li(self, el: Tag, text: str, convert_as_inline: bool) -> str:
6561 bullet = bullets [depth % len (bullets )]
6662 return f"{ bullet } { text } \n "
6763
68- def convert_hn (self , _n : int , el : PageElement , text : str , convert_as_inline : bool ) -> str :
64+ def _convert_hn (self , n : int , el : PageElement , text : str , parent_tags : set [ str ] ) -> str :
6965 """Convert h tags to bold text with ** instead of adding #."""
70- if convert_as_inline :
66+ if "_inline" in parent_tags :
7167 return text
7268 return f"**{ text } **\n \n "
7369
74- def convert_code (self , el : PageElement , text : str , convert_as_inline : bool ) -> str :
70+ def convert_code (self , el : PageElement , text : str , parent_tags : set [ str ] ) -> str :
7571 """Undo `markdownify`'s underscore escaping."""
7672 return f"`{ text } `" .replace ("\\ " , "" )
7773
78- def convert_pre (self , el : Tag , text : str , convert_as_inline : bool ) -> str :
74+ def convert_pre (self , el : Tag , text : str , parent_tags : set [ str ] ) -> str : # pyright: ignore[reportIncompatibleMethodOverride] # bug in pyright
7975 """Wrap any codeblocks in `py` for syntax highlighting."""
8076 code = "" .join (el .strings )
8177 return f"```py\n { code } ```"
8278
83- def convert_a (self , el : Tag , text : str , convert_as_inline : bool ) -> str :
79+ def convert_a (self , el : Tag , text : str , parent_tags : set [ str ] ) -> str :
8480 """Resolve relative URLs to `self.page_url`."""
8581 href = el ["href" ]
8682 assert isinstance (href , str )
8783 el ["href" ] = urljoin (self .page_url , href )
88- return super ().convert_a (el , text , convert_as_inline )
84+ # Discord doesn't handle titles properly, showing links with them as raw text.
85+ el ["title" ] = ""
86+ return super ().convert_a (el , text , parent_tags )
8987
90- def convert_p (self , el : PageElement , text : str , convert_as_inline : bool ) -> str :
88+ def convert_p (self , el : PageElement , text : str , parent_tags : set [ str ] ) -> str :
9189 """Include only one newline instead of two when the parent is a li tag."""
92- if convert_as_inline :
90+ if "_inline" in parent_tags :
9391 return text
9492
9593 parent = el .parent
9694 if parent is not None and parent .name == "li" :
9795 return f"{ text } \n "
98- return super ().convert_p (el , text , convert_as_inline )
96+ return super ().convert_p (el , text , parent_tags )
9997
100- def convert_hr (self , el : PageElement , text : str , convert_as_inline : bool ) -> str :
101- """Convert hr tags to nothing. This is because later versions added this method ."""
98+ def convert_hr (self , el : PageElement , text : str , parent_tags : set [ str ] ) -> str : # pyright: ignore[reportIncompatibleMethodOverride] # bug in pyright
99+ """Ignore `hr` tag ."""
102100 return ""
103101
104102
0 commit comments