We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 61a753b, commit bd9b0cc (Copy full SHA for bd9b0cc)
1 file changed
WDoc/utils/misc.py
@@ -270,8 +270,11 @@ def html_to_text(html: str, remove_image: bool = False) -> str:
270
"""used to strip any html present in the text files"""
271
html = html.replace("</li><li>", "<br>") # otherwise they might get joined
272
html = html.replace("</ul><ul>", "<br>") # otherwise they might get joined
273
+ html = html.replace("<br>", "\n").replace("</br>", "\n") # otherwise newlines are lost
274
soup = BeautifulSoup(html, 'html.parser')
275
text = soup.get_text()
276
+ while "\n\n" in text:
277
+ text = text.replace("\n\n", "\n")
278
if remove_image:
279
if "<img" in text:
280
text = re.sub("<img src=.*?>", "[IMAGE]", text, flags=re.M | re.DOTALL)
0 commit comments