Skip to content

Commit bd9b0cc

Browse files
fix: don't remove newlines when parsing anki notes
1 parent 61a753b commit bd9b0cc

1 file changed

Lines changed: 3 additions & 0 deletions

File tree

WDoc/utils/misc.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -270,8 +270,11 @@ def html_to_text(html: str, remove_image: bool = False) -> str:
270 270
"""used to strip any html present in the text files"""
271 271
html = html.replace("</li><li>", "<br>") # otherwise they might get joined
272 272
html = html.replace("</ul><ul>", "<br>") # otherwise they might get joined
273+
html = html.replace("<br>", "\n").replace("</br>", "\n") # otherwise newlines are lost
273 274
soup = BeautifulSoup(html, 'html.parser')
274 275
text = soup.get_text()
276+
while "\n\n" in text:
277+
text = text.replace("\n\n", "\n")
275 278
if remove_image:
276 279
if "<img" in text:
277 280
text = re.sub("<img src=.*?>", "[IMAGE]", text, flags=re.M | re.DOTALL)

0 commit comments

Comments
 (0)