From 23801f81d95bd55cd1a7f7ff190bbb6d1a94884e Mon Sep 17 00:00:00 2001 From: ANIS <119749586+assinscreedFC@users.noreply.github.com> Date: Fri, 5 Jun 2026 19:24:24 +0200 Subject: [PATCH] Fix #405: no spurious space between emphasis and following punctuation After a closing emphasis marker html2text inserted a separating space before anything except whitespace, brackets and `.!?`. That wrongly added a space before other punctuation, e.g. `<em>hello</em>,` produced `_hello_ ,` instead of `_hello_,`. The separating space is only needed before a word character, which would otherwise attach to the closing marker and stop Markdown from recognising the emphasis. The condition is now `re.match(r"\w", data[0])`. Adds a regression fixture (test/emphasis_punctuation.*) and a ChangeLog entry. Co-authored-by: Claude --- AUTHORS.rst | 1 + ChangeLog.rst | 6 ++++++ html2text/__init__.py | 8 ++++++-- test/emphasis_punctuation.html | 4 ++++ test/emphasis_punctuation.md | 8 ++++++++ 5 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 test/emphasis_punctuation.html create mode 100644 test/emphasis_punctuation.md diff --git a/AUTHORS.rst b/AUTHORS.rst index 5b3ac10..cb1e625 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -40,6 +40,7 @@ The AUTHORS/Contributors are (and/or have been): * Edward Ross * Gregory Anders * Alex Vandiver +* Anis Maintainer: diff --git a/ChangeLog.rst b/ChangeLog.rst index 2d96e3b..2961aba 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,9 @@ +Unreleased +========== +---- + +* Fix #405: Don't insert a spurious space between a closing emphasis marker and following punctuation. 
+ 2025.4.15 ========= ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index 621fa74..cf10dec 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -881,11 +881,15 @@ def handle_data(self, data: str, entity_char: bool = False) -> None: self.preceding_stressed = True elif self.preceding_stressed: if ( - re.match(r"[^][(){}\s.!?]", data[0]) + re.match(r"\w", data[0]) and not hn(self.current_tag) and self.current_tag not in ["a", "code", "pre"] ): - # should match a letter or common punctuation + # A following word character (letter, digit or underscore) + # attaches to the closing emphasis marker and stops Markdown + # from recognising it, so the separating space is needed only + # before word characters -- not before punctuation, which + # previously received a spurious space. data = " " + data self.preceding_stressed = False diff --git a/test/emphasis_punctuation.html b/test/emphasis_punctuation.html new file mode 100644 index 0000000..91f3a3e --- /dev/null +++ b/test/emphasis_punctuation.html @@ -0,0 +1,4 @@ +

<p>An <em>emphasized</em>, then <em>another</em>: also <em>more</em>; right?</p>

+

<p>A <strong>strong</strong>, and a quote <em>here</em>"end".</p>

+

<p><em>word</em>boundary keeps its space, as does a digit <em>v</em>2.</p>

+

<p>No space before an apostrophe: <em>cat</em>'s tail.</p>

diff --git a/test/emphasis_punctuation.md b/test/emphasis_punctuation.md new file mode 100644 index 0000000..63d17d6 --- /dev/null +++ b/test/emphasis_punctuation.md @@ -0,0 +1,8 @@ +An _emphasized_, then _another_: also _more_; right? + +A **strong**, and a quote _here_"end". + +_word_ boundary keeps its space, as does a digit _v_ 2. + +No space before an apostrophe: _cat_'s tail. +