Resolved ptest failures in python-beautifulsoup4 package (microsoft#17428)

durgajagadeesh · web-flow · commit 23a6082061e6 · 2026-06-18T12:17:28.000+05:30
diff --git a/SPECS-EXTENDED/python-beautifulsoup4/0001-Change-the-html.parser-tree-builder-s-code-for-handl.patch b/SPECS-EXTENDED/python-beautifulsoup4/0001-Change-the-html.parser-tree-builder-s-code-for-handl.patch
@@ -0,0 +1,165 @@
+From 55f655ffb7ef03bdd1df0f013743831fe54e3c7a Mon Sep 17 00:00:00 2001
+From: Leonard Richardson <leonardr@segfault.org>
+Date: Mon, 8 Dec 2025 19:34:16 -0500
+Subject: [PATCH] * Change the html.parser tree builder's code for handling
+ numeric character references, to avoid a crash when using Python versions
+ that include the fix to Python issue https://bugs.python.org/issue13633
+ (e.g. Python 3.11.13). [bug=2134393]
+- This version also includes the fix for CVE-2024-34062.
+
+Patch Reference: https://src.fedoraproject.org/rpms/python-beautifulsoup4/blob/rawhide/f/0001-Change-the-html.parser-tree-builder-s-code-for-handl.patch
+
+---
+ CHANGELOG                    |  5 +++
+ bs4/builder/_htmlparser.py   | 78 +++++++++++++++++++++++++++++-------
+ bs4/tests/test_htmlparser.py | 17 ++++++++
+ 3 files changed, 86 insertions(+), 14 deletions(-)
+
+diff --git a/CHANGELOG b/CHANGELOG
+index f61b7e9..606e9f5 100644
+--- a/CHANGELOG
++++ b/CHANGELOG
+@@ -1,5 +1,10 @@
+ = Unreleased
+ 
++* Change the html.parser tree builder's code for handling numeric
++  character references, to avoid a crash when using Python versions
++  that include the fix to Python issue https://bugs.python.org/issue13633
++  (e.g. Python 3.11.13). [bug=2134393]
++
+ * Skip the lxml tree builder's test_surrogate_in_character_reference test
+   if the libxml2 version is less than 2.13.0. Prior versions of libxml2
+   don't issue the REPLACEMENT CHARACTER we're expecting. [bug=2134346]
+diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
+index 165a3d8..ead800f 100644
+--- a/bs4/builder/_htmlparser.py
++++ b/bs4/builder/_htmlparser.py
+@@ -10,6 +10,7 @@ __all__ = [
+ ]
+ 
+ from html.parser import HTMLParser
++import re
+ 
+ from typing import (
+     Any,
+@@ -223,6 +224,64 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
+         """Handle some textual data that shows up between tags."""
+         self.soup.handle_data(data)
+ 
++    _DECIMAL_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9]+)(.*)")
++    _HEX_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9a-f]+)(.*)")
++
++    @classmethod
++    def _dereference_numeric_character_reference(cls, name:str) -> Tuple[str, bool, str]:
++        """Convert a numeric character reference into an actual character.
++
++        :param name: The number of the character reference, as
++          obtained by html.parser
++
++        :return: A 3-tuple (dereferenced, replacement_added,
++          extra_data). `dereferenced` is the dereferenced character
++          reference, or the empty string if there was no
++          reference. `replacement_added` is True if the reference
++          could only be dereferenced by replacing content with U+FFFD
++          REPLACEMENT CHARACTER. `extra_data` is a portion of data
++          following the character reference, which was deemed to be
++          normal data and not part of the reference at all.
++        """
++        dereferenced:str = ""
++        replacement_added:bool = False
++        extra_data:str = ""
++
++        base:int = 10
++        reg = cls._DECIMAL_REFERENCE_WITH_FOLLOWING_DATA
++        if name.startswith("x") or name.startswith("X"):
++            # Hex reference
++            name = name[1:]
++            base = 16
++            reg = cls._HEX_REFERENCE_WITH_FOLLOWING_DATA
++
++        real_name:Optional[int] = None
++        try:
++            real_name = int(name, base)
++        except ValueError:
++            # This is either bad data that starts with what looks like
++            # a numeric character reference, or a real numeric
++            # reference that wasn't terminated by a semicolon.
++            #
++            # The fix to https://bugs.python.org/issue13633 made it
++            # our responsibility to handle the extra data.
++            #
++            # To preserve the old behavior, we extract the numeric
++            # portion of the incoming "reference" and treat that as a
++            # numeric reference. All subsequent data will be processed
++            # as string data.
++            match = reg.search(name)
++            if match is not None:
++                real_name = int(match.groups()[0], base)
++                extra_data = match.groups()[1]
++
++        if real_name is None:
++            dereferenced = ""
++            extra_data = name
++        else:
++            dereferenced, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
++        return dereferenced, replacement_added, extra_data
++
+     def handle_charref(self, name: str) -> None:
+         """Handle a numeric character reference by converting it to the
+         corresponding Unicode character and treating it as textual
+@@ -230,22 +289,13 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
+ 
+         :param name: Character number, possibly in hexadecimal.
+         """
+-        # TODO: This was originally a workaround for a bug in
+-        # HTMLParser. (http://bugs.python.org/issue13633) The bug has
+-        # been fixed, but removing this code still makes some
+-        # Beautiful Soup tests fail. This needs investigation.
+-        real_name:int
+-        if name.startswith("x"):
+-            real_name = int(name.lstrip("x"), 16)
+-        elif name.startswith("X"):
+-            real_name = int(name.lstrip("X"), 16)
+-        else:
+-            real_name = int(name)
+-
+-        data, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
++        dereferenced, replacement_added, extra_data = self._dereference_numeric_character_reference(name)
+         if replacement_added:
+             self.soup.contains_replacement_characters = True
+-        self.handle_data(data)
++        if dereferenced is not None:
++            self.handle_data(dereferenced)
++        if extra_data is not None:
++            self.handle_data(extra_data)
+ 
+     def handle_entityref(self, name: str) -> None:
+         """Handle a named entity reference by converting it to the
+diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
+index 0086a9d..cb85b53 100644
+--- a/bs4/tests/test_htmlparser.py
++++ b/bs4/tests/test_htmlparser.py
+@@ -162,3 +162,20 @@ class TestHTMLParserTreeBuilder(HTMLTreeBuilderSmokeTest):
+         # Since we do the replacement ourselves, we can set contains_replacement_characters appropriately.
+         # lxml and html5lib do the replacement so all we ever see is REPLACEMENT CHARACTER.
+         assert soup.contains_replacement_characters == True
++
++class TestBeautifulSoupHTMLParser:
++    def test_dereference_numeric_character_reference(self):
++        m = BeautifulSoupHTMLParser._dereference_numeric_character_reference
++        assert m("64") == ("@", False, "")
++        assert m("x64") == ("d", False, "")
++        assert m("X64") == ("d", False, "")
++        assert m("64andsomeextra") == ("@", False, "andsomeextra")
++        assert m("") == ("", False, "")
++        assert m("00whee") == ("�", True, "whee")
++        assert m("xfffdthatsit") == ("�", False, "thatsit")
++        assert m("xabcdplussomeextra") == ("ꯍ", False, "plussomeextra")
++        assert m("obviouslynotnumeric") == ("", False, "obviouslynotnumeric")
++
++        # These are almost certainly wrong but at least it doesn't crash.
++        assert m("xabcdandsomeextra") == ("\U000abcda", False, "ndsomeextra")
++        assert m("xffffffffffffffffffffffbeep") == ("�", True, "p")
+-- 
+2.52.0
+
diff --git a/SPECS-EXTENDED/python-beautifulsoup4/0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch b/SPECS-EXTENDED/python-beautifulsoup4/0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch
@@ -0,0 +1,85 @@
+From ec4a722af07341c4aa3fe604b077a1f773c6fdd2 Mon Sep 17 00:00:00 2001
+From: Leonard Richardson <leonardr@segfault.org>
+Date: Sun, 7 Dec 2025 13:10:42 -0500
+Subject: [PATCH] * Skip the lxml tree builder's
+ test_surrogate_in_character_reference test if the libxml2 version is less
+ than 2.13.0. Prior versions of libxml2 don't issue the REPLACEMENT
+ CHARACTER we're expecting. [bug=2134346]
+
+Patch Reference: https://src.fedoraproject.org/rpms/python-beautifulsoup4/blob/rawhide/f/0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch
+
+---
+ CHANGELOG              |  6 ++++++
+ bs4/tests/test_lxml.py | 14 +++++++++++++-
+ tox.ini                |  2 +-
+ 3 files changed, 20 insertions(+), 2 deletions(-)
+
+diff --git a/CHANGELOG b/CHANGELOG
+index 544f128..f61b7e9 100644
+--- a/CHANGELOG
++++ b/CHANGELOG
+@@ -1,3 +1,9 @@
++= Unreleased
++
++* Skip the lxml tree builder's test_surrogate_in_character_reference test
++  if the libxml2 version is less than 2.13.0. Prior versions of libxml2
++  don't issue the REPLACEMENT CHARACTER we're expecting. [bug=2134346]
++
+ = 4.14.3 (20251130)
+ 
+ * When using one of the lxml tree builders, you can pass in
+diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
+index 0b69956..aa82143 100644
+--- a/bs4/tests/test_lxml.py
++++ b/bs4/tests/test_lxml.py
+@@ -7,6 +7,7 @@ from . import LXML_PRESENT, LXML_VERSION
+ 
+ if LXML_PRESENT:
+     from bs4.builder._lxml import LXMLTreeBuilder, LXMLTreeBuilderForXML
++    from lxml import etree
+ 
+ from bs4 import (
+     BeautifulStoneSoup,
+@@ -47,7 +48,6 @@ class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest):
+ 
+     # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
+     # test if an old version of lxml is installed.
+-
+     @pytest.mark.skipif(
+         not LXML_PRESENT or LXML_VERSION < (2, 3, 5, 0),
+         reason="Skipping doctype test for old version of lxml to avoid segfault.",
+@@ -57,6 +57,18 @@ class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest):
+         doctype = soup.contents[0]
+         assert "" == doctype.strip()
+ 
++    # This is a copy of the HTMLTreeBuilderSmokeTest implementation.
++    # For lxml only, we need to skip the test if the libxml2 version doesn't
++    # have the fix from https://gitlab.gnome.org/GNOME/libxml2/-/commit/4dcc2d743eb83b8aaec0d91660d615fdb024dad0. That means any pre-2.13 version.
++    @pytest.mark.skipif(
++        "etree.LIBXML_VERSION < (2, 13, 0)",
++        reason="libxml version doesn't issue REPLACEMENT CHARACTER",
++    )
++    def test_surrogate_in_character_reference(self):
++        # These character references are invalid and should be replaced with REPLACEMENT CHARACTER.
++        soup = self.soup("<html><body>&#55357;&#56551;</body></html>")
++        assert soup.body.contents == ['��']
++
+     def test_beautifulstonesoup_is_xml_parser(self):
+         # Make sure that the deprecated BSS class uses an xml builder
+         # if one is installed.
+diff --git a/tox.ini b/tox.ini
+index c53e4d8..c60c3e7 100644
+--- a/tox.ini
++++ b/tox.ini
+@@ -2,7 +2,7 @@
+ # encoding autodetection libraries: cchardet, chardet, and charset-normalizer
+ [tox]
+ env_list =
+-    py{37, 38, 39, 310, 311, 312, 313},bare,docs
++    py{37, 38, 39, 310, 311, 312, 313, 314},bare,docs
+ minversion = 3.28.0
+ skip_missing_interpreters = true
+ 
+-- 
+2.52.0
+
diff --git a/SPECS-EXTENDED/python-beautifulsoup4/beautifulsoup4-4.14-disable-soupsieve.patch b/SPECS-EXTENDED/python-beautifulsoup4/beautifulsoup4-4.14-disable-soupsieve.patch
@@ -0,0 +1,12 @@
+Patch Reference: https://src.fedoraproject.org/rpms/python-beautifulsoup4/blob/rawhide/f/beautifulsoup4-4.14-disable-soupsieve.patch
+---
+--- a/pyproject.toml
++++ b/pyproject.toml
+@@ -31,7 +31,6 @@ classifiers = [
+ ]
+ dependencies = [
+     "typing-extensions>=4.0.0",
+-    "soupsieve>=1.6.1",
+ ]
+ 
+ [project.optional-dependencies]
diff --git a/SPECS-EXTENDED/python-beautifulsoup4/python-beautifulsoup4.signatures.json b/SPECS-EXTENDED/python-beautifulsoup4/python-beautifulsoup4.signatures.json
@@ -1,5 +1,5 @@
 {
  "Signatures": {
-  "python-beautifulsoup4-4.12.3.tar.gz": "74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"
+  "python-beautifulsoup4-4.14.3.tar.gz": "6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86"
  }
 }
diff --git a/SPECS-EXTENDED/python-beautifulsoup4/python-beautifulsoup4.spec b/SPECS-EXTENDED/python-beautifulsoup4/python-beautifulsoup4.spec
@@ -3,18 +3,17 @@
 %bcond tests 1
 
 Name:           python-beautifulsoup4
-Version:        4.12.3
-Release:        8%{?dist}
+Version:        4.14.3
+Release:        1%{?dist}
 Summary:        HTML/XML parser for quick-turnaround applications like screen-scraping
 License:        MIT
 Vendor:         Microsoft Corporation
 Distribution:   Azure Linux
 URL:            https://www.crummy.com/software/BeautifulSoup/
 Source0:        https://files.pythonhosted.org/packages/source/b/beautifulsoup4/beautifulsoup4-%{version}.tar.gz#/%{name}-%{version}.tar.gz
-# https://git.launchpad.net/beautifulsoup/commit/?id=9786a62726de5a8caba10021c4d4a58c8a3e9e3f
-
-Patch0:         soupsieve26.patch
-
+Patch0:         0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch
+Patch1:         0001-Change-the-html.parser-tree-builder-s-code-for-handl.patch
+Patch11:        beautifulsoup4-4.14-disable-soupsieve.patch
 BuildArch:      noarch
 # html5lib BR just for test coverage
 %if %{with tests}
@@ -37,6 +36,7 @@ BuildRequires:  python3-colorama
 BuildRequires:  python3-chardet
 BuildRequires:  python-cachetools
 BuildRequires:  python3-pyproject-api
+
 %if %{with soupsieve}
 BuildRequires:  python3-packaging
 BuildRequires:  python3-soupsieve
@@ -75,10 +75,11 @@ Obsoletes:      python3-BeautifulSoup < 1:3.2.1-2
 %description -n python3-beautifulsoup4 %_description
 
 %prep
-%autosetup -p1 -n beautifulsoup4-%{version}
-# Fix compatibility with lxml 5.3.0
-# Reported upstream: https://bugs.launchpad.net/beautifulsoup/+bug/2076897
-sed -i "s/strip_cdata=False,//" bs4/builder/_lxml.py
+%autosetup -N -n beautifulsoup4-%{version}
+%autopatch -p1 -M 10
+%if %{without soupsieve}
+%autopatch -p1 -m 10
+%endif
 
 %generate_buildrequires
 %pyproject_buildrequires %{?with_tests: -t}
@@ -102,6 +103,9 @@ python3 -m tox -q --recreate -e py312
 %{python3_sitelib}/bs4
 
 %changelog
+* Mon May 25 2026 Durga Jagadeesh Palli <v-dpalli@microsoft.com> - 4.14.3-1
+- Upgrade to 4.14.3 to fix ptest failures; drop soupsieve26.patch and the lxml 5.3.0 sed workaround, add patches from Fedora rawhide.
+
 * Fri Mar 21 2025 Jyoti kanase <v-jykanase@microsoft.com> -  4.12.3-8
 - Initial Azure Linux import from Fedora 41 (license: MIT).
 - License verified.
diff --git a/SPECS-EXTENDED/python-beautifulsoup4/soupsieve26.patch b/SPECS-EXTENDED/python-beautifulsoup4/soupsieve26.patch
diff --git a/cgmanifest.json b/cgmanifest.json
@@ -22363,8 +22363,8 @@
         "type": "other",
         "other": {
           "name": "python-beautifulsoup4",
-          "version": "4.12.3",
-          "downloadUrl": "https://files.pythonhosted.org/packages/source/b/beautifulsoup4/beautifulsoup4-4.12.3.tar.gz"
+          "version": "4.14.3",
+          "downloadUrl": "https://files.pythonhosted.org/packages/source/b/beautifulsoup4/beautifulsoup4-4.14.3.tar.gz"
         }
       }
     },

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`{`
`2`	`2`	`"Signatures": {`
`3`		`- "python-beautifulsoup4-4.12.3.tar.gz": "74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"`
	`3`	`+ "python-beautifulsoup4-4.14.3.tar.gz": "6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86"`
`4`	`4`	`}`
`5`	`5`	`}`
Original file line number	Diff line number	Diff line change
`@@ -22363,8 +22363,8 @@`
`22363`	`22363`	`"type": "other",`
`22364`	`22364`	`"other": {`
`22365`	`22365`	`"name": "python-beautifulsoup4",`
`22366`		`- "version": "4.12.3",`
`22367`		`- "downloadUrl": "https://files.pythonhosted.org/packages/source/b/beautifulsoup4/beautifulsoup4-4.12.3.tar.gz"`
	`22366`	`+ "version": "4.14.3",`
	`22367`	`+ "downloadUrl": "https://files.pythonhosted.org/packages/source/b/beautifulsoup4/beautifulsoup4-4.14.3.tar.gz"`
`22368`	`22368`	`}`
`22369`	`22369`	`}`
`22370`	`22370`	`},`