Skip to content

Commit 23a6082

Browse files
Resolved ptest failures in python-beautifulsoup4 package (microsoft#17428)
1 parent fe0bb32 commit 23a6082

7 files changed

Lines changed: 279 additions & 61 deletions
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
From 55f655ffb7ef03bdd1df0f013743831fe54e3c7a Mon Sep 17 00:00:00 2001
2+
From: Leonard Richardson <leonardr@segfault.org>
3+
Date: Mon, 8 Dec 2025 19:34:16 -0500
4+
Subject: [PATCH] * Change the html.parser tree builder's code for handling
5+
numeric character references, to avoid a crash when using Python versions
6+
that include the fix to Python issue https://bugs.python.org/issue13633
7+
(e.g. Python 3.11.13). [bug=2134393]
8+
- This version also includes the fix for CVE-2024-34062.
9+
10+
Patch Reference: https://src.fedoraproject.org/rpms/python-beautifulsoup4/blob/rawhide/f/0001-Change-the-html.parser-tree-builder-s-code-for-handl.patch
11+
12+
---
13+
CHANGELOG | 5 +++
14+
bs4/builder/_htmlparser.py | 78 +++++++++++++++++++++++++++++-------
15+
bs4/tests/test_htmlparser.py | 17 ++++++++
16+
3 files changed, 86 insertions(+), 14 deletions(-)
17+
18+
diff --git a/CHANGELOG b/CHANGELOG
19+
index f61b7e9..606e9f5 100644
20+
--- a/CHANGELOG
21+
+++ b/CHANGELOG
22+
@@ -1,5 +1,10 @@
23+
= Unreleased
24+
25+
+* Change the html.parser tree builder's code for handling numeric
26+
+ character references, to avoid a crash when using Python versions
27+
+ that include the fix to Python issue https://bugs.python.org/issue13633
28+
+ (e.g. Python 3.11.13). [bug=2134393]
29+
+
30+
* Skip the lxml tree builder's test_surrogate_in_character_reference test
31+
if the libxml2 version is less than 2.13.0. Prior versions of libxml2
32+
don't issue the REPLACEMENT CHARACTER we're expecting. [bug=2134346]
33+
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
34+
index 165a3d8..ead800f 100644
35+
--- a/bs4/builder/_htmlparser.py
36+
+++ b/bs4/builder/_htmlparser.py
37+
@@ -10,6 +10,7 @@ __all__ = [
38+
]
39+
40+
from html.parser import HTMLParser
41+
+import re
42+
43+
from typing import (
44+
Any,
45+
@@ -223,6 +224,64 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
46+
"""Handle some textual data that shows up between tags."""
47+
self.soup.handle_data(data)
48+
49+
+ _DECIMAL_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9]+)(.*)")
50+
+ _HEX_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9a-f]+)(.*)")
51+
+
52+
+ @classmethod
53+
+ def _dereference_numeric_character_reference(cls, name:str) -> Tuple[str, bool, str]:
54+
+ """Convert a numeric character reference into an actual character.
55+
+
56+
+ :param name: The number of the character reference, as
57+
+ obtained by html.parser
58+
+
59+
+ :return: A 3-tuple (dereferenced, replacement_added,
60+
+ extra_data). `dereferenced` is the dereferenced character
61+
+ reference, or the empty string if there was no
62+
+ reference. `replacement_added` is True if the reference
63+
+ could only be dereferenced by replacing content with U+FFFD
64+
+ REPLACEMENT CHARACTER. `extra_data` is a portion of data
65+
+ following the character reference, which was deemed to be
66+
+ normal data and not part of the reference at all.
67+
+ """
68+
+ dereferenced:str = ""
69+
+ replacement_added:bool = False
70+
+ extra_data:str = ""
71+
+
72+
+ base:int = 10
73+
+ reg = cls._DECIMAL_REFERENCE_WITH_FOLLOWING_DATA
74+
+ if name.startswith("x") or name.startswith("X"):
75+
+ # Hex reference
76+
+ name = name[1:]
77+
+ base = 16
78+
+ reg = cls._HEX_REFERENCE_WITH_FOLLOWING_DATA
79+
+
80+
+ real_name:Optional[int] = None
81+
+ try:
82+
+ real_name = int(name, base)
83+
+ except ValueError:
84+
+ # This is either bad data that starts with what looks like
85+
+ # a numeric character reference, or a real numeric
86+
+ # reference that wasn't terminated by a semicolon.
87+
+ #
88+
+ # The fix to https://bugs.python.org/issue13633 made it
89+
+ # our responsibility to handle the extra data.
90+
+ #
91+
+ # To preserve the old behavior, we extract the numeric
92+
+ # portion of the incoming "reference" and treat that as a
93+
+ # numeric reference. All subsequent data will be processed
94+
+ # as string data.
95+
+ match = reg.search(name)
96+
+ if match is not None:
97+
+ real_name = int(match.groups()[0], base)
98+
+ extra_data = match.groups()[1]
99+
+
100+
+ if real_name is None:
101+
+ dereferenced = ""
102+
+ extra_data = name
103+
+ else:
104+
+ dereferenced, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
105+
+ return dereferenced, replacement_added, extra_data
106+
+
107+
def handle_charref(self, name: str) -> None:
108+
"""Handle a numeric character reference by converting it to the
109+
corresponding Unicode character and treating it as textual
110+
@@ -230,22 +289,13 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
111+
112+
:param name: Character number, possibly in hexadecimal.
113+
"""
114+
- # TODO: This was originally a workaround for a bug in
115+
- # HTMLParser. (http://bugs.python.org/issue13633) The bug has
116+
- # been fixed, but removing this code still makes some
117+
- # Beautiful Soup tests fail. This needs investigation.
118+
- real_name:int
119+
- if name.startswith("x"):
120+
- real_name = int(name.lstrip("x"), 16)
121+
- elif name.startswith("X"):
122+
- real_name = int(name.lstrip("X"), 16)
123+
- else:
124+
- real_name = int(name)
125+
-
126+
- data, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
127+
+ dereferenced, replacement_added, extra_data = self._dereference_numeric_character_reference(name)
128+
if replacement_added:
129+
self.soup.contains_replacement_characters = True
130+
- self.handle_data(data)
131+
+ if dereferenced is not None:
132+
+ self.handle_data(dereferenced)
133+
+ if extra_data is not None:
134+
+ self.handle_data(extra_data)
135+
136+
def handle_entityref(self, name: str) -> None:
137+
"""Handle a named entity reference by converting it to the
138+
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
139+
index 0086a9d..cb85b53 100644
140+
--- a/bs4/tests/test_htmlparser.py
141+
+++ b/bs4/tests/test_htmlparser.py
142+
@@ -162,3 +162,20 @@ class TestHTMLParserTreeBuilder(HTMLTreeBuilderSmokeTest):
143+
# Since we do the replacement ourselves, we can set contains_replacement_characters appropriately.
144+
# lxml and html5lib do the replacement so all we ever see is REPLACEMENT CHARACTER.
145+
assert soup.contains_replacement_characters == True
146+
+
147+
+class TestBeautifulSoupHTMLParser:
148+
+ def test_dereference_numeric_character_reference(self):
149+
+ m = BeautifulSoupHTMLParser._dereference_numeric_character_reference
150+
+ assert m("64") == ("@", False, "")
151+
+ assert m("x64") == ("d", False, "")
152+
+ assert m("X64") == ("d", False, "")
153+
+ assert m("64andsomeextra") == ("@", False, "andsomeextra")
154+
+ assert m("") == ("", False, "")
155+
+ assert m("00whee") == ("�", True, "whee")
156+
+ assert m("xfffdthatsit") == ("�", False, "thatsit")
157+
+ assert m("xabcdplussomeextra") == ("ꯍ", False, "plussomeextra")
158+
+ assert m("obviouslynotnumeric") == ("", False, "obviouslynotnumeric")
159+
+
160+
+ # These are almost certainly wrong but at least it doesn't crash.
161+
+ assert m("xabcdandsomeextra") == ("\U000abcda", False, "ndsomeextra")
162+
+ assert m("xffffffffffffffffffffffbeep") == ("�", True, "p")
163+
--
164+
2.52.0
165+
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
From ec4a722af07341c4aa3fe604b077a1f773c6fdd2 Mon Sep 17 00:00:00 2001
2+
From: Leonard Richardson <leonardr@segfault.org>
3+
Date: Sun, 7 Dec 2025 13:10:42 -0500
4+
Subject: [PATCH] * Skip the lxml tree builder's
5+
test_surrogate_in_character_reference test if the libxml2 version is less
6+
than 2.13.0. Prior versions of libxml2 don't issue the REPLACEMENT
7+
CHARACTER we're expecting. [bug=2134346]
8+
9+
Patch Reference: https://src.fedoraproject.org/rpms/python-beautifulsoup4/blob/rawhide/f/0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch
10+
11+
---
12+
CHANGELOG | 6 ++++++
13+
bs4/tests/test_lxml.py | 14 +++++++++++++-
14+
tox.ini | 2 +-
15+
3 files changed, 20 insertions(+), 2 deletions(-)
16+
17+
diff --git a/CHANGELOG b/CHANGELOG
18+
index 544f128..f61b7e9 100644
19+
--- a/CHANGELOG
20+
+++ b/CHANGELOG
21+
@@ -1,3 +1,9 @@
22+
+= Unreleased
23+
+
24+
+* Skip the lxml tree builder's test_surrogate_in_character_reference test
25+
+ if the libxml2 version is less than 2.13.0. Prior versions of libxml2
26+
+ don't issue the REPLACEMENT CHARACTER we're expecting. [bug=2134346]
27+
+
28+
= 4.14.3 (20251130)
29+
30+
* When using one of the lxml tree builders, you can pass in
31+
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
32+
index 0b69956..aa82143 100644
33+
--- a/bs4/tests/test_lxml.py
34+
+++ b/bs4/tests/test_lxml.py
35+
@@ -7,6 +7,7 @@ from . import LXML_PRESENT, LXML_VERSION
36+
37+
if LXML_PRESENT:
38+
from bs4.builder._lxml import LXMLTreeBuilder, LXMLTreeBuilderForXML
39+
+ from lxml import etree
40+
41+
from bs4 import (
42+
BeautifulStoneSoup,
43+
@@ -47,7 +48,6 @@ class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest):
44+
45+
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
46+
# test if an old version of lxml is installed.
47+
-
48+
@pytest.mark.skipif(
49+
not LXML_PRESENT or LXML_VERSION < (2, 3, 5, 0),
50+
reason="Skipping doctype test for old version of lxml to avoid segfault.",
51+
@@ -57,6 +57,18 @@ class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest):
52+
doctype = soup.contents[0]
53+
assert "" == doctype.strip()
54+
55+
+ # This is a copy of the HTMLTreeBuilderSmokeTest implementation.
56+
+ # For lxml only, we need to skip the test if the libxml2 version doesn't
57+
+ # have the fix from https://gitlab.gnome.org/GNOME/libxml2/-/commit/4dcc2d743eb83b8aaec0d91660d615fdb024dad0. That means any pre-2.13 version.
58+
+ @pytest.mark.skipif(
59+
+ "etree.LIBXML_VERSION < (2, 13, 0)",
60+
+ reason="libxml version doesn't issue REPLACEMENT CHARACTER",
61+
+ )
62+
+ def test_surrogate_in_character_reference(self):
63+
+ # These character references are invalid and should be replaced with REPLACEMENT CHARACTER.
64+
+ soup = self.soup("<html><body>&#55357;&#56551;</body></html>")
65+
+ assert soup.body.contents == ['��']
66+
+
67+
def test_beautifulstonesoup_is_xml_parser(self):
68+
# Make sure that the deprecated BSS class uses an xml builder
69+
# if one is installed.
70+
diff --git a/tox.ini b/tox.ini
71+
index c53e4d8..c60c3e7 100644
72+
--- a/tox.ini
73+
+++ b/tox.ini
74+
@@ -2,7 +2,7 @@
75+
# encoding autodetection libraries: cchardet, chardet, and charset-normalizer
76+
[tox]
77+
env_list =
78+
- py{37, 38, 39, 310, 311, 312, 313},bare,docs
79+
+ py{37, 38, 39, 310, 311, 312, 313, 314},bare,docs
80+
minversion = 3.28.0
81+
skip_missing_interpreters = true
82+
83+
--
84+
2.52.0
85+
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
Patch Reference: https://src.fedoraproject.org/rpms/python-beautifulsoup4/blob/rawhide/f/beautifulsoup4-4.14-disable-soupsieve.patch
2+
---
3+
--- a/pyproject.toml
4+
+++ b/pyproject.toml
5+
@@ -31,7 +31,6 @@ classifiers = [
6+
]
7+
dependencies = [
8+
"typing-extensions>=4.0.0",
9+
- "soupsieve>=1.6.1",
10+
]
11+
12+
[project.optional-dependencies]
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
22
"Signatures": {
3-
"python-beautifulsoup4-4.12.3.tar.gz": "74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"
3+
"python-beautifulsoup4-4.14.3.tar.gz": "6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86"
44
}
55
}

SPECS-EXTENDED/python-beautifulsoup4/python-beautifulsoup4.spec

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,17 @@
33
%bcond tests 1
44

55
Name: python-beautifulsoup4
6-
Version: 4.12.3
7-
Release: 8%{?dist}
6+
Version: 4.14.3
7+
Release: 1%{?dist}
88
Summary: HTML/XML parser for quick-turnaround applications like screen-scraping
99
License: MIT
1010
Vendor: Microsoft Corporation
1111
Distribution: Azure Linux
1212
URL: https://www.crummy.com/software/BeautifulSoup/
1313
Source0: https://files.pythonhosted.org/packages/source/b/beautifulsoup4/beautifulsoup4-%{version}.tar.gz#/%{name}-%{version}.tar.gz
14-
# https://git.launchpad.net/beautifulsoup/commit/?id=9786a62726de5a8caba10021c4d4a58c8a3e9e3f
15-
16-
Patch0: soupsieve26.patch
17-
14+
Patch0: 0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch
15+
Patch1: 0001-Change-the-html.parser-tree-builder-s-code-for-handl.patch
16+
Patch11: beautifulsoup4-4.14-disable-soupsieve.patch
1817
BuildArch: noarch
1918
# html5lib BR just for test coverage
2019
%if %{with tests}
@@ -37,6 +36,7 @@ BuildRequires: python3-colorama
3736
BuildRequires: python3-chardet
3837
BuildRequires: python-cachetools
3938
BuildRequires: python3-pyproject-api
39+
4040
%if %{with soupsieve}
4141
BuildRequires: python3-packaging
4242
BuildRequires: python3-soupsieve
@@ -75,10 +75,11 @@ Obsoletes: python3-BeautifulSoup < 1:3.2.1-2
7575
%description -n python3-beautifulsoup4 %_description
7676

7777
%prep
78-
%autosetup -p1 -n beautifulsoup4-%{version}
79-
# Fix compatibility with lxml 5.3.0
80-
# Reported upstream: https://bugs.launchpad.net/beautifulsoup/+bug/2076897
81-
sed -i "s/strip_cdata=False,//" bs4/builder/_lxml.py
78+
%autosetup -N -n beautifulsoup4-%{version}
79+
%autopatch -p1 -M 10
80+
%if %{without soupsieve}
81+
%autopatch -p1 -m 10
82+
%endif
8283

8384
%generate_buildrequires
8485
%pyproject_buildrequires %{?with_tests: -t}
@@ -102,6 +103,9 @@ python3 -m tox -q --recreate -e py312
102103
%{python3_sitelib}/bs4
103104

104105
%changelog
106+
* Mon May 25 2026 Durga Jagadeesh Palli <v-dpalli@microsoft.com> - 4.14.3-1
107+
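- Upgrade to 4.14.3 to fix ptest failures; drop soupsieve26.patch and the lxml sed workaround, and add patches from Fedora for the html.parser charref handling, the lxml surrogate test skip, and disabling soupsieve.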
108+
105109
* Fri Mar 21 2025 Jyoti kanase <v-jykanase@microsoft.com> - 4.12.3-8
106110
- Initial Azure Linux import from Fedora 41 (license: MIT).
107111
- License verified.

SPECS-EXTENDED/python-beautifulsoup4/soupsieve26.patch

Lines changed: 0 additions & 48 deletions
This file was deleted.

cgmanifest.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22363,8 +22363,8 @@
2236322363
"type": "other",
2236422364
"other": {
2236522365
"name": "python-beautifulsoup4",
22366-
"version": "4.12.3",
22367-
"downloadUrl": "https://files.pythonhosted.org/packages/source/b/beautifulsoup4/beautifulsoup4-4.12.3.tar.gz"
22366+
"version": "4.14.3",
22367+
"downloadUrl": "https://files.pythonhosted.org/packages/source/b/beautifulsoup4/beautifulsoup4-4.14.3.tar.gz"
2236822368
}
2236922369
}
2237022370
},

0 commit comments

Comments
 (0)