|
| 1 | +From 55f655ffb7ef03bdd1df0f013743831fe54e3c7a Mon Sep 17 00:00:00 2001 |
| 2 | +From: Leonard Richardson <leonardr@segfault.org> |
| 3 | +Date: Mon, 8 Dec 2025 19:34:16 -0500 |
| 4 | +Subject: [PATCH] * Change the html.parser tree builder's code for handling |
| 5 | + numeric character references, to avoid a crash when using Python versions |
| 6 | + that include the fix to Python issue https://bugs.python.org/issue13633 |
| 7 | + (e.g. Python 3.11.13). [bug=2134393] |
| 8 | +- This version also includes the fix for CVE-2024-34062. |
| 9 | + |
| 10 | +Patch Reference: https://src.fedoraproject.org/rpms/python-beautifulsoup4/blob/rawhide/f/0001-Change-the-html.parser-tree-builder-s-code-for-handl.patch |
| 11 | + |
| 12 | +--- |
| 13 | + CHANGELOG | 5 +++ |
| 14 | + bs4/builder/_htmlparser.py | 78 +++++++++++++++++++++++++++++------- |
| 15 | + bs4/tests/test_htmlparser.py | 17 ++++++++ |
| 16 | + 3 files changed, 86 insertions(+), 14 deletions(-) |
| 17 | + |
| 18 | +diff --git a/CHANGELOG b/CHANGELOG |
| 19 | +index f61b7e9..606e9f5 100644 |
| 20 | +--- a/CHANGELOG |
| 21 | ++++ b/CHANGELOG |
| 22 | +@@ -1,5 +1,10 @@ |
| 23 | + = Unreleased |
| 24 | + |
| 25 | ++* Change the html.parser tree builder's code for handling numeric |
| 26 | ++ character references, to avoid a crash when using Python versions |
| 27 | ++ that include the fix to Python issue https://bugs.python.org/issue13633 |
| 28 | ++ (e.g. Python 3.11.13). [bug=2134393] |
| 29 | ++ |
| 30 | + * Skip the lxml tree builder's test_surrogate_in_character_reference test |
| 31 | + if the libxml2 version is less than 2.13.0. Prior versions of libxml2 |
| 32 | + don't issue the REPLACEMENT CHARACTER we're expecting. [bug=2134346] |
| 33 | +diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py |
| 34 | +index 165a3d8..ead800f 100644 |
| 35 | +--- a/bs4/builder/_htmlparser.py |
| 36 | ++++ b/bs4/builder/_htmlparser.py |
| 37 | +@@ -10,6 +10,7 @@ __all__ = [ |
| 38 | + ] |
| 39 | + |
| 40 | + from html.parser import HTMLParser |
| 41 | ++import re |
| 42 | + |
| 43 | + from typing import ( |
| 44 | + Any, |
| 45 | +@@ -223,6 +224,64 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML): |
| 46 | + """Handle some textual data that shows up between tags.""" |
| 47 | + self.soup.handle_data(data) |
| 48 | + |
| 49 | +    # Leading digits, then any trailing data. DOTALL keeps multi-line trailing data; hex digits match in either case, as int() does. |
| 50 | +    _DECIMAL_REFERENCE_WITH_FOLLOWING_DATA = re.compile(r"^([0-9]+)(.*)", re.DOTALL) |
| 51 | +    _HEX_REFERENCE_WITH_FOLLOWING_DATA = re.compile(r"^([0-9a-fA-F]+)(.*)", re.DOTALL) |
| 52 | + |
| 53 | +    @classmethod |
| 54 | +    def _dereference_numeric_character_reference(cls, name:str) -> Tuple[str, bool, str]: |
| 55 | +        """Convert a numeric character reference into an actual character. |
| 56 | + |
| 57 | +        :param name: The number of the character reference, as |
| 58 | +            obtained by html.parser |
| 59 | + |
| 60 | +        :return: A 3-tuple (dereferenced, replacement_added, |
| 61 | +            extra_data). `dereferenced` is the dereferenced character |
| 62 | +            reference, or the empty string if there was no |
| 63 | +            reference. `replacement_added` is True if the reference |
| 64 | +            could only be dereferenced by replacing content with U+FFFD |
| 65 | +            REPLACEMENT CHARACTER. `extra_data` is the data following |
| 66 | +            the character reference (or all of `name`, if there was no |
| 67 | +            reference), which is to be treated as normal string data. |
| 68 | +        """ |
| 69 | +        replacement_added: bool = False |
| 70 | +        extra_data: str = "" |
| 71 | +        base: int = 10 |
| 72 | +        digits: str = name |
| 73 | +        reg = cls._DECIMAL_REFERENCE_WITH_FOLLOWING_DATA |
| 74 | +        if name.startswith(("x", "X")): |
| 75 | +            # Hex reference. Strip the prefix from a copy, so that `name` |
| 76 | +            # is still intact if this turns out not to be a reference. |
| 77 | +            digits = name[1:] |
| 78 | +            base = 16 |
| 79 | +            reg = cls._HEX_REFERENCE_WITH_FOLLOWING_DATA |
| 80 | + |
| 81 | +        real_name: Optional[int] = None |
| 82 | +        try: |
| 83 | +            real_name = int(digits, base) |
| 84 | +        except ValueError: |
| 85 | +            # Either bad data that merely starts like a numeric reference, |
| 86 | +            # or a real reference that wasn't terminated by a semicolon. |
| 87 | +            # The fix to https://bugs.python.org/issue13633 made it our |
| 88 | +            # responsibility to handle the extra data: treat the leading |
| 89 | +            # digits as the reference and everything after as string data. |
| 90 | +            match = reg.search(digits) |
| 91 | +            if match is not None: |
| 92 | +                extra_data = match.group(2) |
| 93 | +                try: |
| 94 | +                    real_name = int(match.group(1), base) |
| 95 | +                except ValueError: |
| 96 | +                    # Recent Python versions cap the length of decimal strings |
| 97 | +                    # that int() will convert. A number that long is far outside |
| 98 | +                    # the Unicode range, so use the smallest out-of-range value. |
| 99 | +                    real_name = 0x110000 |
| 100 | + |
| 101 | +        if real_name is None: |
| 102 | +            # Not a reference at all: hand the whole input back as data. |
| 103 | +            return "", replacement_added, name |
| 104 | +        dereferenced, replacement_added = UnicodeDammit.numeric_character_reference(real_name) |
| 105 | +        return dereferenced, replacement_added, extra_data |
| 106 | ++ |
| 107 | + def handle_charref(self, name: str) -> None: |
| 108 | + """Handle a numeric character reference by converting it to the |
| 109 | + corresponding Unicode character and treating it as textual |
| 110 | +@@ -230,22 +289,13 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML): |
| 111 | + |
| 112 | + :param name: Character number, possibly in hexadecimal. |
| 113 | + """ |
| 114 | +- # TODO: This was originally a workaround for a bug in |
| 115 | +- # HTMLParser. (http://bugs.python.org/issue13633) The bug has |
| 116 | +- # been fixed, but removing this code still makes some |
| 117 | +- # Beautiful Soup tests fail. This needs investigation. |
| 118 | +- real_name:int |
| 119 | +- if name.startswith("x"): |
| 120 | +- real_name = int(name.lstrip("x"), 16) |
| 121 | +- elif name.startswith("X"): |
| 122 | +- real_name = int(name.lstrip("X"), 16) |
| 123 | +- else: |
| 124 | +- real_name = int(name) |
| 125 | +- |
| 126 | +- data, replacement_added = UnicodeDammit.numeric_character_reference(real_name) |
| 127 | ++ dereferenced, replacement_added, extra_data = self._dereference_numeric_character_reference(name) |
| 128 | + if replacement_added: |
| 129 | + self.soup.contains_replacement_characters = True |
| 130 | +- self.handle_data(data) |
| 131 | ++ if dereferenced is not None: |
| 132 | ++ self.handle_data(dereferenced) |
| 133 | ++ if extra_data is not None: |
| 134 | ++ self.handle_data(extra_data) |
| 135 | + |
| 136 | + def handle_entityref(self, name: str) -> None: |
| 137 | + """Handle a named entity reference by converting it to the |
| 138 | +diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py |
| 139 | +index 0086a9d..cb85b53 100644 |
| 140 | +--- a/bs4/tests/test_htmlparser.py |
| 141 | ++++ b/bs4/tests/test_htmlparser.py |
| 142 | +@@ -162,3 +162,20 @@ class TestHTMLParserTreeBuilder(HTMLTreeBuilderSmokeTest): |
| 143 | + # Since we do the replacement ourselves, we can set contains_replacement_characters appropriately. |
| 144 | + # lxml and html5lib do the replacement so all we ever see is REPLACEMENT CHARACTER. |
| 145 | + assert soup.contains_replacement_characters == True |
| 146 | ++ |
| 147 | +class TestBeautifulSoupHTMLParser: |
| 148 | +    def test_dereference_numeric_character_reference(self): |
| 149 | +        """Pin the (dereferenced, replacement_added, extra_data) results, quirks included.""" |
| 150 | +        deref = BeautifulSoupHTMLParser._dereference_numeric_character_reference |
| 151 | +        assert deref("64") == ("@", False, "") |
| 152 | +        assert deref("x64") == ("d", False, "") |
| 153 | +        assert deref("X64") == ("d", False, "") |
| 154 | +        assert deref("64andsomeextra") == ("@", False, "andsomeextra") |
| 155 | +        assert deref("") == ("", False, "") |
| 156 | +        assert deref("00whee") == ("�", True, "whee") |
| 157 | +        assert deref("xfffdthatsit") == ("�", False, "thatsit") |
| 158 | +        assert deref("xabcdplussomeextra") == ("ꯍ", False, "plussomeextra") |
| 159 | +        assert deref("obviouslynotnumeric") == ("", False, "obviouslynotnumeric") |
| 160 | +        # Questionable but crash-free: leading hex digits of the trailing text ("a", "bee") get absorbed into the reference. |
| 161 | +        assert deref("xabcdandsomeextra") == ("\U000abcda", False, "ndsomeextra") |
| 162 | +        assert deref("xffffffffffffffffffffffbeep") == ("�", True, "p") |
| 163 | +-- |
| 164 | +2.52.0 |
| 165 | + |
0 commit comments