From e1f130f0fd5c47ef7442c65070d5f9d1d1c4d653 Mon Sep 17 00:00:00 2001 From: Rithboss Date: Mon, 1 Dec 2025 20:47:20 -0500 Subject: [PATCH 1/2] fix: Improve HTML entity sanitization for invalid XML characters Fixes #348 - Enhanced clean_html() to remove HTML entities for all invalid XML characters - Added comprehensive regex to handle decimal entities (&#0; through &#31;) - Added support for hexadecimal entities (&#x0; through &#x1F;) - Excludes valid XML characters: tab (0x09), LF (0x0A), CR (0x0D) - Added comprehensive test suite with 6 test cases covering: - ESC character (\x1b) removal - Various invalid XML characters (NULL, SOH, STX, BS, VT, FF, SO, ESC, US) - Valid XML character preservation (tab, newline, carriage return) - HTML entity handling (&#27;, etc.) - Integration with strip_bad_html_tags_insecure() - Real-world docket text scenarios This prevents XMLSyntaxError when parsing PACER dockets that contain invalid XML characters like escape sequences. --- CHANGES.md | 2 +- juriscraper/lib/html_utils.py | 14 +- .../local/test_xml_character_sanitization.py | 129 ++++++++++++++++++ 3 files changed, 141 insertions(+), 4 deletions(-) create mode 100644 tests/local/test_xml_character_sanitization.py diff --git a/CHANGES.md b/CHANGES.md index 0e7764fdf..692f11722 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -21,7 +21,7 @@ Changes: - Fixes: -- +- Improve HTML entity sanitization to remove all invalid XML character entities (Issue #348) ## Current diff --git a/juriscraper/lib/html_utils.py b/juriscraper/lib/html_utils.py index b15a72916..a7445373e 100644 --- a/juriscraper/lib/html_utils.py +++ b/juriscraper/lib/html_utils.py @@ -214,9 +214,17 @@ def clean_html(text: str) -> str: if isinstance(text, str): text = re.sub(r"^\s*<\?xml\s+.*?\?>", "", text) - # Remove bad escaped HTML chars &#01; or &#1; to &#08; or &#8; since are not - # valid XML bytes 0x1 to 0x8 - text = re.sub(r"&#0[1-8]\b|&#[1-8]\b", "", text) + # Remove HTML entities for invalid XML characters + # Invalid chars: 0x00-0x08, 0x0B-0x0C,
0x0E-0x1F + # Valid chars that look invalid: 0x09 (tab), 0x0A (LF), 0x0D (CR) + # Decimal: &#0; through &#8;, &#11;, &#12;, &#14; through &#31; + # Hex: &#x0; through &#x1F; (excluding 9, A, D) + text = re.sub( + r"&#([0-8]|11|12|1[4-9]|2[0-9]|3[01]);|" + r"&#[xX]([0-8bceBCE]|1[0-9a-fA-F]);", + "", + text, + ) # Fix invalid bytes in XML (http://stackoverflow.com/questions/8733233/) text = re.sub( diff --git a/tests/local/test_xml_character_sanitization.py b/tests/local/test_xml_character_sanitization.py new file mode 100644 index 000000000..0570c7483 --- /dev/null +++ b/tests/local/test_xml_character_sanitization.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +"""Tests for XML character sanitization in PACER parsers. + +This addresses Issue #348: Invalid XML characters break docket parsers. +""" + +import unittest + +from lxml.html import fromstring +from lxml.etree import XMLSyntaxError + +from juriscraper.lib.html_utils import clean_html, strip_bad_html_tags_insecure + + +class XmlCharacterSanitizationTest(unittest.TestCase): + """Test that invalid XML characters are properly handled.""" + + def test_escape_character_in_html(self): + """Test that ESC character (\x1b) is removed from HTML.""" + # This is the problematic character from Issue #348 + html_with_esc = '

Test\x1bString

' + + # clean_html should remove the invalid character + cleaned = clean_html(html_with_esc) + self.assertNotIn('\x1b', cleaned) + self.assertIn('TestString', cleaned) + + # Should be parseable without error + try: + tree = fromstring(cleaned) + text = tree.text_content() + self.assertIn('TestString', text) + except XMLSyntaxError: + self.fail("XMLSyntaxError raised even after cleaning") + + def test_various_invalid_xml_characters(self): + """Test that various invalid XML characters are removed.""" + invalid_chars = [ + ('\x00', 'NULL'), + ('\x01', 'SOH'), + ('\x02', 'STX'), + ('\x08', 'BS'), + ('\x0b', 'VT'), + ('\x0c', 'FF'), + ('\x0e', 'SO'), + ('\x1b', 'ESC'), + ('\x1f', 'US'), + ] + + for char, name in invalid_chars: + with self.subTest(char=name): + html = f'

Before{char}After

' + cleaned = clean_html(html) + self.assertNotIn(char, cleaned, + f"{name} character should be removed") + self.assertIn('BeforeAfter', cleaned) + + def test_valid_xml_characters_preserved(self): + """Test that valid XML characters are preserved.""" + # Tab, newline, and carriage return are valid + html = '

Line1\tTab\nLine2\rLine3

' + cleaned = clean_html(html) + self.assertIn('\t', cleaned) + self.assertIn('\n', cleaned) + self.assertIn('\r', cleaned) + + def test_strip_bad_html_tags_with_invalid_chars(self): + """Test that strip_bad_html_tags_insecure handles invalid chars.""" + html_with_esc = '

Test\x1bString

' + + # First clean, then strip bad tags + cleaned = clean_html(html_with_esc) + + try: + tree = strip_bad_html_tags_insecure(cleaned) + text = tree.text_content() + self.assertIn('TestString', text) + self.assertNotIn('\x1b', text) + except XMLSyntaxError: + self.fail("XMLSyntaxError raised in strip_bad_html_tags_insecure") + + def test_html_entities_for_invalid_chars(self): + """Test that HTML entities for invalid chars are removed.""" + # Some systems might encode invalid chars as HTML entities + html_with_entity = '

Test&#27;String

' + cleaned = clean_html(html_with_entity) + + # The entity should be removed + self.assertNotIn('&#27;', cleaned) + self.assertNotIn('\x1b', cleaned) + + # Should be parseable + try: + tree = fromstring(cleaned) + text = tree.text_content() + self.assertIn('TestString', text) + except XMLSyntaxError: + self.fail("XMLSyntaxError raised with HTML entity") + + def test_real_world_docket_text(self): + """Test with a more realistic docket entry containing invalid chars.""" + # Simulate a docket entry that might have escape sequences + html = ''' + + + + + + + +
01/15/2020MOTION for Summary Judgment by \x1bDefendant
+ + + ''' + + cleaned = clean_html(html) + self.assertNotIn('\x1b', cleaned) + + try: + tree = fromstring(cleaned) + text = tree.text_content() + self.assertIn('Defendant', text) + except XMLSyntaxError: + self.fail("XMLSyntaxError raised with docket-like HTML") + + +if __name__ == "__main__": + unittest.main() + From f1e5ddb2ed458efc209e4c452f0d95fa90f16983 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 2 Dec 2025 01:48:36 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../local/test_xml_character_sanitization.py | 86 +++++++++---------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/tests/local/test_xml_character_sanitization.py b/tests/local/test_xml_character_sanitization.py index 0570c7483..24b8fd51f 100644 --- a/tests/local/test_xml_character_sanitization.py +++ b/tests/local/test_xml_character_sanitization.py @@ -6,8 +6,8 @@ import unittest -from lxml.html import fromstring from lxml.etree import XMLSyntaxError +from lxml.html import fromstring from juriscraper.lib.html_utils import clean_html, strip_bad_html_tags_insecure @@ -18,89 +18,90 @@ class XmlCharacterSanitizationTest(unittest.TestCase): def test_escape_character_in_html(self): """Test that ESC character (\x1b) is removed from HTML.""" # This is the problematic character from Issue #348 - html_with_esc = '

Test\x1bString

' - + html_with_esc = "

Test\x1bString

" + # clean_html should remove the invalid character cleaned = clean_html(html_with_esc) - self.assertNotIn('\x1b', cleaned) - self.assertIn('TestString', cleaned) - + self.assertNotIn("\x1b", cleaned) + self.assertIn("TestString", cleaned) + # Should be parseable without error try: tree = fromstring(cleaned) text = tree.text_content() - self.assertIn('TestString', text) + self.assertIn("TestString", text) except XMLSyntaxError: self.fail("XMLSyntaxError raised even after cleaning") def test_various_invalid_xml_characters(self): """Test that various invalid XML characters are removed.""" invalid_chars = [ - ('\x00', 'NULL'), - ('\x01', 'SOH'), - ('\x02', 'STX'), - ('\x08', 'BS'), - ('\x0b', 'VT'), - ('\x0c', 'FF'), - ('\x0e', 'SO'), - ('\x1b', 'ESC'), - ('\x1f', 'US'), + ("\x00", "NULL"), + ("\x01", "SOH"), + ("\x02", "STX"), + ("\x08", "BS"), + ("\x0b", "VT"), + ("\x0c", "FF"), + ("\x0e", "SO"), + ("\x1b", "ESC"), + ("\x1f", "US"), ] - + for char, name in invalid_chars: with self.subTest(char=name): - html = f'

Before{char}After

' + html = f"

Before{char}After

" cleaned = clean_html(html) - self.assertNotIn(char, cleaned, - f"{name} character should be removed") - self.assertIn('BeforeAfter', cleaned) + self.assertNotIn( + char, cleaned, f"{name} character should be removed" + ) + self.assertIn("BeforeAfter", cleaned) def test_valid_xml_characters_preserved(self): """Test that valid XML characters are preserved.""" # Tab, newline, and carriage return are valid - html = '

Line1\tTab\nLine2\rLine3

' + html = "

Line1\tTab\nLine2\rLine3

" cleaned = clean_html(html) - self.assertIn('\t', cleaned) - self.assertIn('\n', cleaned) - self.assertIn('\r', cleaned) + self.assertIn("\t", cleaned) + self.assertIn("\n", cleaned) + self.assertIn("\r", cleaned) def test_strip_bad_html_tags_with_invalid_chars(self): """Test that strip_bad_html_tags_insecure handles invalid chars.""" - html_with_esc = '

Test\x1bString

' - + html_with_esc = "

Test\x1bString

" + # First clean, then strip bad tags cleaned = clean_html(html_with_esc) - + try: tree = strip_bad_html_tags_insecure(cleaned) text = tree.text_content() - self.assertIn('TestString', text) - self.assertNotIn('\x1b', text) + self.assertIn("TestString", text) + self.assertNotIn("\x1b", text) except XMLSyntaxError: self.fail("XMLSyntaxError raised in strip_bad_html_tags_insecure") def test_html_entities_for_invalid_chars(self): """Test that HTML entities for invalid chars are removed.""" # Some systems might encode invalid chars as HTML entities - html_with_entity = '

Test&#27;String

' + html_with_entity = "

Test&#27;String

" cleaned = clean_html(html_with_entity) - + # The entity should be removed - self.assertNotIn('&#27;', cleaned) - self.assertNotIn('\x1b', cleaned) - + self.assertNotIn("&#27;", cleaned) + self.assertNotIn("\x1b", cleaned) + # Should be parseable try: tree = fromstring(cleaned) text = tree.text_content() - self.assertIn('TestString', text) + self.assertIn("TestString", text) except XMLSyntaxError: self.fail("XMLSyntaxError raised with HTML entity") def test_real_world_docket_text(self): """Test with a more realistic docket entry containing invalid chars.""" # Simulate a docket entry that might have escape sequences - html = ''' + html = """ @@ -111,19 +112,18 @@ def test_real_world_docket_text(self):
- ''' - + """ + cleaned = clean_html(html) - self.assertNotIn('\x1b', cleaned) - + self.assertNotIn("\x1b", cleaned) + try: tree = fromstring(cleaned) text = tree.text_content() - self.assertIn('Defendant', text) + self.assertIn("Defendant", text) except XMLSyntaxError: self.fail("XMLSyntaxError raised with docket-like HTML") if __name__ == "__main__": unittest.main() -