diff --git a/CHANGES.md b/CHANGES.md index 0e7764fdf..692f11722 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -21,7 +21,7 @@ Changes: - Fixes: -- +- Improve HTML entity sanitization to remove all invalid XML character entities (Issue #348) ## Current diff --git a/juriscraper/lib/html_utils.py b/juriscraper/lib/html_utils.py index b15a72916..a7445373e 100644 --- a/juriscraper/lib/html_utils.py +++ b/juriscraper/lib/html_utils.py @@ -214,9 +214,17 @@ def clean_html(text: str) -> str: if isinstance(text, str): text = re.sub(r"^\s*<\?xml\s+.*?\?>", "", text) - # Remove bad escaped HTML chars or to or since are not - # valid XML bytes 0x1 to 0x8 - text = re.sub(r"[1-8]\b|[1-8]\b", "", text) + # Remove HTML entities for invalid XML characters + # Invalid chars: 0x00-0x08, 0x0B-0x0C, 0x0E-0x1F + # Valid chars that look invalid: 0x09 (tab), 0x0A (LF), 0x0D (CR) + # Decimal: &#0; through &#8;, &#11;, &#12;, &#14; through &#31; + # Hex: &#x0; through &#x1F; (excluding 9, A, D) + text = re.sub( + r"([0-8]|11|12|1[4-9]|2[0-9]|3[01]);|" + r"[xX]([0-8bceBCE]|1[0-9a-fA-F]);", + "", + text, + ) # Fix invalid bytes in XML (http://stackoverflow.com/questions/8733233/) text = re.sub( diff --git a/tests/local/test_xml_character_sanitization.py b/tests/local/test_xml_character_sanitization.py new file mode 100644 index 000000000..24b8fd51f --- /dev/null +++ b/tests/local/test_xml_character_sanitization.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +"""Tests for XML character sanitization in PACER parsers. + +This addresses Issue #348: Invalid XML characters break docket parsers. 
+""" + +import unittest + +from lxml.etree import XMLSyntaxError +from lxml.html import fromstring + +from juriscraper.lib.html_utils import clean_html, strip_bad_html_tags_insecure + + +class XmlCharacterSanitizationTest(unittest.TestCase): + """Test that invalid XML characters are properly handled.""" + + def test_escape_character_in_html(self): + """Test that ESC character (\x1b) is removed from HTML.""" + # This is the problematic character from Issue #348 + html_with_esc = "
Test\x1bString
" + + # clean_html should remove the invalid character + cleaned = clean_html(html_with_esc) + self.assertNotIn("\x1b", cleaned) + self.assertIn("TestString", cleaned) + + # Should be parseable without error + try: + tree = fromstring(cleaned) + text = tree.text_content() + self.assertIn("TestString", text) + except XMLSyntaxError: + self.fail("XMLSyntaxError raised even after cleaning") + + def test_various_invalid_xml_characters(self): + """Test that various invalid XML characters are removed.""" + invalid_chars = [ + ("\x00", "NULL"), + ("\x01", "SOH"), + ("\x02", "STX"), + ("\x08", "BS"), + ("\x0b", "VT"), + ("\x0c", "FF"), + ("\x0e", "SO"), + ("\x1b", "ESC"), + ("\x1f", "US"), + ] + + for char, name in invalid_chars: + with self.subTest(char=name): + html = f"Before{char}After
" + cleaned = clean_html(html) + self.assertNotIn( + char, cleaned, f"{name} character should be removed" + ) + self.assertIn("BeforeAfter", cleaned) + + def test_valid_xml_characters_preserved(self): + """Test that valid XML characters are preserved.""" + # Tab, newline, and carriage return are valid + html = "Line1\tTab\nLine2\rLine3
" + cleaned = clean_html(html) + self.assertIn("\t", cleaned) + self.assertIn("\n", cleaned) + self.assertIn("\r", cleaned) + + def test_strip_bad_html_tags_with_invalid_chars(self): + """Test that strip_bad_html_tags_insecure handles invalid chars.""" + html_with_esc = "Test\x1bString
" + + # First clean, then strip bad tags + cleaned = clean_html(html_with_esc) + + try: + tree = strip_bad_html_tags_insecure(cleaned) + text = tree.text_content() + self.assertIn("TestString", text) + self.assertNotIn("\x1b", text) + except XMLSyntaxError: + self.fail("XMLSyntaxError raised in strip_bad_html_tags_insecure") + + def test_html_entities_for_invalid_chars(self): + """Test that HTML entities for invalid chars are removed.""" + # Some systems might encode invalid chars as HTML entities + html_with_entity = "TestString
" + cleaned = clean_html(html_with_entity) + + # The entity should be removed + self.assertNotIn("", cleaned) + self.assertNotIn("\x1b", cleaned) + + # Should be parseable + try: + tree = fromstring(cleaned) + text = tree.text_content() + self.assertIn("TestString", text) + except XMLSyntaxError: + self.fail("XMLSyntaxError raised with HTML entity") + + def test_real_world_docket_text(self): + """Test with a more realistic docket entry containing invalid chars.""" + # Simulate a docket entry that might have escape sequences + html = """ + + +| 01/15/2020 | +MOTION for Summary Judgment by \x1bDefendant | +