freelawproject · Rithboss · Dec 2, 2025 · Dec 2, 2025
diff --git a/CHANGES.md b/CHANGES.md
@@ -21,7 +21,7 @@ Changes:
 -
 
 Fixes:
--
+- Improve HTML entity sanitization to remove all invalid XML character entities (Issue #348)
 
 
 ## Current

diff --git a/juriscraper/lib/html_utils.py b/juriscraper/lib/html_utils.py
@@ -214,9 +214,17 @@ def clean_html(text: str) -> str:
     if isinstance(text, str):
         text = re.sub(r"^\s*<\?xml\s+.*?\?>", "", text)
 
-        # Remove bad escaped HTML chars &#01 or &#1 to &#08 or &#8 since are not
-        # valid XML bytes 0x1 to 0x8
-        text = re.sub(r"&#0[1-8]\b|&#[1-8]\b", "", text)
+        # Remove numeric character references to characters XML forbids:
+        # 0x00-0x08, 0x0B-0x0C and 0x0E-0x1F (tab 0x09, LF 0x0A and CR 0x0D
+        # are valid and are kept). Leading zeros are accepted, and the ";"
+        # may be missing as long as the number ends at a word boundary, so
+        # "&#27;", "&#027;", "&#x1B;", "&#xF;" and a bare "&#01" all match.
+        text = re.sub(
+            r"&#0*(?:[0-8]|1[124-9]|2[0-9]|3[01])(?:;|\b)|"
+            r"&#[xX]0*(?:[0-8bcefBCEF]|1[0-9a-fA-F])(?:;|\b)",
+            "",
+            text,
+        )
 
     # Fix invalid bytes in XML (http://stackoverflow.com/questions/8733233/)
     text = re.sub(

diff --git a/tests/local/test_xml_character_sanitization.py b/tests/local/test_xml_character_sanitization.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+"""Tests for XML character sanitization in PACER parsers.
+
+This addresses Issue #348: Invalid XML characters break docket parsers.
+"""
+
+import unittest
+
+from lxml.etree import XMLSyntaxError
+from lxml.html import fromstring
+
+from juriscraper.lib.html_utils import clean_html, strip_bad_html_tags_insecure
+
+
+class XmlCharacterSanitizationTest(unittest.TestCase):
+    """Check that clean_html strips characters and entities invalid in XML."""
+
+    def test_escape_character_in_html(self):
+        """Test that a raw ESC character (0x1B) is removed from HTML."""
+        # ESC is the problematic character from Issue #348
+        html_with_esc = "<html><body><p>Test\x1bString</p></body></html>"
+
+        # clean_html should remove the invalid character
+        cleaned = clean_html(html_with_esc)
+        self.assertNotIn("\x1b", cleaned)
+        self.assertIn("TestString", cleaned)
+
+        # The cleaned markup must parse and the surrounding text must survive
+        try:
+            tree = fromstring(cleaned)
+            text = tree.text_content()
+            self.assertIn("TestString", text)
+        except XMLSyntaxError:
+            self.fail("XMLSyntaxError raised even after cleaning")
+
+    def test_various_invalid_xml_characters(self):
+        """Test that a sample of raw control characters is removed."""
+        invalid_chars = [
+            ("\x00", "NULL"),
+            ("\x01", "SOH"),
+            ("\x02", "STX"),
+            ("\x08", "BS"),
+            ("\x0b", "VT"),
+            ("\x0c", "FF"),
+            ("\x0e", "SO"),
+            ("\x1b", "ESC"),
+            ("\x1f", "US"),
+        ]
+
+        for char, name in invalid_chars:
+            with self.subTest(char=name):
+                html = f"<html><body><p>Before{char}After</p></body></html>"
+                cleaned = clean_html(html)
+                self.assertNotIn(
+                    char, cleaned, f"{name} character should be removed"
+                )
+                self.assertIn("BeforeAfter", cleaned)
+
+    def test_valid_xml_characters_preserved(self):
+        """Test that tab, LF, and CR are left in place by clean_html."""
+        # Tab, newline, and carriage return are valid XML characters
+        html = "<html><body><p>Line1\tTab\nLine2\rLine3</p></body></html>"
+        cleaned = clean_html(html)
+        self.assertIn("\t", cleaned)
+        self.assertIn("\n", cleaned)
+        self.assertIn("\r", cleaned)
+
+    def test_strip_bad_html_tags_with_invalid_chars(self):
+        """Test that cleaned HTML passes through strip_bad_html_tags_insecure."""
+        html_with_esc = "<html><body><p>Test\x1bString</p></body></html>"
+
+        # Clean first, then hand the result to the tag stripper
+        cleaned = clean_html(html_with_esc)
+
+        try:
+            tree = strip_bad_html_tags_insecure(cleaned)
+            text = tree.text_content()
+            self.assertIn("TestString", text)
+            self.assertNotIn("\x1b", text)
+        except XMLSyntaxError:
+            self.fail("XMLSyntaxError raised in strip_bad_html_tags_insecure")
+
+    def test_html_entities_for_invalid_chars(self):
+        """Test that the decimal entity for ESC (&#27;) is removed."""
+        # Some systems might encode invalid chars as HTML entities
+        html_with_entity = "<html><body><p>Test&#27;String</p></body></html>"
+        cleaned = clean_html(html_with_entity)
+
+        # Neither the entity nor the character it encodes may remain
+        self.assertNotIn("&#27;", cleaned)
+        self.assertNotIn("\x1b", cleaned)
+
+        # The cleaned markup must parse and the surrounding text must survive
+        try:
+            tree = fromstring(cleaned)
+            text = tree.text_content()
+            self.assertIn("TestString", text)
+        except XMLSyntaxError:
+            self.fail("XMLSyntaxError raised with HTML entity")
+
+    def test_real_world_docket_text(self):
+        """Test a docket-style table row containing a raw ESC character."""
+        # Simulate a docket entry that might have escape sequences
+        html = """
+        <html>
+        <body>
+        <table>
+            <tr>
+                <td>01/15/2020</td>
+                <td>MOTION for Summary Judgment by \x1bDefendant</td>
+            </tr>
+        </table>
+        </body>
+        </html>
+        """
+
+        cleaned = clean_html(html)
+        self.assertNotIn("\x1b", cleaned)
+
+        try:
+            tree = fromstring(cleaned)
+            text = tree.text_content()
+            self.assertIn("Defendant", text)
+        except XMLSyntaxError:
+            self.fail("XMLSyntaxError raised with docket-like HTML")
+
+
+if __name__ == "__main__":
+    unittest.main()
-Original file line number
+Diff line change
@@ Expand Up / @@ -21,7 +21,7 @@ Changes: @@
     -
     Fixes:
-    -
+    - Improve HTML entity sanitization to remove all invalid XML character entities (Issue #348)
     ## Current
@@ Expand Down @@