From e1f130f0fd5c47ef7442c65070d5f9d1d1c4d653 Mon Sep 17 00:00:00 2001
From: Rithboss <rithvikvanga@gmail.com>
Date: Mon, 1 Dec 2025 20:47:20 -0500
Subject: [PATCH 1/2] fix: Improve HTML entity sanitization for invalid XML
 characters

Fixes #348

- Enhanced clean_html() to remove HTML entities for all invalid XML characters
- Added comprehensive regex to handle decimal entities (&#0; through &#31;)
- Added support for hexadecimal entities (&#x0; through &#x1F;)
- Leaves entities for valid XML control characters untouched:
  tab (0x09), LF (0x0A), CR (0x0D)
- Added comprehensive test suite with 6 test cases covering:
  - ESC character (\x1b) removal
  - Various invalid XML characters (NULL, SOH, STX, BS, VT, FF, SO, ESC, US)
  - Valid XML character preservation (tab, newline, carriage return)
  - HTML entity handling (decimal &#27; only; hex entities are not covered)
  - Integration with strip_bad_html_tags_insecure()
  - Real-world docket text scenarios

This prevents XMLSyntaxError when parsing PACER dockets that contain
invalid XML characters such as the ESC control character (0x1B).
---
 CHANGES.md                                    |   2 +-
 juriscraper/lib/html_utils.py                 |  14 +-
 .../local/test_xml_character_sanitization.py  | 129 ++++++++++++++++++
 3 files changed, 141 insertions(+), 4 deletions(-)
 create mode 100644 tests/local/test_xml_character_sanitization.py
diff --git a/CHANGES.md b/CHANGES.md
index 0e7764fdf..692f11722 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -21,7 +21,7 @@ Changes:
 -
 
 Fixes:
--
+- Improve HTML entity sanitization to remove all invalid XML character entities (Issue #348)
 
 
 ## Current
diff --git a/juriscraper/lib/html_utils.py b/juriscraper/lib/html_utils.py
index b15a72916..a7445373e 100644
--- a/juriscraper/lib/html_utils.py
+++ b/juriscraper/lib/html_utils.py
@@ -214,9 +214,17 @@ def clean_html(text: str) -> str:
     if isinstance(text, str):
         text = re.sub(r"^\s*<\?xml\s+.*?\?>", "", text)
 
-        # Remove bad escaped HTML chars  or  to &#08 or &#8 since are not
-        # valid XML bytes 0x1 to 0x8
-        text = re.sub(r"&#0[1-8]\b|&#[1-8]\b", "", text)
+        # Remove numeric character references to invalid XML characters:
+        # 0x00-0x08, 0x0B-0x0C and 0x0E-0x1F. Tab (0x09), LF (0x0A) and CR
+        # (0x0D) are valid and kept. Leading zeros (&#08;, &#x0B;) and a
+        # missing ";" are accepted, as the pattern this replaces did; the
+        # lookaheads stop longer references such as &#80; from matching.
+        text = re.sub(
+            r"&#(?:0*(?:[0-8]|1[124-9]|2[0-9]|3[01])(?![0-9])"
+            r"|[xX]0*(?:[0-8bcefBCEF]|1[0-9a-fA-F])(?![0-9a-fA-F]));?",
+            "",
+            text,
+        )
 
     # Fix invalid bytes in XML (http://stackoverflow.com/questions/8733233/)
     text = re.sub(
diff --git a/tests/local/test_xml_character_sanitization.py b/tests/local/test_xml_character_sanitization.py
new file mode 100644
index 000000000..0570c7483
--- /dev/null
+++ b/tests/local/test_xml_character_sanitization.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+"""Tests for XML character sanitization in PACER parsers.
+
+This addresses Issue #348: Invalid XML characters break docket parsers.
+"""
+
+import unittest
+
+from lxml.html import fromstring
+from lxml.etree import XMLSyntaxError
+
+from juriscraper.lib.html_utils import clean_html, strip_bad_html_tags_insecure
+
+
+class XmlCharacterSanitizationTest(unittest.TestCase):
+    """Test that invalid XML characters are properly handled."""
+
+    def test_escape_character_in_html(self):
+        """Test that ESC character (\x1b) is removed from HTML."""
+        # This is the problematic character from Issue #348
+        html_with_esc = '<html><body><p>Test\x1bString</p></body></html>'
+        
+        # clean_html should remove the invalid character
+        cleaned = clean_html(html_with_esc)
+        self.assertNotIn('\x1b', cleaned)
+        self.assertIn('TestString', cleaned)
+        
+        # Should be parseable without error
+        try:
+            tree = fromstring(cleaned)
+            text = tree.text_content()
+            self.assertIn('TestString', text)
+        except XMLSyntaxError:
+            self.fail("XMLSyntaxError raised even after cleaning")
+
+    def test_various_invalid_xml_characters(self):
+        """Test that various invalid XML characters are removed."""
+        invalid_chars = [
+            ('\x00', 'NULL'),
+            ('\x01', 'SOH'),
+            ('\x02', 'STX'),
+            ('\x08', 'BS'),
+            ('\x0b', 'VT'),
+            ('\x0c', 'FF'),
+            ('\x0e', 'SO'),
+            ('\x1b', 'ESC'),
+            ('\x1f', 'US'),
+        ]
+        
+        for char, name in invalid_chars:
+            with self.subTest(char=name):
+                html = f'<html><body><p>Before{char}After</p></body></html>'
+                cleaned = clean_html(html)
+                self.assertNotIn(char, cleaned, 
+                                f"{name} character should be removed")
+                self.assertIn('BeforeAfter', cleaned)
+
+    def test_valid_xml_characters_preserved(self):
+        """Test that valid XML characters are preserved."""
+        # Tab, newline, and carriage return are valid
+        html = '<html><body><p>Line1\tTab\nLine2\rLine3</p></body></html>'
+        cleaned = clean_html(html)
+        self.assertIn('\t', cleaned)
+        self.assertIn('\n', cleaned)
+        self.assertIn('\r', cleaned)
+
+    def test_strip_bad_html_tags_with_invalid_chars(self):
+        """Test that strip_bad_html_tags_insecure handles invalid chars."""
+        html_with_esc = '<html><body><p>Test\x1bString</p></body></html>'
+        
+        # First clean, then strip bad tags
+        cleaned = clean_html(html_with_esc)
+        
+        try:
+            tree = strip_bad_html_tags_insecure(cleaned)
+            text = tree.text_content()
+            self.assertIn('TestString', text)
+            self.assertNotIn('\x1b', text)
+        except XMLSyntaxError:
+            self.fail("XMLSyntaxError raised in strip_bad_html_tags_insecure")
+
+    def test_html_entities_for_invalid_chars(self):
+        """Test that HTML entities for invalid chars are removed."""
+        # Some systems might encode invalid chars as HTML entities
+        html_with_entity = '<html><body><p>Test&#27;String</p></body></html>'
+        cleaned = clean_html(html_with_entity)
+        
+        # The entity should be removed
+        self.assertNotIn('&#27;', cleaned)
+        self.assertNotIn('\x1b', cleaned)
+        
+        # Should be parseable
+        try:
+            tree = fromstring(cleaned)
+            text = tree.text_content()
+            self.assertIn('TestString', text)
+        except XMLSyntaxError:
+            self.fail("XMLSyntaxError raised with HTML entity")
+
+    def test_real_world_docket_text(self):
+        """Test with a more realistic docket entry containing invalid chars."""
+        # Simulate a docket entry that might have escape sequences
+        html = '''
+        <html>
+        <body>
+        <table>
+            <tr>
+                <td>01/15/2020</td>
+                <td>MOTION for Summary Judgment by \x1bDefendant</td>
+            </tr>
+        </table>
+        </body>
+        </html>
+        '''
+        
+        cleaned = clean_html(html)
+        self.assertNotIn('\x1b', cleaned)
+        
+        try:
+            tree = fromstring(cleaned)
+            text = tree.text_content()
+            self.assertIn('Defendant', text)
+        except XMLSyntaxError:
+            self.fail("XMLSyntaxError raised with docket-like HTML")
+
+
+if __name__ == "__main__":
+    unittest.main()
+

From f1e5ddb2ed458efc209e4c452f0d95fa90f16983 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 2 Dec 2025 01:48:36 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../local/test_xml_character_sanitization.py  | 86 +++++++++----------
 1 file changed, 43 insertions(+), 43 deletions(-)

diff --git a/tests/local/test_xml_character_sanitization.py b/tests/local/test_xml_character_sanitization.py
index 0570c7483..24b8fd51f 100644
--- a/tests/local/test_xml_character_sanitization.py
+++ b/tests/local/test_xml_character_sanitization.py
@@ -6,8 +6,8 @@
 
 import unittest
 
-from lxml.html import fromstring
 from lxml.etree import XMLSyntaxError
+from lxml.html import fromstring
 
 from juriscraper.lib.html_utils import clean_html, strip_bad_html_tags_insecure
 
@@ -18,89 +18,90 @@ class XmlCharacterSanitizationTest(unittest.TestCase):
     def test_escape_character_in_html(self):
         """Test that ESC character (\x1b) is removed from HTML."""
         # This is the problematic character from Issue #348
-        html_with_esc = '<html><body><p>Test\x1bString</p></body></html>'
-        
+        html_with_esc = "<html><body><p>Test\x1bString</p></body></html>"
+
         # clean_html should remove the invalid character
         cleaned = clean_html(html_with_esc)
-        self.assertNotIn('\x1b', cleaned)
-        self.assertIn('TestString', cleaned)
-        
+        self.assertNotIn("\x1b", cleaned)
+        self.assertIn("TestString", cleaned)
+
         # Should be parseable without error
         try:
             tree = fromstring(cleaned)
             text = tree.text_content()
-            self.assertIn('TestString', text)
+            self.assertIn("TestString", text)
         except XMLSyntaxError:
             self.fail("XMLSyntaxError raised even after cleaning")
 
     def test_various_invalid_xml_characters(self):
         """Test that various invalid XML characters are removed."""
         invalid_chars = [
-            ('\x00', 'NULL'),
-            ('\x01', 'SOH'),
-            ('\x02', 'STX'),
-            ('\x08', 'BS'),
-            ('\x0b', 'VT'),
-            ('\x0c', 'FF'),
-            ('\x0e', 'SO'),
-            ('\x1b', 'ESC'),
-            ('\x1f', 'US'),
+            ("\x00", "NULL"),
+            ("\x01", "SOH"),
+            ("\x02", "STX"),
+            ("\x08", "BS"),
+            ("\x0b", "VT"),
+            ("\x0c", "FF"),
+            ("\x0e", "SO"),
+            ("\x1b", "ESC"),
+            ("\x1f", "US"),
         ]
-        
+
         for char, name in invalid_chars:
             with self.subTest(char=name):
-                html = f'<html><body><p>Before{char}After</p></body></html>'
+                html = f"<html><body><p>Before{char}After</p></body></html>"
                 cleaned = clean_html(html)
-                self.assertNotIn(char, cleaned, 
-                                f"{name} character should be removed")
-                self.assertIn('BeforeAfter', cleaned)
+                self.assertNotIn(
+                    char, cleaned, f"{name} character should be removed"
+                )
+                self.assertIn("BeforeAfter", cleaned)
 
     def test_valid_xml_characters_preserved(self):
         """Test that valid XML characters are preserved."""
         # Tab, newline, and carriage return are valid
-        html = '<html><body><p>Line1\tTab\nLine2\rLine3</p></body></html>'
+        html = "<html><body><p>Line1\tTab\nLine2\rLine3</p></body></html>"
         cleaned = clean_html(html)
-        self.assertIn('\t', cleaned)
-        self.assertIn('\n', cleaned)
-        self.assertIn('\r', cleaned)
+        self.assertIn("\t", cleaned)
+        self.assertIn("\n", cleaned)
+        self.assertIn("\r", cleaned)
 
     def test_strip_bad_html_tags_with_invalid_chars(self):
         """Test that strip_bad_html_tags_insecure handles invalid chars."""
-        html_with_esc = '<html><body><p>Test\x1bString</p></body></html>'
-        
+        html_with_esc = "<html><body><p>Test\x1bString</p></body></html>"
+
         # First clean, then strip bad tags
         cleaned = clean_html(html_with_esc)
-        
+
         try:
             tree = strip_bad_html_tags_insecure(cleaned)
             text = tree.text_content()
-            self.assertIn('TestString', text)
-            self.assertNotIn('\x1b', text)
+            self.assertIn("TestString", text)
+            self.assertNotIn("\x1b", text)
         except XMLSyntaxError:
             self.fail("XMLSyntaxError raised in strip_bad_html_tags_insecure")
 
     def test_html_entities_for_invalid_chars(self):
         """Test that HTML entities for invalid chars are removed."""
         # Some systems might encode invalid chars as HTML entities
-        html_with_entity = '<html><body><p>Test&#27;String</p></body></html>'
+        html_with_entity = "<html><body><p>Test&#27;String</p></body></html>"
         cleaned = clean_html(html_with_entity)
-        
+
         # The entity should be removed
-        self.assertNotIn('&#27;', cleaned)
-        self.assertNotIn('\x1b', cleaned)
-        
+        self.assertNotIn("&#27;", cleaned)
+        self.assertNotIn("\x1b", cleaned)
+
         # Should be parseable
         try:
             tree = fromstring(cleaned)
             text = tree.text_content()
-            self.assertIn('TestString', text)
+            self.assertIn("TestString", text)
         except XMLSyntaxError:
             self.fail("XMLSyntaxError raised with HTML entity")
 
     def test_real_world_docket_text(self):
         """Test with a more realistic docket entry containing invalid chars."""
         # Simulate a docket entry that might have escape sequences
-        html = '''
+        html = """
         <html>
         <body>
         <table>
@@ -111,19 +112,18 @@ def test_real_world_docket_text(self):
         </table>
         </body>
         </html>
-        '''
-        
+        """
+
         cleaned = clean_html(html)
-        self.assertNotIn('\x1b', cleaned)
-        
+        self.assertNotIn("\x1b", cleaned)
+
         try:
             tree = fromstring(cleaned)
             text = tree.text_content()
-            self.assertIn('Defendant', text)
+            self.assertIn("Defendant", text)
         except XMLSyntaxError:
             self.fail("XMLSyntaxError raised with docket-like HTML")
 
 
 if __name__ == "__main__":
     unittest.main()
-