Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,21 @@ lxml_html_clean changelog
Unreleased
==========

0.4.4 (2026-02-26)
==================

Bugs fixed
----------

* Fixed a bug where Unicode escapes in CSS were not properly decoded
before security checks. This prevents attackers from bypassing filters
using escape sequences.
* Fixed a security issue where ``<base>`` tags could be used for URL
hijacking attacks. The ``<base>`` tag is now automatically removed
whenever the ``<head>`` tag is removed (via ``page_structure=True``
or manual configuration), as ``<base>`` must be inside ``<head>``
according to HTML specifications.

0.4.3 (2025-10-02)
==================

Expand Down
28 changes: 27 additions & 1 deletion lxml_html_clean/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,12 @@ def __call__(self, doc):
if self.annoying_tags:
remove_tags.update(('blink', 'marquee'))

# Remove <base> tags whenever <head> is being removed.
# According to HTML spec, <base> must be in <head>, but browsers
# may interpret it even when misplaced, allowing URL hijacking attacks.
if 'head' in kill_tags or 'head' in remove_tags:
kill_tags.add('base')

_remove = deque()
_kill = deque()
for el in doc.iter():
Expand Down Expand Up @@ -578,6 +584,26 @@ def _remove_javascript_link(self, link):
_comments_re = re.compile(r'/\*.*?\*/', re.S)
_find_comments = _comments_re.finditer
_substitute_comments = _comments_re.sub
_css_unicode_escape_re = re.compile(r'\\([0-9a-fA-F]{1,6})\s?')

def _decode_css_unicode_escapes(self, style):
    """
    Replace CSS Unicode escape sequences (e.g. ``\\69`` or ``\\000069``)
    with the characters they encode, so that the security checks run
    against the decoded text rather than an obfuscated escape form.

    CSS escape syntax: a backslash, followed by 1-6 hex digits, and an
    optional single whitespace character that terminates the escape.
    """
    def _expand(match):
        digits = match.group(1)
        try:
            return chr(int(digits, 16))
        except (ValueError, OverflowError):
            # Not a valid Unicode codepoint -- leave the escape as-is.
            return match.group(0)

    return self._css_unicode_escape_re.sub(_expand, style)

def _has_sneaky_javascript(self, style):
"""
Expand All @@ -591,7 +617,7 @@ def _has_sneaky_javascript(self, style):
more sneaky attempts.
"""
style = self._substitute_comments('', style)
style = style.replace('\\', '')
style = self._decode_css_unicode_escapes(style)
style = _substitute_whitespace('', style)
style = style.lower()
if _has_javascript_scheme(style):
Expand Down
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = lxml_html_clean
version = 0.4.3
version = 0.4.4
description = HTML cleaner from lxml project
long_description = file:README.md
long_description_content_type = text/markdown
Expand All @@ -19,6 +19,7 @@ classifiers =
Programming Language :: Python :: 3.11
Programming Language :: Python :: 3.12
Programming Language :: Python :: 3.13
Programming Language :: Python :: 3.14

[options]
packages =
Expand Down
96 changes: 96 additions & 0 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,3 +393,99 @@ def test_possibly_invalid_url_without_whitelist(self):
self.assertEqual(len(w), 0)
self.assertNotIn("google.com", result)
self.assertNotIn("example.com", result)

def test_base_tag_removed_with_page_structure(self):
    # With the default page_structure=True, every <base> tag must be
    # stripped, since a surviving <base> would let an attacker redirect
    # all relative URLs on the page (URL hijacking).
    samples = (
        # <base> in proper location (inside <head>)
        '<html><head><base href="http://evil.com/"></head><body><a href="page.html">link</a></body></html>',
        # <base> outside <head>
        '<div><base href="http://evil.com/"><a href="page.html">link</a></div>',
        # Multiple <base> tags
        '<base href="http://evil.com/"><div><base href="http://evil2.com/"></div>',
        # <base> with target attribute
        '<base target="_blank"><div>content</div>',
        # <base> at various positions
        '<html><base href="http://evil.com/"><body>test</body></html>',
    )

    for markup in samples:
        with self.subTest(html=markup):
            result = clean_html(markup)
            # The tag and every attacker-controlled URL must be gone.
            self.assertNotIn('base', result.lower())
            self.assertNotIn('evil.com', result)
            self.assertNotIn('evil2.com', result)

def test_base_tag_kept_when_page_structure_false(self):
    # If <head> survives (page_structure=False), a well-placed <base>
    # is legitimate markup and must be preserved.
    markup = '<html><head><base href="http://example.com/"></head><body>test</body></html>'
    result = Cleaner(page_structure=False).clean_html(markup)
    self.assertIn('<base href="http://example.com/">', result)

def test_base_tag_removed_when_head_in_remove_tags(self):
    # Manually listing 'head' in remove_tags must also kill <base>,
    # even though page_structure is disabled.
    markup = '<html><head><base href="http://evil.com/"></head><body>test</body></html>'
    result = Cleaner(page_structure=False, remove_tags=['head']).clean_html(markup)
    self.assertNotIn('base', result.lower())
    self.assertNotIn('evil.com', result)

def test_base_tag_removed_when_head_in_kill_tags(self):
    # Listing 'head' in kill_tags must likewise kill <base>,
    # even though page_structure is disabled.
    markup = '<html><head><base href="http://evil.com/"></head><body>test</body></html>'
    result = Cleaner(page_structure=False, kill_tags=['head']).clean_html(markup)
    self.assertNotIn('base', result.lower())
    self.assertNotIn('evil.com', result)

def test_unicode_escape_in_style(self):
    # CSS Unicode escapes (\H to \HHHHHH, optional trailing space) must
    # be decoded before the security checks run, otherwise an attacker
    # can smuggle "javascript:", "expression(...)" or "@import" past
    # the filters in escaped form.

    # Inline style attributes are only kept with safe_attrs_only=False;
    # the cleaner must still drop the whole attribute once decoded.
    attr_cleaner = Cleaner(safe_attrs_only=False)
    attr_cases = (
        # \6a\61\76\61\73\63\72\69\70\74 = "javascript"
        ('<div style="background: url(\\6a\\61\\76\\61\\73\\63\\72\\69\\70\\74:alert(1))">test</div>', '<div>test</div>'),
        # \69 = 'i', so \69mport = "import"
        ('<div style="@\\69mport url(evil.css)">test</div>', '<div>test</div>'),
        # \69 with space after = 'i', space consumed as part of escape
        ('<div style="@\\69 mport url(evil.css)">test</div>', '<div>test</div>'),
        # \65\78\70\72\65\73\73\69\6f\6e = "expression"
        ('<div style="\\65\\78\\70\\72\\65\\73\\73\\69\\6f\\6e(alert(1))">test</div>', '<div>test</div>'),
    )
    for markup, expected in attr_cases:
        with self.subTest(html=markup):
            self.assertEqual(expected, attr_cleaner.clean_html(markup))

    # <style> tag content goes through the default clean_html path;
    # any sneaky payload must be replaced wholesale.
    tag_cases = (
        # Unicode-escaped "javascript:" in url()
        '<style>url(\\6a\\61\\76\\61\\73\\63\\72\\69\\70\\74:alert(1))</style>',
        # Unicode-escaped "javascript:" without url()
        '<style>\\6a\\61\\76\\61\\73\\63\\72\\69\\70\\74:alert(1)</style>',
        # Unicode-escaped "expression"
        '<style>\\65\\78\\70\\72\\65\\73\\73\\69\\6f\\6e(alert(1))</style>',
        # Unicode-escaped @import with 'i'
        '<style>@\\69mport url(evil.css)</style>',
        # Unicode-escaped "data:" scheme
        '<style>url(\\64\\61\\74\\61:image/svg+xml;base64,PHN2ZyBvbmxvYWQ9YWxlcnQoMSk+)</style>',
        # Space after escape is consumed: \69 mport = "import"
        '<style>@\\69 mport url(evil.css)</style>',
        # 6-digit escape: \000069 = 'i'
        '<style>@\\000069mport url(evil.css)</style>',
        # 6-digit escape with space
        '<style>@\\000069 mport url(evil.css)</style>',
    )
    for markup in tag_cases:
        with self.subTest(html=markup):
            self.assertEqual('<div><style>/* deleted */</style></div>', clean_html(markup))