Skip to content

Commit 9c5612c

Browse files
committed
Remove <base> tags to prevent URL hijacking attacks
<base> tags are now automatically removed whenever <head> is removed, to prevent URL hijacking attacks. According to the HTML spec, <base> must appear inside <head>, but browsers may still honor a misplaced <base> tag, allowing attackers to redirect all relative URLs to a malicious server.
1 parent 2ef7326 commit 9c5612c

File tree

3 files changed

+59
-0
lines changed

3 files changed

+59
-0
lines changed

CHANGES.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@ Bugs fixed
1212
* Fixed a bug where Unicode escapes in CSS were not properly decoded
1313
before security checks. This prevents attackers from bypassing filters
1414
using escape sequences.
15+
* Fixed a security issue where ``<base>`` tags could be used for URL
16+
hijacking attacks. The ``<base>`` tag is now automatically removed
17+
whenever the ``<head>`` tag is removed (via ``page_structure=True``
18+
or by listing ``head`` in ``remove_tags`` or ``kill_tags``), as ``<base>`` must be inside ``<head>``
19+
according to HTML specifications.
1520

1621
0.4.3 (2025-10-02)
1722
==================

lxml_html_clean/clean.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,12 @@ def __call__(self, doc):
422422
if self.annoying_tags:
423423
remove_tags.update(('blink', 'marquee'))
424424

425+
# Remove <base> tags whenever <head> is being removed.
426+
# According to the HTML spec, <base> must be in <head>, but browsers
427+
# may interpret it even when misplaced, allowing URL hijacking attacks.
428+
if 'head' in kill_tags or 'head' in remove_tags:
429+
kill_tags.add('base')
430+
425431
_remove = deque()
426432
_kill = deque()
427433
for el in doc.iter():

tests/test_clean.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,54 @@ def test_possibly_invalid_url_without_whitelist(self):
394394
self.assertNotIn("google.com", result)
395395
self.assertNotIn("example.com", result)
396396

397+
def test_base_tag_removed_with_page_structure(self):
398+
# Test that <base> tags are removed when page_structure=True (default)
399+
# This prevents URL hijacking attacks where <base> redirects all relative URLs
400+
401+
test_cases = [
402+
# <base> in proper location (inside <head>)
403+
'<html><head><base href="http://evil.com/"></head><body><a href="page.html">link</a></body></html>',
404+
# <base> outside <head>
405+
'<div><base href="http://evil.com/"><a href="page.html">link</a></div>',
406+
# Multiple <base> tags
407+
'<base href="http://evil.com/"><div><base href="http://evil2.com/"></div>',
408+
# <base> with target attribute
409+
'<base target="_blank"><div>content</div>',
410+
# <base> directly under <html>, outside <head>
411+
'<html><base href="http://evil.com/"><body>test</body></html>',
412+
]
413+
414+
for html in test_cases:
415+
with self.subTest(html=html):
416+
cleaned = clean_html(html)
417+
# Verify <base> tag is completely removed
418+
self.assertNotIn('base', cleaned.lower())
419+
self.assertNotIn('evil.com', cleaned)
420+
self.assertNotIn('evil2.com', cleaned)
421+
422+
def test_base_tag_kept_when_page_structure_false(self):
423+
# When page_structure=False and head is not removed, <base> should be kept
424+
cleaner = Cleaner(page_structure=False)
425+
html = '<html><head><base href="http://example.com/"></head><body>test</body></html>'
426+
cleaned = cleaner.clean_html(html)
427+
self.assertIn('<base href="http://example.com/">', cleaned)
428+
429+
def test_base_tag_removed_when_head_in_remove_tags(self):
430+
# Even with page_structure=False, <base> should be removed if head is manually removed
431+
cleaner = Cleaner(page_structure=False, remove_tags=['head'])
432+
html = '<html><head><base href="http://evil.com/"></head><body>test</body></html>'
433+
cleaned = cleaner.clean_html(html)
434+
self.assertNotIn('base', cleaned.lower())
435+
self.assertNotIn('evil.com', cleaned)
436+
437+
def test_base_tag_removed_when_head_in_kill_tags(self):
438+
# Even with page_structure=False, <base> should be removed if head is in kill_tags
439+
cleaner = Cleaner(page_structure=False, kill_tags=['head'])
440+
html = '<html><head><base href="http://evil.com/"></head><body>test</body></html>'
441+
cleaned = cleaner.clean_html(html)
442+
self.assertNotIn('base', cleaned.lower())
443+
self.assertNotIn('evil.com', cleaned)
444+
397445
def test_unicode_escape_in_style(self):
398446
# Test that CSS Unicode escapes are properly decoded before security checks
399447
# This prevents attackers from bypassing filters using escape sequences

0 commit comments

Comments
 (0)