@@ -394,6 +394,54 @@ def test_possibly_invalid_url_without_whitelist(self):
394394 self .assertNotIn ("google.com" , result )
395395 self .assertNotIn ("example.com" , result )
396396
def test_base_tag_removed_with_page_structure(self):
    # clean_html() is called with its defaults here (page_structure=True
    # according to the original note), and in that mode every <base>
    # element must be dropped. A surviving <base href> would re-point all
    # relative URLs of the document at a host chosen by the attacker.
    hostile_documents = (
        # regular placement: <base> inside <head>
        '<html><head><base href="http://evil.com/"></head><body><a href="page.html">link</a></body></html>',
        # <base> appearing outside of <head>
        '<div><base href="http://evil.com/"><a href="page.html">link</a></div>',
        # more than one <base> in the same document
        '<base href="http://evil.com/"><div><base href="http://evil2.com/"></div>',
        # <base> that only carries a target attribute
        '<base target="_blank"><div>content</div>',
        # <base> as a direct child of <html>
        '<html><base href="http://evil.com/"><body>test</body></html>',
    )
    attacker_hosts = ('evil.com', 'evil2.com')

    for document in hostile_documents:
        with self.subTest(html=document):
            sanitized = clean_html(document)
            # The tag name must not occur anywhere, whatever its letter case.
            self.assertNotIn('base', sanitized.lower())
            # None of the attacker-controlled hosts may leak into the output.
            for host in attacker_hosts:
                self.assertNotIn(host, sanitized)
421+
def test_base_tag_kept_when_page_structure_false(self):
    # With page_structure=False and <head> not being removed, the <base>
    # element is expected to pass through the cleaner unchanged.
    document = '<html><head><base href="http://example.com/"></head><body>test</body></html>'
    sanitized = Cleaner(page_structure=False).clean_html(document)
    self.assertIn('<base href="http://example.com/">', sanitized)
428+
def test_base_tag_removed_when_head_in_remove_tags(self):
    # page_structure=False alone would keep <base>, but listing 'head' in
    # remove_tags must still cause <base> to be stripped from the output.
    document = '<html><head><base href="http://evil.com/"></head><body>test</body></html>'
    head_stripping_cleaner = Cleaner(page_structure=False, remove_tags=['head'])
    sanitized = head_stripping_cleaner.clean_html(document)

    # Check the lowercased output for the tag name first, then the raw
    # output for the attacker host.
    for needle, haystack in (('base', sanitized.lower()), ('evil.com', sanitized)):
        self.assertNotIn(needle, haystack)
436+
def test_base_tag_removed_when_head_in_kill_tags(self):
    # page_structure=False alone would keep <base>, but listing 'head' in
    # kill_tags must still cause <base> to be stripped from the output.
    document = '<html><head><base href="http://evil.com/"></head><body>test</body></html>'
    head_killing_cleaner = Cleaner(page_structure=False, kill_tags=['head'])
    sanitized = head_killing_cleaner.clean_html(document)

    # Check the lowercased output for the tag name first, then the raw
    # output for the attacker host.
    for needle, haystack in (('base', sanitized.lower()), ('evil.com', sanitized)):
        self.assertNotIn(needle, haystack)
444+
397445 def test_unicode_escape_in_style (self ):
398446 # Test that CSS Unicode escapes are properly decoded before security checks
399447 # This prevents attackers from bypassing filters using escape sequences
0 commit comments