@@ -489,3 +489,101 @@ def test_unicode_escape_in_style(self):
489489 with self .subTest (html = html ):
490490 cleaned = clean_html (html )
491491 self .assertEqual ('<div><style>/* deleted */</style></div>' , cleaned )
492+
def test_unicode_escape_mixed_with_comments(self):
    """Check that CSS hex escapes interleaved with CSS comments are caught.

    Each payload is passed to ``clean_html`` and the result must have the
    style content replaced by ``/* deleted */``.
    """
    expected = '<div><style>/* deleted */</style></div>'
    # The hex digits must follow the backslash directly; a space in between
    # would turn the sequence into an escaped space plus plain digits.
    test_cases = [
        # \69 = 'i' with comment before
        '<style>@/*comment*/\\69mport url(evil.css)</style>',
        # \69 = 'i' with comment after
        '<style>@\\69mport/*comment*/ url(evil.css)</style>',
        # Multiple escapes with comments
        # (\65\78 + \70\72\65\73\73\69\6f\6e spells "expression")
        '<style>\\65\\78/*comment*/\\70\\72\\65\\73\\73\\69\\6f\\6e(alert(1))</style>',
    ]

    for html in test_cases:
        with self.subTest(html=html):
            cleaned = clean_html(html)
            self.assertEqual(expected, cleaned)
508+
def test_unicode_escape_case_insensitive(self):
    """Check that hex escapes are decoded regardless of hex-digit case.

    All payloads spell ``@import`` using escapes whose hex digits are
    uppercase, lowercase, or a mixture; each must be cleaned to the
    ``/* deleted */`` placeholder.
    """
    expected = '<div><style>/* deleted */</style></div>'
    # CSS hex escapes should work with both uppercase and lowercase hex digits
    # \69 = 'i', \6D = 'm', \70 = 'p', \6F = 'o', \72 = 'r', \74 = 't'
    test_cases = [
        # @import with uppercase hex digits: \69\6D\70\6F\72\74
        '<style>@\\69\\6D\\70\\6F\\72\\74 url(evil.css)</style>',
        # @import with all-lowercase hex digits: \69\6d\70\6f\72\74
        '<style>@\\69\\6d\\70\\6f\\72\\74 url(evil.css)</style>',
        # @import with mixed case (uppercase \6D, lowercase \6f)
        '<style>@\\69\\6D\\70\\6f\\72\\74 url(evil.css)</style>',
    ]

    for html in test_cases:
        with self.subTest(html=html):
            cleaned = clean_html(html)
            self.assertEqual(expected, cleaned)
525+
def test_unicode_escape_various_schemes(self):
    """Check that hex-escaped script schemes inside ``url(...)`` are caught.

    Each payload hides a scheme name (vbscript, jscript, livescript, mocha)
    behind CSS hex escapes; ``clean_html`` must replace the style content
    with ``/* deleted */`` in every case.
    """
    expected = '<div><style>/* deleted */</style></div>'
    test_cases = [
        # \76\62\73\63\72\69\70\74 = "vbscript"
        '<style>url(\\76\\62\\73\\63\\72\\69\\70\\74:alert(1))</style>',
        # \6a\73\63\72\69\70\74 = "jscript"
        '<style>url(\\6a\\73\\63\\72\\69\\70\\74:alert(1))</style>',
        # \6c\69\76\65\73\63\72\69\70\74 = "livescript"
        '<style>url(\\6c\\69\\76\\65\\73\\63\\72\\69\\70\\74:alert(1))</style>',
        # \6d\6f\63\68\61 = "mocha"
        '<style>url(\\6d\\6f\\63\\68\\61:alert(1))</style>',
    ]

    for html in test_cases:
        with self.subTest(html=html):
            cleaned = clean_html(html)
            self.assertEqual(expected, cleaned)
543+
def test_unicode_escape_with_whitespace_variations(self):
    """Check hex escapes terminated by different whitespace characters.

    The payloads live in a ``style`` attribute and place a tab, newline or
    form feed directly after the ``\\69`` escape. The cleaned output must
    be the element without its ``style`` attribute.
    """
    # NOTE(review): safe_attrs_only=False presumably keeps the style
    # attribute from being dropped by the safe-attribute filter, so that its
    # removal is attributable to the CSS check -- confirm against Cleaner.
    cleaner = Cleaner(safe_attrs_only=False)
    # Exactly one whitespace character follows the escape; anything more
    # would leave a real gap between the decoded 'i' and "mport".
    test_cases = [
        # Tab after escape
        ('<div style="@\\69\tmport url(evil.css)">test</div>', '<div>test</div>'),
        # Newline after escape (a real newline character in the payload,
        # not a backslash followed by 'n')
        ('<div style="@\\69\nmport url(evil.css)">test</div>', '<div>test</div>'),
        # Form feed after escape
        ('<div style="@\\69\fmport url(evil.css)">test</div>', '<div>test</div>'),
    ]

    for html, expected in test_cases:
        with self.subTest(html=html):
            cleaned = cleaner.clean_html(html)
            self.assertEqual(expected, cleaned)
560+
def test_backslash_removal_after_unicode_decode(self):
    """Check payloads that combine hex escapes with plain backslash escapes.

    After decoding Unicode escapes, remaining backslashes are removed, so
    double obfuscation (hex escapes plus backslashes) must still be cleaned
    to the ``/* deleted */`` placeholder.
    """
    expected = '<div><style>/* deleted */</style></div>'
    test_cases = [
        # Step 1: \69 → 'i', Step 2: remove \, Result: @import
        '<style>@\\69\\m\\p\\o\\r\\t url(evil.css)</style>',
        # Multiple unicode escapes with backslashes mixed in
        # (\69 → 'i', \6d → 'm', \6f → 'o'; \p, \r, \t lose their backslash)
        '<style>@\\69\\6d\\p\\6f\\r\\t url(evil.css)</style>',
    ]

    for html in test_cases:
        with self.subTest(html=html):
            cleaned = clean_html(html)
            self.assertEqual(expected, cleaned)
575+
def test_backslash_obfuscation_without_unicode(self):
    """Check payloads obfuscated only with backslashes (no hex escapes).

    Step 1 finds no Unicode escapes; step 2 removes the backslashes, which
    yields ``@import``. Each payload must be cleaned to the
    ``/* deleted */`` placeholder.
    """
    expected = '<div><style>/* deleted */</style></div>'
    test_cases = [
        # @\i\m\p\o\r\t → @import (caught by '@import' check)
        '<style>@\\i\\m\\p\\o\\r\\t url(evil.css)</style>',
        # A single backslash in front of the whole keyword: @\import → @import
        '<style>@\\import url(evil.css)</style>',
    ]

    for html in test_cases:
        with self.subTest(html=html):
            cleaned = clean_html(html)
            self.assertEqual(expected, cleaned)
0 commit comments