Skip to content

Commit 9746150

Browse files
committed
Add more tests for different combinations of backslashes and unicode
1 parent 67e029f commit 9746150

File tree

1 file changed

+98
-0
lines changed

1 file changed

+98
-0
lines changed

tests/test_clean.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,3 +489,101 @@ def test_unicode_escape_in_style(self):
489489
with self.subTest(html=html):
490490
cleaned = clean_html(html)
491491
self.assertEqual('<div><style>/* deleted */</style></div>', cleaned)
492+
493+
def test_unicode_escape_mixed_with_comments(self):
    # CSS comments interleaved with hex escapes must not hide the payload:
    # every <style> body below is expected to be replaced wholesale.
    expected = '<div><style>/* deleted */</style></div>'
    payloads = (
        # comment sits in front of the escaped 'i' (\69)
        '<style>@/*comment*/\\69mport url(evil.css)</style>',
        # comment sits right behind the keyword containing the escaped 'i'
        '<style>@\\69mport/*comment*/ url(evil.css)</style>',
        # "expression" spelled entirely with escapes, split by a comment
        '<style>\\65\\78/*comment*/\\70\\72\\65\\73\\73\\69\\6f\\6e(alert(1))</style>',
    )

    for payload in payloads:
        with self.subTest(html=payload):
            self.assertEqual(expected, clean_html(payload))
508+
509+
def test_unicode_escape_case_insensitive(self):
    # CSS hex escapes must be recognised regardless of the case of their
    # hex digits. In "\69\6D\70\6F\72\74" ("import") only \6D ('m') and
    # \6F ('o') contain a hex letter, so those two escapes are the ones
    # whose case is varied below.
    test_cases = [
        # all uppercase hex letters: \6D and \6F
        '<style>@\\69\\6D\\70\\6F\\72\\74 url(evil.css)</style>',
        # all lowercase hex letters: \6d and \6f
        '<style>@\\69\\6d\\70\\6f\\72\\74 url(evil.css)</style>',
        # mixed case: uppercase \6D together with lowercase \6f
        '<style>@\\69\\6D\\70\\6f\\72\\74 url(evil.css)</style>',
    ]

    for html in test_cases:
        with self.subTest(html=html):
            cleaned = clean_html(html)
            # The whole stylesheet is expected to be dropped.
            self.assertEqual('<div><style>/* deleted */</style></div>', cleaned)
525+
526+
def test_unicode_escape_various_schemes(self):
    # Script-capable URL schemes spelled out with hex escapes inside url();
    # each stylesheet is expected to be dropped entirely.
    expected = '<div><style>/* deleted */</style></div>'
    payloads = (
        # "vbscript"
        '<style>url(\\76\\62\\73\\63\\72\\69\\70\\74:alert(1))</style>',
        # "jscript"
        '<style>url(\\6a\\73\\63\\72\\69\\70\\74:alert(1))</style>',
        # "livescript"
        '<style>url(\\6c\\69\\76\\65\\73\\63\\72\\69\\70\\74:alert(1))</style>',
        # "mocha"
        '<style>url(\\6d\\6f\\63\\68\\61:alert(1))</style>',
    )

    for payload in payloads:
        with self.subTest(html=payload):
            self.assertEqual(expected, clean_html(payload))
543+
544+
def test_unicode_escape_with_whitespace_variations(self):
    # A hex escape may be followed by one whitespace character; try several
    # kinds of whitespace after \69 ('i') inside a style attribute.
    # safe_attrs_only=False keeps the cleaner from stripping the style
    # attribute merely because it is not on the safe-attribute list.
    cleaner = Cleaner(safe_attrs_only=False)
    expected = '<div>test</div>'
    payloads = (
        # tab character after the escape
        '<div style="@\\69\tmport url(evil.css)">test</div>',
        # newline after the escape: '\n' here is a real newline character
        # in the markup, not a backslash followed by the letter 'n'
        '<div style="@\\69\nmport url(evil.css)">test</div>',
        # form feed after the escape
        '<div style="@\\69\fmport url(evil.css)">test</div>',
    )

    for payload in payloads:
        with self.subTest(html=payload):
            self.assertEqual(expected, cleaner.clean_html(payload))
560+
561+
def test_backslash_removal_after_unicode_decode(self):
    # Double obfuscation: hex escapes combined with plain backslashes in
    # front of ordinary letters. Decoding the escapes and then dropping the
    # leftover backslashes yields "@import", so the stylesheet is expected
    # to be dropped.
    expected = '<div><style>/* deleted */</style></div>'
    payloads = (
        # \69 -> 'i'; the remaining letters each carry a stray backslash
        '<style>@\\69\\m\\p\\o\\r\\t url(evil.css)</style>',
        # several hex escapes with backslash-prefixed letters in between
        '<style>@\\69\\6d\\p\\6f\\r\\t url(evil.css)</style>',
    )

    for payload in payloads:
        with self.subTest(html=payload):
            self.assertEqual(expected, clean_html(payload))
575+
576+
def test_backslash_obfuscation_without_unicode(self):
    # Patterns obfuscated ONLY with backslashes (no hex escapes) must be
    # caught as well.
    # Step 1: no unicode escapes to decode, Step 2: remove \,
    # Result: malicious pattern
    test_cases = [
        # A backslash before every letter: @\i\m\p\o\r\t -> @import
        '<style>@\\i\\m\\p\\o\\r\\t url(evil.css)</style>',
        # A single backslash before the keyword: @\import -> @import
        '<style>@\\import url(evil.css)</style>',
    ]

    for html in test_cases:
        with self.subTest(html=html):
            cleaned = clean_html(html)
            # The whole stylesheet is expected to be dropped.
            self.assertEqual('<div><style>/* deleted */</style></div>', cleaned)

0 commit comments

Comments
 (0)