Skip to content

Commit e00abcc

Browse files
Merge commit from fork
* Harden XML parser against billion laughs * Lint
1 parent 182f464 commit e00abcc

3 files changed

Lines changed: 76 additions & 1 deletion

File tree

docs/guides/security.rst

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,9 @@ USP minimally parses documents, so should avoid many of the risks seen in more c
1717
>>> pyexpat.EXPAT_VERSION
1818
'expat_2.4.7'
1919
20-
It is recommended to use a version greater than 2.4.0, which should be included in all recent Python versions.
20+
Billion Laughs Attack
21+
---------------------
22+
23+
To harden against the Billion Laughs attack, USP will not parse XML documents that contain a ``DOCTYPE`` or ``ENTITY`` declaration, which should never appear in a sitemap.
24+
25+
Expat versions greater than 2.4.0 are resistent to most Billion Laughs attacks, although some variants are possible in versions below 2.7.2.

tests/tree/test_security.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import textwrap
2+
3+
from tests.tree.base import TreeTestBase
4+
from usp.objects.sitemap import (
5+
InvalidSitemap,
6+
)
7+
from usp.tree import sitemap_tree_for_homepage
8+
9+
10+
class TestTreeSecurity(TreeTestBase):
11+
def test_billion_laughs_attack(self, requests_mock, caplog):
12+
requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher)
13+
requests_mock.get(
14+
self.TEST_BASE_URL + "/robots.txt",
15+
headers={"Content-Type": "text/plain"},
16+
text=textwrap.dedent(
17+
f"""
18+
Sitemap: {self.TEST_BASE_URL}/sitemap.xml
19+
"""
20+
).strip(),
21+
)
22+
23+
requests_mock.get(
24+
self.TEST_BASE_URL + "/sitemap.xml",
25+
headers={"Content-Type": "application/xml"},
26+
text=textwrap.dedent(
27+
f"""
28+
<?xml version="1.0" encoding="UTF-8"?>
29+
<!DOCTYPE lolz [
30+
<!ENTITY lol "lol">
31+
<!ENTITY lol1 "&lol;&lol;&lol;&lol;&lol;&lol;&lol
32+
;&lol;&lol;&lol;">
33+
<!ENTITY lol2 "&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;">
34+
<!ENTITY lol3 "&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;">
35+
<!ENTITY lol4 "&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;">
36+
<!ENTITY lol5 "&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;">
37+
<!ENTITY lol6 "&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;">
38+
<!ENTITY lol7 "&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;">
39+
<!ENTITY lol8 "&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;">
40+
<!ENTITY lol9 "&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;">
41+
]>
42+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
43+
<url>
44+
<loc>{self.TEST_BASE_URL}/page.html</loc>
45+
</url>
46+
</urlset>
47+
"""
48+
).strip(),
49+
)
50+
51+
tree = sitemap_tree_for_homepage(self.TEST_BASE_URL)
52+
sitemaps = list(tree.all_sitemaps())
53+
assert type(sitemaps[-1]) is InvalidSitemap
54+
55+
assert (
56+
"Sitemap contained unexpected non-standard XML DOCTYPE. Parsing not supported for security reasons."
57+
in caplog.text
58+
)

usp/fetch_parse.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,18 @@ def sitemap(self) -> AbstractSitemap:
445445
parser.EndElementHandler = self._xml_element_end
446446
parser.CharacterDataHandler = self._xml_char_data
447447

448+
def _xml_hardening_handler(handler: str):
449+
def _hardening_handler(*args, **kwargs):
450+
raise SitemapXMLParsingException(
451+
f"Sitemap contained unexpected non-standard XML {handler}. Parsing not supported for security reasons."
452+
)
453+
454+
return _hardening_handler
455+
456+
parser.StartDoctypeDeclHandler = _xml_hardening_handler("DOCTYPE")
457+
parser.EntityDeclHandler = _xml_hardening_handler("ENTITY")
458+
parser.SetParamEntityParsing(xml.parsers.expat.XML_PARAM_ENTITY_PARSING_NEVER)
459+
448460
try:
449461
is_final = True
450462
parser.Parse(self._content, is_final)

0 commit comments

Comments
 (0)