Skip to content

Commit 240eafa

Browse files
authored
Add block to direct docs path access (#2012) (#2026)
1 parent 591814f commit 240eafa

3 files changed

Lines changed: 181 additions & 5 deletions

File tree

core/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ class SourceDocType(Enum):
1111
STATIC_CONTENT_EARLY_EXIT_PATH_PREFIXES = ("releases/",)
1212
# possible library versions are: boost_1_53_0_beta1, 1_82_0, 1_55_0b1
1313
# Version segment of a docs path: an optional "boost_" prefix (group 1) followed
# by the version itself (group 2), e.g. boost_1_53_0_beta1, 1_82_0, 1_55_0b1.
# Kept as a plain string so callers can embed it in larger patterns.
BOOST_VERSION_REGEX = r"(boost_){0,1}([0-9_]*[0-9]+[^/]*)"
# "<version>/<rest of path>"; built from BOOST_VERSION_REGEX so the two
# definitions of "what a version looks like" cannot drift apart.
BOOST_LIB_PATH_RE = re.compile(rf"^{BOOST_VERSION_REGEX}/(.*)")
1415
NO_PROCESS_LIBS = [
1516
# Do nothing with these - just render contents directly
1617
"libs/filesystem",

core/tests/test_views.py

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,3 +356,169 @@ def test_docs_libs_gateway_200_html_transformed(rf, tp, mock_get_file_data):
356356
def test_calendar(rf, tp):
    """The "calendar" URL responds with HTTP 200."""
    calendar_response = tp.get("calendar")
    tp.response_200(calendar_response)
359+
360+
361+
@pytest.mark.django_db
@override_settings(
    CACHES=TEST_CACHES,
)
def test_static_content_blocks_direct_doc_paths(request_factory):
    """Test that direct access to doc paths and library paths is blocked with 404.

    Paths matched by the view's block patterns must raise Http404 before any
    content lookup happens, so the S3 helper is patched and asserted unused.
    """

    # Paths matched by the block patterns (<version>/doc/html/<something> and
    # <version>/libs/<something>).
    blocked_paths = [
        # Original doc/html paths that should be blocked
        "boost_1_53_0_beta1/doc/html/index.html",
        "1_82_0/doc/html/tutorial.html",
        "1_55_0b1/doc/html/reference/api.html",
        "boost_1_86_0/doc/html/deep/nested/path.html",
        "1_75_0/doc/html/simple.html",
        # Edge case: single character file
        "1_82_0/doc/html/a",
        # Library paths that should be blocked
        "boost_1_53_0_beta1/libs/algorithm/doc/index.html",
        "1_82_0/libs/filesystem/doc/index.html",
        "boost_1_86_0/libs/test/doc/reference.html",
        "1_75_0/libs/wave/doc/tutorial.html",
        "boost_1_82_0/libs/any_library/any_file.html",
        "1_55_0b1/libs/serialization/index.html",
        # Edge case: single character lib name
        "1_82_0/libs/a",
    ]

    # These end right after the trailing slash, so the block patterns (which
    # require at least one character after it) do NOT match them. They are
    # still expected to 404, but presumably via the normal content lookup
    # failing in the test environment rather than via the block - TODO confirm.
    unmatched_trailing_slash_paths = [
        "boost_1_53_0_beta1/doc/html/",
        "boost_1_53_0_beta1/libs/",
    ]

    view = StaticContentTemplateView.as_view()

    for content_path in blocked_paths:
        request = request_factory.get(f"/{content_path}")

        # Should raise Http404 without even trying to fetch from S3
        with patch("core.views.get_content_from_s3") as mock_get_content:
            with pytest.raises(Http404):
                view(request, content_path=content_path)
        assert (
            not mock_get_content.called
        ), f"Blocked path should not reach S3: {content_path}"

    for content_path in unmatched_trailing_slash_paths:
        request = request_factory.get(f"/{content_path}")

        with pytest.raises(Http404):
            view(request, content_path=content_path)
398+
399+
400+
@pytest.mark.django_db
@override_settings(CACHES=TEST_CACHES)
def test_static_content_allows_non_direct_doc_paths(request_factory):
    """Paths outside the blocked doc/html and libs patterns are served normally.

    Each path is requested with the S3 helper patched to return content; a 200
    response shows the path was not rejected.
    """

    # Tools paths - should still be allowed (not libs)
    tools_paths = [
        "1_82_0/tools/build/doc/index.html",
        "boost_1_82_0/tools/cmake/doc/reference.html",
    ]
    # Paths with non-boost-version prefixes - should be allowed
    branch_prefixed_paths = [
        "develop/libs/filesystem/doc/index.html",  # develop prefix, not version
        "master/libs/test/doc/reference.html",  # master prefix, not version
    ]
    # Paths without version prefixes
    unversioned_paths = [
        "doc/html/index.html",  # No boost version prefix
        "some/other/doc/html/file.html",  # Different structure
        "libs/algorithm/doc/index.html",  # No version prefix
    ]
    # Paths that don't match the exact patterns
    near_miss_paths = [
        "boost_1_82_0/doc/other/file.html",  # not /doc/html/
        "1_82_0/doc/htmls/file.html",  # not exact /doc/html/
        "1_82_0/documentation/html/file.html",  # not /doc/html/
        "boost_1_82_0/libraries/algorithm/doc/index.html",  # libraries not libs
        "some_other_prefix/libs/algorithm/doc/index.html",  # no boost version
    ]
    allowed_paths = (
        tools_paths + branch_prefixed_paths + unversioned_paths + near_miss_paths
    )

    for content_path in allowed_paths:
        # Fresh payload per request so nothing leaks between iterations.
        s3_payload = {"content": b"test content", "content_type": "text/plain"}
        # Mock S3 to return content so we can test the path isn't blocked
        with patch("core.views.get_content_from_s3", return_value=s3_payload):
            response = call_view(request_factory, content_path)
            status = response.status_code
            # Should get 200 response, not 404 - the main thing is it's not blocked
            assert (
                status == 200
            ), f"Path should be allowed but got {response.status_code}: {content_path}"
438+
439+
440+
def test_boost_version_regex_doc_html_pattern():
    """Test the BOOST_VERSION_REGEX doc/html pattern matches expected version formats.

    For every matching path the captured groups are checked too: group 1 is the
    optional "boost_" prefix (None when absent) and group 2 is the version.
    """
    import re
    from core.constants import BOOST_VERSION_REGEX

    # Test the doc/html blocking pattern used in the view
    doc_html_pattern = re.compile(rf"^{BOOST_VERSION_REGEX}/doc/html/.+$")

    # (path, expected "boost_" prefix group, expected version group)
    matching_cases = [
        ("boost_1_53_0_beta1/doc/html/index.html", "boost_", "1_53_0_beta1"),
        ("1_82_0/doc/html/tutorial.html", None, "1_82_0"),
        ("1_55_0b1/doc/html/reference/api.html", None, "1_55_0b1"),
        ("boost_1_86_0/doc/html/test.html", "boost_", "1_86_0"),
        ("1_75_0/doc/html/simple.html", None, "1_75_0"),
    ]

    for test_path, expected_prefix, expected_version in matching_cases:
        match = doc_html_pattern.match(test_path)
        assert match is not None, f"Doc/html pattern should match: {test_path}"
        # The captured groups should match the expected version parts
        assert match.groups() == (
            expected_prefix,
            expected_version,
        ), f"Unexpected version groups {match.groups()} for: {test_path}"

    # Test cases that should NOT match the doc/html pattern
    non_matching_cases = [
        "1_82_0/tools/build/doc/index.html",  # tools path
        "develop/doc/html/index.html",  # develop prefix, not version
        "doc/html/index.html",  # no version prefix
        "boost_1_82_0/doc/other/file.html",  # not /doc/html/
        "1_82_0/doc/htmls/file.html",  # not exact /doc/html/
        "some/other/doc/html/file.html",  # no boost version
        "boost_1_82_0/doc/html/",  # no file after /doc/html/
        "1_82_0/doc/html",  # no trailing slash or file
        "boost_1_53_0_beta1/libs/algorithm/doc/index.html",  # libs path
    ]

    for test_path in non_matching_cases:
        match = doc_html_pattern.match(test_path)
        assert match is None, f"Doc/html pattern should NOT match: {test_path}"
480+
481+
482+
def test_boost_version_regex_libs_pattern():
    """Test the BOOST_VERSION_REGEX libs pattern matches expected version formats.

    For every matching path the captured groups are checked too: group 1 is the
    optional "boost_" prefix (None when absent) and group 2 is the version.
    """
    import re
    from core.constants import BOOST_VERSION_REGEX

    # Test the libs blocking pattern used in the view
    libs_pattern = re.compile(rf"^{BOOST_VERSION_REGEX}/libs/.+$")

    # (path, expected "boost_" prefix group, expected version group)
    matching_cases = [
        ("boost_1_53_0_beta1/libs/algorithm/doc/index.html", "boost_", "1_53_0_beta1"),
        ("1_82_0/libs/filesystem/doc/index.html", None, "1_82_0"),
        ("boost_1_86_0/libs/test/doc/reference.html", "boost_", "1_86_0"),
        ("1_75_0/libs/wave/doc/tutorial.html", None, "1_75_0"),
        ("boost_1_82_0/libs/any_library/any_file.html", "boost_", "1_82_0"),
        ("1_55_0b1/libs/serialization/index.html", None, "1_55_0b1"),
        ("1_82_0/libs/a", None, "1_82_0"),  # single character lib name
        # no trailing file extension
        ("boost_1_53_0_beta1/libs/algorithm", "boost_", "1_53_0_beta1"),
    ]

    for test_path, expected_prefix, expected_version in matching_cases:
        match = libs_pattern.match(test_path)
        assert match is not None, f"Libs pattern should match: {test_path}"
        # The captured groups should match the expected version parts
        assert match.groups() == (
            expected_prefix,
            expected_version,
        ), f"Unexpected version groups {match.groups()} for: {test_path}"

    # Test cases that should NOT match the libs pattern
    non_matching_cases = [
        "1_82_0/tools/build/doc/index.html",  # tools path
        "develop/libs/filesystem/doc/index.html",  # develop prefix, not version
        "latest/libs/algorithm/doc/index.html",  # latest prefix, not version
        "libs/algorithm/doc/index.html",  # no version prefix
        "boost_1_82_0/libraries/algorithm/doc/index.html",  # libraries not libs
        "some/other/libs/algorithm/file.html",  # no boost version
        "boost_1_82_0/libs",  # no trailing slash or file
        "boost_1_53_0_beta1/libs/",  # just libs with trailing slash (no content after)
        "1_82_0/doc/html/index.html",  # doc/html path
    ]

    for test_path in non_matching_cases:
        match = libs_pattern.match(test_path)
        assert match is None, f"Libs pattern should NOT match: {test_path}"

core/views.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import re
23

34
import requests
45
from django.utils import timezone
@@ -49,6 +50,7 @@
4950
from .constants import (
5051
SourceDocType,
5152
BOOST_LIB_PATH_RE,
53+
BOOST_VERSION_REGEX,
5254
STATIC_CONTENT_EARLY_EXIT_PATH_PREFIXES,
5355
)
5456
from .htmlhelper import (
@@ -287,11 +289,7 @@ def get(self, request, *args, **kwargs):
287289
return redirect(self.content_dict.get("redirect"))
288290

289291
except ContentNotFoundException:
290-
logger.info(
291-
"get_content_from_s3_view_not_in_cache",
292-
content_path=content_path,
293-
status_code=404,
294-
)
292+
logger.info(f"get_content_from_s3_view_not_in_cache {content_path} 404")
295293
raise Http404("Content not found")
296294
return super().get(request, *args, **kwargs)
297295

@@ -465,6 +463,17 @@ def process_content(self, content):
465463

466464

467465
class StaticContentTemplateView(BaseStaticContentTemplateView):
466+
def get(self, request, content_path, *args, **kwargs):
    """Reject direct requests for versioned doc paths, else defer to the parent.

    Args:
        request: The incoming HTTP request.
        content_path: The requested content path captured from the URL.

    Raises:
        Http404: If content_path is "<boost version>/doc/html/<something>" or
            "<boost version>/libs/<something>".
    """
    # filter out direct access to the doc paths
    blocked_patterns = (
        rf"^{BOOST_VERSION_REGEX}/doc/html/.+$",
        rf"^{BOOST_VERSION_REGEX}/libs/.+$",
    )
    # re.match caches compiled patterns internally, so nothing is rebuilt
    # per request beyond the two pattern strings.
    if any(re.match(pattern, content_path) for pattern in blocked_patterns):
        raise Http404("Content not found")
    # content_path was bound to the named parameter above, so it is no longer
    # in **kwargs; pass it on explicitly so the parent receives the same
    # keyword arguments it did before this override existed.
    return super().get(request, *args, content_path=content_path, **kwargs)
476+
468477
def process_content(self, content):
469478
"""Process the content we receive from S3"""
470479
content_html = self.content_dict.get("content")

0 commit comments

Comments
 (0)