Skip to content

Commit 5755b10

Browse files
fix: add URL validation in get_imdbtop.py.DISABLED
The code makes external HTTP requests using requests
1 parent 841e947 commit 5755b10

File tree

1 file changed

+28
-1
lines changed

1 file changed

+28
-1
lines changed

web_programming/get_imdbtop.py.DISABLED

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,31 @@
1+
from urllib.parse import urlparse
2+
13
import bs4
24
import requests
35

6+
ALLOWED_HOSTS = {"www.imdb.com"}
7+
8+
9+
def _validate_url(url: str) -> str:
10+
"""Validate a URL against an allowlist of trusted hosts to prevent SSRF attacks.
11+
12+
Args:
13+
url: The URL to validate.
14+
15+
Returns:
16+
The original URL if valid.
17+
18+
Raises:
19+
ValueError: If the URL scheme is not HTTPS or the host is not in the allowlist.
20+
"""
21+
parsed = urlparse(url)
22+
if parsed.scheme != "https" or parsed.hostname not in ALLOWED_HOSTS:
23+
raise ValueError(
24+
f"URL '{url}' is not allowed. Only HTTPS requests to "
25+
f"{ALLOWED_HOSTS} are permitted."
26+
)
27+
return url
28+
429

530
def get_movie_data_from_soup(soup: bs4.element.ResultSet) -> dict[str, str]:
631
return {
@@ -35,7 +60,9 @@ def get_imdb_top_movies(num_movies: int = 5) -> tuple:
3560
"https://www.imdb.com/search/title?title_type="
3661
f"feature&sort=num_votes,desc&count={num_movies}"
3762
)
38-
source = bs4.BeautifulSoup(requests.get(base_url).content, "html.parser")
63+
source = bs4.BeautifulSoup(
64+
requests.get(_validate_url(base_url)).content, "html.parser"
65+
)
3966
return tuple(
4067
get_movie_data_from_soup(movie)
4168
for movie in source.find_all("div", class_="lister-item mode-advanced")

0 commit comments

Comments
 (0)