diff --git a/scanpipe/models.py b/scanpipe/models.py index 9ab24ab413..c3179d8db9 100644 --- a/scanpipe/models.py +++ b/scanpipe/models.py @@ -2003,16 +2003,20 @@ def size(self): def fetch(self): """Fetch the file from this instance ``download_url`` field.""" - from scanpipe.pipes.fetch import fetch_url + from scanpipe.pipes import fetch if self.exists(): logger.info("The input source file already exists.") return if not self.download_url: - raise Exception("No `download_url` value to be fetched.") + raise ValueError("No `download_url` value to be fetched.") - downloaded = fetch_url(url=self.download_url) + is_safe_and_available = fetch.check_url(self.download_url) + if not is_safe_and_available: + raise ValidationError(f"Could not fetch: {self.download_url}") + + downloaded = fetch.fetch_url(url=self.download_url) destination = self.project.move_input_from(downloaded.path) # Force a commit to the database to ensure the file on disk is not rendered diff --git a/scanpipe/pipes/fetch.py b/scanpipe/pipes/fetch.py index f1b249fec9..f48a765217 100644 --- a/scanpipe/pipes/fetch.py +++ b/scanpipe/pipes/fetch.py @@ -20,10 +20,12 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. +import ipaddress import json import logging import os import re +import socket import tempfile from collections import namedtuple from pathlib import Path @@ -302,7 +304,7 @@ def fetch_docker_image(docker_url, to=None): def fetch_git_repo(url, to=None): - """Fetch provided git ``url`` as a clone and return a ``Download`` object.""" + """Fetch provided git `url` as a clone and return a `Download` object.""" download_directory = to or tempfile.mkdtemp() url = url.rstrip("/") filename = url.split("/")[-1] @@ -324,6 +326,7 @@ def fetch_git_repo(url, to=None): def fetch_package_url(url): + """Fetch a package from the provided `url` and return a `Download` object.""" # Ensure the provided Package URL is valid, or raise a ValueError. purl = PackageURL.from_string(url) @@ -371,7 +374,7 @@ def get_fetcher(url): def fetch_url(url): - """Fetch provided `url` and returns the result as a `Download` object.""" + """Fetch provided `url` and return the result as a `Download` object.""" fetcher = get_fetcher(url) logger.info(f'Fetching "{url}" using {fetcher.__name__}') downloaded = fetcher(url) @@ -404,19 +407,55 @@ def fetch_urls(urls): return downloads, errors -def check_urls_availability(urls): - """Check the accessibility of a list of URLs.""" - errors = [] +def is_safe_url(url): + """ + Check that a URL does not point to a private or internal network address. + Mitigates SSRF by ensuring the target host resolves only to public IPs. + """ + parsed = urlparse(url) + + # Only allow http and https schemes + if parsed.scheme not in ("http", "https"): + return False + + # Reject URLs with no hostname + if not parsed.hostname: + return False + + # Resolve the hostname to catch internal addresses hidden behind DNS + try: + resolved_ip = socket.gethostbyname(parsed.hostname) + except socket.gaierror: + return False + + # Reject private, loopback, link-local, and reserved addresses + ip = ipaddress.ip_address(resolved_ip) + unsafe = ( + ip.is_private + or ip.is_loopback + or ip.is_link_local + or ip.is_reserved + or ip.is_multicast + ) + return not unsafe - for url in urls: - if not url.startswith("http"): - continue - request_session = get_request_session(url) - try: - response = request_session.head(url, timeout=HTTP_REQUEST_TIMEOUT) - response.raise_for_status() - except requests.exceptions.RequestException: - errors.append(url) +def check_url(url): + """Check that a URL is safe and accessible.""" + if not is_safe_url(url): + return False + + request_session = get_request_session(url) + try: + response = request_session.head(url, timeout=HTTP_REQUEST_TIMEOUT) + response.raise_for_status() + except requests.exceptions.RequestException: + return False + return True + + +def check_urls_availability(urls): + """Check the safety and accessibility of a list of URLs.""" + errors = [url for url in urls if not check_url(url)] return errors diff --git a/scanpipe/pipes/matchcode.py b/scanpipe/pipes/matchcode.py index 83b88dd9d8..e57c1b7568 100644 --- a/scanpipe/pipes/matchcode.py +++ b/scanpipe/pipes/matchcode.py @@ -93,7 +93,7 @@ def is_available(): def request_get(url, payload=None, timeout=DEFAULT_TIMEOUT): - """Wrap the HTTP request calls on the API.""" + """Send a GET request to `url` with optional `payload` and return the response.""" if not url: return @@ -113,6 +113,7 @@ def request_get(url, payload=None, timeout=DEFAULT_TIMEOUT): def request_post(url, data=None, headers=None, files=None, timeout=DEFAULT_TIMEOUT): + """Send a POST request with `data` to `url` and return the response.""" try: response = session.post( url, data=data, timeout=timeout, headers=headers, files=files diff --git a/scanpipe/pipes/purldb.py b/scanpipe/pipes/purldb.py index fd35b3c147..6d98540e06 100644 --- a/scanpipe/pipes/purldb.py +++ b/scanpipe/pipes/purldb.py @@ -123,7 +123,7 @@ def check_service_availability(*args): def request_get(url, payload=None, timeout=DEFAULT_TIMEOUT, raise_on_error=False): - """Wrap the HTTP request calls on the API.""" + """Send a GET request to `url` with optional `payload` and return the response.""" if not url: return @@ -147,6 +147,7 @@ def request_get(url, payload=None, timeout=DEFAULT_TIMEOUT, raise_on_error=False def request_post(url, data=None, headers=None, files=None, timeout=DEFAULT_TIMEOUT): + """Send a POST request with `data` to `url` and return the response.""" try: response = session.post( url, data=data, timeout=timeout, headers=headers, files=files @@ -413,6 +414,7 @@ def get_packages_for_purl(package_url): def collect_data_for_purl(package_url, raise_on_error=False): + """Fetch and return PurlDB entries for the provided `package_url`.""" collect_api_url = f"{PURLDB_API_URL}collect/" payload = { "purl": str(package_url), diff --git a/scanpipe/pipes/vulnerablecode.py b/scanpipe/pipes/vulnerablecode.py index 6c6073b5d0..061c8e2028 100644 --- a/scanpipe/pipes/vulnerablecode.py +++ b/scanpipe/pipes/vulnerablecode.py @@ -96,7 +96,7 @@ def request_get( payload=None, timeout=None, ): - """Wrap the HTTP request calls on the API.""" + """Send a GET request to `url` with optional `payload` and return the response.""" if not url: return @@ -118,6 +118,7 @@ def request_post( data, timeout=None, ): + """Send a POST request with `data` as JSON to `url` and return the response.""" try: response = session.post(url, json=data, timeout=timeout) response.raise_for_status() @@ -216,8 +217,8 @@ def fetch_vulnerabilities( packages, chunk_size=1000, logger=logger.info, ignore_set=None ): """ - Fetch and store vulnerabilities for each provided ``packages``. - The PURLs are used for the lookups in batch of ``chunk_size`` per request. + Fetch and store vulnerabilities for each provided `packages`. + The PURLs are used for the lookups in batch of `chunk_size` per request. """ vulnerabilities_by_purl = {} diff --git a/scanpipe/tests/pipes/test_fetch.py b/scanpipe/tests/pipes/test_fetch.py index 6157d2a026..098d96d736 100644 --- a/scanpipe/tests/pipes/test_fetch.py +++ b/scanpipe/tests/pipes/test_fetch.py @@ -20,12 +20,14 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/nexB/scancode.io for support and download. +import socket from pathlib import Path from unittest import mock from django.test import TestCase from django.test import override_settings +import requests from requests import auth as request_auth from scanpipe.pipes import fetch @@ -265,3 +267,83 @@ def test_scanpipe_pipes_fetch_git_repo(self, mock_clone_from): self.assertEqual("", download.size) self.assertEqual("", download.sha1) self.assertEqual("", download.md5) + + @mock.patch("scanpipe.pipes.fetch.socket.gethostbyname") + def test_scanpipe_pipes_fetch_is_safe_url(self, mock_gethostbyname): + # Valid public URLs + mock_gethostbyname.return_value = "93.184.216.34" # example.com + self.assertTrue(fetch.is_safe_url("https://example.com/file.zip")) + self.assertTrue(fetch.is_safe_url("http://example.com/file.zip")) + + # Invalid schemes + self.assertFalse(fetch.is_safe_url("ftp://example.com/file.zip")) + self.assertFalse(fetch.is_safe_url("docker://example.com/image")) + self.assertFalse(fetch.is_safe_url("")) + + # No hostname + self.assertFalse(fetch.is_safe_url("https://")) + + # Unresolvable hostname + mock_gethostbyname.side_effect = socket.gaierror + self.assertFalse(fetch.is_safe_url("https://thisdomaindoesnotexist.invalid/")) + mock_gethostbyname.side_effect = None + + # Private ranges + mock_gethostbyname.return_value = "192.168.1.1" + self.assertFalse(fetch.is_safe_url("http://192.168.1.1/")) + mock_gethostbyname.return_value = "10.0.0.1" + self.assertFalse(fetch.is_safe_url("http://10.0.0.1/")) + mock_gethostbyname.return_value = "172.16.0.1" + self.assertFalse(fetch.is_safe_url("http://172.16.0.1/")) + + # Loopback + mock_gethostbyname.return_value = "127.0.0.1" + self.assertFalse(fetch.is_safe_url("http://127.0.0.1/")) + mock_gethostbyname.return_value = "127.0.0.1" + self.assertFalse(fetch.is_safe_url("http://localhost/")) + + # Link-local + mock_gethostbyname.return_value = "169.254.169.254" + self.assertFalse(fetch.is_safe_url("http://169.254.169.254/")) + + # Multicast + mock_gethostbyname.return_value = "224.0.0.1" + self.assertFalse(fetch.is_safe_url("http://224.0.0.1/")) + + @mock.patch("scanpipe.pipes.fetch.socket.gethostbyname") + @mock.patch("requests.sessions.Session.head") + def test_scanpipe_pipes_fetch_check_url(self, mock_head, mock_gethostbyname): + url = "https://example.com/file.zip" + + # Safe and accessible URL + mock_gethostbyname.return_value = "93.184.216.34" + mock_head.return_value = make_mock_response(url=url) + self.assertTrue(fetch.check_url(url)) + + # Unsafe URL + mock_gethostbyname.return_value = "127.0.0.1" + self.assertFalse(fetch.check_url("http://localhost/")) + + # Safe URL but request fails + mock_gethostbyname.return_value = "93.184.216.34" + mock_head.side_effect = requests.exceptions.RequestException + self.assertFalse(fetch.check_url(url)) + + @mock.patch("scanpipe.pipes.fetch.socket.gethostbyname") + @mock.patch("requests.sessions.Session.head") + def test_scanpipe_pipes_fetch_check_urls_availability( + self, mock_head, mock_gethostbyname + ): + urls = [ + "https://example.com/file.zip", + "https://example.com/archive.tar.gz", + ] + + # All URLs safe and accessible + mock_gethostbyname.return_value = "93.184.216.34" + mock_head.return_value = make_mock_response(url="mocked_url") + self.assertEqual([], fetch.check_urls_availability(urls)) + + # All URLs fail + mock_head.side_effect = requests.exceptions.RequestException + self.assertEqual(urls, fetch.check_urls_availability(urls)) diff --git a/scanpipe/tests/test_commands.py b/scanpipe/tests/test_commands.py index 749fc0269b..e38619cd0c 100644 --- a/scanpipe/tests/test_commands.py +++ b/scanpipe/tests/test_commands.py @@ -1016,8 +1016,12 @@ def test_scanpipe_management_command_run(self): self.assertEqual("do_nothing", runs[1]["pipeline_name"]) self.assertEqual(["Group1", "Group2"], runs[1]["selected_groups"]) + @mock.patch("scanpipe.pipes.fetch.check_url") @mock.patch("requests.sessions.Session.get") - def test_scanpipe_management_command_run_multiple_inputs(self, mock_get): + def test_scanpipe_management_command_run_multiple_inputs( + self, mock_get, mock_check_url + ): + mock_check_url.return_value = True source_download_url = "https://example.com/z-source.zip#from" bin_download_url = "https://example.com/z-bin.zip#to" mock_get.side_effect = [ diff --git a/scanpipe/tests/test_models.py b/scanpipe/tests/test_models.py index 0251e7b729..8313fcf761 100644 --- a/scanpipe/tests/test_models.py +++ b/scanpipe/tests/test_models.py @@ -1546,10 +1546,12 @@ def test_scanpipe_input_source_model_delete_file(self): input_source.delete_file() self.assertFalse(input_source.exists()) + @mock.patch("scanpipe.pipes.fetch.check_url") @mock.patch("requests.sessions.Session.get") - def test_scanpipe_input_source_model_fetch(self, mock_get): + def test_scanpipe_input_source_model_fetch(self, mock_get, mock_check_url): download_url = "https://download.url/file.zip" mock_get.return_value = make_mock_response(url=download_url) + mock_check_url.return_value = True input_source = self.project1.add_input_source(download_url=download_url) destination = input_source.fetch() diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index 2c7dad3c10..d68c732747 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -197,8 +197,12 @@ def test_scanpipe_pipeline_class_download_inputs_attribute(self): pipeline.execute() self.assertNotIn("Step [download_missing_inputs]", run.log) + @mock.patch("scanpipe.pipes.fetch.check_url") @mock.patch("requests.sessions.Session.get") - def test_scanpipe_pipeline_class_download_missing_inputs(self, mock_get): + def test_scanpipe_pipeline_class_download_missing_inputs( + self, mock_get, mock_check_url + ): + mock_check_url.return_value = True project1 = make_project() run = project1.add_pipeline("do_nothing") pipeline = run.make_pipeline_instance()