Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions scanpipe/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2003,16 +2003,20 @@ def size(self):

def fetch(self):
"""Fetch the file from this instance ``download_url`` field."""
from scanpipe.pipes.fetch import fetch_url
from scanpipe.pipes import fetch

if self.exists():
logger.info("The input source file already exists.")
return

if not self.download_url:
raise Exception("No `download_url` value to be fetched.")
raise ValueError("No `download_url` value to be fetched.")

downloaded = fetch_url(url=self.download_url)
is_safe_and_available = fetch.check_url_availability(self.download_url)
if not is_safe_and_available:
raise ValidationError(f"Could not fetch: {self.download_url}")

downloaded = fetch.fetch_url(url=self.download_url)
destination = self.project.move_input_from(downloaded.path)

# Force a commit to the database to ensure the file on disk is not rendered
Expand Down
62 changes: 50 additions & 12 deletions scanpipe/pipes/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

import ipaddress
import json
import logging
import os
import re
import socket
import tempfile
from collections import namedtuple
from pathlib import Path
Expand Down Expand Up @@ -404,19 +406,55 @@
return downloads, errors


def check_urls_availability(urls):
"""Check the accessibility of a list of URLs."""
errors = []
def is_safe_url(url):
"""
Check that a URL does not point to a private or internal network address.
Mitigates SSRF by ensuring the target host resolves only to public IPs.
"""
parsed = urlparse(url)

# Only allow http and https schemes
if parsed.scheme not in ("http", "https"):
return False

# Reject URLs with no hostname
if not parsed.hostname:
return False

# Resolve the hostname to catch internal addresses hidden behind DNS
try:
resolved_ip = socket.gethostbyname(parsed.hostname)
except socket.gaierror:
return False

# Reject private, loopback, link-local, and reserved addresses
ip = ipaddress.ip_address(resolved_ip)
unsafe = (
ip.is_private
or ip.is_loopback
or ip.is_link_local
or ip.is_reserved
or ip.is_multicast
)
return not unsafe

for url in urls:
if not url.startswith("http"):
continue

request_session = get_request_session(url)
try:
response = request_session.head(url, timeout=HTTP_REQUEST_TIMEOUT)
response.raise_for_status()
except requests.exceptions.RequestException:
errors.append(url)
def check_url_availability(url):
"""Check that a URL is safe and accessible."""
if not is_safe_url(url):
return False

request_session = get_request_session(url)
try:
response = request_session.head(url, timeout=HTTP_REQUEST_TIMEOUT)
Comment thread Dismissed
response.raise_for_status()
except requests.exceptions.RequestException:
return False

return True


def check_urls_availability(urls):
"""Check the safety and accessibility of a list of URLs."""
errors = [url for url in urls if not check_url_availability(url)]
return errors
84 changes: 84 additions & 0 deletions scanpipe/tests/pipes/test_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,14 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

import socket
from pathlib import Path
from unittest import mock

from django.test import TestCase
from django.test import override_settings

import requests
from requests import auth as request_auth

from scanpipe.pipes import fetch
Expand Down Expand Up @@ -265,3 +267,85 @@ def test_scanpipe_pipes_fetch_git_repo(self, mock_clone_from):
self.assertEqual("", download.size)
self.assertEqual("", download.sha1)
self.assertEqual("", download.md5)

@mock.patch("scanpipe.pipes.fetch.socket.gethostbyname")
def test_scanpipe_pipes_fetch_is_safe_url(self, mock_gethostbyname):
# Valid public URLs
mock_gethostbyname.return_value = "93.184.216.34" # example.com
self.assertTrue(fetch.is_safe_url("https://example.com/file.zip"))
self.assertTrue(fetch.is_safe_url("http://example.com/file.zip"))

# Invalid schemes
self.assertFalse(fetch.is_safe_url("ftp://example.com/file.zip"))
self.assertFalse(fetch.is_safe_url("docker://example.com/image"))
self.assertFalse(fetch.is_safe_url(""))

# No hostname
self.assertFalse(fetch.is_safe_url("https://"))

# Unresolvable hostname
mock_gethostbyname.side_effect = socket.gaierror
self.assertFalse(fetch.is_safe_url("https://thisdomaindoesnotexist.invalid/"))
mock_gethostbyname.side_effect = None

# Private ranges
mock_gethostbyname.return_value = "192.168.1.1"
self.assertFalse(fetch.is_safe_url("http://192.168.1.1/"))
mock_gethostbyname.return_value = "10.0.0.1"
self.assertFalse(fetch.is_safe_url("http://10.0.0.1/"))
mock_gethostbyname.return_value = "172.16.0.1"
self.assertFalse(fetch.is_safe_url("http://172.16.0.1/"))

# Loopback
mock_gethostbyname.return_value = "127.0.0.1"
self.assertFalse(fetch.is_safe_url("http://127.0.0.1/"))
mock_gethostbyname.return_value = "127.0.0.1"
self.assertFalse(fetch.is_safe_url("http://localhost/"))

# Link-local
mock_gethostbyname.return_value = "169.254.169.254"
self.assertFalse(fetch.is_safe_url("http://169.254.169.254/"))

# Multicast
mock_gethostbyname.return_value = "224.0.0.1"
self.assertFalse(fetch.is_safe_url("http://224.0.0.1/"))

@mock.patch("scanpipe.pipes.fetch.socket.gethostbyname")
@mock.patch("requests.sessions.Session.head")
def test_scanpipe_pipes_fetch_check_url_availability(
self, mock_head, mock_gethostbyname
):
url = "https://example.com/file.zip"

# Safe and accessible URL
mock_gethostbyname.return_value = "93.184.216.34"
mock_head.return_value = make_mock_response(url=url)
self.assertTrue(fetch.check_url_availability(url))

# Unsafe URL
mock_gethostbyname.return_value = "127.0.0.1"
self.assertFalse(fetch.check_url_availability("http://localhost/"))

# Safe URL but request fails
mock_gethostbyname.return_value = "93.184.216.34"
mock_head.side_effect = requests.exceptions.RequestException
self.assertFalse(fetch.check_url_availability(url))

@mock.patch("scanpipe.pipes.fetch.socket.gethostbyname")
@mock.patch("requests.sessions.Session.head")
def test_scanpipe_pipes_fetch_check_urls_availability(
self, mock_head, mock_gethostbyname
):
urls = [
"https://example.com/file.zip",
"https://example.com/archive.tar.gz",
]

# All URLs safe and accessible
mock_gethostbyname.return_value = "93.184.216.34"
mock_head.return_value = make_mock_response(url="mocked_url")
self.assertEqual([], fetch.check_urls_availability(urls))

# All URLs fail
mock_head.side_effect = requests.exceptions.RequestException
self.assertEqual(urls, fetch.check_urls_availability(urls))
6 changes: 5 additions & 1 deletion scanpipe/tests/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -1016,8 +1016,12 @@ def test_scanpipe_management_command_run(self):
self.assertEqual("do_nothing", runs[1]["pipeline_name"])
self.assertEqual(["Group1", "Group2"], runs[1]["selected_groups"])

@mock.patch("scanpipe.pipes.fetch.check_url_availability")
@mock.patch("requests.sessions.Session.get")
def test_scanpipe_management_command_run_multiple_inputs(self, mock_get):
def test_scanpipe_management_command_run_multiple_inputs(
self, mock_get, mock_check_url
):
mock_check_url.return_value = True
source_download_url = "https://example.com/z-source.zip#from"
bin_download_url = "https://example.com/z-bin.zip#to"
mock_get.side_effect = [
Expand Down
4 changes: 3 additions & 1 deletion scanpipe/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1546,10 +1546,12 @@ def test_scanpipe_input_source_model_delete_file(self):
input_source.delete_file()
self.assertFalse(input_source.exists())

@mock.patch("scanpipe.pipes.fetch.check_url_availability")
@mock.patch("requests.sessions.Session.get")
def test_scanpipe_input_source_model_fetch(self, mock_get):
def test_scanpipe_input_source_model_fetch(self, mock_get, mock_check_url):
download_url = "https://download.url/file.zip"
mock_get.return_value = make_mock_response(url=download_url)
mock_check_url.return_value = True

input_source = self.project1.add_input_source(download_url=download_url)
destination = input_source.fetch()
Expand Down
6 changes: 5 additions & 1 deletion scanpipe/tests/test_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,12 @@ def test_scanpipe_pipeline_class_download_inputs_attribute(self):
pipeline.execute()
self.assertNotIn("Step [download_missing_inputs]", run.log)

@mock.patch("scanpipe.pipes.fetch.check_url_availability")
@mock.patch("requests.sessions.Session.get")
def test_scanpipe_pipeline_class_download_missing_inputs(self, mock_get):
def test_scanpipe_pipeline_class_download_missing_inputs(
self, mock_get, mock_check_url
):
mock_check_url.return_value = True
project1 = make_project()
run = project1.add_pipeline("do_nothing")
pipeline = run.make_pipeline_instance()
Expand Down
Loading