Skip to content

Commit 65c725c

Browse files
authored
fix: stream scan results data instead of silencing timeouts (#481)
Signed-off-by: tdruez <tdruez@aboutcode.org>
1 parent b48d3e8 commit 65c725c

6 files changed

Lines changed: 58 additions & 11 deletions

File tree

component_catalog/api.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111
from django.db import transaction
1212
from django.forms.widgets import HiddenInput
1313
from django.http import FileResponse
14+
from django.http.response import StreamingHttpResponse
1415

1516
import django_filters
17+
import requests
1618
from packageurl.contrib import url2purl
1719
from packageurl.contrib.django.filters import PackageURLFilter
1820
from rest_framework import serializers
@@ -879,6 +881,11 @@ class ScanDataUnavailable(APIException):
879881
default_detail = "Scan data is not available"
880882

881883

884+
class ScanFetchError(APIException):
885+
status_code = status.HTTP_400_BAD_REQUEST
886+
default_detail = "Could not fetch scan data"
887+
888+
882889
class PackageViewSet(
883890
SendAboutFilesMixin,
884891
AboutCodeFilesActionMixin,
@@ -956,17 +963,29 @@ def scan_info(self, request, uuid):
956963

957964
@action(detail=True, name="Scan results")
958965
def scan_results(self, request, uuid):
959-
"""Return the scan results from ScanCode.io."""
966+
"""
967+
Stream scan results directly from ScanCode.io back to the client.
968+
969+
The response body is not loaded in memory but proxied chunk by chunk,
970+
making it suitable for large scan result payloads.
971+
"""
960972
package = self.get_object()
961973
dataspace = request.user.dataspace
962974
scancodeio = ScanCodeIO(dataspace)
963975
project_info = self._get_scancodeio_project_info(scancodeio, package)
964976

965977
project_uuid = project_info.get("uuid")
966978
scan_results_url = scancodeio.get_scan_action_url(project_uuid, "results")
967-
scan_results = scancodeio.fetch_scan_data(scan_results_url)
968979

969-
return Response(scan_results)
980+
try:
981+
scan_response = scancodeio.stream_scan_data(scan_results_url)
982+
except requests.RequestException:
983+
raise ScanFetchError()
984+
985+
return StreamingHttpResponse(
986+
scan_response.iter_content(chunk_size=8192),
987+
content_type=scan_response.headers.get("Content-Type", "application/json"),
988+
)
970989

971990
@action(detail=True, name="Scan summary")
972991
def scan_summary(self, request, uuid):

component_catalog/tests/test_api.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from django.test import override_settings
1818
from django.urls import reverse
1919

20+
import requests
2021
from rest_framework import status
2122
from rest_framework.exceptions import ErrorDetail
2223
from rest_framework.test import APIClient
@@ -1523,19 +1524,32 @@ def test_api_package_viewset_scan_info_action(self, mock_is_available, mock_get_
15231524
self.assertEqual(project_info, response.data)
15241525

15251526
@mock.patch("dejacode_toolkit.scancodeio.ScanCodeIO.get_project_info")
1526-
@mock.patch("dejacode_toolkit.scancodeio.ScanCodeIO.fetch_scan_data")
1527+
@mock.patch("dejacode_toolkit.scancodeio.ScanCodeIO.stream_scan_data")
15271528
@mock.patch("dejacode_toolkit.scancodeio.ScanCodeIO.is_available")
15281529
def test_api_package_viewset_scan_results_action(
1529-
self, mock_is_available, mock_fetch_scan_data, mock_get_project_info
1530+
self, mock_is_available, mock_stream_scan_data, mock_get_project_info
15301531
):
15311532
self.client.login(username=self.base_user.username, password="secret")
15321533
action_url = reverse("api_v2:package-scan-results", args=[self.package1.uuid])
15331534
mock_is_available.return_value = True
15341535
mock_get_project_info.return_value = {"uuid": "abcdef"}
1535-
mock_fetch_scan_data.return_value = {"results": ""}
1536+
1537+
mock_stream_scan_data.side_effect = requests.RequestException
1538+
response = self.client.get(action_url)
1539+
self.assertEqual(400, response.status_code)
1540+
error = {"detail": ErrorDetail(string="Could not fetch scan data", code="error")}
1541+
self.assertEqual(error, response.data)
1542+
1543+
mock_response = mock.Mock()
1544+
mock_response.iter_content.return_value = iter([b'{"results": ""}'])
1545+
mock_response.headers = {"Content-Type": "application/json"}
1546+
mock_stream_scan_data.side_effect = None
1547+
mock_stream_scan_data.return_value = mock_response
1548+
15361549
response = self.client.get(action_url)
15371550
self.assertEqual(200, response.status_code)
1538-
self.assertEqual({"results": ""}, response.data)
1551+
self.assertEqual(b'{"results": ""}', b"".join(response.streaming_content))
1552+
self.assertEqual("application/json", response.headers["Content-Type"])
15391553

15401554
@mock.patch("dejacode_toolkit.scancodeio.ScanCodeIO.get_project_info")
15411555
@mock.patch("dejacode_toolkit.scancodeio.ScanCodeIO.fetch_scan_data")

dejacode/settings.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,9 @@ def get_fake_redis_connection(config, use_strict_redis):
696696
# during the Django 6.x release cycle.
697697
URLIZE_ASSUME_HTTPS = env.bool("DEJACODE_URLIZE_ASSUME_HTTPS", default=True)
698698

699+
# Default to 5 seconds.
700+
DEJACODE_INTEGRATION_REQUESTS_TIMEOUT = env.int("DEJACODE_INTEGRATION_REQUESTS_TIMEOUT", default=5)
701+
699702
if IS_TESTS:
700703
# Silent the django-axes logging during tests
701704
LOGGING["loggers"].update({"axes": {"handlers": ["null"]}})

dejacode_toolkit/__init__.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,13 @@ def get_settings(var_name, default=None):
2121
return getenv(var_name) or getattr(settings, var_name, default)
2222

2323

24+
REQUESTS_TIMEOUT = get_settings("DEJACODE_INTEGRATION_REQUESTS_TIMEOUT", default=5)
25+
26+
2427
def is_service_available(label, session, url, raise_exceptions):
2528
"""Check if a configured integration service is available."""
2629
try:
27-
response = session.head(url, timeout=5)
30+
response = session.head(url, timeout=REQUESTS_TIMEOUT)
2831
response.raise_for_status()
2932
except requests.exceptions.RequestException as request_exception:
3033
logger.debug(f"{label} is_available() error: {request_exception}")
@@ -40,7 +43,7 @@ class BaseService:
4043
settings_prefix = None
4144
url_field_name = None
4245
api_key_field_name = None
43-
default_timeout = 5
46+
default_timeout = REQUESTS_TIMEOUT
4447

4548
def __init__(self, dataspace):
4649
if not dataspace:

dejacode_toolkit/scancodeio.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,8 +146,16 @@ def fetch_scan_data(self, data_url):
146146
return self.request_get(url=data_url)
147147

148148
def stream_scan_data(self, data_url):
149+
"""
150+
Stream scan data from the given URL.
151+
152+
With stream=True, only headers are fetched initially, so raise_for_status()
153+
can fail fast on errors before any body content is downloaded.
154+
"""
149155
logger.debug(f"{self.label}: stream scan data data_url={data_url}")
150-
return self.session.get(url=data_url, stream=True)
156+
response = self.session.get(url=data_url, stream=True)
157+
response.raise_for_status()
158+
return response
151159

152160
def delete_scan(self, detail_url):
153161
logger.debug(f"{self.label}: delete scan detail_url={detail_url}")

dejacode_toolkit/vulnerablecode.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def get_vulnerable_cpes(self, components):
126126
if not cpes:
127127
return []
128128

129-
search_results = self.bulk_search_by_cpes(cpes, timeout=5)
129+
search_results = self.bulk_search_by_cpes(cpes)
130130
if not search_results:
131131
return []
132132

0 commit comments

Comments
 (0)