Skip to content

Commit 289fb5f

Browse files
committed
Update the pipeline to use the https://web.archive.org/web/ endpoint, with no complex logic.
Add a test.
Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent 1b5b652 commit 289fb5f

File tree

6 files changed

+105
-404
lines changed

6 files changed

+105
-404
lines changed

vulnerabilities/improvers/__init__.py

Lines changed: 37 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -38,43 +38,43 @@
3838

3939
IMPROVERS_REGISTRY = create_registry(
4040
[
41-
valid_versions.GitHubBasicImprover,
42-
valid_versions.GitLabBasicImprover,
43-
valid_versions.NginxBasicImprover,
44-
valid_versions.ApacheHTTPDImprover,
45-
valid_versions.DebianBasicImprover,
46-
valid_versions.NpmImprover,
47-
valid_versions.ElixirImprover,
48-
valid_versions.ApacheTomcatImprover,
49-
valid_versions.ApacheKafkaImprover,
50-
valid_versions.IstioImprover,
51-
valid_versions.DebianOvalImprover,
52-
valid_versions.UbuntuOvalImprover,
53-
valid_versions.OSSFuzzImprover,
54-
valid_versions.RubyImprover,
55-
valid_versions.GithubOSVImprover,
56-
vulnerability_status.VulnerabilityStatusImprover,
57-
valid_versions.CurlImprover,
58-
flag_ghost_packages.FlagGhostPackagePipeline,
59-
enhance_with_kev.VulnerabilityKevPipeline,
60-
enhance_with_metasploit.MetasploitImproverPipeline,
61-
enhance_with_exploitdb.ExploitDBImproverPipeline,
62-
compute_package_risk.ComputePackageRiskPipeline,
63-
compute_package_version_rank.ComputeVersionRankPipeline,
64-
add_cvss31_to_CVEs.CVEAdvisoryMappingPipeline,
65-
remove_duplicate_advisories.RemoveDuplicateAdvisoriesPipeline,
66-
populate_vulnerability_summary_pipeline.PopulateVulnerabilitySummariesPipeline,
67-
exploitdb_v2.ExploitDBImproverPipeline,
68-
enhance_with_kev_v2.VulnerabilityKevPipeline,
69-
flag_ghost_packages_v2.FlagGhostPackagePipeline,
70-
enhance_with_metasploit_v2.MetasploitImproverPipeline,
71-
compute_package_risk_v2.ComputePackageRiskPipeline,
72-
compute_version_rank_v2.ComputeVersionRankPipeline,
73-
compute_advisory_todo_v2.ComputeToDo,
74-
unfurl_version_range_v2.UnfurlVersionRangePipeline,
75-
compute_advisory_todo.ComputeToDo,
76-
collect_ssvc_trees.CollectSSVCPipeline,
77-
relate_severities.RelateSeveritiesPipeline,
41+
# valid_versions.GitHubBasicImprover,
42+
# valid_versions.GitLabBasicImprover,
43+
# valid_versions.NginxBasicImprover,
44+
# valid_versions.ApacheHTTPDImprover,
45+
# valid_versions.DebianBasicImprover,
46+
# valid_versions.NpmImprover,
47+
# valid_versions.ElixirImprover,
48+
# valid_versions.ApacheTomcatImprover,
49+
# valid_versions.ApacheKafkaImprover,
50+
# valid_versions.IstioImprover,
51+
# valid_versions.DebianOvalImprover,
52+
# valid_versions.UbuntuOvalImprover,
53+
# valid_versions.OSSFuzzImprover,
54+
# valid_versions.RubyImprover,
55+
# valid_versions.GithubOSVImprover,
56+
# vulnerability_status.VulnerabilityStatusImprover,
57+
# valid_versions.CurlImprover,
58+
# flag_ghost_packages.FlagGhostPackagePipeline,
59+
# enhance_with_kev.VulnerabilityKevPipeline,
60+
# enhance_with_metasploit.MetasploitImproverPipeline,
61+
# enhance_with_exploitdb.ExploitDBImproverPipeline,
62+
# compute_package_risk.ComputePackageRiskPipeline,
63+
# compute_package_version_rank.ComputeVersionRankPipeline,
64+
# add_cvss31_to_CVEs.CVEAdvisoryMappingPipeline,
65+
# remove_duplicate_advisories.RemoveDuplicateAdvisoriesPipeline,
66+
# populate_vulnerability_summary_pipeline.PopulateVulnerabilitySummariesPipeline,
67+
# exploitdb_v2.ExploitDBImproverPipeline,
68+
# enhance_with_kev_v2.VulnerabilityKevPipeline,
69+
# flag_ghost_packages_v2.FlagGhostPackagePipeline,
70+
# enhance_with_metasploit_v2.MetasploitImproverPipeline,
71+
# compute_package_risk_v2.ComputePackageRiskPipeline,
72+
# compute_version_rank_v2.ComputeVersionRankPipeline,
73+
# compute_advisory_todo_v2.ComputeToDo,
74+
# unfurl_version_range_v2.UnfurlVersionRangePipeline,
75+
# compute_advisory_todo.ComputeToDo,
76+
# collect_ssvc_trees.CollectSSVCPipeline,
77+
# relate_severities.RelateSeveritiesPipeline,
7878
archive_urls.ArchiveImproverPipeline,
7979
]
8080
)
Lines changed: 31 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
#
21
# Copyright (c) nexB Inc. and others. All rights reserved.
32
# VulnerableCode is a trademark of nexB Inc.
43
# SPDX-License-Identifier: Apache-2.0
@@ -8,10 +7,11 @@
87
#
98

109
import time
10+
1111
import requests
12+
1213
from vulnerabilities.models import AdvisoryReference
1314
from vulnerabilities.pipelines import VulnerableCodePipeline
14-
from vulnerabilities.wayback_machine import WaybackMachineSaveAPI
1515

1616

1717
class ArchiveImproverPipeline(VulnerableCodePipeline):
@@ -26,49 +26,38 @@ def steps(cls):
2626
return (cls.archive_urls,)
2727

2828
def archive_urls(self):
29-
advisory_refs = AdvisoryReference.objects.filter(archive_url__isnull=True).only("id", "url")
30-
31-
for advisory_ref in advisory_refs.iterator():
32-
33-
if not advisory_ref.url.startswith("http"):
29+
"""Get and stores archive URLs for AdvisoryReferences, flagging missing ones as NO_ARCHIVE"""
30+
advisory_refs = (
31+
AdvisoryReference.objects.filter(archive_url__isnull=True)
32+
.exclude(archive_url="NO_ARCHIVE")
33+
.only("id", "url")
34+
)
35+
36+
for advisory_ref in advisory_refs:
37+
url = advisory_ref.url
38+
if not url or not url.startswith("http"):
3439
continue
3540

36-
if not self.is_reachable_url(advisory_ref.url):
37-
self.log(
38-
f"Skipping archival: URL is unreachable or returned 404: {advisory_ref.url}"
41+
archive_url = self.get_archival(url)
42+
if not archive_url:
43+
AdvisoryReference.objects.filter(id=advisory_ref.id).update(
44+
archive_url="NO_ARCHIVE"
3945
)
46+
self.log(f"URL unreachable or returned no archive url: {url}")
4047
continue
48+
self.log(f"Found Archived Reference URL: {archive_url}")
49+
AdvisoryReference.objects.filter(id=advisory_ref.id).update(archive_url=archive_url)
4150

42-
self.log(f"Requesting archival for: {advisory_ref.url}")
43-
try:
44-
time.sleep(300)
45-
archive_url = self.request_archival(advisory_ref.url)
46-
if not archive_url:
47-
continue
48-
49-
AdvisoryReference.objects.filter(id=advisory_ref.id).update(archive_url=archive_url)
50-
self.log(f"Successfully added archival URL for advisory reference: {archive_url}")
51-
except Exception as e:
52-
self.log(f"Failed to archive {advisory_ref.url}: {str(e)}")
53-
54-
def request_archival(self, url):
55-
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
56-
try:
57-
save_api = WaybackMachineSaveAPI(url, user_agent)
58-
return save_api.save()
59-
except Exception as e:
60-
self.log(f"Failed to archive {url}: {str(e)}")
61-
return None
62-
63-
def is_reachable_url(self, url):
51+
def get_archival(self, url):
52+
self.log(f"Searching for archive URL for this Reference URL: {url}")
6453
try:
65-
with requests.Session() as session:
66-
head_res = session.head(url, allow_redirects=True, timeout=10)
67-
if not head_res.status_code == 200:
68-
return False
69-
70-
get_res = session.get(url, allow_redirects=True, stream=True, timeout=10)
71-
return get_res.status_code == 200
72-
73-
except requests.RequestException:
74-
return False
54+
archive_response = requests.get(
55+
url=f"https://web.archive.org/web/{url}", allow_redirects=True
56+
)
57+
time.sleep(30)
58+
if archive_response.status_code == 200:
59+
return archive_response.url
60+
elif archive_response.status_code == 403:
61+
self.log(f"Wayback Machine permission denied for '{url}'.")
62+
except requests.RequestException as e:
63+
self.log(f"Error checking existing archival: {e}")
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright (c) nexB Inc. and others. All rights reserved.
2+
# VulnerableCode is a trademark of nexB Inc.
3+
# SPDX-License-Identifier: Apache-2.0
4+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
5+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
6+
# See https://aboutcode.org for more information about nexB OSS projects.
7+
#
8+
9+
from unittest.mock import MagicMock
10+
11+
import pytest
12+
13+
from vulnerabilities.models import AdvisoryReference
14+
from vulnerabilities.pipelines.v2_improvers.archive_urls import ArchiveImproverPipeline
15+
16+
17+
@pytest.mark.django_db
18+
def test_archive_urls_pipeline(monkeypatch):
19+
advisory = AdvisoryReference.objects.create(url="https://example.com", archive_url=None)
20+
21+
mock_response = MagicMock()
22+
mock_response.status_code = 200
23+
mock_response.url = "https://web.archive.org/web/20250519082420/https://example.com"
24+
25+
monkeypatch.setattr(
26+
f"vulnerabilities.pipelines.v2_improvers.archive_urls.time.sleep", MagicMock()
27+
)
28+
monkeypatch.setattr(
29+
f"vulnerabilities.pipelines.v2_improvers.archive_urls.requests.get",
30+
MagicMock(return_value=mock_response),
31+
)
32+
33+
pipeline = ArchiveImproverPipeline()
34+
pipeline.archive_urls()
35+
36+
advisory.refresh_from_db()
37+
assert advisory.archive_url == "https://web.archive.org/web/20250519082420/https://example.com"

vulnerabilities/wayback_machine.ABOUT

Lines changed: 0 additions & 10 deletions
This file was deleted.

vulnerabilities/wayback_machine.LICENSE

Lines changed: 0 additions & 21 deletions
This file was deleted.

0 commit comments

Comments
 (0)