Skip to content

Commit 6a9c7c0

Browse files
committed
Create a pipeline to archive_urls
Update the pipeline to use the https://web.archive.org/web/ endpoint, with no complex logic. Add a test. Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent 4ab8b2f commit 6a9c7c0

File tree

5 files changed

+128
-0
lines changed

5 files changed

+128
-0
lines changed

vulnerabilities/improvers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from vulnerabilities.pipelines import flag_ghost_packages
1919
from vulnerabilities.pipelines import populate_vulnerability_summary_pipeline
2020
from vulnerabilities.pipelines import remove_duplicate_advisories
21+
from vulnerabilities.pipelines.v2_improvers import archive_urls
2122
from vulnerabilities.pipelines.v2_improvers import collect_ssvc_trees
2223
from vulnerabilities.pipelines.v2_improvers import compute_package_risk as compute_package_risk_v2
2324
from vulnerabilities.pipelines.v2_improvers import (
@@ -71,6 +72,7 @@
7172
unfurl_version_range_v2.UnfurlVersionRangePipeline,
7273
collect_ssvc_trees.CollectSSVCPipeline,
7374
relate_severities.RelateSeveritiesPipeline,
75+
archive_urls.ArchiveImproverPipeline,
7476
group_advisories_for_packages.GroupAdvisoriesForPackages,
7577
]
7678
)
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Generated by Django 5.2.11 on 2026-04-07 15:59
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Add the nullable ``archive_url`` URL field to ``AdvisoryReference``."""

    # Must be applied on top of migration 0119 of the same app.
    dependencies = [("vulnerabilities", "0119_remove_advisoryset_identifiers_and_more")]

    operations = [
        migrations.AddField(
            model_name="advisoryreference",
            name="archive_url",
            field=models.URLField(
                max_length=1024,
                null=True,
                help_text="URL to the backup vulnerability reference",
            ),
        ),
    ]

vulnerabilities/models.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2675,6 +2675,12 @@ class AdvisoryReference(models.Model):
26752675
help_text="URL to the vulnerability reference",
26762676
)
26772677

2678+
archive_url = models.URLField(
2679+
max_length=1024,
2680+
null=True,
2681+
help_text="URL to the backup vulnerability reference",
2682+
)
2683+
26782684
ADVISORY = "advisory"
26792685
EXPLOIT = "exploit"
26802686
COMMIT = "commit"
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Copyright (c) nexB Inc. and others. All rights reserved.
2+
# VulnerableCode is a trademark of nexB Inc.
3+
# SPDX-License-Identifier: Apache-2.0
4+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
5+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
6+
# See https://aboutcode.org for more information about nexB OSS projects.
7+
#
8+
9+
import time
10+
11+
import requests
12+
13+
from vulnerabilities.models import AdvisoryReference
14+
from vulnerabilities.pipelines import VulnerableCodePipeline
15+
16+
17+
class ArchiveImproverPipeline(VulnerableCodePipeline):
    """
    Record a Wayback Machine snapshot URL for AdvisoryReferences that have none.

    Every AdvisoryReference whose ``archive_url`` is NULL and whose ``url``
    starts with "http" is looked up through the ``https://web.archive.org/web/``
    endpoint. The final URL of a successful lookup is stored in ``archive_url``;
    otherwise the ``NO_ARCHIVE`` marker is stored, so the reference is no longer
    selected on later runs.
    """

    pipeline_id = "archive_improver_pipeline"

    # Marker written to ``archive_url`` when no snapshot URL could be obtained.
    NO_ARCHIVE = "NO_ARCHIVE"

    # The reference URL is appended verbatim to this prefix to build the lookup URL.
    WAYBACK_URL_PREFIX = "https://web.archive.org/web/"

    # Seconds to wait on the Wayback Machine before abandoning a request.
    # Without a timeout, a single stalled connection would block the whole run.
    REQUEST_TIMEOUT = 60

    # Seconds to pause after every lookup, successful or not, to rate-limit requests.
    REQUEST_DELAY = 30

    @classmethod
    def steps(cls):
        """Return the steps of this pipeline: a single ``archive_urls`` step."""
        return (cls.archive_urls,)

    def archive_urls(self):
        """
        Get and store archive URLs for AdvisoryReferences, flagging the ones
        without any archive URL as ``NO_ARCHIVE``.
        """
        # Rows already flagged NO_ARCHIVE are not NULL, so the NULL filter alone
        # skips them; no extra exclude() is needed.
        advisory_refs = AdvisoryReference.objects.filter(archive_url__isnull=True).only(
            "id", "url"
        )

        for advisory_ref in advisory_refs:
            url = advisory_ref.url
            # References without an http(s) URL are left untouched (still NULL).
            if not url or not url.startswith("http"):
                continue

            archive_url = self.get_archival(url)
            if not archive_url:
                # NOTE(review): request errors also end up here, so a transient
                # network failure flags the reference and it is not retried on
                # later runs - confirm this is intended.
                AdvisoryReference.objects.filter(id=advisory_ref.id).update(
                    archive_url=self.NO_ARCHIVE
                )
                self.log(f"URL unreachable or returned no archive url: {url}")
                continue

            self.log(f"Found Archived Reference URL: {archive_url}")
            AdvisoryReference.objects.filter(id=advisory_ref.id).update(archive_url=archive_url)

    def get_archival(self, url):
        """
        Return the archive URL for the reference ``url`` or None.

        The lookup follows redirects and returns the final response URL when the
        final status code is 200. None is returned for any other status code and
        when the request raises a ``requests.RequestException`` (which is logged,
        not propagated).
        """
        self.log(f"Searching for archive URL for this Reference URL: {url}")
        try:
            archive_response = requests.get(
                url=f"{self.WAYBACK_URL_PREFIX}{url}",
                allow_redirects=True,
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException as e:
            self.log(f"Error checking existing archival: {e}")
            return None
        finally:
            # Pause even when the request failed, so that a run of failures
            # does not turn into back-to-back requests against the endpoint.
            time.sleep(self.REQUEST_DELAY)

        if archive_response.status_code == 200:
            return archive_response.url

        if archive_response.status_code == 403:
            self.log(f"Wayback Machine permission denied for '{url}'.")
        return None
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright (c) nexB Inc. and others. All rights reserved.
2+
# VulnerableCode is a trademark of nexB Inc.
3+
# SPDX-License-Identifier: Apache-2.0
4+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
5+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
6+
# See https://aboutcode.org for more information about nexB OSS projects.
7+
#
8+
9+
from unittest.mock import MagicMock
10+
11+
import pytest
12+
13+
from vulnerabilities.models import AdvisoryReference
14+
from vulnerabilities.pipelines.v2_improvers.archive_urls import ArchiveImproverPipeline
15+
16+
17+
@pytest.mark.django_db
def test_archive_urls_pipeline(monkeypatch):
    """
    A reference without an archive URL gets the final URL of a successful
    (status 200) Wayback Machine lookup stored as its ``archive_url``.
    """
    reference_url = "https://example.com"
    snapshot_url = "https://web.archive.org/web/20250519082420/https://example.com"
    reference = AdvisoryReference.objects.create(url=reference_url, archive_url=None)

    mock_response = MagicMock()
    mock_response.status_code = 200
    mock_response.url = snapshot_url
    mock_get = MagicMock(return_value=mock_response)

    # Patch out the rate-limit pause and the network call made by the pipeline.
    monkeypatch.setattr(
        "vulnerabilities.pipelines.v2_improvers.archive_urls.time.sleep",
        MagicMock(),
    )
    monkeypatch.setattr(
        "vulnerabilities.pipelines.v2_improvers.archive_urls.requests.get",
        mock_get,
    )

    pipeline = ArchiveImproverPipeline()
    pipeline.archive_urls()

    reference.refresh_from_db()
    assert reference.archive_url == snapshot_url

    # Exactly one lookup, aimed at the Wayback endpoint for the reference URL.
    mock_get.assert_called_once()
    assert mock_get.call_args.kwargs["url"] == f"https://web.archive.org/web/{reference_url}"

0 commit comments

Comments
 (0)