Skip to content

Commit ad65e69

Browse files
committed
Update the pipeline to use the https://web.archive.org/web/ endpoint with no complex logic.
Add a test. Signed-off-by: ziad hany <ziadhany2016@gmail.com> Create a pipeline to archive_urls. Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent 0e7adc6 commit ad65e69

File tree

6 files changed

+219
-0
lines changed

6 files changed

+219
-0
lines changed

archize.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import json
import time
from datetime import datetime, timezone

import requests
6+
7+
# ── CONFIG ────────────────────────────────────────────────────────────────────
# URLs to submit to the Wayback Machine, one snapshot request per entry.
URLS = [
    "https://github.com/aboutcode-org/vulnerablecode/issues/17",
]

DELAY_SECONDS = 5  # pause between requests to avoid rate-limiting
LOG_FILE = "archive_log.json"  # where the per-URL result log is written
# ─────────────────────────────────────────────────────────────────────────────

# Wayback Machine "Save Page Now" endpoint: a POST with data={"url": ...}
# requests an on-demand snapshot of that URL.
SPN_ENDPOINT = "https://web.archive.org/save/"
17+
18+
19+
def save_url(url: str) -> dict:
    """Ask the Wayback Machine's Save Page Now service to snapshot *url*.

    Returns a dict describing the outcome: "success" with the archived URL,
    "failed" with the HTTP code and response excerpt, or "error" with the
    exception reason. Never raises.
    """
    try:
        response = requests.post(
            SPN_ENDPOINT,
            data={"url": url},
            headers={"User-Agent": "ArchiveBot/1.0"},
            timeout=30,
        )

        # Anything other than 200 is reported as a failure with context.
        if response.status_code != 200:
            return {
                "url": url,
                "status": "failed",
                "http_code": response.status_code,
                "reason": response.text[:200],
            }

        # Archive.org returns the archived copy's path in the
        # Content-Location header.
        location = response.headers.get("Content-Location", "")
        if location:
            archived_url = f"https://web.archive.org{location}"
        else:
            archived_url = "check manually"
        return {"url": url, "status": "success", "archived_url": archived_url}

    except requests.exceptions.Timeout:
        return {"url": url, "status": "error", "reason": "Request timed out"}
    except requests.exceptions.RequestException as e:
        return {"url": url, "status": "error", "reason": str(e)}
47+
48+
49+
def archive_all(urls: list[str]) -> list[dict]:
    """Archive every URL in *urls* via save_url, pausing between requests.

    Each result dict from save_url is stamped with a timezone-aware UTC
    ISO-8601 timestamp before being collected. Returns the list of results.
    """
    results = []
    total = len(urls)

    print(f"Starting archive of {total} URL(s)...\n")

    for i, url in enumerate(urls, start=1):
        print(f"[{i}/{total}] Submitting: {url}")
        result = save_url(url)
        # Timezone-aware UTC timestamp: datetime.utcnow() is deprecated
        # (Python 3.12+) and produced a naive datetime.
        result["timestamp"] = datetime.now(timezone.utc).isoformat()
        results.append(result)

        if result["status"] == "success":
            print(f" ✓ Archived → {result['archived_url']}")
        else:
            print(f" ✗ {result.get('reason') or result.get('http_code')}")

        # Rate-limit: sleep between submissions, but not after the last one.
        if i < total:
            time.sleep(DELAY_SECONDS)

    return results
70+
71+
72+
def save_log(results: list[dict], path: str) -> None:
    """Write *results* to *path* as pretty-printed JSON."""
    # Explicit encoding avoids depending on the platform's locale default.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nLog saved to {path}")
76+
77+
78+
def print_summary(results: list[dict]) -> None:
    """Print success/failure counts for a batch of archive results."""
    success = sum(1 for r in results if r["status"] == "success")
    failed = len(results) - success
    # Banner lines are plain strings: no placeholders, so no f-prefix needed.
    print("\n── Summary ──────────────────────")
    print(f" Total : {len(results)}")
    print(f" Success : {success}")
    print(f" Failed : {failed}")
    print("─────────────────────────────────")
86+
87+
88+
if __name__ == "__main__":
    # Full pipeline: archive every configured URL, show a summary,
    # then persist the per-URL results to the JSON log.
    outcomes = archive_all(URLS)
    print_summary(outcomes)
    save_log(outcomes, LOG_FILE)

vulnerabilities/improvers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from vulnerabilities.pipelines import flag_ghost_packages
2020
from vulnerabilities.pipelines import populate_vulnerability_summary_pipeline
2121
from vulnerabilities.pipelines import remove_duplicate_advisories
22+
from vulnerabilities.pipelines.v2_improvers import archive_urls
2223
from vulnerabilities.pipelines.v2_improvers import collect_ssvc_trees
2324
from vulnerabilities.pipelines.v2_improvers import compute_advisory_content_hash
2425
from vulnerabilities.pipelines.v2_improvers import compute_advisory_todo as compute_advisory_todo_v2
@@ -76,5 +77,6 @@
7677
collect_ssvc_trees.CollectSSVCPipeline,
7778
relate_severities.RelateSeveritiesPipeline,
7879
compute_advisory_content_hash.ComputeAdvisoryContentHash,
80+
archive_urls.ArchiveImproverPipeline,
7981
]
8082
)
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Generated by Django 5.2.11 on 2026-03-20 19:45
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    # Adds the nullable AdvisoryReference.archive_url field used by the
    # archive_urls improver pipeline to store Wayback Machine snapshot URLs.

    dependencies = [
        ("vulnerabilities", "0115_impactedpackageaffecting_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="advisoryreference",
            name="archive_url",
            field=models.URLField(
                help_text="URL to the backup vulnerability reference", max_length=1024, null=True
            ),
        ),
    ]

vulnerabilities/models.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2672,6 +2672,12 @@ class AdvisoryReference(models.Model):
26722672
help_text="URL to the vulnerability reference",
26732673
)
26742674

2675+
archive_url = models.URLField(
2676+
max_length=1024,
2677+
null=True,
2678+
help_text="URL to the backup vulnerability reference",
2679+
)
2680+
26752681
ADVISORY = "advisory"
26762682
EXPLOIT = "exploit"
26772683
COMMIT = "commit"
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Copyright (c) nexB Inc. and others. All rights reserved.
2+
# VulnerableCode is a trademark of nexB Inc.
3+
# SPDX-License-Identifier: Apache-2.0
4+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
5+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
6+
# See https://aboutcode.org for more information about nexB OSS projects.
7+
#
8+
9+
import time
10+
11+
import requests
12+
13+
from vulnerabilities.models import AdvisoryReference
14+
from vulnerabilities.pipelines import VulnerableCodePipeline
15+
16+
17+
class ArchiveImproverPipeline(VulnerableCodePipeline):
    """
    Improver pipeline that looks up Wayback Machine snapshots for
    AdvisoryReference rows and stores them in ``archive_url``.
    """

    pipeline_id = "archive_improver_pipeline"

    # Seconds to pause between Wayback Machine lookups, and the HTTP timeout
    # for each lookup so an unresponsive server cannot hang the whole run.
    REQUEST_DELAY = 30
    REQUEST_TIMEOUT = 30

    @classmethod
    def steps(cls):
        return (cls.archive_urls,)

    def archive_urls(self):
        """Fetch and store an archive URL for each AdvisoryReference missing one.

        References for which no archive could be found are flagged with the
        "NO_ARCHIVE" sentinel so they are not retried on subsequent runs.
        """
        advisory_refs = (
            AdvisoryReference.objects.filter(archive_url__isnull=True)
            .exclude(archive_url="NO_ARCHIVE")
            .only("id", "url")
        )

        for advisory_ref in advisory_refs:
            url = advisory_ref.url
            # Skip empty or non-HTTP(S) reference URLs — nothing to archive.
            if not url or not url.startswith("http"):
                continue

            archive_url = self.get_archival(url)
            if not archive_url:
                # Sentinel value: excluded by the queryset above so this
                # reference is not re-queried on the next pipeline run.
                AdvisoryReference.objects.filter(id=advisory_ref.id).update(
                    archive_url="NO_ARCHIVE"
                )
                self.log(f"URL unreachable or returned no archive url: {url}")
                continue
            self.log(f"Found Archived Reference URL: {archive_url}")
            AdvisoryReference.objects.filter(id=advisory_ref.id).update(archive_url=archive_url)

    def get_archival(self, url):
        """Return the Wayback Machine snapshot URL for *url*, or None.

        None is returned when the lookup fails, is forbidden, or the
        endpoint responds with any status other than 200.
        """
        self.log(f"Searching for archive URL for this Reference URL: {url}")
        try:
            archive_response = requests.get(
                url=f"https://web.archive.org/web/{url}",
                allow_redirects=True,
                # Bound the request: without a timeout, requests.get can
                # block forever and stall the whole pipeline.
                timeout=self.REQUEST_TIMEOUT,
            )
            # Throttle between lookups to respect archive.org rate limits.
            time.sleep(self.REQUEST_DELAY)
            if archive_response.status_code == 200:
                # After redirects, response.url is the canonical snapshot URL.
                return archive_response.url
            elif archive_response.status_code == 403:
                self.log(f"Wayback Machine permission denied for '{url}'.")
        except requests.RequestException as e:
            self.log(f"Error checking existing archival: {e}")
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright (c) nexB Inc. and others. All rights reserved.
2+
# VulnerableCode is a trademark of nexB Inc.
3+
# SPDX-License-Identifier: Apache-2.0
4+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
5+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
6+
# See https://aboutcode.org for more information about nexB OSS projects.
7+
#
8+
9+
from unittest.mock import MagicMock
10+
11+
import pytest
12+
13+
from vulnerabilities.models import AdvisoryReference
14+
from vulnerabilities.pipelines.v2_improvers.archive_urls import ArchiveImproverPipeline
15+
16+
17+
@pytest.mark.django_db
def test_archive_urls_pipeline(monkeypatch):
    """The pipeline stores the Wayback Machine URL on a reference lacking one."""
    advisory = AdvisoryReference.objects.create(url="https://example.com", archive_url=None)

    # Fake a successful Wayback Machine lookup.
    mock_response = MagicMock()
    mock_response.status_code = 200
    mock_response.url = "https://web.archive.org/web/20250519082420/https://example.com"

    # Plain string targets: no interpolation, so no f-prefix needed.
    monkeypatch.setattr(
        "vulnerabilities.pipelines.v2_improvers.archive_urls.time.sleep", MagicMock()
    )
    monkeypatch.setattr(
        "vulnerabilities.pipelines.v2_improvers.archive_urls.requests.get",
        MagicMock(return_value=mock_response),
    )

    pipeline = ArchiveImproverPipeline()
    pipeline.archive_urls()

    advisory.refresh_from_db()
    assert advisory.archive_url == "https://web.archive.org/web/20250519082420/https://example.com"

0 commit comments

Comments
 (0)