Skip to content

Commit 1b5b652

Browse files
committed
Create a pipeline to archive_urls
Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent 2dbbd38 commit 1b5b652

File tree

8 files changed

+518
-0
lines changed

8 files changed

+518
-0
lines changed

archize.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import json
2+
import time
3+
from datetime import datetime
4+
5+
import requests
6+
7+
# ── CONFIG ────────────────────────────────────────────────────────────────────
8+
# URLs handed to archive_all() when this file is run as a script.
URLS = ["https://github.com/aboutcode-org/vulnerablecode/issues/17"]

# Seconds to pause between requests to avoid rate-limiting.
DELAY_SECONDS = 5

# Path of the JSON file that save_log() writes the results to.
LOG_FILE = "archive_log.json"
# ─────────────────────────────────────────────────────────────────────────────

# Endpoint that save_url() POSTs each URL to.
SPN_ENDPOINT = "https://web.archive.org/save/"
17+
18+
19+
def save_url(url: str) -> dict:
    """Submit a single URL to the Wayback Machine.

    Args:
        url: The URL to POST to ``SPN_ENDPOINT``.

    Returns:
        A dict that always carries ``"url"`` and ``"status"``:

        * ``"success"`` (HTTP 200) adds ``"archived_url"``; the value is
          ``"check manually"`` when no ``Content-Location`` header came back.
        * ``"failed"`` (any other HTTP code) adds ``"http_code"`` and the
          first 200 characters of the body as ``"reason"``.
        * ``"error"`` (timeout or other ``requests`` failure) adds ``"reason"``.
    """
    # Only the network call can raise a requests exception, so keep the
    # ``try`` body limited to it.
    try:
        response = requests.post(
            SPN_ENDPOINT,
            data={"url": url},
            headers={"User-Agent": "ArchiveBot/1.0"},
            timeout=30,
        )
    except requests.exceptions.Timeout:
        return {"url": url, "status": "error", "reason": "Request timed out"}
    except requests.exceptions.RequestException as e:
        return {"url": url, "status": "error", "reason": str(e)}

    if response.status_code != 200:
        return {
            "url": url,
            "status": "failed",
            "http_code": response.status_code,
            "reason": response.text[:200],
        }

    # Content-Location may be a relative or an absolute reference; only a
    # relative one needs the archive host prepended.
    location = response.headers.get("Content-Location", "")
    if not location:
        archived_url = "check manually"
    elif location.startswith(("http://", "https://")):
        archived_url = location
    else:
        archived_url = f"https://web.archive.org{location}"

    return {"url": url, "status": "success", "archived_url": archived_url}
47+
48+
49+
def archive_all(urls: list[str]) -> list[dict]:
    """Submit every URL in *urls* and collect one result dict per URL.

    Each result is whatever ``save_url`` returned, extended with a
    ``"timestamp"`` key holding a timezone-aware UTC ISO-8601 string.
    Progress is printed to stdout, and the function sleeps
    ``DELAY_SECONDS`` between submissions (not after the last one).

    Args:
        urls: URLs to submit, processed in order.

    Returns:
        The result dicts, in the same order as *urls*.
    """
    # Function-scope import: the file only imports ``datetime`` from the
    # ``datetime`` module, and ``datetime.utcnow()`` is deprecated.
    from datetime import timezone

    results = []
    total = len(urls)

    print(f"Starting archive of {total} URL(s)...\n")

    for i, url in enumerate(urls, start=1):
        print(f"[{i}/{total}] Submitting: {url}")
        result = save_url(url)
        result["timestamp"] = datetime.now(timezone.utc).isoformat()
        results.append(result)

        if result["status"] == "success":
            print(f" ✓ Archived → {result['archived_url']}")
        else:
            print(f" ✗ {result.get('reason') or result.get('http_code')}")

        # No need to wait once the final URL has been submitted.
        if i < total:
            time.sleep(DELAY_SECONDS)

    return results
70+
71+
72+
def save_log(results: list[dict], path: str) -> None:
    """Write *results* to *path* as indented JSON and report the location.

    Args:
        results: JSON-serialisable result dicts.
        path: Destination file; it is created or overwritten.
    """
    # Explicit encoding so the file does not depend on the platform default.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nLog saved to {path}")
76+
77+
78+
def print_summary(results: list[dict]) -> None:
    """Print total, success and failure counts for *results* to stdout.

    Every result whose ``"status"`` is not ``"success"`` is counted as failed.

    Args:
        results: Result dicts, each with a ``"status"`` key.
    """
    total = len(results)
    success = sum(r["status"] == "success" for r in results)
    failed = total - success

    # The banner lines carry no placeholders, so they are plain strings.
    print("\n── Summary ──────────────────────")
    print(f" Total : {total}")
    print(f" Success : {success}")
    print(f" Failed : {failed}")
    print("─────────────────────────────────")
86+
87+
88+
if __name__ == "__main__":
    # Script entry point: submit the configured URLs, print the counts,
    # then write the JSON log.
    archive_results = archive_all(URLS)
    print_summary(archive_results)
    save_log(archive_results, LOG_FILE)

vulnerabilities/improvers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from vulnerabilities.pipelines import flag_ghost_packages
2020
from vulnerabilities.pipelines import populate_vulnerability_summary_pipeline
2121
from vulnerabilities.pipelines import remove_duplicate_advisories
22+
from vulnerabilities.pipelines.v2_improvers import archive_urls
2223
from vulnerabilities.pipelines.v2_improvers import collect_ssvc_trees
2324
from vulnerabilities.pipelines.v2_improvers import compute_advisory_todo as compute_advisory_todo_v2
2425
from vulnerabilities.pipelines.v2_improvers import compute_package_risk as compute_package_risk_v2
@@ -74,5 +75,6 @@
7475
compute_advisory_todo.ComputeToDo,
7576
collect_ssvc_trees.CollectSSVCPipeline,
7677
relate_severities.RelateSeveritiesPipeline,
78+
archive_urls.ArchiveImproverPipeline,
7779
]
7880
)
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Generated by Django 5.2.11 on 2026-03-20 19:45
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Add the nullable ``archive_url`` field to ``advisoryreference``."""

    dependencies = [("vulnerabilities", "0115_impactedpackageaffecting_and_more")]

    operations = [
        migrations.AddField(
            model_name="advisoryreference",
            name="archive_url",
            field=models.URLField(
                null=True,
                max_length=1024,
                help_text="URL to the backup vulnerability reference",
            ),
        ),
    ]

vulnerabilities/models.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2672,6 +2672,12 @@ class AdvisoryReference(models.Model):
26722672
help_text="URL to the vulnerability reference",
26732673
)
26742674

2675+
archive_url = models.URLField(
2676+
max_length=1024,
2677+
null=True,
2678+
help_text="URL to the backup vulnerability reference",
2679+
)
2680+
26752681
ADVISORY = "advisory"
26762682
EXPLOIT = "exploit"
26772683
COMMIT = "commit"
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# VulnerableCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import time
11+
import requests
12+
from vulnerabilities.models import AdvisoryReference
13+
from vulnerabilities.pipelines import VulnerableCodePipeline
14+
from vulnerabilities.wayback_machine import WaybackMachineSaveAPI
15+
16+
17+
class ArchiveImproverPipeline(VulnerableCodePipeline):
    """
    Request archive snapshots for advisory reference URLs.

    Every ``AdvisoryReference`` whose ``archive_url`` is NULL is probed for
    reachability, submitted through ``WaybackMachineSaveAPI`` and, when a
    value comes back, has that value stored in its ``archive_url`` column.
    """

    pipeline_id = "archive_improver_pipeline"

    # Seconds to sleep before each save request. Kept at the previously
    # hard-coded value; override on a subclass to change the pacing.
    archive_delay_seconds = 300

    @classmethod
    def steps(cls):
        return (cls.archive_urls,)

    def archive_urls(self):
        """Archive every reference that has no ``archive_url`` yet."""
        advisory_refs = AdvisoryReference.objects.filter(archive_url__isnull=True).only("id", "url")

        for advisory_ref in advisory_refs.iterator():
            url = advisory_ref.url

            # Require a real HTTP(S) scheme; a bare "http" prefix would also
            # accept values such as "httpfoo:...".
            if not url.startswith(("http://", "https://")):
                continue

            if not self.is_reachable_url(url):
                self.log(f"Skipping archival: URL is unreachable or returned 404: {url}")
                continue

            self.log(f"Requesting archival for: {url}")
            try:
                time.sleep(self.archive_delay_seconds)
                archive_url = self.request_archival(url)
                if not archive_url:
                    continue

                # Update by primary key: the instance was loaded with only
                # ``id`` and ``url``.
                AdvisoryReference.objects.filter(id=advisory_ref.id).update(archive_url=archive_url)
                self.log(f"Successfully added archival URL for advisory reference: {archive_url}")
            except Exception as e:
                # Best effort: one failing reference must not stop the run.
                self.log(f"Failed to archive {url}: {e}")

    def request_archival(self, url):
        """
        Return the result of ``WaybackMachineSaveAPI(...).save()`` for ``url``,
        or None (after logging) when the call raises.
        """
        user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
        try:
            save_api = WaybackMachineSaveAPI(url, user_agent)
            return save_api.save()
        except Exception as e:
            self.log(f"Failed to archive {url}: {e}")
            return None

    def is_reachable_url(self, url):
        """
        Return True when ``url`` answers a GET with HTTP 200.

        A HEAD request is sent first as a pre-check. Any HEAD status other
        than 200, 405 or 501 ends the check with False; 405 and 501 only say
        that HEAD itself is refused, so the GET still decides. Any
        ``requests.RequestException`` yields False.
        """
        try:
            with requests.Session() as session:
                head_res = session.head(url, allow_redirects=True, timeout=10)
                if head_res.status_code not in (200, 405, 501):
                    return False

                # stream=True avoids downloading the body; the context
                # manager closes the response once the status is read.
                with session.get(url, allow_redirects=True, stream=True, timeout=10) as get_res:
                    return get_res.status_code == 200

        except requests.RequestException:
            return False
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
about_resource: wayback_machine.py
2+
version: 5407681c34ac299ad5c0622ea0c2d186a330fec1
3+
download_url: https://raw.githubusercontent.com/akamhy/waybackpy/5407681c34ac299ad5c0622ea0c2d186a330fec1/waybackpy/save_api.py
4+
package_url: pkg:github/akamhy/waybackpy@5407681c34ac299ad5c0622ea0c2d186a330fec1#waybackpy/save_api.py
5+
homepage_url: https://github.com/akamhy/waybackpy
6+
notes: Two files extracted from waybackpy handle the API calls, `save_api.py` and `exceptions.py`; both are combined into `wayback_machine.py`.
7+
8+
copyright: Copyright (c) 2020-2022 waybackpy contributors
9+
license_expression: mit
10+
license_file: wayback_machine.py.LICENSE
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2020-2022 waybackpy contributors ( https://github.com/akamhy/waybackpy/graphs/contributors )
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

0 commit comments

Comments
 (0)