Skip to content

Commit 7357c05

Browse files
authored
Merge pull request #1 from Samk1710/add_EUVD_mirror_pipeline
Add EUVD mirror pipeline
2 parents ef39b50 + c5ccf26 commit 7357c05

File tree

4 files changed

+183
-0
lines changed

4 files changed

+183
-0
lines changed

.github/workflows/sync.yml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Mirrors the EUVD catalog into this repository once a day (and on demand),
# committing the refreshed advisory files when something changed.
name: Daily sync of EUVD catalog

on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * *'

# The job pushes the refreshed data back to the repository.
permissions:
  contents: write

jobs:
  scheduled:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Set up Python
        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: '3.10'

      - name: Install required packages
        run: pip install -r requirements.txt

      - name: Run sync
        run: python sync_catalog.py

      - name: Commit and push if it changed
        run: |-
          git config user.name "AboutCode Automation"
          git config user.email "automation@aboutcode.org"
          git add -A
          # An empty index means there is nothing to publish. Checking for it
          # explicitly (instead of `git commit ... || exit 0`) keeps a genuine
          # commit failure from being mistaken for "no changes".
          if git diff --cached --quiet; then
            echo "No changes to commit"
            exit 0
          fi
          timestamp=$(date -u)
          git commit -m "$(echo -e "Sync EUVD catalog: $timestamp\n\nSigned-off-by: AboutCode Automation <automation@aboutcode.org>")"
          git push

.gitignore

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Various junk and temp files
2+
.DS_Store
3+
*~
4+
.*.sw[po]
5+
.build
6+
.ve
7+
*.bak
8+
var
9+
share
10+
selenium
11+
local
12+
/dist/
13+
/.*cache/
14+
/.venv/
15+
/.python-version
16+
/.pytest_cache/
17+
/scancodeio.egg-info/
18+
*.rdb
19+
*.aof
20+
.vscode
21+
.ipynb_checkpoints

requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
requests==2.32.5
2+
aboutcode.pipeline==0.2.1
3+
python-dateutil==2.9.0

sync_catalog.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
5+
# See https://aboutcode.org for more information about nexB OSS projects.
6+
#
7+
8+
import json
9+
import math
10+
import sys
11+
from datetime import date, datetime, timezone
12+
from pathlib import Path
13+
14+
from dateutil.parser import parse
15+
import requests
16+
from aboutcode.pipeline import BasePipeline, LoopProgress
17+
from requests.adapters import HTTPAdapter
18+
from urllib3.util.retry import Retry
19+
20+
# Everything is stored relative to the directory that contains this script.
ROOT_PATH = Path(__file__).parent
ADVISORIES_PATH = ROOT_PATH.joinpath("advisories")
CHECKPOINT_FILE = ROOT_PATH.joinpath("checkpoint.json")

# Headers installed on the HTTP session used for every API request.
HEADERS = {"Accept": "application/json"}

# Value sent as the ``size`` query parameter when paging through results.
PAGE_SIZE = 100
# Passed as ``timeout`` to every request issued through the session.
REQUEST_TIMEOUT = 15
30+
31+
32+
class EUVDAdvisoryMirror(BasePipeline):
    """
    Mirror advisories returned by the EUVD search API as JSON files under
    ``advisories/``, using ``checkpoint.json`` to fetch only recent updates.

    The pipeline steps are declared, in order, by ``steps()``.
    """

    url = "https://euvdservices.enisa.europa.eu/api/search"

    @classmethod
    def steps(cls):
        """Return the pipeline steps in the order they are declared to run."""
        return (
            cls.load_checkpoint,
            cls.create_session,
            cls.collect_new_advisory,
            cls.save_checkpoint,
        )

    def load_checkpoint(self):
        """
        - Load the ``last run`` date from checkpoint.json to fetch only new advisories.
        - If the checkpoint.json does not exist, fetch all advisories.
        - Record the current UTC date so that ``save_checkpoint`` can persist the
          date on which this run *started*.
        """
        self.fetch_params = {}
        # Captured before any request is made: if the run crosses midnight, the
        # next run must still start from the day this one began, otherwise
        # advisories updated while paging was in progress could be skipped.
        self.checkpoint_date = datetime.now(timezone.utc).date()

        if not CHECKPOINT_FILE.exists():
            return
        with CHECKPOINT_FILE.open(encoding="utf-8") as f:
            checkpoint = json.load(f)
        if last_run := checkpoint.get("last_run"):
            self.fetch_params["fromUpdatedDate"] = last_run

    def create_session(self):
        """
        Create the ``requests`` session used by ``fetch_page``, with the default
        headers set and automatic retries for HTTPS requests.
        """
        retry = Retry(
            total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]
        )
        adapter = HTTPAdapter(max_retries=retry)
        self.session = requests.Session()
        self.session.headers.update(HEADERS)
        self.session.mount("https://", adapter)

    def collect_new_advisory(self):
        """
        Fetch new advisories from the EUVD API with paginated requests.

        - Fetch the ``total`` advisories and determine the number of pages to iterate over.
        - Iterate through all pages, fetching up to PAGE_SIZE advisories per request.
        - Save each advisory as a JSON file at ``/advisories/{year}/{month}/{EUVD_ID}.json``.
        - Advisories with missing publication dates are stored at ``/advisories/unpublished/{EUVD_ID}.json``.
        """
        count_page = self.fetch_page({**self.fetch_params, "size": 1, "page": 0})
        # ``or 0`` also covers an explicit null total, which would otherwise
        # make the page computation below raise a TypeError.
        total = count_page.get("total") or 0

        total_pages = math.ceil(total / PAGE_SIZE)
        self.log(f"Collecting {total} advisories across {total_pages} pages")

        progress = LoopProgress(total_iterations=total_pages, logger=self.log)

        for page in progress.iter(range(total_pages)):
            data = self.fetch_page(
                {**self.fetch_params, "size": PAGE_SIZE, "page": page}
            )
            for advisory in data.get("items", []):
                self.save_advisory(advisory)

    def save_advisory(self, advisory):
        """
        Write ``advisory`` as indented JSON below ``ADVISORIES_PATH``.

        The sub-directory is ``{year}/{month}`` taken from ``datePublished``, or
        ``unpublished`` when that field is missing or empty.

        Raises ``KeyError`` if the advisory has no ``id`` and ``ValueError`` if
        the id cannot be used as a plain file name.
        """
        destination = "unpublished"
        euvd_id = advisory["id"]

        # The id comes from the remote API and becomes part of a file path:
        # refuse anything that would resolve outside the target directory.
        filename = f"{euvd_id}.json"
        if Path(filename).name != filename:
            raise ValueError(f"Unsafe advisory id for a file name: {euvd_id!r}")

        if published := advisory.get("datePublished"):
            published_date = parse(published)
            destination = f"{published_date.year}/{published_date.month:02d}"

        path = ADVISORIES_PATH / destination / filename
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(advisory, f, indent=2)

    def save_checkpoint(self):
        """
        Persist the date this run started (UTC) as ``last_run`` in checkpoint.json.

        Falls back to the current UTC date when ``load_checkpoint`` did not run.
        """
        checkpoint_date = getattr(self, "checkpoint_date", None)
        if checkpoint_date is None:
            checkpoint_date = datetime.now(timezone.utc).date()

        with CHECKPOINT_FILE.open("w", encoding="utf-8") as f:
            json.dump({"last_run": checkpoint_date.isoformat()}, f, indent=2)

    def fetch_page(self, params):
        """
        GET ``self.url`` with ``params`` and return the decoded JSON body, or an
        empty dict when the body decodes to a falsy value.

        Raises ``requests.HTTPError`` on an error status.
        """
        response = self.session.get(self.url, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json() or {}

    def log(self, message):
        """Print ``message`` prefixed with the local time, to the millisecond."""
        now_local = datetime.now(timezone.utc).astimezone()
        # ``%f`` yields microseconds; dropping three digits leaves milliseconds.
        timestamp = now_local.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        print(f"{timestamp} {message}")
116+
117+
if __name__ == "__main__":
    # Run the mirror pipeline once; report any failure message and propagate
    # the pipeline's status as the process exit code.
    mirror = EUVDAdvisoryMirror()
    exit_status, failure = mirror.execute()
    if failure:
        print(failure)
    sys.exit(exit_status)

0 commit comments

Comments
 (0)