Skip to content

Commit db171a1

Browse files
committed
feat: download DockerHub top packages weekly
1 parent 3f3256e commit db171a1

File tree

7 files changed

+191
-97
lines changed

7 files changed

+191
-97
lines changed

.github/workflows/weekly_download.yml

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,12 @@ on:
88
jobs:
99
download:
1010
runs-on: ubuntu-latest
11+
strategy:
12+
matrix:
13+
include:
14+
- name: pypi
15+
- name: npm
16+
- name: dockerhub
1117
steps:
1218
- uses: actions/create-github-app-token@29824e69f54612133e76f7eaac726eef6c875baf # v2.2.1
1319
id: app-token
@@ -25,15 +31,10 @@ jobs:
2531
- name: Install the project
2632
run: uv sync --locked --only-group download --python 3.14
2733

28-
- name: Download Pypi packages
34+
- name: Download ${{ matrix.name }} packages
2935
continue-on-error: true
3036
run: |
31-
uv run --no-project dependencies/scripts/download_packages.py download pypi
32-
33-
- name: Download NPM packages
34-
continue-on-error: true
35-
run: |
36-
uv run --no-project dependencies/scripts/download_packages.py download npm
37+
PYTHONPATH=dependencies/ uv run --no-project dependencies/scripts/download_packages.py download ${{ matrix.name }}
3738
3839
- name: Configure git
3940
run: |
@@ -43,5 +44,5 @@ jobs:
4344
- name: Push changes to repo
4445
run: |
4546
git add .
46-
git commit -m "chore: Weekly update of trusted packages"
47+
git commit -m 'chore: Weekly update of `${{ matrix.name }}` trusted packages'
4748
git push origin HEAD:main

dependencies/dockerhub.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

dependencies/scripts/download_packages.py

Lines changed: 13 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import json
22
import logging
33
from collections.abc import Callable
4-
from dataclasses import dataclass, field
54
from datetime import datetime
65
from pathlib import Path
76
from typing import Any
@@ -10,6 +9,19 @@
109
import click
1110
import httpx
1211
import stamina
12+
from requests.exceptions import InvalidJSONError
13+
14+
from scripts.exceptions import ServerError
15+
from scripts.utils import (
16+
DEPENDENCIES_DIR,
17+
ECOSYSTEMS,
18+
RETRY_ATTEMPTS,
19+
RETRY_ON,
20+
RETRY_WAIT_EXP_BASE,
21+
RETRY_WAIT_JITTER,
22+
RETRY_WAIT_MAX,
23+
TIMEOUT,
24+
)
1325

1426
logger = logging.getLogger("weekly_download")
1527
logging.basicConfig(
@@ -19,90 +31,6 @@
1931
)
2032

2133

22-
class ServerError(Exception):
23-
"""Custom exception for HTTP 5xx errors."""
24-
25-
26-
class InvalidJSONError(Exception):
27-
"""Custom exception for when the received JSON does not match the expected format."""
28-
29-
30-
# Directory name
31-
DEPENDENCIES_DIR = "dependencies"
32-
"""Directory name where dependency files will be saved."""
33-
34-
# Sources
35-
TOP_PYPI_SOURCE = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
36-
"""URL for fetching top PyPI packages data."""
37-
38-
TOP_NPM_SOURCE = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
39-
"""URL for fetching top npm packages data from ecosyste.ms."""
40-
41-
# Retry constants
42-
RETRY_ON = (httpx.TransportError, httpx.TimeoutException, ServerError)
43-
"""Tuple of exceptions that should trigger retry attempts."""
44-
45-
RETRY_ATTEMPTS = 15
46-
"""Maximum number of retry attempts for failed requests."""
47-
48-
RETRY_WAIT_JITTER = 1
49-
"""Random jitter factor for retry wait times."""
50-
51-
RETRY_WAIT_EXP_BASE = 2
52-
"""Exponential backoff base multiplier for retry wait times."""
53-
54-
RETRY_WAIT_MAX = 8
55-
"""Maximum wait time between retry attempts in seconds."""
56-
57-
TIMEOUT = 90
58-
"""HTTP request timeout in seconds."""
59-
60-
61-
def parse_npm(data: list[dict[str, Any]]) -> set[str]:
62-
"""Parse npm package data and extract package names."""
63-
try:
64-
return {x["name"] for x in data}
65-
except KeyError as e:
66-
raise InvalidJSONError from e
67-
68-
69-
def parse_pypi(data: dict[str, Any]) -> set[str]:
70-
"""Parse PyPI package data and extract package names."""
71-
try:
72-
return {row["project"] for row in data["rows"]}
73-
except KeyError as e:
74-
raise InvalidJSONError from e
75-
76-
77-
@dataclass(frozen=True)
78-
class Ecosystem:
79-
"""Configuration for a package ecosystem (PyPI, npm, etc.)."""
80-
81-
url: str
82-
parser: Callable[[Any], set[str]]
83-
params: dict[str, Any] = field(default_factory=dict)
84-
pages: int | None = None
85-
86-
87-
pypi_ecosystem = Ecosystem(
88-
url=TOP_PYPI_SOURCE,
89-
parser=parse_pypi,
90-
)
91-
"""Ecosystem configuration for PyPI packages."""
92-
93-
npm_ecosystem = Ecosystem(
94-
url=TOP_NPM_SOURCE,
95-
parser=parse_npm,
96-
params={"per_page": 100, "sort": "downloads"},
97-
pages=150,
98-
)
99-
"""Ecosystem configuration for npm packages with pagination."""
100-
101-
102-
ECOSYSTEMS = {"pypi": pypi_ecosystem, "npm": npm_ecosystem}
103-
"""Dictionary mapping ecosystem names to their configurations."""
104-
105-
10634
def get_params(params: dict[str, Any] | None, page: int | None) -> dict[str, Any]:
10735
"""Combine base parameters with page parameter if provided."""
10836
new_params: dict[str, Any] = {}

dependencies/scripts/exceptions.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
class ServerError(Exception):
2+
"""Custom exception for HTTP 5xx errors."""
3+
4+
5+
class InvalidJSONError(Exception):
6+
"""Custom exception for when the received JSON does not match the expected format."""

dependencies/scripts/utils.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
from collections.abc import Callable
2+
from dataclasses import dataclass, field
3+
from typing import Any
4+
5+
import httpx
6+
from requests.exceptions import InvalidJSONError
7+
8+
from scripts.exceptions import ServerError
9+
10+
DEPENDENCIES_DIR = "dependencies"
11+
"""Directory name where dependency files will be saved."""
12+
13+
# Sources
14+
TOP_PYPI_SOURCE = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
15+
"""URL for fetching top PyPI packages data."""
16+
17+
TOP_NPM_SOURCE = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
18+
"""URL for fetching top npm packages data from ecosyste.ms."""
19+
20+
TOP_DOCKERHUB_SOURCE = "https://packages.ecosyste.ms/api/v1/registries/hub.docker.com/packages"
21+
"""URL for fetching top npm packages data from ecosyste.ms."""
22+
23+
# Retry constants
24+
RETRY_ON = (httpx.TransportError, httpx.TimeoutException, ServerError)
25+
"""Tuple of exceptions that should trigger retry attempts."""
26+
27+
RETRY_ATTEMPTS = 15
28+
"""Maximum number of retry attempts for failed requests."""
29+
30+
RETRY_WAIT_JITTER = 1
31+
"""Random jitter factor for retry wait times."""
32+
33+
RETRY_WAIT_EXP_BASE = 2
34+
"""Exponential backoff base multiplier for retry wait times."""
35+
36+
RETRY_WAIT_MAX = 8
37+
"""Maximum wait time between retry attempts in seconds."""
38+
39+
TIMEOUT = 90
40+
"""HTTP request timeout in seconds."""
41+
42+
43+
def parse_packages_ecosystems_source(data: list[dict[str, Any]]) -> set[str]:
44+
"""Parse package data from a packages.ecosyste.ms registry (npm, DockerHub) and extract package names."""
45+
try:
46+
return {x["name"] for x in data}
47+
except KeyError as e:
48+
raise InvalidJSONError from e
49+
50+
51+
def parse_pypi(data: dict[str, Any]) -> set[str]:
52+
"""Parse PyPI package data and extract package names."""
53+
try:
54+
return {row["project"] for row in data["rows"]}
55+
except KeyError as e:
56+
raise InvalidJSONError from e
57+
58+
59+
@dataclass(frozen=True)
60+
class Ecosystem:
61+
"""Configuration for a package ecosystem (PyPI, npm, etc.)."""
62+
63+
url: str
64+
parser: Callable[[Any], set[str]]
65+
params: dict[str, Any] = field(default_factory=dict)
66+
pages: int | None = None
67+
68+
69+
pypi_ecosystem = Ecosystem(
70+
url=TOP_PYPI_SOURCE,
71+
parser=parse_pypi,
72+
)
73+
"""Ecosystem configuration for PyPI packages."""
74+
75+
npm_ecosystem = Ecosystem(
76+
url=TOP_NPM_SOURCE,
77+
parser=parse_packages_ecosystems_source,
78+
params={"per_page": 100, "sort": "downloads"},
79+
pages=150,
80+
)
81+
"""Ecosystem configuration for npm packages with pagination."""
82+
83+
dockerhub_ecosystem = Ecosystem(
84+
url=TOP_DOCKERHUB_SOURCE,
85+
parser=parse_packages_ecosystems_source,
86+
params={"per_page": 100, "sort": "downloads"},
87+
pages=150,
88+
)
89+
"""Ecosystem configuration for DockerHub packages with pagination."""
90+
91+
92+
ECOSYSTEMS = {"pypi": pypi_ecosystem, "npm": npm_ecosystem, "dockerhub": dockerhub_ecosystem}
93+
"""Dictionary mapping ecosystem names to their configurations."""

dependencies/tests/test_download_packages.py

Lines changed: 68 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@
1313
DEPENDENCIES_DIR,
1414
ECOSYSTEMS,
1515
RETRY_ATTEMPTS,
16-
Ecosystem,
1716
InvalidJSONError,
1817
ServerError,
1918
_run,
2019
download,
20+
)
21+
from scripts.utils import (
22+
Ecosystem,
23+
dockerhub_ecosystem,
2124
npm_ecosystem,
22-
parse_npm,
25+
parse_packages_ecosystems_source,
2326
parse_pypi,
2427
)
2528

@@ -70,6 +73,18 @@ def patch_npm_ecosystem(data: dict[str, Any]) -> Iterator[None]:
7073
yield
7174

7275

76+
@contextmanager
77+
def patch_dockerhub_ecosystem(data: dict[str, Any]) -> Iterator[None]:
78+
"""Context manager that temporarily modifies the DockerHub ecosystem configuration for testing."""
79+
with (
80+
patch.dict(
81+
ECOSYSTEMS,
82+
{"dockerhub": Ecosystem(**dockerhub_ecosystem.__dict__ | data)},
83+
),
84+
):
85+
yield
86+
87+
7388
@freeze_time("2025-01-01")
7489
class TestDownload:
7590
def test_pypi_download(self) -> None:
@@ -141,7 +156,7 @@ def test_invalid_pypi_json_format(self) -> None:
141156
def test_invalid_npm_json_format(self) -> None:
142157
"""Test that InvalidJSONError is raised when npm JSON data has invalid format."""
143158
with pytest.raises(InvalidJSONError):
144-
parse_npm([{"key": "val"}])
159+
parse_packages_ecosystems_source([{"key": "val"}])
145160

146161
def test_invalid_downloaded_json(self) -> None:
147162
"""Test that InvalidJSONError is raised when downloaded JSON cannot be parsed."""
@@ -219,6 +234,56 @@ def test_npm_download_with_multiple_pages(self) -> None:
219234
assert set(m_save.call_args[0][0]["packages"]) == {"lodash", "@aws/sdk", "react", "express"}
220235
assert m_save.call_args[0][1] == m_open().__enter__()
221236

237+
def test_dockerhub_download_with_multiple_pages(self) -> None:
238+
"""Test that the script will iterate through pages if provided."""
239+
page1_data = [
240+
{"name": "sundeepm1/weatherapi", "downloads": 12345},
241+
{"name": "hitesh25/jenkins_argo", "downloads": 98765},
242+
]
243+
page2_data = [
244+
{"name": "jchensg/sg-support-integration", "downloads": 87654},
245+
]
246+
247+
with (
248+
patch_client(None) as m_client, # We'll configure the side_effect below
249+
patch_save_to_file() as m_save,
250+
patch_open_file() as m_open,
251+
patch_dockerhub_ecosystem({"pages": 2}),
252+
):
253+
# Configure the mock to return different data for each call
254+
mock_responses = []
255+
for data in [page1_data, page2_data]:
256+
mock_response = Mock()
257+
mock_response.json.return_value = data
258+
mock_responses.append(mock_response)
259+
260+
m_client.side_effect = mock_responses
261+
262+
_run("dockerhub")
263+
264+
assert m_client.call_count == 2
265+
266+
assert m_client.call_args_list == [
267+
call(
268+
"https://packages.ecosyste.ms/api/v1/registries/hub.docker.com/packages",
269+
params={"per_page": 100, "sort": "downloads", "page": 1},
270+
),
271+
call(
272+
"https://packages.ecosyste.ms/api/v1/registries/hub.docker.com/packages",
273+
params={"per_page": 100, "sort": "downloads", "page": 2},
274+
),
275+
]
276+
277+
# Verify that all packages from all pages were collected
278+
assert m_save.call_count == 1
279+
assert m_save.call_args[0][0]["date"] == "2025-01-01T00:00:00+00:00"
280+
assert set(m_save.call_args[0][0]["packages"]) == {
281+
"sundeepm1/weatherapi",
282+
"hitesh25/jenkins_argo",
283+
"jchensg/sg-support-integration",
284+
}
285+
assert m_save.call_args[0][1] == m_open().__enter__()
286+
222287

223288
class TestCli:
224289
def test_non_existing_ecosystem_error(self) -> None:

justfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,4 +58,4 @@ install-dev: venv
5858
uv pip install -e .
5959

6060
download ecosystem: venv
61-
uv run --no-project dependencies/scripts/download_packages.py download {{ecosystem}}
61+
PYTHONPATH=dependencies/ uv run --no-project dependencies/scripts/download_packages.py download {{ecosystem}}

0 commit comments

Comments
 (0)