Skip to content

Commit 26533e8

Browse files
authored
Merge pull request #880 from aboutcode-org/845-update-pypi-pipeline
845 update pypi pipeline
2 parents 469c506 + d74984a commit 26533e8

24 files changed

Lines changed: 292 additions & 80 deletions

minecode_pipelines/miners/npm.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -150,6 +150,16 @@ def get_npm_packageurls(name, npm_repo=NPM_REGISTRY_REPO):
150150
return packageurls
151151

152152

153+
def yield_npm_package_data(name, packageurls=None):
    """
    Yield ``(purl, package_data)`` tuples for versions of the npm package ``name``.

    ``packageurls`` is an optional iterable of PackageURL strings. When it is
    missing or empty, the PackageURLs come from ``get_npm_packageurls(name)``.

    ``package_data`` is the decoded JSON body of the response fetched from
    ``NPM_REGISTRY_REPO + name + "/" + <version>``. Versions whose response is
    not OK are skipped.
    """
    # ``None`` replaces the former mutable ``[]`` default. Both are falsy, so
    # the ``or`` fallback below behaves exactly as before.
    for purl in packageurls or get_npm_packageurls(name):
        package_url = PackageURL.from_string(purl)
        package_data_url = NPM_REGISTRY_REPO + name + "/" + package_url.version
        # NOTE(review): no ``timeout`` is passed, so a stalled connection can
        # block this generator indefinitely -- confirm whether that is intended.
        response = requests.get(package_data_url)
        if not response.ok:
            # Best effort: a failed lookup drops this version and moves on.
            continue
        yield purl, response.json()
161+
162+
153163
def load_npm_packages(packages_file):
154164
with open(packages_file) as f:
155165
packages_data = json.load(f)

minecode_pipelines/miners/pypi.py

Lines changed: 26 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,8 @@
3737
pypi_json_headers = {"Accept": "application/vnd.pypi.simple.v1+json"}
3838

3939

40-
PYPI_REPO = "https://pypi.org/simple/"
40+
PYPI_SIMPLE_REPO = "https://pypi.org/simple"
41+
PYPI_METADATA_REPO = "https://pypi.org/pypi"
4142
PYPI_TYPE = "pypi"
4243

4344

@@ -49,16 +50,23 @@ def get_pypi_packages(pypi_repo, logger=None):
4950
return response.json()
5051

5152

52-
def get_pypi_packageurls(name):
53-
packageurls = []
53+
def get_pypi_package_versions(name):
54+
versions = []
5455

55-
project_index_api_url = PYPI_REPO + name
56+
project_index_api_url = PYPI_SIMPLE_REPO + "/" + name
5657
response = requests.get(project_index_api_url, headers=pypi_json_headers)
5758
if not response.ok:
58-
return packageurls
59+
return versions
5960

6061
project_data = response.json()
61-
for version in project_data.get("versions"):
62+
versions = project_data.get("versions", [])
63+
return versions
64+
65+
66+
def get_pypi_packageurls(name):
67+
packageurls = []
68+
69+
for version in get_pypi_package_versions(name=name):
6270
purl = PackageURL(
6371
type=PYPI_TYPE,
6472
name=name,
@@ -69,6 +77,18 @@ def get_pypi_packageurls(name):
6977
return packageurls
7078

7179

80+
def yield_pypi_package_data(name, packageurls=None):
    """
    Yield ``(purl, package_data)`` tuples for versions of the PyPI package ``name``.

    ``packageurls`` is an optional iterable of PackageURL strings. When it is
    missing or empty, the PackageURLs come from ``get_pypi_packageurls(name)``.

    ``package_data`` is the decoded JSON body of the response fetched from
    ``PYPI_METADATA_REPO/<name>/<version>/json``. Versions whose response is
    not OK are skipped.
    """
    # ``None`` replaces the former mutable ``[]`` default. Both are falsy, so
    # the ``or`` fallback below behaves exactly as before.
    for purl in packageurls or get_pypi_packageurls(name):
        package_url = PackageURL.from_string(purl)
        package_data_url = (
            PYPI_METADATA_REPO + "/" + name + "/" + package_url.version + "/" + "json"
        )
        # NOTE(review): ``pypi_json_headers`` is the header set also used for
        # the simple index request; confirm it is the intended ``Accept``
        # value for this metadata endpoint. No ``timeout`` is passed either.
        response = requests.get(package_data_url, headers=pypi_json_headers)
        if not response.ok:
            # Best effort: a failed lookup drops this version and moves on.
            continue
        yield purl, response.json()
90+
91+
7292
def load_pypi_packages(packages_file):
7393
with open(packages_file) as f:
7494
packages_data = json.load(f)

minecode_pipelines/pipelines/__init__.py

Lines changed: 148 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -15,13 +15,14 @@
1515
from collections.abc import Iterable
1616
from pathlib import Path
1717

18-
from aboutcode.federated import DataCluster
1918
from aboutcode.federated import DataFederation
2019
from aboutcode.pipeline import LoopProgress
20+
from packageurl import PackageURL
2121
from scanpipe.pipelines import Pipeline
2222
from scanpipe.pipes import federatedcode
2323

2424
from minecode_pipelines import pipes
25+
from minecode_pipelines.pipes import write_package_data_to_file
2526
from minecode_pipelines.pipes import write_packageurls_to_file
2627

2728
module_logger = logging.getLogger(__name__)
@@ -89,15 +90,20 @@ def fetch_federation_config(self):
8990
name="aboutcode-data",
9091
remote_root_url="https://github.com/aboutcode-data",
9192
)
92-
self.data_cluster = data_federation.get_cluster("purls")
93+
self.data_clusters = {
94+
"purls": data_federation.get_cluster("purls"),
95+
"api_package_version_responses": data_federation.get_cluster(
96+
"api_package_version_responses"
97+
),
98+
}
9399

94100
def mine_and_publish_packageurls(self):
95101
"""Mine and publish PackageURLs."""
96102

97103
_mine_and_publish_packageurls(
98104
packageurls=self.mine_packageurls(),
99105
total_package_count=self.packages_count(),
100-
data_cluster=self.data_cluster,
106+
data_clusters=self.data_clusters,
101107
checked_out_repos=self.checked_out_repos,
102108
working_path=self.working_path,
103109
append_purls=self.append_purls,
@@ -141,16 +147,75 @@ def log(self, message, level=logging.INFO):
141147
self.append_to_log(message)
142148

143149

150+
def commit_and_push_packageurls(
    current_working_repos,
    commit_msg_func,
    checkpoint_func,
    checkpoint_on_commit,
    checkpoint_interval,
    last_checkpoint_call,
    logger,
):
    """
    Commit and push the pending changes of every checkout in
    `current_working_repos`, using the commit message returned by
    `commit_msg_func` for the next commit number of that checkout.

    If `checkpoint_on_commit` is True and `checkpoint_func` exists, then
    `checkpoint_func` is executed after the commits.

    If `checkpoint_on_commit` is False, `checkpoint_func` is executed only
    when at least `checkpoint_interval` seconds have elapsed since
    `last_checkpoint_call` (a ``time.time()`` timestamp).

    Return the timestamp of the most recent time-based checkpoint: the current
    time when a time-based checkpoint just ran, otherwise `last_checkpoint_call`
    unchanged. Rebinding the parameter inside this function is invisible to the
    caller, so callers must keep the returned value and pass it back on the
    next call for the interval to be measured from the latest checkpoint.
    """
    if logger:
        logger("Trying to commit PackageURLs.")

    for repo_checkout in current_working_repos:
        pipes.commit_and_push_checkout(
            local_checkout=repo_checkout,
            commit_message=commit_msg_func(repo_checkout["commit_count"] + 1),
            logger=logger,
        )

    # Without a checkpoint callable there is nothing further to do.
    if not checkpoint_func:
        return last_checkpoint_call

    if checkpoint_on_commit:
        # Checkpointing is tied to commits, not to elapsed time, so the
        # time-based bookkeeping is left untouched.
        checkpoint_func()
        return last_checkpoint_call

    time_now = time.time()
    checkpoint_due = time_now - last_checkpoint_call >= checkpoint_interval
    if checkpoint_due:
        checkpoint_func()
        last_checkpoint_call = time_now

    return last_checkpoint_call
186+
187+
188+
def get_repo_checkout_from_data_cluster(
    data_cluster, purl, checked_out_repos, working_path, logger, datafile_name=None
):
    """
    Return a ``(repo_checkout, datafile_path)`` tuple for `purl`.

    `data_cluster` resolves `purl` (and the optional `datafile_name`) to a
    repository and a datafile path. The checkout for that repository is taken
    from `checked_out_repos`; when the repository is not there yet, a local
    checkout is initialized under `working_path` and stored in
    `checked_out_repos`, which is mutated in place.
    """
    repo_and_path = data_cluster.get_datafile_repo_and_path(
        purl=purl,
        datafile_name=datafile_name,
    )
    repo_name, relative_datafile_path = repo_and_path

    already_checked_out = repo_name in checked_out_repos
    if not already_checked_out:
        # First use of this repository: create its checkout and remember it.
        new_checkout = pipes.init_local_checkout(
            repo_name=repo_name,
            working_path=working_path,
            logger=logger,
        )
        checked_out_repos[repo_name] = new_checkout

    return checked_out_repos[repo_name], relative_datafile_path
207+
208+
144209
def _mine_and_publish_packageurls(
145210
packageurls: Iterable,
146211
total_package_count: int,
147-
data_cluster: DataCluster,
212+
data_clusters,
148213
checked_out_repos: dict,
149214
working_path: Path,
150215
append_purls: bool,
151216
commit_msg_func: Callable,
152217
logger: Callable,
153-
batch_size: int = 4000,
218+
batch_size: int = 100,
154219
checkpoint_on_commit: bool = False,
155220
checkpoint_func: Callable = None,
156221
checkpoint_freq: int = 30,
@@ -172,45 +237,94 @@ def _mine_and_publish_packageurls(
172237
iterator = progress.iter(iterator)
173238
logger(f"Mine PackageURL for {total_package_count:,d} packages.")
174239

175-
for base, purls in iterator:
240+
purls_data_cluster = data_clusters["purls"]
241+
api_package_version_responses_data_cluster = data_clusters["api_package_version_responses"]
242+
243+
current_working_repos = []
244+
currently_processed_files_count = 0
245+
for base, purls, purls_and_package_data in iterator:
176246
if not purls or not base:
177247
continue
178248

179-
package_repo, datafile_path = data_cluster.get_datafile_repo_and_path(purl=base)
180-
if package_repo not in checked_out_repos:
181-
checked_out_repos[package_repo] = pipes.init_local_checkout(
182-
repo_name=package_repo,
183-
working_path=working_path,
184-
logger=logger,
185-
)
249+
purls_package_repo_checkout, purls_datafile_path = get_repo_checkout_from_data_cluster(
250+
data_cluster=purls_data_cluster,
251+
purl=base,
252+
checked_out_repos=checked_out_repos,
253+
working_path=working_path,
254+
logger=logger,
255+
)
256+
if purls_package_repo_checkout not in current_working_repos:
257+
current_working_repos.append(purls_package_repo_checkout)
186258

187-
checkout = checked_out_repos[package_repo]
188259
purl_file = write_packageurls_to_file(
189-
repo=checkout["repo"],
190-
relative_datafile_path=datafile_path,
260+
repo=purls_package_repo_checkout["repo"],
261+
relative_datafile_path=purls_datafile_path,
191262
packageurls=purls,
192263
append=append_purls,
193264
)
194-
checkout["file_to_commit"].add(purl_file)
195-
checkout["file_processed_count"] += 1
196-
197-
if len(checkout["file_to_commit"]) > batch_size:
198-
if logger:
199-
logger("Trying to commit PackageURLs.")
200-
pipes.commit_and_push_checkout(
201-
local_checkout=checkout,
202-
commit_message=commit_msg_func(checkout["commit_count"] + 1),
265+
purls_package_repo_checkout["file_to_commit"].add(purl_file)
266+
purls_package_repo_checkout["file_processed_count"] += 1
267+
currently_processed_files_count += 1
268+
269+
if currently_processed_files_count > batch_size:
270+
commit_and_push_packageurls(
271+
current_working_repos=current_working_repos,
272+
commit_msg_func=commit_msg_func,
273+
checkpoint_func=checkpoint_func,
274+
checkpoint_on_commit=checkpoint_on_commit,
275+
checkpoint_interval=checkpoint_interval,
276+
last_checkpoint_call=last_checkpoint_call,
203277
logger=logger,
204278
)
205-
if checkpoint_on_commit and checkpoint_func:
206-
checkpoint_func()
207-
208-
if not checkpoint_on_commit:
209-
time_now = time.time()
210-
checkpoint_due = time_now - last_checkpoint_call >= checkpoint_interval
211-
if checkpoint_func and checkpoint_due:
212-
checkpoint_func()
213-
last_checkpoint_call = time_now
279+
current_working_repos = []
280+
currently_processed_files_count = 0
281+
282+
for purl, api_package_version_response in purls_and_package_data:
283+
if not isinstance(purl, PackageURL):
284+
package_url = PackageURL.from_string(purl)
285+
else:
286+
package_url = purl
287+
if package_url.type == "maven":
288+
datafile_name = "pom.xml"
289+
else:
290+
datafile_name = "api_package_version_response.json"
291+
api_package_version_responses_repo_checkout, api_package_metadata_datafile_path = (
292+
get_repo_checkout_from_data_cluster(
293+
data_cluster=api_package_version_responses_data_cluster,
294+
purl=purl,
295+
checked_out_repos=checked_out_repos,
296+
working_path=working_path,
297+
logger=logger,
298+
datafile_name=datafile_name,
299+
)
300+
)
301+
if api_package_version_responses_repo_checkout not in current_working_repos:
302+
current_working_repos.append(api_package_version_responses_repo_checkout)
303+
304+
api_package_version_response_file = write_package_data_to_file(
305+
repo=api_package_version_responses_repo_checkout["repo"],
306+
relative_api_package_metadata_datafile_path=api_package_metadata_datafile_path,
307+
package_data=api_package_version_response,
308+
)
309+
310+
api_package_version_responses_repo_checkout["file_to_commit"].add(
311+
api_package_version_response_file
312+
)
313+
api_package_version_responses_repo_checkout["file_processed_count"] += 1
314+
currently_processed_files_count += 1
315+
316+
if currently_processed_files_count > batch_size:
317+
commit_and_push_packageurls(
318+
current_working_repos=current_working_repos,
319+
commit_msg_func=commit_msg_func,
320+
checkpoint_func=checkpoint_func,
321+
checkpoint_on_commit=checkpoint_on_commit,
322+
checkpoint_interval=checkpoint_interval,
323+
last_checkpoint_call=last_checkpoint_call,
324+
logger=logger,
325+
)
326+
current_working_repos = []
327+
currently_processed_files_count = 0
214328

215329
for checkout in checked_out_repos.values():
216330
final_commit_count = checkout["commit_count"] + 1

minecode_pipelines/pipelines/mine_alpine.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@ def steps(cls):
4040

4141
def mine_and_publish_alpine_packageurls(self):
4242
alpine.mine_and_publish_alpine_packageurls(
43-
data_cluster=self.data_cluster,
43+
data_clusters=self.data_clusters,
4444
checked_out_repos=self.checked_out_repos,
4545
working_path=self.working_path,
4646
commit_msg_func=self.commit_message,

minecode_pipelines/pipelines/mine_composer.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -79,7 +79,7 @@ def mine_and_publish_packageurls(self):
7979
_mine_and_publish_packageurls(
8080
packageurls=self.mine_packageurls(),
8181
total_package_count=self.packages_count(),
82-
data_cluster=self.data_cluster,
82+
data_clusters=self.data_clusters,
8383
checked_out_repos=self.checked_out_repos,
8484
working_path=self.working_path,
8585
append_purls=self.append_purls,

minecode_pipelines/pipelines/mine_debian.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ def steps(cls):
4242
cls.create_federatedcode_working_dir,
4343
cls.fetch_federation_config,
4444
cls.fetch_checkpoint_and_debian_index,
45-
cls.mine_and_publish_alpine_packageurls,
45+
cls.mine_and_publish_debian_packageurls,
4646
cls.save_check_point,
4747
cls.delete_working_dir,
4848
)
@@ -62,13 +62,13 @@ def fetch_checkpoint_and_debian_index(self):
6262
self.log(f"last_checkpoint: {self.last_checkpoint}")
6363
self.debian_collector = debian.DebianCollector(logger=self.log)
6464

65-
def mine_and_publish_alpine_packageurls(self):
65+
def mine_and_publish_debian_packageurls(self):
6666
_mine_and_publish_packageurls(
6767
packageurls=self.debian_collector.get_packages(
6868
previous_index_last_modified_date=self.last_checkpoint,
6969
),
7070
total_package_count=None,
71-
data_cluster=self.data_cluster,
71+
data_clusters=self.data_clusters,
7272
checked_out_repos=self.checked_out_repos,
7373
working_path=self.working_path,
7474
append_purls=self.append_purls,

0 commit comments

Comments
 (0)