Skip to content

Commit 4d5d9e4

Browse files
committed
Refactor composer mining pipeline for git deployment (#787)
Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent df0f740 commit 4d5d9e4

7 files changed

Lines changed: 2702 additions & 184 deletions

File tree

minecode_pipelines/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@
88
#
99

1010

11-
VERSION = "0.0.1b25"
11+
VERSION = "0.0.1b42"

minecode_pipelines/miners/composer.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import json
1111
from minecode_pipelines.utils import get_temp_file
12+
from aboutcode.hashid import get_core_purl
1213
import requests
1314
from packageurl import PackageURL
1415

@@ -67,7 +68,7 @@ def get_composer_purl(vendor, package):
6768
response = requests.get(url, timeout=10)
6869
response.raise_for_status()
6970
except requests.RequestException:
70-
return purls
71+
return None, purls
7172

7273
data = response.json()
7374
packages = data.get("packages", {})
@@ -84,7 +85,11 @@ def get_composer_purl(vendor, package):
8485
)
8586
purls.append(purl.to_string())
8687

87-
return purls
88+
base_purl = None
89+
if purls:
90+
first_purl = purls[0]
91+
base_purl = get_core_purl(first_purl)
92+
return base_purl, purls
8893

8994

9095
def load_composer_packages(packages_file):

minecode_pipelines/pipelines/mine_composer.py

Lines changed: 60 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -20,63 +20,85 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
import os
24-
from scanpipe.pipelines import Pipeline
23+
from datetime import datetime
2524
from scanpipe.pipes import federatedcode
2625

2726
from minecode_pipelines import pipes
28-
from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO
29-
from minecode_pipelines.pipes.composer import mine_composer_packages
30-
from minecode_pipelines.pipes.composer import mine_and_publish_composer_purls
27+
from minecode_pipelines.pipelines import MineCodeBasePipeline
28+
from minecode_pipelines.pipes import composer
29+
from minecode_pipelines.pipes.composer import mine_composer_packages, PACKAGE_BATCH_SIZE
30+
from minecode_pipelines.pipelines import _mine_and_publish_packageurls
3131

32-
MINECODE_COMPOSER_GIT_URL = os.environ.get(
33-
"MINECODE_COMPOSER_GIT_URL", "https://github.com/aboutcode-data/minecode-data-composer-test"
34-
)
3532

36-
37-
class MineComposer(Pipeline):
33+
class MineComposer(MineCodeBasePipeline):
3834
"""
39-
Mine all packageURLs from a composer index and publish them to a FederatedCode repo.
35+
Pipeline to mine Composer PHP packages and publish them to FederatedCode.
4036
"""
4137

38+
pipeline_config_repo = "https://github.com/aboutcode-data/minecode-pipelines-config/"
39+
checkpoint_path = "composer/checkpoints.json"
40+
checkpoint_freq = 200
41+
4242
@classmethod
4343
def steps(cls):
4444
return (
4545
cls.check_federatedcode_eligibility,
46-
cls.clone_composer_repo,
47-
cls.mine_and_publish_composer_purls,
46+
cls.create_federatedcode_working_dir,
47+
cls.fetch_checkpoint_and_start_index,
48+
cls.fetch_federation_config,
49+
cls.mine_and_publish_packageurls,
50+
cls.delete_working_dir,
51+
)
52+
53+
def fetch_checkpoint_and_start_index(self):
54+
self.checkpoint_config_repo = federatedcode.clone_repository(
55+
repo_url=self.pipeline_config_repo,
56+
clone_path=self.working_path / "minecode-pipelines-config",
57+
logger=self.log,
58+
)
59+
checkpoint = pipes.get_checkpoint_from_file(
60+
cloned_repo=self.checkpoint_config_repo,
61+
path=self.checkpoint_path,
4862
)
4963

50-
def check_federatedcode_eligibility(self):
51-
"""
52-
Check if the project fulfills the following criteria for
53-
pushing the project result to FederatedCode.
54-
"""
55-
federatedcode.check_federatedcode_configured_and_available(logger=self.log)
64+
self.start_index = checkpoint.get("start_index", 0)
65+
self.log(f"start_index: {self.start_index}")
5666

57-
def clone_composer_repo(self):
58-
"""
59-
Clone the federatedcode composer url and return the Repo object
60-
"""
61-
self.cloned_data_repo = federatedcode.clone_repository(MINECODE_COMPOSER_GIT_URL)
62-
self.cloned_config_repo = federatedcode.clone_repository(MINECODE_PIPELINES_CONFIG_REPO)
67+
def packages_count(self):
68+
return len(self.composer_packages) if self.composer_packages else None
6369

64-
def mine_and_publish_composer_purls(self):
65-
"""
66-
Mine Composer package names from Composer indexes and generate
67-
package URLs (pURLs) for all mined Composer packages.
68-
"""
70+
def mine_packageurls(self):
71+
self.composer_packages = mine_composer_packages()
72+
return composer.mine_composer_packageurls(
73+
packages=self.composer_packages,
74+
start_index=self.start_index,
75+
)
6976

70-
composer_packages = mine_composer_packages()
71-
mine_and_publish_composer_purls(
72-
packages=composer_packages,
73-
cloned_data_repo=self.cloned_data_repo,
74-
cloned_config_repo=self.cloned_config_repo,
77+
def mine_and_publish_packageurls(self):
78+
"""Mine and publish PackageURLs."""
79+
_mine_and_publish_packageurls(
80+
packageurls=self.mine_packageurls(),
81+
total_package_count=self.packages_count(),
82+
data_cluster=self.data_cluster,
83+
checked_out_repos=self.checked_out_repos,
84+
working_path=self.working_path,
85+
append_purls=self.append_purls,
86+
commit_msg_func=self.commit_message,
7587
logger=self.log,
88+
checkpoint_func=self.save_check_point,
89+
checkpoint_freq=self.checkpoint_freq,
7690
)
7791

78-
def delete_cloned_repos(self):
79-
pipes.delete_cloned_repos(
80-
repos=[self.cloned_data_repo, self.cloned_config_repo],
92+
def save_check_point(self):
93+
checkpoint = {
94+
"date": str(datetime.now()),
95+
"start_index": self.start_index + self.checkpoint_freq * PACKAGE_BATCH_SIZE,
96+
}
97+
98+
self.log(f"Saving checkpoint: {checkpoint}")
99+
pipes.update_checkpoints_in_github(
100+
checkpoint=checkpoint,
101+
cloned_repo=self.checkpoint_config_repo,
102+
path=self.checkpoint_path,
81103
logger=self.log,
82104
)

minecode_pipelines/pipes/composer.py

Lines changed: 5 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -20,22 +20,13 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
from datetime import datetime
24-
from pathlib import Path
25-
from aboutcode import hashid
2623
from minecode_pipelines.miners.composer import get_composer_packages
2724
from minecode_pipelines.miners.composer import load_composer_packages
2825
from minecode_pipelines.miners.composer import get_composer_purl
29-
from minecode_pipelines.pipes import (
30-
write_data_to_yaml_file,
31-
get_checkpoint_from_file,
32-
update_checkpoints_in_github,
33-
)
34-
from scanpipe.pipes.federatedcode import commit_and_push_changes
26+
3527
from minecode_pipelines.utils import cycle_from_index, grouper
3628

37-
PACKAGE_BATCH_SIZE = 1000
38-
COMPOSER_CHECKPOINT_PATH = "composer/checkpoints.json"
29+
PACKAGE_BATCH_SIZE = 100
3930

4031

4132
def mine_composer_packages():
@@ -44,55 +35,15 @@ def mine_composer_packages():
4435
return load_composer_packages(packages_file)
4536

4637

47-
def mine_and_publish_composer_purls(packages, cloned_data_repo, cloned_config_repo, logger):
48-
"""Mine Composer packages and publish their PURLs to a FederatedCode repository."""
49-
composer_checkpoint = get_checkpoint_from_file(
50-
cloned_repo=cloned_config_repo, path=COMPOSER_CHECKPOINT_PATH
51-
)
52-
53-
start_index = composer_checkpoint.get("start_index", 0)
54-
38+
def mine_composer_packageurls(packages, start_index):
39+
"""Mine Composer packages from Packagist"""
5540
packages_iter = cycle_from_index(packages, start_index)
56-
5741
for batch_index, package_batch in enumerate(
5842
grouper(n=PACKAGE_BATCH_SIZE, iterable=packages_iter)
5943
):
60-
purl_files = []
61-
purls = []
62-
6344
for item in package_batch:
6445
if not item:
6546
continue
6647

6748
vendor, package = item
68-
69-
updated_purls = get_composer_purl(vendor=vendor, package=package)
70-
if not updated_purls:
71-
continue
72-
73-
base_purl = updated_purls[0]
74-
75-
purl_file_full_path = Path(
76-
cloned_data_repo.working_dir
77-
) / hashid.get_package_purls_yml_file_path(base_purl)
78-
79-
write_data_to_yaml_file(path=purl_file_full_path, data=updated_purls)
80-
81-
purl_files.append(purl_file_full_path)
82-
purls.append(str(base_purl))
83-
84-
if purls and purl_files:
85-
logger(f"Committing packageURLs: {', '.join(purls)}")
86-
commit_and_push_changes(
87-
repo=cloned_data_repo, files_to_commit=purl_files, purls=purls, logger=logger
88-
)
89-
90-
settings_data = {
91-
"date": str(datetime.now()),
92-
"start_index": start_index + (batch_index + 1) * PACKAGE_BATCH_SIZE,
93-
}
94-
update_checkpoints_in_github(
95-
checkpoint=settings_data,
96-
cloned_repo=cloned_config_repo,
97-
path=COMPOSER_CHECKPOINT_PATH,
98-
)
49+
yield get_composer_purl(vendor=vendor, package=package)

minecode_pipelines/tests/pipes/test_composer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ def test_generate_purls_from_composer(self, mock_get):
5151

5252
all_purls = []
5353
for vendor, package in packages:
54-
purls = get_composer_purl(vendor, package)
54+
base_purl, purls = get_composer_purl(vendor, package)
55+
assert str(base_purl) == "pkg:composer/monolog/monolog"
5556
all_purls.extend(purls)
5657

5758
assert len(all_purls) == 85

0 commit comments

Comments
 (0)