Skip to content

Commit 51f02e0

Browse files
authored
Merge pull request #785 from aboutcode-org/refactor-maven-pipeline
Refactor maven mining pipeline for git deployment
2 parents 268eb89 + 9f9c253 commit 51f02e0

4 files changed

Lines changed: 62 additions & 127 deletions

File tree

minecode_pipelines/pipelines/mine_maven.py

Lines changed: 50 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -20,42 +20,69 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
from scanpipe.pipelines import Pipeline
2423
from scanpipe.pipes import federatedcode
2524

2625
from minecode_pipelines import pipes
26+
from minecode_pipelines.pipelines import MineCodeBasePipeline
27+
from minecode_pipelines.pipelines import _mine_and_publish_packageurls
2728
from minecode_pipelines.pipes import maven
2829

2930

30-
class MineMaven(Pipeline):
31-
"""
32-
Create DiscoveredPackages for packages found on maven:
33-
- input: url of maven repo
34-
- process index
35-
- collect purls, grouped by package
36-
- write to files
37-
- publish to fetchcode
38-
- loop
31+
class MineMaven(MineCodeBasePipeline):
32+
"""Mine PackageURLs from maven index and publish them to FederatedCode."""
3933

40-
"""
34+
pipeline_config_repo = "https://github.com/aboutcode-data/minecode-pipelines-config/"
35+
checkpoint_path = "maven/checkpoints.json"
36+
append_purls = True
4137

4238
@classmethod
4339
def steps(cls):
4440
return (
4541
cls.check_federatedcode_eligibility,
46-
cls.collect_packages_from_maven,
47-
cls.delete_cloned_repos,
42+
cls.create_federatedcode_working_dir,
43+
cls.fetch_federation_config,
44+
cls.fetch_checkpoint_and_maven_index,
45+
cls.mine_and_publish_alpine_packageurls,
46+
cls.delete_working_dir,
4847
)
4948

50-
def check_federatedcode_eligibility(self):
51-
"""
52-
Check if the project fulfills the following criteria for
53-
pushing the project result to FederatedCode.
54-
"""
55-
federatedcode.check_federatedcode_configured_and_available(logger=self.log)
49+
def fetch_checkpoint_and_maven_index(self):
50+
self.checkpoint_config_repo = federatedcode.clone_repository(
51+
repo_url=self.pipeline_config_repo,
52+
clone_path=self.working_path / "minecode-pipelines-config",
53+
logger=self.log,
54+
)
55+
checkpoint = pipes.get_checkpoint_from_file(
56+
cloned_repo=self.checkpoint_config_repo,
57+
path=self.checkpoint_path,
58+
)
59+
60+
last_incremental = checkpoint.get("last_incremental")
61+
self.log(f"last_incremental: {last_incremental}")
62+
self.maven_nexus_collector = maven.MavenNexusCollector(last_incremental=last_incremental)
5663

57-
def collect_packages_from_maven(self):
58-
self.repos = maven.collect_packages_from_maven(logger=self.log)
64+
def mine_and_publish_alpine_packageurls(self):
65+
_mine_and_publish_packageurls(
66+
packageurls=self.maven_nexus_collector.get_packages(),
67+
total_package_count=None,
68+
data_cluster=self.data_cluster,
69+
checked_out_repos=self.checked_out_repos,
70+
working_path=self.working_path,
71+
append_purls=self.append_purls,
72+
commit_msg_func=self.commit_message,
73+
logger=self.log,
74+
checkpoint_func=self.save_check_point,
75+
)
5976

60-
def delete_cloned_repos(self):
61-
pipes.delete_cloned_repos(repos=self.repos, logger=self.log)
77+
def save_check_point(self):
78+
last_incremental = self.maven_nexus_collector.index_properties.get(
79+
"nexus.index.last-incremental"
80+
)
81+
checkpoint = {"last_incremental": last_incremental}
82+
self.log(f"Saving checkpoint: {checkpoint}")
83+
pipes.update_checkpoints_in_github(
84+
checkpoint=checkpoint,
85+
cloned_repo=self.checkpoint_config_repo,
86+
path=self.checkpoint_path,
87+
logger=self.log,
88+
)

minecode_pipelines/pipes/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def get_checkpoint_from_file(cloned_repo, path):
4343
return checkpoint_data or {}
4444

4545

46-
def update_checkpoints_in_github(checkpoint, cloned_repo, path):
46+
def update_checkpoints_in_github(checkpoint, cloned_repo, path, logger):
4747
from scanpipe.pipes.federatedcode import commit_and_push_changes
4848

4949
checkpoint_path = os.path.join(cloned_repo.working_dir, path)
@@ -53,6 +53,7 @@ def update_checkpoints_in_github(checkpoint, cloned_repo, path):
5353
repo=cloned_repo,
5454
files_to_commit=[checkpoint_path],
5555
commit_message=commit_message,
56+
logger=logger,
5657
)
5758

5859

minecode_pipelines/pipes/maven.py

Lines changed: 9 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -7,29 +7,24 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10+
import gzip
11+
import io
12+
import os
1013
from collections import namedtuple
1114
from itertools import chain
1215
from shutil import rmtree
13-
import os
14-
import gzip
15-
import io
1616

17-
from dateutil import tz
18-
from jawa.util.utf import decode_modified_utf8
1917
import arrow
2018
import javaproperties
21-
22-
from aboutcode import hashid
19+
from dateutil import tz
20+
from jawa.util.utf import decode_modified_utf8
2321
from packagedcode.maven import build_filename
2422
from packagedcode.maven import build_url
2523
from packagedcode.maven import get_urls
2624
from packagedcode.models import PackageData
2725
from packageurl import PackageURL
28-
from scanpipe.pipes.fetch import fetch_http
29-
from scanpipe.pipes import federatedcode
3026

31-
from minecode_pipelines import pipes
32-
from minecode_pipelines import VERSION
27+
3328
from minecode_pipelines.pipes import java_stream
3429

3530
TRACE = False
@@ -43,12 +38,6 @@
4338
MAVEN_INDEX_PROPERTIES_URL = (
4439
"https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties"
4540
)
46-
MAVEN_CHECKPOINT_PATH = "maven/checkpoints.json"
47-
48-
# We are testing and storing mined packageURLs in one single repo per ecosystem for now
49-
MINECODE_DATA_MAVEN_REPO = "https://github.com/aboutcode-data/minecode-data-maven-test"
50-
51-
PACKAGE_BATCH_SIZE = 1000
5241

5342

5443
def is_worthy_artifact(artifact):
@@ -624,6 +613,8 @@ def __del__(self):
624613
rmtree(download.directory)
625614

626615
def _fetch_http(self, uri):
616+
from scanpipe.pipes.fetch import fetch_http
617+
627618
fetched = fetch_http(uri)
628619
self.downloads.append(fetched)
629620
return fetched
@@ -724,7 +715,7 @@ def _get_packages(self, content=None):
724715
name=artifact_id,
725716
version=version,
726717
)
727-
yield current_purl, package
718+
yield current_purl, [package.purl]
728719

729720
def _get_packages_from_index_increments(self):
730721
for index_increment in self.index_increment_locations:
@@ -737,87 +728,3 @@ def get_packages(self):
737728
else:
738729
packages = self._get_packages(content=self.index_location)
739730
return packages
740-
741-
742-
def commit_message(commit_batch, total_commit_batch="many"):
743-
from django.conf import settings
744-
745-
author_name = settings.FEDERATEDCODE_GIT_SERVICE_NAME
746-
author_email = settings.FEDERATEDCODE_GIT_SERVICE_EMAIL
747-
tool_name = "pkg:github/aboutcode-org/scancode.io"
748-
749-
return f"""\
750-
Collect PackageURLs from Maven ({commit_batch}/{total_commit_batch})
751-
752-
Tool: {tool_name}@v{VERSION}
753-
Reference: https://{settings.ALLOWED_HOSTS[0]}
754-
755-
Signed-off-by: {author_name} <{author_email}>
756-
"""
757-
758-
759-
def collect_packages_from_maven(files_per_commit=PACKAGE_BATCH_SIZE, logger=None):
760-
# Clone data and config repo
761-
data_repo = federatedcode.clone_repository(
762-
repo_url=MINECODE_DATA_MAVEN_REPO,
763-
logger=logger,
764-
)
765-
config_repo = federatedcode.clone_repository(
766-
repo_url=pipes.MINECODE_PIPELINES_CONFIG_REPO,
767-
logger=logger,
768-
)
769-
if logger:
770-
logger(f"{MINECODE_DATA_MAVEN_REPO} repo cloned at: {data_repo.working_dir}")
771-
logger(f"{pipes.MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {config_repo.working_dir}")
772-
773-
# get last_incremental to see if we can start from incrementals
774-
checkpoint = pipes.get_checkpoint_from_file(cloned_repo=config_repo, path=MAVEN_CHECKPOINT_PATH)
775-
last_incremental = checkpoint.get("last_incremental")
776-
if logger:
777-
logger(f"last_incremental: {last_incremental}")
778-
779-
# download and iterate through maven nexus index
780-
maven_nexus_collector = MavenNexusCollector(last_incremental=last_incremental)
781-
files_to_commit = []
782-
commit_batch = 1
783-
for current_purl, package in maven_nexus_collector.get_packages():
784-
# write packageURL to file
785-
package_base_dir = hashid.get_package_base_dir(purl=current_purl)
786-
purl_file = pipes.write_packageurls_to_file(
787-
repo=data_repo,
788-
base_dir=package_base_dir,
789-
packageurls=[package.purl],
790-
append=True,
791-
)
792-
if purl_file not in files_to_commit:
793-
files_to_commit.append(purl_file)
794-
795-
if len(files_to_commit) == files_per_commit:
796-
federatedcode.commit_and_push_changes(
797-
commit_message=commit_message(commit_batch),
798-
repo=data_repo,
799-
files_to_commit=files_to_commit,
800-
logger=logger,
801-
)
802-
files_to_commit.clear()
803-
commit_batch += 1
804-
805-
if files_to_commit:
806-
federatedcode.commit_and_push_changes(
807-
commit_message=commit_message(commit_batch),
808-
repo=data_repo,
809-
files_to_commit=files_to_commit,
810-
logger=logger,
811-
)
812-
813-
# update last_incremental so we can pick up from the proper place next time
814-
last_incremental = maven_nexus_collector.index_properties.get("nexus.index.last-incremental")
815-
checkpoint = {"last_incremental": last_incremental}
816-
if logger:
817-
logger(f"checkpoint: {checkpoint}")
818-
pipes.update_checkpoints_in_github(
819-
checkpoint=checkpoint, cloned_repo=config_repo, path=MAVEN_CHECKPOINT_PATH
820-
)
821-
822-
repos_to_clean = [data_repo, config_repo]
823-
return repos_to_clean

pyproject-minecode_pipelines.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "flot.buildapi"
44

55
[project]
66
name = "minecode_pipelines"
7-
version = "0.0.1b30"
7+
version = "0.0.1b32"
88
description = "A library for mining packageURLs and package metadata from ecosystem repositories."
99
readme = "minecode_pipelines/README.rst"
1010
license = { text = "Apache-2.0" }

0 commit comments

Comments
 (0)