77# See https://aboutcode.org for more information about nexB OSS projects.
88#
99
10+ import gzip
11+ import io
12+ import os
1013from collections import namedtuple
1114from itertools import chain
1215from shutil import rmtree
13- import os
14- import gzip
15- import io
1616
17- from dateutil import tz
18- from jawa .util .utf import decode_modified_utf8
1917import arrow
2018import javaproperties
21-
22- from aboutcode import hashid
19+ from dateutil import tz
20+ from jawa . util . utf import decode_modified_utf8
2321from packagedcode .maven import build_filename
2422from packagedcode .maven import build_url
2523from packagedcode .maven import get_urls
2624from packagedcode .models import PackageData
2725from packageurl import PackageURL
28- from scanpipe .pipes .fetch import fetch_http
29- from scanpipe .pipes import federatedcode
3026
31- from minecode_pipelines import pipes
32- from minecode_pipelines import VERSION
27+
3328from minecode_pipelines .pipes import java_stream
3429
# Debug tracing switch, disabled by default.
# NOTE(review): the code reading this flag is not visible in this excerpt — confirm usage.
TRACE = False

# URL of the properties file of the nexus index published on repo1.maven.org.
MAVEN_INDEX_PROPERTIES_URL = (
    "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties"
)

# Path passed to the checkpoint helpers (read and update) together with the
# cloned config repo in collect_packages_from_maven().
MAVEN_CHECKPOINT_PATH = "maven/checkpoints.json"

# We are testing and storing mined packageURLs in one single repo per ecosystem for now
MINECODE_DATA_MAVEN_REPO = "https://github.com/aboutcode-data/minecode-data-maven-test"

# Default number of distinct PackageURL files gathered before a commit is pushed
# (default value of `files_per_commit` in collect_packages_from_maven()).
PACKAGE_BATCH_SIZE = 1000
5241
5342
5443def is_worthy_artifact (artifact ):
@@ -624,6 +613,8 @@ def __del__(self):
624613 rmtree (download .directory )
625614
def _fetch_http(self, uri):
    """
    Fetch ``uri`` with scanpipe's ``fetch_http`` and return the result.

    Every fetched download is also appended to ``self.downloads`` so the
    instance keeps track of what it has retrieved.
    """
    # Imported at call time rather than at module import time.
    from scanpipe.pipes.fetch import fetch_http

    download = fetch_http(uri)
    self.downloads.append(download)
    return download
@@ -724,7 +715,7 @@ def _get_packages(self, content=None):
724715 name = artifact_id ,
725716 version = version ,
726717 )
727- yield current_purl , package
718+ yield current_purl , [ package . purl ]
728719
729720 def _get_packages_from_index_increments (self ):
730721 for index_increment in self .index_increment_locations :
@@ -737,87 +728,3 @@ def get_packages(self):
737728 else :
738729 packages = self ._get_packages (content = self .index_location )
739730 return packages
740-
741-
def commit_message(commit_batch, total_commit_batch="many"):
    """
    Return the commit message used when pushing a batch of Maven PackageURLs.

    ``commit_batch`` and ``total_commit_batch`` are interpolated in the subject
    line as ``(<commit_batch>/<total_commit_batch>)``. The body records the
    tool PURL with the current ``VERSION``, a reference URL built from the
    first entry of ``settings.ALLOWED_HOSTS`` and a ``Signed-off-by`` trailer
    built from the FederatedCode git service name and email settings.
    """
    from django.conf import settings

    signer_name = settings.FEDERATEDCODE_GIT_SERVICE_NAME
    signer_email = settings.FEDERATEDCODE_GIT_SERVICE_EMAIL
    tool_purl = "pkg:github/aboutcode-org/scancode.io"

    message_lines = [
        f"Collect PackageURLs from Maven ({commit_batch}/{total_commit_batch})",
        "",
        f"Tool: {tool_purl}@v{VERSION}",
        f"Reference: https://{settings.ALLOWED_HOSTS[0]}",
        "",
        f"Signed-off-by: {signer_name} <{signer_email}>",
        # The trailing empty item makes the message end with a newline.
        "",
    ]
    return "\n".join(message_lines)
757-
758-
def collect_packages_from_maven(files_per_commit=PACKAGE_BATCH_SIZE, logger=None):
    """
    Mine PackageURLs from the Maven nexus index and push them to the data repo.

    Clone the data repo (``MINECODE_DATA_MAVEN_REPO``) and the pipelines config
    repo, read the ``last_incremental`` checkpoint from the config repo, then
    iterate the packages returned by ``MavenNexusCollector`` and append each
    package PURL to its file in the data repo. A commit is pushed each time
    ``files_per_commit`` distinct files have been touched, and once more at
    the end for the remaining files. Finally the checkpoint is updated with
    the ``nexus.index.last-incremental`` index property.

    ``logger`` is an optional callable receiving progress messages.

    Return the list of cloned repos, ``[data_repo, config_repo]``.
    """
    # Clone data and config repo
    data_repo = federatedcode.clone_repository(repo_url=MINECODE_DATA_MAVEN_REPO, logger=logger)
    config_repo = federatedcode.clone_repository(
        repo_url=pipes.MINECODE_PIPELINES_CONFIG_REPO,
        logger=logger,
    )
    if logger:
        logger(f"{MINECODE_DATA_MAVEN_REPO} repo cloned at: {data_repo.working_dir}")
        logger(f"{pipes.MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {config_repo.working_dir}")

    # The stored checkpoint tells the collector where to start from
    stored_checkpoint = pipes.get_checkpoint_from_file(
        cloned_repo=config_repo,
        path=MAVEN_CHECKPOINT_PATH,
    )
    last_incremental = stored_checkpoint.get("last_incremental")
    if logger:
        logger(f"last_incremental: {last_incremental}")

    collector = MavenNexusCollector(last_incremental=last_incremental)
    pending_files = []
    batch_number = 1

    def push_pending():
        # Commit and push the files gathered so far as the current batch.
        federatedcode.commit_and_push_changes(
            commit_message=commit_message(batch_number),
            repo=data_repo,
            files_to_commit=pending_files,
            logger=logger,
        )

    # Download and iterate through the maven nexus index
    for current_purl, package in collector.get_packages():
        # Append the package PURL to its file in the data repo
        purl_file = pipes.write_packageurls_to_file(
            repo=data_repo,
            base_dir=hashid.get_package_base_dir(purl=current_purl),
            packageurls=[package.purl],
            append=True,
        )
        if purl_file not in pending_files:
            pending_files.append(purl_file)

        if len(pending_files) != files_per_commit:
            continue

        push_pending()
        pending_files.clear()
        batch_number += 1

    # Push whatever is left over from the last, incomplete batch
    if pending_files:
        push_pending()

    # Update last_incremental so we can pick up from the proper place next time
    new_checkpoint = {
        "last_incremental": collector.index_properties.get("nexus.index.last-incremental"),
    }
    if logger:
        logger(f"checkpoint: {new_checkpoint}")
    pipes.update_checkpoints_in_github(
        checkpoint=new_checkpoint,
        cloned_repo=config_repo,
        path=MAVEN_CHECKPOINT_PATH,
    )

    return [data_repo, config_repo]
0 commit comments