From 353dd31e42e7f2c13525e00fdd22a07b7a7a1161 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 23 Sep 2025 19:53:24 +0530 Subject: [PATCH 01/11] Add support to mine npm PackageURLs Reference: https://github.com/aboutcode-org/purldb/issues/661 Signed-off-by: Ayan Sinha Mahapatra --- minecode_pipelines/miners/npm.py | 153 ++++++++++ minecode_pipelines/miners/pypi.py | 9 - minecode_pipelines/notebooks/pypi.json | 4 + minecode_pipelines/pipelines/mine_npm.py | 66 +++++ minecode_pipelines/pipes/__init__.py | 34 +++ minecode_pipelines/pipes/npm.py | 339 +++++++++++++++++++++++ minecode_pipelines/pipes/pypi.py | 53 ++-- 7 files changed, 613 insertions(+), 45 deletions(-) create mode 100644 minecode_pipelines/miners/npm.py create mode 100644 minecode_pipelines/notebooks/pypi.json create mode 100644 minecode_pipelines/pipelines/mine_npm.py create mode 100644 minecode_pipelines/pipes/npm.py diff --git a/minecode_pipelines/miners/npm.py b/minecode_pipelines/miners/npm.py new file mode 100644 index 00000000..e393de8e --- /dev/null +++ b/minecode_pipelines/miners/npm.py @@ -0,0 +1,153 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + + +import json +import requests + +from packageurl import PackageURL + + +""" +Visitors for Npmjs and npmjs-like javascript package repositories. + +We have this hierarchy in npm replicate and registry index: + npm projects replicate.npmjs.com (paginated JSON) -> versions at registry.npmjs.org (JSON) -> download urls + +See https://github.com/orgs/community/discussions/152515 for information on +the latest replicate.npmjs.com API. + +https://replicate.npmjs.com/_all_docs +This NPMJS replicate API serves as an index to get all npm packages and their revision IDs +in paginated queries. + +https://replicate.npmjs.com/_changes +This NPMJS replicate API serves as a CHANGELOG of npm packages with update sequneces which +can be fetched in paginated queries. + +https://registry.npmjs.org/{namespace/name} +For each npm package, a JSON containing details including the list of all releases +and archives, their URLs, and some metadata for each release. + +https://registry.npmjs.org/{namespace/name}/{version} +For each release, a JSON contains details for the released version and all the +downloads available for this release. +""" + + +NPM_REPLICATE_REPO = "https://replicate.npmjs.com/" +NPM_REGISTRY_REPO = "https://registry.npmjs.org/" +NPM_TYPE = "NPM" +NPM_REPLICATE_BATCH_SIZE = 10000 + + +def get_package_names_last_key(package_data): + names = [package.get("id") for package in package_data.get("rows")] + last_key = package_data.get("rows")[-1].get("key") + return names, last_key + + +def get_package_names_last_seq(package_data): + names = [package.get("id") for package in package_data.get("results")] + last_seq = package_data.get("last_seq") + return names, last_seq + + +def get_current_last_seq(replicate_url=NPM_REPLICATE_REPO): + npm_replicate_latest_changes = replicate_url + "_changes?descending=True" + response = requests.get(npm_replicate_latest_changes) + if not response.ok: + return + + package_data = response.json() + _package_names, last_seq = get_package_names_last_seq(package_data) + return last_seq + + +def get_updated_npm_packages(last_seq, replicate_url=NPM_REPLICATE_REPO): + all_package_names = [] + i = 0 + + while True: + print(f"Processing iteration: {i}: changes after seq: {last_seq}") + npm_replicate_changes = ( + replicate_url + "_changes?" + f"limit={NPM_REPLICATE_BATCH_SIZE}" + f"&since={last_seq}" + ) + response = requests.get(npm_replicate_changes) + if not response.ok: + return all_package_names + + package_data = response.json() + package_names, last_seq = get_package_names_last_seq(package_data) + all_package_names.extend(package_names) + + # We have fetched the last set of changes if True + if len(package_names) < NPM_REPLICATE_BATCH_SIZE: + break + + i += 1 + + return {"packages": all_package_names}, last_seq + + +def get_npm_packages(replicate_url=NPM_REPLICATE_REPO): + all_package_names = [] + + npm_replicate_all = replicate_url + "_all_docs?" + f"limit={NPM_REPLICATE_BATCH_SIZE}" + response = requests.get(npm_replicate_all) + if not response.ok: + return all_package_names + + package_data = response.json() + package_names, last_key = get_package_names_last_key(package_data) + all_package_names.append(package_names) + + total_rows = package_data.get("total_rows") + iterations = int(total_rows / NPM_REPLICATE_BATCH_SIZE) + 1 + + for i in range(iterations): + npm_replicate_from_id = npm_replicate_all + f'&start_key="{last_key}"' + print(f"Processing iteration: {i}: {npm_replicate_from_id}") + + response = requests.get(npm_replicate_from_id) + if not response.ok: + raise Exception(npm_replicate_from_id, response.text) + + package_data = response.json() + package_names, last_key = get_package_names_last_key(package_data) + all_package_names.append(package_names) + + return {"packages": all_package_names} + + +def get_npm_packageurls(name, npm_repo=NPM_REGISTRY_REPO): + packageurls = [] + + project_index_api_url = npm_repo + name + response = requests.get(project_index_api_url) + if not response.ok: + return packageurls + + project_data = response.json() + for version in project_data.get("versions"): + purl = PackageURL( + type=NPM_TYPE, + name=name, + version=version, + ) + packageurls.append(purl.to_string()) + + return packageurls + + +def load_npm_packages(packages_file): + with open(packages_file) as f: + packages_data = json.load(f) + + return packages_data.get("packages", []) diff --git a/minecode_pipelines/miners/pypi.py b/minecode_pipelines/miners/pypi.py index be81a515..680cfa1e 100644 --- a/minecode_pipelines/miners/pypi.py +++ b/minecode_pipelines/miners/pypi.py @@ -13,9 +13,6 @@ from packageurl import PackageURL -from minecode_pipelines.utils import get_temp_file -from minecode_pipelines.pipes import write_data_to_json_file - """ Visitors for Pypi and Pypi-like Python package repositories. @@ -52,12 +49,6 @@ def get_pypi_packages(pypi_repo, logger=None): return response.json() -def write_packages_json(packages, name): - temp_file = get_temp_file(name) - write_data_to_json_file(path=temp_file, data=packages) - return temp_file - - def get_pypi_packageurls(name): packageurls = [] diff --git a/minecode_pipelines/notebooks/pypi.json b/minecode_pipelines/notebooks/pypi.json new file mode 100644 index 00000000..2c87b538 --- /dev/null +++ b/minecode_pipelines/notebooks/pypi.json @@ -0,0 +1,4 @@ +{ + "last_serial": 0, + "date": null +} \ No newline at end of file diff --git a/minecode_pipelines/pipelines/mine_npm.py b/minecode_pipelines/pipelines/mine_npm.py new file mode 100644 index 00000000..c8f99e4e --- /dev/null +++ b/minecode_pipelines/pipelines/mine_npm.py @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + +from scanpipe.pipelines import Pipeline +from scanpipe.pipes import federatedcode + +from minecode_pipelines.pipes import npm +from minecode_pipelines import pipes + + +class MineandPublishNPMPURLs(Pipeline): + """ + Mine all packageURLs from a npm index and publish them to + a FederatedCode repo. + """ + + @classmethod + def steps(cls): + return ( + cls.check_federatedcode_eligibility, + cls.mine_npm_packages, + cls.mine_and_publish_npm_packageurls, + cls.delete_cloned_repos, + ) + + def check_federatedcode_eligibility(self): + """ + Check if the project fulfills the following criteria for + pushing the project result to FederatedCode. + """ + federatedcode.check_federatedcode_configured_and_available(logger=self.log) + + def mine_npm_packages(self): + """Mine npm package names from npm indexes or checkpoint.""" + self.npm_packages, self.state, self.last_seq = npm.mine_npm_packages(logger=self.log) + + def mine_and_publish_npm_packageurls(self): + """Get npm packageURLs for all mined npm package names.""" + self.repos = npm.mine_and_publish_npm_packageurls( + packages_file=self.npm_packages, + state=self.state, + last_seq=self.last_seq, + logger=self.log, + ) + + def delete_cloned_repos(self): + pipes.delete_cloned_repos(repos=self.repos, logger=self.log) diff --git a/minecode_pipelines/pipes/__init__.py b/minecode_pipelines/pipes/__init__.py index 595712b8..1abc5f7c 100644 --- a/minecode_pipelines/pipes/__init__.py +++ b/minecode_pipelines/pipes/__init__.py @@ -20,6 +20,8 @@ from scanpipe.pipes.federatedcode import delete_local_clone from scanpipe.pipes.federatedcode import commit_and_push_changes +from minecode_pipelines.utils import get_temp_file + # states: # note: a state is null when mining starts INITIAL_SYNC_STATE = "initial-sync" @@ -29,6 +31,12 @@ MINECODE_PIPELINES_CONFIG_REPO = "https://github.com/aboutcode-data/minecode-pipelines-config/" +def write_packages_json(packages, name): + temp_file = get_temp_file(name) + write_data_to_json_file(path=temp_file, data=packages) + return temp_file + + def fetch_checkpoint_from_github(config_repo, checkpoint_path): repo_name = config_repo.split("github.com")[-1] checkpoints_file = ( @@ -81,6 +89,32 @@ def update_mined_packages_in_checkpoint(packages, config_repo, cloned_repo, chec ) +def update_checkpoint_state( + cloned_repo, + state, + checkpoint_path, + config_repo=MINECODE_PIPELINES_CONFIG_REPO, +): + checkpoint = fetch_checkpoint_from_github( + config_repo=config_repo, + checkpoint_path=checkpoint_path, + ) + checkpoint["state"] = state + update_checkpoints_in_github( + checkpoint=checkpoint, + cloned_repo=cloned_repo, + path=checkpoint_path, + ) + + +def get_packages_file_from_checkpoint(config_repo, checkpoint_path, name): + packages = fetch_checkpoint_from_github( + config_repo=config_repo, + checkpoint_path=checkpoint_path, + ) + return write_packages_json(packages, name=name) + + def write_packageurls_to_file(repo, base_dir, packageurls): purl_file_rel_path = os.path.join(base_dir, PURLS_FILENAME) purl_file_full_path = Path(repo.working_dir) / purl_file_rel_path diff --git a/minecode_pipelines/pipes/npm.py b/minecode_pipelines/pipes/npm.py new file mode 100644 index 00000000..dc74fc29 --- /dev/null +++ b/minecode_pipelines/pipes/npm.py @@ -0,0 +1,339 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + +from datetime import datetime + +from minecode_pipelines import VERSION +from minecode_pipelines.pipes import write_packageurls_to_file +from minecode_pipelines.pipes import fetch_checkpoint_from_github +from minecode_pipelines.pipes import update_checkpoints_in_github +from minecode_pipelines.pipes import get_mined_packages_from_checkpoint +from minecode_pipelines.pipes import update_mined_packages_in_checkpoint +from minecode_pipelines.pipes import get_packages_file_from_checkpoint +from minecode_pipelines.pipes import update_checkpoint_state +from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO +from minecode_pipelines.pipes import INITIAL_SYNC_STATE +from minecode_pipelines.pipes import PERIODIC_SYNC_STATE +from minecode_pipelines.pipes import write_packages_json + + +from minecode_pipelines.miners.npm import get_npm_packages +from minecode_pipelines.miners.npm import get_updated_npm_packages +from minecode_pipelines.miners.npm import get_current_last_seq +from minecode_pipelines.miners.npm import load_npm_packages +from minecode_pipelines.miners.npm import get_npm_packageurls +from minecode_pipelines.miners.npm import NPM_REPLICATE_REPO + +from minecode_pipelines.miners.npm import NPM_TYPE +from minecode_pipelines.utils import grouper + +from packageurl import PackageURL + +from aboutcode.hashid import get_package_base_dir + + +from scanpipe.pipes.federatedcode import clone_repository +from scanpipe.pipes.federatedcode import commit_changes +from scanpipe.pipes.federatedcode import push_changes + + +PACKAGE_FILE_NAME = "NPMPackages.json" +NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + PACKAGE_FILE_NAME +NPM_CHECKPOINT_PATH = "npm/checkpoints.json" +NPM_PACKAGES_CHECKPOINT_PATH = "npm/packages_checkpoint.json" + +# We are testing and storing mined packageURLs in one single repo per ecosystem for now +MINECODE_DATA_NPM_REPO = "https://github.com/aboutcode-data/minecode-data-npm-test" + + +PACKAGE_BATCH_SIZE = 1000 + + +def mine_npm_packages(logger=None): + """ + Mine npm package names from npm replicate index and save to checkpoints, + or get packages from saved checkpoints. We have 3 cases: + + 1. first sync: we get latest set of packages from the "_all_docs" API endpoint + of npm replicate and save this and last sequence of the package to checkpoints. + 2. intial sync: we get packages from checkpoint which we're trying to sync upto + 3. periodic sync: we get latest packages newly released in npm through the + "_changes" API, for a period, from our last mined sequence of package. + """ + + npm_checkpoints = fetch_checkpoint_from_github( + config_repo=MINECODE_PIPELINES_CONFIG_REPO, + checkpoint_path=NPM_CHECKPOINT_PATH, + ) + state = npm_checkpoints.get("state") + if logger: + logger(f"Mining state from checkpoint: {state}") + + cloned_repo = clone_repository(repo_url=MINECODE_PIPELINES_CONFIG_REPO) + + # This is the first time we are syncing from npm replicate + if not state: + last_seq = get_current_last_seq(replicate_url=NPM_REPLICATE_REPO) + if logger: + logger( + f"Starting initial checkpointing of packages from npm replicate till seq: {last_seq}" + ) + + packages = get_npm_packages(replicate_url=NPM_REPLICATE_REPO) + packages_file = write_packages_json( + packages=packages, + name=PACKAGE_FILE_NAME, + ) + update_checkpoints_in_github( + checkpoint=packages, + cloned_repo=cloned_repo, + path=NPM_REPLICATE_CHECKPOINT_PATH, + ) + + if logger: + logger(f"Updating checkpoint mining state to: {INITIAL_SYNC_STATE}") + logger(f"Updating checkpoint mining last_seq to: {last_seq}") + + update_npm_checkpoints( + cloned_repo=cloned_repo, + state=INITIAL_SYNC_STATE, + last_seq=last_seq, + checkpoint_path=NPM_CHECKPOINT_PATH, + ) + + elif state == INITIAL_SYNC_STATE: + if logger: + logger("Getting packages to sync from npm checkpoint") + + last_seq = fetch_last_seq_mined( + config_repo=MINECODE_PIPELINES_CONFIG_REPO, + settings_path=NPM_CHECKPOINT_PATH, + ) + + packages_file = get_packages_file_from_checkpoint( + config_repo=MINECODE_PIPELINES_CONFIG_REPO, + checkpoint_path=NPM_REPLICATE_CHECKPOINT_PATH, + name=PACKAGE_FILE_NAME, + ) + + elif state == PERIODIC_SYNC_STATE: + last_seq = fetch_last_seq_mined( + config_repo=MINECODE_PIPELINES_CONFIG_REPO, + settings_path=NPM_CHECKPOINT_PATH, + ) + if logger: + logger( + f"Getting latest packages from npm replicate index changes after seq: {last_seq}" + ) + + packages, last_seq = get_updated_npm_packages( + last_seq=last_seq, + replicate_url=NPM_REPLICATE_REPO, + ) + packages_file = write_packages_json( + packages=packages, + name=PACKAGE_FILE_NAME, + ) + + return packages_file, state, last_seq + + +def update_npm_checkpoints( + cloned_repo, + checkpoint_path, + state=None, + last_seq=None, + config_repo=MINECODE_PIPELINES_CONFIG_REPO, +): + checkpoint = fetch_checkpoint_from_github( + config_repo=config_repo, + checkpoint_path=checkpoint_path, + ) + if state: + checkpoint["state"] = state + if last_seq: + checkpoint["last_seq"] = last_seq + + checkpoint["date"] = str(datetime.now()) + update_checkpoints_in_github( + checkpoint=checkpoint, + cloned_repo=cloned_repo, + path=checkpoint_path, + ) + + +def fetch_last_seq_mined(config_repo, settings_path): + """ + Fetch "last_seq" for the last mined packages. + + This is a simple JSON in a github repo containing mining checkpoints + with the "last_seq" from the npm replicate index which was mined. Example: + https://github.com/aboutcode-data/minecode-pipelines-config/blob/main/npm/checkpoints.json + """ + checkpoints = fetch_checkpoint_from_github( + config_repo=config_repo, + checkpoint_path=settings_path, + ) + return checkpoints.get("last_seq") + + +def mine_and_publish_npm_packageurls(packages_file, state, last_seq, logger=None): + if logger: + logger(f"Last serial number mined: {last_seq}") + logger(f"Mining state: {state}") + + # this is either from npm replicate or from checkpoints + packages = load_npm_packages(packages_file) + if logger: + logger(f"# of package names fetched from index/checkpoint: {len(packages)}") + + if not packages: + return + + if not state: + packages_to_sync = packages + if logger: + logger(f"Starting package mining for {len(packages_to_sync)} packages") + + elif state == INITIAL_SYNC_STATE or state == PERIODIC_SYNC_STATE: + synced_packages = get_mined_packages_from_checkpoint( + config_repo=MINECODE_PIPELINES_CONFIG_REPO, + checkpoint_path=NPM_PACKAGES_CHECKPOINT_PATH, + ) + packages_to_sync = [package for package in packages if package not in synced_packages] + if logger: + logger( + f"Starting initial package mining for {len(packages_to_sync)} packages from checkpoint" + ) + + # clone repo + cloned_data_repo = clone_repository(repo_url=MINECODE_DATA_NPM_REPO) + cloned_config_repo = clone_repository(repo_url=MINECODE_PIPELINES_CONFIG_REPO) + if logger: + logger(f"{MINECODE_DATA_NPM_REPO} repo cloned at: {cloned_data_repo.working_dir}") + logger(f"{MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {cloned_config_repo.working_dir}") + + for package_batch in grouper(n=PACKAGE_BATCH_SIZE, iterable=packages_to_sync): + packages_mined = [] + purls = [] + purl_files = [] + + if logger: + logger("Starting package mining for a batch of packages") + + for package_name in package_batch: + if not package_name: + continue + + # fetch packageURLs for package + if logger: + logger(f"getting packageURLs for package: {package_name}") + + packageurls = get_npm_packageurls(package_name) + if not packageurls: + if logger: + logger(f"Could not fetch package versions for package: {package_name}") + continue + + # get repo and path for package + base_purl = PackageURL(type=NPM_TYPE, name=package_name).to_string() + package_base_dir = get_package_base_dir(purl=base_purl) + + if logger: + logger(f"writing packageURLs for package: {base_purl} at: {package_base_dir}") + purls_string = " ".join(packageurls) + logger(f"packageURLs: {purls_string}") + + # write packageURLs to file + purl_file = write_packageurls_to_file( + repo=cloned_data_repo, + base_dir=package_base_dir, + packageurls=packageurls, + ) + purl_files.append(purl_file) + purls.append(base_purl) + + packages_mined.append(package_name) + + if logger: + purls_string = " ".join(purls) + logger("Committing and pushing changes for a batch of packages: ") + logger(f"{purls_string}") + + # commit changes + commit_changes( + repo=cloned_data_repo, + files_to_commit=purl_files, + purls=purls, + mine_type="packageURL", + tool_name="pkg:pypi/minecode-pipelines", + tool_version=VERSION, + ) + + # Push changes to remote repository + push_changes(repo=cloned_data_repo) + + # As we are mining the packages to sync with the index, + # we need to update mined packages checkpoint for every batch + # so we can continue mining the other packages after restarting + if logger: + logger("Checkpointing processed packages to: {NPM_PACKAGES_CHECKPOINT_PATH}") + + packages_checkpoint = packages_mined + synced_packages + update_mined_packages_in_checkpoint( + packages=packages_checkpoint, + cloned_repo=cloned_config_repo, + checkpoint_path=NPM_PACKAGES_CHECKPOINT_PATH, + ) + + # If we are finished mining all the packages in the intial sync, we can now + # periodically sync the packages from latest + if state == INITIAL_SYNC_STATE: + if logger: + logger(f"{INITIAL_SYNC_STATE} completed. starting: {PERIODIC_SYNC_STATE}") + update_checkpoint_state( + cloned_repo=cloned_config_repo, + state=PERIODIC_SYNC_STATE, + ) + + # If we are finished mining all the packages in the periodic sync, we can now update + # the last sequence updated + if state == PERIODIC_SYNC_STATE: + if logger: + logger(f"{PERIODIC_SYNC_STATE} completed. Updating last seq to: {last_seq}") + + update_npm_checkpoints( + cloned_repo=cloned_config_repo, + checkpoint_path=NPM_CHECKPOINT_PATH, + state=PERIODIC_SYNC_STATE, + last_seq=last_seq, + ) + + # Refresh mined packages checkpoint + update_checkpoints_in_github( + checkpoint={"packages_mined": []}, + cloned_repo=cloned_config_repo, + path=NPM_PACKAGES_CHECKPOINT_PATH, + ) + + repos_to_clean = [cloned_data_repo, cloned_config_repo] + return repos_to_clean diff --git a/minecode_pipelines/pipes/pypi.py b/minecode_pipelines/pipes/pypi.py index 457a1ab6..e3577782 100644 --- a/minecode_pipelines/pipes/pypi.py +++ b/minecode_pipelines/pipes/pypi.py @@ -28,6 +28,9 @@ from minecode_pipelines.pipes import update_checkpoints_in_github from minecode_pipelines.pipes import get_mined_packages_from_checkpoint from minecode_pipelines.pipes import update_mined_packages_in_checkpoint +from minecode_pipelines.pipes import get_packages_file_from_checkpoint +from minecode_pipelines.pipes import update_checkpoint_state +from minecode_pipelines.pipes import write_packages_json from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO from minecode_pipelines.pipes import INITIAL_SYNC_STATE from minecode_pipelines.pipes import PERIODIC_SYNC_STATE @@ -37,7 +40,7 @@ from minecode_pipelines.miners.pypi import get_pypi_packageurls from minecode_pipelines.miners.pypi import load_pypi_packages from minecode_pipelines.miners.pypi import PYPI_REPO -from minecode_pipelines.miners.pypi import write_packages_json + from minecode_pipelines.miners.pypi import PYPI_TYPE from minecode_pipelines.utils import grouper @@ -114,7 +117,11 @@ def mine_pypi_packages(logger=None): ) if logger: logger(f"Updating checkpoint mining state to: {INITIAL_SYNC_STATE}") - update_checkpoint_state(cloned_repo=cloned_repo, state=INITIAL_SYNC_STATE) + update_checkpoint_state( + cloned_repo=cloned_repo, + state=INITIAL_SYNC_STATE, + checkpoint_path=PYPI_CHECKPOINT_PATH, + ) return packages_file, state @@ -134,25 +141,6 @@ def fetch_last_serial_mined(config_repo, settings_path): return checkpoints.get("last_serial") -def update_checkpoint_state( - cloned_repo, - state, - config_repo=MINECODE_PIPELINES_CONFIG_REPO, - checkpoint_path=PYPI_CHECKPOINT_PATH, -): - checkpoint = fetch_checkpoint_from_github( - config_repo=config_repo, - checkpoint_path=checkpoint_path, - ) - checkpoint["state"] = state - checkpoint["last_updated"] = str(datetime.now()) - update_checkpoints_in_github( - checkpoint=checkpoint, - cloned_repo=cloned_repo, - path=checkpoint_path, - ) - - def update_pypi_checkpoints( last_serial, state, @@ -171,14 +159,6 @@ def update_pypi_checkpoints( ) -def get_packages_file_from_checkpoint(config_repo, checkpoint_path, name): - packages = fetch_checkpoint_from_github( - config_repo=config_repo, - checkpoint_path=checkpoint_path, - ) - return write_packages_json(packages, name=name) - - def mine_and_publish_pypi_packageurls(packages_file, state, logger=None): last_serial_fetched = fetch_last_serial_mined( config_repo=MINECODE_PIPELINES_CONFIG_REPO, @@ -314,7 +294,7 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None): checkpoint_path=PYPI_PACKAGES_CHECKPOINT_PATH, ) - # If we are finshed mining all the packages in the intial sync, we can now + # If we are finished mining all the packages in the intial sync, we can now # periodically sync the packages from latest if state == INITIAL_SYNC_STATE: if logger: @@ -325,12 +305,13 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None): cloned_repo=cloned_config_repo, state=state, ) - # refresh packages checkpoint once to only checkpoint new packages - update_checkpoints_in_github( - checkpoint={"packages_mined": []}, - cloned_repo=cloned_config_repo, - path=PYPI_PACKAGES_CHECKPOINT_PATH, - ) + + # refresh packages checkpoint once to only checkpoint new packages + update_checkpoints_in_github( + checkpoint={"packages_mined": []}, + cloned_repo=cloned_config_repo, + path=PYPI_PACKAGES_CHECKPOINT_PATH, + ) # update last_serial to minecode checkpoints whenever we finish mining # either from checkpoints or from the latest pypi From d9147afbbeaef953cd6f7f327d0b12e5a453966c Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 29 Sep 2025 15:14:01 +0530 Subject: [PATCH 02/11] Fix debian and maven pipeline installation issues Reference: https://github.com/aboutcode-org/purldb/issues/664 Reference: https://github.com/aboutcode-org/purldb/issues/660 Signed-off-by: Ayan Sinha Mahapatra --- pyproject-minecode_pipelines.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject-minecode_pipelines.toml b/pyproject-minecode_pipelines.toml index 5c23e70d..87691820 100644 --- a/pyproject-minecode_pipelines.toml +++ b/pyproject-minecode_pipelines.toml @@ -40,13 +40,16 @@ dependencies = [ "aboutcode.hashid >= 0.2.0", "packageurl_python >= 0.15.6", "scancodeio >= 35.3.0", + "ftputil >= 5.1.0", + "jawa >= 2.2.0", + "arrow >= 1.3.0" ] urls = { Homepage = "https://github.com/aboutcode-org/purldb" } [project.entry-points."scancodeio_pipelines"] mine_pypi = "minecode_pipelines.pipelines.mine_pypi:MinePypi" -mine_maven = "minecode_pipeline.pipelines.mine_maven:MineMaven" +mine_maven = "minecode_pipelines.pipelines.mine_maven:MineMaven" mine_cargo = "minecode_pipelines.pipelines.mine_cargo:MineCargo" mine_debian = "minecode_pipelines.pipelines.mine_debian:MineDebian" From fc71d092c95da357074086d71458b3981d593a3c Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 29 Sep 2025 17:02:55 +0530 Subject: [PATCH 03/11] Add npm mining pipeline in entrypoints Signed-off-by: Ayan Sinha Mahapatra --- minecode_pipelines/pipelines/mine_npm.py | 2 +- pyproject-minecode_pipelines.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/minecode_pipelines/pipelines/mine_npm.py b/minecode_pipelines/pipelines/mine_npm.py index c8f99e4e..6736c4f1 100644 --- a/minecode_pipelines/pipelines/mine_npm.py +++ b/minecode_pipelines/pipelines/mine_npm.py @@ -27,7 +27,7 @@ from minecode_pipelines import pipes -class MineandPublishNPMPURLs(Pipeline): +class MineNPM(Pipeline): """ Mine all packageURLs from a npm index and publish them to a FederatedCode repo. diff --git a/pyproject-minecode_pipelines.toml b/pyproject-minecode_pipelines.toml index 87691820..8025e558 100644 --- a/pyproject-minecode_pipelines.toml +++ b/pyproject-minecode_pipelines.toml @@ -49,6 +49,7 @@ urls = { Homepage = "https://github.com/aboutcode-org/purldb" } [project.entry-points."scancodeio_pipelines"] mine_pypi = "minecode_pipelines.pipelines.mine_pypi:MinePypi" +mine_npm = "minecode_pipelines.pipelines.mine_npm:MineNPM" mine_maven = "minecode_pipelines.pipelines.mine_maven:MineMaven" mine_cargo = "minecode_pipelines.pipelines.mine_cargo:MineCargo" mine_debian = "minecode_pipelines.pipelines.mine_debian:MineDebian" From f2dc10052bcd162a89189250cf4a1651e2aa92d9 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 29 Sep 2025 17:04:40 +0530 Subject: [PATCH 04/11] Update npm packageURL mining pipeline * Fix npm packages JSON file structure * Compress package names JSON to reduce size and allow git push Signed-off-by: Ayan Sinha Mahapatra --- minecode_pipelines/miners/npm.py | 4 ++-- minecode_pipelines/pipes/__init__.py | 25 +++++++++++++++++++++++++ minecode_pipelines/pipes/npm.py | 27 +++++++++++++++++++++------ 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/minecode_pipelines/miners/npm.py b/minecode_pipelines/miners/npm.py index e393de8e..4031a2e4 100644 --- a/minecode_pipelines/miners/npm.py +++ b/minecode_pipelines/miners/npm.py @@ -106,7 +106,7 @@ def get_npm_packages(replicate_url=NPM_REPLICATE_REPO): package_data = response.json() package_names, last_key = get_package_names_last_key(package_data) - all_package_names.append(package_names) + all_package_names.extend(package_names) total_rows = package_data.get("total_rows") iterations = int(total_rows / NPM_REPLICATE_BATCH_SIZE) + 1 @@ -121,7 +121,7 @@ def get_npm_packages(replicate_url=NPM_REPLICATE_REPO): package_data = response.json() package_names, last_key = get_package_names_last_key(package_data) - all_package_names.append(package_names) + all_package_names.extend(package_names) return {"packages": all_package_names} diff --git a/minecode_pipelines/pipes/__init__.py b/minecode_pipelines/pipes/__init__.py index 1abc5f7c..9dcf3842 100644 --- a/minecode_pipelines/pipes/__init__.py +++ b/minecode_pipelines/pipes/__init__.py @@ -7,8 +7,10 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +import gzip import json import os +import shutil from pathlib import Path import requests @@ -31,6 +33,18 @@ MINECODE_PIPELINES_CONFIG_REPO = "https://github.com/aboutcode-data/minecode-pipelines-config/" +def compress_packages_file(packages_file, compressed_packages_file): + with open(packages_file, "rb") as f_in: + with gzip.open(compressed_packages_file, "wb") as f_out: + f_out.writelines(f_in) + + +def decompress_packages_file(packages_file, compressed_packages_file): + with gzip.open(compressed_packages_file, "rb") as f_in: + with open(packages_file, "wb") as f_out: + f_out.writelines(f_in) + + def write_packages_json(packages, name): temp_file = get_temp_file(name) write_data_to_json_file(path=temp_file, data=packages) @@ -68,6 +82,17 @@ def update_checkpoints_in_github(checkpoint, cloned_repo, path): ) +def update_checkpoints_file_in_github(checkpoints_file, cloned_repo, path): + checkpoint_path = os.path.join(cloned_repo.working_dir, path) + shutil.move(checkpoints_file, checkpoint_path) + commit_message = """Update federatedcode purl mining checkpoint""" + commit_and_push_changes( + repo=cloned_repo, + files_to_commit=[checkpoint_path], + commit_message=commit_message, + ) + + def get_mined_packages_from_checkpoint(config_repo, checkpoint_path): checkpoint = fetch_checkpoint_from_github( config_repo=config_repo, diff --git a/minecode_pipelines/pipes/npm.py b/minecode_pipelines/pipes/npm.py index dc74fc29..33ff95c3 100644 --- a/minecode_pipelines/pipes/npm.py +++ b/minecode_pipelines/pipes/npm.py @@ -26,6 +26,7 @@ from minecode_pipelines.pipes import write_packageurls_to_file from minecode_pipelines.pipes import fetch_checkpoint_from_github from minecode_pipelines.pipes import update_checkpoints_in_github +from minecode_pipelines.pipes import update_checkpoints_file_in_github from minecode_pipelines.pipes import get_mined_packages_from_checkpoint from minecode_pipelines.pipes import update_mined_packages_in_checkpoint from minecode_pipelines.pipes import get_packages_file_from_checkpoint @@ -34,6 +35,8 @@ from minecode_pipelines.pipes import INITIAL_SYNC_STATE from minecode_pipelines.pipes import PERIODIC_SYNC_STATE from minecode_pipelines.pipes import write_packages_json +from minecode_pipelines.pipes import compress_packages_file +from minecode_pipelines.pipes import decompress_packages_file from minecode_pipelines.miners.npm import get_npm_packages @@ -57,7 +60,9 @@ PACKAGE_FILE_NAME = "NPMPackages.json" +COMPRESSED_PACKAGE_FILE_NAME = "NPMPackages.json.gz" NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + PACKAGE_FILE_NAME +COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + COMPRESSED_PACKAGE_FILE_NAME NPM_CHECKPOINT_PATH = "npm/checkpoints.json" NPM_PACKAGES_CHECKPOINT_PATH = "npm/packages_checkpoint.json" @@ -103,10 +108,15 @@ def mine_npm_packages(logger=None): packages=packages, name=PACKAGE_FILE_NAME, ) - update_checkpoints_in_github( - checkpoint=packages, + compressed_packages_file = packages_file + ".gz" + compress_packages_file( + packages_file=packages_file, + compressed_packages_file=compressed_packages_file, + ) + update_checkpoints_file_in_github( + checkpoints_file=compressed_packages_file, cloned_repo=cloned_repo, - path=NPM_REPLICATE_CHECKPOINT_PATH, + path=COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH, ) if logger: @@ -129,10 +139,15 @@ def mine_npm_packages(logger=None): settings_path=NPM_CHECKPOINT_PATH, ) - packages_file = get_packages_file_from_checkpoint( + compressed_packages_file = get_packages_file_from_checkpoint( config_repo=MINECODE_PIPELINES_CONFIG_REPO, - checkpoint_path=NPM_REPLICATE_CHECKPOINT_PATH, - name=PACKAGE_FILE_NAME, + checkpoint_path=COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH, + name=COMPRESSED_PACKAGE_FILE_NAME, + ) + packages_file = compressed_packages_file.replace(".gz", "") + decompress_packages_file( + packages_file=packages_file, + compressed_packages_file=compressed_packages_file, ) elif state == PERIODIC_SYNC_STATE: From 478b5e8d15d9ba07d29bfb851629641c3779e417 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 30 Sep 2025 02:48:49 +0530 Subject: [PATCH 05/11] Update npm packageURLs mining Signed-off-by: Ayan Sinha Mahapatra --- minecode_pipelines/pipes/__init__.py | 12 +++++++++++- minecode_pipelines/pipes/npm.py | 19 ++++++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/minecode_pipelines/pipes/__init__.py b/minecode_pipelines/pipes/__init__.py index 9dcf3842..0420543c 100644 --- a/minecode_pipelines/pipes/__init__.py +++ b/minecode_pipelines/pipes/__init__.py @@ -39,11 +39,15 @@ def compress_packages_file(packages_file, compressed_packages_file): f_out.writelines(f_in) -def decompress_packages_file(packages_file, compressed_packages_file): +def decompress_packages_file(compressed_packages_file, name): + + packages_file = get_temp_file(name) with gzip.open(compressed_packages_file, "rb") as f_in: with open(packages_file, "wb") as f_out: f_out.writelines(f_in) + return packages_file + def write_packages_json(packages, name): temp_file = get_temp_file(name) @@ -140,6 +144,12 @@ def get_packages_file_from_checkpoint(config_repo, checkpoint_path, name): return write_packages_json(packages, name=name) +def fetch_checkpoint_by_git(cloned_repo, checkpoint_path): + + cloned_repo.remotes.origin.pull() + return os.path.join(cloned_repo.working_dir, checkpoint_path) + + def write_packageurls_to_file(repo, base_dir, packageurls): purl_file_rel_path = os.path.join(base_dir, PURLS_FILENAME) purl_file_full_path = Path(repo.working_dir) / purl_file_rel_path diff --git a/minecode_pipelines/pipes/npm.py b/minecode_pipelines/pipes/npm.py index 33ff95c3..8e143161 100644 --- a/minecode_pipelines/pipes/npm.py +++ b/minecode_pipelines/pipes/npm.py @@ -37,6 +37,7 @@ from minecode_pipelines.pipes import write_packages_json from minecode_pipelines.pipes import compress_packages_file from minecode_pipelines.pipes import decompress_packages_file +from minecode_pipelines.pipes import fetch_checkpoint_by_git from minecode_pipelines.miners.npm import get_npm_packages @@ -62,7 +63,7 @@ PACKAGE_FILE_NAME = "NPMPackages.json" COMPRESSED_PACKAGE_FILE_NAME = "NPMPackages.json.gz" NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + PACKAGE_FILE_NAME -COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + COMPRESSED_PACKAGE_FILE_NAME +COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + COMPRESSED_PACKAGE_FILE_NAME NPM_CHECKPOINT_PATH = "npm/checkpoints.json" NPM_PACKAGES_CHECKPOINT_PATH = "npm/packages_checkpoint.json" @@ -70,7 +71,7 @@ MINECODE_DATA_NPM_REPO = "https://github.com/aboutcode-data/minecode-data-npm-test" -PACKAGE_BATCH_SIZE = 1000 +PACKAGE_BATCH_SIZE = 700 def mine_npm_packages(logger=None): @@ -139,15 +140,13 @@ def mine_npm_packages(logger=None): settings_path=NPM_CHECKPOINT_PATH, ) - compressed_packages_file = get_packages_file_from_checkpoint( - config_repo=MINECODE_PIPELINES_CONFIG_REPO, + compressed_packages_file = fetch_checkpoint_by_git( + cloned_repo=cloned_repo, checkpoint_path=COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH, - name=COMPRESSED_PACKAGE_FILE_NAME, ) - packages_file = compressed_packages_file.replace(".gz", "") - decompress_packages_file( - packages_file=packages_file, + packages_file = decompress_packages_file( compressed_packages_file=compressed_packages_file, + name=PACKAGE_FILE_NAME, ) elif state == PERIODIC_SYNC_STATE: @@ -311,11 +310,12 @@ def mine_and_publish_npm_packageurls(packages_file, state, last_seq, logger=None # we need to update mined packages checkpoint for every batch # so we can continue mining the other packages after restarting if logger: - logger("Checkpointing processed packages to: {NPM_PACKAGES_CHECKPOINT_PATH}") + logger(f"Checkpointing processed packages to: {NPM_PACKAGES_CHECKPOINT_PATH}") packages_checkpoint = packages_mined + synced_packages update_mined_packages_in_checkpoint( packages=packages_checkpoint, + config_repo=MINECODE_PIPELINES_CONFIG_REPO, cloned_repo=cloned_config_repo, checkpoint_path=NPM_PACKAGES_CHECKPOINT_PATH, ) @@ -328,6 +328,7 @@ def mine_and_publish_npm_packageurls(packages_file, state, last_seq, logger=None update_checkpoint_state( cloned_repo=cloned_config_repo, state=PERIODIC_SYNC_STATE, + checkpoint_path=NPM_CHECKPOINT_PATH, ) # If we are finished mining all the packages in the periodic sync, we can now update From d3245985d454d0f139f3a5db8d061b07e775359a Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 30 Sep 2025 02:55:56 +0530 Subject: [PATCH 06/11] Bump minecode-pipelines to v0.0.1b9 Signed-off-by: Ayan Sinha Mahapatra --- minecode_pipelines/__init__.py | 2 +- pyproject-minecode_pipelines.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/minecode_pipelines/__init__.py b/minecode_pipelines/__init__.py index a9e1dc74..c26e2f1c 100644 --- a/minecode_pipelines/__init__.py +++ b/minecode_pipelines/__init__.py @@ -7,4 +7,4 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -VERSION = "0.0.1b3" +VERSION = "0.0.1b9" diff --git a/pyproject-minecode_pipelines.toml b/pyproject-minecode_pipelines.toml index bfcb2db0..69962a02 100644 --- a/pyproject-minecode_pipelines.toml +++ b/pyproject-minecode_pipelines.toml @@ -4,7 +4,7 @@ build-backend = "flot.buildapi" [project] name = "minecode_pipelines" -version = "0.0.1b3" +version = "0.0.1b9" description = "A library for mining packageURLs and package metadata from ecosystem repositories." readme = "minecode_pipelines/README.rst" license = { text = "Apache-2.0" } @@ -56,7 +56,7 @@ mine_debian = "minecode_pipelines.pipelines.mine_debian:MineDebian" mine_alpine = "minecode_pipelines.pipelines.mine_alpine:MineAlpine" [tool.bumpversion] -current_version = "0.0.1b3" +current_version = "0.0.1b9" allow_dirty = true files = [ From 65ae71f0973493414f0d4852afc85d03891cd02b Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 1 Oct 2025 18:38:48 +0530 Subject: [PATCH 07/11] Bump minecode-pipelines to v0.0.1b17 Signed-off-by: Ayan Sinha Mahapatra --- minecode_pipelines/__init__.py | 2 +- pyproject-minecode_pipelines.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/minecode_pipelines/__init__.py b/minecode_pipelines/__init__.py index 6090e832..d3c4ba87 100644 --- a/minecode_pipelines/__init__.py +++ b/minecode_pipelines/__init__.py @@ -8,4 +8,4 @@ # -VERSION = "0.0.1b15" +VERSION = "0.0.1b17" diff --git a/pyproject-minecode_pipelines.toml b/pyproject-minecode_pipelines.toml index ec27e28d..c58fa243 100644 --- a/pyproject-minecode_pipelines.toml +++ b/pyproject-minecode_pipelines.toml @@ -4,7 +4,7 @@ build-backend = "flot.buildapi" [project] name = "minecode_pipelines" -version = "0.0.1b15" +version = "0.0.1b17" description = "A library for mining packageURLs and package metadata from ecosystem repositories." readme = "minecode_pipelines/README.rst" license = { text = "Apache-2.0" } @@ -58,7 +58,7 @@ mine_alpine = "minecode_pipelines.pipelines.mine_alpine:MineAlpine" mine_conan = "minecode_pipelines.pipelines.mine_conan:MineConan" [tool.bumpversion] -current_version = "0.0.1b15" +current_version = "0.0.1b17" allow_dirty = true files = [ From 3e724c5644c6a6edd3c1dbe7dc940bba3617c127 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Thu, 9 Oct 2025 13:37:21 +0530 Subject: [PATCH 08/11] Bump debian-inspector to fix install failure Signed-off-by: Ayan Sinha Mahapatra --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b6ca857f..1b962ccc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,7 +34,7 @@ crispy-bootstrap3==2024.1 crontab==1.0.4 cryptography==45.0.4 cyclonedx-python-lib==10.2.0 -debian_inspector==31.1.0 +debian_inspector==31.1.1 defusedxml==0.7.1 Deprecated==1.2.18 Django==5.1.11 From 235fb85761683c7f52d9c05731df6773c9551f94 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 10 Dec 2025 21:28:40 +0530 Subject: [PATCH 09/11] Update npm pipeline to MineCodeBasePipeline Reference: https://github.com/aboutcode-org/purldb/issues/798 Signed-off-by: Ayan Sinha Mahapatra --- minecode_pipelines/__init__.py | 2 +- minecode_pipelines/miners/npm.py | 2 +- minecode_pipelines/pipelines/mine_npm.py | 83 ++++++++---- minecode_pipelines/pipelines/mine_pypi.py | 1 + minecode_pipelines/pipes/npm.py | 146 +++++++++------------- pyproject-minecode_pipelines.toml | 6 +- setup.cfg | 2 + 7 files changed, 128 insertions(+), 114 deletions(-) diff --git a/minecode_pipelines/__init__.py b/minecode_pipelines/__init__.py index 974f754a..9c2c7336 100644 --- a/minecode_pipelines/__init__.py +++ b/minecode_pipelines/__init__.py @@ -8,4 +8,4 @@ # -VERSION = "0.0.1b60" +VERSION = "0.0.1b61" diff --git a/minecode_pipelines/miners/npm.py b/minecode_pipelines/miners/npm.py index 4031a2e4..686b2175 100644 --- a/minecode_pipelines/miners/npm.py +++ b/minecode_pipelines/miners/npm.py @@ -43,7 +43,7 @@ NPM_REPLICATE_REPO = "https://replicate.npmjs.com/" NPM_REGISTRY_REPO = "https://registry.npmjs.org/" -NPM_TYPE = "NPM" +NPM_TYPE = "npm" NPM_REPLICATE_BATCH_SIZE = 10000 diff --git a/minecode_pipelines/pipelines/mine_npm.py b/minecode_pipelines/pipelines/mine_npm.py index 6736c4f1..ee203138 100644 --- a/minecode_pipelines/pipelines/mine_npm.py +++ b/minecode_pipelines/pipelines/mine_npm.py @@ -20,47 +20,88 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. -from scanpipe.pipelines import Pipeline -from scanpipe.pipes import federatedcode - from minecode_pipelines.pipes import npm -from minecode_pipelines import pipes +from minecode_pipelines.pipelines import MineCodeBasePipeline +from minecode_pipelines.pipelines import _mine_and_publish_packageurls -class MineNPM(Pipeline): +class MineNPM(MineCodeBasePipeline): """ Mine all packageURLs from a npm index and publish them to a FederatedCode repo. """ + package_batch_size = 70 + @classmethod def steps(cls): return ( cls.check_federatedcode_eligibility, + cls.create_federatedcode_working_dir, cls.mine_npm_packages, - cls.mine_and_publish_npm_packageurls, - cls.delete_cloned_repos, + cls.get_npm_packages_to_sync, + cls.fetch_federation_config, + cls.mine_and_publish_packageurls, + cls.update_state_and_checkpoints, + cls.delete_working_dir, ) - def check_federatedcode_eligibility(self): - """ - Check if the project fulfills the following criteria for - pushing the project result to FederatedCode. - """ - federatedcode.check_federatedcode_configured_and_available(logger=self.log) - def mine_npm_packages(self): """Mine npm package names from npm indexes or checkpoint.""" - self.npm_packages, self.state, self.last_seq = npm.mine_npm_packages(logger=self.log) + ( + self.npm_packages, self.state, self.last_seq, self.config_repo + ) = npm.mine_npm_packages(logger=self.log) - def mine_and_publish_npm_packageurls(self): - """Get npm packageURLs for all mined npm package names.""" - self.repos = npm.mine_and_publish_npm_packageurls( + def get_npm_packages_to_sync(self): + """Get npm packages which needs to be synced using checkpoint.""" + self.packages, self.synced_packages = npm.get_npm_packages_to_sync( packages_file=self.npm_packages, state=self.state, - last_seq=self.last_seq, logger=self.log, ) - def delete_cloned_repos(self): - pipes.delete_cloned_repos(repos=self.repos, logger=self.log) + def packages_count(self): + return len(self.packages) + + def mine_packageurls(self): + """Yield npm packageURLs for all mined npm package names.""" + self.packages_mined = [] + yield from npm.mine_and_publish_npm_packageurls( + packages_to_sync=self.packages, + packages_mined=self.packages_mined, + logger=self.log, + ) + + def save_check_point(self): + npm.save_mined_packages_in_checkpoint( + packages_mined=self.packages_mined, + synced_packages=self.synced_packages, + config_repo=self.config_repo, + logger=self.log, + ) + self.packages_mined = [] + + def mine_and_publish_packageurls(self): + """Mine and publish PackageURLs.""" + + _mine_and_publish_packageurls( + packageurls=self.mine_packageurls(), + total_package_count=self.packages_count(), + data_cluster=self.data_cluster, + checked_out_repos=self.checked_out_repos, + working_path=self.working_path, + append_purls=self.append_purls, + commit_msg_func=self.commit_message, + logger=self.log, + checkpoint_func=self.save_check_point, + checkpoint_on_commit=True, + batch_size=self.package_batch_size, + ) + + def update_state_and_checkpoints(self): + npm.update_state_and_checkpoints( + state=self.state, + last_seq=self.last_seq, + config_repo=self.config_repo, + logger=self.log, + ) diff --git a/minecode_pipelines/pipelines/mine_pypi.py b/minecode_pipelines/pipelines/mine_pypi.py index 4d7faabb..354c084a 100644 --- a/minecode_pipelines/pipelines/mine_pypi.py +++ b/minecode_pipelines/pipelines/mine_pypi.py @@ -24,6 +24,7 @@ from minecode_pipelines.pipelines import MineCodeBasePipeline from minecode_pipelines.pipelines import _mine_and_publish_packageurls + class MinePypi(MineCodeBasePipeline): """ Mine all packageURLs from a pypi index and publish them to diff --git a/minecode_pipelines/pipes/npm.py b/minecode_pipelines/pipes/npm.py index 43b77a66..8b2976d2 100644 --- a/minecode_pipelines/pipes/npm.py +++ b/minecode_pipelines/pipes/npm.py @@ -22,8 +22,6 @@ from datetime import datetime -from minecode_pipelines import VERSION -from minecode_pipelines.pipes import write_packageurls_to_file from minecode_pipelines.pipes import fetch_checkpoint_from_github from minecode_pipelines.pipes import update_checkpoints_in_github from minecode_pipelines.pipes import update_checkpoints_file_in_github @@ -45,18 +43,14 @@ from minecode_pipelines.miners.npm import load_npm_packages from minecode_pipelines.miners.npm import get_npm_packageurls from minecode_pipelines.miners.npm import NPM_REPLICATE_REPO - from minecode_pipelines.miners.npm import NPM_TYPE -from minecode_pipelines.utils import grouper - -from packageurl import PackageURL -from aboutcode.hashid import get_package_base_dir +from minecode_pipelines.utils import get_temp_dir +from packageurl import PackageURL from scanpipe.pipes.federatedcode import clone_repository -from scanpipe.pipes.federatedcode import commit_changes -from scanpipe.pipes.federatedcode import push_changes +from scanpipe.pipes.federatedcode import delete_local_clone PACKAGE_FILE_NAME = "NPMPackages.json" @@ -93,7 +87,11 @@ def mine_npm_packages(logger=None): if logger: logger(f"Mining state from checkpoint: {state}") - cloned_repo = clone_repository(repo_url=MINECODE_PIPELINES_CONFIG_REPO) + config_repo = clone_repository( + repo_url=MINECODE_PIPELINES_CONFIG_REPO, + clone_path=get_temp_dir(), + logger=logger, + ) # This is the first time we are syncing from npm replicate if not state: @@ -115,7 +113,7 @@ def mine_npm_packages(logger=None): ) update_checkpoints_file_in_github( checkpoints_file=compressed_packages_file, - cloned_repo=cloned_repo, + cloned_repo=config_repo, path=COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH, ) @@ -124,10 +122,11 @@ def mine_npm_packages(logger=None): logger(f"Updating checkpoint mining last_seq to: {last_seq}") update_npm_checkpoints( - cloned_repo=cloned_repo, + cloned_repo=config_repo, state=INITIAL_SYNC_STATE, last_seq=last_seq, checkpoint_path=NPM_CHECKPOINT_PATH, + logger=logger, ) elif state == INITIAL_SYNC_STATE: @@ -140,7 +139,7 @@ def mine_npm_packages(logger=None): ) compressed_packages_file = fetch_checkpoint_by_git( - cloned_repo=cloned_repo, + cloned_repo=config_repo, checkpoint_path=COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH, ) packages_file = decompress_packages_file( @@ -167,7 +166,7 @@ def mine_npm_packages(logger=None): name=PACKAGE_FILE_NAME, ) - return packages_file, state, last_seq + return packages_file, state, last_seq, config_repo def update_npm_checkpoints( @@ -176,6 +175,7 @@ def update_npm_checkpoints( state=None, last_seq=None, config_repo=MINECODE_PIPELINES_CONFIG_REPO, + logger=None, ): checkpoint = fetch_checkpoint_from_github( config_repo=config_repo, @@ -191,6 +191,7 @@ def update_npm_checkpoints( checkpoint=checkpoint, cloned_repo=cloned_repo, path=checkpoint_path, + logger=logger, ) @@ -209,9 +210,9 @@ def fetch_last_seq_mined(config_repo, settings_path): return checkpoints.get("last_seq") -def mine_and_publish_npm_packageurls(packages_file, state, last_seq, logger=None): +def get_npm_packages_to_sync(packages_file, state, logger=None): + if logger: - logger(f"Last serial number mined: {last_seq}") logger(f"Mining state: {state}") # this is either from npm replicate or from checkpoints @@ -237,95 +238,61 @@ def mine_and_publish_npm_packageurls(packages_file, state, last_seq, logger=None logger( f"Starting initial package mining for {len(packages_to_sync)} packages from checkpoint" ) + + return packages_to_sync, synced_packages + + +def mine_and_publish_npm_packageurls(packages_to_sync, packages_mined, logger=None): - # clone repo - cloned_data_repo = clone_repository(repo_url=MINECODE_DATA_NPM_REPO) - cloned_config_repo = clone_repository(repo_url=MINECODE_PIPELINES_CONFIG_REPO) if logger: - logger(f"{MINECODE_DATA_NPM_REPO} repo cloned at: {cloned_data_repo.working_dir}") - logger(f"{MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {cloned_config_repo.working_dir}") + logger("Starting package mining for a batch of packages") - for package_batch in grouper(n=PACKAGE_BATCH_SIZE, iterable=packages_to_sync): - packages_mined = [] - purls = [] - purl_files = [] + for package_name in packages_to_sync: + if not package_name: + continue + # fetch packageURLs for package if logger: - logger("Starting package mining for a batch of packages") - - for package_name in package_batch: - if not package_name: - continue + logger(f"getting packageURLs for package: {package_name}") - # fetch packageURLs for package + packageurls = get_npm_packageurls(package_name) + if not packageurls: if logger: - logger(f"getting packageURLs for package: {package_name}") + logger(f"Could not fetch package versions for package: {package_name}") + continue - packageurls = get_npm_packageurls(package_name) - if not packageurls: - if logger: - logger(f"Could not fetch package versions for package: {package_name}") - continue + base_purl = PackageURL(type=NPM_TYPE, name=package_name).to_string() + packages_mined.append(base_purl) - # get repo and path for package - base_purl = PackageURL(type=NPM_TYPE, name=package_name).to_string() - package_base_dir = get_package_base_dir(purl=base_purl) - - if logger: - logger(f"writing packageURLs for package: {base_purl} at: {package_base_dir}") - purls_string = " ".join(packageurls) - logger(f"packageURLs: {purls_string}") - - # write packageURLs to file - purl_file = write_packageurls_to_file( - repo=cloned_data_repo, - base_dir=package_base_dir, - packageurls=packageurls, - ) - purl_files.append(purl_file) - purls.append(base_purl) + yield base_purl, packageurls - packages_mined.append(package_name) - if logger: - purls_string = " ".join(purls) - logger("Committing and pushing changes for a batch of packages: ") - logger(f"{purls_string}") - - # commit changes - commit_changes( - repo=cloned_data_repo, - files_to_commit=purl_files, - purls=purls, - mine_type="packageURL", - tool_name="pkg:pypi/minecode-pipelines", - tool_version=VERSION, - ) +def save_mined_packages_in_checkpoint(packages_mined, synced_packages, config_repo, logger=None): - # Push changes to remote repository - push_changes(repo=cloned_data_repo) + # As we are mining the packages to sync with the index, + # we need to update mined packages checkpoint for every batch + # so we can continue mining the other packages after restarting + if logger: + logger(f"Checkpointing processed packages to: {NPM_PACKAGES_CHECKPOINT_PATH}") - # As we are mining the packages to sync with the index, - # we need to update mined packages checkpoint for every batch - # so we can continue mining the other packages after restarting - if logger: - logger(f"Checkpointing processed packages to: {NPM_PACKAGES_CHECKPOINT_PATH}") + packages_checkpoint = packages_mined + synced_packages + update_mined_packages_in_checkpoint( + packages=packages_checkpoint, + config_repo=MINECODE_PIPELINES_CONFIG_REPO, + cloned_repo=config_repo, + checkpoint_path=NPM_PACKAGES_CHECKPOINT_PATH, + logger=logger, + ) - packages_checkpoint = packages_mined + synced_packages - update_mined_packages_in_checkpoint( - packages=packages_checkpoint, - config_repo=MINECODE_PIPELINES_CONFIG_REPO, - cloned_repo=cloned_config_repo, - checkpoint_path=NPM_PACKAGES_CHECKPOINT_PATH, - ) +def update_state_and_checkpoints(state, last_seq, config_repo, logger=None): # If we are finished mining all the packages in the intial sync, we can now # periodically sync the packages from latest if state == INITIAL_SYNC_STATE: if logger: logger(f"{INITIAL_SYNC_STATE} completed. starting: {PERIODIC_SYNC_STATE}") update_checkpoint_state( - cloned_repo=cloned_config_repo, + cloned_repo=config_repo, state=PERIODIC_SYNC_STATE, checkpoint_path=NPM_CHECKPOINT_PATH, ) @@ -337,18 +304,21 @@ def mine_and_publish_npm_packageurls(packages_file, state, last_seq, logger=None logger(f"{PERIODIC_SYNC_STATE} completed. Updating last seq to: {last_seq}") update_npm_checkpoints( - cloned_repo=cloned_config_repo, + cloned_repo=config_repo, checkpoint_path=NPM_CHECKPOINT_PATH, state=PERIODIC_SYNC_STATE, last_seq=last_seq, + logger=logger, ) # Refresh mined packages checkpoint update_checkpoints_in_github( checkpoint={"packages_mined": []}, - cloned_repo=cloned_config_repo, + cloned_repo=config_repo, path=NPM_PACKAGES_CHECKPOINT_PATH, + logger=logger, ) - repos_to_clean = [cloned_data_repo, cloned_config_repo] - return repos_to_clean + if logger: + logger(f"Deleting local clone at: {config_repo.working_dir}") + delete_local_clone(config_repo) diff --git a/pyproject-minecode_pipelines.toml b/pyproject-minecode_pipelines.toml index 340d63d2..5d086708 100644 --- a/pyproject-minecode_pipelines.toml +++ b/pyproject-minecode_pipelines.toml @@ -4,7 +4,7 @@ build-backend = "flot.buildapi" [project] name = "minecode_pipelines" -version = "0.0.1b60" +version = "0.0.1b61" description = "A library for mining packageURLs and package metadata from ecosystem repositories." readme = "minecode_pipelines/README.rst" license = { text = "Apache-2.0" } @@ -40,7 +40,7 @@ dependencies = [ "aboutcode.hashid >= 0.2.0", "aboutcode.federated >= 0.1.0", "packageurl_python >= 0.15.6", - "scancodeio >= 35.3.0", + "scancodeio >= 36.0.1", "ftputil >= 5.1.0", "jawa >= 2.2.0", "arrow >= 1.3.0", @@ -64,7 +64,7 @@ mine_swift = "minecode_pipelines.pipelines.mine_swift:MineSwift" mine_composer = "minecode_pipelines.pipelines.mine_composer:MineComposer" [tool.bumpversion] -current_version = "0.0.1b60" +current_version = "0.0.1b61" allow_dirty = true files = [ diff --git a/setup.cfg b/setup.cfg index ff538204..c40c03f1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -67,6 +67,8 @@ install_requires = samecode >= 0.5.1 # FederatedCode integration aboutcode.federatedcode >= 0.1.0 + aboutcode.federated >= 0.1.0 + minecode-pipelines setup_requires = setuptools_scm[toml] >= 4 From 16a0d33f576fc77bd8ebaa1060e166e0710c64fa Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 10 Dec 2025 22:45:39 +0530 Subject: [PATCH 10/11] Adjust batch size Signed-off-by: Ayan Sinha Mahapatra --- minecode_pipelines/__init__.py | 2 +- minecode_pipelines/pipelines/__init__.py | 2 -- minecode_pipelines/pipelines/mine_npm.py | 2 +- minecode_pipelines/pipes/npm.py | 2 ++ pyproject-minecode_pipelines.toml | 6 +++--- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/minecode_pipelines/__init__.py b/minecode_pipelines/__init__.py index 9c2c7336..4e68af88 100644 --- a/minecode_pipelines/__init__.py +++ b/minecode_pipelines/__init__.py @@ -8,4 +8,4 @@ # -VERSION = "0.0.1b61" +VERSION = "0.0.1b62" diff --git a/minecode_pipelines/pipelines/__init__.py b/minecode_pipelines/pipelines/__init__.py index 821f6ca3..9421658c 100644 --- a/minecode_pipelines/pipelines/__init__.py +++ b/minecode_pipelines/pipelines/__init__.py @@ -193,8 +193,6 @@ def _mine_and_publish_packageurls( ) checkout["file_to_commit"].add(purl_file) checkout["file_processed_count"] += 1 - if logger: - logger(f"{checkout['repo'].working_dir}: {checkout['file_processed_count']} / {batch_size}") if len(checkout["file_to_commit"]) > batch_size: if logger: diff --git a/minecode_pipelines/pipelines/mine_npm.py b/minecode_pipelines/pipelines/mine_npm.py index ee203138..c1fa8810 100644 --- a/minecode_pipelines/pipelines/mine_npm.py +++ b/minecode_pipelines/pipelines/mine_npm.py @@ -31,7 +31,7 @@ class MineNPM(MineCodeBasePipeline): a FederatedCode repo. """ - package_batch_size = 70 + package_batch_size = 5 @classmethod def steps(cls): diff --git a/minecode_pipelines/pipes/npm.py b/minecode_pipelines/pipes/npm.py index 8b2976d2..282eb9ae 100644 --- a/minecode_pipelines/pipes/npm.py +++ b/minecode_pipelines/pipes/npm.py @@ -228,6 +228,8 @@ def get_npm_packages_to_sync(packages_file, state, logger=None): if logger: logger(f"Starting package mining for {len(packages_to_sync)} packages") + synced_packages = [] + elif state == INITIAL_SYNC_STATE or state == PERIODIC_SYNC_STATE: synced_packages = get_mined_packages_from_checkpoint( config_repo=MINECODE_PIPELINES_CONFIG_REPO, diff --git a/pyproject-minecode_pipelines.toml b/pyproject-minecode_pipelines.toml index 5d086708..5588b7de 100644 --- a/pyproject-minecode_pipelines.toml +++ b/pyproject-minecode_pipelines.toml @@ -4,7 +4,7 @@ build-backend = "flot.buildapi" [project] name = "minecode_pipelines" -version = "0.0.1b61" +version = "0.0.1b62" description = "A library for mining packageURLs and package metadata from ecosystem repositories." readme = "minecode_pipelines/README.rst" license = { text = "Apache-2.0" } @@ -40,7 +40,7 @@ dependencies = [ "aboutcode.hashid >= 0.2.0", "aboutcode.federated >= 0.1.0", "packageurl_python >= 0.15.6", - "scancodeio >= 36.0.1", + "scancodeio >= 35.3.0", "ftputil >= 5.1.0", "jawa >= 2.2.0", "arrow >= 1.3.0", @@ -64,7 +64,7 @@ mine_swift = "minecode_pipelines.pipelines.mine_swift:MineSwift" mine_composer = "minecode_pipelines.pipelines.mine_composer:MineComposer" [tool.bumpversion] -current_version = "0.0.1b61" +current_version = "0.0.1b62" allow_dirty = true files = [ From 91307b4a1e238dabad0cf96418de18c9fa56c6ae Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Thu, 11 Dec 2025 00:47:11 +0530 Subject: [PATCH 11/11] Bump minecode-pipelines version to 0.1.0 Signed-off-by: Ayan Sinha Mahapatra --- minecode_pipelines/__init__.py | 2 +- minecode_pipelines/miners/npm.py | 11 +++++++---- minecode_pipelines/pipes/npm.py | 3 ++- pyproject-minecode_pipelines.toml | 4 ++-- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/minecode_pipelines/__init__.py b/minecode_pipelines/__init__.py index 4e68af88..d3898fbb 100644 --- a/minecode_pipelines/__init__.py +++ b/minecode_pipelines/__init__.py @@ -8,4 +8,4 @@ # -VERSION = "0.0.1b62" +VERSION = "0.1.0" diff --git a/minecode_pipelines/miners/npm.py b/minecode_pipelines/miners/npm.py index 686b2175..8d3c4df8 100644 --- a/minecode_pipelines/miners/npm.py +++ b/minecode_pipelines/miners/npm.py @@ -70,12 +70,14 @@ def get_current_last_seq(replicate_url=NPM_REPLICATE_REPO): return last_seq -def get_updated_npm_packages(last_seq, replicate_url=NPM_REPLICATE_REPO): +def get_updated_npm_packages(last_seq, replicate_url=NPM_REPLICATE_REPO, logger=None): all_package_names = [] i = 0 while True: - print(f"Processing iteration: {i}: changes after seq: {last_seq}") + if logger: + logger(f"Processing iteration: {i}: changes after seq: {last_seq}") + npm_replicate_changes = ( replicate_url + "_changes?" + f"limit={NPM_REPLICATE_BATCH_SIZE}" + f"&since={last_seq}" ) @@ -96,7 +98,7 @@ def get_updated_npm_packages(last_seq, replicate_url=NPM_REPLICATE_REPO): return {"packages": all_package_names}, last_seq -def get_npm_packages(replicate_url=NPM_REPLICATE_REPO): +def get_npm_packages(replicate_url=NPM_REPLICATE_REPO, logger=None): all_package_names = [] npm_replicate_all = replicate_url + "_all_docs?" + f"limit={NPM_REPLICATE_BATCH_SIZE}" @@ -113,7 +115,8 @@ def get_npm_packages(replicate_url=NPM_REPLICATE_REPO): for i in range(iterations): npm_replicate_from_id = npm_replicate_all + f'&start_key="{last_key}"' - print(f"Processing iteration: {i}: {npm_replicate_from_id}") + if logger: + logger(f"Processing iteration: {i}: {npm_replicate_from_id}") response = requests.get(npm_replicate_from_id) if not response.ok: diff --git a/minecode_pipelines/pipes/npm.py b/minecode_pipelines/pipes/npm.py index 282eb9ae..fa462acc 100644 --- a/minecode_pipelines/pipes/npm.py +++ b/minecode_pipelines/pipes/npm.py @@ -101,7 +101,7 @@ def mine_npm_packages(logger=None): f"Starting initial checkpointing of packages from npm replicate till seq: {last_seq}" ) - packages = get_npm_packages(replicate_url=NPM_REPLICATE_REPO) + packages = get_npm_packages(replicate_url=NPM_REPLICATE_REPO, logger=logger) packages_file = write_packages_json( packages=packages, name=PACKAGE_FILE_NAME, @@ -160,6 +160,7 @@ def mine_npm_packages(logger=None): packages, last_seq = get_updated_npm_packages( last_seq=last_seq, replicate_url=NPM_REPLICATE_REPO, + logger=logger, ) packages_file = write_packages_json( packages=packages, diff --git a/pyproject-minecode_pipelines.toml b/pyproject-minecode_pipelines.toml index 5588b7de..7afd063e 100644 --- a/pyproject-minecode_pipelines.toml +++ b/pyproject-minecode_pipelines.toml @@ -4,7 +4,7 @@ build-backend = "flot.buildapi" [project] name = "minecode_pipelines" -version = "0.0.1b62" +version = "0.1.0" description = "A library for mining packageURLs and package metadata from ecosystem repositories." readme = "minecode_pipelines/README.rst" license = { text = "Apache-2.0" } @@ -64,7 +64,7 @@ mine_swift = "minecode_pipelines.pipelines.mine_swift:MineSwift" mine_composer = "minecode_pipelines.pipelines.mine_composer:MineComposer" [tool.bumpversion] -current_version = "0.0.1b62" +current_version = "0.1.0" allow_dirty = true files = [