Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion minecode_pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@
#


VERSION = "0.0.1b60"
VERSION = "0.1.0"
156 changes: 156 additions & 0 deletions minecode_pipelines/miners/npm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#


import json
import requests

from packageurl import PackageURL


"""
Visitors for Npmjs and npmjs-like javascript package repositories.

We have this hierarchy in npm replicate and registry index:
npm projects replicate.npmjs.com (paginated JSON) -> versions at registry.npmjs.org (JSON) -> download urls

See https://github.com/orgs/community/discussions/152515 for information on
the latest replicate.npmjs.com API.

https://replicate.npmjs.com/_all_docs
This NPMJS replicate API serves as an index to get all npm packages and their revision IDs
in paginated queries.

https://replicate.npmjs.com/_changes
This NPMJS replicate API serves as a CHANGELOG of npm packages with update sequences which
can be fetched in paginated queries.

https://registry.npmjs.org/{namespace/name}
For each npm package, a JSON containing details including the list of all releases
and archives, their URLs, and some metadata for each release.

https://registry.npmjs.org/{namespace/name}/{version}
For each release, a JSON contains details for the released version and all the
downloads available for this release.
"""


# Base URL of the npm replicate API; "_all_docs" and "_changes" are appended to it below.
NPM_REPLICATE_REPO = "https://replicate.npmjs.com/"
# Base URL of the npm registry; a package name is appended to fetch its versions.
NPM_REGISTRY_REPO = "https://registry.npmjs.org/"
# PackageURL "type" used for every purl built in this module.
NPM_TYPE = "npm"
# Value sent as the "limit" query parameter of each paginated replicate request.
NPM_REPLICATE_BATCH_SIZE = 10000


def get_package_names_last_key(package_data):
    """
    Return a ``(names, last_key)`` tuple extracted from one page of results.

    ``package_data`` is a mapping with a "rows" list of mappings. ``names`` is
    the list of the "id" value of every row, and ``last_key`` is the "key"
    value of the last row.

    An empty or missing "rows" entry yields ``([], None)`` instead of raising,
    so that callers can detect that there is nothing left to paginate.
    """
    rows = package_data.get("rows") or []
    names = [row.get("id") for row in rows]
    last_key = rows[-1].get("key") if rows else None
    return names, last_key


def get_package_names_last_seq(package_data):
    """
    Return a ``(names, last_seq)`` tuple extracted from one page of changes.

    ``package_data`` is a mapping with a "results" list of mappings and a
    "last_seq" value. ``names`` is the list of the "id" value of every result,
    and ``last_seq`` is returned as-is (None when the key is absent).

    An empty or missing "results" entry yields an empty ``names`` list instead
    of raising.
    """
    results = package_data.get("results") or []
    names = [change.get("id") for change in results]
    return names, package_data.get("last_seq")


def get_current_last_seq(replicate_url=NPM_REPLICATE_REPO):
    """
    Return the "last_seq" value reported by the ``_changes`` endpoint of
    ``replicate_url`` when queried in descending order.

    Return None when the HTTP request is not successful.
    """
    latest_changes_url = replicate_url + "_changes?descending=True"
    reply = requests.get(latest_changes_url)
    if reply.ok:
        # Only the sequence is needed here; the package names are discarded.
        _names, current_seq = get_package_names_last_seq(reply.json())
        return current_seq
    return None


def get_updated_npm_packages(last_seq, replicate_url=NPM_REPLICATE_REPO, logger=None):
    """
    Return a ``({"packages": names}, last_seq)`` tuple with the names of the
    packages listed in the ``_changes`` endpoint of ``replicate_url`` after the
    ``last_seq`` sequence.

    Changes are fetched in pages of NPM_REPLICATE_BATCH_SIZE entries, each page
    starting after the "last_seq" reported by the previous one. The returned
    ``last_seq`` is the one reported by the last page fetched successfully, or
    the ``last_seq`` argument when no page could be fetched.

    When a request fails, pagination stops and the names collected so far are
    returned with the same tuple shape as on success, so callers can always
    unpack the result and resume later from the returned sequence.

    ``logger`` is an optional callable receiving progress messages.
    """
    all_package_names = []
    i = 0

    while True:
        if logger:
            logger(f"Processing iteration: {i}: changes after seq: {last_seq}")

        npm_replicate_changes = (
            replicate_url + "_changes?" + f"limit={NPM_REPLICATE_BATCH_SIZE}" + f"&since={last_seq}"
        )
        response = requests.get(npm_replicate_changes)
        if not response.ok:
            # Keep what was fetched so far and the sequence it corresponds to.
            break

        package_data = response.json()
        package_names, new_last_seq = get_package_names_last_seq(package_data)
        all_package_names.extend(package_names)

        if new_last_seq is None:
            # Without a new sequence the next query would be "since=None":
            # stop here and keep the last known good sequence.
            break
        last_seq = new_last_seq

        # We have fetched the last set of changes if True
        if len(package_names) < NPM_REPLICATE_BATCH_SIZE:
            break

        i += 1

    return {"packages": all_package_names}, last_seq


def get_npm_packages(replicate_url=NPM_REPLICATE_REPO, logger=None):
    """
    Return a ``{"packages": names}`` mapping with the "id" of every row listed
    by the ``_all_docs`` endpoint of ``replicate_url``.

    Rows are fetched in pages of NPM_REPLICATE_BATCH_SIZE entries. Each page
    after the first one is requested with ``start_key`` set to the "key" of
    the last row of the previous page. Pagination stops on the first page
    that is not full or that brings no new row.

    If the very first request fails, ``{"packages": []}`` is returned so the
    return shape is the same in every case.

    Raise an Exception with the URL and response text when any later page
    request fails.

    ``logger`` is an optional callable receiving progress messages.
    """
    all_package_names = []

    npm_replicate_all = replicate_url + "_all_docs?" + f"limit={NPM_REPLICATE_BATCH_SIZE}"
    response = requests.get(npm_replicate_all)
    if not response.ok:
        return {"packages": all_package_names}

    rows = response.json().get("rows") or []
    all_package_names.extend(row.get("id") for row in rows)

    i = 0
    # A page shorter than the batch size means the index is exhausted.
    while len(rows) >= NPM_REPLICATE_BATCH_SIZE:
        last_key = rows[-1].get("key")
        npm_replicate_from_id = npm_replicate_all + f'&start_key="{last_key}"'
        if logger:
            logger(f"Processing iteration: {i}: {npm_replicate_from_id}")

        response = requests.get(npm_replicate_from_id)
        if not response.ok:
            raise Exception(npm_replicate_from_id, response.text)

        rows = response.json().get("rows") or []
        # The start key is inclusive: when the first row is the row we started
        # from, it was already collected with the previous page, so skip it.
        if rows and rows[0].get("key") == last_key:
            new_rows = rows[1:]
        else:
            new_rows = rows
        if not new_rows:
            # Nothing new came back: stop rather than request the same page again.
            break

        all_package_names.extend(row.get("id") for row in new_rows)
        i += 1

    return {"packages": all_package_names}


def get_npm_packageurls(name, npm_repo=NPM_REGISTRY_REPO):
    """
    Return a list of PackageURL strings, one for each version of the npm
    package ``name`` listed in the "versions" mapping of its ``npm_repo``
    registry document.

    ``name`` is the package name as used by the registry, either ``name`` or
    ``@scope/name``. For a scoped name, the ``@scope`` segment becomes the purl
    namespace and the remainder becomes the purl name, as the Package URL npm
    type definition requires.

    Return an empty list when the request is not successful or when the
    document has no "versions" (for instance for an unpublished package).
    """
    packageurls = []

    project_index_api_url = npm_repo + name
    response = requests.get(project_index_api_url)
    if not response.ok:
        return packageurls

    project_data = response.json()

    # "@scope/name" -> ("@scope", "name"); "name" -> ("", "name")
    namespace, _, package_name = name.rpartition("/")

    # "versions" can be absent: fall back to an empty mapping rather than
    # failing on iterating None.
    for version in project_data.get("versions") or {}:
        purl = PackageURL(
            type=NPM_TYPE,
            namespace=namespace or None,
            name=package_name,
            version=version,
        )
        packageurls.append(purl.to_string())

    return packageurls


def load_npm_packages(packages_file):
    """
    Return the list stored under the "packages" key of the JSON file at
    ``packages_file``, or an empty list when the key is absent.

    The file is decoded as UTF-8, the standard JSON encoding, rather than with
    the platform default encoding which varies across systems.
    """
    with open(packages_file, encoding="utf-8") as f:
        packages_data = json.load(f)

    return packages_data.get("packages", [])
9 changes: 0 additions & 9 deletions minecode_pipelines/miners/pypi.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@

from packageurl import PackageURL

from minecode_pipelines.utils import get_temp_file
from minecode_pipelines.pipes import write_data_to_json_file

"""
Visitors for Pypi and Pypi-like Python package repositories.

Expand Down Expand Up @@ -52,12 +49,6 @@ def get_pypi_packages(pypi_repo, logger=None):
return response.json()


def write_packages_json(packages, name):
    """
    Write ``packages`` as JSON data to the temp file obtained from
    ``get_temp_file(name)`` and return that temp file.
    """
    json_location = get_temp_file(name)
    write_data_to_json_file(data=packages, path=json_location)
    return json_location


def get_pypi_packageurls(name):
packageurls = []

Expand Down
2 changes: 0 additions & 2 deletions minecode_pipelines/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,6 @@ def _mine_and_publish_packageurls(
)
checkout["file_to_commit"].add(purl_file)
checkout["file_processed_count"] += 1
if logger:
logger(f"{checkout['repo'].working_dir}: {checkout['file_processed_count']} / {batch_size}")

if len(checkout["file_to_commit"]) > batch_size:
if logger:
Expand Down
107 changes: 107 additions & 0 deletions minecode_pipelines/pipelines/mine_npm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from minecode_pipelines.pipes import npm
from minecode_pipelines.pipelines import MineCodeBasePipeline
from minecode_pipelines.pipelines import _mine_and_publish_packageurls


class MineNPM(MineCodeBasePipeline):
    """
    Mine the packageURLs of the packages of an npm index and publish them
    to a FederatedCode repo.
    """

    # Passed as ``batch_size`` to ``_mine_and_publish_packageurls``.
    package_batch_size = 5

    @classmethod
    def steps(cls):
        # Steps are returned in the order they are listed here.
        return (
            cls.check_federatedcode_eligibility,
            cls.create_federatedcode_working_dir,
            cls.mine_npm_packages,
            cls.get_npm_packages_to_sync,
            cls.fetch_federation_config,
            cls.mine_and_publish_packageurls,
            cls.update_state_and_checkpoints,
            cls.delete_working_dir,
        )

    def mine_npm_packages(self):
        """Mine npm package names from the npm indexes or from a checkpoint."""
        mined = npm.mine_npm_packages(logger=self.log)
        self.npm_packages, self.state, self.last_seq, self.config_repo = mined

    def get_npm_packages_to_sync(self):
        """Get the npm packages which need to be synced, using the checkpoint."""
        to_sync, already_synced = npm.get_npm_packages_to_sync(
            packages_file=self.npm_packages,
            state=self.state,
            logger=self.log,
        )
        self.packages = to_sync
        self.synced_packages = already_synced

    def packages_count(self):
        """Return the number of packages selected for syncing."""
        return len(self.packages)

    def mine_packageurls(self):
        """Yield npm packageURLs for all the npm package names to sync."""
        self.packages_mined = []
        packageurls = npm.mine_and_publish_npm_packageurls(
            packages_to_sync=self.packages,
            packages_mined=self.packages_mined,
            logger=self.log,
        )
        yield from packageurls

    def save_check_point(self):
        """Save the packages mined so far in the checkpoint, then reset them."""
        npm.save_mined_packages_in_checkpoint(
            packages_mined=self.packages_mined,
            synced_packages=self.synced_packages,
            config_repo=self.config_repo,
            logger=self.log,
        )
        # NOTE(review): this rebinds the attribute to a new list; the list
        # passed earlier to npm.mine_and_publish_npm_packageurls() is a
        # different object after this point -- confirm this is intended.
        self.packages_mined = []

    def mine_and_publish_packageurls(self):
        """Mine and publish PackageURLs."""
        packageurls = self.mine_packageurls()
        package_count = self.packages_count()

        _mine_and_publish_packageurls(
            packageurls=packageurls,
            total_package_count=package_count,
            data_cluster=self.data_cluster,
            checked_out_repos=self.checked_out_repos,
            working_path=self.working_path,
            append_purls=self.append_purls,
            commit_msg_func=self.commit_message,
            logger=self.log,
            checkpoint_func=self.save_check_point,
            checkpoint_on_commit=True,
            batch_size=self.package_batch_size,
        )

    def update_state_and_checkpoints(self):
        """Update the state and checkpoints using the last mined sequence."""
        npm.update_state_and_checkpoints(
            state=self.state,
            last_seq=self.last_seq,
            config_repo=self.config_repo,
            logger=self.log,
        )
1 change: 1 addition & 0 deletions minecode_pipelines/pipelines/mine_pypi.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from minecode_pipelines.pipelines import MineCodeBasePipeline
from minecode_pipelines.pipelines import _mine_and_publish_packageurls


class MinePypi(MineCodeBasePipeline):
"""
Mine all packageURLs from a pypi index and publish them to
Expand Down
Loading