Skip to content

Commit f2dc100

Browse files
Update npm packageURL mining pipeline
* Fix npm packages JSON file structure
* Compress package names JSON to reduce size and allow git push

Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
1 parent fc71d09 commit f2dc100

3 files changed

Lines changed: 48 additions & 8 deletions

File tree

minecode_pipelines/miners/npm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def get_npm_packages(replicate_url=NPM_REPLICATE_REPO):
106106

107107
package_data = response.json()
108108
package_names, last_key = get_package_names_last_key(package_data)
109-
all_package_names.append(package_names)
109+
all_package_names.extend(package_names)
110110

111111
total_rows = package_data.get("total_rows")
112112
iterations = int(total_rows / NPM_REPLICATE_BATCH_SIZE) + 1
@@ -121,7 +121,7 @@ def get_npm_packages(replicate_url=NPM_REPLICATE_REPO):
121121

122122
package_data = response.json()
123123
package_names, last_key = get_package_names_last_key(package_data)
124-
all_package_names.append(package_names)
124+
all_package_names.extend(package_names)
125125

126126
return {"packages": all_package_names}
127127

minecode_pipelines/pipes/__init__.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10+
import gzip
1011
import json
1112
import os
13+
import shutil
1214
from pathlib import Path
1315

1416
import requests
@@ -31,6 +33,18 @@
3133
MINECODE_PIPELINES_CONFIG_REPO = "https://github.com/aboutcode-data/minecode-pipelines-config/"
3234

3335

36+
def compress_packages_file(packages_file, compressed_packages_file):
    """
    Gzip-compress the file at ``packages_file`` and write the result to
    ``compressed_packages_file``.

    The content is copied as raw bytes in fixed-size chunks, so memory use
    stays bounded even when the whole JSON document is a single line.
    """
    with open(packages_file, "rb") as f_in:
        with gzip.open(compressed_packages_file, "wb") as f_out:
            # Iterating a binary file splits on newline bytes; a one-line
            # JSON dump would be read into memory as one huge "line".
            shutil.copyfileobj(f_in, f_out)
40+
41+
42+
def decompress_packages_file(packages_file, compressed_packages_file):
    """
    Decompress the gzip file at ``compressed_packages_file`` and write the
    raw content to ``packages_file``.

    Note the argument order: the destination ``packages_file`` comes first and
    the gzip source second, mirroring ``compress_packages_file``.
    """
    with gzip.open(compressed_packages_file, "rb") as f_in:
        with open(packages_file, "wb") as f_out:
            # Copy in fixed-size chunks rather than by "line": the
            # decompressed JSON may contain no newline at all.
            shutil.copyfileobj(f_in, f_out)
46+
47+
3448
def write_packages_json(packages, name):
3549
temp_file = get_temp_file(name)
3650
write_data_to_json_file(path=temp_file, data=packages)
@@ -68,6 +82,17 @@ def update_checkpoints_in_github(checkpoint, cloned_repo, path):
6882
)
6983

7084

85+
def update_checkpoints_file_in_github(checkpoints_file, cloned_repo, path):
    """
    Move the local ``checkpoints_file`` to ``path`` inside the working tree of
    ``cloned_repo``, then commit and push that file.

    ``path`` is joined onto ``cloned_repo.working_dir``. The source file no
    longer exists at ``checkpoints_file`` after this call, because it is moved
    rather than copied.
    """
    checkpoint_path = os.path.join(cloned_repo.working_dir, path)

    # shutil.move does not create missing parent directories, so make sure the
    # destination directory exists before the first checkpoint is stored there.
    parent_dir = os.path.dirname(checkpoint_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    shutil.move(checkpoints_file, checkpoint_path)
    commit_and_push_changes(
        repo=cloned_repo,
        files_to_commit=[checkpoint_path],
        commit_message="Update federatedcode purl mining checkpoint",
    )
94+
95+
7196
def get_mined_packages_from_checkpoint(config_repo, checkpoint_path):
7297
checkpoint = fetch_checkpoint_from_github(
7398
config_repo=config_repo,

minecode_pipelines/pipes/npm.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from minecode_pipelines.pipes import write_packageurls_to_file
2727
from minecode_pipelines.pipes import fetch_checkpoint_from_github
2828
from minecode_pipelines.pipes import update_checkpoints_in_github
29+
from minecode_pipelines.pipes import update_checkpoints_file_in_github
2930
from minecode_pipelines.pipes import get_mined_packages_from_checkpoint
3031
from minecode_pipelines.pipes import update_mined_packages_in_checkpoint
3132
from minecode_pipelines.pipes import get_packages_file_from_checkpoint
@@ -34,6 +35,8 @@
3435
from minecode_pipelines.pipes import INITIAL_SYNC_STATE
3536
from minecode_pipelines.pipes import PERIODIC_SYNC_STATE
3637
from minecode_pipelines.pipes import write_packages_json
38+
from minecode_pipelines.pipes import compress_packages_file
39+
from minecode_pipelines.pipes import decompress_packages_file
3740

3841

3942
from minecode_pipelines.miners.npm import get_npm_packages
@@ -57,7 +60,9 @@
5760

5861

5962
PACKAGE_FILE_NAME = "NPMPackages.json"
63+
COMPRESSED_PACKAGE_FILE_NAME = "NPMPackages.json.gz"
6064
NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + PACKAGE_FILE_NAME
65+
COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + COMPRESSED_PACKAGE_FILE_NAME
6166
NPM_CHECKPOINT_PATH = "npm/checkpoints.json"
6267
NPM_PACKAGES_CHECKPOINT_PATH = "npm/packages_checkpoint.json"
6368

@@ -103,10 +108,15 @@ def mine_npm_packages(logger=None):
103108
packages=packages,
104109
name=PACKAGE_FILE_NAME,
105110
)
106-
update_checkpoints_in_github(
107-
checkpoint=packages,
111+
compressed_packages_file = packages_file + ".gz"
112+
compress_packages_file(
113+
packages_file=packages_file,
114+
compressed_packages_file=compressed_packages_file,
115+
)
116+
update_checkpoints_file_in_github(
117+
checkpoints_file=compressed_packages_file,
108118
cloned_repo=cloned_repo,
109-
path=NPM_REPLICATE_CHECKPOINT_PATH,
119+
path=COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH,
110120
)
111121

112122
if logger:
@@ -129,10 +139,15 @@ def mine_npm_packages(logger=None):
129139
settings_path=NPM_CHECKPOINT_PATH,
130140
)
131141

132-
packages_file = get_packages_file_from_checkpoint(
142+
compressed_packages_file = get_packages_file_from_checkpoint(
133143
config_repo=MINECODE_PIPELINES_CONFIG_REPO,
134-
checkpoint_path=NPM_REPLICATE_CHECKPOINT_PATH,
135-
name=PACKAGE_FILE_NAME,
144+
checkpoint_path=COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH,
145+
name=COMPRESSED_PACKAGE_FILE_NAME,
146+
)
147+
packages_file = compressed_packages_file.replace(".gz", "")
148+
decompress_packages_file(
149+
packages_file=packages_file,
150+
compressed_packages_file=compressed_packages_file,
136151
)
137152

138153
elif state == PERIODIC_SYNC_STATE:

0 commit comments

Comments
 (0)