Skip to content

Commit 478b5e8

Browse files
Update npm packageURLs mining
Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
1 parent f2dc100 commit 478b5e8

2 files changed

Lines changed: 21 additions & 10 deletions

File tree

minecode_pipelines/pipes/__init__.py

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,11 +39,15 @@ def compress_packages_file(packages_file, compressed_packages_file):
3939
f_out.writelines(f_in)
4040

4141

42-
def decompress_packages_file(packages_file, compressed_packages_file):
42+
def decompress_packages_file(compressed_packages_file, name):
43+
44+
packages_file = get_temp_file(name)
4345
with gzip.open(compressed_packages_file, "rb") as f_in:
4446
with open(packages_file, "wb") as f_out:
4547
f_out.writelines(f_in)
4648

49+
return packages_file
50+
4751

4852
def write_packages_json(packages, name):
4953
temp_file = get_temp_file(name)
@@ -140,6 +144,12 @@ def get_packages_file_from_checkpoint(config_repo, checkpoint_path, name):
140144
return write_packages_json(packages, name=name)
141145

142146

147+
def fetch_checkpoint_by_git(cloned_repo, checkpoint_path):
148+
149+
cloned_repo.remotes.origin.pull()
150+
return os.path.join(cloned_repo.working_dir, checkpoint_path)
151+
152+
143153
def write_packageurls_to_file(repo, base_dir, packageurls):
144154
purl_file_rel_path = os.path.join(base_dir, PURLS_FILENAME)
145155
purl_file_full_path = Path(repo.working_dir) / purl_file_rel_path

minecode_pipelines/pipes/npm.py

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@
3737
from minecode_pipelines.pipes import write_packages_json
3838
from minecode_pipelines.pipes import compress_packages_file
3939
from minecode_pipelines.pipes import decompress_packages_file
40+
from minecode_pipelines.pipes import fetch_checkpoint_by_git
4041

4142

4243
from minecode_pipelines.miners.npm import get_npm_packages
@@ -62,15 +63,15 @@
6263
PACKAGE_FILE_NAME = "NPMPackages.json"
6364
COMPRESSED_PACKAGE_FILE_NAME = "NPMPackages.json.gz"
6465
NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + PACKAGE_FILE_NAME
65-
COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + COMPRESSED_PACKAGE_FILE_NAME
66+
COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + COMPRESSED_PACKAGE_FILE_NAME
6667
NPM_CHECKPOINT_PATH = "npm/checkpoints.json"
6768
NPM_PACKAGES_CHECKPOINT_PATH = "npm/packages_checkpoint.json"
6869

6970
# We are testing and storing mined packageURLs in one single repo per ecosystem for now
7071
MINECODE_DATA_NPM_REPO = "https://github.com/aboutcode-data/minecode-data-npm-test"
7172

7273

73-
PACKAGE_BATCH_SIZE = 1000
74+
PACKAGE_BATCH_SIZE = 700
7475

7576

7677
def mine_npm_packages(logger=None):
@@ -139,15 +140,13 @@ def mine_npm_packages(logger=None):
139140
settings_path=NPM_CHECKPOINT_PATH,
140141
)
141142

142-
compressed_packages_file = get_packages_file_from_checkpoint(
143-
config_repo=MINECODE_PIPELINES_CONFIG_REPO,
143+
compressed_packages_file = fetch_checkpoint_by_git(
144+
cloned_repo=cloned_repo,
144145
checkpoint_path=COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH,
145-
name=COMPRESSED_PACKAGE_FILE_NAME,
146146
)
147-
packages_file = compressed_packages_file.replace(".gz", "")
148-
decompress_packages_file(
149-
packages_file=packages_file,
147+
packages_file = decompress_packages_file(
150148
compressed_packages_file=compressed_packages_file,
149+
name=PACKAGE_FILE_NAME,
151150
)
152151

153152
elif state == PERIODIC_SYNC_STATE:
@@ -311,11 +310,12 @@ def mine_and_publish_npm_packageurls(packages_file, state, last_seq, logger=None
311310
# we need to update mined packages checkpoint for every batch
312311
# so we can continue mining the other packages after restarting
313312
if logger:
314-
logger("Checkpointing processed packages to: {NPM_PACKAGES_CHECKPOINT_PATH}")
313+
logger(f"Checkpointing processed packages to: {NPM_PACKAGES_CHECKPOINT_PATH}")
315314

316315
packages_checkpoint = packages_mined + synced_packages
317316
update_mined_packages_in_checkpoint(
318317
packages=packages_checkpoint,
318+
config_repo=MINECODE_PIPELINES_CONFIG_REPO,
319319
cloned_repo=cloned_config_repo,
320320
checkpoint_path=NPM_PACKAGES_CHECKPOINT_PATH,
321321
)
@@ -328,6 +328,7 @@ def mine_and_publish_npm_packageurls(packages_file, state, last_seq, logger=None
328328
update_checkpoint_state(
329329
cloned_repo=cloned_config_repo,
330330
state=PERIODIC_SYNC_STATE,
331+
checkpoint_path=NPM_CHECKPOINT_PATH,
331332
)
332333

333334
# If we are finished mining all the packages in the periodic sync, we can now update

0 commit comments

Comments
 (0)