3737from minecode_pipelines .pipes import write_packages_json
3838from minecode_pipelines .pipes import compress_packages_file
3939from minecode_pipelines .pipes import decompress_packages_file
40+ from minecode_pipelines .pipes import fetch_checkpoint_by_git
4041
4142
4243from minecode_pipelines .miners .npm import get_npm_packages
6263PACKAGE_FILE_NAME = "NPMPackages.json"
6364COMPRESSED_PACKAGE_FILE_NAME = "NPMPackages.json.gz"
6465NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + PACKAGE_FILE_NAME
65- COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + COMPRESSED_PACKAGE_FILE_NAME
66+ COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + COMPRESSED_PACKAGE_FILE_NAME
6667NPM_CHECKPOINT_PATH = "npm/checkpoints.json"
6768NPM_PACKAGES_CHECKPOINT_PATH = "npm/packages_checkpoint.json"
6869
6970# We are testing and storing mined packageURLs in one single repo per ecosystem for now
7071MINECODE_DATA_NPM_REPO = "https://github.com/aboutcode-data/minecode-data-npm-test"
7172
7273
73- PACKAGE_BATCH_SIZE = 1000
74+ PACKAGE_BATCH_SIZE = 700
7475
7576
7677def mine_npm_packages (logger = None ):
@@ -139,15 +140,13 @@ def mine_npm_packages(logger=None):
139140 settings_path = NPM_CHECKPOINT_PATH ,
140141 )
141142
142- compressed_packages_file = get_packages_file_from_checkpoint (
143- config_repo = MINECODE_PIPELINES_CONFIG_REPO ,
143+ compressed_packages_file = fetch_checkpoint_by_git (
144+ cloned_repo = cloned_repo ,
144145 checkpoint_path = COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH ,
145- name = COMPRESSED_PACKAGE_FILE_NAME ,
146146 )
147- packages_file = compressed_packages_file .replace (".gz" , "" )
148- decompress_packages_file (
149- packages_file = packages_file ,
147+ packages_file = decompress_packages_file (
150148 compressed_packages_file = compressed_packages_file ,
149+ name = PACKAGE_FILE_NAME ,
151150 )
152151
153152 elif state == PERIODIC_SYNC_STATE :
@@ -311,11 +310,12 @@ def mine_and_publish_npm_packageurls(packages_file, state, last_seq, logger=None
311310 # we need to update mined packages checkpoint for every batch
312311 # so we can continue mining the other packages after restarting
313312 if logger :
314- logger ("Checkpointing processed packages to: {NPM_PACKAGES_CHECKPOINT_PATH}" )
313+ logger (f "Checkpointing processed packages to: { NPM_PACKAGES_CHECKPOINT_PATH } " )
315314
316315 packages_checkpoint = packages_mined + synced_packages
317316 update_mined_packages_in_checkpoint (
318317 packages = packages_checkpoint ,
318+ config_repo = MINECODE_PIPELINES_CONFIG_REPO ,
319319 cloned_repo = cloned_config_repo ,
320320 checkpoint_path = NPM_PACKAGES_CHECKPOINT_PATH ,
321321 )
@@ -328,6 +328,7 @@ def mine_and_publish_npm_packageurls(packages_file, state, last_seq, logger=None
328328 update_checkpoint_state (
329329 cloned_repo = cloned_config_repo ,
330330 state = PERIODIC_SYNC_STATE ,
331+ checkpoint_path = NPM_CHECKPOINT_PATH ,
331332 )
332333
333334 # If we are finished mining all the packages in the periodic sync, we can now update
0 commit comments