Skip to content

Commit 9edc0ab

Browse files
authored
Refactor mine_cran to use MineCodeBasePipeline for git deployment (#792)
* Refactor mine_cran to use MineCodeBasePipeline for git deployment #775
  Signed-off-by: ziad hany <ziadhany2016@gmail.com>
* Update minecode-pipeline version to 0.0.1b55; rename fetch_cran_db to download_cran_db
  Signed-off-by: ziad hany <ziadhany2016@gmail.com>
* Bump version
  Signed-off-by: ziad hany <ziadhany2016@gmail.com>
* Store cran_db in pipeline working_path; fix mine_cran_packageurls return type; update minecode-pipelines version to 0.0.1b57
  Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent 611a4da commit 9edc0ab

6 files changed

Lines changed: 90 additions & 155 deletions

File tree

minecode_pipelines/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@
88
#
99

1010

11-
VERSION = "0.0.1b54"
11+
VERSION = "0.0.1b57"

minecode_pipelines/miners/cran.py

Lines changed: 0 additions & 66 deletions
This file was deleted.

minecode_pipelines/pipelines/mine_cran.py

Lines changed: 22 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -20,67 +20,42 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
import os
24-
from scanpipe.pipelines import Pipeline
25-
from scanpipe.pipes import federatedcode
26-
27-
from minecode_pipelines import pipes
28-
from minecode_pipelines.miners.cran import fetch_cran_db
23+
import json
24+
from minecode_pipelines.pipelines import MineCodeBasePipeline
2925
from minecode_pipelines.pipes import cran
26+
from minecode_pipelines.pipes.cran import fetch_cran_db
3027

3128

32-
MINECODE_DATA_CRAN_REPO = os.environ.get(
33-
"MINECODE_DATA_CRAN_REPO", "https://github.com/aboutcode-data/minecode-data-cran-test"
34-
)
35-
36-
37-
class MineCran(Pipeline):
38-
"""
39-
Mine all packageURLs from a CRAN R index and publish them to a FederatedCode repo.
40-
"""
29+
class MineCran(MineCodeBasePipeline):
30+
"""Pipeline to mine CRAN R packages and publish them to FederatedCode."""
4131

4232
@classmethod
4333
def steps(cls):
4434
return (
4535
cls.check_federatedcode_eligibility,
46-
cls.setup_federatedcode_cran,
47-
cls.mine_and_publish_cran_packageurls,
48-
cls.cleanup_db_and_repo,
36+
cls.create_federatedcode_working_dir,
37+
cls.fetch_federation_config,
38+
cls.download_cran_db,
39+
cls.mine_and_publish_packageurls,
40+
cls.delete_working_dir,
4941
)
5042

51-
def check_federatedcode_eligibility(self):
43+
def download_cran_db(self):
5244
"""
53-
Check if the project fulfills the following criteria for
54-
pushing the project result to FederatedCode.
45+
Download the full CRAN package database
5546
"""
56-
federatedcode.check_federatedcode_configured_and_available(logger=self.log)
47+
self.db_path = fetch_cran_db(working_path=self.working_path, logger=self.log)
5748

58-
def setup_federatedcode_cran(self):
49+
def packages_count(self):
5950
"""
60-
Clone the FederatedCode CRAN repository and download the CRAN DB JSON file.
51+
Return the count of packages found in the downloaded CRAN JSON database.
6152
"""
62-
self.cloned_data_repo = federatedcode.clone_repository(MINECODE_DATA_CRAN_REPO)
63-
self.db_path = fetch_cran_db()
64-
65-
if self.log:
66-
self.log(
67-
f"{MINECODE_DATA_CRAN_REPO} repo cloned at: {self.cloned_data_repo.working_dir}"
68-
)
53+
if not getattr(self, "db_path", None) or not self.db_path.exists():
54+
return None
6955

70-
def mine_and_publish_cran_packageurls(self):
71-
"""Get cran packageURLs for all mined cran package names."""
72-
cran.mine_and_publish_cran_packageurls(
73-
cloned_data_repo=self.cloned_data_repo, db_path=self.db_path, logger=self.log
74-
)
75-
76-
def cleanup_db_and_repo(self):
77-
self.log(f"Cleaning database file at: {self.db_path}")
78-
os.remove(self.db_path)
56+
with open(self.db_path, encoding="utf-8") as f:
57+
return sum(1 for _ in json.load(f))
7958

80-
self.log(
81-
f"Deleting cloned repo {MINECODE_DATA_CRAN_REPO} from: {self.cloned_data_repo.working_dir}"
82-
)
83-
pipes.delete_cloned_repos(
84-
repos=[self.cloned_data_repo],
85-
logger=self.log,
86-
)
59+
def mine_packageurls(self):
60+
"""Mine Cran PackageURLs from cran package database."""
61+
return cran.mine_cran_packageurls(db_path=self.db_path)

minecode_pipelines/pipes/cran.py

Lines changed: 56 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -20,46 +20,71 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
from aboutcode.hashid import get_package_purls_yml_file_path
24-
from aboutcode.hashid import get_core_purl
25-
from scanpipe.pipes.federatedcode import commit_and_push_changes
26-
from minecode_pipelines.miners.cran import extract_cran_packages
27-
from minecode_pipelines.pipes import write_data_to_yaml_file
28-
from minecode_pipelines.utils import grouper
23+
import json
24+
from pathlib import Path
25+
from typing import Iterable
26+
from typing import Tuple
27+
from typing import List
2928

30-
PACKAGE_BATCH_SIZE = 100
29+
import requests
30+
from packageurl import PackageURL
31+
from aboutcode.hashid import get_core_purl
3132

3233

33-
def mine_and_publish_cran_packageurls(cloned_data_repo, db_path, logger):
34+
def fetch_cran_db(working_path, logger) -> Path:
3435
"""
35-
Extract CRAN packages from the database, write their package URLs (purls) to YAML,
36-
and commit changes in batches to the given cloned repository.
36+
Download the CRAN package database (~250MB JSON) in a memory-efficient way.
37+
Saves it to a file instead of loading everything into memory.
3738
"""
38-
packages_to_sync = list(extract_cran_packages(db_path))
39+
output_path = working_path / "cran_db.json"
40+
logger(f"Target download path: {output_path}")
3941

40-
for package_batch in grouper(n=PACKAGE_BATCH_SIZE, iterable=packages_to_sync):
41-
purl_files = []
42-
base_purls = []
42+
url = "https://crandb.r-pkg.org/-/all"
43+
with requests.get(url, stream=True) as response:
44+
response.raise_for_status()
45+
with output_path.open("wb") as f:
46+
for chunk in response.iter_content(chunk_size=8192):
47+
f.write(chunk)
4348

44-
if logger:
45-
logger(f"Starting package mining for a batch of {PACKAGE_BATCH_SIZE} packages")
49+
return output_path
4650

47-
for updated_purls in package_batch:
48-
if not updated_purls:
49-
continue # skip padded None values or empty
5051

51-
first_purl = updated_purls[0]
52-
base_purl = get_core_purl(first_purl)
53-
purl_yaml_path = cloned_data_repo.working_dir / get_package_purls_yml_file_path(
54-
first_purl
55-
)
56-
write_data_to_yaml_file(path=purl_yaml_path, data=updated_purls)
52+
def mine_cran_packageurls(db_path: Path) -> Iterable[Tuple[str, List[str]]]:
53+
"""
54+
Extract package names and their versions from a CRAN DB JSON file.
55+
Yields a tuple: (base_purl, list_of_purls)
56+
ex:
57+
{
58+
"AATtools": {
59+
"_id": "AATtools",
60+
"_rev": "8-9ebb721d05b946f2b437b49e892c9e8c",
61+
"name": "AATtools",
62+
"versions": {
63+
"0.0.1": {...},
64+
"0.0.2": {...},
65+
"0.0.3": {...}
66+
}
67+
}
68+
"""
69+
if not db_path.exists():
70+
raise FileNotFoundError(f"File not found: {db_path}")
5771

58-
purl_files.append(purl_yaml_path)
59-
base_purls.append(str(base_purl))
72+
with open(db_path, encoding="utf-8") as f:
73+
data = json.load(f)
6074

61-
if purl_files and base_purls:
62-
logger(f"Committing packageURLs: {', '.join(base_purls)}")
63-
commit_and_push_changes(
64-
repo=cloned_data_repo, files_to_commit=purl_files, purls=base_purls, logger=logger
75+
for pkg_name, pkg_data in data.items():
76+
versions = list(pkg_data.get("versions", {}).keys())
77+
purls = []
78+
for version in versions:
79+
purl = PackageURL(
80+
type="cran",
81+
name=pkg_name,
82+
version=version,
6583
)
84+
purls.append(purl.to_string())
85+
86+
base_purl = None
87+
if purls:
88+
first_purl = purls[0]
89+
base_purl = get_core_purl(first_purl)
90+
yield base_purl, purls

minecode_pipelines/tests/pipes/test_cran.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,32 +10,33 @@
1010
import saneyaml
1111
from pathlib import Path
1212
from unittest import TestCase
13-
from minecode_pipelines.miners.cran import extract_cran_packages
1413

14+
from minecode_pipelines.pipes.cran import mine_cran_packageurls
1515

1616
DATA_DIR = Path(__file__).parent.parent / "test_data" / "cran"
1717

18-
1918
class CranPipelineTests(TestCase):
20-
def test_extract_cran_packages_from_testdata(self):
19+
def test_mine_cran_packageurls_from_testdata(self):
2120
"""
22-
Ensure extract_cran_packages correctly parses the CRAN database
21+
Ensure mine_cran_packageurls correctly parses the CRAN database
2322
and produces results identical to the expected YAML files.
2423
"""
2524

2625
db_file = DATA_DIR / "cran_db.json"
27-
results = list(extract_cran_packages(db_file))
26+
results = list(mine_cran_packageurls(db_file))
2827

2928
expected_files = [
3029
DATA_DIR / "expected_abbreviate.yaml",
3130
DATA_DIR / "expected_abc.data.yaml",
3231
DATA_DIR / "expected_abc.yaml",
3332
]
34-
33+
expected_base_purls = ["pkg:cran/abbreviate", "pkg:cran/abc", "pkg:cran/abc.data"]
3534
assert len(results) == len(expected_files)
3635

37-
for result, expected_file in zip(results, expected_files):
36+
for result, expected_base_purl, expected_file in zip(results, expected_base_purls, expected_files):
3837
with open(expected_file, encoding="utf-8") as f:
39-
expected = saneyaml.load(f)
38+
expected_purls = saneyaml.load(f)
4039

41-
assert result == expected
40+
base_purl, purls = result
41+
assert str(base_purl) == expected_base_purl
42+
assert purls == expected_purls

pyproject-minecode_pipelines.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "flot.buildapi"
44

55
[project]
66
name = "minecode_pipelines"
7-
version = "0.0.1b55"
7+
version = "0.0.1b57"
88
description = "A library for mining packageURLs and package metadata from ecosystem repositories."
99
readme = "minecode_pipelines/README.rst"
1010
license = { text = "Apache-2.0" }

0 commit comments

Comments
 (0)