Skip to content

Commit 9edc0ab

Browse files
authored
Refactor mine_cran to use MineCodeBasePipeline for git deployment (#792)
* Refactor mine_cran to use MineCodeBasePipeline for git deployment #775
  Signed-off-by: ziad hany <ziadhany2016@gmail.com>
* Update minecode-pipeline version to 0.0.1b55; rename fetch_cran_db to download_cran_db
  Signed-off-by: ziad hany <ziadhany2016@gmail.com>
* Bump version
  Signed-off-by: ziad hany <ziadhany2016@gmail.com>
* Store cran_db in pipeline working_path; fix mine_cran_packageurls return type; update minecode-pipelines version to 0.0.1b57
  Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent 611a4da commit 9edc0ab

6 files changed

Lines changed: 90 additions & 155 deletions

File tree

minecode_pipelines/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@
88
#
99

1010

11-
VERSION = "0.0.1b54"
11+
VERSION = "0.0.1b57"

minecode_pipelines/miners/cran.py

Lines changed: 0 additions & 66 deletions
This file was deleted.

minecode_pipelines/pipelines/mine_cran.py

Lines changed: 22 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -20,67 +20,42 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
import os
24-
from scanpipe.pipelines import Pipeline
25-
from scanpipe.pipes import federatedcode
26-
27-
from minecode_pipelines import pipes
28-
from minecode_pipelines.miners.cran import fetch_cran_db
23+
import json
24+
from minecode_pipelines.pipelines import MineCodeBasePipeline
2925
from minecode_pipelines.pipes import cran
26+
from minecode_pipelines.pipes.cran import fetch_cran_db
3027

3128

32-
MINECODE_DATA_CRAN_REPO = os.environ.get(
33-
"MINECODE_DATA_CRAN_REPO", "https://github.com/aboutcode-data/minecode-data-cran-test"
34-
)
35-
36-
37-
class MineCran(Pipeline):
38-
"""
39-
Mine all packageURLs from a CRAN R index and publish them to a FederatedCode repo.
40-
"""
29+
class MineCran(MineCodeBasePipeline):
30+
"""Pipeline to mine CRAN R packages and publish them to FederatedCode."""
4131

4232
@classmethod
4333
def steps(cls):
4434
return (
4535
cls.check_federatedcode_eligibility,
46-
cls.setup_federatedcode_cran,
47-
cls.mine_and_publish_cran_packageurls,
48-
cls.cleanup_db_and_repo,
36+
cls.create_federatedcode_working_dir,
37+
cls.fetch_federation_config,
38+
cls.download_cran_db,
39+
cls.mine_and_publish_packageurls,
40+
cls.delete_working_dir,
4941
)
5042

51-
def check_federatedcode_eligibility(self):
43+
def download_cran_db(self):
5244
"""
53-
Check if the project fulfills the following criteria for
54-
pushing the project result to FederatedCode.
45+
Download the full CRAN package database
5546
"""
56-
federatedcode.check_federatedcode_configured_and_available(logger=self.log)
47+
self.db_path = fetch_cran_db(working_path=self.working_path, logger=self.log)
5748

58-
def setup_federatedcode_cran(self):
49+
def packages_count(self):
5950
"""
60-
Clone the FederatedCode CRAN repository and download the CRAN DB JSON file.
51+
Return the count of packages found in the downloaded CRAN JSON database.
6152
"""
62-
self.cloned_data_repo = federatedcode.clone_repository(MINECODE_DATA_CRAN_REPO)
63-
self.db_path = fetch_cran_db()
64-
65-
if self.log:
66-
self.log(
67-
f"{MINECODE_DATA_CRAN_REPO} repo cloned at: {self.cloned_data_repo.working_dir}"
68-
)
53+
if not getattr(self, "db_path", None) or not self.db_path.exists():
54+
return None
6955

70-
def mine_and_publish_cran_packageurls(self):
71-
"""Get cran packageURLs for all mined cran package names."""
72-
cran.mine_and_publish_cran_packageurls(
73-
cloned_data_repo=self.cloned_data_repo, db_path=self.db_path, logger=self.log
74-
)
75-
76-
def cleanup_db_and_repo(self):
77-
self.log(f"Cleaning database file at: {self.db_path}")
78-
os.remove(self.db_path)
56+
with open(self.db_path, encoding="utf-8") as f:
57+
return sum(1 for _ in json.load(f))
7958

80-
self.log(
81-
f"Deleting cloned repo {MINECODE_DATA_CRAN_REPO} from: {self.cloned_data_repo.working_dir}"
82-
)
83-
pipes.delete_cloned_repos(
84-
repos=[self.cloned_data_repo],
85-
logger=self.log,
86-
)
59+
def mine_packageurls(self):
60+
"""Mine Cran PackageURLs from cran package database."""
61+
return cran.mine_cran_packageurls(db_path=self.db_path)

minecode_pipelines/pipes/cran.py

Lines changed: 56 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -20,46 +20,71 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
from aboutcode.hashid import get_package_purls_yml_file_path
24-
from aboutcode.hashid import get_core_purl
25-
from scanpipe.pipes.federatedcode import commit_and_push_changes
26-
from minecode_pipelines.miners.cran import extract_cran_packages
27-
from minecode_pipelines.pipes import write_data_to_yaml_file
28-
from minecode_pipelines.utils import grouper
23+
import json
24+
from pathlib import Path
25+
from typing import Iterable
26+
from typing import Tuple
27+
from typing import List
2928

30-
PACKAGE_BATCH_SIZE = 100
29+
import requests
30+
from packageurl import PackageURL
31+
from aboutcode.hashid import get_core_purl
3132

3233

33-
def mine_and_publish_cran_packageurls(cloned_data_repo, db_path, logger):
34+
def fetch_cran_db(working_path, logger) -> Path:
3435
"""
35-
Extract CRAN packages from the database, write their package URLs (purls) to YAML,
36-
and commit changes in batches to the given cloned repository.
36+
Download the CRAN package database (~250MB JSON) in a memory-efficient way.
37+
Saves it to a file instead of loading everything into memory.
3738
"""
38-
packages_to_sync = list(extract_cran_packages(db_path))
39+
output_path = working_path / "cran_db.json"
40+
logger(f"Target download path: {output_path}")
3941

40-
for package_batch in grouper(n=PACKAGE_BATCH_SIZE, iterable=packages_to_sync):
41-
purl_files = []
42-
base_purls = []
42+
url = "https://crandb.r-pkg.org/-/all"
43+
with requests.get(url, stream=True) as response:
44+
response.raise_for_status()
45+
with output_path.open("wb") as f:
46+
for chunk in response.iter_content(chunk_size=8192):
47+
f.write(chunk)
4348

44-
if logger:
45-
logger(f"Starting package mining for a batch of {PACKAGE_BATCH_SIZE} packages")
49+
return output_path
4650

47-
for updated_purls in package_batch:
48-
if not updated_purls:
49-
continue # skip padded None values or empty
5051

51-
first_purl = updated_purls[0]
52-
base_purl = get_core_purl(first_purl)
53-
purl_yaml_path = cloned_data_repo.working_dir / get_package_purls_yml_file_path(
54-
first_purl
55-
)
56-
write_data_to_yaml_file(path=purl_yaml_path, data=updated_purls)
52+
def mine_cran_packageurls(db_path: Path) -> Iterable[Tuple[str, List[str]]]:
53+
"""
54+
Extract package names and their versions from a CRAN DB JSON file.
55+
Yields a tuple: (base_purl, list_of_purls)
56+
ex:
57+
{
58+
"AATtools": {
59+
"_id": "AATtools",
60+
"_rev": "8-9ebb721d05b946f2b437b49e892c9e8c",
61+
"name": "AATtools",
62+
"versions": {
63+
"0.0.1": {...},
64+
"0.0.2": {...},
65+
"0.0.3": {...}
66+
}
67+
}
68+
"""
69+
if not db_path.exists():
70+
raise FileNotFoundError(f"File not found: {db_path}")
5771

58-
purl_files.append(purl_yaml_path)
59-
base_purls.append(str(base_purl))
72+
with open(db_path, encoding="utf-8") as f:
73+
data = json.load(f)
6074

61-
if purl_files and base_purls:
62-
logger(f"Committing packageURLs: {', '.join(base_purls)}")
63-
commit_and_push_changes(
64-
repo=cloned_data_repo, files_to_commit=purl_files, purls=base_purls, logger=logger
75+
for pkg_name, pkg_data in data.items():
76+
versions = list(pkg_data.get("versions", {}).keys())
77+
purls = []
78+
for version in versions:
79+
purl = PackageURL(
80+
type="cran",
81+
name=pkg_name,
82+
version=version,
6583
)
84+
purls.append(purl.to_string())
85+
86+
base_purl = None
87+
if purls:
88+
first_purl = purls[0]
89+
base_purl = get_core_purl(first_purl)
90+
yield base_purl, purls

minecode_pipelines/tests/pipes/test_cran.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,32 +10,33 @@
1010
import saneyaml
1111
from pathlib import Path
1212
from unittest import TestCase
13-
from minecode_pipelines.miners.cran import extract_cran_packages
1413

14+
from minecode_pipelines.pipes.cran import mine_cran_packageurls
1515

1616
DATA_DIR = Path(__file__).parent.parent / "test_data" / "cran"
1717

18-
1918
class CranPipelineTests(TestCase):
20-
def test_extract_cran_packages_from_testdata(self):
19+
def test_mine_cran_packageurls_from_testdata(self):
2120
"""
22-
Ensure extract_cran_packages correctly parses the CRAN database
21+
Ensure mine_cran_packageurls correctly parses the CRAN database
2322
and produces results identical to the expected YAML files.
2423
"""
2524

2625
db_file = DATA_DIR / "cran_db.json"
27-
results = list(extract_cran_packages(db_file))
26+
results = list(mine_cran_packageurls(db_file))
2827

2928
expected_files = [
3029
DATA_DIR / "expected_abbreviate.yaml",
3130
DATA_DIR / "expected_abc.data.yaml",
3231
DATA_DIR / "expected_abc.yaml",
3332
]
34-
33+
expected_base_purls = ["pkg:cran/abbreviate", "pkg:cran/abc", "pkg:cran/abc.data"]
3534
assert len(results) == len(expected_files)
3635

37-
for result, expected_file in zip(results, expected_files):
36+
for result, expected_base_purl, expected_file in zip(results, expected_base_purls, expected_files):
3837
with open(expected_file, encoding="utf-8") as f:
39-
expected = saneyaml.load(f)
38+
expected_purls = saneyaml.load(f)
4039

41-
assert result == expected
40+
base_purl, purls = result
41+
assert str(base_purl) == expected_base_purl
42+
assert purls == expected_purls

pyproject-minecode_pipelines.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "flot.buildapi"
44

55
[project]
66
name = "minecode_pipelines"
7-
version = "0.0.1b55"
7+
version = "0.0.1b57"
88
description = "A library for mining packageURLs and package metadata from ecosystem repositories."
99
readme = "minecode_pipelines/README.rst"
1010
license = { text = "Apache-2.0" }

0 commit comments

Comments
 (0)