Skip to content

Commit cd704dd

Browse files
committed
Refactor mine_cran to use MineCodeBasePipeline for git deployment #775
Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent 389a298 commit cd704dd

5 files changed

Lines changed: 86 additions & 154 deletions

File tree

minecode_pipelines/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -8,4 +8,4 @@
88
#
99

1010

11-
VERSION = "0.0.1b54"
11+
VERSION = "0.0.1b47"

minecode_pipelines/miners/cran.py

Lines changed: 0 additions & 66 deletions
This file was deleted.

minecode_pipelines/pipelines/mine_cran.py

Lines changed: 21 additions & 47 deletions
Original file line number · Diff line number · Diff line change
@@ -20,67 +20,41 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
import os
24-
from scanpipe.pipelines import Pipeline
25-
from scanpipe.pipes import federatedcode
26-
27-
from minecode_pipelines import pipes
28-
from minecode_pipelines.miners.cran import fetch_cran_db
23+
import json
24+
from minecode_pipelines.pipelines import MineCodeBasePipeline
2925
from minecode_pipelines.pipes import cran
26+
from minecode_pipelines.pipes.cran import fetch_cran_db
3027

3128

32-
MINECODE_DATA_CRAN_REPO = os.environ.get(
33-
"MINECODE_DATA_CRAN_REPO", "https://github.com/aboutcode-data/minecode-data-cran-test"
34-
)
35-
36-
37-
class MineCran(Pipeline):
38-
"""
39-
Mine all packageURLs from a CRAN R index and publish them to a FederatedCode repo.
40-
"""
29+
class MineCran(MineCodeBasePipeline):
30+
"""Pipeline to mine CRAN R packages and publish them to FederatedCode."""
4131

4232
@classmethod
4333
def steps(cls):
4434
return (
4535
cls.check_federatedcode_eligibility,
46-
cls.setup_federatedcode_cran,
47-
cls.mine_and_publish_cran_packageurls,
48-
cls.cleanup_db_and_repo,
36+
cls.create_federatedcode_working_dir,
37+
cls.fetch_federation_config,
38+
cls.mine_and_publish_packageurls,
39+
cls.delete_working_dir,
4940
)
5041

51-
def check_federatedcode_eligibility(self):
42+
def fetch_cran_db(self):
5243
"""
53-
Check if the project fulfills the following criteria for
54-
pushing the project result to FederatedCode.
44+
Download the full CRAN package database
5545
"""
56-
federatedcode.check_federatedcode_configured_and_available(logger=self.log)
46+
self.db_path = fetch_cran_db(logger=self.log)
5747

58-
def setup_federatedcode_cran(self):
48+
def packages_count(self):
5949
"""
60-
Clone the FederatedCode CRAN repository and download the CRAN DB JSON file.
50+
Return the count of packages found in the downloaded CRAN JSON database.
6151
"""
62-
self.cloned_data_repo = federatedcode.clone_repository(MINECODE_DATA_CRAN_REPO)
63-
self.db_path = fetch_cran_db()
64-
65-
if self.log:
66-
self.log(
67-
f"{MINECODE_DATA_CRAN_REPO} repo cloned at: {self.cloned_data_repo.working_dir}"
68-
)
52+
if not getattr(self, "db_path", None) or not self.db_path.exists():
53+
return None
6954

70-
def mine_and_publish_cran_packageurls(self):
71-
"""Get cran packageURLs for all mined cran package names."""
72-
cran.mine_and_publish_cran_packageurls(
73-
cloned_data_repo=self.cloned_data_repo, db_path=self.db_path, logger=self.log
74-
)
75-
76-
def cleanup_db_and_repo(self):
77-
self.log(f"Cleaning database file at: {self.db_path}")
78-
os.remove(self.db_path)
55+
with open(self.db_path, encoding="utf-8") as f:
56+
return sum(1 for _ in json.load(f))
7957

80-
self.log(
81-
f"Deleting cloned repo {MINECODE_DATA_CRAN_REPO} from: {self.cloned_data_repo.working_dir}"
82-
)
83-
pipes.delete_cloned_repos(
84-
repos=[self.cloned_data_repo],
85-
logger=self.log,
86-
)
58+
def mine_packageurls(self):
59+
"""Mine Cran PackageURLs from cran package database."""
60+
cran.mine_cran_packageurls(db_path=self.db_path)

minecode_pipelines/pipes/cran.py

Lines changed: 54 additions & 31 deletions
Original file line number · Diff line number · Diff line change
@@ -20,46 +20,69 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
from aboutcode.hashid import get_package_purls_yml_file_path
23+
import json
24+
from pathlib import Path
25+
import requests
26+
from packageurl import PackageURL
2427
from aboutcode.hashid import get_core_purl
25-
from scanpipe.pipes.federatedcode import commit_and_push_changes
26-
from minecode_pipelines.miners.cran import extract_cran_packages
27-
from minecode_pipelines.pipes import write_data_to_yaml_file
28-
from minecode_pipelines.utils import grouper
28+
import tempfile
2929

30-
PACKAGE_BATCH_SIZE = 100
3130

32-
33-
def mine_and_publish_cran_packageurls(cloned_data_repo, db_path, logger):
31+
def fetch_cran_db(logger) -> Path:
3432
"""
35-
Extract CRAN packages from the database, write their package URLs (purls) to YAML,
36-
and commit changes in batches to the given cloned repository.
33+
Download the CRAN package database (~250MB JSON) in a memory-efficient way.
34+
Saves it to a file instead of loading everything into memory.
3735
"""
38-
packages_to_sync = list(extract_cran_packages(db_path))
36+
temp_dir = Path(tempfile.mkdtemp())
37+
output_path = temp_dir / "cran_db.json"
38+
logger(f"Target download path: {output_path}")
3939

40-
for package_batch in grouper(n=PACKAGE_BATCH_SIZE, iterable=packages_to_sync):
41-
purl_files = []
42-
base_purls = []
40+
url = "https://crandb.r-pkg.org/-/all"
41+
with requests.get(url, stream=True) as response:
42+
response.raise_for_status()
43+
with output_path.open("wb") as f:
44+
for chunk in response.iter_content(chunk_size=8192):
45+
f.write(chunk)
4346

44-
if logger:
45-
logger(f"Starting package mining for a batch of {PACKAGE_BATCH_SIZE} packages")
47+
return output_path
4648

47-
for updated_purls in package_batch:
48-
if not updated_purls:
49-
continue # skip padded None values or empty
5049

51-
first_purl = updated_purls[0]
52-
base_purl = get_core_purl(first_purl)
53-
purl_yaml_path = cloned_data_repo.working_dir / get_package_purls_yml_file_path(
54-
first_purl
55-
)
56-
write_data_to_yaml_file(path=purl_yaml_path, data=updated_purls)
50+
def mine_cran_packageurls(db_path: Path) -> list:
51+
"""
52+
Extract package names and their versions from a CRAN DB JSON file.
53+
Yields a tuple: (base_purl, list_of_purls)
54+
ex:
55+
{
56+
"AATtools": {
57+
"_id": "AATtools",
58+
"_rev": "8-9ebb721d05b946f2b437b49e892c9e8c",
59+
"name": "AATtools",
60+
"versions": {
61+
"0.0.1": {...},
62+
"0.0.2": {...},
63+
"0.0.3": {...}
64+
}
65+
}
66+
"""
67+
if not db_path.exists():
68+
raise FileNotFoundError(f"File not found: {db_path}")
5769

58-
purl_files.append(purl_yaml_path)
59-
base_purls.append(str(base_purl))
70+
with open(db_path, encoding="utf-8") as f:
71+
data = json.load(f)
6072

61-
if purl_files and base_purls:
62-
logger(f"Committing packageURLs: {', '.join(base_purls)}")
63-
commit_and_push_changes(
64-
repo=cloned_data_repo, files_to_commit=purl_files, purls=base_purls, logger=logger
73+
for pkg_name, pkg_data in data.items():
74+
versions = list(pkg_data.get("versions", {}).keys())
75+
purls = []
76+
for version in versions:
77+
purl = PackageURL(
78+
type="cran",
79+
name=pkg_name,
80+
version=version,
6581
)
82+
purls.append(purl.to_string())
83+
84+
base_purl = None
85+
if purls:
86+
first_purl = purls[0]
87+
base_purl = get_core_purl(first_purl)
88+
yield base_purl, purls

minecode_pipelines/tests/pipes/test_cran.py

Lines changed: 10 additions & 9 deletions
Original file line number · Diff line number · Diff line change
@@ -10,32 +10,33 @@
1010
import saneyaml
1111
from pathlib import Path
1212
from unittest import TestCase
13-
from minecode_pipelines.miners.cran import extract_cran_packages
1413

14+
from minecode_pipelines.pipes.cran import mine_cran_packageurls
1515

1616
DATA_DIR = Path(__file__).parent.parent / "test_data" / "cran"
1717

18-
1918
class CranPipelineTests(TestCase):
20-
def test_extract_cran_packages_from_testdata(self):
19+
def test_mine_cran_packageurls_from_testdata(self):
2120
"""
22-
Ensure extract_cran_packages correctly parses the CRAN database
21+
Ensure mine_cran_packageurls correctly parses the CRAN database
2322
and produces results identical to the expected YAML files.
2423
"""
2524

2625
db_file = DATA_DIR / "cran_db.json"
27-
results = list(extract_cran_packages(db_file))
26+
results = list(mine_cran_packageurls(db_file))
2827

2928
expected_files = [
3029
DATA_DIR / "expected_abbreviate.yaml",
3130
DATA_DIR / "expected_abc.data.yaml",
3231
DATA_DIR / "expected_abc.yaml",
3332
]
34-
33+
expected_base_purls = ["pkg:cran/abbreviate", "pkg:cran/abc", "pkg:cran/abc.data"]
3534
assert len(results) == len(expected_files)
3635

37-
for result, expected_file in zip(results, expected_files):
36+
for result, expected_base_purl, expected_file in zip(results, expected_base_purls, expected_files):
3837
with open(expected_file, encoding="utf-8") as f:
39-
expected = saneyaml.load(f)
38+
expected_purls = saneyaml.load(f)
4039

41-
assert result == expected
40+
base_purl, purls = result
41+
assert str(base_purl) == expected_base_purl
42+
assert purls == expected_purls

0 commit comments

Comments (0)