Skip to content

Commit 9445f0d

Browse files
Merge pull request #731 from aboutcode-org/minecode-pipeline-cpan
Add support for mining cpan packageURLs
2 parents 9edc0ab + fbf8bf5 commit 9445f0d

6 files changed

Lines changed: 327 additions & 4 deletions

File tree

minecode/tests/collectors/test_github.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,13 @@ def test_github_get_all_versions(self):
4040
"minecode-pipelines/v0.0.1b6",
4141
"minecode-pipelines/v0.0.1b7",
4242
"minecode-pipelines/v0.0.1b8",
43+
"minecode-pipelines/v0.0.1b9",
44+
"minecode-pipelines/v0.0.1b10",
45+
"minecode-pipelines/v0.0.1b11",
46+
"minecode-pipelines/v0.0.1b12",
47+
"minecode-pipelines/v0.0.1b13",
48+
"minecode-pipelines/v0.0.1b14",
49+
"minecode-pipelines/v0.0.1b15",
4350
]
4451
for item in expected:
4552
self.assertIn(item, versions)

minecode_pipelines/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@
88
#
99

1010

11-
VERSION = "0.0.1b57"
11+
VERSION = "0.0.1b59"

minecode_pipelines/miners/cpan.py

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import gzip
11+
import requests
12+
13+
from bs4 import BeautifulSoup
14+
from packageurl import PackageURL
15+
16+
from minecode_pipelines.utils import get_temp_file
17+
18+
"""
19+
Visitors for cpan and cpan-like perl package repositories.
20+
"""
21+
22+
23+
CPAN_REPO = "https://www.cpan.org/"
24+
CPAN_TYPE = "cpan"
25+
26+
27+
def get_cpan_packages(cpan_repo=CPAN_REPO, logger=None, timeout=60):
    """
    Return a mapping of {distribution name: path prefix} parsed from the
    ``modules/02packages.details.txt.gz`` index of the ``cpan_repo`` repository.

    This index contains a list of all modules and their respective package
    archive paths. The distribution name and the directory holding the
    archive (the author page path) are extracted from each archive path.

    ``logger`` is an optional callable accepting a single message string.
    ``timeout`` is the number of seconds passed to ``requests.get`` so that a
    stalled connection cannot block the caller forever.

    Raise a ``requests`` exception when the index cannot be downloaded.
    """
    cpan_packages_url = cpan_repo.rstrip("/") + "/modules/02packages.details.txt.gz"
    packages_archive = get_temp_file(file_name="cpan_packages", extension=".gz")

    if logger:
        logger(f"Fetching cpan packages index from {cpan_packages_url}")

    # The context manager releases the streamed connection once the archive
    # is saved, and raise_for_status() prevents saving an HTTP error page
    # that would only fail later as an invalid gzip file.
    with requests.get(cpan_packages_url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(packages_archive, "wb") as archive:
            for chunk in response.iter_content(chunk_size=8192):
                archive.write(chunk)

    # Decode directly from the archive: no intermediate text file is needed.
    with gzip.open(packages_archive, "rt", encoding="utf-8") as index:
        index_lines = index.read().split("\n")

    return _parse_cpan_packages_index(index_lines)


def _parse_cpan_packages_index(lines):
    """
    Return a mapping of {distribution name: path prefix} built from the
    ``lines`` of a ``02packages.details.txt`` index.
    """
    # The ``modules/02packages.details.txt`` file has the following section
    # at the beginning of the file, followed by an empty line:
    #
    # File:         02packages.details.txt
    # URL:          http://www.cpan.org/modules/02packages.details.txt
    # Description:  Package names found in directory $CPAN/authors/id/
    # Columns:      package name, version, path
    # Intended-For: Automated fetch routines, namespace documentation.
    # Written-By:   PAUSE version 1.005
    # Line-Count:   268940
    # Last-Updated: Mon, 29 Sep 2025 22:29:02 GMT
    #
    # The records start after the first empty line. Looking for that line
    # avoids depending on the exact number of header fields.
    records_start = 0
    for position, line in enumerate(lines):
        if not line.strip():
            records_start = position + 1
            break

    package_path_by_name = {}

    # A sample line from this module list looks like this:
    #
    # Crypt::Passphrase::SHA1::Base64 0.021 L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz
    for line in lines[records_start:]:
        fields = line.split()
        if not fields:
            # Empty line, such as the one left after the final newline.
            continue

        # This is like: L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz
        package_path = fields[-1]
        path_prefix, _, filename = package_path.rpartition("/")

        # The name is everything before the last hyphen of the file name;
        # the trailing "version + archive extension" part is not needed here.
        name, _, _version = filename.rpartition("-")
        if not name:
            # File names without a "name-version" shape cannot be mapped.
            continue

        # for the above example: name: Crypt-Passphrase, path_prefix: L/LE/LEONT
        package_path_by_name[name] = path_prefix

    return package_path_by_name
89+
90+
91+
def get_cpan_packageurls(name, path_prefix, logger=None, timeout=60):
    """
    Given a package ``name`` and its ``path_prefix`` (author page path)
    return a list of packageURL strings for that package, sorted by version
    string so that the result is deterministic.

    An author page (like https://www.cpan.org/authors/id/P/PT/PTC/) lists
    all versions of all packages released by the author, so we can scrape
    all the packageURLs from this author packages index.

    ``logger`` is an optional callable accepting a single message string.
    ``timeout`` is the number of seconds passed to ``requests.get``.

    Return an empty list when the page cannot be fetched or lists nothing
    for this package.
    """
    packageurls = []

    # File extensions found in the cpan index that follow the version.
    known_extensions = (".meta", ".readme", ".tar.gz", ".tgz", ".tar.bz2", ".zip")

    path_prefix = path_prefix.strip("/")
    path_segments = path_prefix.split("/")
    # Author directories are laid out as A/AU/AUTHOR, so the author is the
    # third segment even when the archive sits in a deeper sub-directory.
    if len(path_segments) >= 3:
        author_name = path_segments[2]
    else:
        author_name = path_segments[-1]

    # Build the URL with exactly one slash between each part.
    cpan_author_page_url = f"{CPAN_REPO.rstrip('/')}/authors/id/{path_prefix}/"

    if logger:
        logger(f"Getting package versions for {name} from {cpan_author_page_url}")

    response = requests.get(cpan_author_page_url, timeout=timeout)
    if not response.ok:
        return packageurls

    soup = BeautifulSoup(response.text, "html.parser")

    # We get all the listed packages in the author page index
    package_list = soup.find("ul")
    if package_list is None:
        return packageurls

    versions = set()
    for element in package_list.text.split("\n"):
        entry = element.strip()
        if not entry or entry in {"Parent Directory", "CHECKSUMS"}:
            continue

        package_file = entry.replace(" ", "")

        # Strip the extension only when it is a real suffix of the file name.
        for extension in known_extensions:
            if package_file.endswith(extension):
                package_file = package_file[: -len(extension)]
                break

        # The version is whatever follows the last hyphen.
        package_name, _, version = package_file.rpartition("-")
        if package_name != name or not version:
            continue

        versions.add(version)

    for version in sorted(versions):
        purl = PackageURL(
            type=CPAN_TYPE,
            namespace=author_name,
            name=name,
            version=version,
        )
        packageurls.append(purl.to_string())

    return packageurls
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
24+
from minecode_pipelines import pipes
25+
from minecode_pipelines.pipes import cpan
26+
from minecode_pipelines.pipelines import MineCodeBasePipeline
27+
from scanpipe.pipes import federatedcode
28+
29+
30+
class MineCpan(MineCodeBasePipeline):
    """
    Collect every packageURL available from a cpan index and publish
    them to a FederatedCode repo.
    """

    @classmethod
    def steps(cls):
        pipeline_steps = (
            cls.check_federatedcode_eligibility,
            cls.create_federatedcode_working_dir,
            cls.mine_cpan_packages,
            cls.fetch_federation_config,
            cls.mine_and_publish_packageurls,
            cls.delete_working_dir,
        )
        return pipeline_steps

    def mine_cpan_packages(self):
        """Mine cpan package names from cpan indexes or checkpoint."""
        mined_packages = cpan.mine_cpan_packages(logger=self.log)
        self.cpan_packages_path_by_name = mined_packages

    def packages_count(self):
        # Number of package names kept by the ``mine_cpan_packages`` step.
        mined_packages = self.cpan_packages_path_by_name
        return len(mined_packages)

    def mine_packageurls(self):
        """Get cpan packageURLs for all mined cpan package names."""
        mining_options = {
            "package_path_by_name": self.cpan_packages_path_by_name,
            "logger": self.log,
        }
        yield from cpan.mine_and_publish_cpan_packageurls(**mining_options)
60+

minecode_pipelines/pipes/cpan.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
from minecode_pipelines.miners.cpan import get_cpan_packages
24+
from minecode_pipelines.miners.cpan import get_cpan_packageurls
25+
from minecode_pipelines.miners.cpan import CPAN_REPO
26+
27+
from minecode_pipelines.miners.cpan import CPAN_TYPE
28+
from minecode_pipelines.utils import grouper
29+
30+
from packageurl import PackageURL
31+
32+
# If True, show full details on fetching packageURL for
33+
# a package name present in the index
34+
LOG_PACKAGEURL_DETAILS = False
35+
36+
PACKAGE_BATCH_SIZE = 500
37+
38+
39+
def mine_cpan_packages(logger=None):
    """
    Return the mapping produced by ``get_cpan_packages`` for ``CPAN_REPO``.

    When ``logger`` is provided, it is called with a message before the
    index is fetched and with the number of mined packages afterwards.
    """
    if logger:
        logger("Getting packages from cpan index")

    mined_packages = get_cpan_packages(cpan_repo=CPAN_REPO, logger=logger)

    # Nothing to report without a logger: hand back the mapping right away.
    if not logger:
        return mined_packages

    packages_count = len(mined_packages)
    logger(f"Mined {packages_count} packages from cpan index")
    return mined_packages
50+
51+
52+
def mine_and_publish_cpan_packageurls(package_path_by_name, logger=None):
    """
    Yield ``(base_purl, packageurls)`` tuples for each package name of the
    ``package_path_by_name`` mapping for which packageURLs were found.

    Package names are processed in batches of ``PACKAGE_BATCH_SIZE``.
    Detailed messages are sent to ``logger`` only when a logger is provided
    and ``LOG_PACKAGEURL_DETAILS`` is enabled.
    """
    if not package_path_by_name:
        return

    def log_details(message):
        # Detailed logging is opt-in through LOG_PACKAGEURL_DETAILS.
        if logger and LOG_PACKAGEURL_DETAILS:
            logger(message)

    for package_batch in grouper(n=PACKAGE_BATCH_SIZE, iterable=package_path_by_name.keys()):
        handled_names = set()

        log_details("Starting package mining for a batch of packages")

        for package_name in package_batch:
            # Skip empty batch slots and names already handled in this batch.
            if not package_name or package_name in handled_names:
                continue

            log_details(f"getting packageURLs for package: {package_name}")

            path_prefix = package_path_by_name.get(package_name)
            if not path_prefix:
                continue

            packageurls = get_cpan_packageurls(
                name=package_name,
                path_prefix=path_prefix,
                logger=logger,
            )

            if packageurls:
                base_purl = PackageURL(type=CPAN_TYPE, name=package_name).to_string()
                if logger and LOG_PACKAGEURL_DETAILS:
                    logger(f"fetched packageURLs for package: {base_purl}")
                    purls_string = " ".join(packageurls)
                    logger(f"packageURLs: {purls_string}")

                handled_names.add(package_name)
                yield base_purl, packageurls
            else:
                log_details(f"Package versions not present for package: {package_name}")

                # We don't want to try fetching versions for these again
                handled_names.add(package_name)

pyproject-minecode_pipelines.toml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "flot.buildapi"
44

55
[project]
66
name = "minecode_pipelines"
7-
version = "0.0.1b57"
7+
version = "0.0.1b59"
88
description = "A library for mining packageURLs and package metadata from ecosystem repositories."
99
readme = "minecode_pipelines/README.rst"
1010
license = { text = "Apache-2.0" }
@@ -43,7 +43,8 @@ dependencies = [
4343
"scancodeio >= 35.3.0",
4444
"ftputil >= 5.1.0",
4545
"jawa >= 2.2.0",
46-
"arrow >= 1.3.0"
46+
"arrow >= 1.3.0",
47+
"beautifulsoup4 >= 4.13.4"
4748
]
4849

4950
urls = { Homepage = "https://github.com/aboutcode-org/purldb" }
@@ -56,12 +57,13 @@ mine_debian = "minecode_pipelines.pipelines.mine_debian:MineDebian"
5657
mine_nuget = "minecode_pipelines.pipelines.mine_nuget:MineNuGet"
5758
mine_alpine = "minecode_pipelines.pipelines.mine_alpine:MineAlpine"
5859
mine_conan = "minecode_pipelines.pipelines.mine_conan:MineConan"
60+
mine_cpan = "minecode_pipelines.pipelines.mine_cpan:MineCpan"
5961
mine_cran = "minecode_pipelines.pipelines.mine_cran:MineCran"
6062
mine_swift = "minecode_pipelines.pipelines.mine_swift:MineSwift"
6163
mine_composer = "minecode_pipelines.pipelines.mine_composer:MineComposer"
6264

6365
[tool.bumpversion]
64-
current_version = "0.0.1b25"
66+
current_version = "0.0.1b59"
6567
allow_dirty = true
6668

6769
files = [

0 commit comments

Comments
 (0)