|
| 1 | +# |
| 2 | +# Copyright (c) nexB Inc. and others. All rights reserved. |
| 3 | +# purldb is a trademark of nexB Inc. |
| 4 | +# SPDX-License-Identifier: Apache-2.0 |
| 5 | +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. |
| 6 | +# See https://github.com/aboutcode-org/purldb for support or download. |
| 7 | +# See https://aboutcode.org for more information about nexB OSS projects. |
| 8 | +# |
| 9 | + |
| 10 | +import gzip |
| 11 | +import requests |
| 12 | + |
| 13 | +from bs4 import BeautifulSoup |
| 14 | +from packageurl import PackageURL |
| 15 | + |
| 16 | +from minecode_pipelines.utils import get_temp_file |
| 17 | + |
# Visitors for cpan and cpan-like perl package repositories.
#
# This description is kept as a comment: a string literal placed after the
# imports is not the module docstring, only an expression statement with no
# effect.


# Base URL of the default CPAN repository. It ends with a slash.
CPAN_REPO = "https://www.cpan.org/"

# Package URL (purl) type given to the packageURLs built in this module.
CPAN_TYPE = "cpan"
| 25 | + |
| 26 | + |
def get_cpan_packages(cpan_repo=CPAN_REPO, logger=None):
    """
    Return a mapping of {package name: path_prefix} parsed from the
    ``modules/02packages.details.txt.gz`` index of the ``cpan_repo``
    repository.

    This index contains a list of all modules and their respective package
    archive paths. For an index line such as::

        Crypt::Passphrase::SHA1::Base64 0.021 L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz

    the package name is ``Crypt-Passphrase`` and the path_prefix (the author
    page path) is ``L/LE/LEONT``. When several archives share the same name,
    the last one listed in the index wins.

    ``cpan_repo`` is the base URL of the repository, with or without a
    trailing slash. ``logger`` is an optional callable accepting a message
    string.

    Raise a ``requests`` exception if the index cannot be downloaded.
    """
    cpan_packages_url = cpan_repo.rstrip("/") + "/modules/02packages.details.txt.gz"
    packages_archive = get_temp_file(file_name="cpan_packages", extension=".gz")

    if logger:
        logger(f"Downloading CPAN packages index from {cpan_packages_url}")

    # Stream the archive to disk in chunks. The context manager releases the
    # connection even if the download fails midway.
    with requests.get(cpan_packages_url, stream=True, timeout=60) as response:
        # Fail here on an HTTP error instead of saving an error page and
        # failing later with an obscure gzip error.
        response.raise_for_status()
        with open(packages_archive, "wb") as archive:
            for chunk in response.iter_content(chunk_size=8192):
                archive.write(chunk)

    # Decompress and decode on the fly: there is no need for an intermediate
    # uncompressed copy nor for holding the whole index in memory.
    with gzip.open(packages_archive, "rt", encoding="utf-8") as index_lines:
        return _parse_cpan_packages_index(index_lines)


def _parse_cpan_packages_index(lines):
    """
    Return a mapping of {package name: path_prefix} built from ``lines``, an
    iterable of the text lines of a ``02packages.details.txt`` index.

    The index starts with a header section like this one, terminated by an
    empty line:

        File: 02packages.details.txt
        URL: http://www.cpan.org/modules/02packages.details.txt
        Description: Package names found in directory $CPAN/authors/id/
        Columns: package name, version, path
        Intended-For: Automated fetch routines, namespace documentation.
        Written-By: PAUSE version 1.005
        Line-Count: 268940
        Last-Updated: Mon, 29 Sep 2025 22:29:02 GMT

    Everything up to and including that first empty line is ignored, rather
    than assuming a fixed number of header lines.
    """
    package_path_by_name = {}
    in_header = True

    for line in lines:
        line = line.strip()

        if in_header:
            if not line:
                in_header = False
            continue

        # Columns are: package name, version, path. Skip empty or truncated
        # lines instead of failing on them.
        fields = line.split()
        if len(fields) < 3:
            continue

        # The path is like: L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz
        path_prefix, _, filename = fields[-1].rpartition("/")

        # Everything before the last hyphen is the name; the remainder is the
        # version and archive extension, which are not needed here.
        # NOTE: a version that itself contains a hyphen is not supported and
        # ends up partly in the name.
        name, separator, _version = filename.rpartition("-")
        if not (separator and name):
            # An archive without a "name-version" shape has no usable name.
            continue

        # For the above example: name: Crypt-Passphrase, path_prefix: L/LE/LEONT
        package_path_by_name[name] = path_prefix

    return package_path_by_name
| 89 | + |
| 90 | + |
def get_cpan_packageurls(name, path_prefix, logger=None, cpan_repo=CPAN_REPO):
    """
    Given a package ``name`` and its ``path_prefix`` (author page path),
    return a list of packageURL strings for that package, one per version,
    sorted by version string.

    An author page (like https://www.cpan.org/authors/id/P/PT/PTC/) lists
    all versions of all packages released by the author, so we can scrape
    all the packageURLs from this author packages index.

    ``logger`` is an optional callable accepting a message string.
    ``cpan_repo`` is the base URL of the repository, with or without a
    trailing slash.

    Return an empty list if ``name`` or ``path_prefix`` is empty, if the
    author page cannot be fetched successfully or if it lists no package.
    Connection errors and timeouts raised by ``requests`` are not caught.
    """
    packageurls = []

    if not name or not path_prefix:
        return packageurls

    path_segments = [segment for segment in path_prefix.split("/") if segment]
    if not path_segments:
        return packageurls

    # An author path is like L/LE/LEONT, optionally followed by
    # sub-directories: the author id is the third segment, not always the
    # last one.
    if len(path_segments) >= 3:
        author_name = path_segments[2]
    else:
        author_name = path_segments[-1]

    # File extensions found in the cpan index, removed to get at the version.
    ignorable_extensions = (
        ".tar.gz",
        ".tar.bz2",
        ".tar.xz",
        ".tgz",
        ".zip",
        ".meta",
        ".readme",
    )
    # Index entries that are not package files.
    ignorable_entries = {"Parent Directory", "CHECKSUMS"}

    # Join with exactly one slash between parts, and end with a slash since
    # this is a directory listing.
    repo_url = cpan_repo.rstrip("/")
    author_path = "/".join(path_segments)
    cpan_author_page_url = f"{repo_url}/authors/id/{author_path}/"

    if logger:
        logger(f"Getting package versions for {name} from {cpan_author_page_url}")

    response = requests.get(cpan_author_page_url, timeout=60)
    if not response.ok:
        return packageurls

    soup = BeautifulSoup(response.text, "html.parser")

    # We get all the listed packages in the author page index
    package_list = soup.find("ul")
    if not package_list:
        return packageurls

    versions = set()
    for entry in package_list.text.splitlines():
        entry = entry.strip()
        if not entry or entry in ignorable_entries:
            continue

        package_file = entry.replace(" ", "")

        # Only remove an extension at the end of the file name, and only one.
        for extension in ignorable_extensions:
            if package_file.endswith(extension):
                package_file = package_file[: -len(extension)]
                break

        # Everything before the last hyphen is the name, the rest the version.
        package_name, separator, version = package_file.rpartition("-")
        if not separator or not version or package_name != name:
            continue

        versions.add(version)

    # Sort for a stable output: the same version is typically listed several
    # times (archive, .meta and .readme files).
    for version in sorted(versions):
        purl = PackageURL(
            type=CPAN_TYPE,
            namespace=author_name,
            name=name,
            version=version,
        )
        packageurls.append(purl.to_string())

    return packageurls
0 commit comments