Skip to content

Commit 6d1873a

Browse files
committed
Fixes invalid SBoM created when a project uses SSH
Move out the purl logic and add tests for it. Also add support for Bitbucket PURLs
1 parent 928dd07 commit 6d1873a

6 files changed

Lines changed: 330 additions & 46 deletions

File tree

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ Release 0.11.0 (unreleased)
22
====================================
33

44
* Don't show animation when running in CI (#702)
5+
* Improve logic for creating Purls in SBoM (#780)
56

67
Release 0.10.0 (released 2025-03-12)
78
====================================

dfetch/reporting/sbom_reporter.py

Lines changed: 17 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@
1616
"""
1717

1818
import json
19-
import re
20-
from typing import List, cast
19+
from typing import cast
2120

2221
from cyclonedx.model import (
2322
ExternalReference,
@@ -31,18 +30,15 @@
3130
from cyclonedx.output import get_instance
3231
from cyclonedx.output.json import Json
3332
from cyclonedx.schema import OutputFormat
34-
from packageurl import PackageURL
3533

36-
import dfetch.util.util
34+
import dfetch.util.purl
3735
from dfetch.manifest.project import ProjectEntry
3836
from dfetch.reporting.reporter import Reporter
3937

4038

4139
class SbomReporter(Reporter):
4240
"""Reporter for generating SBoM's."""
4341

44-
url_splitter = re.compile(r"([^\/)]+)")
45-
github_url = re.compile(r"github.com\/(?P<group>.+)\/(?P<repo>[^\s\.]+)[\.]?")
4642
dfetch_tool = Tool(vendor="dfetch-org", name="dfetch", version=dfetch.__version__)
4743

4844
name = "SBoM"
@@ -56,56 +52,31 @@ def add_project(
5652
self, project: ProjectEntry, license_name: str, version: str
5753
) -> None:
5854
"""Add a project to the report."""
59-
match = self.github_url.search(project.remote_url)
60-
if match:
61-
component = Component(
62-
name=project.name,
63-
version=version,
64-
type=ComponentType.LIBRARY,
65-
purl=PackageURL(
66-
type="github",
67-
name=match.group("repo"),
68-
version=version,
69-
namespace=match.group("group"),
70-
subpath=project.source or None,
71-
),
72-
)
73-
else:
74-
parts = self._split_url(project.remote_url)
75-
component = Component(
76-
name=project.name,
77-
version=version,
78-
type=ComponentType.LIBRARY,
79-
purl=PackageURL(
80-
type="generic",
81-
version=version,
82-
qualifiers=f"download_url={project.remote_url}",
83-
namespace="/".join(parts),
84-
subpath=project.source or None,
85-
name=project.name,
86-
),
87-
group="/".join(parts),
88-
)
55+
purl = dfetch.util.purl.remote_url_to_purl(
56+
project.remote_url, version=version, subpath=project.source or None
57+
)
58+
59+
component = Component(
60+
name=project.name,
61+
version=version,
62+
type=ComponentType.LIBRARY,
63+
purl=purl,
64+
)
65+
66+
if purl.type not in ["github", "bitbucket"]:
67+
component.group = purl.namespace
68+
8969
component.external_references.add(
9070
ExternalReference(
9171
type=ExternalReferenceType.VCS,
92-
url=XsUri(project.remote_url),
72+
url=XsUri(purl.qualifiers.get("vcs_url", "")),
9373
)
9474
)
9575

9676
if license_name:
9777
component.licenses.add(LicenseChoice(expression=license_name))
9878
self._bom.components.add(component)
9979

100-
@staticmethod
101-
def _split_url(url: str) -> List[str]:
102-
"""Split the url in elements."""
103-
return [
104-
part.group()
105-
for part in SbomReporter.url_splitter.finditer(url)
106-
if not part.group().endswith(":") # Skip protocol specifiers
107-
]
108-
10980
def dump_to_file(self, outfile: str) -> bool:
11081
"""Dump the SBoM to file."""
11182
output_format = OutputFormat(

dfetch/util/purl.py

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
"""Module to convert remote URLs to valid Package URLs (PURLs).
2+
3+
Supports: GitHub, Bitbucket, SVN, SSH paths, and more.
4+
"""
5+
6+
import re
7+
from typing import Optional
8+
from urllib.parse import urlparse
9+
10+
from packageurl import PackageURL
11+
from tldextract import TLDExtract
12+
13+
NO_FETCH_EXTRACT = TLDExtract(suffix_list_urls=())
14+
# Matches scp-like remotes ("git@host:group/repo.git") and
# "git+ssh://git@host/group/repo.git" remotes. Captures the host and the
# repository path; a trailing ".git" is left out of the captured path.
SSH_REGEX = re.compile(
    r"^(?:git@|git\+ssh://git@)(?P<host>[^/:]+)[:/](?P<path>.+?)(?:\.git)?$"
)

# Everything that may legitimately precede the host of a remote URL: an
# optional scheme ("https://", "git+ssh://"), optional credentials ("git@")
# and an optional "www." prefix. Anchoring on this prefix (instead of a
# leading ".*") keeps look-alike hosts such as "notgithub.com" and mirror
# paths such as "example.com/mirror/github.com/org/repo" from being
# classified as the hosting service itself.
_HOST_PREFIX = r"^(?:[a-z][a-z0-9+.-]*://)?(?:[^/@]+@)?(?:www\.)?"

# Optional port, then "<org>/<repo>", an optional ".git" suffix and an
# optional trailing slash.
_ORG_REPO_SUFFIX = r"(?::\d+)?[:/](?P<org>[^/]+)/(?P<repo>[^/]+?)(?:\.git)?/?$"

GITHUB_REGEX = re.compile(
    _HOST_PREFIX + r"github\.com" + _ORG_REPO_SUFFIX,
    re.IGNORECASE,
)

BITBUCKET_REGEX = re.compile(
    _HOST_PREFIX + r"bitbucket\.org" + _ORG_REPO_SUFFIX,
    re.IGNORECASE,
)
27+
28+
29+
def _handle_github(
    remote_url: str,
    _: str,
    version: Optional[str],
    subpath: Optional[str],
) -> Optional[PackageURL]:
    """Build a ``github`` PURL when the remote matches ``GITHUB_REGEX``.

    The second positional argument (the pre-parsed URL path) is ignored; the
    organisation and repository are taken from the regex match instead.

    Returns ``None`` when the remote URL is not recognised as a GitHub URL so
    the caller can try another handler.
    """
    found = GITHUB_REGEX.match(remote_url)
    if not found:
        return None

    organisation = found.group("org")
    repository = found.group("repo")
    return PackageURL(
        type="github",
        namespace=organisation,
        name=repository,
        version=version,
        subpath=subpath,
    )
46+
47+
48+
def _handle_bitbucket(
    remote_url: str,
    _: str,
    version: Optional[str],
    subpath: Optional[str],
) -> Optional[PackageURL]:
    """Build a ``bitbucket`` PURL when the remote matches ``BITBUCKET_REGEX``.

    The second positional argument (the pre-parsed URL path) is ignored; the
    organisation and repository are taken from the regex match instead.

    Returns ``None`` when the remote URL is not recognised as a Bitbucket URL
    so the caller can try another handler.
    """
    found = BITBUCKET_REGEX.match(remote_url)
    if not found:
        return None

    organisation = found.group("org")
    repository = found.group("repo")
    return PackageURL(
        type="bitbucket",
        namespace=organisation,
        name=repository,
        version=version,
        subpath=subpath,
    )
65+
66+
67+
def _handle_svn(
    remote_url: str,
    path: str,
    version: Optional[str],
    subpath: Optional[str],
) -> Optional[PackageURL]:
    """Build a ``generic`` PURL for a Subversion remote.

    A remote is treated as SVN when its URL scheme contains ``svn`` or its
    network location contains ``svn.``.

    Args:
        remote_url: The remote URL as configured for the project.
        path: Path component of ``remote_url`` without the leading slash.
        version: Optional version to record in the PURL.
        subpath: Optional subpath to record in the PURL.

    Returns:
        The PURL, with the original URL stored in the ``vcs_url`` qualifier,
        or ``None`` when the remote does not look like an SVN URL.
    """
    parsed = urlparse(remote_url)
    if "svn" not in parsed.scheme and "svn." not in parsed.netloc:
        return None

    domain = NO_FETCH_EXTRACT(parsed.netloc).domain

    # Drop empty segments (missing domain, trailing or doubled slashes): an
    # empty last segment would otherwise become an empty PURL name, which
    # PackageURL rejects.
    parts = [part for part in [domain, *path.split("/")] if part]
    name = parts[-1] if parts else "unknown"
    namespace = "/".join(parts[:-1])

    return PackageURL(
        type="generic",
        namespace=namespace,
        name=name,
        version=version,
        qualifiers={"vcs_url": remote_url},
        subpath=subpath,
    )
93+
94+
95+
def _handle_ssh(
    remote_url: str,
    path: str,
    version: Optional[str],
    subpath: Optional[str],
) -> Optional[PackageURL]:
    """Build a ``generic`` PURL for a remote matching ``SSH_REGEX``.

    Args:
        remote_url: The remote URL as configured for the project.
        path: Ignored on input; the repository path is taken from the
            ``SSH_REGEX`` match instead.
        version: Optional version to record in the PURL.
        subpath: Optional subpath to record in the PURL.

    Returns:
        The PURL, with the original URL stored in the ``vcs_url`` qualifier,
        or ``None`` when the remote does not match ``SSH_REGEX``.
    """
    found = SSH_REGEX.match(remote_url)
    if found is None:
        return None

    domain = NO_FETCH_EXTRACT(found.group("host")).domain

    parts: list[str] = []
    # For these hosts the domain is left out of the namespace.
    if domain not in ["gitlab", "gitea", "gitee"]:
        parts.append(domain)

    # SSH_REGEX keeps a trailing ".git" out of the captured path, so the path
    # is used as-is. Removing every ".git" occurrence would corrupt names
    # that merely contain it (e.g. "user.github.io" -> "userhub.io").
    path = found.group("path")
    parts.extend(path.split("/"))

    # Drop empty segments (missing domain, trailing or doubled slashes) so the
    # last segment is always a usable PURL name.
    parts = [part for part in parts if part]
    name = parts[-1] if parts else "unknown"
    namespace = "/".join(parts[:-1])

    return PackageURL(
        type="generic",
        namespace=namespace,
        name=name,
        version=version,
        qualifiers={"vcs_url": remote_url},
        subpath=subpath,
    )
124+
125+
126+
def _handle_generic(
    remote_url: str,
    path: str,
    version: Optional[str],
    subpath: Optional[str],
) -> PackageURL:
    """Build a ``generic`` PURL for any remote no other handler recognised.

    Args:
        remote_url: The remote URL as configured for the project.
        path: Path component of ``remote_url`` without the leading slash.
        version: Optional version to record in the PURL.
        subpath: Optional subpath to record in the PURL.

    Returns:
        The PURL, with the original URL stored in the ``vcs_url`` qualifier.
        The name is the last path segment, or ``"unknown"`` when neither a
        domain nor a path segment is available.
    """
    domain = NO_FETCH_EXTRACT(remote_url).domain

    parts: list[str] = []
    # For these hosts the domain is left out of the namespace.
    if domain not in ["gitlab", "gitea", "gitee"]:
        parts.append(domain)

    # Strip only a trailing ".git". Removing every occurrence would corrupt
    # names that merely contain it (e.g. "user.github.io" -> "userhub.io").
    path = path.rstrip("/")
    if path.endswith(".git"):
        path = path[: -len(".git")]
    parts.extend(path.split("/"))

    # Drop empty segments (missing domain, empty path, doubled slashes) so the
    # last segment is always a usable PURL name.
    parts = [part for part in parts if part]
    name = parts[-1] if parts else "unknown"
    namespace = "/".join(parts[:-1])

    return PackageURL(
        type="generic",
        namespace=namespace,
        name=name,
        version=version,
        qualifiers={"vcs_url": remote_url},
        subpath=subpath,
    )
151+
152+
153+
def remote_url_to_purl(
    remote_url: str, version: Optional[str] = None, subpath: Optional[str] = None
) -> PackageURL:
    """Translate a remote URL into a PackageURL.

    The specialised handlers are consulted in a fixed order (GitHub,
    Bitbucket, SVN, SSH); the first one that returns a PURL wins. When none
    of them recognises the URL the generic handler produces the result.

    ``version`` and ``subpath`` are passed through to the handlers unchanged.
    """
    url_path = urlparse(remote_url).path.lstrip("/")

    specialised = (
        _handle_github,
        _handle_bitbucket,
        _handle_svn,
        _handle_ssh,
    )
    # Generator keeps evaluation lazy: later handlers only run when the
    # earlier ones declined the URL.
    attempts = (
        handler(remote_url, url_path, version, subpath) for handler in specialised
    )
    recognised: Optional[PackageURL] = next(
        (purl for purl in attempts if purl is not None), None
    )
    if recognised is not None:
        return recognised

    return _handle_generic(remote_url, url_path, version, subpath)

doc/legal.rst

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,3 +489,41 @@ Sarif-om
489489
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
490490
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
491491
SOFTWARE
492+
493+
TLDExtract
494+
~~~~~~~~~~
495+
`TLDExtract`_ is used for extracting the domain name from remote URLs.
496+
497+
::
498+
499+
BSD 3-Clause License
500+
501+
Copyright (c) 2013-2025, John Kurkowski
502+
All rights reserved.
503+
504+
Redistribution and use in source and binary forms, with or without
505+
modification, are permitted provided that the following conditions are met:
506+
507+
1. Redistributions of source code must retain the above copyright notice, this
508+
list of conditions and the following disclaimer.
509+
510+
2. Redistributions in binary form must reproduce the above copyright notice,
511+
this list of conditions and the following disclaimer in the documentation
512+
and/or other materials provided with the distribution.
513+
514+
3. Neither the name of the copyright holder nor the names of its
515+
contributors may be used to endorse or promote products derived from
516+
this software without specific prior written permission.
517+
518+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
519+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
520+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
521+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
522+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
523+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
524+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
525+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
526+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
527+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
528+
529+
.. _`TLDExtract`: https://github.com/john-kurkowski/tldextract

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ dependencies = [
4949
"typing-extensions==4.7.1; python_version <= '3.7.0'",
5050
"typing-extensions==4.13.2; python_version <= '3.8.0'",
5151
"typing-extensions==4.15.0; python_version > '3.8.0'",
52+
"tldextract==5.3.0",
5253
"sarif-om==1.0.4",
5354
"semver==3.0.4",
5455
"patch-ng==1.18.1",

0 commit comments

Comments
 (0)