Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
Release 0.14.0 (unreleased)
===========================

* Use github purl, repo and version for a github release archive in SBOM (#1063)

Release 0.13.0 (released 2026-03-30)
====================================

Expand Down
1 change: 1 addition & 0 deletions dfetch/reporting/sbom_reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ def add_project(
else:
purl = vcs_url_to_purl(project.remote_url, version=version, subpath=subpath)
name = project.name if purl.type == "generic" else purl.name
version = purl.version or version
location = self.manifest.find_name_in_manifest(project.name)
component = Component(
name=name,
Expand Down
18 changes: 17 additions & 1 deletion dfetch/vcs/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import http.client
import os
import pathlib
import re
import shutil
import stat
import sys
Expand All @@ -47,6 +48,7 @@
copy_src_subset,
prune_files_by_pattern,
)
from dfetch.util.versions import coerce

logger = get_logger(__name__)

Expand Down Expand Up @@ -82,7 +84,21 @@ def archive_url_to_purl(
version: str | None = None,
subpath: str | None = None,
) -> PackageURL:
"""Build a generic PackageURL for an archive download URL."""
"""Build a github or generic PackageURL for an archive download URL."""
if match := re.search(
r"https://github\.com/(?P<org>[^/]+)/(?P<repo>[^/]+)/releases/download/(?P<version>[^/]+)/",
download_url,
):
prefix, current_version, _ = coerce(
match["version"],
)
return PackageURL(
type="github",
namespace=match["org"].lower(),
name=match["repo"].lower(),
version=str(current_version) or prefix,
)
Comment thread
spoorcc marked this conversation as resolved.

parsed = urllib.parse.urlparse(download_url)
basename = os.path.basename(parsed.path)
Comment on lines +88 to 103
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Anchor GitHub detection to parsed host/path to avoid false positives.

Using re.search on the full URL can incorrectly match GitHub patterns inside query parameters/fragments of non-GitHub URLs, resulting in wrong SBOM package identity.

Proposed fix
-    if match := re.search(
-        r"https://github\.com/(?P<org>[^/]+)/(?P<repo>[^/]+)/releases/download/(?P<version>[^/]+)/",
-        download_url,
-    ):
+    parsed = urllib.parse.urlparse(download_url)
+    if (
+        (parsed.hostname or "").lower() == "github.com"
+        and (
+            match := re.match(
+                r"^/(?P<org>[^/]+)/(?P<repo>[^/]+)/releases/download/(?P<version>[^/]+)/",
+                parsed.path,
+            )
+        )
+    ):
         prefix, current_version, _ = coerce(
             match["version"],
         )
         return PackageURL(
             type="github",
             namespace=match["org"].lower(),
             name=match["repo"].lower(),
             version=str(current_version) if current_version else prefix,
         )
 
-    parsed = urllib.parse.urlparse(download_url)
     basename = os.path.basename(parsed.path)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@dfetch/vcs/archive.py` around lines 88 - 103, The GitHub detection currently
runs re.search against the whole download_url which can match GitHub-like
strings in query/fragments; first parse the URL (use parsed =
urllib.parse.urlparse(download_url)) and then only run the GitHub regex against
parsed.netloc and parsed.path (or check parsed.hostname == "github.com" and
match parsed.path against
r"/(?P<org>[^/]+)/(?P<repo>[^/]+)/releases/download/(?P<version>[^/]+)/"); keep
the same logic that calls coerce(match["version"]) and returns the PackageURL
with namespace, name, and version, but derive match from the parsed.path (or
combined host+path) to avoid false positives.

name = strip_archive_extension(basename) or "unknown"
Expand Down
36 changes: 36 additions & 0 deletions features/report-sbom-archive.feature
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,39 @@ Feature: Create a CycloneDX SBOM for archive dependencies
]
}
"""

Scenario: A github release archive uses github purl and repo
Given the manifest 'dfetch.yaml'
"""
manifest:
version: '0.1'

remotes:
- name: github
url-base: https://github.com/

projects:
- name: TF-PSA-Crypto
vcs: archive
remote: github
dst: ext/TF-PSA-Crypto
repo-path: Mbed-TLS/TF-PSA-Crypto/releases/download/tf-psa-crypto-1.0.0/tf-psa-crypto-1.0.0.tar.bz2
integrity:
hash: sha256:31f0df2ca17897b5db2757cb0307dcde267292ba21ade831663d972a7a5b7d40
"""
And all projects are updated
When I run "dfetch report -t sbom"
Then the 'report.json' json file includes
"""
{
"components": [
{
"group": "mbed-tls",
"name": "tf-psa-crypto",
"purl": "pkg:github/mbed-tls/tf-psa-crypto@1.0.0",
"type": "library",
"version": "1.0.0"
}
]
}
"""
Loading