diff --git a/.gitattributes b/.gitattributes
index da9f2fe243744..65aa648cc791d 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -78,5 +78,6 @@ environment.yml export-ignore
# exclude the whole directory to avoid running related tests in sdist
pandas/tests/io/parser/data export-ignore
-# Include cibw script in sdist since it's needed for building wheels
+# Include cibw script and the SBOM generator in sdist
scripts/cibw_before_build.sh -export-ignore
+scripts/generate_sbom.py -export-ignore
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index b359f5b5026ee..27e668ef2c7af 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -145,6 +145,16 @@ jobs:
- name: Validate wheel RECORD
run: for whl in $(ls wheelhouse); do wheel unpack wheelhouse/$whl -d /tmp; done
+ - name: Validate SBOM (PEP 770)
+ shell: bash -el {0}
+ run: |
+ pip install check-jsonschema
+ for whl in wheelhouse/*.whl; do
+ echo "Validating SBOM in $whl..."
+ unzip -p "$whl" "*/sboms/pandas.cdx.json" > /tmp/sbom.json
+ check-jsonschema --schemafile "https://cyclonedx.org/schema/bom-1.6.schema.json" /tmp/sbom.json
+ done
+
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
with:
name: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}
diff --git a/LICENSES/vendored.toml b/LICENSES/vendored.toml
new file mode 100644
index 0000000000000..dd1a0544a2120
--- /dev/null
+++ b/LICENSES/vendored.toml
@@ -0,0 +1,108 @@
+# Vendored components manifest for SBOM generation
+# This file documents code that pandas has derived from or incorporates from other projects.
+# Used by scripts/generate_sbom.py to generate CycloneDX SBOM for PEP 770 compliance.
+#
+# License field supports:
+# - Single SPDX ID: "MIT"
+# - SPDX expression joining IDs with OR / AND / WITH, e.g. "Apache-2.0 OR BSD-3-Clause"
+
+[[component]]
+name = "numpy"
+license = "BSD-3-Clause"
+license_file = "NUMPY_LICENSE"
+description = "Derived algorithms and array handling code"
+purl = "pkg:pypi/numpy"
+website = "https://numpy.org"
+
+[[component]]
+name = "bottleneck"
+license = "BSD-2-Clause"
+license_file = "BOTTLENECK_LICENCE"
+description = "Derived reduction algorithms"
+purl = "pkg:pypi/bottleneck"
+website = "https://github.com/pydata/bottleneck"
+
+[[component]]
+name = "python-dateutil"
+# BSD applies to all code; Apache applies to contributions after 2017-12-01
+license = "Apache-2.0 AND BSD-3-Clause"
+license_file = "DATEUTIL_LICENSE"
+description = "Derived date parsing routines"
+purl = "pkg:pypi/python-dateutil"
+website = "https://github.com/dateutil/dateutil"
+
+[[component]]
+name = "klib"
+license = "MIT"
+license_file = "KLIB_LICENSE"
+description = "Derived hash table implementation (khash)"
+purl = "pkg:github/attractivechaos/klib"
+website = "https://github.com/attractivechaos/klib"
+
+[[component]]
+name = "musl"
+license = "MIT"
+license_file = "MUSL_LICENSE"
+description = "Derived ASCII character classification functions (isdigit, isspace, etc.)"
+purl = "pkg:generic/musl"
+website = "https://musl.libc.org"
+
+[[component]]
+name = "pyperclip"
+license = "BSD-3-Clause"
+license_file = "PYPERCLIP_LICENSE"
+description = "Derived clipboard utilities"
+purl = "pkg:pypi/pyperclip"
+website = "https://github.com/asweigart/pyperclip"
+
+[[component]]
+name = "sas7bdat"
+license = "MIT"
+license_file = "SAS7BDAT_LICENSE"
+description = "Derived SAS file reader code"
+purl = "pkg:pypi/sas7bdat"
+website = "https://github.com/jaredhobbs/sas7bdat"
+
+[[component]]
+name = "pyupgrade"
+license = "MIT"
+license_file = "PYUPGRADE_LICENSE"
+description = "Ported unwanted-pattern check in scripts/validate_unwanted_patterns.py"
+purl = "pkg:pypi/pyupgrade"
+website = "https://github.com/asottile/pyupgrade"
+
+[[component]]
+name = "ultrajson"
+# Per LICENSES/ULTRAJSON_LICENSE: BSD-3-Clause for ultrajson itself,
+# plus TCL-licensed portions derived from the double-to-ascii routine
+# (see header in pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c).
+license = "BSD-3-Clause AND TCL"
+license_file = "ULTRAJSON_LICENSE"
+description = "Derived JSON parsing code"
+purl = "pkg:pypi/ujson"
+website = "https://github.com/ultrajson/ultrajson"
+
+[[component]]
+name = "haven"
+license = "MIT"
+license_file = "HAVEN_LICENSE"
+description = "Derived SPSS/Stata reader code"
+purl = "pkg:cran/haven"
+website = "https://github.com/tidyverse/haven"
+
+[[component]]
+name = "packaging"
+# Dual-licensed: user can choose either license
+license = "Apache-2.0 OR BSD-2-Clause"
+license_file = "PACKAGING_LICENSE"
+description = "Derived version parsing code"
+purl = "pkg:pypi/packaging"
+website = "https://github.com/pypa/packaging"
+
+[[component]]
+name = "cpython"
+license = "PSF-2.0"
+license_file = "PSF_LICENSE"
+description = "Derived Python standard library code"
+purl = "pkg:generic/cpython"
+website = "https://github.com/python/cpython"
diff --git a/MANIFEST.in b/MANIFEST.in
index 9894381ed6252..c67e0e565a766 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -62,3 +62,6 @@ prune pandas/tests/io/parser/data
# Selectively re-add *.cxx files that were excluded above
graft pandas/_libs/src
graft pandas/_libs/include
+
+# Include the SBOM generator in sdist; meson.build invokes it via custom_target.
+include scripts/generate_sbom.py
diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst
index f3055427be5e0..3971448bf5161 100644
--- a/doc/source/whatsnew/v3.1.0.rst
+++ b/doc/source/whatsnew/v3.1.0.rst
@@ -37,6 +37,7 @@ Other enhancements
- Improved the precision of float parsing in :func:`read_csv` (:issue:`64395`)
- Improved the string ``repr`` of :class:`pd.core.arrays.SparseArray` (:issue:`64547`)
- MSVC is no longer required to build on Windows, and build errors when using the MinGW compiler have been fixed (:issue:`63160`)
+- Wheels now include a `PEP 770 <https://peps.python.org/pep-0770/>`_ CycloneDX Software Bill of Materials (SBOM) at ``.dist-info/sboms/pandas.cdx.json`` describing vendored components (:issue:`63479`)
.. ---------------------------------------------------------------------------
.. _whatsnew_310.notable_bug_fixes:
diff --git a/environment.yml b/environment.yml
index bde64f5a3a51c..33f7ea6d2c762 100644
--- a/environment.yml
+++ b/environment.yml
@@ -25,6 +25,9 @@ dependencies:
- python-dateutil
- numpy<3
+ # SBOM validation
+ - jsonschema
+
# optional dependencies
- adbc-driver-postgresql>=1.2.0
- adbc-driver-sqlite>=1.2.0
diff --git a/meson.build b/meson.build
index d2874e85cc3a7..f645ebd842d0a 100644
--- a/meson.build
+++ b/meson.build
@@ -95,4 +95,23 @@ endif
# Needed by pandas.test() when it looks for the pytest ini options
py.install_sources('pyproject.toml', subdir: 'pandas')
+# PEP 770 SBOM for vendored components. Generated at build time from
+# LICENSES/vendored.toml; meson-python (>=0.20.0) recognises the
+# <name>-<version>.dist-info/ prefix under {py_purelib} and routes the
+# output into the wheel's own .dist-info/sboms/ at pack time.
+distinfo = meson.project_name() + '-' + meson.project_version() + '.dist-info'
+custom_target(
+    'pandas-vendored-sbom',
+    output: 'pandas.cdx.json',
+    command: [
+        py,
+        files('scripts/generate_sbom.py'),
+        '@OUTPUT@', '--version', meson.project_version(),
+    ],
+    # The script reads the manifest itself, so declare it for rebuild tracking.
+    depend_files: files('LICENSES/vendored.toml'),
+    install: true,
+    install_dir: py.get_install_dir() / distinfo / 'sboms',
+)
+
subdir('pandas')
diff --git a/pyproject.toml b/pyproject.toml
index fd64bb71a0562..6878eade14635 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,10 @@
# Minimum requirements for the build system to execute.
# See https://github.com/scipy/scipy/pull/12940 for the AIX issue.
requires = [
- "meson-python>=0.19.0,<1",
+ # TEMPORARY: pin to the meson-python fork branch that implements
+ # PEP 770 SBOM routing (mesonbuild/meson-python#843). Revert to
+ # "meson-python>=0.20.0,<1" once that PR lands in a released version.
+ "meson-python @ git+https://github.com/fangchenli/meson-python.git@feat/distinfo-placeholder",
"meson>=1.2.3,<2",
"wheel",
"Cython>3.1.0,<4.0.0a0", # Note: sync with environment.yml and asv.conf.json
@@ -81,6 +84,7 @@ matplotlib = "pandas:plotting._matplotlib"
[project.optional-dependencies]
test = ['hypothesis>=6.116.0', 'pytest>=8.3.4', 'pytest-xdist>=3.6.1']
+sbom = ['check-jsonschema']
pyarrow = ['pyarrow>=13.0.0']
performance = ['bottleneck>=1.4.2', 'numba>=0.60.0', 'numexpr>=2.10.2']
computation = ['scipy>=1.14.1', 'xarray>=2024.10.0']
@@ -190,7 +194,7 @@ environment = {CFLAGS="-g0"}
[[tool.cibuildwheel.overrides]]
select = "*pyodide*"
-# Pyodide repairs wheels on its own, using auditwheel-emscripten
+# Pyodide repairs wheels on its own, using auditwheel-emscripten.
repair-wheel-command = ""
# https://github.com/pyodide/pyodide/issues/5805
build-verbosity = 1
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 50b9e1f3dd684..1a391accb9cb8 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -15,6 +15,7 @@ PyQt5>=5.15.9
coverage
python-dateutil
numpy<3
+jsonschema
adbc-driver-postgresql>=1.2.0
adbc-driver-sqlite>=1.2.0
beautifulsoup4>=4.12.3
diff --git a/scripts/generate_sbom.py b/scripts/generate_sbom.py
new file mode 100644
index 0000000000000..2ea911691648e
--- /dev/null
+++ b/scripts/generate_sbom.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python3
+"""
+Generate a CycloneDX SBOM for pandas vendored components.
+
+This script generates a Software Bill of Materials (SBOM) in CycloneDX 1.6 format
+documenting code that pandas has derived from or incorporates from other projects.
+This is in compliance with PEP 770.
+
+The vendored components are defined in LICENSES/vendored.toml.
+
+Usage:
+ python scripts/generate_sbom.py output_path
+ python scripts/generate_sbom.py - # print to stdout
+
+To validate the generated SBOM:
+ check-jsonschema --schemafile \
+ https://cyclonedx.org/schema/bom-1.6.schema.json output.json
+"""
+
+import argparse
+from datetime import (
+ datetime,
+ timezone,
+)
+import hashlib
+import json
+import os
+from pathlib import Path
+import tomllib
+
+
+def is_spdx_expression(license_str: str) -> bool:
+    """Tell whether ``license_str`` combines SPDX IDs with OR, AND or WITH."""
+    spdx_operators = (" OR ", " AND ", " WITH ")  # matched with their spaces
+    return any(operator in license_str for operator in spdx_operators)
+
+
+def load_vendored_components(manifest_path: Path | None = None) -> list[dict]:
+    """Read the vendored-component manifest into a list of plain dicts.
+
+    When ``manifest_path`` is None, LICENSES/vendored.toml in the parent of
+    this script's directory is used. Each ``[[component]]`` table must
+    provide name, license, description, purl and website.
+    """
+
+    def _normalise(entry: dict) -> dict:
+        spdx = entry["license"]
+        return {
+            "name": entry["name"],
+            "bom_ref": f"{entry['name']}-derived",
+            "description": entry["description"],
+            "license": spdx,
+            "is_expression": is_spdx_expression(spdx),
+            "purl": entry["purl"],
+            "website": entry["website"],
+        }
+
+    if manifest_path is None:
+        manifest_path = Path(__file__).parent.parent / "LICENSES" / "vendored.toml"
+    with manifest_path.open("rb") as handle:
+        tables = tomllib.load(handle).get("component", [])
+    return [_normalise(table) for table in tables]
+
+
+def get_pandas_version() -> str:
+    """Report the version of the importable pandas, if there is one.
+
+    pandas is not installed while CI builds wheels, so callers there
+    should pass --version rather than rely on this lookup.
+    """
+    fallback = "0.0.0.dev0"  # placeholder used when pandas cannot be imported
+    try:
+        from pandas import __version__ as detected
+    except ImportError:
+        return fallback
+    else:
+        return detected
+
+
+def _reproducible_timestamp() -> str:
+    """Produce the ISO-8601 UTC timestamp recorded in the SBOM metadata.
+
+    SOURCE_DATE_EPOCH, when set to a non-empty value, is parsed as integer
+    seconds since the epoch so that rebuilds emit the same timestamp;
+    otherwise the current wall-clock time in UTC is used.
+    """
+    epoch = os.environ.get("SOURCE_DATE_EPOCH")
+    if not epoch:
+        return datetime.now(timezone.utc).isoformat()
+    return datetime.fromtimestamp(int(epoch), timezone.utc).isoformat()
+
+
+def _deterministic_serial(version: str, manifest_path: Path) -> str:
+    """Build a urn:uuid serialNumber deterministic in pandas version + manifest.
+
+    CycloneDX requires serialNumber to be an RFC 4122 UUID, so the SHA-256
+    of the manifest bytes and the pandas version are folded into a
+    name-based (version 5) UUID. Raw digest hex laid out as 8-4-4-4-12
+    would lack the RFC 4122 version/variant bits. Identical inputs still
+    yield an identical serial, keeping builds reproducible.
+    """
+    import uuid  # function-scope import keeps the file-level imports untouched
+
+    manifest_digest = hashlib.sha256(manifest_path.read_bytes()).hexdigest()
+    name = f"pkg:pypi/pandas@{version}?manifest_sha256={manifest_digest}"
+    return f"urn:uuid:{uuid.uuid5(uuid.NAMESPACE_URL, name)}"
+
+
+def generate_sbom(
+    version: str | None = None, manifest_path: Path | None = None
+) -> dict:
+    """Assemble the CycloneDX 1.6 BOM for pandas as a JSON-ready dict.
+
+    ``version`` falls back to get_pandas_version() and ``manifest_path`` to
+    LICENSES/vendored.toml in the parent of this script's directory. The
+    result lists one library component per manifest entry and a single
+    dependency record whose root is pandas itself.
+    """
+    if version is None:
+        version = get_pandas_version()
+    if manifest_path is None:
+        manifest_path = Path(__file__).parent.parent / "LICENSES" / "vendored.toml"
+
+    def _licenses(entry: dict) -> list[dict]:
+        # CycloneDX takes a compound SPDX string under "expression" and a
+        # single identifier under a nested "license.id"; the manifest
+        # loader has already classified each entry.
+        if entry["is_expression"]:
+            return [{"expression": entry["license"]}]
+        return [{"license": {"id": entry["license"]}}]
+
+    def _as_component(entry: dict) -> dict:
+        return {
+            "type": "library",
+            "bom-ref": entry["bom_ref"],
+            "name": entry["name"],
+            "description": entry["description"],
+            "licenses": _licenses(entry),
+            "purl": entry["purl"],
+            "externalReferences": [
+                {"type": "website", "url": entry["website"]},
+            ],
+        }
+
+    # Gather every input before assembling the document.
+    vendored = load_vendored_components(manifest_path)
+    created_at = _reproducible_timestamp()
+    serial = _deterministic_serial(version, manifest_path)
+
+    # metadata.component and the sole dependencies entry share this bom-ref
+    # so CycloneDX consumers can resolve the root of the dependency graph
+    # (https://cyclonedx.org/use-cases/software-dependencies/).
+    root_ref = f"pkg:pypi/pandas@{version}"
+
+    # Entry recorded under metadata.tools for this generator script.
+    generator_tool = {
+        "type": "application",
+        "name": "pandas-sbom-generator",
+        "version": "1.0.0",
+    }
+
+    pandas_links = [
+        {"type": "website", "url": "https://pandas.pydata.org"},
+        {"type": "vcs", "url": "https://github.com/pandas-dev/pandas"},
+        {"type": "documentation", "url": "https://pandas.pydata.org/docs/"},
+    ]
+
+    # Root component: pandas itself, at the requested version.
+    pandas_component = {
+        "type": "library",
+        "bom-ref": root_ref,
+        "name": "pandas",
+        "version": version,
+        "purl": root_ref,
+        "description": (
+            "Powerful data structures for data analysis, "
+            "time series, and statistics"
+        ),
+        "licenses": [{"license": {"id": "BSD-3-Clause"}}],
+        "externalReferences": pandas_links,
+    }
+
+    # pandas is recorded as depending on every manifest entry.
+    root_dependency = {
+        "ref": root_ref,
+        "dependsOn": [entry["bom_ref"] for entry in vendored],
+    }
+
+    return {
+        "$schema": "https://cyclonedx.org/schema/bom-1.6.schema.json",
+        "bomFormat": "CycloneDX",
+        "specVersion": "1.6",
+        "serialNumber": serial,
+        "version": 1,
+        "metadata": {
+            "timestamp": created_at,
+            "tools": {"components": [generator_tool]},
+            "component": pandas_component,
+        },
+        "components": [_as_component(entry) for entry in vendored],
+        "dependencies": [root_dependency],
+    }
+
+
+def main() -> None:
+    """Parse the command line and emit the SBOM as indented JSON."""
+    cli = argparse.ArgumentParser(
+        description="Generate CycloneDX SBOM for pandas vendored components"
+    )
+    cli.add_argument(
+        "output",
+        nargs="?",
+        default="-",
+        help="Output file path (use '-' for stdout, default: stdout)",
+    )
+    cli.add_argument(
+        "--version",
+        help="Override pandas version (default: auto-detect)",
+    )
+    cli.add_argument(
+        "--manifest",
+        type=Path,
+        help="Path to vendored.toml manifest (default: LICENSES/vendored.toml)",
+    )
+    options = cli.parse_args()
+
+    document = generate_sbom(version=options.version, manifest_path=options.manifest)
+    rendered = json.dumps(document, indent=2, ensure_ascii=False)
+
+    if options.output != "-":
+        destination = Path(options.output)
+        destination.parent.mkdir(parents=True, exist_ok=True)
+        destination.write_text(rendered, encoding="utf-8")
+        return
+    print(rendered)
+
+
+if __name__ == "__main__":
+ main()