diff --git a/.gitattributes b/.gitattributes index da9f2fe243744..65aa648cc791d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -78,5 +78,6 @@ environment.yml export-ignore # exclude the whole directory to avoid running related tests in sdist pandas/tests/io/parser/data export-ignore -# Include cibw script in sdist since it's needed for building wheels +# Include cibw script and the SBOM generator in sdist scripts/cibw_before_build.sh -export-ignore +scripts/generate_sbom.py -export-ignore diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index b359f5b5026ee..27e668ef2c7af 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -145,6 +145,16 @@ jobs: - name: Validate wheel RECORD run: for whl in $(ls wheelhouse); do wheel unpack wheelhouse/$whl -d /tmp; done + - name: Validate SBOM (PEP 770) + shell: bash -el {0} + run: | + pip install check-jsonschema + for whl in wheelhouse/*.whl; do + echo "Validating SBOM in $whl..." + unzip -p "$whl" "*/sboms/pandas.cdx.json" > /tmp/sbom.json + check-jsonschema --schemafile "https://cyclonedx.org/schema/bom-1.6.schema.json" /tmp/sbom.json + done + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 with: name: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} diff --git a/LICENSES/vendored.toml b/LICENSES/vendored.toml new file mode 100644 index 0000000000000..dd1a0544a2120 --- /dev/null +++ b/LICENSES/vendored.toml @@ -0,0 +1,108 @@ +# Vendored components manifest for SBOM generation +# This file documents code that pandas has derived from or incorporates from other projects. +# Used by scripts/generate_sbom.py to generate CycloneDX SBOM for PEP 770 compliance. 
+# +# License field supports: +# - Single SPDX ID: "MIT" +# - SPDX expression for dual-license: "Apache-2.0 OR BSD-3-Clause" + +[[component]] +name = "numpy" +license = "BSD-3-Clause" +license_file = "NUMPY_LICENSE" +description = "Derived algorithms and array handling code" +purl = "pkg:pypi/numpy" +website = "https://numpy.org" + +[[component]] +name = "bottleneck" +license = "BSD-2-Clause" +license_file = "BOTTLENECK_LICENCE" +description = "Derived reduction algorithms" +purl = "pkg:pypi/bottleneck" +website = "https://github.com/pydata/bottleneck" + +[[component]] +name = "python-dateutil" +# BSD applies to all code; Apache applies to contributions after 2017-12-01 +license = "Apache-2.0 AND BSD-3-Clause" +license_file = "DATEUTIL_LICENSE" +description = "Derived date parsing routines" +purl = "pkg:pypi/python-dateutil" +website = "https://github.com/dateutil/dateutil" + +[[component]] +name = "klib" +license = "MIT" +license_file = "KLIB_LICENSE" +description = "Derived hash table implementation (khash)" +purl = "pkg:github/attractivechaos/klib" +website = "https://github.com/attractivechaos/klib" + +[[component]] +name = "musl" +license = "MIT" +license_file = "MUSL_LICENSE" +description = "Derived ASCII character classification functions (isdigit, isspace, etc.)" +purl = "pkg:generic/musl" +website = "https://musl.libc.org" + +[[component]] +name = "pyperclip" +license = "BSD-3-Clause" +license_file = "PYPERCLIP_LICENSE" +description = "Derived clipboard utilities" +purl = "pkg:pypi/pyperclip" +website = "https://github.com/asweigart/pyperclip" + +[[component]] +name = "sas7bdat" +license = "MIT" +license_file = "SAS7BDAT_LICENSE" +description = "Derived SAS file reader code" +purl = "pkg:pypi/sas7bdat" +website = "https://github.com/jaredhobbs/sas7bdat" + +[[component]] +name = "pyupgrade" +license = "MIT" +license_file = "PYUPGRADE_LICENSE" +description = "Ported unwanted-pattern check in scripts/validate_unwanted_patterns.py" +purl = "pkg:pypi/pyupgrade" 
+website = "https://github.com/asottile/pyupgrade" + +[[component]] +name = "ultrajson" +# Per LICENSES/ULTRAJSON_LICENSE: BSD-3-Clause for ultrajson itself, +# plus TCL-licensed portions derived from the double-to-ascii routine +# (see header in pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c). +license = "BSD-3-Clause AND TCL" +license_file = "ULTRAJSON_LICENSE" +description = "Derived JSON parsing code" +purl = "pkg:pypi/ujson" +website = "https://github.com/ultrajson/ultrajson" + +[[component]] +name = "haven" +license = "MIT" +license_file = "HAVEN_LICENSE" +description = "Derived SPSS/Stata reader code" +purl = "pkg:cran/haven" +website = "https://github.com/tidyverse/haven" + +[[component]] +name = "packaging" +# Dual-licensed: user can choose either license +license = "Apache-2.0 OR BSD-2-Clause" +license_file = "PACKAGING_LICENSE" +description = "Derived version parsing code" +purl = "pkg:pypi/packaging" +website = "https://github.com/pypa/packaging" + +[[component]] +name = "cpython" +license = "PSF-2.0" +license_file = "PSF_LICENSE" +description = "Derived Python standard library code" +purl = "pkg:generic/cpython" +website = "https://github.com/python/cpython" diff --git a/MANIFEST.in b/MANIFEST.in index 9894381ed6252..c67e0e565a766 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -62,3 +62,6 @@ prune pandas/tests/io/parser/data # Selectively re-add *.cxx files that were excluded above graft pandas/_libs/src graft pandas/_libs/include + +# Include the SBOM generator in sdist; meson.build invokes it via custom_target. 
+include scripts/generate_sbom.py diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst index f3055427be5e0..3971448bf5161 100644 --- a/doc/source/whatsnew/v3.1.0.rst +++ b/doc/source/whatsnew/v3.1.0.rst @@ -37,6 +37,7 @@ Other enhancements - Improved the precision of float parsing in :func:`read_csv` (:issue:`64395`) - Improved the string ``repr`` of :class:`pd.core.arrays.SparseArray` (:issue:`64547`) - MSVC is no longer required to build on Windows, and build errors when using the MinGW compiler have been fixed (:issue:`63160`) +- Wheels now include a `PEP 770 <https://peps.python.org/pep-0770/>`_ CycloneDX Software Bill of Materials (SBOM) at ``.dist-info/sboms/pandas.cdx.json`` describing vendored components (:issue:`63479`) .. --------------------------------------------------------------------------- .. _whatsnew_310.notable_bug_fixes: diff --git a/environment.yml b/environment.yml index bde64f5a3a51c..33f7ea6d2c762 100644 --- a/environment.yml +++ b/environment.yml @@ -25,6 +25,9 @@ dependencies: - python-dateutil - numpy<3 + # SBOM validation + - jsonschema + # optional dependencies - adbc-driver-postgresql>=1.2.0 - adbc-driver-sqlite>=1.2.0 diff --git a/meson.build b/meson.build index d2874e85cc3a7..f645ebd842d0a 100644 --- a/meson.build +++ b/meson.build @@ -95,4 +95,23 @@ endif # Needed by pandas.test() when it looks for the pytest ini options py.install_sources('pyproject.toml', subdir: 'pandas') +# PEP 770 SBOM for vendored components. Generated at build time from +# LICENSES/vendored.toml; meson-python (>=0.20.0) recognises the
+distinfo = meson.project_name() + '-' + meson.project_version() + '.dist-info' +custom_target( + 'pandas-vendored-sbom', + output: 'pandas.cdx.json', + command: [ + py, + files('scripts/generate_sbom.py'), + '@OUTPUT@', + '--version', + meson.project_version(), + ], + install: true, + install_dir: py.get_install_dir() / distinfo / 'sboms', +) + subdir('pandas') diff --git a/pyproject.toml b/pyproject.toml index fd64bb71a0562..6878eade14635 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,10 @@ # Minimum requirements for the build system to execute. # See https://github.com/scipy/scipy/pull/12940 for the AIX issue. requires = [ - "meson-python>=0.19.0,<1", + # TEMPORARY: pin to the meson-python fork branch that implements + # PEP 770 SBOM routing (mesonbuild/meson-python#843). Revert to + # "meson-python>=0.20.0,<1" once that PR lands in a released version. + "meson-python @ git+https://github.com/fangchenli/meson-python.git@feat/distinfo-placeholder", "meson>=1.2.3,<2", "wheel", "Cython>3.1.0,<4.0.0a0", # Note: sync with environment.yml and asv.conf.json @@ -81,6 +84,7 @@ matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] test = ['hypothesis>=6.116.0', 'pytest>=8.3.4', 'pytest-xdist>=3.6.1'] +sbom = ['check-jsonschema'] pyarrow = ['pyarrow>=13.0.0'] performance = ['bottleneck>=1.4.2', 'numba>=0.60.0', 'numexpr>=2.10.2'] computation = ['scipy>=1.14.1', 'xarray>=2024.10.0'] @@ -190,7 +194,7 @@ environment = {CFLAGS="-g0"} [[tool.cibuildwheel.overrides]] select = "*pyodide*" -# Pyodide repairs wheels on its own, using auditwheel-emscripten +# Pyodide repairs wheels on its own, using auditwheel-emscripten. 
repair-wheel-command = "" # https://github.com/pyodide/pyodide/issues/5805 build-verbosity = 1
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 50b9e1f3dd684..1a391accb9cb8 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -15,6 +15,7 @@ PyQt5>=5.15.9
 coverage
 python-dateutil
 numpy<3
+jsonschema
 adbc-driver-postgresql>=1.2.0
 adbc-driver-sqlite>=1.2.0
 beautifulsoup4>=4.12.3
diff --git a/scripts/generate_sbom.py b/scripts/generate_sbom.py
new file mode 100644
index 0000000000000..2ea911691648e
--- /dev/null
+++ b/scripts/generate_sbom.py
@@ -0,0 +1,289 @@
+#!/usr/bin/env python3
+"""
+Generate a CycloneDX SBOM for pandas vendored components.
+
+This script generates a Software Bill of Materials (SBOM) in CycloneDX 1.6 format
+documenting code that pandas has derived from or incorporates from other projects.
+This is in compliance with PEP 770.
+
+The vendored components are defined in LICENSES/vendored.toml.
+
+Usage:
+    python scripts/generate_sbom.py output_path
+    python scripts/generate_sbom.py -  # print to stdout
+
+To validate the generated SBOM:
+    check-jsonschema --schemafile \
+        https://cyclonedx.org/schema/bom-1.6.schema.json output.json
+"""
+
+import argparse
+from datetime import (
+    datetime,
+    timezone,
+)
+import hashlib
+import json
+import os
+from pathlib import Path
+import tomllib
+import uuid
+
+# Manifest used when no explicit path is given: LICENSES/vendored.toml,
+# resolved relative to the repository root (the parent of scripts/).
+_DEFAULT_MANIFEST = Path(__file__).parent.parent / "LICENSES" / "vendored.toml"
+
+# Keys every [[component]] table must define; they are copied into the SBOM.
+_REQUIRED_COMPONENT_KEYS = ("name", "license", "description", "purl", "website")
+
+
+def is_spdx_expression(license_str: str) -> bool:
+    """Check if a license string is an SPDX expression (vs a single ID).
+
+    Detection is a plain substring test for the space-delimited infix
+    operators ``OR``, ``AND`` and ``WITH``.
+    """
+    return any(op in license_str for op in (" OR ", " AND ", " WITH "))
+
+
+def load_vendored_components(manifest_path: Path | None = None) -> list[dict]:
+    """Load vendored components from a vendored.toml manifest.
+
+    Parameters
+    ----------
+    manifest_path : Path, optional
+        Manifest to read; defaults to LICENSES/vendored.toml in the repo.
+
+    Returns
+    -------
+    list of dict
+        One dict per ``[[component]]`` table with the keys ``name``,
+        ``bom_ref``, ``description``, ``license``, ``is_expression``,
+        ``purl`` and ``website``.
+
+    Raises
+    ------
+    KeyError
+        If a component table lacks one of the required keys.
+    """
+    if manifest_path is None:
+        manifest_path = _DEFAULT_MANIFEST
+
+    with manifest_path.open("rb") as f:
+        manifest = tomllib.load(f)
+
+    components = []
+    for index, comp in enumerate(manifest.get("component", [])):
+        missing = [key for key in _REQUIRED_COMPONENT_KEYS if key not in comp]
+        if missing:
+            # Name the offending table; a bare KeyError('purl') would not.
+            label = comp.get("name", f"#{index}")
+            raise KeyError(
+                f"component {label!r} in {manifest_path} is missing "
+                f"required key(s): {', '.join(missing)}"
+            )
+        license_str = comp["license"]
+        components.append(
+            {
+                "name": comp["name"],
+                "bom_ref": f"{comp['name']}-derived",
+                "description": comp["description"],
+                "license": license_str,
+                "is_expression": is_spdx_expression(license_str),
+                "purl": comp["purl"],
+                "website": comp["website"],
+            }
+        )
+    return components
+
+
+def get_pandas_version() -> str:
+    """Get the pandas version from installed package.
+
+    During CI wheel builds, use --version flag instead since pandas
+    is not installed at that point.
+    """
+    try:
+        from pandas import __version__
+
+        return __version__
+    except ImportError:
+        # Return placeholder if pandas is not installed
+        return "0.0.0.dev0"
+
+
+def _reproducible_timestamp() -> str:
+    """Return an ISO-8601 timestamp honoring SOURCE_DATE_EPOCH if set.
+
+    Matches the reproducible-builds convention already honored by
+    meson-python for wheel file mtimes. Falls back to wall-clock UTC
+    only when no SOURCE_DATE_EPOCH is provided.
+    """
+    sde = os.environ.get("SOURCE_DATE_EPOCH")
+    if sde:
+        return datetime.fromtimestamp(int(sde), timezone.utc).isoformat()
+    return datetime.now(timezone.utc).isoformat()
+
+
+def _deterministic_serial(version: str, manifest_path: Path) -> str:
+    """Build a urn:uuid serialNumber deterministic in pandas version + manifest.
+
+    CycloneDX requires serialNumber to be an RFC 4122 UUID that is unique
+    per BOM, but for reproducible builds we derive it from inputs so
+    repeated invocations produce byte-identical output. The manifest bytes
+    and pandas version are hashed and the digest is fed to ``uuid.uuid5``,
+    so the result carries valid version and variant bits (raw digest hex
+    laid out as 8-4-4-4-12 would only resemble a UUID) and still changes
+    when either input changes.
+    """
+    manifest_bytes = manifest_path.read_bytes()
+    digest = hashlib.sha256(manifest_bytes + version.encode("utf-8")).hexdigest()
+    name = f"pkg:pypi/pandas@{version}#sbom-{digest}"
+    return f"urn:uuid:{uuid.uuid5(uuid.NAMESPACE_URL, name)}"
+
+
+def generate_sbom(
+    version: str | None = None, manifest_path: Path | None = None
+) -> dict:
+    """Generate the CycloneDX SBOM document.
+
+    Parameters
+    ----------
+    version : str, optional
+        pandas version to record; auto-detected from the installed package
+        when omitted.
+    manifest_path : Path, optional
+        Vendored-components manifest; defaults to LICENSES/vendored.toml.
+
+    Returns
+    -------
+    dict
+        JSON-serialisable CycloneDX 1.6 BOM.
+    """
+    if version is None:
+        version = get_pandas_version()
+    if manifest_path is None:
+        manifest_path = _DEFAULT_MANIFEST
+
+    vendored_components = load_vendored_components(manifest_path)
+
+    timestamp = _reproducible_timestamp()
+    serial_number = _deterministic_serial(version, manifest_path)
+
+    # Build components list
+    components = []
+    dependency_refs = []
+
+    for comp in vendored_components:
+        # CycloneDX uses "expression" for SPDX expressions, "id" for single license
+        if comp["is_expression"]:
+            license_entry = {"expression": comp["license"]}
+        else:
+            license_entry = {"license": {"id": comp["license"]}}
+
+        component = {
+            "type": "library",
+            "bom-ref": comp["bom_ref"],
+            "name": comp["name"],
+            "description": comp["description"],
+            "licenses": [license_entry],
+            "purl": comp["purl"],
+            "externalReferences": [
+                {"type": "website", "url": comp["website"]},
+            ],
+        }
+        components.append(component)
+        dependency_refs.append(comp["bom_ref"])
+
+    # Single bom-ref shared by metadata.component and dependencies[0]
+    # so CycloneDX consumers can resolve the root of the dependency
+    # graph. See
+    # https://cyclonedx.org/use-cases/software-dependencies/.
+    root_bom_ref = f"pkg:pypi/pandas@{version}"
+
+    sbom = {
+        "$schema": "https://cyclonedx.org/schema/bom-1.6.schema.json",
+        "bomFormat": "CycloneDX",
+        "specVersion": "1.6",
+        "serialNumber": serial_number,
+        "version": 1,
+        "metadata": {
+            "timestamp": timestamp,
+            "tools": {
+                "components": [
+                    {
+                        "type": "application",
+                        "name": "pandas-sbom-generator",
+                        "version": "1.0.0",
+                    }
+                ]
+            },
+            "component": {
+                "type": "library",
+                "bom-ref": root_bom_ref,
+                "name": "pandas",
+                "version": version,
+                "purl": root_bom_ref,
+                "description": "Powerful data structures for data analysis, "
+                "time series, and statistics",
+                "licenses": [{"license": {"id": "BSD-3-Clause"}}],
+                "externalReferences": [
+                    {"type": "website", "url": "https://pandas.pydata.org"},
+                    {
+                        "type": "vcs",
+                        "url": "https://github.com/pandas-dev/pandas",
+                    },
+                    {
+                        "type": "documentation",
+                        "url": "https://pandas.pydata.org/docs/",
+                    },
+                ],
+            },
+        },
+        "components": components,
+        "dependencies": [
+            {
+                "ref": root_bom_ref,
+                "dependsOn": dependency_refs,
+            }
+        ],
+    }
+
+    return sbom
+
+
+def main() -> None:
+    """Main entry point."""
+    parser = argparse.ArgumentParser(
+        description="Generate CycloneDX SBOM for pandas vendored components"
+    )
+    parser.add_argument(
+        "output",
+        nargs="?",
+        default="-",
+        help="Output file path (use '-' for stdout, default: stdout)",
+    )
+    parser.add_argument(
+        "--version",
+        help="Override pandas version (default: auto-detect)",
+    )
+    parser.add_argument(
+        "--manifest",
+        type=Path,
+        help="Path to vendored.toml manifest (default: LICENSES/vendored.toml)",
+    )
+    args = parser.parse_args()
+
+    sbom = generate_sbom(version=args.version, manifest_path=args.manifest)
+    sbom_json = json.dumps(sbom, indent=2, ensure_ascii=False)
+
+    if args.output == "-":
+        print(sbom_json)
+    else:
+        output_path = Path(args.output)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        output_path.write_text(sbom_json, encoding="utf-8")
+
+
+if __name__ == "__main__":
+    main()