|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Generate THIRD-PARTY-NOTICES from installed Python packages. |
| 3 | +
|
| 4 | +Runs pip-licenses to collect metadata, filters out dev/internal packages, |
| 5 | +and outputs a formatted notices file with summary table and per-package details. |
| 6 | +
|
| 7 | +Usage: |
| 8 | + python generate_third_party_notices.py [--output PATH] |
| 9 | +""" |
| 10 | + |
| 11 | +import argparse |
| 12 | +import json |
| 13 | +import re |
| 14 | +import subprocess |
| 15 | +import sys |
| 16 | +from datetime import date |
| 17 | +from pathlib import Path |
| 18 | + |
# Tooling that is only present while the Docker image is being built. It is never
# declared in any pyproject.toml, so it has to be excluded by name.
_BUILD_ONLY = {
    "pip-licenses",
    "prettytable",
}

# In-house DK distributions that the pyproject.toml scan cannot discover.
_EXTRA_INTERNAL = {
    "requests-extensions",
    "requests_extensions",
}

# Hand-maintained license identifiers for distributions that pip-licenses reports
# as UNKNOWN. Keys are normalized package names.
LICENSE_OVERRIDES = {
    "google-crc32c": "Apache-2.0",
    "streamlit-camera-input-live": "MIT",
    "streamlit-embedcode": "MIT",
    "streamlit-keyup": "MIT",
    "streamlit-toggle-switch": "MIT",
    "streamlit-vertical-slider": "MIT",
    "streamlit-faker": "Apache-2.0",
}
| 35 | + |
| 36 | + |
def _normalize(name: str) -> str:
    """Return *name* in PEP 503 normalized form.

    Each run of hyphens, underscores, and dots becomes a single hyphen, and the
    result is lowercased.
    """
    collapsed = re.sub(r"[-_.]+", "-", name)
    return collapsed.lower()
| 40 | + |
| 41 | + |
def _parse_pkg_name(requirement: str) -> str:
    """Extract the normalized package name from a PEP 508 requirement string.

    The name is everything before the first version operator, extras bracket,
    opening parenthesis, marker separator, URL ``@``, or whitespace.

    Args:
        requirement: A requirement such as ``"requests[socks]>=2.0; extra == 'x'"``.

    Returns:
        The normalized name, or ``""`` when the string holds no name.
    """
    # Strip first: leading whitespace would otherwise be the first delimiter and
    # produce an empty name.
    cleaned = requirement.strip()
    # "(" is a delimiter because PEP 508 allows a parenthesized version spec
    # directly after the name, e.g. "foo(>=1.0)".
    head = re.split(r"[><=!~\[(;@\s]", cleaned, maxsplit=1)[0]
    return _normalize(head)
| 46 | + |
| 47 | + |
def _load_pyproject(path: Path) -> dict:
    """Parse the TOML file at *path* and return its contents as a dict.

    Uses the standard-library ``tomllib`` on Python 3.11 and newer, and the
    ``tomli`` package on older interpreters.
    """
    if sys.version_info < (3, 11):
        import tomli as toml_reader  # type: ignore[no-redef]
    else:
        import tomllib as toml_reader

    # Both parsers require a binary file handle.
    with open(path, "rb") as handle:
        parsed = toml_reader.load(handle)
    return parsed
| 55 | + |
| 56 | + |
def _find_pyprojects(repo_root: Path) -> list[Path]:
    """List the pyproject.toml files that exist under *repo_root*.

    The result holds the root file, then ``testgen/pyproject.toml``, then the
    sorted ``*/pyproject.toml`` matches under ``plugins/`` and
    ``testgen/plugins/``. Paths that do not exist are dropped.
    """
    submodule = repo_root / "testgen"
    found = [repo_root / "pyproject.toml", submodule / "pyproject.toml"]
    for base in (repo_root, submodule):
        plugin_root = base / "plugins"
        if not plugin_root.is_dir():
            continue
        found += sorted(plugin_root.glob("*/pyproject.toml"))
    return [entry for entry in found if entry.exists()]
| 64 | + |
| 65 | + |
def _resolve_transitive(names: set[str]) -> set[str]:
    """Return the normalized names in *names* plus everything they depend on.

    Dependencies are read from installed distribution metadata. A name with no
    installed distribution is kept in the result but not expanded further.
    """
    from importlib.metadata import PackageNotFoundError, requires

    seen: set[str] = set()
    pending = list(names)
    while pending:
        current = pending.pop()
        canonical = _normalize(current)
        if canonical in seen:
            continue
        seen.add(canonical)

        # Look the distribution up by the name as given, then by its normalized form.
        declared = None
        for lookup in (current, canonical):
            try:
                declared = requires(lookup) or []
            except PackageNotFoundError:
                continue
            break
        if declared is None:
            continue

        for requirement in declared:
            # Any requirement carrying a marker ("; ...") is skipped, which covers
            # extras as well as environment markers.
            # NOTE(review): environment-marked deps that do apply on this platform
            # are therefore never followed — confirm this is intended.
            if "; " in requirement:
                continue
            dependency = _parse_pkg_name(requirement)
            if dependency and dependency not in seen:
                pending.append(dependency)
    return seen
| 92 | + |
| 93 | + |
def _build_exclude_sets(repo_root: Path) -> tuple[set[str], set[str]]:
    """Read pyproject.toml files to build the dev-only and internal package sets.

    Every ``[project].name`` found is treated as an internal package, and every
    entry under ``[project.optional-dependencies]`` as a direct dev dependency.

    Args:
        repo_root: Directory whose pyproject.toml files are scanned.

    Returns:
        ``(dev_only, internal)``. Both sets hold normalized package names, so
        they can be compared directly against normalized names.
    """
    dev_direct: set[str] = set(_BUILD_ONLY)
    internal_raw: set[str] = set(_EXTRA_INTERNAL)

    for pyproject_path in _find_pyprojects(repo_root):
        project = _load_pyproject(pyproject_path).get("project", {})

        project_name = project.get("name")
        if project_name:
            internal_raw.add(project_name)

        for deps in project.get("optional-dependencies", {}).values():
            for dep in deps:
                dep_name = _parse_pkg_name(dep)
                # A blank or malformed entry parses to ""; never queue that for
                # a metadata lookup.
                if dep_name:
                    dev_direct.add(dep_name)

    # Expand dev deps transitively, then subtract anything reachable from the main
    # package. This keeps shared deps (e.g. requests, urllib3) in the runtime set.
    dev_all = _resolve_transitive(dev_direct)
    # Resolve from the names as written so the metadata lookup can try the
    # original spelling before the normalized one.
    runtime_all = _resolve_transitive(internal_raw)
    dev_only = dev_all - runtime_all

    # Project names are written freely in pyproject.toml (e.g. "My_Pkg"), so they
    # must be normalized before being used as an exclusion set.
    internal = {_normalize(name) for name in internal_raw}
    return dev_only, internal
| 116 | + |
| 117 | + |
def _find_repo_root() -> Path:
    """Locate the repository root.

    Checks, in order, the directory two levels above this script, the directory
    one level above it, and the current working directory. The first one that
    contains both ``pyproject.toml`` and ``testgen/pyproject.toml`` is returned.
    If none qualifies, the script's own directory is returned instead.
    """
    here = Path(__file__).resolve().parent
    search_order = (here.parent.parent, here.parent, Path.cwd())
    matches = (
        root
        for root in search_order
        if (root / "pyproject.toml").exists() and (root / "testgen" / "pyproject.toml").exists()
    )
    # Fallback for contexts (e.g. a Docker build) where no candidate has the root
    # pyproject.toml.
    return next(matches, here)
| 127 | + |
| 128 | + |
def normalize_license(name: str, lic: str) -> str:
    """Return the license label to print for package *name*.

    An entry in ``LICENSE_OVERRIDES`` wins over the reported value. An empty or
    ``"UNKNOWN"`` license is reported as ``"UNKNOWN"``. A value longer than 50
    characters that mentions "Apache" is shortened to ``"Apache-2.0"``. Anything
    else is returned unchanged.
    """
    override = LICENSE_OVERRIDES.get(_normalize(name))
    if override is not None:
        return override

    reported = bool(lic) and lic != "UNKNOWN"
    if not reported:
        return "UNKNOWN"

    verbose_apache = "Apache" in lic and len(lic) > 50
    return "Apache-2.0" if verbose_apache else lic
| 137 | + |
| 138 | + |
| 139 | +def extract_copyright(license_text: str) -> str | None: |
| 140 | + if not license_text: |
| 141 | + return None |
| 142 | + lines: list[str] = [] |
| 143 | + seen: set[str] = set() |
| 144 | + for line in license_text.split("\n"): |
| 145 | + stripped = line.strip() |
| 146 | + if re.match(r"(?i)copyright\s", stripped) and stripped not in seen: |
| 147 | + lines.append(stripped) |
| 148 | + seen.add(stripped) |
| 149 | + return "\n".join(lines) if lines else None |
| 150 | + |
| 151 | + |
def get_packages() -> list[dict]:
    """Run pip-licenses under the current interpreter and return its JSON output.

    The tool is asked for JSON including URLs, license file text, and notice file
    text, without license file paths.

    Raises:
        subprocess.CalledProcessError: If pip-licenses exits with a non-zero status.
    """
    command = [sys.executable, "-m", "piplicenses"]
    command += [
        "--format=json",
        "--with-urls",
        "--with-license-file",
        "--with-notice-file",
        "--no-license-path",
    ]
    completed = subprocess.run(command, capture_output=True, text=True, check=True)
    return json.loads(completed.stdout)
| 167 | + |
| 168 | + |
def generate(packages: list[dict], dev_only: set[str], internal: set[str]) -> str:
    """Render the THIRD-PARTY-NOTICES text for the runtime packages.

    Packages whose normalized name is in *internal* or *dev_only* are left out.
    The rest are sorted case-insensitively by name and rendered as a header, a
    summary table, and one detailed section per package.

    Args:
        packages: pip-licenses records; ``Name``, ``Version`` and ``License`` are
            required, ``URL``, ``LicenseText`` and ``NoticeText`` are optional.
        dev_only: Normalized names of development-only packages.
        internal: Normalized names of internal packages.

    Returns:
        The complete notices document, ending with a newline.
    """
    thin_rule = "-" * 60
    thick_rule = "=" * 60

    def _is_runtime(pkg: dict) -> bool:
        return not (_normalize(pkg["Name"]) in internal or _normalize(pkg["Name"]) in dev_only)

    runtime = sorted(filter(_is_runtime, packages), key=lambda p: p["Name"].lower())

    # Header
    out: list[str] = [
        "THIRD-PARTY SOFTWARE NOTICES AND INFORMATION",
        thick_rule,
        "",
        "DataOps TestGen Enterprise",
        f"Copyright (c) {date.today().year} DataKitchen, Inc.",
        "",
        "This product includes software developed by third parties.",
        "The following sets forth attribution notices for third-party",
        "software that may be contained in portions of this product.",
        "",
        f"Generated: {date.today().isoformat()}",
        f"Runtime dependencies: {len(runtime)}",
        "",
        "",
    ]

    # Summary table
    out += [
        thin_rule,
        "SUMMARY",
        thin_rule,
        "",
        f"{'Package':<40s} {'Version':<16s} {'License'}",
        f"{'-' * 40} {'-' * 16} {'-' * 30}",
    ]
    for pkg in runtime:
        lic = normalize_license(pkg["Name"], pkg["License"])
        out.append(f"{pkg['Name']:<40s} {pkg['Version']:<16s} {lic}")
    out += ["", ""]

    # Detailed notices
    out += [thin_rule, "DETAILED NOTICES", thin_rule]

    for pkg in runtime:
        name, version = pkg["Name"], pkg["Version"]
        lic = normalize_license(name, pkg["License"])
        url = pkg.get("URL", "")
        license_text = pkg.get("LicenseText", "")
        notice_text = pkg.get("NoticeText", "")

        out += ["", thick_rule, f"{name} {version}", f"License: {lic}"]
        if url and url != "UNKNOWN":
            out.append(f"URL: {url}")
        out.append(thick_rule)

        copyright_block = extract_copyright(license_text)
        if copyright_block:
            out += ["", copyright_block]

        notice_body = notice_text.strip() if notice_text else ""
        if notice_body and notice_body != "UNKNOWN":
            out += ["", "NOTICE:", notice_body]

        license_body = license_text.strip() if license_text else ""
        if license_body and license_body != "UNKNOWN":
            out.append("")
            # Long license texts that mention Apache are replaced by the standard
            # short-form Apache 2.0 notice.
            if len(license_body) > 3000 and "apache" in license_body.lower():
                out += [
                    "Licensed under the Apache License, Version 2.0.",
                    "You may obtain a copy of the License at",
                    "",
                    " http://www.apache.org/licenses/LICENSE-2.0",
                    "",
                    "Unless required by applicable law or agreed to in writing,",
                    "software distributed under the License is distributed on an",
                    '"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.',
                ]
            else:
                out.append(license_body)

    # The trailing empty item makes the joined document end with a newline.
    out.append("")
    return "\n".join(out)
| 258 | + |
| 259 | + |
def main() -> None:
    """Command-line entry point: build the notices and write or print them.

    Parses ``--output``, locates the repo root, builds the exclusion sets, collects
    package metadata, and renders the document. The result is written to the
    ``--output`` path when one is given and printed to stdout otherwise.
    """
    parser = argparse.ArgumentParser(description="Generate THIRD-PARTY-NOTICES")
    parser.add_argument("--output", default=None, help="Output file path (default: stdout)")
    args = parser.parse_args()

    repo_root = _find_repo_root()
    dev_only, internal = _build_exclude_sets(repo_root)
    packages = get_packages()
    content = generate(packages, dev_only, internal)

    if args.output:
        # License texts routinely contain non-ASCII characters (author names, the
        # copyright sign). Writing with the locale-default encoding can raise
        # UnicodeEncodeError, so the file is always written as UTF-8.
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(content)
    else:
        print(content)


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
0 commit comments