diff --git a/.gitignore b/.gitignore index a7d344bca5a7..b19b6b5c2154 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ static-analyzer-report .settings* .clangd .vscode +.changelog scripts/docs/en/components_schema scripts/docs/en/dynamic_configs scripts/docs/en/versions.md diff --git a/scripts/changelog_tool/AGENTS.md b/scripts/changelog_tool/AGENTS.md new file mode 100644 index 000000000000..c53eb45e36be --- /dev/null +++ b/scripts/changelog_tool/AGENTS.md @@ -0,0 +1,103 @@ +# Changelog Tool + +This agent is responsible for running the changelog tool, which collects commit information and identifies external contributors. + +## Heuristics for LLM Analysis + +The tool uses heuristics to determine which commits should be sent to an LLM for changelog analysis: + +We calculate a `score_size` metric as `lines_added + lines_deleted` for each commit. + +The tool will NOT send commits to the LLM if they meet any of these criteria: +1. Any file path contains "docs/" or "documentation", OR commit title contains documentation keywords +2. Commit title contains fix/bug keywords AND the commit is small (score_size <= 20) +3. All commits with score_size <= 20 + +Documentation keywords: "doc", "docs", "documentation", "readme" +Fix/bug keywords: "fix", "bugfix", "bug", "patch", "repair", "correct", "resolve" + +## Usage + +IMPORTANT: The changelog tool must always be run with the virtual environment activated: + +```bash +# Always activate the virtual environment first +source .venv/bin/activate + +# Run the tool +./changelog-tool [command] [options] +``` + +## Commands + +### collect + +Collects commits from the specified range and classifies them using heuristics and LLM analysis. 
+ +```bash +./changelog-tool collect [options] +``` + +Options: +- `--from-sha`: Starting commit SHA (overrides config) +- `--to-sha`: Ending commit SHA (overrides config) +- `--repo-path`: Path to the repository (overrides config) + +### review + +Generates a markdown report and an override YAML file for reviewing classified commits. + +```bash +./changelog-tool review +``` + +The review command generates two files in the output directory: +- `review_report.md`: A markdown report showing all commits, sorted by size, with their classification status, changelog lines, and analysis +- `override.yaml`: A commented YAML file containing all commits that can be uncommented and modified to override classifications + +The report is divided into two sections: +1. **Not in Changelog**: Commits that are not included in the changelog (either filtered by heuristics or marked as unclear) +2. **In Changelog**: Commits that are included in the changelog + +Each commit in the report shows: +- Commit hash with link to GitHub +- Commit title +- Status (✅ In Changelog, ❌ Not in Changelog, or ❓ Unclear) +- Size (number of lines changed) +- Changelog line (if available) +- Analysis (if available) + +### report + +Generates a formatted Markdown changelog based on the review output and applies user overrides. + +```bash +./changelog-tool report +``` + +The report command performs the following steps: +1. Loads classified commits from `classified.json` +2. Applies overrides from `override.yaml` (if present) +3. Identifies commits marked for the changelog that lack changelog lines or analysis +4. Runs these commits through the LLM with 1.5x increased prompt size and diff truncation enabled +5. Generates a formatted Markdown changelog grouped by classification: + - Breaking Changes + - Features + - Optimizations + - Bug Fixes + - Refactoring + - Minor Changes + - Documentation +6. Appends "Many thanks to [Name] for the PR!" for external contributors in the changelog +7. 
Appends a section at the end for external contributors not included in the changelog +8. Saves the generated changelog to `changelog.md` in the output directory + +## Output Directory + +By default, the tool outputs classified commits to `.changelog/preclassified.json`. You can customize this with the `--output-dir` global option: + +```bash +# Run with custom output directory +./changelog-tool --output-dir ./my-output-dir collect +./changelog-tool --output-dir ./my-output-dir review +``` \ No newline at end of file diff --git a/scripts/changelog_tool/README.md b/scripts/changelog_tool/README.md new file mode 100644 index 000000000000..3edc19efe209 --- /dev/null +++ b/scripts/changelog_tool/README.md @@ -0,0 +1,174 @@ +# Changelog Tool + +A tool for automatically generating changelogs from git commits using LLM analysis. + +## Features + +- **Automatic commit classification**: Classifies commits into categories (feature, bug, optimization, refactor, minor, docs, unclear) +- **LLM-powered analysis**: Uses LLM to analyze commits and generate changelog entries +- **External contributor detection**: Identifies external contributors and generates acknowledgments +- **Component extraction**: Extracts component names from commit titles for better organization +- **Override support**: Allows manual override of classifications and changelog entries +- **State persistence**: Saves LLM analysis results to avoid reprocessing +- **Rate limiting**: Configurable rate limiting and concurrent request limits + +## Installation + +1. Ensure you have Python 3.10+ installed +2. Install dependencies: +```bash +cd scripts/changelog_tool +python3 -m venv .venv +source .venv/bin/activate +python3 -m pip install -r requirements.txt +``` + +3. 
Set up environment variables: +```bash +export CHANGELOG_LLM_URL="https://your-llm-api.com/v1" +export CHANGELOG_LLM_API_KEY="your-api-key" +export CHANGELOG_LLM_MODEL="your-model-name" +``` + +## Configuration + +The tool is configured via `changelog.yaml`: + +```yaml +collect: + from_sha: # Starting commit SHA + to_sha: HEAD # Ending commit SHA (default: HEAD) + repo_path: ../.. # Path to the repository (default: ../..) + core_team_patterns: # Patterns to identify core team members + - ".*@userver\\.tech" + - ".*@yandex-team\\.com" + +llm-config: + target_rps: 1 # Target requests per second + retries: 3 # Number of retry attempts + max_commits_per_batch: 10 # Maximum commits per LLM batch + max_user_prompt_length: 100000 # Maximum prompt length in characters + include_diff: true # Include diff in LLM prompt + truncate_diff: false # Truncate diff if too long + max_concurrent_requests: 5 # Maximum concurrent requests + +review: + github_url: "https://github.com/userver-framework/userver" + +report: + github_url: "https://github.com/userver-framework/userver" +``` + +## Usage + +### Step 1: Collect Commits + +Run the `collect` command to gather commits and analyze them: + +```bash +source .venv/bin/activate +./changelog-tool collect +``` + +The tool will: +1. Fetch commits from the specified range +2. Classify commits using heuristics +3. Send unclear commits to LLM for analysis +4. Save results to `.changelog/classified.json` + +**Important**: Run the `collect` command repeatedly until you see a message like: +``` +Found 10 commits, 10 already processed, 0 to process via LLM +``` + +This ensures all commits have been processed by the LLM. The tool uses state persistence to avoid reprocessing commits, so running it multiple times is safe and recommended for reliability. 
+ +### Step 2: Review and Override + +Run the `review` command to generate a review report: + +```bash +./changelog-tool review +``` + +This generates two files in `.changelog/`: +- `review_report.md`: A markdown report showing all commits with their classification status +- `override.yaml`: A commented YAML file for overriding classifications + +Review the report and uncomment/modify entries in `override.yaml` to override classifications: + +```yaml +# Example override.yaml +commit_sha_1: + to_changelog: true + changelog_line: "Added support for async LLM processing" + +commit_sha_2: + to_changelog: false + classification: "minor" +``` + +You can leave `classification` or `changelog_line` empty; the LLM will fill them in during the next step. + +### Step 3: Generate Changelog + +Run the `report` command to generate the final changelog: + +```bash +./changelog-tool report +``` + +This will: +1. Load classified commits from `classified.json` +2. Apply overrides from `override.yaml` +3. Process commits needing LLM analysis with increased prompt size (1.5x) and diff truncation +4. Generate a formatted Markdown changelog grouped by classification and component +5. Save the changelog to `.changelog/changelog.md` + +**Important**: Run the `report` command repeatedly until you see a message like: +``` +Found 10 commits, 10 already processed, 0 to process via LLM +``` + +This ensures all commits that need LLM analysis have been processed. 
+ +## Output Format + +The generated changelog has the following structure: + +```markdown +* Breaking Change + * component1 + * changelog line 1 + * changelog line 2 + * changelog line without component + +* Feature + * component1 + * changelog line 3 + * changelog line without component + +* Optimization + * component2 + * changelog line 4 + +* Bug + * component1 + * changelog line 5 + +* Refactor + * component3 + * changelog line 6 + +* Minor + * changelog line 7 + +* Documentation + * changelog line 8 + +* Many thanks to: + * External Contributor 1 for commit title 1! + * External Contributor 2 for: + * commit title 2 + * commit title 3 +``` \ No newline at end of file diff --git a/scripts/changelog_tool/changelog-tool b/scripts/changelog_tool/changelog-tool new file mode 100755 index 000000000000..801aca07f57b --- /dev/null +++ b/scripts/changelog_tool/changelog-tool @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +import pathlib +import click + +import changelog_tool.config as cfg +import changelog_tool.collect.command as collect_cmd +import changelog_tool.review.command as review_cmd +import changelog_tool.report.command as report_cmd + +@click.group() +@click.option('--config', default='changelog.yaml') +@click.option('--output-dir', type=pathlib.Path, default=None) +@click.pass_context +def cli(ctx: click.Context, config: str, output_dir: pathlib.Path | None): + ctx.ensure_object(dict) + parsed_config = cfg.parse_config(pathlib.Path(config)) + if output_dir: + parsed_config.collect.output_dir = output_dir + parsed_config.review.output_dir = output_dir + parsed_config.report.output_dir = output_dir + ctx.obj["CONFIG"] = parsed_config + +@cli.command() +@click.option('--from-sha') +@click.option('--to-sha') +@click.option('--repo-path', type=pathlib.Path) +@click.pass_context +def collect(ctx: click.Context, from_sha: str | None, to_sha: str | None, repo_path: pathlib.Path | None): + # Get the config and override with CLI options if provided + config = 
ctx.obj["CONFIG"] + if from_sha: + config.collect.from_sha = from_sha + if to_sha: + config.collect.to_sha = to_sha + if repo_path: + config.collect.repo_path = repo_path + + collect_cmd.collect(config) + +@cli.command() +@click.pass_context +def review(ctx: click.Context): + config = ctx.obj["CONFIG"] + review_cmd.review(config) + +@cli.command() +@click.pass_context +def report(ctx: click.Context): + config = ctx.obj["CONFIG"] + report_cmd.report(config) + +if __name__ == '__main__': + cli() \ No newline at end of file diff --git a/scripts/changelog_tool/changelog.yaml b/scripts/changelog_tool/changelog.yaml new file mode 100644 index 000000000000..e38c1c2ee8e7 --- /dev/null +++ b/scripts/changelog_tool/changelog.yaml @@ -0,0 +1,22 @@ +collect: + from_sha: da8642900398c33333e29e2bd3e91ca4e181f602 + to_sha: HEAD + repo_path: ../.. + core_team_patterns: + - ".*@userver\\.tech" + - ".*@yandex-team\\.com" + +llm-config: + target_rps: 1 + retries: 7 + max_commits_per_batch: 4 + max_user_prompt_length: 100000 + include_diff: true + truncate_diff: false + max_concurrent_requests: 2 + +review: + github_url: "https://github.com/userver-framework/userver" + +report: + github_url: "https://github.com/userver-framework/userver" diff --git a/scripts/changelog_tool/changelog_tool/__init__.py b/scripts/changelog_tool/changelog_tool/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/scripts/changelog_tool/changelog_tool/collect/__init__.py b/scripts/changelog_tool/changelog_tool/collect/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/scripts/changelog_tool/changelog_tool/collect/classification.py b/scripts/changelog_tool/changelog_tool/collect/classification.py new file mode 100644 index 000000000000..cd83f7c870ff --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/collect/classification.py @@ -0,0 +1,45 @@ +from enum import Enum + +from changelog_tool.common.git import Commit + + +class Classification(str, Enum): 
+ FEATURE = "feature" + BUG = "bug" + BREAKING_CHANGE = "breaking-change" + MINOR_BUG = "minor_bug" + REFACTOR = "refactor" + DOCS = "docs" + UNCLEAR = "unclear" + MINOR="minor" + OPTIMIZATION = "optimization" + +MINOR_BUG_SIZE_THRESHOLD = 200 +MINOR_SIZE_THRESHOLD = 50 + +class ClassifiedCommit(Commit): + classification: Classification = Classification.UNCLEAR + is_external: bool = False + to_changelog: bool | None = None + changelog_line: str | None = None + commit_analysis: str | None = None + component: str | None = None + +def classify_commit(commit: Commit) -> Classification: + doc_keywords = ["doc", "docs", "documentation", "readme"] + commit_title_lower = commit.title.lower() + has_docs_in_title = any(keyword in commit_title_lower for keyword in doc_keywords) + + fix_keywords = ["fix", "bugfix", "bug"] + has_fix = any(keyword in commit_title_lower for keyword in fix_keywords) + + if has_docs_in_title: + return Classification.DOCS + + if has_fix and commit.score_size <= MINOR_BUG_SIZE_THRESHOLD: + return Classification.MINOR_BUG + + if commit.score_size <= MINOR_SIZE_THRESHOLD: + return Classification.MINOR + + return Classification.UNCLEAR \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/collect/command.py b/scripts/changelog_tool/changelog_tool/collect/command.py new file mode 100644 index 000000000000..8275b1dc7d3a --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/collect/command.py @@ -0,0 +1,106 @@ +import re +import asyncio +import os +from typing import List + +import changelog_tool.common.git as git +import changelog_tool.common.io as io +from changelog_tool.config import Config +from changelog_tool.collect.classification import Classification, classify_commit, ClassifiedCommit +from changelog_tool.llm.client import HttpLLMClient +from changelog_tool.llm.processor import LLMProcessor +from changelog_tool.llm.exceptions import LLMError + + +def _extract_component_from_title(title: str) -> str | None: + """Extract 
component name from commit title. + + Examples: + - "feat odbc: improve driver" -> "odbc" + - "fix(redis): connection leak" -> "redis" + - "feat chaotic: deal with..." -> "chaotic" + - "docs: update README" -> None + """ + # Pattern: type(component): or type component: or type component description + match = re.match(r'^(\w+)(?:\(([^)]+)\))?:?\s*(.+)', title) + if match: + commit_type = match.group(1) + component = match.group(2) + description = match.group(3) + + # If component in parentheses, use it + if component: + return component.lower() + + # If no component in parentheses, check description + words = description.split() + if words: + # Check if first word ends with colon (e.g., "odbc: improve driver") + if words[0].endswith(':'): + return words[0][:-1].lower() + # Check if first word is followed by a colon (e.g., "chaotic: deal with...") + if len(words) > 1 and words[1].startswith(':'): + return words[0].lower() + return None + +def collect(config: Config) -> None: + print(f"Collecting commits from {config.collect.from_sha} to {config.collect.to_sha}...") + commits: list[git.Commit] = git.get_commits(config.collect.from_sha, config.collect.to_sha, config.collect.repo_path) + + core_team_regexes = [re.compile(pattern) for pattern in config.collect.core_team_patterns] + + classified_commits: List[ClassifiedCommit] = [] + for commit in commits: + is_core_team = any(regex.match(commit.author) for regex in core_team_regexes) + classification = classify_commit(commit) + # Extract component from title (e.g., "feat odbc: improve driver" -> "odbc") + component = _extract_component_from_title(commit.title) + + classified_commit = ClassifiedCommit( + **commit.model_dump(), + classification=classification, + is_external=not is_core_team, + to_changelog=None, + changelog_line=None, + commit_analysis=None, + component=component + ) + + if classification in [Classification.FEATURE, Classification.BUG, Classification.BREAKING_CHANGE]: + raise RuntimeError("Unexpected 
positive changelog preclassification") + elif classification == Classification.UNCLEAR: + classified_commit.to_changelog = None + else: + classified_commit.to_changelog = False + + classified_commits.append(classified_commit) + + print(f"Found {len(classified_commits)} commits") + + io.dump_classified_commits(classified_commits, config.collect.output_dir, 'preclassified.json') + + llm_client = HttpLLMClient(config.llm_config) + llm_processor = LLMProcessor(config.llm_config, llm_client, config.collect.output_dir) + + unclear_commits = [ + commit for commit in classified_commits + if commit.classification == Classification.UNCLEAR + ] + + llm_results = asyncio.run(llm_processor.process_commits(unclear_commits)) + + for commit in classified_commits: + if commit.sha in llm_results: + result = llm_results[commit.sha] + try: + commit.classification = Classification(result.get("classification", "unclear")) + except ValueError: + # If LLM returned an unknown classification, keep UNCLEAR + pass + + commit.to_changelog = result.get("to_changelog") + commit.changelog_line = result.get("changelog_line") + commit.commit_analysis = result.get("detailed_commit_analysis") + + + io.dump_classified_commits(classified_commits, config.collect.output_dir, 'classified.json') \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/collect/config.py b/scripts/changelog_tool/changelog_tool/collect/config.py new file mode 100644 index 000000000000..1ed44f849aac --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/collect/config.py @@ -0,0 +1,10 @@ +import pathlib +import pydantic +from typing import List + +class CollectConfig(pydantic.BaseModel): + from_sha: str + to_sha: str + repo_path: pathlib.Path = pydantic.Field(default_factory=pathlib.Path.cwd) + core_team_patterns: List[str] = pydantic.Field(default_factory=list) + output_dir: pathlib.Path = pydantic.Field(default_factory=lambda: pathlib.Path(".changelog")) \ No newline at end of file diff --git 
a/scripts/changelog_tool/changelog_tool/common/__init__.py b/scripts/changelog_tool/changelog_tool/common/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/scripts/changelog_tool/changelog_tool/common/git.py b/scripts/changelog_tool/changelog_tool/common/git.py new file mode 100644 index 000000000000..4c47c0605274 --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/common/git.py @@ -0,0 +1,157 @@ +from __future__ import annotations + +import re +import subprocess +import pydantic +from pathlib import Path + + +class GitError(Exception): + """Any error when working with git.""" + + +class FileChange(pydantic.BaseModel): + path: str + old_path: str | None = None # None if file is not renamed + added_lines: int = 0 + removed_lines: int = 0 + + +class Commit(pydantic.BaseModel): + sha: str + title: str # first line of message + message: str # full message + author: str # "Name " + co_authors: list[str] # from "Co-authored-by:" + changed_files: list[FileChange] + total_added: int = 0 + total_removed: int = 0 + score_size: int = 0 + + +def get_commits( + from_ref: str | None = None, + to_ref: str = "HEAD", + repo_path: str | Path | None = None, +) -> list[Commit]: + cwd = _repo(repo_path) + rev_range = f"{from_ref}..{to_ref}" if from_ref else to_ref + + raw_shas = _run_git(["log", "--format=%H", rev_range], cwd) + shas = [s.strip() for s in raw_shas.splitlines() if s.strip()] + + return [_fetch_commit(sha, cwd) for sha in shas] + + +def get_commit( + sha: str, + repo_path: str | Path | None = None, +) -> Commit: + return _fetch_commit(sha, _repo(repo_path)) + + +def _repo(repo_path: str | Path | None) -> Path: + return Path(repo_path) if repo_path is not None else Path.cwd() + + +def _run_git(args: list[str], cwd: Path) -> str: + try: + result = subprocess.run( + ["git", *args], + cwd=cwd, + capture_output=True, + text=True, + ) + except FileNotFoundError: + raise GitError("git executable not found") + + if result.returncode != 0: + 
raise GitError(result.stderr.strip() or f"git {args[0]} failed") + + return result.stdout + + +def _parse_rename(path_str: str) -> tuple[str, str | None]: + m = re.match(r'^(.*?)\{(.*?) => (.*?)\}(.*)$', path_str) + if m: + pre, old_mid, new_mid, suf = m.groups() + old = (pre + old_mid + suf).strip('/') + new = (pre + new_mid + suf).strip('/') + return new, old + + if ' => ' in path_str: + old, new = path_str.split(' => ', 1) + return new.strip(), old.strip() + + return path_str, None + + +def _parse_numstat(output: str) -> list[FileChange]: + changes: list[FileChange] = [] + + for line in output.splitlines(): + line = line.strip() + if not line: + continue + + parts = line.split('\t', 2) + if len(parts) != 3: + continue + + added_str, removed_str, path_str = parts + + added = 0 if added_str == '-' else int(added_str) + removed = 0 if removed_str == '-' else int(removed_str) + + path, old_path = _parse_rename(path_str) + changes.append(FileChange( + path=path, + old_path=old_path, + added_lines=added, + removed_lines=removed, + )) + + return changes + + +def _parse_co_authors(message: str) -> list[str]: + return re.findall(r'(?im)^Co-authored-by:\s*(.+)$', message) + + +def _fetch_commit(sha: str, cwd: Path) -> Commit: + raw_meta = _run_git( + ["show", "-s", "--format=%H%x00%an <%ae>%x00%B", sha], + cwd, + ) + parts = raw_meta.split('\x00', 2) + if len(parts) < 3: + raise GitError(f"Unexpected git show output for {sha!r}") + + sha_full = parts[0].strip() + author = parts[1].strip() + message = parts[2].strip() + title = message.splitlines()[0] if message else "" + + raw_numstat = _run_git( + ["diff-tree", "--root", "--numstat", "-r", "-M", sha], + cwd, + ) + changes = _parse_numstat(raw_numstat) + + return Commit( + sha=sha_full, + title=title, + message=message, + author=author, + co_authors=_parse_co_authors(message), + changed_files=changes, + total_added=sum(c.added_lines for c in changes), + total_removed=sum(c.removed_lines for c in changes), + 
score_size=sum(c.added_lines + c.removed_lines for c in changes), + ) + +def get_commit_diff(commit: Commit, repo_path: str | Path | None = None) -> str: + return get_diff_by_sha(commit.sha, repo_path) + +def get_diff_by_sha(sha: str, repo_path: str | Path | None = None) -> str: + return _run_git(["diff-tree", "--root", "-p", "-r", "-M", sha], _repo(repo_path)) diff --git a/scripts/changelog_tool/changelog_tool/common/io.py b/scripts/changelog_tool/changelog_tool/common/io.py new file mode 100644 index 000000000000..47eb987986e6 --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/common/io.py @@ -0,0 +1,37 @@ +import json +import pathlib +from typing import List + +from changelog_tool.collect.classification import ClassifiedCommit + + +def dump_classified_commits(commits: List[ClassifiedCommit], output_dir: pathlib.Path, filename: str) -> None: + # Ensure output directory exists + output_dir.mkdir(parents=True, exist_ok=True) + + # Create full path to output file + output_file = output_dir / filename + + # Convert classified commits to JSON format + json_data = [commit.model_dump() for commit in commits] + json_str = json.dumps(json_data, indent=2) + + # Write to file + with open(output_file, 'w') as f: + f.write(json_str) + + +def load_classified_commits(output_dir: pathlib.Path, filename: str) -> List[ClassifiedCommit]: + # Create full path to input file + input_file = output_dir / filename + + # Check if file exists + if not input_file.exists(): + return [] + + # Read from file + with open(input_file, 'r') as f: + json_data = json.load(f) + + # Convert JSON data to ClassifiedCommit objects + return [ClassifiedCommit(**item) for item in json_data] \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/config.py b/scripts/changelog_tool/changelog_tool/config.py new file mode 100644 index 000000000000..57b38b878d01 --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/config.py @@ -0,0 +1,19 @@ +from changelog_tool.collect.config 
import CollectConfig +from changelog_tool.llm.config import LLMConfig +from changelog_tool.review.config import ReviewConfig +from changelog_tool.report.config import ReportConfig + +import pydantic +import yaml +import pathlib + +class Config(pydantic.BaseModel): + collect: CollectConfig + llm_config: LLMConfig = pydantic.Field(alias="llm-config") + review: ReviewConfig + report: ReportConfig + +def parse_config(config_path: pathlib.Path) -> Config: + with open(config_path, 'r') as f: + yaml_data = yaml.safe_load(f) + return Config.model_validate(yaml_data) \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/llm/__init__.py b/scripts/changelog_tool/changelog_tool/llm/__init__.py new file mode 100644 index 000000000000..b64ba976b736 --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/llm/__init__.py @@ -0,0 +1,15 @@ +from changelog_tool.llm.config import LLMConfig +from changelog_tool.llm.exceptions import LLMError, LLMTransientError +from changelog_tool.llm.client import BaseLLMClient, HttpLLMClient +from changelog_tool.llm.state import LLMState +from changelog_tool.llm.processor import LLMProcessor + +__all__ = [ + "LLMConfig", + "LLMError", + "LLMTransientError", + "BaseLLMClient", + "HttpLLMClient", + "LLMState", + "LLMProcessor", +] diff --git a/scripts/changelog_tool/changelog_tool/llm/client.py b/scripts/changelog_tool/changelog_tool/llm/client.py new file mode 100644 index 000000000000..1f1a7610025a --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/llm/client.py @@ -0,0 +1,135 @@ +import os +import asyncio +from abc import ABC, abstractmethod + +import httpx +import openai +from aiolimiter import AsyncLimiter + +from changelog_tool.llm.config import LLMConfig +from changelog_tool.llm.exceptions import LLMError, LLMTransientError + + +class BaseLLMClient(ABC): + @abstractmethod + async def generate(self, prompt: str) -> str: + """ + Asynchronously sends a text prompt to the LLM and returns a text response. 
class HttpLLMClient(BaseLLMClient):
    """LLM client for an OpenAI-compatible chat-completions endpoint.

    Connection settings are read from the environment:
    ``CHANGELOG_LLM_URL`` (required), ``CHANGELOG_LLM_MODEL``, and one of
    ``CHANGELOG_LLM_API_KEY`` / ``CHANGELOG_LLM_OAUTH_KEY`` (required).
    Requests are throttled by an ``AsyncLimiter`` built from
    ``config.target_rps``, capped at ``config.max_concurrent_requests`` in
    flight, and retried up to ``config.retries`` times with exponential
    backoff.
    """

    def __init__(self, config: LLMConfig):
        """Read credentials from the environment and build the client.

        Raises:
            RuntimeError: if ``CHANGELOG_LLM_URL`` is unset, or neither
                ``CHANGELOG_LLM_API_KEY`` nor ``CHANGELOG_LLM_OAUTH_KEY``
                is set.
        """
        self.url = os.environ.get("CHANGELOG_LLM_URL")
        api_key = os.environ.get("CHANGELOG_LLM_API_KEY")
        oauth_key = os.environ.get("CHANGELOG_LLM_OAUTH_KEY")
        self.model = os.environ.get("CHANGELOG_LLM_MODEL")
        self.retries = config.retries

        if not self.url:
            raise RuntimeError("Missing required environment variable: CHANGELOG_LLM_URL")

        # The API key takes precedence over the OAuth key when both are set.
        if api_key:
            auth_header = f"Bearer {api_key}"
        elif oauth_key:
            auth_header = f"Oauth {oauth_key}"
        else:
            raise RuntimeError("Missing required environment variable: either CHANGELOG_LLM_API_KEY or CHANGELOG_LLM_OAUTH_KEY must be set")

        self.limiter = AsyncLimiter(config.target_rps, 1)
        self.semaphore = asyncio.Semaphore(config.max_concurrent_requests)

        # NOTE(review): verify=False turns off TLS certificate verification,
        # so the credentials above travel over a connection whose peer is
        # not authenticated. Left unchanged for compatibility; confirm it is
        # really required (e.g. a self-signed internal endpoint) or pass a
        # CA bundle instead.
        http_client = httpx.AsyncClient(verify=False)

        self.client = openai.AsyncOpenAI(
            base_url=self.url,
            api_key=api_key or oauth_key or "dummy",
            default_headers={"Authorization": auth_header},
            # Retries are handled in generate(), not by the SDK.
            max_retries=0,
            http_client=http_client,
        )

    @staticmethod
    def _extract_content(response) -> str:
        """Return the first choice's text from a completion response.

        Raises:
            ValueError: if the response does not have the expected shape.
        """
        # Handle non-standard API response format
        # Some APIs return the actual data in a 'response' dict attribute
        if hasattr(response, 'response') and response.response:
            response_data = response.response
            if isinstance(response_data, dict) and 'choices' in response_data:
                choices = response_data['choices']
                if choices and len(choices) > 0:
                    first_choice = choices[0]
                    if 'message' in first_choice and 'content' in first_choice['message']:
                        return first_choice['message']['content'] or ""

        # Standard OpenAI response format
        if not response:
            raise ValueError("LLM returned None response")

        # Also covers an empty choices list, which is falsy.
        if not hasattr(response, 'choices') or not response.choices:
            raise ValueError(f"LLM response has no choices. Response: {response}")

        first_choice = response.choices[0]
        if not hasattr(first_choice, 'message'):
            raise ValueError(f"First choice has no message attribute. Choice: {first_choice}")

        message = first_choice.message
        if not hasattr(message, 'content'):
            raise ValueError(f"Message has no content attribute. Message: {message}")

        return message.content or ""

    async def _backoff(self, attempt: int) -> None:
        """Sleep ``2 ** attempt`` seconds, unless no attempt is left.

        Skipping the sleep after the final attempt avoids a pointless wait
        (128 s when ``retries`` is 7) before the failure is reported.
        """
        if attempt < self.retries:
            await asyncio.sleep(2 ** attempt)

    async def generate(self, prompt: str) -> str:
        """Send ``prompt`` as a single user message and return the reply text.

        Raises:
            LLMError: on a 400/401/403/404 status or any other non-5xx
                status error; these are not retried.
            LLMTransientError: when every attempt failed with a retryable
                error.
        """
        last_error = None

        async with self.semaphore:
            for attempt in range(self.retries + 1):
                try:
                    async with self.limiter:
                        if attempt > 0:
                            print(f" Retrying ({attempt}/{self.retries})...")

                        response = await self.client.chat.completions.create(
                            model=self.model,
                            messages=[{"role": "user", "content": prompt}],
                        )
                        return self._extract_content(response)

                except openai.RateLimitError as e:
                    last_error = f"Rate limit: {e}"
                    print(f" Rate limit hit, waiting...")
                except openai.APIStatusError as e:
                    if e.status_code in (400, 401, 403, 404):
                        raise LLMError(f"Critical LLM error: {e.status_code} - {e.message}") from e
                    if e.status_code < 500:
                        raise LLMError(f"Unexpected status {e.status_code}: {e.message}") from e
                    last_error = f"Server error {e.status_code}"
                    print(f" Server error, retrying...")
                except openai.APIError as e:
                    last_error = f"API error: {e}"
                    print(f" API error, retrying...")
                except Exception as e:
                    # Deliberately broad: malformed responses (ValueError
                    # from _extract_content) and client-side failures are
                    # treated as retryable.
                    last_error = f"Client error: {e}"
                    print(f" Error, retrying...")

                await self._backoff(attempt)

        raise LLMTransientError(f"Max retries ({self.retries}) exceeded. Last error: {last_error}")

    async def close(self):
        """Close the underlying OpenAI client."""
        await self.client.close()
async def _process_batch(self, batch: List[Commit], batch_idx: int = 0, total_batches: int = 0, total_commits: int = 0) -> Dict[str, Dict[str, Any]]:
    """Analyze one batch of commits with the LLM and persist per-commit outcomes.

    Args:
        batch: Commits that are sent to the LLM in a single prompt.
        batch_idx: Zero-based index of this batch (used for progress output only).
        total_batches: Number of batches in the run (progress output only).
        total_commits: Number of commits in the run; when positive, a progress
            line is printed after every stored result.

    Returns:
        A mapping ``sha -> result`` with the keys ``classification``,
        ``to_changelog``, ``changelog_line`` and ``detailed_commit_analysis``.
        Commits that could not be analyzed get an "unclear" placeholder and an
        error entry in the state, so they are retried on the next run.

    Raises:
        LLMError: On a critical failure (reported by the client, or a response
            that is not a JSON object). The error is written to the state for
            every commit of the batch before it is re-raised.
    """

    def unanalyzed() -> Dict[str, Any]:
        # Placeholder for a commit the LLM did not (successfully) analyze.
        # to_changelog stays None ("undecided") in every failure path.
        return {
            "classification": "unclear",
            "to_changelog": None,
            "changelog_line": "",
            "detailed_commit_analysis": ""
        }

    async def fail_batch(error_msg: str) -> Dict[str, Dict[str, Any]]:
        # Record the same error for every commit of the batch.
        for failed in batch:
            await self.state.set_error(failed.sha, error_msg)
        return {failed.sha: unanalyzed() for failed in batch}

    try:
        print(f"[{batch_idx + 1}/{total_batches}] Processing {len(batch)} commits...")
        prompt = self._build_prompt(batch)

        # Check prompt length
        if len(prompt) > self.config.max_user_prompt_length:
            if not self.config.truncate_diff:
                return await fail_batch(
                    f"Prompt too long ({len(prompt)} > {self.config.max_user_prompt_length})"
                )
            prompt = self._truncate_prompt(prompt)

        # Send to LLM
        response_text = await self.llm_client.generate(prompt)

        # Remove markdown code fences if present
        response_text = response_text.strip()
        for fence in ('```json', '```'):
            if response_text.startswith(fence):
                response_text = response_text[len(fence):].strip()
        if response_text.endswith('```'):
            response_text = response_text[:-3].strip()

        # Parse response
        try:
            response_data = json.loads(response_text)
        except json.JSONDecodeError as e:
            raise LLMError(f"LLM returned invalid JSON: {e}")

        # Check response format
        if not isinstance(response_data, dict):
            raise LLMError("LLM returned invalid response format (not a dict)")

        # Save results and return
        results = {}
        for commit in batch:
            commit_data = response_data.get(commit.sha)

            if not commit_data or not isinstance(commit_data, (str, dict)):
                # The answer has no usable entry for this commit (for example the
                # prompt was truncated). Store an error rather than a "successful"
                # unclear result, otherwise the commit would never be retried.
                await self.state.set_error(commit.sha, "LLM response has no usable entry for this commit")
                results[commit.sha] = unanalyzed()
                continue

            if isinstance(commit_data, str):
                # Fallback if LLM returned just a string
                classification = commit_data
                to_changelog = classification in ["feature", "breaking-change"]
                changelog_line = ""
                detailed_commit_analysis = ""
            else:
                classification = commit_data.get("classification", "unclear")
                to_changelog = commit_data.get("to_changelog", None)
                changelog_line = commit_data.get("changelog_line", "")
                detailed_commit_analysis = commit_data.get("detailed_commit_analysis", "")

            completed = await self.state.set_result(commit.sha, classification, changelog_line, detailed_commit_analysis, to_changelog)
            if total_commits > 0:
                remaining = total_commits - completed
                print(f"  Progress: {completed}/{total_commits} commits, {remaining} remaining")

            results[commit.sha] = {
                "classification": classification,
                "to_changelog": to_changelog,
                "changelog_line": changelog_line,
                "detailed_commit_analysis": detailed_commit_analysis
            }

        print(f"[{batch_idx + 1}/{total_batches}] ✓ Completed")
        return results

    except LLMTransientError as e:
        # LLMTransientError derives from LLMError, so it has to be handled
        # before the critical branch: a transient failure only marks the
        # commits as erroneous and lets the run continue.
        error_msg = f"{type(e).__name__}: {str(e)}"
        print(f"✗ Batch {batch_idx + 1}/{total_batches} failed: {error_msg}")
        return await fail_batch(error_msg)
    except LLMError as e:
        # Critical error - make it visible and persist it, then re-raise
        error_msg = f"{type(e).__name__}: {str(e)}"
        print(f"✗ Batch {batch_idx + 1}/{total_batches} failed: {error_msg}")
        await fail_batch(error_msg)
        raise
    except Exception as e:
        # Any other problem - mark commits as erroneous
        error_msg = f"{type(e).__name__}: {str(e)}"
        print(f"✗ Batch {batch_idx + 1}/{total_batches} failed: {error_msg}")
        return await fail_batch(error_msg)
commit.changed_files)) + + if self.config.include_diff: + diff = get_commit_diff(commit) + size += len(diff) + + return size + 200 # reserve for JSON formatting and separators + + def _build_prompt(self, commits: List[Commit]) -> str: + """Forms a prompt for a batch of commits.""" + system_prompt = """You are an expert software engineer analyzing git commits for a changelog. +Your task is to analyze commits since the last release and highlight important and interesting changes. +Ignore simple bugfixes, typos, and minor refactoring. + +IMPORTANT: This is for the USERVER project - a C++ asynchronous framework. Focus on changes that are significant for users of this framework. + +For each commit, you MUST provide a JSON object with the following fields: +1. "classification": One of ["feature", "breaking-change", "refactor", "minor", "optimization", "unclear"]. + - Use "breaking-change" if the commit introduces backward-incompatible changes. + - Use "feature" for new functionality that is important for USERVER users. + - Use "refactor" for significant architectural changes. + - Use "minor" for small improvements. + - Use "optimization" for performance improvements, optimizations, and efficiency gains. + - Use "unclear" if you cannot determine the classification. +2. "to_changelog": Boolean - MUST be true for: + - ALL breaking-change commits (these are critical for users) + - Features that are significant for USERVER users (new components, major APIs, important functionality) + - MUST be false for: minor refactoring, bugfixes, typos, internal changes, test updates +3. "changelog_line": A concise, user-friendly description of the change suitable for a changelog. + - IMPORTANT: If the classification is "breaking-change", you MUST include migration or fix instructions in this line if they are present in the commit message. + - Only include this if to_changelog is true. +4. 
"detailed_commit_analysis": A detailed analysis of what was added, why it was added, and what impact or benefit it brings to the project. + +You MUST return a valid JSON object where keys are commit SHAs and values are the analysis objects. +Example output format: +{ + "commit_sha_1": { + "classification": "feature", + "to_changelog": true, + "changelog_line": "Added support for async LLM processing", + "detailed_commit_analysis": "Added a new LLMProcessor class to handle batching and async requests. This improves performance by allowing parallel processing of commits." + }, + "commit_sha_2": { + "classification": "breaking-change", + "to_changelog": true, + "changelog_line": "Changed config format. Migration: rename 'llm_config' to 'llm-config' in your yaml file.", + "detailed_commit_analysis": "Updated the configuration schema to use hyphens instead of underscores for consistency. This breaks existing configs but aligns with the project's naming conventions." + }, + "commit_sha_3": { + "classification": "minor", + "to_changelog": false, + "changelog_line": "", + "detailed_commit_analysis": "Fixed typo in documentation." 
async def load(self) -> None:
    """Load the persisted state from ``self.state_file_path``.

    The in-memory state is reset to an empty dict when the file is missing,
    cannot be read, is not valid UTF-8/JSON, or does not contain a JSON
    object. Entries whose value is not a dict are dropped, because the other
    accessors call ``.get()`` on every stored entry.
    """
    async with self.lock:
        self.state = {}
        if not self.state_file_path.exists():
            return

        try:
            with open(self.state_file_path, 'r', encoding='utf-8') as f:
                loaded_state = json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers both json.JSONDecodeError and the
            # UnicodeDecodeError raised for a file that is not valid UTF-8.
            print(f"Warning: Could not load state file {self.state_file_path}: {e}")
            return

        # Ensure the state has the correct format
        if isinstance(loaded_state, dict):
            self.state = {
                sha: entry
                for sha, entry in loaded_state.items()
                if isinstance(entry, dict)
            }
async def get_result(self, sha: str) -> Optional[Dict[str, Any]]:
    """Look up the stored analysis for ``sha``.

    Returns the stored entry only when it is non-empty and its ``error``
    field is ``None`` (or absent); in every other case returns ``None``.
    """
    async with self.lock:
        entry = self.state.get(sha)
        if not entry:
            # Unknown commit, or an empty entry
            return None
        if entry.get("error") is not None:
            # The last attempt for this commit failed
            return None
        return entry
def report(config: Config) -> None:
    """Build ``changelog.md`` from the classified commits.

    Steps:
      1. Load ``classified.json`` from ``config.report.output_dir``.
      2. Apply ``override.yaml`` from the same directory, if it exists.
      3. Send commits that are marked for the changelog but lack a changelog
         line or an analysis to the LLM, with a 1.5x prompt budget and
         truncation enabled.
      4. Render the changelog and write it to ``changelog.md``.

    LLM output only fills in what is missing: an existing changelog line or
    analysis is never replaced, and an "unclear" classification (which is also
    what a failed batch reports) never replaces the current one.
    """
    print(f"Loading classified commits from {config.report.output_dir}...")
    classified_commits: List[ClassifiedCommit] = io.load_classified_commits(
        config.report.output_dir, 'classified.json'
    )

    if not classified_commits:
        print("No classified commits found. Please run 'collect' command first.")
        return

    print(f"Found {len(classified_commits)} classified commits")

    # Load and apply overrides
    override_file = config.report.output_dir / 'override.yaml'
    if override_file.exists():
        print(f"Applying overrides from {override_file}...")
        _apply_overrides(classified_commits, override_file)

    # Identify commits that need LLM analysis
    commits_needing_analysis = [
        commit for commit in classified_commits
        if commit.to_changelog is True and (not commit.changelog_line or not commit.commit_analysis)
    ]

    if commits_needing_analysis:
        print(f"Found {len(commits_needing_analysis)} commits needing LLM analysis")

        # Copy the configured LLM settings and change only the two values this
        # command needs, so that every other setting (for example
        # max_concurrent_requests) keeps its configured value.
        modified_llm_config = config.llm_config.model_copy(update={
            "max_user_prompt_length": int(config.llm_config.max_user_prompt_length * 1.5),
            "truncate_diff": True,
        })

        llm_client = HttpLLMClient(modified_llm_config)
        llm_processor = LLMProcessor(modified_llm_config, llm_client, config.report.output_dir)

        async def _analyze() -> Dict[str, Dict[str, Any]]:
            # Release the client's connections even when processing fails.
            try:
                return await llm_processor.process_commits(commits_needing_analysis)
            finally:
                close = getattr(llm_client, "close", None)
                if close is not None:
                    await close()

        llm_results = asyncio.run(_analyze())

        # Update commits with LLM results
        for commit in classified_commits:
            result = llm_results.get(commit.sha)
            if not result:
                continue

            updated = False

            new_line = result.get("changelog_line")
            if new_line and not commit.changelog_line:
                commit.changelog_line = new_line
                updated = True

            new_analysis = result.get("detailed_commit_analysis")
            if new_analysis and not commit.commit_analysis:
                commit.commit_analysis = new_analysis
                updated = True

            # NOTE(review): a definite LLM classification still replaces the
            # current one, including one that came from override.yaml — confirm
            # whether overrides should take precedence here as well.
            new_classification = result.get("classification", "unclear")
            if new_classification != "unclear":
                try:
                    commit.classification = Classification(new_classification)
                    updated = True
                except ValueError:
                    # Unknown label from the LLM: keep the current classification
                    pass

            if updated:
                print(f"Updated commit {commit.sha} with LLM results")

    # Generate changelog
    print("Generating changelog...")
    changelog_content = _generate_changelog(classified_commits, config.report.github_url)

    # Save changelog (explicit UTF-8: changelog lines may contain non-ASCII text)
    changelog_file = config.report.output_dir / 'changelog.md'
    with open(changelog_file, 'w', encoding='utf-8') as f:
        f.write(changelog_content)
    print(f"Generated changelog: {changelog_file}")
component_groups: Dict[str, List[ClassifiedCommit]] = {} + commits_without_component = [] + + for commit in section_commits: + if commit.component: + if commit.component not in component_groups: + component_groups[commit.component] = [] + component_groups[commit.component].append(commit) + else: + commits_without_component.append(commit) + + # Generate entries for each component + for component in sorted(component_groups.keys()): + component_commits = component_groups[component] + lines.append(f" * {component}") + lines.append("") + + for commit in component_commits: + short_sha = commit.sha[:8] + line = f" * {commit.changelog_line} " + + # Add external contributor thanks + if commit.is_external: + author_name = _extract_author_name(commit.author) + line += f" Many thanks to {author_name} for the PR!" + + lines.append(line) + + lines.append("") + + # Generate entries for commits without component + if commits_without_component: + for commit in commits_without_component: + short_sha = commit.sha[:8] + line = f" * {commit.changelog_line} " + + # Add external contributor thanks + if commit.is_external: + author_name = _extract_author_name(commit.author) + line += f" Many thanks to {author_name} for the PR!" 
+ + lines.append(line) + + lines.append("") + + # Collect external contributors not in changelog + # Group by author and collect their commit titles + external_contributors_not_in_changelog: Dict[str, List[str]] = {} + for commit in commits: + if commit.is_external and (commit.to_changelog is False or commit.to_changelog is None): + author_name = _extract_author_name(commit.author) + if author_name not in external_contributors_not_in_changelog: + external_contributors_not_in_changelog[author_name] = [] + external_contributors_not_in_changelog[author_name].append(commit.title) + + if external_contributors_not_in_changelog: + lines.append("* Many thanks to:") + for contributor in sorted(external_contributors_not_in_changelog.keys()): + titles = external_contributors_not_in_changelog[contributor] + if len(titles) == 1: + lines.append(f" * {contributor} for {titles[0]}!") + else: + lines.append(f" * {contributor} for:") + for title in titles: + lines.append(f" * {title}") + lines.append("") + + return "\n".join(lines) + + +def _extract_author_name(author: str) -> str: + """Extract author name from 'Name ' format.""" + match = re.match(r'^(.+?)\s*<', author) + if match: + return match.group(1).strip() + return author diff --git a/scripts/changelog_tool/changelog_tool/report/config.py b/scripts/changelog_tool/changelog_tool/report/config.py new file mode 100644 index 000000000000..74d84c30fc98 --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/report/config.py @@ -0,0 +1,6 @@ +import pathlib +import pydantic + +class ReportConfig(pydantic.BaseModel): + github_url: str + output_dir: pathlib.Path = pydantic.Field(default_factory=lambda: pathlib.Path(".changelog")) diff --git a/scripts/changelog_tool/changelog_tool/review/command.py b/scripts/changelog_tool/changelog_tool/review/command.py new file mode 100644 index 000000000000..8249499d71bd --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/review/command.py @@ -0,0 +1,157 @@ +import pathlib +from typing 
def review(config: Config) -> None:
    """Write ``review_report.md`` and ``override.yaml`` for the classified commits.

    Commits are loaded from ``classified.json`` in ``config.review.output_dir``
    and split into two groups, each sorted by ``score_size`` in descending order:

    * "in changelog": ``to_changelog`` is ``True`` and the classification is
      not ``UNCLEAR``;
    * "not in changelog": every other commit, including those whose
      ``to_changelog`` is still ``None``, so that no commit is missing from
      the review files.
    """
    print(f"Loading classified commits from {config.review.output_dir}...")
    classified_commits: List[ClassifiedCommit] = io.load_classified_commits(
        config.review.output_dir, 'classified.json'
    )

    if not classified_commits:
        print("No classified commits found. Please run 'collect' command first.")
        return

    print(f"Found {len(classified_commits)} classified commits")

    # Split commits into two groups
    not_in_changelog: List[ClassifiedCommit] = []
    in_changelog: List[ClassifiedCommit] = []

    for commit in classified_commits:
        if commit.to_changelog is True and commit.classification != Classification.UNCLEAR:
            in_changelog.append(commit)
        else:
            # Rejected, unclear, or still undecided (to_changelog is None)
            not_in_changelog.append(commit)

    # Sort both groups by score_size (descending)
    not_in_changelog.sort(key=lambda c: c.score_size, reverse=True)
    in_changelog.sort(key=lambda c: c.score_size, reverse=True)

    # Generate markdown report
    markdown_content = _generate_markdown_report(
        not_in_changelog, in_changelog, config.review.github_url
    )

    # Generate override YAML
    override_yaml_content = _generate_override_yaml(
        not_in_changelog, in_changelog
    )

    # Write output files
    output_dir = config.review.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8: the report contains emoji status markers, which the
    # locale default encoding cannot always represent.
    markdown_file = output_dir / 'review_report.md'
    with open(markdown_file, 'w', encoding='utf-8') as f:
        f.write(markdown_content)
    print(f"Generated markdown report: {markdown_file}")

    override_file = output_dir / 'override.yaml'
    with open(override_file, 'w', encoding='utf-8') as f:
        f.write(override_yaml_content)
    print(f"Generated override YAML: {override_file}")
lines = [] + + # Header + lines.append("# Changelog Review Report\n") + + # Not in changelog section + lines.append("## Not in Changelog\n") + lines.append(f"Total: {len(not_in_changelog)} commits\n") + + for commit in not_in_changelog: + lines.append(_format_commit_markdown(commit, github_url)) + lines.append("") + + # In changelog section + lines.append("## In Changelog\n") + lines.append(f"Total: {len(in_changelog)} commits\n") + + for commit in in_changelog: + lines.append(_format_commit_markdown(commit, github_url)) + lines.append("") + + return "\n".join(lines) + + +def _format_commit_markdown(commit: ClassifiedCommit, github_url: str) -> str: + short_sha = commit.sha[:8] + commit_url = f"{github_url}/commit/{commit.sha}" + + lines = [] + lines.append(f"### [{short_sha}]({commit_url}) {commit.title}") + lines.append("") + + # Status + if commit.to_changelog is True: + status = "✅ In Changelog" + elif commit.to_changelog is False: + status = f"❌ Not in Changelog (Classification: {commit.classification})" + else: + status = f"❓ Unclear (Classification: {commit.classification})" + + lines.append(f"**Status:** {status}") + lines.append(f"**Size:** {commit.score_size} lines changed") + lines.append("") + + # Changelog line (if available) + if commit.changelog_line: + lines.append(f"**Changelog Line:** {commit.changelog_line}") + lines.append("") + + # Analysis (if available) + if commit.commit_analysis: + lines.append("**Analysis:**") + lines.append(commit.commit_analysis) + lines.append("") + + return "\n".join(lines) + + +def _generate_override_yaml( + not_in_changelog: List[ClassifiedCommit], + in_changelog: List[ClassifiedCommit] +) -> str: + lines = [] + + # Header comment + lines.append("# Override file for changelog classification") + lines.append("# Uncomment and modify entries to override classification") + lines.append("") + + # Process all commits in order + all_commits = not_in_changelog + in_changelog + + for commit in all_commits: + lines.append(f"# 
{commit.sha}:") + lines.append(f"# commit_title: \"{commit.title}\"") + + if commit.to_changelog is True: + to_changelog = "true" + elif commit.to_changelog is False: + to_changelog = "false" + else: + to_changelog = "null" + + lines.append(f"# to_changelog: {to_changelog}") + + if commit.changelog_line: + lines.append(f"# changelog_line: \"{commit.changelog_line}\"") + else: + lines.append(f"# changelog_line: null") + + lines.append("") + + return "\n".join(lines) diff --git a/scripts/changelog_tool/changelog_tool/review/config.py b/scripts/changelog_tool/changelog_tool/review/config.py new file mode 100644 index 000000000000..9a40fbf578ec --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/review/config.py @@ -0,0 +1,6 @@ +import pathlib +import pydantic + +class ReviewConfig(pydantic.BaseModel): + github_url: str + output_dir: pathlib.Path = pydantic.Field(default_factory=lambda: pathlib.Path(".changelog")) diff --git a/scripts/changelog_tool/requirements.txt b/scripts/changelog_tool/requirements.txt new file mode 100644 index 000000000000..085d00e208db --- /dev/null +++ b/scripts/changelog_tool/requirements.txt @@ -0,0 +1,5 @@ +click >= 8.0.0 +PyYAML >= 6.0.1 +pydantic >= 2.5.3 +openai >= 1.0.0 +aiolimiter >= 1.1.0