From 7ab221e5a74e012f59ad2a458b01c6c2b3255989 Mon Sep 17 00:00:00 2001 From: Ivan Skriabin Date: Wed, 10 Jun 2026 01:05:27 +0300 Subject: [PATCH 01/14] skeleton --- scripts/changelog_tool/changelog-tool | 28 ++++ scripts/changelog_tool/changelog.yaml | 4 + .../changelog_tool/changelog_tool/__init__.py | 0 .../changelog_tool/collect/__init__.py | 0 .../changelog_tool/collect/command.py | 9 ++ .../changelog_tool/collect/config.py | 7 + .../changelog_tool/common/__init__.py | 0 .../changelog_tool/common/git.py | 149 ++++++++++++++++++ .../changelog_tool/changelog_tool/config.py | 13 ++ scripts/changelog_tool/requirements.txt | 3 + 10 files changed, 213 insertions(+) create mode 100755 scripts/changelog_tool/changelog-tool create mode 100644 scripts/changelog_tool/changelog.yaml create mode 100644 scripts/changelog_tool/changelog_tool/__init__.py create mode 100644 scripts/changelog_tool/changelog_tool/collect/__init__.py create mode 100644 scripts/changelog_tool/changelog_tool/collect/command.py create mode 100644 scripts/changelog_tool/changelog_tool/collect/config.py create mode 100644 scripts/changelog_tool/changelog_tool/common/__init__.py create mode 100644 scripts/changelog_tool/changelog_tool/common/git.py create mode 100644 scripts/changelog_tool/changelog_tool/config.py create mode 100644 scripts/changelog_tool/requirements.txt diff --git a/scripts/changelog_tool/changelog-tool b/scripts/changelog_tool/changelog-tool new file mode 100755 index 000000000000..0a3b27939cd2 --- /dev/null +++ b/scripts/changelog_tool/changelog-tool @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +import pathlib +import click + +import changelog_tool.config as cfg +import changelog_tool.collect.command as collect_cmd + +@click.group() +@click.option('--config', default='changelog.yaml') +@click.pass_context +def cli(ctx: click.Context, config: str): + ctx.ensure_object(dict) + ctx.obj["CONFIG"] = cfg.parse_config(pathlib.Path(config)) + +@cli.command() +@click.option('--from-sha') 
+@click.option('--to-sha') +@click.option('--repo-path', type=pathlib.Path) +@click.pass_context +def collect(ctx: click.Context, from_sha: str | None, to_sha: str | None, repo_path: pathlib.Path | None): + from_sha = from_sha or ctx.obj["CONFIG"].collect.from_sha + to_sha = to_sha or ctx.obj["CONFIG"].collect.to_sha + repo_path = repo_path or ctx.obj["CONFIG"].collect.repo_path + + collect_cmd.collect(from_sha, to_sha, repo_path) + +if __name__ == '__main__': + cli() \ No newline at end of file diff --git a/scripts/changelog_tool/changelog.yaml b/scripts/changelog_tool/changelog.yaml new file mode 100644 index 000000000000..bd9e8920d3fd --- /dev/null +++ b/scripts/changelog_tool/changelog.yaml @@ -0,0 +1,4 @@ +collect: + from_sha: c580979b522f43ea1ab9cd55033cd353d52844f6 + to_sha: HEAD + repo_path: ../.. \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/__init__.py b/scripts/changelog_tool/changelog_tool/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/scripts/changelog_tool/changelog_tool/collect/__init__.py b/scripts/changelog_tool/changelog_tool/collect/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/scripts/changelog_tool/changelog_tool/collect/command.py b/scripts/changelog_tool/changelog_tool/collect/command.py new file mode 100644 index 000000000000..5d21d57003bf --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/collect/command.py @@ -0,0 +1,9 @@ +import pathlib + +import changelog_tool.common.git as git + +def collect(from_sha: str, to_sha: str, repo_path: pathlib.Path) -> None: + print(f"Collecting commits from {from_sha} to {to_sha}...") + commits: list[git.Commit] = git.get_commits(from_sha, to_sha, repo_path) + + print(f"Found {len(commits)} commits") \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/collect/config.py b/scripts/changelog_tool/changelog_tool/collect/config.py new file mode 100644 index 000000000000..de742e5ca0e8 
--- /dev/null +++ b/scripts/changelog_tool/changelog_tool/collect/config.py @@ -0,0 +1,7 @@ +import pathlib +import pydantic + +class CollectConfig(pydantic.BaseModel): + from_sha: str + to_sha: str + repo_path: pathlib.Path = pydantic.Field(default_factory=pathlib.Path.cwd) \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/common/__init__.py b/scripts/changelog_tool/changelog_tool/common/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/scripts/changelog_tool/changelog_tool/common/git.py b/scripts/changelog_tool/changelog_tool/common/git.py new file mode 100644 index 000000000000..d9c87b21fc04 --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/common/git.py @@ -0,0 +1,149 @@ +from __future__ import annotations + +import re +import subprocess +import pydantic +from pathlib import Path + + +class GitError(Exception): + """Любая ошибка при работе с git.""" + + +class FileChange(pydantic.BaseModel): + path: str + old_path: str | None = None # None если файл не переименован + added_lines: int = 0 + removed_lines: int = 0 + + +class Commit(pydantic.BaseModel): + sha: str + title: str # первая строка message + message: str # полный message + author: str # "Name " + co_authors: list[str] # из "Co-authored-by:" + changed_files: list[FileChange] + total_added: int = 0 + total_removed: int = 0 + + +def get_commits( + from_ref: str | None = None, + to_ref: str = "HEAD", + repo_path: str | Path | None = None, +) -> list[Commit]: + cwd = _repo(repo_path) + rev_range = f"{from_ref}..{to_ref}" if from_ref else to_ref + + raw_shas = _run_git(["log", "--format=%H", rev_range], cwd) + shas = [s.strip() for s in raw_shas.splitlines() if s.strip()] + + return [_fetch_commit(sha, cwd) for sha in shas] + + +def get_commit( + sha: str, + repo_path: str | Path | None = None, +) -> Commit: + return _fetch_commit(sha, _repo(repo_path)) + + +def _repo(repo_path: str | Path | None) -> Path: + return Path(repo_path) if repo_path 
is not None else Path.cwd() + + +def _run_git(args: list[str], cwd: Path) -> str: + try: + result = subprocess.run( + ["git", *args], + cwd=cwd, + capture_output=True, + text=True, + ) + except FileNotFoundError: + raise GitError("git executable not found") + + if result.returncode != 0: + raise GitError(result.stderr.strip() or f"git {args[0]} failed") + + return result.stdout + + +def _parse_rename(path_str: str) -> tuple[str, str | None]: + m = re.match(r'^(.*?)\{(.*?) => (.*?)\}(.*)$', path_str) + if m: + pre, old_mid, new_mid, suf = m.groups() + old = (pre + old_mid + suf).strip('/') + new = (pre + new_mid + suf).strip('/') + return new, old + + if ' => ' in path_str: + old, new = path_str.split(' => ', 1) + return new.strip(), old.strip() + + return path_str, None + + +def _parse_numstat(output: str) -> list[FileChange]: + changes: list[FileChange] = [] + + for line in output.splitlines(): + line = line.strip() + if not line: + continue + + parts = line.split('\t', 2) + if len(parts) != 3: + continue + + added_str, removed_str, path_str = parts + + added = 0 if added_str == '-' else int(added_str) + removed = 0 if removed_str == '-' else int(removed_str) + + path, old_path = _parse_rename(path_str) + changes.append(FileChange( + path=path, + old_path=old_path, + added_lines=added, + removed_lines=removed, + )) + + return changes + + +def _parse_co_authors(message: str) -> list[str]: + return re.findall(r'(?im)^Co-authored-by:\s*(.+)$', message) + + +def _fetch_commit(sha: str, cwd: Path) -> Commit: + raw_meta = _run_git( + ["show", "-s", "--format=%H%x00%an <%ae>%x00%B", sha], + cwd, + ) + parts = raw_meta.split('\x00', 2) + if len(parts) < 3: + raise GitError(f"Unexpected git show output for {sha!r}") + + sha_full = parts[0].strip() + author = parts[1].strip() + message = parts[2].strip() + title = message.splitlines()[0] if message else "" + + raw_numstat = _run_git( + ["diff-tree", "--root", "--numstat", "-r", "-M", sha], + cwd, + ) + changes = 
_parse_numstat(raw_numstat) + + return Commit( + sha=sha_full, + title=title, + message=message, + author=author, + co_authors=_parse_co_authors(message), + changed_files=changes, + total_added=sum(c.added_lines for c in changes), + total_removed=sum(c.removed_lines for c in changes), + ) \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/config.py b/scripts/changelog_tool/changelog_tool/config.py new file mode 100644 index 000000000000..4fb7d9790575 --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/config.py @@ -0,0 +1,13 @@ +from changelog_tool.collect.config import CollectConfig + +import pydantic +import yaml +import pathlib + +class Config(pydantic.BaseModel): + collect: CollectConfig + +def parse_config(config_path: pathlib.Path) -> Config: + with open(config_path, 'r') as f: + yaml_data = yaml.safe_load(f) + return Config.model_validate(yaml_data) \ No newline at end of file diff --git a/scripts/changelog_tool/requirements.txt b/scripts/changelog_tool/requirements.txt new file mode 100644 index 000000000000..d25ffcfd0fd9 --- /dev/null +++ b/scripts/changelog_tool/requirements.txt @@ -0,0 +1,3 @@ +click >= 8.0.0 +PyYAML >= 6.0.1 +pydantic >= 2.5.3 \ No newline at end of file From 57a2b05c9c91d8047b6875d8ddea0fa7f2c9a050 Mon Sep 17 00:00:00 2001 From: Ivan Skriabin Date: Wed, 10 Jun 2026 01:20:12 +0300 Subject: [PATCH 02/14] collect external --- scripts/changelog_tool/AGENTS.md | 15 +++++++++++++++ scripts/changelog_tool/changelog-tool | 15 ++++++++++----- scripts/changelog_tool/changelog.yaml | 5 ++++- .../changelog_tool/collect/command.py | 14 +++++++++++--- .../changelog_tool/collect/config.py | 4 +++- .../changelog_tool/changelog_tool/common/git.py | 1 + 6 files changed, 44 insertions(+), 10 deletions(-) create mode 100644 scripts/changelog_tool/AGENTS.md diff --git a/scripts/changelog_tool/AGENTS.md b/scripts/changelog_tool/AGENTS.md new file mode 100644 index 000000000000..cc01cd6d78d0 --- /dev/null +++ 
b/scripts/changelog_tool/AGENTS.md @@ -0,0 +1,15 @@ +# Changelog Tool + +This agent is responsible for running the changelog tool, which collects commit information and identifies external contributors. + +## Usage + +IMPORTANT: The changelog tool must always be run with the virtual environment activated: + +```bash +# Always activate the virtual environment first +source .vent/bin/activate + +# Run the tool +./changelog-tool [command] [options] +``` \ No newline at end of file diff --git a/scripts/changelog_tool/changelog-tool b/scripts/changelog_tool/changelog-tool index 0a3b27939cd2..6c811c94b3bd 100755 --- a/scripts/changelog_tool/changelog-tool +++ b/scripts/changelog_tool/changelog-tool @@ -18,11 +18,16 @@ def cli(ctx: click.Context, config: str): @click.option('--repo-path', type=pathlib.Path) @click.pass_context def collect(ctx: click.Context, from_sha: str | None, to_sha: str | None, repo_path: pathlib.Path | None): - from_sha = from_sha or ctx.obj["CONFIG"].collect.from_sha - to_sha = to_sha or ctx.obj["CONFIG"].collect.to_sha - repo_path = repo_path or ctx.obj["CONFIG"].collect.repo_path - - collect_cmd.collect(from_sha, to_sha, repo_path) + # Get the config and override with CLI options if provided + config = ctx.obj["CONFIG"].collect + if from_sha: + config.from_sha = from_sha + if to_sha: + config.to_sha = to_sha + if repo_path: + config.repo_path = repo_path + + collect_cmd.collect(config) if __name__ == '__main__': cli() \ No newline at end of file diff --git a/scripts/changelog_tool/changelog.yaml b/scripts/changelog_tool/changelog.yaml index bd9e8920d3fd..94c53885b2c0 100644 --- a/scripts/changelog_tool/changelog.yaml +++ b/scripts/changelog_tool/changelog.yaml @@ -1,4 +1,7 @@ collect: from_sha: c580979b522f43ea1ab9cd55033cd353d52844f6 to_sha: HEAD - repo_path: ../.. \ No newline at end of file + repo_path: ../.. 
+ core_team_patterns: + - ".*@userver\\.tech" + - ".*@yandex-team\\.com" diff --git a/scripts/changelog_tool/changelog_tool/collect/command.py b/scripts/changelog_tool/changelog_tool/collect/command.py index 5d21d57003bf..724735f812b6 100644 --- a/scripts/changelog_tool/changelog_tool/collect/command.py +++ b/scripts/changelog_tool/changelog_tool/collect/command.py @@ -1,9 +1,17 @@ import pathlib +import re import changelog_tool.common.git as git +from changelog_tool.collect.config import CollectConfig -def collect(from_sha: str, to_sha: str, repo_path: pathlib.Path) -> None: - print(f"Collecting commits from {from_sha} to {to_sha}...") - commits: list[git.Commit] = git.get_commits(from_sha, to_sha, repo_path) +def collect(config: CollectConfig) -> None: + print(f"Collecting commits from {config.from_sha} to {config.to_sha}...") + commits: list[git.Commit] = git.get_commits(config.from_sha, config.to_sha, config.repo_path) + + core_team_regexes = [re.compile(pattern) for pattern in config.core_team_patterns] + + for commit in commits: + is_core_team = any(regex.match(commit.author) for regex in core_team_regexes) + commit.is_external = not is_core_team print(f"Found {len(commits)} commits") \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/collect/config.py b/scripts/changelog_tool/changelog_tool/collect/config.py index de742e5ca0e8..e003e0906447 100644 --- a/scripts/changelog_tool/changelog_tool/collect/config.py +++ b/scripts/changelog_tool/changelog_tool/collect/config.py @@ -1,7 +1,9 @@ import pathlib import pydantic +from typing import List class CollectConfig(pydantic.BaseModel): from_sha: str to_sha: str - repo_path: pathlib.Path = pydantic.Field(default_factory=pathlib.Path.cwd) \ No newline at end of file + repo_path: pathlib.Path = pydantic.Field(default_factory=pathlib.Path.cwd) + core_team_patterns: List[str] = pydantic.Field(default_factory=list) \ No newline at end of file diff --git 
a/scripts/changelog_tool/changelog_tool/common/git.py b/scripts/changelog_tool/changelog_tool/common/git.py index d9c87b21fc04..c475fcb2d3ea 100644 --- a/scripts/changelog_tool/changelog_tool/common/git.py +++ b/scripts/changelog_tool/changelog_tool/common/git.py @@ -26,6 +26,7 @@ class Commit(pydantic.BaseModel): changed_files: list[FileChange] total_added: int = 0 total_removed: int = 0 + is_external: bool = False # whether the author is external contributor def get_commits( From 1e2d76dc1d2558435f4c77eeedd0b1f0e97e3b36 Mon Sep 17 00:00:00 2001 From: Ivan Skriabin Date: Wed, 10 Jun 2026 01:39:00 +0300 Subject: [PATCH 03/14] classification --- .../changelog_tool/collect/classification.py | 25 +++++++++++++++++++ .../changelog_tool/collect/command.py | 13 +++++++--- .../changelog_tool/common/git.py | 1 - 3 files changed, 35 insertions(+), 4 deletions(-) create mode 100644 scripts/changelog_tool/changelog_tool/collect/classification.py diff --git a/scripts/changelog_tool/changelog_tool/collect/classification.py b/scripts/changelog_tool/changelog_tool/collect/classification.py new file mode 100644 index 000000000000..48fb4d262338 --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/collect/classification.py @@ -0,0 +1,25 @@ +from enum import Enum +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from changelog_tool.common.git import Commit + + +class Classification(str, Enum): + FEATURE = "feature" + BUG = "bug" + BREAKING_CHANGE = "breaking-change" + MINOR_BUG = "minor_bug" + REFACTOR = "refactor" + DOCS = "docs" + UNCLEAR = "unclear" + + +class ClassifiedCommit(Commit): + classification: Classification = Classification.UNCLEAR + is_external: bool = False + to_changelog: bool | None = None + +def classify_commit(commit: "Commit") -> Classification: + # Default to unclear if no heuristics match + return Classification.UNCLEAR \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/collect/command.py 
b/scripts/changelog_tool/changelog_tool/collect/command.py index 724735f812b6..55dfd5ce37f2 100644 --- a/scripts/changelog_tool/changelog_tool/collect/command.py +++ b/scripts/changelog_tool/changelog_tool/collect/command.py @@ -1,8 +1,10 @@ import pathlib import re +from typing import List import changelog_tool.common.git as git from changelog_tool.collect.config import CollectConfig +from changelog_tool.collect.classification import classify_commit, ClassifiedCommit def collect(config: CollectConfig) -> None: print(f"Collecting commits from {config.from_sha} to {config.to_sha}...") @@ -10,8 +12,13 @@ def collect(config: CollectConfig) -> None: core_team_regexes = [re.compile(pattern) for pattern in config.core_team_patterns] + classified_commits: List[ClassifiedCommit] = [] for commit in commits: is_core_team = any(regex.match(commit.author) for regex in core_team_regexes) - commit.is_external = not is_core_team - - print(f"Found {len(commits)} commits") \ No newline at end of file + classified_commits.append(ClassifiedCommit( + **commit.model_dump(), + classification=classify_commit(commit), + is_external=not is_core_team, + )) + + print(f"Found {len(classified_commits)} commits") \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/common/git.py b/scripts/changelog_tool/changelog_tool/common/git.py index c475fcb2d3ea..d9c87b21fc04 100644 --- a/scripts/changelog_tool/changelog_tool/common/git.py +++ b/scripts/changelog_tool/changelog_tool/common/git.py @@ -26,7 +26,6 @@ class Commit(pydantic.BaseModel): changed_files: list[FileChange] total_added: int = 0 total_removed: int = 0 - is_external: bool = False # whether the author is external contributor def get_commits( From 896927e32ebc30194bee5827bd9f0d5818f207ff Mon Sep 17 00:00:00 2001 From: Ivan Skriabin Date: Wed, 10 Jun 2026 01:53:35 +0300 Subject: [PATCH 04/14] heuristics --- scripts/changelog_tool/AGENTS.md | 14 ++++++++++ .../changelog_tool/collect/classification.py | 28 
+++++++++++++++++-- .../changelog_tool/collect/command.py | 6 ++-- .../changelog_tool/common/git.py | 2 ++ 4 files changed, 46 insertions(+), 4 deletions(-) diff --git a/scripts/changelog_tool/AGENTS.md b/scripts/changelog_tool/AGENTS.md index cc01cd6d78d0..0a5084d38e4a 100644 --- a/scripts/changelog_tool/AGENTS.md +++ b/scripts/changelog_tool/AGENTS.md @@ -2,6 +2,20 @@ This agent is responsible for running the changelog tool, which collects commit information and identifies external contributors. +## Heuristics for LLM Analysis + +The tool uses heuristics to determine which commits should be sent to an LLM for changelog analysis: + +We calculate a `score_size` metric as `lines_added + lines_deleted` for each commit. + +The tool will NOT send commits to the LLM if they meet any of these criteria: +1. Any file path contains "docs/" or "documentation", OR commit title contains documentation keywords +2. Commit title contains fix/bug keywords AND the commit is small (score_size <= 20) +3. All commits with score_size <= 20 + +Documentation keywords: "doc", "docs", "documentation", "readme" +Fix/bug keywords: "fix", "bugfix", "bug", "patch", "repair", "correct", "resolve" + ## Usage IMPORTANT: The changelog tool must always be run with the virtual environment activated: diff --git a/scripts/changelog_tool/changelog_tool/collect/classification.py b/scripts/changelog_tool/changelog_tool/collect/classification.py index 48fb4d262338..e19272481b99 100644 --- a/scripts/changelog_tool/changelog_tool/collect/classification.py +++ b/scripts/changelog_tool/changelog_tool/collect/classification.py @@ -13,13 +13,37 @@ class Classification(str, Enum): REFACTOR = "refactor" DOCS = "docs" UNCLEAR = "unclear" + MINOR="minor" +MINOR_BUG_SIZE_THRESHOLD = 200 +MINOR_SIZE_THRESHOLD = 50 class ClassifiedCommit(Commit): classification: Classification = Classification.UNCLEAR is_external: bool = False to_changelog: bool | None = None -def classify_commit(commit: "Commit") -> Classification: - 
# Default to unclear if no heuristics match +def classify_commit(commit: Commit) -> Classification: + has_docs_in_files = any( + "docs/" in file_change.path.lower() or + "documentation" in file_change.path.lower() + for file_change in commit.changed_files + ) + + doc_keywords = ["doc", "docs", "documentation", "readme"] + commit_title_lower = commit.title.lower() + has_docs_in_title = any(keyword in commit_title_lower for keyword in doc_keywords) + + fix_keywords = ["fix", "bugfix", "bug"] + has_fix = any(keyword in commit_title_lower for keyword in fix_keywords) + + if has_docs_in_files or has_docs_in_title: + return Classification.DOCS + + if has_fix and commit.score_size <= MINOR_BUG_SIZE_THRESHOLD: + return Classification.MINOR_BUG + + if commit.score_size <= MINOR_SIZE_THRESHOLD: + return Classification.MINOR + return Classification.UNCLEAR \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/collect/command.py b/scripts/changelog_tool/changelog_tool/collect/command.py index 55dfd5ce37f2..27388f5d100a 100644 --- a/scripts/changelog_tool/changelog_tool/collect/command.py +++ b/scripts/changelog_tool/changelog_tool/collect/command.py @@ -4,7 +4,7 @@ import changelog_tool.common.git as git from changelog_tool.collect.config import CollectConfig -from changelog_tool.collect.classification import classify_commit, ClassifiedCommit +from changelog_tool.collect.classification import Classification, classify_commit, ClassifiedCommit, should_send_to_llm def collect(config: CollectConfig) -> None: print(f"Collecting commits from {config.from_sha} to {config.to_sha}...") @@ -15,10 +15,12 @@ def collect(config: CollectConfig) -> None: classified_commits: List[ClassifiedCommit] = [] for commit in commits: is_core_team = any(regex.match(commit.author) for regex in core_team_regexes) + classification = classify_commit(commit) classified_commits.append(ClassifiedCommit( **commit.model_dump(), - classification=classify_commit(commit), + 
classification=classification, is_external=not is_core_team, + to_changelog=classification != Classification.UNCLEAR )) print(f"Found {len(classified_commits)} commits") \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/common/git.py b/scripts/changelog_tool/changelog_tool/common/git.py index d9c87b21fc04..ad85a700579d 100644 --- a/scripts/changelog_tool/changelog_tool/common/git.py +++ b/scripts/changelog_tool/changelog_tool/common/git.py @@ -26,6 +26,7 @@ class Commit(pydantic.BaseModel): changed_files: list[FileChange] total_added: int = 0 total_removed: int = 0 + score_size: int = 0 def get_commits( @@ -146,4 +147,5 @@ def _fetch_commit(sha: str, cwd: Path) -> Commit: changed_files=changes, total_added=sum(c.added_lines for c in changes), total_removed=sum(c.removed_lines for c in changes), + score_size=sum(c.added_lines + c.removed_lines for c in changes), ) \ No newline at end of file From ebdbc06487b8ac3e7302252b07e2635bef7c2bfb Mon Sep 17 00:00:00 2001 From: Ivan Skriabin Date: Wed, 10 Jun 2026 02:15:50 +0300 Subject: [PATCH 05/14] preclassify --- .gitignore | 1 + scripts/changelog_tool/AGENTS.md | 9 +++++ scripts/changelog_tool/changelog-tool | 8 +++- .../changelog_tool/collect/classification.py | 4 +- .../changelog_tool/collect/command.py | 23 +++++++++--- .../changelog_tool/collect/config.py | 3 +- .../changelog_tool/common/git.py | 8 +++- .../changelog_tool/common/io.py | 37 +++++++++++++++++++ 8 files changed, 80 insertions(+), 13 deletions(-) create mode 100644 scripts/changelog_tool/changelog_tool/common/io.py diff --git a/.gitignore b/.gitignore index a7d344bca5a7..b19b6b5c2154 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ static-analyzer-report .settings* .clangd .vscode +.changelog scripts/docs/en/components_schema scripts/docs/en/dynamic_configs scripts/docs/en/versions.md diff --git a/scripts/changelog_tool/AGENTS.md b/scripts/changelog_tool/AGENTS.md index 0a5084d38e4a..e1bf08ab547e 100644 --- 
a/scripts/changelog_tool/AGENTS.md +++ b/scripts/changelog_tool/AGENTS.md @@ -26,4 +26,13 @@ source .vent/bin/activate # Run the tool ./changelog-tool [command] [options] +``` + +## Output Directory + +By default, the tool outputs classified commits to `.changelog/preclassified.json`. You can customize this with the `--output-dir` global option: + +```bash +# Run with custom output directory +./changelog-tool --output-dir ./my-output-dir collect ``` \ No newline at end of file diff --git a/scripts/changelog_tool/changelog-tool b/scripts/changelog_tool/changelog-tool index 6c811c94b3bd..502b314deffe 100755 --- a/scripts/changelog_tool/changelog-tool +++ b/scripts/changelog_tool/changelog-tool @@ -7,10 +7,14 @@ import changelog_tool.collect.command as collect_cmd @click.group() @click.option('--config', default='changelog.yaml') +@click.option('--output-dir', type=pathlib.Path, default=None) @click.pass_context -def cli(ctx: click.Context, config: str): +def cli(ctx: click.Context, config: str, output_dir: pathlib.Path | None): ctx.ensure_object(dict) - ctx.obj["CONFIG"] = cfg.parse_config(pathlib.Path(config)) + parsed_config = cfg.parse_config(pathlib.Path(config)) + if output_dir: + parsed_config.collect.output_dir = output_dir + ctx.obj["CONFIG"] = parsed_config @cli.command() @click.option('--from-sha') diff --git a/scripts/changelog_tool/changelog_tool/collect/classification.py b/scripts/changelog_tool/changelog_tool/collect/classification.py index e19272481b99..a1325ea6b772 100644 --- a/scripts/changelog_tool/changelog_tool/collect/classification.py +++ b/scripts/changelog_tool/changelog_tool/collect/classification.py @@ -1,8 +1,6 @@ from enum import Enum -from typing import TYPE_CHECKING -if TYPE_CHECKING: - from changelog_tool.common.git import Commit +from changelog_tool.common.git import Commit class Classification(str, Enum): diff --git a/scripts/changelog_tool/changelog_tool/collect/command.py b/scripts/changelog_tool/changelog_tool/collect/command.py 
index 27388f5d100a..4fdf2429a3ec 100644 --- a/scripts/changelog_tool/changelog_tool/collect/command.py +++ b/scripts/changelog_tool/changelog_tool/collect/command.py @@ -1,10 +1,10 @@ -import pathlib import re from typing import List import changelog_tool.common.git as git +import changelog_tool.common.io as io from changelog_tool.collect.config import CollectConfig -from changelog_tool.collect.classification import Classification, classify_commit, ClassifiedCommit, should_send_to_llm +from changelog_tool.collect.classification import Classification, classify_commit, ClassifiedCommit def collect(config: CollectConfig) -> None: print(f"Collecting commits from {config.from_sha} to {config.to_sha}...") @@ -16,11 +16,22 @@ def collect(config: CollectConfig) -> None: for commit in commits: is_core_team = any(regex.match(commit.author) for regex in core_team_regexes) classification = classify_commit(commit) - classified_commits.append(ClassifiedCommit( + classified_commit = ClassifiedCommit( **commit.model_dump(), classification=classification, is_external=not is_core_team, - to_changelog=classification != Classification.UNCLEAR - )) + to_changelog=None + ) - print(f"Found {len(classified_commits)} commits") \ No newline at end of file + if classification in [Classification.FEATURE, Classification.BUG, Classification.BREAKING_CHANGE]: + classified_commit.to_changelog = True + elif classification == Classification.UNCLEAR: + classified_commit.to_changelog = None + else: + classified_commit.to_changelog = False + + classified_commits.append(classified_commit) + + print(f"Found {len(classified_commits)} commits") + + io.dump_classified_commits(classified_commits, config.output_dir, 'preclassified.json') \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/collect/config.py b/scripts/changelog_tool/changelog_tool/collect/config.py index e003e0906447..1ed44f849aac 100644 --- a/scripts/changelog_tool/changelog_tool/collect/config.py +++ 
b/scripts/changelog_tool/changelog_tool/collect/config.py @@ -6,4 +6,5 @@ class CollectConfig(pydantic.BaseModel): from_sha: str to_sha: str repo_path: pathlib.Path = pydantic.Field(default_factory=pathlib.Path.cwd) - core_team_patterns: List[str] = pydantic.Field(default_factory=list) \ No newline at end of file + core_team_patterns: List[str] = pydantic.Field(default_factory=list) + output_dir: pathlib.Path = pydantic.Field(default_factory=lambda: pathlib.Path(".changelog")) \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/common/git.py b/scripts/changelog_tool/changelog_tool/common/git.py index ad85a700579d..f240d2ae38f7 100644 --- a/scripts/changelog_tool/changelog_tool/common/git.py +++ b/scripts/changelog_tool/changelog_tool/common/git.py @@ -148,4 +148,10 @@ def _fetch_commit(sha: str, cwd: Path) -> Commit: total_added=sum(c.added_lines for c in changes), total_removed=sum(c.removed_lines for c in changes), score_size=sum(c.added_lines + c.removed_lines for c in changes), - ) \ No newline at end of file + ) + +def get_commit_diff(commit: Commit, repo_path: str | Path | None = None) -> str: + return get_diff_by_sha(commit.sha, repo_path) + +def get_diff_by_sha(sha: str, repo_path: str | Path | None = None) -> str: + return _run_git(["diff-tree", "--root", "-p", "-r", "-M", sha], _repo(repo_path)) diff --git a/scripts/changelog_tool/changelog_tool/common/io.py b/scripts/changelog_tool/changelog_tool/common/io.py new file mode 100644 index 000000000000..47eb987986e6 --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/common/io.py @@ -0,0 +1,37 @@ +import json +import pathlib +from typing import List + +from changelog_tool.collect.classification import ClassifiedCommit + + +def dump_classified_commits(commits: List[ClassifiedCommit], output_dir: pathlib.Path, filename: str) -> None: + # Ensure output directory exists + output_dir.mkdir(parents=True, exist_ok=True) + + # Create full path to output file + output_file = 
output_dir / filename + + # Convert classified commits to JSON format + json_data = [commit.model_dump() for commit in commits] + json_str = json.dumps(json_data, indent=2) + + # Write to file + with open(output_file, 'w') as f: + f.write(json_str) + + +def load_classified_commits(output_dir: pathlib.Path, filename: str) -> List[ClassifiedCommit]: + # Create full path to input file + input_file = output_dir / filename + + # Check if file exists + if not input_file.exists(): + return [] + + # Read from file + with open(input_file, 'r') as f: + json_data = json.load(f) + + # Convert JSON data to ClassifiedCommit objects + return [ClassifiedCommit(**item) for item in json_data] \ No newline at end of file From 01a1cfbc3007030c2f49bdb5aaa85d4773cf80e4 Mon Sep 17 00:00:00 2001 From: Ivan Skriabin Date: Wed, 10 Jun 2026 02:36:32 +0300 Subject: [PATCH 06/14] llm --- scripts/changelog_tool/changelog.yaml | 4 + .../changelog_tool/changelog_tool/config.py | 2 + .../changelog_tool/llm/__init__.py | 11 +++ .../changelog_tool/llm/client.py | 92 +++++++++++++++++++ .../changelog_tool/llm/config.py | 5 + .../changelog_tool/llm/exceptions.py | 7 ++ scripts/changelog_tool/requirements.txt | 4 +- 7 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 scripts/changelog_tool/changelog_tool/llm/__init__.py create mode 100644 scripts/changelog_tool/changelog_tool/llm/client.py create mode 100644 scripts/changelog_tool/changelog_tool/llm/config.py create mode 100644 scripts/changelog_tool/changelog_tool/llm/exceptions.py diff --git a/scripts/changelog_tool/changelog.yaml b/scripts/changelog_tool/changelog.yaml index 94c53885b2c0..e81f60f43588 100644 --- a/scripts/changelog_tool/changelog.yaml +++ b/scripts/changelog_tool/changelog.yaml @@ -5,3 +5,7 @@ collect: core_team_patterns: - ".*@userver\\.tech" - ".*@yandex-team\\.com" + +llm-config: + target_rps: 5.0 + retries: 3 diff --git a/scripts/changelog_tool/changelog_tool/config.py 
class BaseLLMClient(ABC):
    """Abstract interface of an asynchronous text-in / text-out LLM client."""

    @abstractmethod
    async def generate(self, prompt: str) -> str:
        """Send a text prompt to the LLM and return its text response.

        May raise LLMError or LLMTransientError.
        """

    @abstractmethod
    async def close(self):
        """Release the client's resources."""


class HttpLLMClient(BaseLLMClient):
    """LLM client that POSTs ``{"prompt": ...}`` to an HTTP endpoint.

    The endpoint and credentials come from the environment
    (``CHANGELOG_LLM_URL`` plus ``CHANGELOG_LLM_API_KEY`` or
    ``CHANGELOG_LLM_OAUTH_KEY``); the request rate and retry count come from
    ``LLMConfig``.
    """

    def __init__(self, config: LLMConfig):
        """Read the environment and prepare the rate limiter.

        Raises:
            RuntimeError: if the URL or both credentials are missing.
        """
        self.url = os.environ.get("CHANGELOG_LLM_URL")
        api_key = os.environ.get("CHANGELOG_LLM_API_KEY")
        oauth_key = os.environ.get("CHANGELOG_LLM_OAUTH_KEY")
        self.retries = config.retries

        if not self.url:
            raise RuntimeError("Missing required environment variable: CHANGELOG_LLM_URL")

        if api_key:
            auth_header = f"Bearer {api_key}"
        elif oauth_key:
            auth_header = f"OAuth {oauth_key}"
        else:
            raise RuntimeError("Missing required environment variable: either CHANGELOG_LLM_API_KEY or CHANGELOG_LLM_OAUTH_KEY must be set")

        self.limiter = AsyncLimiter(config.target_rps, 1)
        self._headers = {"Authorization": auth_header}
        # The session is created lazily from inside a coroutine: this
        # constructor runs in synchronous code, before any event loop exists.
        self.session = None

    def _get_session(self):
        """Return the shared session, creating it on first use inside the running loop."""
        if self.session is None or self.session.closed:
            self.session = aiohttp.ClientSession(headers=self._headers)
        return self.session

    async def generate(self, prompt: str) -> str:
        """POST the prompt and return the ``response`` field of the JSON answer.

        429, 5xx, network errors and timeouts are retried up to
        ``self.retries`` times with exponential backoff (or the numeric
        ``Retry-After`` value for 429).

        Raises:
            LLMError: on 400/401/403/404, on any other unexpected status, or
                when a 200 answer is not a JSON object.
            LLMTransientError: when every attempt failed with a retryable error.
        """
        session = self._get_session()
        last_error = None

        for attempt in range(self.retries + 1):
            wait_time = 2 ** attempt
            try:
                async with self.limiter:
                    async with session.post(self.url, json={"prompt": prompt}) as response:
                        status = response.status

                        if status == 200:
                            data = await response.json()
                            if not isinstance(data, dict):
                                raise LLMError(
                                    f"LLM returned unexpected payload type: {type(data).__name__}"
                                )
                            return data.get("response", "")

                        if status in (400, 401, 403, 404):
                            text = await response.text()
                            raise LLMError(f"Critical LLM error: {status} - {text}")

                        if status == 429:
                            retry_after = response.headers.get("Retry-After")
                            if retry_after and retry_after.isdigit():
                                wait_time = float(retry_after)
                            last_error = f"429 Too Many Requests. Waiting {wait_time}s"
                        elif status >= 500:
                            last_error = f"Server error {status}"
                        else:
                            # Unknown status
                            text = await response.text()
                            raise LLMError(f"Unexpected status {status}: {text}")

            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                # Timeouts are not aiohttp.ClientError subclasses, so they are
                # listed explicitly to be retried like other network failures.
                last_error = f"Client error: {e}"

            # Back off only after the response has been released, and never
            # after the final attempt.
            if attempt < self.retries:
                await asyncio.sleep(wait_time)

        raise LLMTransientError(f"Max retries ({self.retries}) exceeded. Last error: {last_error}")

    async def close(self):
        """Close the HTTP session if one was ever opened."""
        if self.session is not None and not self.session.closed:
            await self.session.close()
02:42:21 +0300 Subject: [PATCH 07/14] changelog line --- .../changelog_tool/changelog_tool/collect/classification.py | 2 ++ scripts/changelog_tool/changelog_tool/collect/command.py | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/changelog_tool/changelog_tool/collect/classification.py b/scripts/changelog_tool/changelog_tool/collect/classification.py index a1325ea6b772..96e629d241c8 100644 --- a/scripts/changelog_tool/changelog_tool/collect/classification.py +++ b/scripts/changelog_tool/changelog_tool/collect/classification.py @@ -20,6 +20,8 @@ class ClassifiedCommit(Commit): classification: Classification = Classification.UNCLEAR is_external: bool = False to_changelog: bool | None = None + changelog_line: str | None = None + commit_analysis: str | None = None def classify_commit(commit: Commit) -> Classification: has_docs_in_files = any( diff --git a/scripts/changelog_tool/changelog_tool/collect/command.py b/scripts/changelog_tool/changelog_tool/collect/command.py index 4fdf2429a3ec..5585d380782f 100644 --- a/scripts/changelog_tool/changelog_tool/collect/command.py +++ b/scripts/changelog_tool/changelog_tool/collect/command.py @@ -20,11 +20,13 @@ def collect(config: CollectConfig) -> None: **commit.model_dump(), classification=classification, is_external=not is_core_team, - to_changelog=None + to_changelog=None, + changelog_line=None, + commit_analysis=None ) if classification in [Classification.FEATURE, Classification.BUG, Classification.BREAKING_CHANGE]: - classified_commit.to_changelog = True + raise RuntimeError("Unexpected positive changelog preclassification") elif classification == Classification.UNCLEAR: classified_commit.to_changelog = None else: From 6a6475ad8c540d77419877ba41d099fc23376ccf Mon Sep 17 00:00:00 2001 From: Ivan Skriabin Date: Wed, 10 Jun 2026 03:12:20 +0300 Subject: [PATCH 08/14] llm analysis --- scripts/changelog_tool/changelog.yaml | 4 + .../changelog_tool/collect/command.py | 31 ++- 
.../changelog_tool/llm/__init__.py | 4 + .../changelog_tool/llm/config.py | 4 + .../changelog_tool/llm/processor.py | 196 ++++++++++++++++++ .../changelog_tool/llm/state.py | 82 ++++++++ 6 files changed, 320 insertions(+), 1 deletion(-) create mode 100644 scripts/changelog_tool/changelog_tool/llm/processor.py create mode 100644 scripts/changelog_tool/changelog_tool/llm/state.py diff --git a/scripts/changelog_tool/changelog.yaml b/scripts/changelog_tool/changelog.yaml index e81f60f43588..776567036dd8 100644 --- a/scripts/changelog_tool/changelog.yaml +++ b/scripts/changelog_tool/changelog.yaml @@ -9,3 +9,7 @@ collect: llm-config: target_rps: 5.0 retries: 3 + max_commits_per_batch: 10 + max_user_prompt_length: 8000 + include_diff: true + truncate_diff: true diff --git a/scripts/changelog_tool/changelog_tool/collect/command.py b/scripts/changelog_tool/changelog_tool/collect/command.py index 5585d380782f..dfe440539499 100644 --- a/scripts/changelog_tool/changelog_tool/collect/command.py +++ b/scripts/changelog_tool/changelog_tool/collect/command.py @@ -1,10 +1,15 @@ import re +import asyncio +import os from typing import List import changelog_tool.common.git as git import changelog_tool.common.io as io from changelog_tool.collect.config import CollectConfig from changelog_tool.collect.classification import Classification, classify_commit, ClassifiedCommit +from changelog_tool.llm.client import HttpLLMClient +from changelog_tool.llm.processor import LLMProcessor +from changelog_tool.llm.exceptions import LLMError def collect(config: CollectConfig) -> None: print(f"Collecting commits from {config.from_sha} to {config.to_sha}...") @@ -35,5 +40,29 @@ def collect(config: CollectConfig) -> None: classified_commits.append(classified_commit) print(f"Found {len(classified_commits)} commits") + + io.dump_classified_commits(classified_commits, config.output_dir, 'preclassified.json') - io.dump_classified_commits(classified_commits, config.output_dir, 'preclassified.json') \ No 
newline at end of file + llm_client = HttpLLMClient(config.root.llm_config) + llm_processor = LLMProcessor(config.root.llm_config, llm_client, config.output_dir) + + unclear_commits = [ + commit for commit in classified_commits + if commit.classification == Classification.UNCLEAR + ] + + llm_results = asyncio.run(llm_processor.process_commits(unclear_commits)) + + for commit in classified_commits: + if commit.sha in llm_results: + result = llm_results[commit.sha] + try: + commit.classification = Classification(result.get("classification", "unclear")) + except ValueError: + # Если LLM вернула неизвестную классификацию, оставляем UNCLEAR + pass + commit.changelog_line = result.get("changelog_line") + commit.commit_analysis = result.get("detailed_commit_analysis") + + + io.dump_classified_commits(classified_commits, config.output_dir, 'classified.json') \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/llm/__init__.py b/scripts/changelog_tool/changelog_tool/llm/__init__.py index 1a6db4301fb1..b64ba976b736 100644 --- a/scripts/changelog_tool/changelog_tool/llm/__init__.py +++ b/scripts/changelog_tool/changelog_tool/llm/__init__.py @@ -1,6 +1,8 @@ from changelog_tool.llm.config import LLMConfig from changelog_tool.llm.exceptions import LLMError, LLMTransientError from changelog_tool.llm.client import BaseLLMClient, HttpLLMClient +from changelog_tool.llm.state import LLMState +from changelog_tool.llm.processor import LLMProcessor __all__ = [ "LLMConfig", @@ -8,4 +10,6 @@ "LLMTransientError", "BaseLLMClient", "HttpLLMClient", + "LLMState", + "LLMProcessor", ] diff --git a/scripts/changelog_tool/changelog_tool/llm/config.py b/scripts/changelog_tool/changelog_tool/llm/config.py index d2223d64d03d..5327103bd420 100644 --- a/scripts/changelog_tool/changelog_tool/llm/config.py +++ b/scripts/changelog_tool/changelog_tool/llm/config.py @@ -3,3 +3,7 @@ class LLMConfig(pydantic.BaseModel): target_rps: float = 5.0 retries: int = 3 + 
max_commits_per_batch: int = 10 + max_user_prompt_length: int = 8000 + include_diff: bool = True + truncate_diff: bool = True diff --git a/scripts/changelog_tool/changelog_tool/llm/processor.py b/scripts/changelog_tool/changelog_tool/llm/processor.py new file mode 100644 index 000000000000..00305cd0987a --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/llm/processor.py @@ -0,0 +1,196 @@ +import asyncio +import json +from typing import List, Dict, Any +from pathlib import Path + +from changelog_tool.common.git import Commit, get_commit_diff +from changelog_tool.llm.client import BaseLLMClient +from changelog_tool.llm.config import LLMConfig +from changelog_tool.llm.state import LLMState +from changelog_tool.llm.exceptions import LLMError, LLMTransientError + +class LLMProcessor: + def __init__(self, config: LLMConfig, llm_client: BaseLLMClient, output_dir: Path): + self.config = config + self.llm_client = llm_client + self.output_dir = output_dir + self.state = LLMState(output_dir / "llm_state.json") + + async def process_commits(self, commits: List[Commit]) -> Dict[str, Dict[str, Any]]: + """ + Асинхронно обрабатывает список коммитов через LLM. + Возвращает словарь SHA -> dict с результатами (classification, changelog_line, detailed_commit_analysis). 
+ """ + # Загружаем и очищаем стейт + await self.state.load() + valid_shas = {commit.sha for commit in commits} + await self.state.cleanup(valid_shas) + + # Фильтруем коммиты для обработки + commits_to_process = [] + results = {} + + for commit in commits: + # Проверяем стейт + result = await self.state.get_result(commit.sha) + if result: + results[commit.sha] = result + else: + commits_to_process.append(commit) + + print(f"Found {len(commits)} commits, {len(results)} already processed, {len(commits_to_process)} to process via LLM") + + if not commits_to_process: + return results + + # Разбиваем на батчи + batches = [ + commits_to_process[i:i + self.config.max_commits_per_batch] + for i in range(0, len(commits_to_process), self.config.max_commits_per_batch) + ] + + # Обрабатываем батчи параллельно + batch_results = await asyncio.gather( + *[self._process_batch(batch) for batch in batches], + return_exceptions=True + ) + + # Собираем результаты + for batch_result in batch_results: + if isinstance(batch_result, Exception): + print(f"Warning: Batch processing failed with exception: {batch_result}") + # Ошибки в батчах уже записаны в стейт, просто продолжаем + continue + results.update(batch_result) + + return results + + async def _process_batch(self, batch: List[Commit]) -> Dict[str, Dict[str, Any]]: + """Обрабатывает один батч коммитов.""" + try: + prompt = self._build_prompt(batch) + + # Проверяем длину промпта + if len(prompt) > self.config.max_user_prompt_length: + if self.config.truncate_diff: + prompt = self._truncate_prompt(prompt) + else: + # Помечаем все коммиты батча как ошибочные + error_msg = f"Prompt too long ({len(prompt)} > {self.config.max_user_prompt_length})" + for commit in batch: + await self.state.set_error(commit.sha, error_msg) + return { + commit.sha: { + "classification": "unclear", + "changelog_line": "", + "detailed_commit_analysis": "" + } for commit in batch + } + + # Отправляем в LLM + response_text = await 
self.llm_client.generate(prompt) + + # Парсим ответ + try: + response_data = json.loads(response_text) + except json.JSONDecodeError as e: + raise LLMError(f"LLM returned invalid JSON: {e}") + + # Проверяем формат ответа + if not isinstance(response_data, dict): + raise LLMError("LLM returned invalid response format (not a dict)") + + # Сохраняем результаты и возвращаем + results = {} + for commit in batch: + commit_data = response_data.get(commit.sha, {}) + if isinstance(commit_data, str): + # Fallback if LLM returned just a string + classification = commit_data + changelog_line = "" + detailed_commit_analysis = "" + else: + classification = commit_data.get("classification", "unclear") + changelog_line = commit_data.get("changelog_line", "") + detailed_commit_analysis = commit_data.get("detailed_commit_analysis", "") + + await self.state.set_result(commit.sha, classification, changelog_line, detailed_commit_analysis) + results[commit.sha] = { + "classification": classification, + "changelog_line": changelog_line, + "detailed_commit_analysis": detailed_commit_analysis + } + + return results + + except LLMError: + # Критическая ошибка - пробрасываем дальше + raise + except Exception as e: + # Временная ошибка или другая проблема - помечаем коммиты как ошибочные + error_msg = str(e) + for commit in batch: + await self.state.set_error(commit.sha, error_msg) + return { + commit.sha: { + "classification": "unclear", + "changelog_line": "", + "detailed_commit_analysis": "" + } for commit in batch + } + + def _build_prompt(self, commits: List[Commit]) -> str: + """Формирует промпт для батча коммитов.""" + system_prompt = """You are an expert software engineer analyzing git commits for a changelog. +Your task is to analyze commits since the last release and highlight important and interesting changes. +Ignore simple bugfixes, typos, and minor refactoring. + +For each commit, you MUST provide a JSON object with the following fields: +1. 
"classification": One of ["feature", "breaking-change", "refactor", "minor", "unclear"]. + - Use "breaking-change" if the commit introduces backward-incompatible changes. + - Use "feature" for new functionality. + - Use "refactor" for significant architectural changes. + - Use "minor" for small improvements. + - Use "unclear" if you cannot determine the classification. +2. "changelog_line": A concise, user-friendly description of the change suitable for a changelog. + - IMPORTANT: If the classification is "breaking-change", you MUST include migration or fix instructions in this line if they are present in the commit message. +3. "detailed_commit_analysis": A detailed analysis of what was added, why it was added, and what impact or benefit it brings to the project. + +You MUST return a valid JSON object where keys are commit SHAs and values are the analysis objects. +Example output format: +{ + "commit_sha_1": { + "classification": "feature", + "changelog_line": "Added support for async LLM processing", + "detailed_commit_analysis": "Added a new LLMProcessor class to handle batching and async requests. This improves performance by allowing parallel processing of commits." + }, + "commit_sha_2": { + "classification": "breaking-change", + "changelog_line": "Changed config format. Migration: rename 'llm_config' to 'llm-config' in your yaml file.", + "detailed_commit_analysis": "Updated the configuration schema to use hyphens instead of underscores for consistency. This breaks existing configs but aligns with the project's naming conventions." 
class LLMState:
    """JSON-file-backed store of per-commit LLM analysis results, keyed by SHA.

    All access to ``self.state`` goes through ``self.lock``. ``asyncio.Lock``
    is not reentrant, so methods that already hold the lock persist through
    ``_save_locked`` instead of calling ``save``, which would wait on the same
    lock forever.
    """

    def __init__(self, state_file_path: Path):
        self.state_file_path = state_file_path
        self.state: Dict[str, Dict[str, Any]] = {}
        self.lock = asyncio.Lock()

    async def load(self) -> None:
        """Load the state from the file; a missing or unreadable file yields an empty state."""
        async with self.lock:
            self.state = {}
            if not self.state_file_path.exists():
                return
            try:
                with open(self.state_file_path, 'r', encoding='utf-8') as f:
                    loaded_state = json.load(f)
            except (ValueError, OSError) as e:
                # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
                print(f"Warning: Could not load state file {self.state_file_path}: {e}")
                return
            # Only accept the expected top-level shape.
            if isinstance(loaded_state, dict):
                self.state = loaded_state

    def _save_locked(self) -> None:
        """Write the state to disk; the caller must already hold ``self.lock``."""
        # Create the directory if it does not exist yet.
        self.state_file_path.parent.mkdir(parents=True, exist_ok=True)

        # Write to a temporary file, then rename it over the real one.
        temp_file = self.state_file_path.with_suffix('.tmp')
        try:
            with open(temp_file, 'w', encoding='utf-8') as f:
                json.dump(self.state, f, ensure_ascii=False, indent=2)
            temp_file.replace(self.state_file_path)
        except IOError as e:
            print(f"Error: Could not save state file {self.state_file_path}: {e}")
            if temp_file.exists():
                temp_file.unlink()

    async def save(self) -> None:
        """Persist the state to the file."""
        async with self.lock:
            self._save_locked()

    async def cleanup(self, valid_shas: Set[str]) -> None:
        """Drop entries whose SHA is not in ``valid_shas`` and persist if anything was removed."""
        async with self.lock:
            keys_to_remove = set(self.state.keys()) - valid_shas
            for key in keys_to_remove:
                del self.state[key]
            if keys_to_remove:
                self._save_locked()

    async def get_result(self, sha: str) -> Optional[Dict[str, Any]]:
        """Return the stored analysis for ``sha`` if it exists and has no recorded error."""
        async with self.lock:
            commit_data = self.state.get(sha)
            if commit_data and commit_data.get("error") is None:
                return commit_data
            return None

    async def set_result(self, sha: str, classification: str, changelog_line: str, detailed_commit_analysis: str) -> None:
        """Store a successful classification result and persist it."""
        async with self.lock:
            self.state[sha] = {
                "classification": classification,
                "changelog_line": changelog_line,
                "detailed_commit_analysis": detailed_commit_analysis,
                "error": None
            }
            self._save_locked()

    async def set_error(self, sha: str, error_message: str) -> None:
        """Store a classification failure for ``sha`` and persist it."""
        async with self.lock:
            self.state[sha] = {
                "classification": "unclear",
                "error": error_message
            }
            self._save_locked()
+- scripts/changelog_tool/changelog.yaml | 6 +- .../changelog_tool/collect/command.py | 20 ++-- .../changelog_tool/llm/client.py | 111 +++++++++++------ .../changelog_tool/llm/processor.py | 113 +++++++++++++++--- .../changelog_tool/llm/state.py | 34 +++--- scripts/changelog_tool/requirements.txt | 4 +- 7 files changed, 212 insertions(+), 84 deletions(-) diff --git a/scripts/changelog_tool/changelog-tool b/scripts/changelog_tool/changelog-tool index 502b314deffe..858937ac1b2c 100755 --- a/scripts/changelog_tool/changelog-tool +++ b/scripts/changelog_tool/changelog-tool @@ -23,13 +23,13 @@ def cli(ctx: click.Context, config: str, output_dir: pathlib.Path | None): @click.pass_context def collect(ctx: click.Context, from_sha: str | None, to_sha: str | None, repo_path: pathlib.Path | None): # Get the config and override with CLI options if provided - config = ctx.obj["CONFIG"].collect + config = ctx.obj["CONFIG"] if from_sha: - config.from_sha = from_sha + config.collect.from_sha = from_sha if to_sha: - config.to_sha = to_sha + config.collect.to_sha = to_sha if repo_path: - config.repo_path = repo_path + config.collect.repo_path = repo_path collect_cmd.collect(config) diff --git a/scripts/changelog_tool/changelog.yaml b/scripts/changelog_tool/changelog.yaml index 776567036dd8..7b6e592a0e8d 100644 --- a/scripts/changelog_tool/changelog.yaml +++ b/scripts/changelog_tool/changelog.yaml @@ -7,9 +7,9 @@ collect: - ".*@yandex-team\\.com" llm-config: - target_rps: 5.0 + target_rps: 1 retries: 3 max_commits_per_batch: 10 - max_user_prompt_length: 8000 + max_user_prompt_length: 50000 include_diff: true - truncate_diff: true + truncate_diff: false diff --git a/scripts/changelog_tool/changelog_tool/collect/command.py b/scripts/changelog_tool/changelog_tool/collect/command.py index dfe440539499..25eeb1e14b41 100644 --- a/scripts/changelog_tool/changelog_tool/collect/command.py +++ b/scripts/changelog_tool/changelog_tool/collect/command.py @@ -5,17 +5,17 @@ import 
changelog_tool.common.git as git import changelog_tool.common.io as io -from changelog_tool.collect.config import CollectConfig +from changelog_tool.config import Config from changelog_tool.collect.classification import Classification, classify_commit, ClassifiedCommit from changelog_tool.llm.client import HttpLLMClient from changelog_tool.llm.processor import LLMProcessor from changelog_tool.llm.exceptions import LLMError -def collect(config: CollectConfig) -> None: - print(f"Collecting commits from {config.from_sha} to {config.to_sha}...") - commits: list[git.Commit] = git.get_commits(config.from_sha, config.to_sha, config.repo_path) +def collect(config: Config) -> None: + print(f"Collecting commits from {config.collect.from_sha} to {config.collect.to_sha}...") + commits: list[git.Commit] = git.get_commits(config.collect.from_sha, config.collect.to_sha, config.collect.repo_path) - core_team_regexes = [re.compile(pattern) for pattern in config.core_team_patterns] + core_team_regexes = [re.compile(pattern) for pattern in config.collect.core_team_patterns] classified_commits: List[ClassifiedCommit] = [] for commit in commits: @@ -41,10 +41,10 @@ def collect(config: CollectConfig) -> None: print(f"Found {len(classified_commits)} commits") - io.dump_classified_commits(classified_commits, config.output_dir, 'preclassified.json') + io.dump_classified_commits(classified_commits, config.collect.output_dir, 'preclassified.json') - llm_client = HttpLLMClient(config.root.llm_config) - llm_processor = LLMProcessor(config.root.llm_config, llm_client, config.output_dir) + llm_client = HttpLLMClient(config.llm_config) + llm_processor = LLMProcessor(config.llm_config, llm_client, config.collect.output_dir) unclear_commits = [ commit for commit in classified_commits @@ -61,8 +61,10 @@ def collect(config: CollectConfig) -> None: except ValueError: # Если LLM вернула неизвестную классификацию, оставляем UNCLEAR pass + + commit.to_changelog = result.get("to_changelog") 
commit.changelog_line = result.get("changelog_line") commit.commit_analysis = result.get("detailed_commit_analysis") - io.dump_classified_commits(classified_commits, config.output_dir, 'classified.json') \ No newline at end of file + io.dump_classified_commits(classified_commits, config.collect.output_dir, 'classified.json') \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/llm/client.py b/scripts/changelog_tool/changelog_tool/llm/client.py index e95b1cecffaa..26880f6e8e9b 100644 --- a/scripts/changelog_tool/changelog_tool/llm/client.py +++ b/scripts/changelog_tool/changelog_tool/llm/client.py @@ -2,7 +2,8 @@ import asyncio from abc import ABC, abstractmethod -import aiohttp +import httpx +import openai from aiolimiter import AsyncLimiter from changelog_tool.llm.config import LLMConfig @@ -29,6 +30,7 @@ def __init__(self, config: LLMConfig): self.url = os.environ.get("CHANGELOG_LLM_URL") api_key = os.environ.get("CHANGELOG_LLM_API_KEY") oauth_key = os.environ.get("CHANGELOG_LLM_OAUTH_KEY") + self.model = os.environ.get("CHANGELOG_LLM_MODEL") self.retries = config.retries if not self.url: @@ -37,13 +39,20 @@ def __init__(self, config: LLMConfig): if api_key: auth_header = f"Bearer {api_key}" elif oauth_key: - auth_header = f"OAuth {oauth_key}" + auth_header = f"Oauth {oauth_key}" else: raise RuntimeError("Missing required environment variable: either CHANGELOG_LLM_API_KEY or CHANGELOG_LLM_OAUTH_KEY must be set") self.limiter = AsyncLimiter(config.target_rps, 1) - self.session = aiohttp.ClientSession( - headers={"Authorization": auth_header} + + http_client = httpx.AsyncClient(verify=False) + + self.client = openai.AsyncOpenAI( + base_url=self.url, + api_key=api_key or oauth_key or "dummy", + default_headers={"Authorization": auth_header}, + max_retries=0, + http_client=http_client, ) async def generate(self, prompt: str) -> str: @@ -52,41 +61,73 @@ async def generate(self, prompt: str) -> str: for attempt in range(self.retries + 1): try: 
async with self.limiter: - async with self.session.post(self.url, json={"prompt": prompt}) as response: - if response.status == 200: - data = await response.json() - return data.get("response", "") - - if response.status in (400, 401, 403, 404): - text = await response.text() - raise LLMError(f"Critical LLM error: {response.status} - {text}") - - if response.status == 429: - retry_after = response.headers.get("Retry-After") - if retry_after and retry_after.isdigit(): - wait_time = float(retry_after) - else: - wait_time = 2 ** attempt - - last_error = f"429 Too Many Requests. Waiting {wait_time}s" - await asyncio.sleep(wait_time) - continue - - if response.status >= 500: - last_error = f"Server error {response.status}" - await asyncio.sleep(2 ** attempt) - continue - - # Неизвестный статус - text = await response.text() - raise LLMError(f"Unexpected status {response.status}: {text}") - - except aiohttp.ClientError as e: + if attempt > 0: + print(f" Retrying ({attempt}/{self.retries})...") + + response = await self.client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": prompt}], + ) + + # Handle non-standard API response format + # Some APIs return the actual data in a 'response' dict attribute + if hasattr(response, 'response') and response.response: + response_data = response.response + if isinstance(response_data, dict) and 'choices' in response_data: + choices = response_data['choices'] + if choices and len(choices) > 0: + first_choice = choices[0] + if 'message' in first_choice and 'content' in first_choice['message']: + content = first_choice['message']['content'] + return content or "" + + # Standard OpenAI response format + if not response: + raise ValueError("LLM returned None response") + + if not hasattr(response, 'choices') or not response.choices: + raise ValueError(f"LLM response has no choices. Response: {response}") + + if len(response.choices) == 0: + raise ValueError(f"LLM returned empty choices list. 
Response: {response}") + + first_choice = response.choices[0] + if not hasattr(first_choice, 'message'): + raise ValueError(f"First choice has no message attribute. Choice: {first_choice}") + + message = first_choice.message + if not hasattr(message, 'content'): + raise ValueError(f"Message has no content attribute. Message: {message}") + + content = message.content + return content or "" + + except openai.RateLimitError as e: + last_error = f"Rate limit: {e}" + print(f" Rate limit hit, waiting...") + await asyncio.sleep(2 ** attempt) + continue + except openai.APIStatusError as e: + if e.status_code in (400, 401, 403, 404): + raise LLMError(f"Critical LLM error: {e.status_code} - {e.message}") + if e.status_code >= 500: + last_error = f"Server error {e.status_code}" + print(f" Server error, retrying...") + await asyncio.sleep(2 ** attempt) + continue + raise LLMError(f"Unexpected status {e.status_code}: {e.message}") + except openai.APIError as e: + last_error = f"API error: {e}" + print(f" API error, retrying...") + await asyncio.sleep(2 ** attempt) + continue + except Exception as e: last_error = f"Client error: {e}" + print(f" Error, retrying...") await asyncio.sleep(2 ** attempt) continue raise LLMTransientError(f"Max retries ({self.retries}) exceeded. 
Last error: {last_error}") async def close(self): - await self.session.close() + await self.client.close() diff --git a/scripts/changelog_tool/changelog_tool/llm/processor.py b/scripts/changelog_tool/changelog_tool/llm/processor.py index 00305cd0987a..11511aa8d2a0 100644 --- a/scripts/changelog_tool/changelog_tool/llm/processor.py +++ b/scripts/changelog_tool/changelog_tool/llm/processor.py @@ -43,31 +43,35 @@ async def process_commits(self, commits: List[Commit]) -> Dict[str, Dict[str, An if not commits_to_process: return results - # Разбиваем на батчи - batches = [ - commits_to_process[i:i + self.config.max_commits_per_batch] - for i in range(0, len(commits_to_process), self.config.max_commits_per_batch) - ] + # Разбиваем на батчи с учетом размера промпта + batches = self._create_smart_batches(commits_to_process) + + total_commits = sum(len(batch) for batch in batches) + print(f"Processing {total_commits} commits in {len(batches)} batches...") # Обрабатываем батчи параллельно batch_results = await asyncio.gather( - *[self._process_batch(batch) for batch in batches], + *[self._process_batch(batch, i, len(batches), total_commits) for i, batch in enumerate(batches)], return_exceptions=True ) # Собираем результаты - for batch_result in batch_results: + completed_batches = 0 + for batch_idx, batch_result in enumerate(batch_results): if isinstance(batch_result, Exception): - print(f"Warning: Batch processing failed with exception: {batch_result}") # Ошибки в батчах уже записаны в стейт, просто продолжаем continue + completed_batches += 1 results.update(batch_result) + + print(f"Completed {completed_batches}/{len(batches)} batches") return results - async def _process_batch(self, batch: List[Commit]) -> Dict[str, Dict[str, Any]]: + async def _process_batch(self, batch: List[Commit], batch_idx: int = 0, total_batches: int = 0, total_commits: int = 0) -> Dict[str, Dict[str, Any]]: """Обрабатывает один батч коммитов.""" try: + print(f"[{batch_idx + 1}/{total_batches}] 
Processing {len(batch)} commits...") prompt = self._build_prompt(batch) # Проверяем длину промпта @@ -82,6 +86,7 @@ async def _process_batch(self, batch: List[Commit]) -> Dict[str, Dict[str, Any]] return { commit.sha: { "classification": "unclear", + "to_changelog": False, "changelog_line": "", "detailed_commit_analysis": "" } for commit in batch @@ -90,6 +95,15 @@ async def _process_batch(self, batch: List[Commit]) -> Dict[str, Dict[str, Any]] # Отправляем в LLM response_text = await self.llm_client.generate(prompt) + # Remove markdown code blocks if present + if response_text.strip().startswith('```json'): + response_text = response_text.strip()[7:] # Remove ```json + if response_text.strip().startswith('```'): + response_text = response_text.strip()[3:] # Remove ``` + if response_text.strip().endswith('```'): + response_text = response_text.strip()[:-3] # Remove trailing ``` + response_text = response_text.strip() + # Парсим ответ try: response_data = json.loads(response_text) @@ -107,20 +121,28 @@ async def _process_batch(self, batch: List[Commit]) -> Dict[str, Dict[str, Any]] if isinstance(commit_data, str): # Fallback if LLM returned just a string classification = commit_data + to_changelog = classification in ["feature", "breaking-change"] changelog_line = "" detailed_commit_analysis = "" else: classification = commit_data.get("classification", "unclear") + to_changelog = commit_data.get("to_changelog", None) changelog_line = commit_data.get("changelog_line", "") detailed_commit_analysis = commit_data.get("detailed_commit_analysis", "") - - await self.state.set_result(commit.sha, classification, changelog_line, detailed_commit_analysis) + + completed = await self.state.set_result(commit.sha, classification, changelog_line, detailed_commit_analysis, to_changelog) + if total_commits > 0: + remaining = total_commits - completed + print(f" Progress: {completed}/{total_commits} commits, {remaining} remaining") + results[commit.sha] = { "classification": 
classification, + "to_changelog": to_changelog, "changelog_line": changelog_line, "detailed_commit_analysis": detailed_commit_analysis } + print(f"[{batch_idx + 1}/{total_batches}] ✓ Completed") return results except LLMError: @@ -128,46 +150,107 @@ async def _process_batch(self, batch: List[Commit]) -> Dict[str, Dict[str, Any]] raise except Exception as e: # Временная ошибка или другая проблема - помечаем коммиты как ошибочные - error_msg = str(e) + error_msg = f"{type(e).__name__}: {str(e)}" + print(f"✗ Batch {batch_idx + 1}/{total_batches} failed: {error_msg}") for commit in batch: await self.state.set_error(commit.sha, error_msg) return { commit.sha: { "classification": "unclear", + "to_changelog": None, "changelog_line": "", "detailed_commit_analysis": "" } for commit in batch } + def _create_smart_batches(self, commits: List[Commit]) -> List[List[Commit]]: + if not commits: + return [] + + batches = [] + current_batch = [] + current_prompt_size = 0 + + system_prompt_size = len(self._build_prompt([])) + + for commit in commits: + commit_prompt_size = self._estimate_commit_size(commit) + + can_add = ( + len(current_batch) < self.config.max_commits_per_batch and + (current_prompt_size + commit_prompt_size + system_prompt_size) <= self.config.max_user_prompt_length + ) + + if can_add: + current_batch.append(commit) + current_prompt_size += commit_prompt_size + else: + if current_batch: + batches.append(current_batch) + + current_batch = [commit] + current_prompt_size = commit_prompt_size + + if current_batch: + batches.append(current_batch) + + return batches + + def _estimate_commit_size(self, commit: Commit) -> int: + """Оценивает размер промпта для одного коммита.""" + size = len(commit.sha) + len(commit.title) + len(commit.message) + size += len(', '.join(f.path for f in commit.changed_files)) + + if self.config.include_diff: + diff = get_commit_diff(commit) + size += len(diff) + + return size + 200 # запас на JSON форматирование и разделители + def 
_build_prompt(self, commits: List[Commit]) -> str: """Формирует промпт для батча коммитов.""" system_prompt = """You are an expert software engineer analyzing git commits for a changelog. Your task is to analyze commits since the last release and highlight important and interesting changes. Ignore simple bugfixes, typos, and minor refactoring. +IMPORTANT: This is for the USERVER project - a C++ asynchronous framework. Focus on changes that are significant for users of this framework. + For each commit, you MUST provide a JSON object with the following fields: 1. "classification": One of ["feature", "breaking-change", "refactor", "minor", "unclear"]. - Use "breaking-change" if the commit introduces backward-incompatible changes. - - Use "feature" for new functionality. + - Use "feature" for new functionality that is important for USERVER users. - Use "refactor" for significant architectural changes. - Use "minor" for small improvements. - Use "unclear" if you cannot determine the classification. -2. "changelog_line": A concise, user-friendly description of the change suitable for a changelog. +2. "to_changelog": Boolean - MUST be true for: + - ALL breaking-change commits (these are critical for users) + - Features that are significant for USERVER users (new components, major APIs, important functionality) + - MUST be false for: minor refactoring, bugfixes, typos, internal changes, test updates +3. "changelog_line": A concise, user-friendly description of the change suitable for a changelog. - IMPORTANT: If the classification is "breaking-change", you MUST include migration or fix instructions in this line if they are present in the commit message. -3. "detailed_commit_analysis": A detailed analysis of what was added, why it was added, and what impact or benefit it brings to the project. + - Only include this if to_changelog is true. +4. 
"detailed_commit_analysis": A detailed analysis of what was added, why it was added, and what impact or benefit it brings to the project. You MUST return a valid JSON object where keys are commit SHAs and values are the analysis objects. Example output format: { "commit_sha_1": { "classification": "feature", + "to_changelog": true, "changelog_line": "Added support for async LLM processing", "detailed_commit_analysis": "Added a new LLMProcessor class to handle batching and async requests. This improves performance by allowing parallel processing of commits." }, "commit_sha_2": { "classification": "breaking-change", + "to_changelog": true, "changelog_line": "Changed config format. Migration: rename 'llm_config' to 'llm-config' in your yaml file.", "detailed_commit_analysis": "Updated the configuration schema to use hyphens instead of underscores for consistency. This breaks existing configs but aligns with the project's naming conventions." + }, + "commit_sha_3": { + "classification": "minor", + "to_changelog": false, + "changelog_line": "", + "detailed_commit_analysis": "Fixed typo in documentation." 
} } """ diff --git a/scripts/changelog_tool/changelog_tool/llm/state.py b/scripts/changelog_tool/changelog_tool/llm/state.py index 85ebbe095262..2295c2e6e35c 100644 --- a/scripts/changelog_tool/changelog_tool/llm/state.py +++ b/scripts/changelog_tool/changelog_tool/llm/state.py @@ -29,20 +29,19 @@ async def load(self) -> None: async def save(self) -> None: """Асинхронно сохраняет состояние в файл.""" - async with self.lock: - # Создаем директорию если её нет - self.state_file_path.parent.mkdir(parents=True, exist_ok=True) - - # Атомарная запись через временный файл - temp_file = self.state_file_path.with_suffix('.tmp') - try: - with open(temp_file, 'w', encoding='utf-8') as f: - json.dump(self.state, f, ensure_ascii=False, indent=2) - temp_file.replace(self.state_file_path) - except IOError as e: - print(f"Error: Could not save state file {self.state_file_path}: {e}") - if temp_file.exists(): - temp_file.unlink() + # Создаем директорию если её нет + self.state_file_path.parent.mkdir(parents=True, exist_ok=True) + + # Атомарная запись через временный файл + temp_file = self.state_file_path.with_suffix('.tmp') + try: + with open(temp_file, 'w', encoding='utf-8') as f: + json.dump(self.state, f, ensure_ascii=False, indent=2) + temp_file.replace(self.state_file_path) + except IOError as e: + print(f"Error: Could not save state file {self.state_file_path}: {e}") + if temp_file.exists(): + temp_file.unlink() async def cleanup(self, valid_shas: Set[str]) -> None: """Удаляет из стейта коммиты, не попавшие в текущую выборку.""" @@ -61,16 +60,19 @@ async def get_result(self, sha: str) -> Optional[Dict[str, Any]]: return commit_data return None - async def set_result(self, sha: str, classification: str, changelog_line: str, detailed_commit_analysis: str) -> None: - """Сохраняет успешный результат классификации.""" + async def set_result(self, sha: str, classification: str, changelog_line: str, detailed_commit_analysis: str, to_changelog: bool = False) -> int: + """Сохраняет 
успешный результат классификации. Возвращает количество готовых коммитов.""" async with self.lock: self.state[sha] = { "classification": classification, + "to_changelog": to_changelog, "changelog_line": changelog_line, "detailed_commit_analysis": detailed_commit_analysis, "error": None } + completed = len([k for k, v in self.state.items() if v.get("error") is None]) await self.save() + return completed async def set_error(self, sha: str, error_message: str) -> None: """Сохраняет ошибку классификации.""" diff --git a/scripts/changelog_tool/requirements.txt b/scripts/changelog_tool/requirements.txt index 30798fcb5dec..085d00e208db 100644 --- a/scripts/changelog_tool/requirements.txt +++ b/scripts/changelog_tool/requirements.txt @@ -1,5 +1,5 @@ click >= 8.0.0 PyYAML >= 6.0.1 pydantic >= 2.5.3 -aiohttp >= 3.9.0 -aiolimiter >= 1.1.0 \ No newline at end of file +openai >= 1.0.0 +aiolimiter >= 1.1.0 From 16c5adc0032ebe6c0fc664b2bf6c8afce9f63b00 Mon Sep 17 00:00:00 2001 From: Ivan Skriabin Date: Wed, 10 Jun 2026 06:31:41 +0300 Subject: [PATCH 10/14] review --- scripts/changelog_tool/AGENTS.md | 40 +++++ scripts/changelog_tool/changelog-tool | 8 + scripts/changelog_tool/changelog.yaml | 5 +- .../changelog_tool/collect/classification.py | 8 +- .../changelog_tool/changelog_tool/config.py | 2 + .../changelog_tool/review/command.py | 157 ++++++++++++++++++ .../changelog_tool/review/config.py | 6 + 7 files changed, 218 insertions(+), 8 deletions(-) create mode 100644 scripts/changelog_tool/changelog_tool/review/command.py create mode 100644 scripts/changelog_tool/changelog_tool/review/config.py diff --git a/scripts/changelog_tool/AGENTS.md b/scripts/changelog_tool/AGENTS.md index e1bf08ab547e..c0922631badb 100644 --- a/scripts/changelog_tool/AGENTS.md +++ b/scripts/changelog_tool/AGENTS.md @@ -28,6 +28,45 @@ source .vent/bin/activate ./changelog-tool [command] [options] ``` +## Commands + +### collect + +Collects commits from the specified range and classifies them using 
heuristics and LLM analysis. + +```bash +./changelog-tool collect [options] +``` + +Options: +- `--from-sha`: Starting commit SHA (overrides config) +- `--to-sha`: Ending commit SHA (overrides config) +- `--repo-path`: Path to the repository (overrides config) + +### review + +Generates a markdown report and an override YAML file for reviewing classified commits. + +```bash +./changelog-tool review +``` + +The review command generates two files in the output directory: +- `review_report.md`: A markdown report showing all commits, sorted by size, with their classification status, changelog lines, and analysis +- `override.yaml`: A commented YAML file containing all commits that can be uncommented and modified to override classifications + +The report is divided into two sections: +1. **Not in Changelog**: Commits that are not included in the changelog (either filtered by heuristics or marked as unclear) +2. **In Changelog**: Commits that are included in the changelog + +Each commit in the report shows: +- Commit hash with link to GitHub +- Commit title +- Status (✅ In Changelog, ❌ Not in Changelog, or ❓ Unclear) +- Size (number of lines changed) +- Changelog line (if available) +- Analysis (if available) + ## Output Directory By default, the tool outputs classified commits to `.changelog/preclassified.json`. 
You can customize this with the `--output-dir` global option: @@ -35,4 +74,5 @@ By default, the tool outputs classified commits to `.changelog/preclassified.jso ```bash # Run with custom output directory ./changelog-tool --output-dir ./my-output-dir collect +./changelog-tool --output-dir ./my-output-dir review ``` \ No newline at end of file diff --git a/scripts/changelog_tool/changelog-tool b/scripts/changelog_tool/changelog-tool index 858937ac1b2c..702ce86fc036 100755 --- a/scripts/changelog_tool/changelog-tool +++ b/scripts/changelog_tool/changelog-tool @@ -4,6 +4,7 @@ import click import changelog_tool.config as cfg import changelog_tool.collect.command as collect_cmd +import changelog_tool.review.command as review_cmd @click.group() @click.option('--config', default='changelog.yaml') @@ -14,6 +15,7 @@ def cli(ctx: click.Context, config: str, output_dir: pathlib.Path | None): parsed_config = cfg.parse_config(pathlib.Path(config)) if output_dir: parsed_config.collect.output_dir = output_dir + parsed_config.review.output_dir = output_dir ctx.obj["CONFIG"] = parsed_config @cli.command() @@ -33,5 +35,11 @@ def collect(ctx: click.Context, from_sha: str | None, to_sha: str | None, repo_p collect_cmd.collect(config) +@cli.command() +@click.pass_context +def review(ctx: click.Context): + config = ctx.obj["CONFIG"] + review_cmd.review(config) + if __name__ == '__main__': cli() \ No newline at end of file diff --git a/scripts/changelog_tool/changelog.yaml b/scripts/changelog_tool/changelog.yaml index 7b6e592a0e8d..e570506f28f0 100644 --- a/scripts/changelog_tool/changelog.yaml +++ b/scripts/changelog_tool/changelog.yaml @@ -10,6 +10,9 @@ llm-config: target_rps: 1 retries: 3 max_commits_per_batch: 10 - max_user_prompt_length: 50000 + max_user_prompt_length: 100000 include_diff: true truncate_diff: false + +review: + github_url: "https://github.com/userver-framework/userver" diff --git a/scripts/changelog_tool/changelog_tool/collect/classification.py 
b/scripts/changelog_tool/changelog_tool/collect/classification.py index 96e629d241c8..0dbe6125a0ab 100644 --- a/scripts/changelog_tool/changelog_tool/collect/classification.py +++ b/scripts/changelog_tool/changelog_tool/collect/classification.py @@ -24,12 +24,6 @@ class ClassifiedCommit(Commit): commit_analysis: str | None = None def classify_commit(commit: Commit) -> Classification: - has_docs_in_files = any( - "docs/" in file_change.path.lower() or - "documentation" in file_change.path.lower() - for file_change in commit.changed_files - ) - doc_keywords = ["doc", "docs", "documentation", "readme"] commit_title_lower = commit.title.lower() has_docs_in_title = any(keyword in commit_title_lower for keyword in doc_keywords) @@ -37,7 +31,7 @@ def classify_commit(commit: Commit) -> Classification: fix_keywords = ["fix", "bugfix", "bug"] has_fix = any(keyword in commit_title_lower for keyword in fix_keywords) - if has_docs_in_files or has_docs_in_title: + if has_docs_in_title: return Classification.DOCS if has_fix and commit.score_size <= MINOR_BUG_SIZE_THRESHOLD: diff --git a/scripts/changelog_tool/changelog_tool/config.py b/scripts/changelog_tool/changelog_tool/config.py index a0ba032814ab..fc0ada2f8ab3 100644 --- a/scripts/changelog_tool/changelog_tool/config.py +++ b/scripts/changelog_tool/changelog_tool/config.py @@ -1,5 +1,6 @@ from changelog_tool.collect.config import CollectConfig from changelog_tool.llm.config import LLMConfig +from changelog_tool.review.config import ReviewConfig import pydantic import yaml @@ -8,6 +9,7 @@ class Config(pydantic.BaseModel): collect: CollectConfig llm_config: LLMConfig = pydantic.Field(alias="llm-config") + review: ReviewConfig def parse_config(config_path: pathlib.Path) -> Config: with open(config_path, 'r') as f: diff --git a/scripts/changelog_tool/changelog_tool/review/command.py b/scripts/changelog_tool/changelog_tool/review/command.py new file mode 100644 index 000000000000..8249499d71bd --- /dev/null +++ 
b/scripts/changelog_tool/changelog_tool/review/command.py @@ -0,0 +1,157 @@ +import pathlib +from typing import List + +import changelog_tool.common.io as io +from changelog_tool.config import Config +from changelog_tool.collect.classification import ClassifiedCommit, Classification + + +def review(config: Config) -> None: + print(f"Loading classified commits from {config.review.output_dir}...") + classified_commits: List[ClassifiedCommit] = io.load_classified_commits( + config.review.output_dir, 'classified.json' + ) + + if not classified_commits: + print("No classified commits found. Please run 'collect' command first.") + return + + print(f"Found {len(classified_commits)} classified commits") + + # Split commits into two groups + not_in_changelog: List[ClassifiedCommit] = [] + in_changelog: List[ClassifiedCommit] = [] + + for commit in classified_commits: + if commit.to_changelog is False or commit.classification == Classification.UNCLEAR: + not_in_changelog.append(commit) + elif commit.to_changelog is True: + in_changelog.append(commit) + + # Sort both groups by score_size (descending) + not_in_changelog.sort(key=lambda c: c.score_size, reverse=True) + in_changelog.sort(key=lambda c: c.score_size, reverse=True) + + # Generate markdown report + markdown_content = _generate_markdown_report( + not_in_changelog, in_changelog, config.review.github_url + ) + + # Generate override YAML + override_yaml_content = _generate_override_yaml( + not_in_changelog, in_changelog + ) + + # Write output files + output_dir = config.review.output_dir + output_dir.mkdir(parents=True, exist_ok=True) + + markdown_file = output_dir / 'review_report.md' + with open(markdown_file, 'w') as f: + f.write(markdown_content) + print(f"Generated markdown report: {markdown_file}") + + override_file = output_dir / 'override.yaml' + with open(override_file, 'w') as f: + f.write(override_yaml_content) + print(f"Generated override YAML: {override_file}") + + +def _generate_markdown_report( + 
not_in_changelog: List[ClassifiedCommit], + in_changelog: List[ClassifiedCommit], + github_url: str +) -> str: + lines = [] + + # Header + lines.append("# Changelog Review Report\n") + + # Not in changelog section + lines.append("## Not in Changelog\n") + lines.append(f"Total: {len(not_in_changelog)} commits\n") + + for commit in not_in_changelog: + lines.append(_format_commit_markdown(commit, github_url)) + lines.append("") + + # In changelog section + lines.append("## In Changelog\n") + lines.append(f"Total: {len(in_changelog)} commits\n") + + for commit in in_changelog: + lines.append(_format_commit_markdown(commit, github_url)) + lines.append("") + + return "\n".join(lines) + + +def _format_commit_markdown(commit: ClassifiedCommit, github_url: str) -> str: + short_sha = commit.sha[:8] + commit_url = f"{github_url}/commit/{commit.sha}" + + lines = [] + lines.append(f"### [{short_sha}]({commit_url}) {commit.title}") + lines.append("") + + # Status + if commit.to_changelog is True: + status = "✅ In Changelog" + elif commit.to_changelog is False: + status = f"❌ Not in Changelog (Classification: {commit.classification})" + else: + status = f"❓ Unclear (Classification: {commit.classification})" + + lines.append(f"**Status:** {status}") + lines.append(f"**Size:** {commit.score_size} lines changed") + lines.append("") + + # Changelog line (if available) + if commit.changelog_line: + lines.append(f"**Changelog Line:** {commit.changelog_line}") + lines.append("") + + # Analysis (if available) + if commit.commit_analysis: + lines.append("**Analysis:**") + lines.append(commit.commit_analysis) + lines.append("") + + return "\n".join(lines) + + +def _generate_override_yaml( + not_in_changelog: List[ClassifiedCommit], + in_changelog: List[ClassifiedCommit] +) -> str: + lines = [] + + # Header comment + lines.append("# Override file for changelog classification") + lines.append("# Uncomment and modify entries to override classification") + lines.append("") + + # Process all 
commits in order + all_commits = not_in_changelog + in_changelog + + for commit in all_commits: + lines.append(f"# {commit.sha}:") + lines.append(f"# commit_title: \"{commit.title}\"") + + if commit.to_changelog is True: + to_changelog = "true" + elif commit.to_changelog is False: + to_changelog = "false" + else: + to_changelog = "null" + + lines.append(f"# to_changelog: {to_changelog}") + + if commit.changelog_line: + lines.append(f"# changelog_line: \"{commit.changelog_line}\"") + else: + lines.append(f"# changelog_line: null") + + lines.append("") + + return "\n".join(lines) diff --git a/scripts/changelog_tool/changelog_tool/review/config.py b/scripts/changelog_tool/changelog_tool/review/config.py new file mode 100644 index 000000000000..9a40fbf578ec --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/review/config.py @@ -0,0 +1,6 @@ +import pathlib +import pydantic + +class ReviewConfig(pydantic.BaseModel): + github_url: str + output_dir: pathlib.Path = pydantic.Field(default_factory=lambda: pathlib.Path(".changelog")) From dbdb6acfca89d6669a60ff8f6fa9d765e3a2a43f Mon Sep 17 00:00:00 2001 From: Ivan Skriabin Date: Wed, 10 Jun 2026 10:21:38 +0300 Subject: [PATCH 11/14] report generation --- scripts/changelog_tool/AGENTS.md | 25 +++ scripts/changelog_tool/changelog-tool | 8 + scripts/changelog_tool/changelog.yaml | 7 +- .../changelog_tool/collect/classification.py | 1 + .../changelog_tool/changelog_tool/config.py | 2 + .../changelog_tool/llm/processor.py | 3 +- .../changelog_tool/report/command.py | 175 ++++++++++++++++++ .../changelog_tool/report/config.py | 6 + 8 files changed, 224 insertions(+), 3 deletions(-) create mode 100644 scripts/changelog_tool/changelog_tool/report/command.py create mode 100644 scripts/changelog_tool/changelog_tool/report/config.py diff --git a/scripts/changelog_tool/AGENTS.md b/scripts/changelog_tool/AGENTS.md index c0922631badb..c53eb45e36be 100644 --- a/scripts/changelog_tool/AGENTS.md +++ b/scripts/changelog_tool/AGENTS.md @@ 
-67,6 +67,31 @@ Each commit in the report shows: - Changelog line (if available) - Analysis (if available) +### report + +Generates a formatted Markdown changelog based on the review output and applies user overrides. + +```bash +./changelog-tool report +``` + +The report command performs the following steps: +1. Loads classified commits from `classified.json` +2. Applies overrides from `override.yaml` (if present) +3. Identifies commits marked for the changelog that lack changelog lines or analysis +4. Runs these commits through the LLM with 1.5x increased prompt size and diff truncation enabled +5. Generates a formatted Markdown changelog grouped by classification: + - Breaking Changes + - Features + - Optimizations + - Bug Fixes + - Refactoring + - Minor Changes + - Documentation +6. Appends "Many thanks to [Name] for the PR!" for external contributors in the changelog +7. Appends a section at the end for external contributors not included in the changelog +8. Saves the generated changelog to `changelog.md` in the output directory + ## Output Directory By default, the tool outputs classified commits to `.changelog/preclassified.json`. 
You can customize this with the `--output-dir` global option: diff --git a/scripts/changelog_tool/changelog-tool b/scripts/changelog_tool/changelog-tool index 702ce86fc036..801aca07f57b 100755 --- a/scripts/changelog_tool/changelog-tool +++ b/scripts/changelog_tool/changelog-tool @@ -5,6 +5,7 @@ import click import changelog_tool.config as cfg import changelog_tool.collect.command as collect_cmd import changelog_tool.review.command as review_cmd +import changelog_tool.report.command as report_cmd @click.group() @click.option('--config', default='changelog.yaml') @@ -16,6 +17,7 @@ def cli(ctx: click.Context, config: str, output_dir: pathlib.Path | None): if output_dir: parsed_config.collect.output_dir = output_dir parsed_config.review.output_dir = output_dir + parsed_config.report.output_dir = output_dir ctx.obj["CONFIG"] = parsed_config @cli.command() @@ -41,5 +43,11 @@ def review(ctx: click.Context): config = ctx.obj["CONFIG"] review_cmd.review(config) +@cli.command() +@click.pass_context +def report(ctx: click.Context): + config = ctx.obj["CONFIG"] + report_cmd.report(config) + if __name__ == '__main__': cli() \ No newline at end of file diff --git a/scripts/changelog_tool/changelog.yaml b/scripts/changelog_tool/changelog.yaml index e570506f28f0..b76d6e3bf37a 100644 --- a/scripts/changelog_tool/changelog.yaml +++ b/scripts/changelog_tool/changelog.yaml @@ -1,5 +1,5 @@ collect: - from_sha: c580979b522f43ea1ab9cd55033cd353d52844f6 + from_sha: da8642900398c33333e29e2bd3e91ca4e181f602 to_sha: HEAD repo_path: ../.. 
core_team_patterns: @@ -9,10 +9,13 @@ collect: llm-config: target_rps: 1 retries: 3 - max_commits_per_batch: 10 + max_commits_per_batch: 4 max_user_prompt_length: 100000 include_diff: true truncate_diff: false review: github_url: "https://github.com/userver-framework/userver" + +report: + github_url: "https://github.com/userver-framework/userver" diff --git a/scripts/changelog_tool/changelog_tool/collect/classification.py b/scripts/changelog_tool/changelog_tool/collect/classification.py index 0dbe6125a0ab..9f0a9c0abc2c 100644 --- a/scripts/changelog_tool/changelog_tool/collect/classification.py +++ b/scripts/changelog_tool/changelog_tool/collect/classification.py @@ -12,6 +12,7 @@ class Classification(str, Enum): DOCS = "docs" UNCLEAR = "unclear" MINOR="minor" + OPTIMIZATION = "optimization" MINOR_BUG_SIZE_THRESHOLD = 200 MINOR_SIZE_THRESHOLD = 50 diff --git a/scripts/changelog_tool/changelog_tool/config.py b/scripts/changelog_tool/changelog_tool/config.py index fc0ada2f8ab3..57b38b878d01 100644 --- a/scripts/changelog_tool/changelog_tool/config.py +++ b/scripts/changelog_tool/changelog_tool/config.py @@ -1,6 +1,7 @@ from changelog_tool.collect.config import CollectConfig from changelog_tool.llm.config import LLMConfig from changelog_tool.review.config import ReviewConfig +from changelog_tool.report.config import ReportConfig import pydantic import yaml @@ -10,6 +11,7 @@ class Config(pydantic.BaseModel): collect: CollectConfig llm_config: LLMConfig = pydantic.Field(alias="llm-config") review: ReviewConfig + report: ReportConfig def parse_config(config_path: pathlib.Path) -> Config: with open(config_path, 'r') as f: diff --git a/scripts/changelog_tool/changelog_tool/llm/processor.py b/scripts/changelog_tool/changelog_tool/llm/processor.py index 11511aa8d2a0..ac3bde54142b 100644 --- a/scripts/changelog_tool/changelog_tool/llm/processor.py +++ b/scripts/changelog_tool/changelog_tool/llm/processor.py @@ -216,11 +216,12 @@ def _build_prompt(self, commits: List[Commit]) 
-> str: IMPORTANT: This is for the USERVER project - a C++ asynchronous framework. Focus on changes that are significant for users of this framework. For each commit, you MUST provide a JSON object with the following fields: -1. "classification": One of ["feature", "breaking-change", "refactor", "minor", "unclear"]. +1. "classification": One of ["feature", "breaking-change", "refactor", "minor", "optimization", "unclear"]. - Use "breaking-change" if the commit introduces backward-incompatible changes. - Use "feature" for new functionality that is important for USERVER users. - Use "refactor" for significant architectural changes. - Use "minor" for small improvements. + - Use "optimization" for performance improvements, optimizations, and efficiency gains. - Use "unclear" if you cannot determine the classification. 2. "to_changelog": Boolean - MUST be true for: - ALL breaking-change commits (these are critical for users) diff --git a/scripts/changelog_tool/changelog_tool/report/command.py b/scripts/changelog_tool/changelog_tool/report/command.py new file mode 100644 index 000000000000..073736d7b0d4 --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/report/command.py @@ -0,0 +1,175 @@ +import asyncio +import pathlib +import re +from typing import List, Dict, Any + +import changelog_tool.common.git as git +import changelog_tool.common.io as io +from changelog_tool.config import Config +from changelog_tool.collect.classification import ClassifiedCommit, Classification +from changelog_tool.llm.client import HttpLLMClient +from changelog_tool.llm.processor import LLMProcessor +from changelog_tool.llm.config import LLMConfig + + +def report(config: Config) -> None: + print(f"Loading classified commits from {config.report.output_dir}...") + classified_commits: List[ClassifiedCommit] = io.load_classified_commits( + config.report.output_dir, 'classified.json' + ) + + if not classified_commits: + print("No classified commits found. 
Please run 'collect' command first.") + return + + print(f"Found {len(classified_commits)} classified commits") + + # Load and apply overrides + override_file = config.report.output_dir / 'override.yaml' + if override_file.exists(): + print(f"Applying overrides from {override_file}...") + _apply_overrides(classified_commits, override_file) + + # Identify commits that need LLM analysis + commits_needing_analysis = [ + commit for commit in classified_commits + if commit.to_changelog is True and (not commit.changelog_line or not commit.commit_analysis) + ] + + if commits_needing_analysis: + print(f"Found {len(commits_needing_analysis)} commits needing LLM analysis") + + # Create modified LLM config with 1.5x prompt size and truncate enabled + modified_llm_config = LLMConfig( + target_rps=config.llm_config.target_rps, + retries=config.llm_config.retries, + max_commits_per_batch=config.llm_config.max_commits_per_batch, + max_user_prompt_length=int(config.llm_config.max_user_prompt_length * 1.5), + include_diff=config.llm_config.include_diff, + truncate_diff=True + ) + + llm_client = HttpLLMClient(modified_llm_config) + llm_processor = LLMProcessor(modified_llm_config, llm_client, config.report.output_dir) + + llm_results = asyncio.run(llm_processor.process_commits(commits_needing_analysis)) + + # Update commits with LLM results + for commit in classified_commits: + if commit.sha in llm_results: + result = llm_results[commit.sha] + commit.changelog_line = result.get("changelog_line", "") + commit.commit_analysis = result.get("detailed_commit_analysis", "") + try: + commit.classification = Classification(result.get("classification", "unclear")) + except ValueError: + pass + print(f"Updated commit {commit.sha} with LLM results") + + # Generate changelog + print("Generating changelog...") + changelog_content = _generate_changelog(classified_commits, config.report.github_url) + + # Save changelog + changelog_file = config.report.output_dir / 'changelog.md' + with 
open(changelog_file, 'w') as f: + f.write(changelog_content) + print(f"Generated changelog: {changelog_file}") + + +def _apply_overrides(commits: List[ClassifiedCommit], override_file: pathlib.Path) -> None: + """Parse override.yaml and apply overrides to commits.""" + import yaml + + with open(override_file, 'r') as f: + override_data = yaml.safe_load(f) + + if not override_data: + return + + # Create a mapping of SHA to commit for quick lookup + commit_map = {commit.sha: commit for commit in commits} + + for sha, override in override_data.items(): + if sha in commit_map: + commit = commit_map[sha] + if 'to_changelog' in override: + commit.to_changelog = override['to_changelog'] + if 'changelog_line' in override: + commit.changelog_line = override['changelog_line'] + + +def _generate_changelog(commits: List[ClassifiedCommit], github_url: str) -> str: + """Generate formatted Markdown changelog.""" + lines = [] + + # Group commits by classification + groups: Dict[str, List[ClassifiedCommit]] = {} + for commit in commits: + if commit.to_changelog is True and commit.changelog_line: + classification = commit.classification.value + if classification not in groups: + groups[classification] = [] + groups[classification].append(commit) + + # Define order of classifications + classification_order = [ + "breaking-change", + "feature", + "optimization", + "bug", + "refactor", + "minor", + "docs", + "unclear" + ] + + # Generate sections for each classification + for classification in classification_order: + if classification not in groups: + continue + + section_commits = groups[classification] + if not section_commits: + continue + + # Section header + section_title = classification.replace("-", " ").title() + lines.append(f"* {section_title}") + lines.append("") + + # Commit entries + for commit in section_commits: + line = f" * {commit.changelog_line}" + + # Add external contributor thanks + if commit.is_external: + author_name = _extract_author_name(commit.author) + line 
+= f" Many thanks to {author_name} for the PR!" + + lines.append(line) + + lines.append("") + + # Collect external contributors not in changelog + external_contributors_not_in_changelog = set() + for commit in commits: + if commit.is_external and (commit.to_changelog is False or commit.to_changelog is None): + author_name = _extract_author_name(commit.author) + external_contributors_not_in_changelog.add(author_name) + + if external_contributors_not_in_changelog: + lines.append("* Many thanks to:") + for contributor in sorted(external_contributors_not_in_changelog): + lines.append(f" * {contributor} for the contribution!") + lines.append("") + + return "\n".join(lines) + + +def _extract_author_name(author: str) -> str: + """Extract author name from 'Name ' format.""" + match = re.match(r'^(.+?)\s*<', author) + if match: + return match.group(1).strip() + return author diff --git a/scripts/changelog_tool/changelog_tool/report/config.py b/scripts/changelog_tool/changelog_tool/report/config.py new file mode 100644 index 000000000000..74d84c30fc98 --- /dev/null +++ b/scripts/changelog_tool/changelog_tool/report/config.py @@ -0,0 +1,6 @@ +import pathlib +import pydantic + +class ReportConfig(pydantic.BaseModel): + github_url: str + output_dir: pathlib.Path = pydantic.Field(default_factory=lambda: pathlib.Path(".changelog")) From c4de078d7f700c17e75cdbf04aa9d7f281f5218f Mon Sep 17 00:00:00 2001 From: Ivan Skriabin Date: Wed, 10 Jun 2026 11:43:13 +0300 Subject: [PATCH 12/14] changelog-tool complete --- scripts/changelog_tool/changelog.yaml | 3 +- .../changelog_tool/collect/classification.py | 1 + .../changelog_tool/collect/command.py | 38 +++++++++- .../changelog_tool/llm/client.py | 66 ++++++++--------- .../changelog_tool/llm/config.py | 1 + .../changelog_tool/report/command.py | 71 +++++++++++++++---- 6 files changed, 134 insertions(+), 46 deletions(-) diff --git a/scripts/changelog_tool/changelog.yaml b/scripts/changelog_tool/changelog.yaml index 
b76d6e3bf37a..e38c1c2ee8e7 100644 --- a/scripts/changelog_tool/changelog.yaml +++ b/scripts/changelog_tool/changelog.yaml @@ -8,11 +8,12 @@ collect: llm-config: target_rps: 1 - retries: 3 + retries: 7 max_commits_per_batch: 4 max_user_prompt_length: 100000 include_diff: true truncate_diff: false + max_concurrent_requests: 2 review: github_url: "https://github.com/userver-framework/userver" diff --git a/scripts/changelog_tool/changelog_tool/collect/classification.py b/scripts/changelog_tool/changelog_tool/collect/classification.py index 9f0a9c0abc2c..cd83f7c870ff 100644 --- a/scripts/changelog_tool/changelog_tool/collect/classification.py +++ b/scripts/changelog_tool/changelog_tool/collect/classification.py @@ -23,6 +23,7 @@ class ClassifiedCommit(Commit): to_changelog: bool | None = None changelog_line: str | None = None commit_analysis: str | None = None + component: str | None = None def classify_commit(commit: Commit) -> Classification: doc_keywords = ["doc", "docs", "documentation", "readme"] diff --git a/scripts/changelog_tool/changelog_tool/collect/command.py b/scripts/changelog_tool/changelog_tool/collect/command.py index 25eeb1e14b41..9e02ccb07adc 100644 --- a/scripts/changelog_tool/changelog_tool/collect/command.py +++ b/scripts/changelog_tool/changelog_tool/collect/command.py @@ -11,6 +11,38 @@ from changelog_tool.llm.processor import LLMProcessor from changelog_tool.llm.exceptions import LLMError + +def _extract_component_from_title(title: str) -> str | None: + """Extract component name from commit title. + + Examples: + - "feat odbc: improve driver" -> "odbc" + - "fix(redis): connection leak" -> "redis" + - "feat chaotic: deal with..." 
-> "chaotic" + - "docs: update README" -> None + """ + # Pattern: type(component): or type component: or type component description + match = re.match(r'^(\w+)(?:\(([^)]+)\))?:?\s*(.+)', title) + if match: + commit_type = match.group(1) + component = match.group(2) + description = match.group(3) + + # If component in parentheses, use it + if component: + return component.lower() + + # If no component in parentheses, check description + words = description.split() + if words: + # Check if first word ends with colon (e.g., "odbc: improve driver") + if words[0].endswith(':'): + return words[0][:-1].lower() + # Check if first word is followed by a colon (e.g., "chaotic: deal with...") + if len(words) > 1 and words[1].startswith(':'): + return words[0].lower() + return None + def collect(config: Config) -> None: print(f"Collecting commits from {config.collect.from_sha} to {config.collect.to_sha}...") commits: list[git.Commit] = git.get_commits(config.collect.from_sha, config.collect.to_sha, config.collect.repo_path) @@ -21,13 +53,17 @@ def collect(config: Config) -> None: for commit in commits: is_core_team = any(regex.match(commit.author) for regex in core_team_regexes) classification = classify_commit(commit) + # Extract component from title (e.g., "feat odbc: improve driver" -> "odbc") + component = _extract_component_from_title(commit.title) + classified_commit = ClassifiedCommit( **commit.model_dump(), classification=classification, is_external=not is_core_team, to_changelog=None, changelog_line=None, - commit_analysis=None + commit_analysis=None, + component=component ) if classification in [Classification.FEATURE, Classification.BUG, Classification.BREAKING_CHANGE]: diff --git a/scripts/changelog_tool/changelog_tool/llm/client.py b/scripts/changelog_tool/changelog_tool/llm/client.py index 26880f6e8e9b..76525cba7d05 100644 --- a/scripts/changelog_tool/changelog_tool/llm/client.py +++ b/scripts/changelog_tool/changelog_tool/llm/client.py @@ -44,6 +44,7 @@ def 
__init__(self, config: LLMConfig): raise RuntimeError("Missing required environment variable: either CHANGELOG_LLM_API_KEY or CHANGELOG_LLM_OAUTH_KEY must be set") self.limiter = AsyncLimiter(config.target_rps, 1) + self.semaphore = asyncio.Semaphore(config.max_concurrent_requests) http_client = httpx.AsyncClient(verify=False) @@ -58,16 +59,17 @@ def __init__(self, config: LLMConfig): async def generate(self, prompt: str) -> str: last_error = None - for attempt in range(self.retries + 1): - try: - async with self.limiter: - if attempt > 0: - print(f" Retrying ({attempt}/{self.retries})...") - - response = await self.client.chat.completions.create( - model=self.model, - messages=[{"role": "user", "content": prompt}], - ) + async with self.semaphore: + for attempt in range(self.retries + 1): + try: + async with self.limiter: + if attempt > 0: + print(f" Retrying ({attempt}/{self.retries})...") + + response = await self.client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": prompt}], + ) # Handle non-standard API response format # Some APIs return the actual data in a 'response' dict attribute @@ -102,30 +104,30 @@ async def generate(self, prompt: str) -> str: content = message.content return content or "" - except openai.RateLimitError as e: - last_error = f"Rate limit: {e}" - print(f" Rate limit hit, waiting...") - await asyncio.sleep(2 ** attempt) - continue - except openai.APIStatusError as e: - if e.status_code in (400, 401, 403, 404): - raise LLMError(f"Critical LLM error: {e.status_code} - {e.message}") - if e.status_code >= 500: - last_error = f"Server error {e.status_code}" - print(f" Server error, retrying...") + except openai.RateLimitError as e: + last_error = f"Rate limit: {e}" + print(f" Rate limit hit, waiting...") + await asyncio.sleep(2 ** attempt) + continue + except openai.APIStatusError as e: + if e.status_code in (400, 401, 403, 404): + raise LLMError(f"Critical LLM error: {e.status_code} - {e.message}") + if 
e.status_code >= 500: + last_error = f"Server error {e.status_code}" + print(f" Server error, retrying...") + await asyncio.sleep(2 ** attempt) + continue + raise LLMError(f"Unexpected status {e.status_code}: {e.message}") + except openai.APIError as e: + last_error = f"API error: {e}" + print(f" API error, retrying...") + await asyncio.sleep(2 ** attempt) + continue + except Exception as e: + last_error = f"Client error: {e}" + print(f" Error, retrying...") await asyncio.sleep(2 ** attempt) continue - raise LLMError(f"Unexpected status {e.status_code}: {e.message}") - except openai.APIError as e: - last_error = f"API error: {e}" - print(f" API error, retrying...") - await asyncio.sleep(2 ** attempt) - continue - except Exception as e: - last_error = f"Client error: {e}" - print(f" Error, retrying...") - await asyncio.sleep(2 ** attempt) - continue raise LLMTransientError(f"Max retries ({self.retries}) exceeded. Last error: {last_error}") diff --git a/scripts/changelog_tool/changelog_tool/llm/config.py b/scripts/changelog_tool/changelog_tool/llm/config.py index 5327103bd420..571683c63736 100644 --- a/scripts/changelog_tool/changelog_tool/llm/config.py +++ b/scripts/changelog_tool/changelog_tool/llm/config.py @@ -7,3 +7,4 @@ class LLMConfig(pydantic.BaseModel): max_user_prompt_length: int = 8000 include_diff: bool = True truncate_diff: bool = True + max_concurrent_requests: int = 5 diff --git a/scripts/changelog_tool/changelog_tool/report/command.py b/scripts/changelog_tool/changelog_tool/report/command.py index 073736d7b0d4..cf8da1034873 100644 --- a/scripts/changelog_tool/changelog_tool/report/command.py +++ b/scripts/changelog_tool/changelog_tool/report/command.py @@ -97,6 +97,11 @@ def _apply_overrides(commits: List[ClassifiedCommit], override_file: pathlib.Pat commit.to_changelog = override['to_changelog'] if 'changelog_line' in override: commit.changelog_line = override['changelog_line'] + if 'classification' in override: + try: + commit.classification = 
Classification(override['classification']) + except ValueError: + pass def _generate_changelog(commits: List[ClassifiedCommit], github_url: str) -> str: @@ -138,30 +143,72 @@ def _generate_changelog(commits: List[ClassifiedCommit], github_url: str) -> str lines.append(f"* {section_title}") lines.append("") - # Commit entries + # Group commits by component within each classification + component_groups: Dict[str, List[ClassifiedCommit]] = {} + commits_without_component = [] + for commit in section_commits: - line = f" * {commit.changelog_line}" + if commit.component: + if commit.component not in component_groups: + component_groups[commit.component] = [] + component_groups[commit.component].append(commit) + else: + commits_without_component.append(commit) + + # Generate entries for each component + for component in sorted(component_groups.keys()): + component_commits = component_groups[component] + lines.append(f" * {component}") + lines.append("") - # Add external contributor thanks - if commit.is_external: - author_name = _extract_author_name(commit.author) - line += f" Many thanks to {author_name} for the PR!" + for commit in component_commits: + short_sha = commit.sha[:8] + line = f" * {commit.changelog_line} " + + # Add external contributor thanks + if commit.is_external: + author_name = _extract_author_name(commit.author) + line += f" Many thanks to {author_name} for the PR!" + + lines.append(line) - lines.append(line) + lines.append("") - lines.append("") + # Generate entries for commits without component + if commits_without_component: + for commit in commits_without_component: + short_sha = commit.sha[:8] + line = f" * {commit.changelog_line} " + + # Add external contributor thanks + if commit.is_external: + author_name = _extract_author_name(commit.author) + line += f" Many thanks to {author_name} for the PR!" 
+ + lines.append(line) + + lines.append("") # Collect external contributors not in changelog - external_contributors_not_in_changelog = set() + # Group by author and collect their commit titles + external_contributors_not_in_changelog: Dict[str, List[str]] = {} for commit in commits: if commit.is_external and (commit.to_changelog is False or commit.to_changelog is None): author_name = _extract_author_name(commit.author) - external_contributors_not_in_changelog.add(author_name) + if author_name not in external_contributors_not_in_changelog: + external_contributors_not_in_changelog[author_name] = [] + external_contributors_not_in_changelog[author_name].append(commit.title) if external_contributors_not_in_changelog: lines.append("* Many thanks to:") - for contributor in sorted(external_contributors_not_in_changelog): - lines.append(f" * {contributor} for the contribution!") + for contributor in sorted(external_contributors_not_in_changelog.keys()): + titles = external_contributors_not_in_changelog[contributor] + if len(titles) == 1: + lines.append(f" * {contributor} for {titles[0]}!") + else: + lines.append(f" * {contributor} for:") + for title in titles: + lines.append(f" * {title}") lines.append("") return "\n".join(lines) From 1bd78a29299fe12728af7529e0ebfe9eb2968971 Mon Sep 17 00:00:00 2001 From: Ivan Skriabin Date: Wed, 10 Jun 2026 11:50:10 +0300 Subject: [PATCH 13/14] changelog-tool readme --- scripts/changelog_tool/README.md | 174 +++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 scripts/changelog_tool/README.md diff --git a/scripts/changelog_tool/README.md b/scripts/changelog_tool/README.md new file mode 100644 index 000000000000..3edc19efe209 --- /dev/null +++ b/scripts/changelog_tool/README.md @@ -0,0 +1,174 @@ +# Changelog Tool + +A tool for automatically generating changelogs from git commits using LLM analysis. 
+ +## Features + +- **Automatic commit classification**: Classifies commits into categories (breaking-change, feature, bug, optimization, refactor, minor, docs, unclear) +- **LLM-powered analysis**: Uses LLM to analyze commits and generate changelog entries +- **External contributor detection**: Identifies external contributors and generates acknowledgments +- **Component extraction**: Extracts component names from commit titles for better organization +- **Override support**: Allows manual override of classifications and changelog entries +- **State persistence**: Saves LLM analysis results to avoid reprocessing +- **Rate limiting**: Configurable rate limiting and concurrent request limits + +## Installation + +1. Ensure you have Python 3.10+ installed (the code uses `X | None` type annotations) +2. Install dependencies: +```bash +cd scripts/changelog_tool +python3 -m venv .venv +source .venv/bin/activate +python3 -m pip install -r requirements.txt +``` + +3. Set up environment variables: +```bash +export CHANGELOG_LLM_URL="https://your-llm-api.com/v1" +export CHANGELOG_LLM_API_KEY="your-api-key" +export CHANGELOG_LLM_MODEL="your-model-name" +``` + +## Configuration + +The tool is configured via `changelog.yaml`: + +```yaml +collect: + from_sha: # Starting commit SHA + to_sha: HEAD # Ending commit SHA (default: HEAD) + repo_path: ../.. # Path to the repository (default: ../..) 
+ core_team_patterns: # Patterns to identify core team members + - ".*@userver\\.tech" + - ".*@yandex-team\\.com" + +llm-config: + target_rps: 1 # Target requests per second + retries: 3 # Number of retry attempts + max_commits_per_batch: 10 # Maximum commits per LLM batch + max_user_prompt_length: 100000 # Maximum prompt length in characters + include_diff: true # Include diff in LLM prompt + truncate_diff: false # Truncate diff if too long + max_concurrent_requests: 5 # Maximum concurrent requests + +review: + github_url: "https://github.com/userver-framework/userver" + +report: + github_url: "https://github.com/userver-framework/userver" +``` + +## Usage + +### Step 1: Collect Commits + +Run the `collect` command to gather commits and analyze them: + +```bash +source .venv/bin/activate +./changelog-tool collect +``` + +The tool will: +1. Fetch commits from the specified range +2. Classify commits using heuristics +3. Send unclear commits to LLM for analysis +4. Save results to `.changelog/classified.json` + +**Important**: Run the `collect` command repeatedly until you see a message like: +``` +Found 10 commits, 10 already processed, 0 to process via LLM +``` + +This ensures all commits have been processed by the LLM. The tool uses state persistence to avoid reprocessing commits, so running it multiple times is safe and recommended for reliability. 
+ +### Step 2: Review and Override + +Run the `review` command to generate a review report: + +```bash +./changelog-tool review +``` + +This generates two files in `.changelog/`: +- `review_report.md`: A markdown report showing all commits with their classification status +- `override.yaml`: A commented YAML file for overriding classifications + +Review the report and uncomment/modify entries in `override.yaml` to override classifications: + +```yaml +# Example override.yaml +commit_sha_1: + to_changelog: true + changelog_line: "Added support for async LLM processing" + +commit_sha_2: + to_changelog: false + classification: "minor" +``` + +Feel free to leave `classification` or `changelog_line` empty; the LLM will fill them in at the next step. + +### Step 3: Generate Changelog + +Run the `report` command to generate the final changelog: + +```bash +./changelog-tool report +``` + +This will: +1. Load classified commits from `classified.json` +2. Apply overrides from `override.yaml` +3. Process commits needing LLM analysis with increased prompt size (1.5x) and diff truncation +4. Generate a formatted Markdown changelog grouped by classification and component +5. Save the changelog to `.changelog/changelog.md` + +**Important**: Run the `report` command repeatedly until you see a message like: +``` +Found 10 commits, 10 already processed, 0 to process via LLM +``` + +This ensures all commits that need LLM analysis have been processed. 
+ +## Output Format + +The generated changelog has the following structure: + +```markdown +* Breaking Change + * component1 + * changelog line 1 + * changelog line 2 + * changelog line without component + +* Feature + * component1 + * changelog line 3 + * changelog line without component + +* Optimization + * component2 + * changelog line 4 + +* Bug + * component1 + * changelog line 5 + +* Refactor + * component3 + * changelog line 6 + +* Minor + * changelog line 7 + +* Documentation + * changelog line 8 + +* Many thanks to: + * External Contributor 1 for commit title 1! + * External Contributor 2 for: + * commit title 2 + * commit title 3 +``` \ No newline at end of file From ca9f2963ea247c0f04d2e4bf0a6b55db2a8ec621 Mon Sep 17 00:00:00 2001 From: Ivan Skriabin Date: Wed, 10 Jun 2026 18:01:36 +0300 Subject: [PATCH 14/14] cleanup russian --- .../changelog_tool/collect/command.py | 2 +- .../changelog_tool/common/git.py | 10 ++-- .../changelog_tool/llm/client.py | 6 +-- .../changelog_tool/llm/exceptions.py | 4 +- .../changelog_tool/llm/processor.py | 46 +++++++++---------- .../changelog_tool/llm/state.py | 18 ++++---- 6 files changed, 43 insertions(+), 43 deletions(-) diff --git a/scripts/changelog_tool/changelog_tool/collect/command.py b/scripts/changelog_tool/changelog_tool/collect/command.py index 9e02ccb07adc..8275b1dc7d3a 100644 --- a/scripts/changelog_tool/changelog_tool/collect/command.py +++ b/scripts/changelog_tool/changelog_tool/collect/command.py @@ -95,7 +95,7 @@ def collect(config: Config) -> None: try: commit.classification = Classification(result.get("classification", "unclear")) except ValueError: - # Если LLM вернула неизвестную классификацию, оставляем UNCLEAR + # If LLM returned an unknown classification, keep UNCLEAR pass commit.to_changelog = result.get("to_changelog") diff --git a/scripts/changelog_tool/changelog_tool/common/git.py b/scripts/changelog_tool/changelog_tool/common/git.py index f240d2ae38f7..4c47c0605274 100644 --- 
a/scripts/changelog_tool/changelog_tool/common/git.py +++ b/scripts/changelog_tool/changelog_tool/common/git.py @@ -7,22 +7,22 @@ class GitError(Exception): - """Любая ошибка при работе с git.""" + """Any error when working with git.""" class FileChange(pydantic.BaseModel): path: str - old_path: str | None = None # None если файл не переименован + old_path: str | None = None # None if file is not renamed added_lines: int = 0 removed_lines: int = 0 class Commit(pydantic.BaseModel): sha: str - title: str # первая строка message - message: str # полный message + title: str # first line of message + message: str # full message author: str # "Name " - co_authors: list[str] # из "Co-authored-by:" + co_authors: list[str] # from "Co-authored-by:" changed_files: list[FileChange] total_added: int = 0 total_removed: int = 0 diff --git a/scripts/changelog_tool/changelog_tool/llm/client.py b/scripts/changelog_tool/changelog_tool/llm/client.py index 76525cba7d05..1f1a7610025a 100644 --- a/scripts/changelog_tool/changelog_tool/llm/client.py +++ b/scripts/changelog_tool/changelog_tool/llm/client.py @@ -14,14 +14,14 @@ class BaseLLMClient(ABC): @abstractmethod async def generate(self, prompt: str) -> str: """ - Асинхронно отправляет текстовый промпт в LLM и возвращает текстовый ответ. - Может выбрасывать LLMError или LLMTransientError. + Asynchronously sends a text prompt to the LLM and returns a text response. + May throw LLMError or LLMTransientError. 
""" pass @abstractmethod async def close(self): - """Закрывает ресурсы клиента.""" + """Closes client resources.""" pass diff --git a/scripts/changelog_tool/changelog_tool/llm/exceptions.py b/scripts/changelog_tool/changelog_tool/llm/exceptions.py index 3aa06096ec86..54bdf1a53562 100644 --- a/scripts/changelog_tool/changelog_tool/llm/exceptions.py +++ b/scripts/changelog_tool/changelog_tool/llm/exceptions.py @@ -1,7 +1,7 @@ class LLMError(Exception): - """Критическая ошибка LLM (например, неверный формат запроса, 400 Bad Request).""" + """Critical LLM error (e.g., invalid request format, 400 Bad Request).""" pass class LLMTransientError(LLMError): - """Временная ошибка LLM (например, 500, 503, таймаут или исчерпаны попытки ретраев).""" + """Transient LLM error (e.g., 500, 503, timeout or retries exhausted).""" pass diff --git a/scripts/changelog_tool/changelog_tool/llm/processor.py b/scripts/changelog_tool/changelog_tool/llm/processor.py index ac3bde54142b..f46bc65745ec 100644 --- a/scripts/changelog_tool/changelog_tool/llm/processor.py +++ b/scripts/changelog_tool/changelog_tool/llm/processor.py @@ -18,20 +18,20 @@ def __init__(self, config: LLMConfig, llm_client: BaseLLMClient, output_dir: Pat async def process_commits(self, commits: List[Commit]) -> Dict[str, Dict[str, Any]]: """ - Асинхронно обрабатывает список коммитов через LLM. - Возвращает словарь SHA -> dict с результатами (classification, changelog_line, detailed_commit_analysis). + Asynchronously processes a list of commits through the LLM. + Returns a dictionary SHA -> dict with results (classification, changelog_line, detailed_commit_analysis). 
""" - # Загружаем и очищаем стейт + # Load and clean state await self.state.load() valid_shas = {commit.sha for commit in commits} await self.state.cleanup(valid_shas) - # Фильтруем коммиты для обработки + # Filter commits for processing commits_to_process = [] results = {} for commit in commits: - # Проверяем стейт + # Check state result = await self.state.get_result(commit.sha) if result: results[commit.sha] = result @@ -43,23 +43,23 @@ async def process_commits(self, commits: List[Commit]) -> Dict[str, Dict[str, An if not commits_to_process: return results - # Разбиваем на батчи с учетом размера промпта + # Split into batches considering prompt size batches = self._create_smart_batches(commits_to_process) total_commits = sum(len(batch) for batch in batches) print(f"Processing {total_commits} commits in {len(batches)} batches...") - # Обрабатываем батчи параллельно + # Process batches in parallel batch_results = await asyncio.gather( *[self._process_batch(batch, i, len(batches), total_commits) for i, batch in enumerate(batches)], return_exceptions=True ) - # Собираем результаты + # Collect results completed_batches = 0 for batch_idx, batch_result in enumerate(batch_results): if isinstance(batch_result, Exception): - # Ошибки в батчах уже записаны в стейт, просто продолжаем + # Batch errors are already written to state, just continue continue completed_batches += 1 results.update(batch_result) @@ -69,17 +69,17 @@ async def process_commits(self, commits: List[Commit]) -> Dict[str, Dict[str, An return results async def _process_batch(self, batch: List[Commit], batch_idx: int = 0, total_batches: int = 0, total_commits: int = 0) -> Dict[str, Dict[str, Any]]: - """Обрабатывает один батч коммитов.""" + """Processes one batch of commits.""" try: print(f"[{batch_idx + 1}/{total_batches}] Processing {len(batch)} commits...") prompt = self._build_prompt(batch) - # Проверяем длину промпта + # Check prompt length if len(prompt) > self.config.max_user_prompt_length: if 
self.config.truncate_diff: prompt = self._truncate_prompt(prompt) else: - # Помечаем все коммиты батча как ошибочные + # Mark all batch commits as erroneous error_msg = f"Prompt too long ({len(prompt)} > {self.config.max_user_prompt_length})" for commit in batch: await self.state.set_error(commit.sha, error_msg) @@ -92,7 +92,7 @@ async def _process_batch(self, batch: List[Commit], batch_idx: int = 0, total_ba } for commit in batch } - # Отправляем в LLM + # Send to LLM response_text = await self.llm_client.generate(prompt) # Remove markdown code blocks if present @@ -104,17 +104,17 @@ async def _process_batch(self, batch: List[Commit], batch_idx: int = 0, total_ba response_text = response_text.strip()[:-3] # Remove trailing ``` response_text = response_text.strip() - # Парсим ответ + # Parse response try: response_data = json.loads(response_text) except json.JSONDecodeError as e: raise LLMError(f"LLM returned invalid JSON: {e}") - # Проверяем формат ответа + # Check response format if not isinstance(response_data, dict): raise LLMError("LLM returned invalid response format (not a dict)") - # Сохраняем результаты и возвращаем + # Save results and return results = {} for commit in batch: commit_data = response_data.get(commit.sha, {}) @@ -146,10 +146,10 @@ async def _process_batch(self, batch: List[Commit], batch_idx: int = 0, total_ba return results except LLMError: - # Критическая ошибка - пробрасываем дальше + # Critical error - re-raise raise except Exception as e: - # Временная ошибка или другая проблема - помечаем коммиты как ошибочные + # Temporary error or other problem - mark commits as erroneous error_msg = f"{type(e).__name__}: {str(e)}" print(f"✗ Batch {batch_idx + 1}/{total_batches} failed: {error_msg}") for commit in batch: @@ -197,7 +197,7 @@ def _create_smart_batches(self, commits: List[Commit]) -> List[List[Commit]]: return batches def _estimate_commit_size(self, commit: Commit) -> int: - """Оценивает размер промпта для одного коммита.""" + 
"""Estimates the prompt size for one commit.""" size = len(commit.sha) + len(commit.title) + len(commit.message) size += len(', '.join(f.path for f in commit.changed_files)) @@ -205,10 +205,10 @@ def _estimate_commit_size(self, commit: Commit) -> int: diff = get_commit_diff(commit) size += len(diff) - return size + 200 # запас на JSON форматирование и разделители + return size + 200 # reserve for JSON formatting and separators def _build_prompt(self, commits: List[Commit]) -> str: - """Формирует промпт для батча коммитов.""" + """Forms a prompt for a batch of commits.""" system_prompt = """You are an expert software engineer analyzing git commits for a changelog. Your task is to analyze commits since the last release and highlight important and interesting changes. Ignore simple bugfixes, typos, and minor refactoring. @@ -273,8 +273,8 @@ def _build_prompt(self, commits: List[Commit]) -> str: return f"{system_prompt}\n\n{user_prompt}" def _truncate_prompt(self, prompt: str) -> str: - """Обрезает промпт до допустимой длины.""" - # Простая обрезка - в реальности может потребоваться более умная логика + """Truncates the prompt to the allowed length.""" + # Simple truncation - in reality, smarter logic may be needed if len(prompt) <= self.config.max_user_prompt_length: return prompt return prompt[:self.config.max_user_prompt_length] \ No newline at end of file diff --git a/scripts/changelog_tool/changelog_tool/llm/state.py b/scripts/changelog_tool/changelog_tool/llm/state.py index 2295c2e6e35c..3bdb702a0279 100644 --- a/scripts/changelog_tool/changelog_tool/llm/state.py +++ b/scripts/changelog_tool/changelog_tool/llm/state.py @@ -10,13 +10,13 @@ def __init__(self, state_file_path: Path): self.lock = asyncio.Lock() async def load(self) -> None: - """Асинхронно загружает состояние из файла.""" + """Asynchronously loads state from file.""" async with self.lock: if self.state_file_path.exists(): try: with open(self.state_file_path, 'r', encoding='utf-8') as f: loaded_state 
= json.load(f) - # Убедимся, что состояние имеет правильный формат + # Ensure the state has the correct format if isinstance(loaded_state, dict): self.state = loaded_state else: @@ -28,11 +28,11 @@ async def load(self) -> None: self.state = {} async def save(self) -> None: - """Асинхронно сохраняет состояние в файл.""" - # Создаем директорию если её нет + """Asynchronously saves state to file.""" + # Create directory if it doesn't exist self.state_file_path.parent.mkdir(parents=True, exist_ok=True) - # Атомарная запись через временный файл + # Atomic write through temporary file temp_file = self.state_file_path.with_suffix('.tmp') try: with open(temp_file, 'w', encoding='utf-8') as f: @@ -44,7 +44,7 @@ async def save(self) -> None: temp_file.unlink() async def cleanup(self, valid_shas: Set[str]) -> None: - """Удаляет из стейта коммиты, не попавшие в текущую выборку.""" + """Removes from state commits that are not in the current selection.""" async with self.lock: keys_to_remove = set(self.state.keys()) - valid_shas for key in keys_to_remove: @@ -53,7 +53,7 @@ async def cleanup(self, valid_shas: Set[str]) -> None: await self.save() async def get_result(self, sha: str) -> Optional[Dict[str, Any]]: - """Возвращает результат анализа коммита, если он есть и не содержит ошибки.""" + """Returns the commit analysis result if it exists and contains no errors.""" async with self.lock: commit_data = self.state.get(sha) if commit_data and commit_data.get("error") is None: @@ -61,7 +61,7 @@ async def get_result(self, sha: str) -> Optional[Dict[str, Any]]: return None async def set_result(self, sha: str, classification: str, changelog_line: str, detailed_commit_analysis: str, to_changelog: bool = False) -> int: - """Сохраняет успешный результат классификации. Возвращает количество готовых коммитов.""" + """Saves successful classification result. 
Returns the number of completed commits.""" async with self.lock: self.state[sha] = { "classification": classification, @@ -75,7 +75,7 @@ async def set_result(self, sha: str, classification: str, changelog_line: str, d return completed async def set_error(self, sha: str, error_message: str) -> None: - """Сохраняет ошибку классификации.""" + """Saves classification error.""" async with self.lock: self.state[sha] = { "classification": "unclear",