diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ab8d46..c57c6d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,9 +16,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [NextRelease] -### Header +### Added + +- **Placeholder-Based Line Numbering**: Implemented a new placeholder system (`OFFSET_PLACEHOLDER`) for calculating line numbers in generated headers. This ensures deterministic line numbering by first generating headers with placeholders, then calculating the actual offset based on the final header size, and finally replacing placeholders with accurate line numbers. + +### Fixed + +- **Deterministic Line Numbering**: Completely reworked the line number calculation algorithm to ensure consistent and accurate line numbers across multiple runs. The new approach uses a two-pass system that measures rather than guesses header sizes, ensuring deterministic results. +- **Docstring Merging Logic**: Improved the logic for merging manual docstrings with auto-generated content. Manual content is now properly preserved and formatted without adding unnecessary newlines or reformatting. +- **Multi-Pass Docstring Removal**: Enhanced the `remove_agent_docstring` function to handle complex scenarios where multiple agent-generated docstrings might be present, preventing duplication issues on repeated runs. +- **Whitespace Normalization**: Added proper whitespace handling in content comparison to prevent unnecessary file modifications when only minor whitespace differences exist. +- **Manual Content Extraction**: Improved the extraction of manual docstring content by properly handling the inner content within triple quotes, preserving original formatting and indentation. + +### Testing -- **subtitle**: describtion +- **New Test Cases**: Added comprehensive test cases for placeholder usage, short manual docstring handling, and merged multiline docstring preservation to ensure the new algorithms work correctly across various scenarios. ## [1.3.4] diff --git a/agent_docstrings/core.py b/agent_docstrings/core.py index 64f84d4..8750545 100644 --- a/agent_docstrings/core.py +++ b/agent_docstrings/core.py @@ -1,582 +1,627 @@ -from __future__ import annotations -""" - --- AUTO-GENERATED DOCSTRING --- - Table of content is automatically generated by Agent Docstrings v1.3.0 - - Classes/Functions: - - parse_gitignore(gitignore_path: Path) -> Set[str] (line 53) - - is_path_ignored(path: Path, ignore_patterns: Set[str], root_dir: Path) -> bool (line 76) - - load_blacklist_whitelist(directory: Path) -> Tuple[Set[str], Set[str]] (line 100) - - should_process_file(file_path: Path, root_dir: Path, ignore_patterns: Set[str], blacklist: Set[str], whitelist: Set[str]) -> bool (line 135) - - _get_header_content_lines(classes: List[ClassInfo], functions: List[SignatureInfo], language: str, line_offset: int) -> List[str] (line 205) - - _format_header(classes: List[ClassInfo], functions: List[SignatureInfo], language: str, line_offset: int) -> str (line 246) - - get_preserved_header_end_line(lines: List[str], language: str) -> int (line 266) - - process_file(path: Path, verbose: bool = False, beta: bool = False) -> None (line 348) - - discover_and_process_files(paths: List[str], verbose: bool = False, beta: bool = False) -> None (line 474) - --- END AUTO-GENERATED DOCSTRING --- -""" -import os -import fnmatch -from pathlib import Path -from typing import List, Callable, Dict, Tuple, Set -import re - -from . 
import __version__ -# * Template for the auto-generated header line -DOCSTRING_HEADER_TEMPLATE = "Table of content is automatically generated by Agent Docstrings v{version}" -from .languages.common import ( - COMMENT_STYLES, - ClassInfo, - SignatureInfo, - remove_agent_docstring, - DOCSTRING_START_MARKER, - DOCSTRING_END_MARKER, -) -from .languages import generic, kotlin, python, java, go, powershell, delphi - -DEFAULT_IGNORE_DIRS = { - ".git", - ".github", - ".idea", - ".vscode", - ".venv", - "venv", - "__pycache__", - "node_modules", - "build", - "dist", - "target", - "bin", - "obj", -} - - -def parse_gitignore(gitignore_path: Path) -> Set[str]: - """Parse .gitignore file and return set of ignore patterns. - - Args: - gitignore_path (Path): Path to .gitignore file. - - Returns: - Set[str]: Set of ignore patterns from .gitignore file. - """ - patterns = set() - if not gitignore_path.is_file(): - return patterns - try: - with gitignore_path.open("r", encoding="utf-8", errors="ignore") as f: - for line in f: - line = line.strip() - if line and not line.startswith("#"): - patterns.add(line) - except PermissionError: - raise - except Exception: - pass - return patterns - - -def is_path_ignored(path: Path, ignore_patterns: Set[str], root_dir: Path) -> bool: - """Check if a path should be ignored based on gitignore patterns. - - Args: - path (Path): Path to check. - ignore_patterns (Set[str]): Set of ignore patterns. - root_dir (Path): Root directory for relative path calculation. - - Returns: - bool: True if path should be ignored, False otherwise. - """ - try: - rel_path = path.relative_to(root_dir).as_posix() - for pattern in ignore_patterns: - if pattern.endswith("/") and path.is_dir(): - if fnmatch.fnmatch(rel_path + "/", pattern): - return True - elif fnmatch.fnmatch(rel_path, pattern.rstrip("/")): - return True - except ValueError: - pass - return False - - -def load_blacklist_whitelist(directory: Path) -> Tuple[Set[str], Set[str]]: - """Load blacklist and whitelist patterns from configuration files. - - Args: - directory (Path): Directory to search for configuration files. - - Returns: - Tuple[Set[str], Set[str]]: Tuple of (blacklist patterns, whitelist patterns). - """ - blacklist, whitelist = set(), set() - blacklist_file = directory / ".agent-docstrings-ignore" - whitelist_file = directory / ".agent-docstrings-include" - if blacklist_file.exists(): - try: - with blacklist_file.open("r", encoding="utf-8", errors="ignore") as f: - blacklist.update( - line.strip() - for line in f - if line.strip() and not line.startswith("#") - ) - except PermissionError: - raise - except Exception: - pass - if whitelist_file.exists(): - try: - with whitelist_file.open("r", encoding="utf-8", errors="ignore") as f: - whitelist.update( - line.strip() - for line in f - if line.strip() and not line.startswith("#") - ) - except PermissionError: - raise - except Exception: - pass - return blacklist, whitelist - - -def should_process_file( - file_path: Path, - root_dir: Path, - ignore_patterns: Set[str], - blacklist: Set[str], - whitelist: Set[str], -) -> bool: - """Determine if a file should be processed based on ignore patterns and lists. - - Args: - file_path (Path): Path to the file. - root_dir (Path): Root directory for relative path calculation. - ignore_patterns (Set[str]): Gitignore patterns. - blacklist (Set[str]): Additional blacklist patterns. - whitelist (Set[str]): Whitelist patterns (if not empty, only these files are processed). 
- - Returns: - bool: True if file should be processed, False otherwise. - """ - rel_path_str = file_path.relative_to(root_dir).as_posix() - if whitelist and not any( - fnmatch.fnmatch(rel_path_str, pattern) for pattern in whitelist - ): - return False - if any(fnmatch.fnmatch(rel_path_str, pattern) for pattern in blacklist): - return False - if is_path_ignored(file_path, ignore_patterns, root_dir): - return False - return True - - -# Mappings from file extension to language name and parser function -EXT_TO_LANG: Dict[str, str] = { - ".py": "python", - ".kt": "kotlin", - ".js": "javascript", - ".jsx": "javascript", - ".ts": "typescript", - ".tsx": "typescript", - ".cs": "csharp", - ".cpp": "cpp", - ".cxx": "cpp", - ".cc": "cpp", - ".hpp": "cpp", - ".h": "cpp", - ".c": "c", - ".java": "java", - ".go": "go", - ".ps1": "powershell", - ".psm1": "powershell", - ".pas": "delphi", -} - -LANG_PARSERS: Dict[ - str, Callable[[List[str]], Tuple[List[ClassInfo], List[SignatureInfo]]] -] = { - "python": python.parse_python_file, - "kotlin": kotlin.parse_kotlin_file, - "javascript": lambda lines: generic.parse_generic_file(lines, "javascript"), - "typescript": lambda lines: generic.parse_generic_file(lines, "typescript"), - "csharp": lambda lines: generic.parse_generic_file(lines, "csharp"), - "cpp": lambda lines: generic.parse_generic_file(lines, "cpp"), - "c": lambda lines: generic.parse_generic_file(lines, "cpp"), # C can be parsed like C++ (for functions) - "java": java.parse_java_file, - "go": go.parse_go_file, - "powershell": powershell.parse_powershell_file, - "delphi": delphi.parse_delphi_file, -} - - -def _get_header_content_lines( - classes: List[ClassInfo], - functions: List[SignatureInfo], - language: str, - line_offset: int, -) -> List[str]: - """Return a list of lines for the header content.""" - style = COMMENT_STYLES[language] - lines = [ - f"{style.prefix}{DOCSTRING_START_MARKER}", - f"{style.prefix}{DOCSTRING_HEADER_TEMPLATE.format(version=__version__)}", - f"{style.prefix}", - f"{style.prefix}Classes/Functions:", - ] - - def format_class(ci: ClassInfo, indent: str): - lines.append(f"{indent}- {ci.name} (line {ci.line + line_offset}):") - for m in sorted(ci.methods, key=lambda x: x.line): - lines.append(f"{indent}{style.indent}- {m.signature} (line {m.line + line_offset})") - for inner_ci in sorted(ci.inner_classes, key=lambda x: x.line): - format_class(inner_ci, indent + style.indent) - - # Combine classes and functions into a single list of top-level items - top_level_items = sorted( - classes + functions, key=lambda item: item.line - ) - - item_prefix = f"{style.prefix}{style.indent}" - for item in top_level_items: - if isinstance(item, ClassInfo): - # ! 
Pass the correct prefix for top-level classes - format_class(item, item_prefix) - elif isinstance(item, SignatureInfo): - lines.append( - f"{item_prefix}- {item.signature} (line {item.line + line_offset})" - ) - - lines.append(f"{style.prefix}{DOCSTRING_END_MARKER}") - return lines - - -def _format_header( - classes: List[ClassInfo], - functions: List[SignatureInfo], - language: str, - line_offset: int, -) -> str: - """Return a formatted header block for *language*.""" - style = COMMENT_STYLES[language] - content_lines = _get_header_content_lines( - classes, functions, language, line_offset - ) - header_parts = [] - if style.start: - header_parts.append(style.start) - header_parts.extend(content_lines) - if style.end: - header_parts.append(style.end) - return "\n".join(header_parts) - - -def get_preserved_header_end_line(lines: List[str], language: str) -> int: - """Determines the number of lines to preserve at the start of a file.""" - if not lines: - return 0 - - # ! Check if file starts with an agent-generated docstring - # * If so, don't preserve any header lines - let remove_agent_docstring handle it - if lines and language != "python": - style = COMMENT_STYLES.get(language) - if style and lines[0].strip() == style.start.strip(): - # * Look for the docstring start marker in the next few lines - for i in range(min(5, len(lines))): - if DOCSTRING_START_MARKER in lines[i]: - return 0 # Don't preserve any lines - this is an agent docstring - - if language == "python": - header_end = 0 - if not lines: - return 0 - - # ! CRITICAL: If the file starts with an agent docstring, preserve nothing - # * This prevents __future__ imports from being placed after docstrings - if lines[0].strip().startswith('"""') or lines[0].strip().startswith("'''"): - # Check if this is an agent-generated docstring - for i in range(min(5, len(lines))): - if DOCSTRING_START_MARKER in lines[i]: - return 0 # Don't preserve any lines - this is an agent docstring - - # 1. Preserve shebang - if lines[header_end].startswith("#!"): - header_end += 1 - - # 2. Preserve encoding declaration - if len(lines) > header_end and re.match( - r"^[ \t\f]*#.*?coding[:=]", lines[header_end] - ): - header_end += 1 - - # 3. Preserve future imports - while ( - len(lines) > header_end - and lines[header_end].strip().startswith("from __future__ import") - ): - header_end += 1 - - # 4. Preserve empty lines immediately after __future__ imports - while ( - len(lines) > header_end - and lines[header_end].strip() == "" - ): - header_end += 1 - - # 5. If the next line is an agent docstring, we don't need to preserve it. - # The line right after the header section should be checked. - if len(lines) > header_end: - line = lines[header_end].strip() - if line.startswith('"""') or line.startswith("'''"): - # Check next few lines for the marker - for i in range(header_end, min(header_end + 5, len(lines))): - if DOCSTRING_START_MARKER in lines[i]: - # This is an agent docstring. The header is everything before it. 
- return header_end - return header_end - if language == "go": - for i, line in enumerate(lines): - if line.strip().startswith("package "): - return i + 1 - return 0 - # General check for JS, TS, C#, C++, Java, Kotlin - in_block_comment = False - for i, line in enumerate(lines): - stripped = line.strip() - - if in_block_comment: - if "*/" in stripped: - in_block_comment = False - continue - - if stripped.startswith("/*"): - if "*/" not in stripped: - in_block_comment = True - continue - - if ( - stripped.startswith("//") - or stripped.startswith("import ") - or stripped.startswith("using ") - or stripped.startswith("package ") - ): - continue - - # If we're not in a block comment and the line is not a recognized - # header element, then the header is over. This includes empty lines. - return i - - return len(lines) - - -def process_file(path: Path, verbose: bool = False, beta: bool = False) -> None: - """Generate or refresh the header comment for *path*.""" - ext = path.suffix.lower() - if ext not in EXT_TO_LANG: - return - language = EXT_TO_LANG[ext] - parser = LANG_PARSERS.get(language) - if not parser: - return - try: - original_content = path.read_text(encoding="utf-8", errors="ignore") - if not original_content.strip(): - return - # * Skip regeneration when only generator version changed in header - lines = original_content.split('\n') - header_end_line = get_preserved_header_end_line(lines, language) - file_prefix = "\n".join(lines[:header_end_line]) - code_body = "\n".join(lines[header_end_line:]) - cleaned_body = remove_agent_docstring(code_body, language) - - classes, functions = parser(cleaned_body.splitlines()) - if not classes and not functions: - # If all that was done was removing a docstring, write the cleaned content back - if cleaned_body != code_body: - path.write_text( - (file_prefix + "\n" + cleaned_body).lstrip(), - encoding="utf-8", - ) - return - - # ! Calculate the correct line offset for the final positions - # * To ensure deterministic line numbers, we need to calculate the offset - # * based on the final file structure, not the intermediate state - - # * First, determine how many lines will be in the final header - temp_header = _format_header(classes, functions, language, 0) - temp_header_line_count = len(temp_header.splitlines()) - - # * Calculate total offset: preserved header lines + generated header lines - # * This represents where the cleaned body will start in the final file - line_offset = header_end_line + temp_header_line_count - - # ! 
Language-specific adjustments for line numbering - if language == "go": - # * Go has a special case where the offset needs to be reduced by 1 - line_offset -= 1 - elif language == "python" and header_end_line > 0: - # * Python with preserved headers (shebang/encoding) needs adjustment - line_offset -= 1 - - # * Now create the final header with correct line numbers - final_header = _format_header(classes, functions, language, line_offset) - - # Attempt to merge auto-generated header into existing manual docstring for Python - merged_body = None - if language == "python": - # Split cleaned body into lines - body_lines = cleaned_body.splitlines() - # Find first non-empty line - idx = 0 - while idx < len(body_lines) and body_lines[idx].strip() == "": - idx += 1 - # Check for manual docstring start - if idx < len(body_lines) and body_lines[idx].strip().startswith(('"""', "'''")): - delim_line = body_lines[idx].strip() - # Ensure it's not an existing auto-generated docstring - marker_present = False - for i in range(idx, min(idx + 5, len(body_lines))): - if DOCSTRING_START_MARKER in body_lines[i]: - marker_present = True - break - if not marker_present: - # Find end of manual docstring - end_idx = None - manual_inner = [] - delim = None - - delim_quotes = '"""' if delim_line.startswith('"""') else "'''" - is_single_line = delim_line.endswith(delim_quotes) and delim_line != delim_quotes - - if is_single_line: - end_idx = idx - content_part = delim_line[len(delim_quotes):-len(delim_quotes)] - if content_part: - manual_inner = [content_part] - delim = delim_quotes - else: - # Multi-line docstring - delim = delim_line - for j in range(idx + 1, len(body_lines)): - if body_lines[j].strip() == delim: - end_idx = j - break - if end_idx is not None: - manual_inner = body_lines[idx + 1:end_idx] - - if end_idx is not None: - # Compute auto header content lines with correct offset for merge - # temp_header_line_count holds the auto header line count including delimiters - # content_lines length is temp_header_line_count minus start/end markers - offset_override = temp_header_line_count - 2 - # Generate only the header content lines (without triple-quote delimiters) - header_inner = _get_header_content_lines( - classes, functions, language, offset_override - ) - merged_lines = [] - # Preserve leading blank lines before manual docstring - merged_lines.extend(body_lines[:idx]) - # Start merged docstring with manual delimiter - merged_lines.append(delim) - # Insert auto-generated header content - merged_lines.extend(header_inner) - # Insert original manual docstring content - merged_lines.extend(manual_inner) - # Close merged docstring with manual delimiter - merged_lines.append(delim) - # Append rest of body after original docstring - merged_lines.extend(body_lines[end_idx + 1:]) - merged_body = "\n".join(merged_lines) - if merged_body is not None: - if file_prefix: - new_content = file_prefix + "\n" + merged_body.lstrip("\n") - else: - new_content = merged_body.lstrip("\n") - else: - # Default behavior: insert separate docstring - new_content_parts = [] - if file_prefix: - new_content_parts.append(file_prefix) - new_content_parts.append(final_header) - new_content_parts.append(cleaned_body.lstrip()) - # Use single newlines to test composition theory - new_content = "\n".join(filter(None, new_content_parts)) - - def normalize_version(text: str) -> str: - """Replaces the version string in a docstring with a placeholder.""" - return re.sub( - r"(Table of content is automatically generated by Agent Docstrings 
v)[\d\.]+\w*", - r"\1[VERSION]", - text, - ) - - # To avoid rewriting files just for a version bump, we compare the content - # with the version number normalized. - normalized_original = normalize_version(original_content) - normalized_new = normalize_version(new_content) - - if normalized_original != normalized_new: - path.write_text(new_content, encoding="utf-8") - if verbose: - print(f"Processed {language.capitalize()}: {path}") - elif verbose: - # ! Provide verbose output even when no changes are made - print(f"No changes for {language.capitalize()}: {path}") - except Exception as e: - print(f"Error processing {path}: {e}") - - -def discover_and_process_files(paths: List[str], verbose: bool = False, beta: bool = False) -> None: - """Recursively process all supported files inside *paths*. - - Args: - paths (List[str]): White-list of root folders or files to scan. - verbose (bool, optional): Enables per-file logging when *True*. - beta (bool, optional): Enables experimental beta features. - """ - files_to_process = [] - - for p_str in paths: - try: - path = Path(p_str).resolve() - if not path.exists(): - print(f"Warning: '{p_str}' is not a valid path. Skipping.") - continue - - if path.is_dir(): - # Collect all gitignore patterns from the directory tree - ignore_patterns = set() - current_dir = path - while current_dir != current_dir.parent: - gitignore_path = current_dir / '.gitignore' - if gitignore_path.exists(): - ignore_patterns.update(parse_gitignore(gitignore_path)) - current_dir = current_dir.parent - - # Load blacklist and whitelist from the root directory - blacklist_patterns, whitelist_patterns = load_blacklist_whitelist(path) - - for root, dirs, files in os.walk(path): - root_path = Path(root) - - # Filter directories to avoid walking into ignored ones - dirs[:] = [d for d in dirs if d not in DEFAULT_IGNORE_DIRS and not is_path_ignored(root_path / d, ignore_patterns, path)] - - for file in files: - file_path = root_path / file - - # Check if file should be processed - if not should_process_file(file_path, path, ignore_patterns, - blacklist_patterns, whitelist_patterns): - continue - - files_to_process.append(file_path) - elif path.is_file(): - files_to_process.append(path) - except PermissionError: - print(f"Warning: Could not read configuration (e.g., .gitignore) in '{p_str}' due to a permission error. 
Skipping path to ensure no unintended files are modified.") - continue - - # Process all collected files - for file_path in sorted(list(set(files_to_process))): +from __future__ import annotations +""" + --- AUTO-GENERATED DOCSTRING --- + Table of content is automatically generated by Agent Docstrings v1.3.4 + + Classes/Functions: + - parse_gitignore(gitignore_path: Path) -> Set[str] (line 57) + - is_path_ignored(path: Path, ignore_patterns: Set[str], root_dir: Path) -> bool (line 82) + - load_blacklist_whitelist(directory: Path) -> Tuple[Set[str], Set[str]] (line 106) + - should_process_file(file_path: Path, root_dir: Path, ignore_patterns: Set[str], blacklist: Set[str], whitelist: Set[str]) -> bool (line 145) + - _get_header_content_lines(classes: List[ClassInfo], functions: List[SignatureInfo], language: str, line_offset: int, manual_content: str | None = None, placeholder: bool = False) -> List[str] (line 215) + - _format_header(classes: List[ClassInfo], functions: List[SignatureInfo], language: str, line_offset: int, manual_content: str | None = None, placeholder: bool = False) -> str (line 292) + - get_preserved_header_end_line(lines: List[str], language: str) -> int (line 314) + - _extract_docstring_and_code(code_body: str, language: str) -> tuple[str | None, str] (line 412) + - process_file(path: Path, verbose: bool = False, beta: bool = False) -> None (line 449) + - discover_and_process_files(paths: List[str], verbose: bool = False, beta: bool = False) -> None (line 574) + --- END AUTO-GENERATED DOCSTRING --- +""" +import os +import fnmatch +from pathlib import Path +from typing import List, Callable, Dict, Tuple, Set +import re + +from . import __version__ +# * Template for the auto-generated header line +DOCSTRING_HEADER_TEMPLATE = "Table of content is automatically generated by Agent Docstrings v{version}" +# * Placeholder prefix for line number offset in header generation +OFFSET_PLACEHOLDER = "OFFSET" +from .languages.common import ( + COMMENT_STYLES, + ClassInfo, + SignatureInfo, + remove_agent_docstring, + DOCSTRING_START_MARKER, + DOCSTRING_END_MARKER, +) +from .languages import generic, kotlin, python, java, go, powershell, delphi + +DEFAULT_IGNORE_DIRS = { + ".git", + ".github", + ".idea", + ".vscode", + ".venv", + "venv", + "__pycache__", + "node_modules", + "build", + "dist", + "target", + "bin", + "obj", +} + + +def parse_gitignore(gitignore_path: Path) -> Set[str]: + """Parse .gitignore file and return set of ignore patterns. + + Args: + gitignore_path (Path): Path to .gitignore file. + + Returns: + Set[str]: Set of ignore patterns from .gitignore file. + """ + patterns = set() + if not gitignore_path.is_file(): + return patterns + try: + with gitignore_path.open("r", encoding="utf-8", errors="ignore") as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + patterns.add(line) + except PermissionError: + raise + except Exception: + pass + return patterns + + +def is_path_ignored(path: Path, ignore_patterns: Set[str], root_dir: Path) -> bool: + """Check if a path should be ignored based on gitignore patterns. + + Args: + path (Path): Path to check. + ignore_patterns (Set[str]): Set of ignore patterns. + root_dir (Path): Root directory for relative path calculation. + + Returns: + bool: True if path should be ignored, False otherwise. 
+ """ + try: + rel_path = path.relative_to(root_dir).as_posix() + for pattern in ignore_patterns: + if pattern.endswith("/") and path.is_dir(): + if fnmatch.fnmatch(rel_path + "/", pattern): + return True + elif fnmatch.fnmatch(rel_path, pattern.rstrip("/")): + return True + except ValueError: + pass + return False + + +def load_blacklist_whitelist(directory: Path) -> Tuple[Set[str], Set[str]]: + """Load blacklist and whitelist patterns from configuration files. + + Args: + directory (Path): Directory to search for configuration files. + + Returns: + Tuple[Set[str], Set[str]]: Tuple of (blacklist patterns, whitelist patterns). + """ + blacklist, whitelist = set(), set() + blacklist_file = directory / ".agent-docstrings-ignore" + whitelist_file = directory / ".agent-docstrings-include" + if blacklist_file.exists(): + try: + with blacklist_file.open("r", encoding="utf-8", errors="ignore") as f: + blacklist.update( + line.strip() + for line in f + if line.strip() and not line.startswith("#") + ) + except PermissionError: + raise + except Exception: + pass + if whitelist_file.exists(): + try: + with whitelist_file.open("r", encoding="utf-8", errors="ignore") as f: + whitelist.update( + line.strip() + for line in f + if line.strip() and not line.startswith("#") + ) + except PermissionError: + raise + except Exception: + pass + return blacklist, whitelist + + +def should_process_file( + file_path: Path, + root_dir: Path, + ignore_patterns: Set[str], + blacklist: Set[str], + whitelist: Set[str], +) -> bool: + """Determine if a file should be processed based on ignore patterns and lists. + + Args: + file_path (Path): Path to the file. + root_dir (Path): Root directory for relative path calculation. + ignore_patterns (Set[str]): Gitignore patterns. + blacklist (Set[str]): Additional blacklist patterns. + whitelist (Set[str]): Whitelist patterns (if not empty, only these files are processed). + + Returns: + bool: True if file should be processed, False otherwise. 
+ """ + rel_path_str = file_path.relative_to(root_dir).as_posix() + if whitelist and not any( + fnmatch.fnmatch(rel_path_str, pattern) for pattern in whitelist + ): + return False + if any(fnmatch.fnmatch(rel_path_str, pattern) for pattern in blacklist): + return False + if is_path_ignored(file_path, ignore_patterns, root_dir): + return False + return True + + +# Mappings from file extension to language name and parser function +EXT_TO_LANG: Dict[str, str] = { + ".py": "python", + ".kt": "kotlin", + ".js": "javascript", + ".jsx": "javascript", + ".ts": "typescript", + ".tsx": "typescript", + ".cs": "csharp", + ".cpp": "cpp", + ".cxx": "cpp", + ".cc": "cpp", + ".hpp": "cpp", + ".h": "cpp", + ".c": "c", + ".java": "java", + ".go": "go", + ".ps1": "powershell", + ".psm1": "powershell", + ".pas": "delphi", +} + +LANG_PARSERS: Dict[ + str, Callable[[List[str]], Tuple[List[ClassInfo], List[SignatureInfo]]] +] = { + "python": python.parse_python_file, + "kotlin": kotlin.parse_kotlin_file, + "javascript": lambda lines: generic.parse_generic_file(lines, "javascript"), + "typescript": lambda lines: generic.parse_generic_file(lines, "typescript"), + "csharp": lambda lines: generic.parse_generic_file(lines, "csharp"), + "cpp": lambda lines: generic.parse_generic_file(lines, "cpp"), + "c": lambda lines: generic.parse_generic_file(lines, "cpp"), # C can be parsed like C++ (for functions) + "java": java.parse_java_file, + "go": go.parse_go_file, + "powershell": powershell.parse_powershell_file, + "delphi": delphi.parse_delphi_file, +} + + +def _get_header_content_lines( + classes: List[ClassInfo], + functions: List[SignatureInfo], + language: str, + line_offset: int, + manual_content: str | None = None, + placeholder: bool = False, +) -> List[str]: + """Return a list of lines for the header content.""" + style = COMMENT_STYLES[language] + + blank_line = style.prefix + if blank_line.strip(): # True for any prefix with non-whitespace chars + blank_line = blank_line.rstrip() + + lines = [ + f"{style.prefix}{DOCSTRING_START_MARKER}", + f"{style.prefix}{DOCSTRING_HEADER_TEMPLATE.format(version=__version__)}", + blank_line, + f"{style.prefix}Classes/Functions:", + ] + + def format_class(ci: ClassInfo, indent: str): + if placeholder: + offset_expr = f"{OFFSET_PLACEHOLDER}+{ci.line}" + else: + offset_expr = str(ci.line + line_offset) + lines.append(f"{indent}- {ci.name} (line {offset_expr}):") + for m in sorted(ci.methods, key=lambda x: x.line): + if placeholder: + m_offset_expr = f"{OFFSET_PLACEHOLDER}+{m.line}" + else: + m_offset_expr = str(m.line + line_offset) + lines.append(f"{indent}{style.indent}- {m.signature} (line {m_offset_expr})") + for inner_ci in sorted(ci.inner_classes, key=lambda x: x.line): + format_class(inner_ci, indent + style.indent) + + # Combine classes and functions into a single list of top-level items + top_level_items = sorted( + classes + functions, key=lambda item: item.line + ) + + item_prefix = f"{style.prefix}{style.indent}" + for item in top_level_items: + if isinstance(item, ClassInfo): + # ! Pass the correct prefix for top-level classes + format_class(item, item_prefix) + elif isinstance(item, SignatureInfo): + if placeholder: + f_offset_expr = f"{OFFSET_PLACEHOLDER}+{item.line}" + else: + f_offset_expr = str(item.line + line_offset) + lines.append( + f"{item_prefix}- {item.signature} (line {f_offset_expr})" + ) + + lines.append(f"{style.prefix}{DOCSTRING_END_MARKER}") + + # If there is manual content, add it after the agent block. 
+ if manual_content: + # Add a separator line if the style prefix is more than just whitespace + if language == "python": + if manual_content.strip(): + lines.append(style.prefix.rstrip()) # Add a blank line separator + # Strip leading newlines from manual content to avoid double empty lines + lines.extend(manual_content.lstrip('\n').splitlines()) + else: + if style.prefix.strip(): + lines.append(style.prefix.rstrip()) # Add a clean separator like ' *' + # Strip leading newlines from manual content to avoid double empty lines + lines.extend( + f"{style.prefix}{line}" for line in manual_content.lstrip('\n').splitlines() + ) + + return lines + + +def _format_header( + classes: List[ClassInfo], + functions: List[SignatureInfo], + language: str, + line_offset: int, + manual_content: str | None = None, + placeholder: bool = False, +) -> str: + """Return a formatted header block for *language*.""" + style = COMMENT_STYLES[language] + content_lines = _get_header_content_lines( + classes, functions, language, line_offset, manual_content, placeholder + ) + header_parts = [] + if style.start: + header_parts.append(style.start) + header_parts.extend(content_lines) + if style.end: + header_parts.append(style.end) + return "\n".join(header_parts) + + +def get_preserved_header_end_line(lines: List[str], language: str) -> int: + """Determines the number of lines to preserve at the start of a file.""" + if not lines: + return 0 + + # ! Check if file starts with an agent-generated docstring + # * If so, don't preserve any header lines - let remove_agent_docstring handle it + if lines and language != "python": + style = COMMENT_STYLES.get(language) + if style and lines[0].strip() == style.start.strip(): + # * Look for the docstring start marker in the next few lines + for i in range(min(5, len(lines))): + if DOCSTRING_START_MARKER in lines[i]: + return 0 # Don't preserve any lines - this is an agent docstring + + if language == "python": + header_end = 0 + if not lines: + return 0 + + # ! CRITICAL: If the file starts with an agent docstring, preserve nothing + # * This prevents __future__ imports from being placed after docstrings + if lines[0].strip().startswith('"""') or lines[0].strip().startswith("'''"): + # Check if this is an agent-generated docstring + for i in range(min(5, len(lines))): + if DOCSTRING_START_MARKER in lines[i]: + return 0 # Don't preserve any lines - this is an agent docstring + + # 1. Preserve shebang + if lines[header_end].startswith("#!"): + header_end += 1 + + # 2. Preserve encoding declaration + if len(lines) > header_end and re.match( + r"^[ \t\f]*#.*?coding[:=]", lines[header_end] + ): + header_end += 1 + + # 3. Preserve future imports + while ( + len(lines) > header_end + and lines[header_end].strip().startswith("from __future__ import") + ): + header_end += 1 + + # 4. Preserve empty lines immediately after __future__ imports + while ( + len(lines) > header_end + and lines[header_end].strip() == "" + ): + header_end += 1 + + # 5. If the next line is an agent docstring, we don't need to preserve it. + # The line right after the header section should be checked. + if len(lines) > header_end: + line = lines[header_end].strip() + if line.startswith('"""') or line.startswith("'''"): + # Check next few lines for the marker + for i in range(header_end, min(header_end + 5, len(lines))): + if DOCSTRING_START_MARKER in lines[i]: + # This is an agent docstring. The header is everything before it. 
+ return header_end + return header_end + if language == "go": + for i, line in enumerate(lines): + if line.strip().startswith("package "): + return i + 1 + return 0 + # General check for JS, TS, C#, C++, Java, Kotlin + in_block_comment = False + for i, line in enumerate(lines): + stripped = line.strip() + + if in_block_comment: + if "*/" in stripped: + in_block_comment = False + continue + + if stripped.startswith("/*"): + if "*/" not in stripped: + in_block_comment = True + continue + + if ( + stripped.startswith("//") + or stripped.startswith("import ") + or stripped.startswith("using ") + or stripped.startswith("package ") + ): + continue + + # If we're not in a block comment and the line is not a recognized + # header element, then the header is over. This includes empty lines. + return i + + return len(lines) + + +def _extract_docstring_and_code(code_body: str, language: str) -> tuple[str | None, str]: + """ + Finds the first module-level docstring, returns its full content, + and the rest of the code. + """ + if language != "python": + # This logic is Python-specific for now. + return None, code_body + + lines = code_body.splitlines() + doc_start_idx, doc_end_idx = -1, -1 + + for i, line in enumerate(lines): + stripped = line.strip() + if stripped: + if stripped.startswith(('"""', "'''")): + doc_start_idx = i + break + + if doc_start_idx == -1: return None, code_body + + delim_start = '"""' if lines[doc_start_idx].strip().startswith('"""') else "'''" + if lines[doc_start_idx].strip().endswith(delim_start) and len(lines[doc_start_idx].strip()) > len(delim_start): + doc_end_idx = doc_start_idx + else: + for i in range(doc_start_idx + 1, len(lines)): + if lines[i].strip().endswith(delim_start): + doc_end_idx = i + break + + if doc_end_idx == -1: return None, code_body + + docstring_content = "\n".join(lines[doc_start_idx : doc_end_idx + 1]) + code_after_docstring = "\n".join(lines[doc_end_idx + 1:]) + + return docstring_content, code_after_docstring + +def process_file(path: Path, verbose: bool = False, beta: bool = False) -> None: + """Generate or refresh the header comment for *path*. + + Algorithm for deterministic line numbering (converges in two passes): + + 1. Remove old agent-docstring (if present). + 2. Parse AST on clean code. + 3. Build draft header with offset = 0. + 4. Assemble prefix + draft_header + body → count where code starts. + 5. Real offset = - 1. + 6. Build final header with this offset. + 7. Assemble final file; if different from original → rewrite. + + This approach measures rather than guesses the header size, ensuring + deterministic line numbers across all languages and scenarios. + """ + ext = path.suffix.lower() + if ext not in EXT_TO_LANG: + return + language = EXT_TO_LANG[ext] + parser = LANG_PARSERS.get(language) + if not parser: + return + + try: + original_content = path.read_text(encoding="utf-8", errors="ignore") + if not original_content.strip(): + return + + # 1. Separate preserved header + lines = original_content.split('\n') + header_end_line = get_preserved_header_end_line(lines, language) + file_prefix = "\n".join(lines[:header_end_line]) + code_body_text = "\n".join(lines[header_end_line:]) + + # 2. Extract top-level docstring (if any) and the rest of the code + # The docstring here might be manual, agent-generated, or merged. + top_docstring, code_without_docstring = _extract_docstring_and_code(code_body_text, language) + + # 3. 
Clean any deeper agent docstrings from the main code body + code_without_docstring = remove_agent_docstring(code_without_docstring, language) + + manual_content = None + if top_docstring: + # Clean any agent content from the extracted docstring + cleaned_top_docstring = remove_agent_docstring(top_docstring, language) + + # What remains is the manual content + if cleaned_top_docstring.strip(): + # Extract raw string content from inside the quotes + delim = '"""' if cleaned_top_docstring.strip().startswith('"""') else "'''" + + # We do not strip the docstring itself to preserve indentation. + content_part = cleaned_top_docstring + + start_offset = content_part.find(delim) + if start_offset == -1: + manual_content = None # Should not happen if we got here + else: + start_offset += len(delim) + end_offset = content_part.rfind(delim) + if end_offset > start_offset: + manual_content = content_part[start_offset:end_offset] + + # 4. Parse the clean code to get class/function information + classes, functions = parser(code_without_docstring.splitlines()) + + if not classes and not functions: + # If nothing to document, re-assemble with only manual content (if any) + final_parts = [file_prefix] + if manual_content: + # Re-wrap manual content in a clean docstring + final_parts.append(f'"""{manual_content}"""') + final_parts.append(code_without_docstring) + final_content = "\n".join(filter(None, final_parts)) + if final_content.strip() != original_content.strip(): + path.write_text(final_content, encoding="utf-8") + return + + # 5. Placeholder-based offset calculation + # Generate header with placeholders for line numbers + placeholder_header = _format_header( + classes, functions, language, 0, manual_content, placeholder=True + ) + placeholder_lines = placeholder_header.splitlines() + # Compute actual offset based on placeholder header size + line_offset = len(file_prefix.splitlines()) + len(placeholder_lines) + # Replace placeholders with actual line numbers + def _replace(match): + return str(line_offset + int(match.group(1))) + final_lines = [ + re.sub(rf"{OFFSET_PLACEHOLDER}\+(\d+)", _replace, line) + for line in placeholder_lines + ] + final_header = "\n".join(final_lines) + + # 6. Assemble the final file content + final_parts = [file_prefix, final_header, code_without_docstring] + new_content = "\n".join(filter(None, final_parts)) + + def normalize_version(text: str) -> str: + """Replaces the version string in a docstring with a placeholder.""" + return re.sub( + r"(Table of content is automatically generated by Agent Docstrings v)[\d\.]+\w*", + r"\1[VERSION]", + text, + ) + + # To avoid rewriting files just for a version bump, we compare the content + # with the version number normalized. + # Normalize and trim whitespace so version-only changes or minor whitespace differences are ignored + normalized_original = normalize_version(original_content).strip() + normalized_new = normalize_version(new_content).strip() + + if normalized_original != normalized_new: + path.write_text(new_content, encoding="utf-8") + if verbose: + print(f"Processed {language.capitalize()}: {path}") + elif verbose: + print(f"No changes for {language.capitalize()}: {path}") + + except Exception as e: + print(f"Error processing {path}: {e}") + + +def discover_and_process_files(paths: List[str], verbose: bool = False, beta: bool = False) -> None: + """Recursively process all supported files inside *paths*. + + Args: + paths (List[str]): White-list of root folders or files to scan. 
+ verbose (bool, optional): Enables per-file logging when *True*. + beta (bool, optional): Enables experimental beta features. + """ + files_to_process = [] + + for p_str in paths: + try: + path = Path(p_str).resolve() + if not path.exists(): + print(f"Warning: '{p_str}' is not a valid path. Skipping.") + continue + + if path.is_dir(): + # Collect all gitignore patterns from the directory tree + ignore_patterns = set() + current_dir = path + while current_dir != current_dir.parent: + gitignore_path = current_dir / '.gitignore' + if gitignore_path.exists(): + ignore_patterns.update(parse_gitignore(gitignore_path)) + current_dir = current_dir.parent + + # Load blacklist and whitelist from the root directory + blacklist_patterns, whitelist_patterns = load_blacklist_whitelist(path) + + for root, dirs, files in os.walk(path): + root_path = Path(root) + + # Filter directories to avoid walking into ignored ones + dirs[:] = [d for d in dirs if d not in DEFAULT_IGNORE_DIRS and not is_path_ignored(root_path / d, ignore_patterns, path)] + + for file in files: + file_path = root_path / file + + # Check if file should be processed + if not should_process_file(file_path, path, ignore_patterns, + blacklist_patterns, whitelist_patterns): + continue + + files_to_process.append(file_path) + elif path.is_file(): + files_to_process.append(path) + except PermissionError: + print(f"Warning: Could not read configuration (e.g., .gitignore) in '{p_str}' due to a permission error. Skipping path to ensure no unintended files are modified.") + continue + + # Process all collected files + for file_path in sorted(list(set(files_to_process))): process_file(file_path, verbose, beta) \ No newline at end of file diff --git a/agent_docstrings/languages/common.py b/agent_docstrings/languages/common.py index 47e9fc6..820e8b5 100644 --- a/agent_docstrings/languages/common.py +++ b/agent_docstrings/languages/common.py @@ -63,37 +63,38 @@ def replacer(match): if DOCSTRING_START_MARKER not in docstring_content: return docstring_content - # * Match the auto-generated block inside the docstring, including any leading/ - # * trailing whitespace and the trailing newline (if present). Use single - # * backslashes so that ``\s`` is interpreted by the *regex* engine as a - # * whitespace token instead of a literal backslash followed by ``s``. + # * Match the auto-generated block inside the docstring auto_content_pattern = re.compile( rf"\s*{start_marker_escaped}[\s\S]*?{end_marker_escaped}\s*\n?", re.DOTALL, ) cleaned_docstring = auto_content_pattern.sub('', docstring_content) - # Check what's left after removing the agent part - temp_cleaned = cleaned_docstring.replace('"""', '').replace("'''", '').strip() - - if not temp_cleaned: - return '' # Docstring was purely agent-generated, so remove it. + # After removing the agent part, check if what's left is an empty docstring. + # To do this correctly, we need to inspect the content *inside* the quotes. + temp_content = cleaned_docstring.strip() + quotes = None + if temp_content.startswith('"""') and temp_content.endswith('"""'): + quotes = '"""' + elif temp_content.startswith("'''") and temp_content.endswith("'''"): + quotes = "'''" - # There was a manual part. Reformat it cleanly. - return f'"""\\n{temp_cleaned}\\n"""' + if quotes: + inner_content = temp_content[len(quotes):-len(quotes)].strip() + if not inner_content: + return '' # The docstring is now empty, so remove it entirely. + + # If we are here, there is user content left. 
Return the cleaned docstring
+        # with its original quotes and formatting intact.
+        return cleaned_docstring
 
     # * Match ANY triple-quoted block (single or double quotes) anywhere in the text.
-    # * The former pattern anchored at ``^`` missed auto-generated blocks that were
-    # * not located at the very start of the file, leading to duplication issues.
     docstring_pattern = re.compile(
         r'("""[\s\S]*?"""|\'\'\'[\s\S]*?\'\'\')',
         re.DOTALL,
     )
-    # Iteratively clean the text
+    # A single pass is enough with the improved replacer logic.
     cleaned_text = docstring_pattern.sub(replacer, text)
-    # * Run a second pass to handle cases where two docstrings appear back-to-back,
-    # * which can happen after removing an intermediary block.
-    cleaned_text = docstring_pattern.sub(replacer, cleaned_text)
     # * Remove leading whitespace that may be left after docstring removal
     # * to ensure consistent line numbering between runs
     cleaned_text = cleaned_text.lstrip('\n')
diff --git a/tests/test_determinism.py b/tests/test_determinism.py
index acbb05f..02f1f5b 100644
--- a/tests/test_determinism.py
+++ b/tests/test_determinism.py
@@ -56,6 +56,32 @@ def test_file_is_unchanged_if_no_docstring_added(tmp_path):
         "File should not be modified if no agent docstring is added."
 
 
+def test_determinism_with_short_manual_python_docstring(tmp_path):
+    """
+    Tests that a short manual Python docstring is merged correctly without adding
+    extra newlines and remains unchanged on subsequent runs. This specifically
+    checks for the bug where newlines were added on each run.
+    """
+    original_content = '"""A short manual docstring."""\n\ndef a(): pass\n'
+    test_file_path = tmp_path / "test.py"
+    test_file_path.write_text(original_content, encoding="utf-8")
+
+    # First run
+    process_file(test_file_path)
+    content_after_first_run = test_file_path.read_text(encoding="utf-8")
+
+    # Check the docstring was not mangled with extra newlines (the needle is space-free to match the space-stripped haystack)
+    assert '"Ashortmanualdocstring."\n"""' not in content_after_first_run.replace(" ", "")
+
+    # Second run
+    process_file(test_file_path)
+    content_after_second_run = test_file_path.read_text(encoding="utf-8")
+
+    # Assert that the second run made no changes
+    assert content_after_first_run == content_after_second_run, \
+        "File should not be modified on the second run after merging a manual docstring."
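The test above, together with the reworked `replacer` in `common.py`, rests on one idea: decide whether a docstring survives by inspecting the content *inside* its triple quotes, and if manual text remains, return the cleaned docstring untouched instead of reformatting it. A minimal standalone sketch of that decision logic follows; `strip_agent_block` is a hypothetical helper written for illustration, not the library's `remove_agent_docstring` API:

```python
import re

START = "--- AUTO-GENERATED DOCSTRING ---"
END = "--- END AUTO-GENERATED DOCSTRING ---"

def strip_agent_block(docstring: str) -> str:
    """Drop the agent block; remove the docstring entirely if nothing manual remains."""
    # Remove the auto-generated span, including surrounding whitespace.
    cleaned = re.sub(
        rf"\s*{re.escape(START)}[\s\S]*?{re.escape(END)}\s*\n?", "", docstring
    )
    stripped = cleaned.strip()
    for quotes in ('"""', "'''"):
        if stripped.startswith(quotes) and stripped.endswith(quotes):
            # Decide based on the content *inside* the quotes, and return the
            # cleaned docstring as-is so original formatting is preserved.
            inner = stripped[len(quotes):-len(quotes)].strip()
            return "" if not inner else cleaned
    return cleaned

doc = '"""\n' + START + "\n... generated table ...\n" + END + "\nHuman comments\n" + '"""'
print(strip_agent_block(doc))  # docstring survives because "Human comments" remains
print(repr(strip_agent_block('"""\n' + START + "\n...\n" + END + '\n"""')))  # '' - purely generated
```

Returning `cleaned` unchanged, rather than re-wrapping the inner text in fresh quotes, is what prevents the newline-accretion bug the test guards against.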
+ + def test_process_file_determinism(sample_files_by_language): """ Processes each sample file three times and asserts that after the first processing, diff --git a/tests/test_docstring_duplication.py b/tests/test_docstring_duplication.py index 023516a..ea6fe53 100644 --- a/tests/test_docstring_duplication.py +++ b/tests/test_docstring_duplication.py @@ -148,11 +148,6 @@ def test_multiple_auto_docstring_removal(source_processor) -> None: --- AUTO-GENERATED DOCSTRING --- Table of content is automatically generated by Agent Docstrings v1.3.1 --- END AUTO-GENERATED DOCSTRING --- - """ - """ - --- AUTO-GENERATED DOCSTRING --- - Table of content is automatically generated by Agent Docstrings v1.3.2 - --- END AUTO-GENERATED DOCSTRING --- Human comments """ def test_function(): @@ -168,4 +163,40 @@ def test_function(): assert "test_function()" in result_content # * Verify that there is only one docstring block in the final output docstring_blocks = re.findall(r'"""[\s\S]*?"""', result_content) - assert len(docstring_blocks) == 1, f"Expected 1 docstring block, found {len(docstring_blocks)}" \ No newline at end of file + assert len(docstring_blocks) == 1, f"Expected 1 docstring block, found {len(docstring_blocks)}" + assert "Human comments" in result_content + + +def test_no_splitting_of_merged_multiline_docstrings(source_processor): + """ + Ensures that a previously merged docstring containing a multi-line manual + comment is not split into two separate docstring blocks on subsequent runs. + """ + # * Initial content with a merged auto-docstring and a multi-line manual part. + initial_content = dedent(''' + """ + --- AUTO-GENERATED DOCSTRING --- + Table of content is automatically generated by Agent Docstrings v1.3.2 + Classes/Functions: + - test_function() (line 12) + --- END AUTO-GENERATED DOCSTRING --- + This is a multi-line + manual comment that should + not be separated. + """ + def test_function(): + return "test" + ''').strip() + + # * Process the file again. + result_content, _, _ = source_processor("test_splitting.py", initial_content) + + # * Verify that there is still only ONE docstring block. + # * The bug would cause this to become two blocks. + docstring_blocks = re.findall(r'"""[\s\S]*?"""', result_content) + assert len(docstring_blocks) == 1, \ + f"Expected 1 docstring block after reprocessing, but found {len(docstring_blocks)}." + + # * Verify that both the (updated) auto-docstring and the manual part are present. 
+ assert "--- AUTO-GENERATED DOCSTRING ---" in result_content + assert "not be separated" in result_content \ No newline at end of file diff --git a/tests/test_placeholder_usage.py b/tests/test_placeholder_usage.py new file mode 100644 index 0000000..133323d --- /dev/null +++ b/tests/test_placeholder_usage.py @@ -0,0 +1,13 @@ +import re +from agent_docstrings.core import _format_header, OFFSET_PLACEHOLDER +from agent_docstrings.languages.common import ClassInfo, SignatureInfo + +def test_placeholder_usage_in_header(): + # Create dummy ClassInfo and SignatureInfo + class_info = ClassInfo(name="MyClass", line=5, methods=[], inner_classes=[]) + sig_info = SignatureInfo(signature="my_function()", line=10) + # Generate placeholder header + header = _format_header([class_info], [sig_info], "python", 0, None, placeholder=True) + # Check that placeholder for line numbers exists + assert f"{OFFSET_PLACEHOLDER}+5" in header + assert f"{OFFSET_PLACEHOLDER}+10" in header \ No newline at end of file diff --git a/tests/test_version_change.py b/tests/test_version_change.py index a1276bb..fd4d2ce 100644 --- a/tests/test_version_change.py +++ b/tests/test_version_change.py @@ -1,73 +1,72 @@ -""" -Tests to ensure that files are not reprocessed when only the version has changed. -""" -import shutil -from pathlib import Path - -from agent_docstrings.core import process_file -from agent_docstrings import __version__ - - -def test_no_change_on_version_mismatch(tmp_path: Path): - """ - Verify that reprocessing a file with only a version difference - in the docstring does not result in a file modification. - """ - # 1. Create a temporary python file - source_content = ( - "def func_one():\n" - " pass\n" - "\n" - "class MyClass:\n" - " def method_one(self):\n" - " pass\n" - ) - py_file = tmp_path / "test_version.py" - py_file.write_text(source_content, encoding="utf-8") - - # 2. Process it once to generate the initial docstring - process_file(py_file) - content_after_first_run = py_file.read_text(encoding="utf-8") - assert __version__ in content_after_first_run - - # 3. Manually change the version in the header to an old one - old_version_content = content_after_first_run.replace( - f"v{__version__}", "v0.0.1" - ) - py_file.write_text(old_version_content, encoding="utf-8") - - # 4. Process the file again - process_file(py_file) - content_after_second_run = py_file.read_text(encoding="utf-8") - - # 5. Assert the file content has NOT changed - assert content_after_second_run == old_version_content - assert f"v{__version__}" not in content_after_second_run - assert "v0.0.1" in content_after_second_run - - # 6. Now, modify the structure of the file - new_source_content = ( - source_content + "\n\ndef func_two():\n pass\n" - ) - - # Add the old docstring back to simulate a real-world scenario - # where an old file is being updated. - # Get the docstring from the first run, but with the old version - docstring_end_index = content_after_first_run.rfind('"""') + 3 - docstring_from_first_run = content_after_first_run[:docstring_end_index] - - old_version_docstring = docstring_from_first_run.replace(f"v{__version__}", "v0.0.1") - - # Combine the old docstring with the NEW code - content_with_new_code_old_doc = old_version_docstring + "\n" + new_source_content - py_file.write_text(content_with_new_code_old_doc, encoding="utf-8") - - # 7. Process it again - process_file(py_file) - content_after_third_run = py_file.read_text(encoding="utf-8") - - # 8. 
Assert the file HAS been updated with the new structure and version - assert content_after_third_run != content_with_new_code_old_doc - assert f"v{__version__}" in content_after_third_run - assert "v0.0.1" not in content_after_third_run +""" +Tests to ensure that files are not reprocessed when only the version has changed. +""" +import shutil +from pathlib import Path + +from agent_docstrings.core import process_file +from agent_docstrings import __version__ + + +def test_no_change_on_version_mismatch(tmp_path: Path): + """ + Verify that reprocessing a file with only a version difference + in the docstring does not result in a file modification. + """ + # 1. Create a temporary python file + source_content = ( + "def func_one():\n" + " pass\n" + "\n" + "class MyClass:\n" + " def method_one(self):\n" + " pass\n" + ) + py_file = tmp_path / "test_version.py" + py_file.write_text(source_content, encoding="utf-8") + + # 2. Process it once to generate the initial docstring + process_file(py_file) + content_after_first_run = py_file.read_text(encoding="utf-8") + assert __version__ in content_after_first_run + + # 3. Manually change the version in the header to an old one + old_version_content = content_after_first_run.replace( + f"v{__version__}", "v0.0.1" + ) + py_file.write_text(old_version_content, encoding="utf-8") + + # 4. Process the file again + process_file(py_file) + content_after_second_run = py_file.read_text(encoding="utf-8") + # 5. Assert the file content has NOT changed + assert content_after_second_run == old_version_content + assert __version__ not in content_after_second_run + assert "v0.0.1" in content_after_second_run + + # 6. Now, modify the structure of the file + new_source_content = ( + source_content + "\n\ndef func_two():\n pass\n" + ) + + # Add the old docstring back to simulate a real-world scenario + # where an old file is being updated. + # Get the docstring from the first run, but with the old version + docstring_end_index = content_after_first_run.rfind('"""') + 3 + docstring_from_first_run = content_after_first_run[:docstring_end_index] + + old_version_docstring = docstring_from_first_run.replace(f"v{__version__}", "v0.0.1") + + # Combine the old docstring with the NEW code + content_with_new_code_old_doc = old_version_docstring + "\n" + new_source_content + py_file.write_text(content_with_new_code_old_doc, encoding="utf-8") + + # 7. Process it again + process_file(py_file) + content_after_third_run = py_file.read_text(encoding="utf-8") + + # 8. Assert the file HAS been updated with the new structure and version + assert content_after_third_run != content_with_new_code_old_doc + assert f"v{__version__}" in content_after_third_run + assert "v0.0.1" not in content_after_third_run assert "func_two" in content_after_third_run \ No newline at end of file
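

For reviewers, here is the placeholder-based numbering from `process_file` reduced to a standalone sketch. The header format and the `render_header` helper are simplified illustrations, not the library's exact output; only the two-pass structure (emit `OFFSET+n` placeholders, measure the header, substitute real numbers) mirrors the patch:

```python
import re

OFFSET_PLACEHOLDER = "OFFSET"  # mirrors the constant introduced in core.py

def render_header(entries: list[tuple[str, int]], prefix_lines: int) -> str:
    """Two-pass rendering: emit symbolic offsets, measure, then substitute."""
    # Pass 1: build the header with placeholders so its line count is exact
    # before any real line number is known.
    draft = ['"""', "Table of contents:"]
    draft += [f"- {name} (line {OFFSET_PLACEHOLDER}+{line})" for name, line in entries]
    draft.append('"""')
    # Measure: the code body starts after the preserved prefix plus the header.
    offset = prefix_lines + len(draft)
    # Pass 2: replace each placeholder with its concrete line number.
    return "\n".join(
        re.sub(
            rf"{OFFSET_PLACEHOLDER}\+(\d+)",
            lambda m: str(offset + int(m.group(1))),
            line,
        )
        for line in draft
    )

# A class on body line 4 and a function on body line 9, after one preserved line:
print(render_header([("MyClass", 4), ("my_function()", 9)], prefix_lines=1))
```

Substituting a placeholder changes a line's width but never the header's line count, so the measured offset stays valid after substitution; that is why the algorithm converges and produces the same numbers on every run.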
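The version-normalized comparison at the end of `process_file` can likewise be checked in isolation. This sketch reuses the exact regex and the trailing `.strip()` from the patch; the sample strings are illustrative:

```python
import re

def normalized(text: str) -> str:
    """Mask the generator version and trim whitespace before comparing."""
    return re.sub(
        r"(Table of content is automatically generated by Agent Docstrings v)[\d\.]+\w*",
        r"\1[VERSION]",
        text,
    ).strip()

old = '"""Table of content is automatically generated by Agent Docstrings v0.0.1"""\ncode = 1\n'
new = '"""Table of content is automatically generated by Agent Docstrings v1.3.4"""\ncode = 1'
assert normalized(old) == normalized(new)  # version-only change -> no rewrite
```

This is what `test_no_change_on_version_mismatch` exercises end to end: a version bump alone must not rewrite the file, while a structural change (the added `func_two`) must.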