Skip to content

Commit b0ddc20

Browse files
committed
fix: binary exclusion, rename/delete filtering, generated file caps
1 parent f099600 commit b0ddc20

4 files changed

Lines changed: 351 additions & 4 deletions

File tree

src/treemapper/diffctx/__init__.py

Lines changed: 48 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import logging
4+
import re
45
import subprocess
56
from collections import defaultdict
67
from pathlib import Path
@@ -10,14 +11,17 @@
1011

1112
from ..ignore import get_ignore_specs, get_whitelist_spec, is_whitelisted, should_ignore
1213
from ..tokens import count_tokens
14+
from ..tree import KNOWN_BINARY_EXTENSIONS
1315
from .config import LIMITS
1416
from .config.extensions import CODE_EXTENSIONS, CONFIG_EXTENSIONS, DOC_EXTENSIONS
1517
from .edges import discover_all_related_files
1618
from .fragments import enclosing_fragment, fragment_file # type: ignore[attr-defined]
1719
from .git import (
1820
GitError,
1921
get_changed_files,
22+
get_deleted_files,
2023
get_diff_text,
24+
get_renamed_old_paths,
2125
get_untracked_files,
2226
is_git_repo,
2327
parse_diff,
@@ -64,11 +68,21 @@ def _kind_priority(kind: str) -> int:
6468
return 0 if kind in _SEMANTIC_KINDS else 1
6569

6670

71+
_BINARY_CTRL_RE = re.compile(r"[\x00-\x08\x0e-\x1f]")
72+
73+
74+
def _looks_binary(content: str) -> bool:
75+
return bool(_BINARY_CTRL_RE.search(content[:8192]))
76+
77+
6778
def _read_file_content(
6879
file_path: Path,
6980
root_dir: Path,
7081
preferred_revs: list[str],
7182
) -> str | None:
83+
if file_path.suffix.lower() in KNOWN_BINARY_EXTENSIONS:
84+
return None
85+
7286
abs_path = _normalize_path(file_path, root_dir)
7387
try:
7488
rel = abs_path.relative_to(root_dir.resolve())
@@ -78,13 +92,19 @@ def _read_file_content(
7892

7993
for rev in preferred_revs:
8094
try:
81-
return show_file_at_revision(root_dir, rev, rel)
95+
content = show_file_at_revision(root_dir, rev, rel)
96+
if _looks_binary(content):
97+
return None
98+
return content
8299
except GitError:
83100
continue
84101

85102
if abs_path.exists() and abs_path.is_file():
86103
try:
87-
return abs_path.read_text(encoding="utf-8")
104+
content = abs_path.read_text(encoding="utf-8")
105+
if _looks_binary(content):
106+
return None
107+
return content
88108
except (OSError, UnicodeDecodeError):
89109
pass
90110

@@ -104,6 +124,7 @@ def _build_preferred_revs(base_rev: str | None, head_rev: str | None) -> list[st
104124

105125

106126
_MAX_GENERATED_FRAGMENTS = LIMITS.max_generated_fragments
127+
_MAX_GENERATED_LINES = LIMITS.max_generated_lines
107128

108129

109130
_GENERATED_FILENAME_PATTERNS = frozenset(
@@ -117,6 +138,7 @@ def _build_preferred_revs(base_rev: str | None, head_rev: str | None) -> list[st
117138
".min.js",
118139
".min.css",
119140
".designer.cs",
141+
".api",
120142
}
121143
)
122144

@@ -195,6 +217,26 @@ def _process_files_for_fragments(
195217
" (generated)" if is_generated else "",
196218
)
197219

220+
if is_generated:
221+
truncated: list[Fragment] = []
222+
for frag in file_frags:
223+
if frag.line_count > _MAX_GENERATED_LINES:
224+
lines = frag.content.splitlines()
225+
remaining = len(lines) - _MAX_GENERATED_LINES
226+
lines = lines[:_MAX_GENERATED_LINES]
227+
truncated_content = "\n".join(lines) + f"\n# ... [{remaining} more lines]"
228+
truncated.append(
229+
Fragment(
230+
id=FragmentId(frag.path, frag.start_line, frag.start_line + len(lines) - 1),
231+
kind=frag.kind,
232+
content=truncated_content,
233+
identifiers=extract_identifiers(truncated_content),
234+
)
235+
)
236+
else:
237+
truncated.append(frag)
238+
file_frags = truncated
239+
198240
for frag in file_frags:
199241
fragments.append(frag)
200242
seen_frag_ids.add(frag.id)
@@ -521,6 +563,10 @@ def build_diff_context(
521563
changed_files = _filter_ignored(changed_files, root_dir, combined_spec)
522564
changed_files = _filter_whitelist(changed_files, root_dir, wl_spec)
523565

566+
excluded_paths = get_deleted_files(root_dir, diff_range) | get_renamed_old_paths(root_dir, diff_range)
567+
if excluded_paths:
568+
changed_files = [f for f in changed_files if f.resolve() not in excluded_paths]
569+
524570
preferred_revs = _build_preferred_revs(base_rev, head_rev)
525571

526572
seen_frag_ids: set[FragmentId] = set()

src/treemapper/diffctx/config/limits.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ class AlgorithmLimits:
88
max_file_size: int = 100_000
99
max_fragments: int = 200
1010
max_generated_fragments: int = 5
11+
max_generated_lines: int = 30
1112
max_candidate_files: int = 5000
1213
max_discovered_files: int = 200
1314
skip_expensive_threshold: int = 2000

src/treemapper/diffctx/git.py

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,7 @@ def _parse_path_line(line: str, repo_root: Path) -> tuple[str, Path | None]:
7373

7474

7575
def parse_diff(repo_root: Path, diff_range: str) -> list[DiffHunk]:
76-
output = run_git(repo_root, ["diff", "--unified=0", diff_range])
76+
output = run_git(repo_root, ["diff", "--unified=0", "-M", diff_range])
7777
hunks: list[DiffHunk] = []
7878
old_path: Path | None = None
7979
new_path: Path | None = None
@@ -97,7 +97,7 @@ def parse_diff(repo_root: Path, diff_range: str) -> list[DiffHunk]:
9797

9898

9999
def get_changed_files(repo_root: Path, diff_range: str) -> list[Path]:
100-
output = run_git(repo_root, ["diff", "--name-only", diff_range])
100+
output = run_git(repo_root, ["diff", "--name-only", "-M", diff_range])
101101
files: list[Path] = []
102102
for line in output.splitlines():
103103
line = line.strip()
@@ -120,6 +120,21 @@ def get_untracked_files(repo_root: Path) -> list[Path]:
120120
return [repo_root / line.strip() for line in output.splitlines() if line.strip()]
121121

122122

123+
def get_deleted_files(repo_root: Path, diff_range: str) -> set[Path]:
    """Collect the resolved paths listed by a deletion-filtered ``git diff``.

    Invokes ``run_git`` with ``diff --diff-filter=D --name-only -M`` for
    *diff_range*, then joins every non-blank output line onto *repo_root*
    and resolves it.

    Args:
        repo_root: Directory passed to ``run_git`` and used as the base
            for each reported path.
        diff_range: Range argument forwarded to ``git diff`` unchanged.

    Returns:
        A set of resolved paths, empty when the command prints nothing.
    """
    listing = run_git(repo_root, ["diff", "--diff-filter=D", "--name-only", "-M", diff_range])
    deleted: set[Path] = set()
    for raw in listing.splitlines():
        name = raw.strip()
        if name:
            deleted.add((repo_root / name).resolve())
    return deleted
126+
127+
128+
def get_renamed_old_paths(repo_root: Path, diff_range: str) -> set[Path]:
    """Collect the resolved pre-rename paths from a rename-filtered ``git diff``.

    Invokes ``run_git`` with ``diff --diff-filter=R --name-status -M`` for
    *diff_range*. Each output line is stripped and split on tabs; a line is
    used only when it has at least three fields and the first one starts
    with ``"R"``. The second field of such a line is joined onto
    *repo_root* and resolved.

    Args:
        repo_root: Directory passed to ``run_git`` and used as the base
            for each reported path.
        diff_range: Range argument forwarded to ``git diff`` unchanged.

    Returns:
        A set of resolved paths, empty when no line qualifies.
    """
    status_listing = run_git(repo_root, ["diff", "--diff-filter=R", "--name-status", "-M", diff_range])
    rows = (row.strip().split("\t") for row in status_listing.splitlines())
    return {
        (repo_root / fields[1]).resolve()
        for fields in rows
        if len(fields) >= 3 and fields[0].startswith("R")
    }
136+
137+
123138
def show_file_at_revision(repo_root: Path, rev: str, rel_path: Path) -> str:
    """Return what ``run_git`` yields for ``git show <rev>:<path>``.

    The path part of the object spec is *rel_path* rendered with forward
    slashes via ``as_posix()``.

    Args:
        repo_root: Directory passed to ``run_git``.
        rev: Revision placed before the colon in the object spec.
        rel_path: Path placed after the colon in the object spec.

    Returns:
        The string returned by ``run_git`` for the ``show`` command.
    """
    return run_git(repo_root, ["show", f"{rev}:{rel_path.as_posix()}"])

0 commit comments

Comments
 (0)