Skip to content

Commit bfe070b

Browse files
committed
fix: Kotlin regex, PySBD line tracking, tree-sitter container headers
1 parent 16d4679 commit bfe070b

58 files changed

Lines changed: 1575 additions & 1177 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

docs/Fragmentation.md renamed to docs/Context-Selection-for-Git-Diff/Fragmentation.md

File renamed without changes.

src/treemapper/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ class ParsedArgs:
9696
force_stdout: bool
9797
diff_range: str | None = None
9898
budget: int | None = None
99-
alpha: float = 0.55
99+
alpha: float = 0.60
100100
tau: float = 0.08
101101
full_diff: bool = False
102102

src/treemapper/diffctx/__init__.py

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ def build_diff_context(
198198
full: bool = False,
199199
) -> dict[str, Any]:
200200
_validate_inputs(root_dir, alpha, tau, budget_tokens)
201+
root_dir = root_dir.resolve()
201202

202203
hunks = parse_diff(root_dir, diff_range)
203204
if not hunks:
@@ -265,6 +266,9 @@ def _validate_inputs(root_dir: Path, alpha: float, tau: float, budget_tokens: in
265266
raise ValueError(f"budget_tokens must be > 0, got {budget_tokens}")
266267

267268

269+
_CONTAINER_FRAGMENT_KINDS = frozenset({"class", "interface", "struct"})
270+
271+
268272
def _identify_core_fragments(hunks: list[DiffHunk], all_fragments: list[Fragment]) -> set[FragmentId]:
269273
frags_by_path: dict[Path, list[Fragment]] = defaultdict(list)
270274
for frag in all_fragments:
@@ -276,9 +280,25 @@ def _identify_core_fragments(hunks: list[DiffHunk], all_fragments: list[Fragment
276280
if frags:
277281
h_start, h_end = h.core_selection_range
278282
core_ids.update(_find_core_for_hunk(frags, h_start, h_end))
283+
284+
_add_container_headers(core_ids, frags_by_path)
279285
return core_ids
280286

281287

288+
def _add_container_headers(core_ids: set[FragmentId], frags_by_path: dict[Path, list[Fragment]]) -> None:
289+
core_paths = {fid.path for fid in core_ids}
290+
headers_to_add: list[FragmentId] = []
291+
for path in core_paths:
292+
for frag in frags_by_path.get(path, []):
293+
if frag.kind not in _CONTAINER_FRAGMENT_KINDS or frag.id in core_ids:
294+
continue
295+
for core_id in core_ids:
296+
if core_id.path == path and core_id.start_line > frag.end_line:
297+
headers_to_add.append(frag.id)
298+
break
299+
core_ids.update(headers_to_add)
300+
301+
282302
def _log_full_mode(selected: list[Fragment]) -> None:
283303
try:
284304
used = sum(f.token_count for f in selected)
@@ -409,21 +429,6 @@ def _collect_expansion_files(
409429
return list(expansion_files)
410430

411431

412-
def _expand_universe_by_rare_identifiers(
413-
root_dir: Path,
414-
concepts: frozenset[str],
415-
already_included: list[Path],
416-
combined_spec: pathspec.PathSpec,
417-
) -> list[Path]:
418-
if not concepts:
419-
return []
420-
421-
included_set = set(already_included)
422-
files = _collect_candidate_files(root_dir, included_set, combined_spec)
423-
inverted_index = _build_ident_index(files, concepts)
424-
return _collect_expansion_files(inverted_index, concepts, included_set)
425-
426-
427432
def _filter_ignored(
428433
files: list[Path],
429434
root_dir: Path,
@@ -436,10 +441,25 @@ def _filter_ignored(
436441
if not should_ignore(rel_path, combined_spec):
437442
result.append(file_path)
438443
except ValueError:
439-
result.append(file_path)
444+
pass
440445
return result
441446

442447

448+
def _expand_universe_by_rare_identifiers(
449+
root_dir: Path,
450+
concepts: frozenset[str],
451+
already_included: list[Path],
452+
combined_spec: pathspec.PathSpec,
453+
) -> list[Path]:
454+
if not concepts:
455+
return []
456+
457+
included_set = set(already_included)
458+
files = _collect_candidate_files(root_dir, included_set, combined_spec)
459+
inverted_index = _build_ident_index(files, concepts)
460+
return _collect_expansion_files(inverted_index, concepts, included_set)
461+
462+
443463
def _empty_tree(root_dir: Path) -> dict[str, Any]:
444464
return {
445465
"name": root_dir.name,

src/treemapper/diffctx/config/extensions.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,27 +16,27 @@
1616

1717
KOTLIN_EXTENSIONS = frozenset({".kt", ".kts"})
1818

19-
SCALA_EXTENSIONS = frozenset({".scala"})
19+
SCALA_EXTENSIONS = frozenset({".scala", ".sc"})
2020

2121
JVM_EXTENSIONS = JAVA_EXTENSIONS | KOTLIN_EXTENSIONS | SCALA_EXTENSIONS
2222

2323
C_EXTENSIONS = frozenset({".c", ".h"})
2424

25-
CPP_EXTENSIONS = frozenset({".cpp", ".hpp", ".cc", ".hh", ".cxx", ".hxx"})
25+
CPP_EXTENSIONS = frozenset({".cpp", ".hpp", ".cc", ".hh", ".cxx", ".hxx", ".c++", ".h++", ".ipp", ".tpp"})
2626

2727
C_FAMILY_EXTENSIONS = C_EXTENSIONS | CPP_EXTENSIONS | frozenset({".m", ".mm"})
2828

2929
CSHARP_EXTENSIONS = frozenset({".cs"})
3030

31-
FSHARP_EXTENSIONS = frozenset({".fs", ".fsx"})
31+
FSHARP_EXTENSIONS = frozenset({".fs", ".fsi", ".fsx"})
3232

3333
DOTNET_EXTENSIONS = CSHARP_EXTENSIONS | FSHARP_EXTENSIONS
3434

3535
RUBY_EXTENSIONS = frozenset({".rb", ".rake", ".gemspec"})
3636

37-
PHP_EXTENSIONS = frozenset({".php"})
37+
PHP_EXTENSIONS = frozenset({".php", ".phtml", ".php3", ".php4", ".php5", ".php7", ".phps"})
3838

39-
SHELL_EXTENSIONS = frozenset({".sh", ".bash", ".zsh", ".ps1"})
39+
SHELL_EXTENSIONS = frozenset({".sh", ".bash", ".zsh", ".ksh", ".fish", ".ps1", ".psm1", ".psd1"})
4040

4141
SWIFT_EXTENSIONS = frozenset({".swift"})
4242

src/treemapper/diffctx/config/weights.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,9 @@ class LangWeights:
6565
LANG_WEIGHTS: dict[str, LangWeights] = {
6666
"python": LangWeights(0.55, 0.60, 0.50, 0.20, 0.35),
6767
"javascript": LangWeights(0.50, 0.55, 0.45, 0.25, 0.35),
68+
"jsx": LangWeights(0.50, 0.55, 0.45, 0.25, 0.35),
6869
"typescript": LangWeights(0.70, 0.75, 0.65, 0.15, 0.25),
70+
"tsx": LangWeights(0.70, 0.75, 0.65, 0.15, 0.25),
6971
"rust": LangWeights(0.90, 0.95, 0.85, 0.10, 0.15),
7072
"java": LangWeights(0.85, 0.90, 0.80, 0.10, 0.15),
7173
"kotlin": LangWeights(0.80, 0.85, 0.75, 0.12, 0.18),

src/treemapper/diffctx/edges/base.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,12 @@ def path_to_module(path: Path, repo_root: Path | None = None) -> str:
2323
parts = parts[i + 1 :]
2424
break
2525

26-
if parts and parts[-1].endswith(".py"):
27-
parts[-1] = parts[-1][:-3]
28-
if parts[-1] == "__init__":
26+
if parts:
27+
for _ext in (".pyw", ".pyi", ".py"):
28+
if parts[-1].endswith(_ext):
29+
parts[-1] = parts[-1][: -len(_ext)]
30+
break
31+
if parts and parts[-1] == "__init__":
2932
parts = parts[:-1]
3033

3134
return ".".join(parts) if parts else ""
@@ -127,6 +130,26 @@ def add_ref_edges(
127130
edges[(dst, src_id)] = max(edges.get((dst, src_id), 0.0), weight * reverse_factor)
128131

129132

133+
def add_semantic_edges(
134+
edges: EdgeDict,
135+
src_id: FragmentId,
136+
info: object,
137+
name_to_defs: dict[str, list[FragmentId]],
138+
call_weight: float,
139+
ref_weight: float,
140+
type_weight: float,
141+
reverse_factor: float,
142+
self_defs: set[str],
143+
) -> None:
144+
add_ref_edges(edges, src_id, set(info.calls), name_to_defs, call_weight, reverse_factor=reverse_factor) # type: ignore[attr-defined]
145+
add_ref_edges(
146+
edges, src_id, set(info.references), name_to_defs, ref_weight, reverse_factor=reverse_factor, skip_self_defs=self_defs # type: ignore[attr-defined]
147+
)
148+
add_ref_edges(
149+
edges, src_id, set(info.type_refs), name_to_defs, type_weight, reverse_factor=reverse_factor, skip_self_defs=self_defs # type: ignore[attr-defined]
150+
)
151+
152+
130153
class EdgeBuilder(ABC):
131154
weight: float = 0.5
132155
reverse_weight_factor: float = 0.7

src/treemapper/diffctx/edges/config/cicd.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
r"^\s{0,20}-?\s{0,5}(?:local|project|remote|template):\s{0,5}['\"]?([^'\"#\n]{1,300})", re.MULTILINE
1818
)
1919

20-
_JENKINS_SH_RE = re.compile(r"sh\s*(?:\(['\"]|['\"])(.+?)['\"]\)?", re.MULTILINE | re.DOTALL)
20+
_JENKINS_SH_DOUBLE_RE = re.compile(r'sh\s*\(?"([^"]*)"', re.MULTILINE)
21+
_JENKINS_SH_SINGLE_RE = re.compile(r"sh\s*\(?'([^']*)'", re.MULTILINE)
2122
_JENKINS_SCRIPT_RE = re.compile(r"script\s*\{([^}]+)\}", re.MULTILINE | re.DOTALL)
2223

2324
_SCRIPT_CALL_TOOLS = frozenset(
@@ -174,9 +175,11 @@ def _extract_gitlab_refs(content: str) -> set[str]:
174175
def _extract_jenkins_refs(content: str) -> set[str]:
175176
refs: set[str] = set()
176177

177-
for match in _JENKINS_SH_RE.finditer(content):
178-
sh_content = match.group(1)
179-
refs.update(_extract_script_refs(sh_content))
178+
for match in _JENKINS_SH_DOUBLE_RE.finditer(content):
179+
refs.update(_extract_script_refs(match.group(1)))
180+
181+
for match in _JENKINS_SH_SINGLE_RE.finditer(content):
182+
refs.update(_extract_script_refs(match.group(1)))
180183

181184
for match in _JENKINS_SCRIPT_RE.finditer(content):
182185
refs.update(_extract_script_refs(match.group(1)))
@@ -328,7 +331,11 @@ def _link_by_name(self, ci_id: FragmentId, ref: str, idx: FragmentIndex, edges:
328331
ref_name = ref.split("/")[-1].lower()
329332
ref_base = ref_name.split(".")[0]
330333
for name, frag_ids in idx.by_name.items():
331-
if name == ref_name or name.startswith(ref_base):
334+
if name == ref_name or (
335+
ref_base
336+
and len(ref_base) >= 3
337+
and (name == ref_base or name.startswith(ref_base + "_") or name.startswith(ref_base + "."))
338+
):
332339
for fid in frag_ids:
333340
if fid != ci_id:
334341
self.add_edge(edges, ci_id, fid, self.script_weight)

src/treemapper/diffctx/edges/config/docker.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,15 @@ def _is_compose_file(path: Path) -> bool:
3131
return path.name.lower() in _COMPOSE_NAMES
3232

3333

34+
def _strip_dot_slash(s: str) -> str:
35+
while s.startswith("./"):
36+
s = s[2:]
37+
return s
38+
39+
3440
def _normalize_path(base_dir: Path, rel_path: str) -> Path:
3541
rel_path = rel_path.strip().strip("'\"")
36-
if rel_path.startswith("./"):
37-
rel_path = rel_path[2:]
42+
rel_path = _strip_dot_slash(rel_path)
3843
return base_dir / rel_path
3944

4045

@@ -57,14 +62,14 @@ def _collect_dockerfile_refs(content: str, refs: set[str]) -> None:
5762
for match in _DOCKERFILE_COPY_RE.finditer(content):
5863
src = match.group(1)
5964
if not src.startswith("--") and not src.startswith("$"):
60-
refs.add(src.strip().strip("'\"").lstrip("./"))
65+
refs.add(_strip_dot_slash(src.strip().strip("'\"")))
6166

6267

6368
def _collect_compose_refs(content: str, refs: set[str]) -> None:
6469
for match in _COMPOSE_BUILD_RE.finditer(content):
65-
refs.add(match.group(1).strip().lstrip("./"))
70+
refs.add(_strip_dot_slash(match.group(1).strip()))
6671
for match in _COMPOSE_VOLUME_RE.finditer(content):
67-
refs.add(match.group(1).strip().lstrip("./"))
72+
refs.add(_strip_dot_slash(match.group(1).strip()))
6873

6974

7075
class DockerEdgeBuilder(EdgeBuilder):
@@ -134,7 +139,9 @@ def _link_copy_source(
134139

135140
if "*" in src_path:
136141
return
137-
suffix = src_path.lstrip("./")
142+
suffix = _strip_dot_slash(src_path)
143+
if not suffix or suffix == ".":
144+
return
138145
for p, frag_ids in path_to_frags.items():
139146
if str(p).endswith(suffix):
140147
for frag_id in frag_ids:

src/treemapper/diffctx/edges/config/generic.py

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,66 @@
99

1010
_CONFIG_EXTENSIONS = {".yaml", ".yml", ".json", ".toml", ".ini", ".env"}
1111

12+
_CONFIG_KEY_STOPWORDS = frozenset(
13+
{
14+
"action",
15+
"actions",
16+
"assert",
17+
"author",
18+
"before",
19+
"branch",
20+
"change",
21+
"client",
22+
"config",
23+
"create",
24+
"default",
25+
"delete",
26+
"deploy",
27+
"description",
28+
"enable",
29+
"engine",
30+
"engines",
31+
"export",
32+
"exports",
33+
"format",
34+
"health",
35+
"ignore",
36+
"import",
37+
"inputs",
38+
"keywords",
39+
"module",
40+
"modules",
41+
"number",
42+
"object",
43+
"openapi",
44+
"option",
45+
"options",
46+
"output",
47+
"outputs",
48+
"params",
49+
"plugin",
50+
"plugins",
51+
"private",
52+
"public",
53+
"remove",
54+
"render",
55+
"report",
56+
"require",
57+
"result",
58+
"return",
59+
"script",
60+
"scripts",
61+
"server",
62+
"source",
63+
"status",
64+
"string",
65+
"target",
66+
"update",
67+
"verbose",
68+
"version",
69+
}
70+
)
71+
1272
_CONFIG_KEY_RE = re.compile(r"^\s*([a-zA-Z_][a-zA-Z0-9_-]*)\s*:", re.MULTILINE)
1373
_JSON_KEY_RE = re.compile(r'"([a-zA-Z_][a-zA-Z0-9_-]*)"\s*:')
1474
_TOML_KEY_RE = re.compile(r"^\s*([a-zA-Z_][a-zA-Z0-9_-]*)\s*=", re.MULTILINE)
@@ -86,7 +146,7 @@ def discover_related_files(
86146
def _build_key_patterns(self, keys: set[str]) -> dict[str, re.Pattern[str]]:
87147
patterns: dict[str, re.Pattern[str]] = {}
88148
for key in keys:
89-
if len(key) >= 6:
149+
if len(key) >= 6 and key not in _CONFIG_KEY_STOPWORDS:
90150
patterns[key] = re.compile(rf"\b{re.escape(key)}\b", re.IGNORECASE)
91151
return patterns
92152

src/treemapper/diffctx/edges/config/helm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from ...types import Fragment, FragmentId
88
from ..base import EdgeBuilder, EdgeDict
99

10-
_HELM_VALUES_RE = re.compile(r"\{\{\s*\.Values\.([a-zA-Z0-9_.]+)\s*\}\}")
10+
_HELM_VALUES_RE = re.compile(r"\{\{-?\s*\.Values\.([a-zA-Z0-9_.]+)")
1111
_HELM_INCLUDE_RE = re.compile(r'\{\{\s*(?:include|template)\s+"([^"]+)"')
1212
_HELM_DEFINE_RE = re.compile(r'\{\{-?\s*define\s+"([^"]+)"')
1313
_HELM_RELEASE_RE = re.compile(r"\{\{\s*\.Release\.(\w+)\s*\}\}")

0 commit comments

Comments
 (0)