Skip to content

Commit e5bcb74

Browse files
committed
fix: address multiple diffctx bugs and tighten similarity thresholds
- Fix PPR iterator consumption bug (dict.values() exhausted after comprehension)
- Differentiate TS vs TSX parsing (use correct tree-sitter grammar)
- Fix Language double-wrapping in tree-sitter parser
- Add backtick support in bracket balancing for JS template strings
- Fix Go edge builder case mismatch in package name matching
- Replace rglob with git ls-files for faster candidate file collection
- Tighten lexical similarity: min_similarity 0.1->0.30, top_k 10->5
- Restrict config-to-code edges: require exact match, min key length 6
- Fix JSON fragmentation: handle files with no top-level keys
1 parent 86ef6af commit e5bcb74

10 files changed

Lines changed: 59 additions & 38 deletions

File tree

src/treemapper/diffctx/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import logging
4+
import subprocess
45
from collections import defaultdict
56
from pathlib import Path
67
from typing import Any
@@ -340,6 +341,19 @@ def _is_candidate_file(file_path: Path, root_dir: Path, included_set: set[Path],
340341

341342

342343
def _collect_candidate_files(root_dir: Path, included_set: set[Path], combined_spec: pathspec.PathSpec) -> list[Path]:
344+
try:
345+
result = subprocess.run(
346+
["git", "ls-files", "-z"],
347+
cwd=root_dir,
348+
capture_output=True,
349+
text=True,
350+
timeout=30,
351+
)
352+
if result.returncode == 0 and result.stdout:
353+
files = [root_dir / f for f in result.stdout.split("\0") if f]
354+
return [f for f in files if _is_candidate_file(f, root_dir, included_set, combined_spec)]
355+
except (subprocess.SubprocessError, OSError):
356+
pass
343357
return [f for f in root_dir.rglob("*") if _is_candidate_file(f, root_dir, included_set, combined_spec)]
344358

345359

src/treemapper/diffctx/config/limits.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -22,15 +22,15 @@ class PPRConfig:
2222

2323
@dataclass(frozen=True)
2424
class LexicalConfig:
25-
min_similarity: float = 0.1
25+
min_similarity: float = 0.30
2626
hub_percentile: float = 0.95
27-
top_k_neighbors: int = 10
28-
max_df_ratio: float = 0.20
29-
min_idf: float = 1.6
30-
max_postings: int = 200
31-
weight_min: float = 0.1
32-
weight_max: float = 0.2
33-
backward_factor: float = 0.7
27+
top_k_neighbors: int = 5
28+
max_df_ratio: float = 0.15
29+
min_idf: float = 2.0
30+
max_postings: int = 100
31+
weight_min: float = 0.05
32+
weight_max: float = 0.15
33+
backward_factor: float = 0.5
3434

3535

3636
@dataclass(frozen=True)

src/treemapper/diffctx/constants.py

Lines changed: 3 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -14,16 +14,6 @@
1414

1515

1616
def expand_config_key(key: str) -> set[str]:
17-
expanded: set[str] = {key}
18-
parts = key.split("_")
19-
for part in parts:
20-
if len(part) >= 3:
21-
expanded.add(part)
22-
for prefix in CONFIG_KEY_COMMON_PREFIXES:
23-
if key.startswith(prefix + "_") and len(key) > len(prefix) + 1:
24-
stripped = key[len(prefix) + 1 :]
25-
expanded.add(stripped)
26-
for sub in stripped.split("_"):
27-
if len(sub) >= 3:
28-
expanded.add(sub)
29-
return expanded
17+
if len(key) < 6:
18+
return set()
19+
return {key}

src/treemapper/diffctx/edges/config/generic.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -86,9 +86,7 @@ def discover_related_files(
8686
def _build_key_patterns(self, keys: set[str]) -> dict[str, re.Pattern[str]]:
8787
patterns: dict[str, re.Pattern[str]] = {}
8888
for key in keys:
89-
if len(key) >= 4:
90-
patterns[key] = re.compile(rf"\b\w*{re.escape(key)}\w*\b", re.IGNORECASE)
91-
else:
89+
if len(key) >= 6:
9290
patterns[key] = re.compile(rf"\b{re.escape(key)}\b", re.IGNORECASE)
9391
return patterns
9492

src/treemapper/diffctx/edges/semantic/go.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -88,7 +88,7 @@ def _build_indices(
8888
func_defs: dict[str, list[FragmentId]] = defaultdict(list)
8989

9090
for f in go_frags:
91-
pkg = _get_package_name(f.path)
91+
pkg = _get_package_name(f.path).lower()
9292
pkg_to_frags[pkg].append(f.id)
9393

9494
if repo_root:
@@ -143,7 +143,7 @@ def _link_import_by_package(
143143
pkg_to_frags: dict[str, list[FragmentId]],
144144
edges: EdgeDict,
145145
) -> None:
146-
imp_pkg = imp.split("/")[-1]
146+
imp_pkg = imp.split("/")[-1].lower()
147147
for pkg, frag_ids in pkg_to_frags.items():
148148
if pkg == imp_pkg:
149149
self.add_edges_from_ids(gf_id, frag_ids, self.import_weight, edges)
@@ -190,7 +190,7 @@ def _link_same_package(
190190
pkg_to_frags: dict[str, list[FragmentId]],
191191
edges: EdgeDict,
192192
) -> None:
193-
current_pkg = _get_package_name(gf.path)
193+
current_pkg = _get_package_name(gf.path).lower()
194194
for fid in pkg_to_frags.get(current_pkg, []):
195195
if fid != gf.id:
196196
self.add_edge(edges, gf.id, fid, self.same_package_weight)

src/treemapper/diffctx/languages.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -164,11 +164,11 @@
164164
".pyw": "python",
165165
".pyi": "python",
166166
".js": "javascript",
167-
".jsx": "javascript",
167+
".jsx": "jsx",
168168
".mjs": "javascript",
169169
".cjs": "javascript",
170170
".ts": "typescript",
171-
".tsx": "typescript",
171+
".tsx": "tsx",
172172
".mts": "typescript",
173173
".cts": "typescript",
174174
".go": "go",

src/treemapper/diffctx/parsers/base.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ def _process_char_in_string(char: str, string_char: str, escape_count: int) -> t
3535

3636

3737
def _process_char_outside_string(char: str, stack: list[str]) -> tuple[bool, str]:
38-
if char in ('"', "'"):
38+
if char in ('"', "'", "`"):
3939
return True, char
4040
if char in _BRACKET_PAIRS:
4141
stack.append(_BRACKET_PAIRS[char])

src/treemapper/diffctx/parsers/config.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -82,15 +82,16 @@ def fragment(self, path: Path, content: str) -> list[Fragment]:
8282
elif suffix == ".toml":
8383
key_re = re.compile(r"^\[([a-zA-Z_][a-zA-Z0-9_.-]*)\]")
8484
else:
85-
key_re = re.compile(r'^\s{0,2}"([^"]+)":\s*')
85+
key_re = re.compile(r'^\s{0,4}"([^"]+)":\s*')
8686

8787
boundaries: list[int] = []
8888
for i, line in enumerate(lines):
8989
if key_re.match(line):
9090
boundaries.append(i)
9191

92-
if len(boundaries) < 2:
93-
return []
92+
if not boundaries:
93+
frag = create_fragment_from_lines(path, lines, 1, len(lines), "config", "data")
94+
return [frag] if frag else []
9495

9596
fragments: list[Fragment] = []
9697
boundaries.append(len(lines))

src/treemapper/diffctx/parsers/tree_sitter.py

Lines changed: 20 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
_DEFINITION_NODE_TYPES = {
1414
"python": {"function_definition", "class_definition", "decorated_definition"},
1515
"javascript": {"function_declaration", "class_declaration", "method_definition", "arrow_function", "variable_declarator"},
16+
"jsx": {"function_declaration", "class_declaration", "method_definition", "arrow_function", "variable_declarator"},
1617
"typescript": {
1718
"function_declaration",
1819
"class_declaration",
@@ -23,6 +24,16 @@
2324
"enum_declaration",
2425
"variable_declarator",
2526
},
27+
"tsx": {
28+
"function_declaration",
29+
"class_declaration",
30+
"method_definition",
31+
"arrow_function",
32+
"interface_declaration",
33+
"type_alias_declaration",
34+
"enum_declaration",
35+
"variable_declarator",
36+
},
2637
"go": {"function_declaration", "method_declaration", "type_declaration"},
2738
"rust": {"function_item", "impl_item", "struct_item", "enum_item", "trait_item"},
2839
"java": {"method_declaration", "class_declaration", "interface_declaration", "enum_declaration"},
@@ -69,7 +80,9 @@
6980
_LANG_MODULES = {
7081
"python": "tree_sitter_python",
7182
"javascript": "tree_sitter_javascript",
83+
"jsx": "tree_sitter_javascript",
7284
"typescript": "tree_sitter_typescript",
85+
"tsx": "tree_sitter_typescript",
7386
"go": "tree_sitter_go",
7487
"rust": "tree_sitter_rust",
7588
"java": "tree_sitter_java",
@@ -98,15 +111,20 @@ def _get_parser(self, lang: str) -> Parser:
98111
module_name = _LANG_MODULES[lang]
99112
ts_lang_module = importlib.import_module(module_name)
100113

101-
if lang == "typescript":
114+
if lang == "tsx":
102115
ts_lang = ts_lang_module.language_tsx()
116+
elif lang == "typescript":
117+
ts_lang = ts_lang_module.language_typescript()
103118
elif hasattr(ts_lang_module, "language"):
104119
ts_lang = ts_lang_module.language()
105120
else:
106121
ts_lang = ts_lang_module
107122

108123
parser = Parser()
109-
parser.language = Language(ts_lang)
124+
if isinstance(ts_lang, Language):
125+
parser.language = ts_lang
126+
else:
127+
parser.language = Language(ts_lang)
110128
self._parsers[lang] = parser
111129
return parser
112130

src/treemapper/diffctx/ppr.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -69,9 +69,9 @@ def personalized_pagerank(
6969
p, scores = _initialize_ppr_scores(nodes, valid_seeds)
7070
out_sum = {}
7171
for n in nodes:
72-
weights = graph.neighbors(n).values()
73-
finite_weights = [w for w in weights if math.isfinite(w)]
74-
if len(finite_weights) < len(list(weights)):
72+
nbr_values = list(graph.neighbors(n).values())
73+
finite_weights = [w for w in nbr_values if math.isfinite(w)]
74+
if len(finite_weights) < len(nbr_values):
7575
logging.debug("Node %s has non-finite edge weights, filtering", n)
7676
total = sum(finite_weights)
7777
out_sum[n] = total if math.isfinite(total) else 0.0

0 commit comments

Comments
 (0)