Skip to content

Commit 2078d37

Browse files
committed
fix(diffctx): hub suppression + expansion rarity + 10 correctness fixes
- Hub suppression counts unique files not fragments (fixes rust_002)
- Expansion rarity includes already-included files (fixes julia_002)
- concepts_from_diff_text filters docs stopwords (fixes fragments_015)
- dbt YAML discovery skips source_tables (fixes dbt_004)
- Terraform strict discovery excludes generic var/local refs
- Terraform module resolution tries repo_root fallback
- Hub dampening skips changed-path hubs in reverse_deps
- Remove main.rs from Rust same_crate linking
- PPR warns on silent uniform fallback
- tree-sitter caches failed grammar imports
- select.py warns on empty candidates with budget
- parsers exc_info=True on strategy failures
- MINIMUM_AVERAGE_SCORE raised to 82.0
1 parent 38fc1f7 commit 2078d37

209 files changed

Lines changed: 10812 additions & 9289 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

src/treemapper/diffctx/config/weights.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class EdgeWeightConfig:
3939
"jvm_import": EdgeWeightConfig(0.75, 0.40),
4040
"jvm_inheritance": EdgeWeightConfig(0.80, 0.40),
4141
"jvm_type": EdgeWeightConfig(0.60, 0.40),
42-
"jvm_same_package": EdgeWeightConfig(0.55, 0.40),
42+
"jvm_same_package": EdgeWeightConfig(0.10, 0.40),
4343
"jvm_annotation": EdgeWeightConfig(0.50, 0.40),
4444
"c_include": EdgeWeightConfig(0.65, 0.40),
4545
"c_call": EdgeWeightConfig(0.55, 0.40),
@@ -58,15 +58,15 @@ class EdgeWeightConfig:
5858
"php_require": EdgeWeightConfig(0.60, 0.40),
5959
"php_inheritance": EdgeWeightConfig(0.75, 0.40),
6060
"php_type": EdgeWeightConfig(0.55, 0.40),
61-
"php_same_namespace": EdgeWeightConfig(0.50, 0.40),
61+
"php_same_namespace": EdgeWeightConfig(0.10, 0.40),
6262
"shell_source": EdgeWeightConfig(0.60, 0.35),
6363
"shell_script": EdgeWeightConfig(0.50, 0.35),
6464
"swift_import": EdgeWeightConfig(0.65, 0.40),
6565
"swift_conformance": EdgeWeightConfig(0.70, 0.40),
6666
"swift_extension": EdgeWeightConfig(0.65, 0.40),
6767
"swift_type": EdgeWeightConfig(0.60, 0.40),
6868
"swift_func": EdgeWeightConfig(0.55, 0.40),
69-
"swift_same_module": EdgeWeightConfig(0.45, 0.40),
69+
"swift_same_module": EdgeWeightConfig(0.15, 0.40),
7070
"zig_import": EdgeWeightConfig(0.65, 0.40),
7171
"zig_type": EdgeWeightConfig(0.60, 0.40),
7272
"zig_fn": EdgeWeightConfig(0.55, 0.40),

src/treemapper/diffctx/edges/config/terraform.py

Lines changed: 102 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
_TF_VAR_REF_RE = re.compile(r"var\.(\w+)")
2121
_TF_LOCAL_REF_RE = re.compile(r"local\.(\w+)")
2222
_TF_DATA_REF_RE = re.compile(r"data\.(\w+)\.(\w+)")
23-
_TF_RESOURCE_REF_RE = re.compile(r"(?<![.\w])(\w+)\.(\w+)\.(\w+)")
23+
_TF_RESOURCE_REF_RE = re.compile(r'(?<![.\w"])(\w+)\.(\w+)\.(\w+)')
2424
_TF_MODULE_REF_RE = re.compile(r"module\.(\w+)")
2525

2626
_TF_SOURCE_RE = re.compile(r'^\s*source\s*=\s*"([^"]+)"', re.MULTILINE)
@@ -74,15 +74,89 @@ def _collect_tf_dirs_and_sources(tf_files: list[Path]) -> tuple[set[Path], set[s
7474
return tf_dirs, module_sources
7575

7676

77-
def _is_in_module(candidate: Path, module_sources: set[str], tf_dirs: set[Path]) -> bool:
77+
def _is_in_module(candidate: Path, module_sources: set[str], tf_dirs: set[Path], repo_root: Path | None = None) -> bool:
7878
for src in module_sources:
7979
for tf_dir in tf_dirs:
8080
try:
8181
module_path = (tf_dir / src).resolve()
8282
if candidate.is_relative_to(module_path):
8383
return True
8484
except (ValueError, OSError):
85-
continue
85+
pass
86+
if repo_root:
87+
try:
88+
clean_src = src.lstrip("./")
89+
module_path = (repo_root / clean_src).resolve()
90+
if candidate.is_relative_to(module_path):
91+
return True
92+
except (ValueError, OSError):
93+
pass
94+
return False
95+
96+
97+
def _extract_qualified_defs(content: str) -> set[str]:
98+
defs: set[str] = set()
99+
for match in _TF_VARIABLE_RE.finditer(content):
100+
defs.add(match.group(1))
101+
for match in _TF_RESOURCE_RE.finditer(content):
102+
defs.add(f"{match.group(1)}.{match.group(2)}")
103+
for match in _TF_DATA_RE.finditer(content):
104+
defs.add(f"{match.group(1)}.{match.group(2)}")
105+
for local_key in _extract_locals(content):
106+
defs.add(local_key)
107+
for match in _TF_MODULE_RE.finditer(content):
108+
defs.add(match.group(1))
109+
return defs
110+
111+
112+
_TF_GENERIC_NAMES = frozenset(
113+
{
114+
"name",
115+
"region",
116+
"tags",
117+
"environment",
118+
"env",
119+
"description",
120+
"enabled",
121+
"type",
122+
"value",
123+
"default",
124+
"count",
125+
"id",
126+
"arn",
127+
"vpc_id",
128+
"subnet_id",
129+
"key",
130+
"project",
131+
"owner",
132+
"stage",
133+
}
134+
)
135+
136+
137+
def _candidate_references_changed_defs_strict(content: str, changed_defs: set[str]) -> bool:
138+
for match in _TF_VAR_REF_RE.finditer(content):
139+
name = match.group(1)
140+
if name in changed_defs and name not in _TF_GENERIC_NAMES:
141+
return True
142+
for match in _TF_LOCAL_REF_RE.finditer(content):
143+
name = match.group(1)
144+
if name in changed_defs and name not in _TF_GENERIC_NAMES:
145+
return True
146+
for match in _TF_DATA_REF_RE.finditer(content):
147+
data_type, data_name = match.group(1), match.group(2)
148+
if f"{data_type}.{data_name}" in changed_defs or data_name in changed_defs:
149+
return True
150+
for match in _TF_MODULE_REF_RE.finditer(content):
151+
if match.group(1) in changed_defs:
152+
return True
153+
skip_types = {"var", "local", "data", "module", "path", "terraform", "each", "self", "count"}
154+
for match in _TF_RESOURCE_REF_RE.finditer(content):
155+
res_type, res_name, _ = match.groups()
156+
if res_type in skip_types:
157+
continue
158+
if f"{res_type}.{res_name}" in changed_defs or res_name in changed_defs:
159+
return True
86160
return False
87161

88162

@@ -117,13 +191,37 @@ def discover_related_files(
117191
return []
118192

119193
tf_dirs, module_sources = _collect_tf_dirs_and_sources(tf_files)
194+
195+
changed_defs: set[str] = set()
196+
changed_contents: list[str] = []
197+
for tf in tf_files:
198+
try:
199+
content = tf.read_text(encoding="utf-8")
200+
changed_defs.update(_extract_qualified_defs(content))
201+
changed_contents.append(content)
202+
except (OSError, UnicodeDecodeError):
203+
pass
204+
120205
changed_set = set(changed_files)
121206
discovered: list[Path] = []
122207

123208
for candidate in all_candidate_files:
124209
if candidate in changed_set or not _is_terraform_file(candidate):
125210
continue
126-
if candidate.parent in tf_dirs or _is_in_module(candidate, module_sources, tf_dirs):
211+
if _is_in_module(candidate, module_sources, tf_dirs, repo_root):
212+
discovered.append(candidate)
213+
continue
214+
if candidate.parent not in tf_dirs:
215+
continue
216+
try:
217+
content = candidate.read_text(encoding="utf-8")
218+
except (OSError, UnicodeDecodeError):
219+
continue
220+
if _candidate_references_changed_defs_strict(content, changed_defs):
221+
discovered.append(candidate)
222+
continue
223+
candidate_defs = _extract_qualified_defs(content)
224+
if candidate_defs and any(_candidate_references_changed_defs_strict(c, candidate_defs) for c in changed_contents):
127225
discovered.append(candidate)
128226

129227
return discovered

src/treemapper/diffctx/edges/history/cochange.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def build(self, fragments: list[Fragment], repo_root: Path | None = None) -> Edg
4141
def _get_git_log_files(self, repo_root: Path) -> list[list[str]] | None:
4242
try:
4343
result = subprocess.run(
44-
["git", "-C", str(repo_root), "log", "--name-only", "--format=", f"-n{COCHANGE.commits_limit}"],
44+
["git", "-C", str(repo_root), "log", "--name-only", "--pretty=format:", f"-n{COCHANGE.commits_limit}"],
4545
capture_output=True,
4646
encoding="utf-8",
4747
errors="replace",

src/treemapper/diffctx/edges/semantic/dbt.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,6 @@ def _collect_refs_from_yaml(self, content: str, refs: set[str], source_tables: s
179179
for name in _extract_schema_model_names(content):
180180
refs.add(name.lower())
181181
refs.add(f"{name}.sql")
182-
source_tables.update(_extract_source_table_names(content))
183182

184183
def discover_related_files(
185184
self,

src/treemapper/diffctx/edges/semantic/julia.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,6 @@ def _collect_julia_refs(julia_files: list[Path]) -> set[str]:
230230
for module in _extract_usings(content):
231231
refs.add(_module_to_path(module))
232232
refs.add(_module_leaf(module))
233-
refs.update(_extract_includes(content))
234233
return refs
235234

236235

@@ -342,7 +341,6 @@ def _add_fragment_edges(
342341
edges: EdgeDict,
343342
) -> None:
344343
self._link_usings(jf, idx, edges)
345-
self._link_includes(jf, idx, edges)
346344
self._link_supertypes(jf, type_defs, edges)
347345

348346
self_funcs, self_types = _extract_definitions(jf.content)

src/treemapper/diffctx/edges/semantic/jvm.py

Lines changed: 88 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616
_JAVA_IMPORT_RE = re.compile(r"^\s*import\s+(?:static\s+)?([a-z][a-z0-9_]*(?:\.[a-z][a-z0-9_]*)*(?:\.[A-Z]\w*)?)", re.MULTILINE)
1717
_JAVA_PACKAGE_RE = re.compile(r"^\s*package\s+([a-z][a-z0-9_]*(?:\.[a-z][a-z0-9_]*)*)", re.MULTILINE)
1818
_JAVA_CLASS_RE = re.compile(
19-
r"^\s*(?:public |private |protected )?(?:abstract |final )?(?:class|interface|enum|record)\s+([A-Z]\w*)", re.MULTILINE
19+
r"^\s*(?:(?:public|private|protected|static|abstract|final|sealed|non-sealed|strictfp)\s+)*(?:class|interface|enum|record)\s+([A-Z]\w*)",
20+
re.MULTILINE,
2021
)
2122
_JAVA_EXTENDS_RE = re.compile(r"\bextends\s+([A-Z]\w*(?:\s*,\s*[A-Z]\w*)*)")
2223
_JAVA_IMPLEMENTS_RE = re.compile(r"\bimplements\s+([A-Z]\w*(?:\s*,\s*[A-Z]\w*)*)")
@@ -44,6 +45,76 @@
4445

4546
_ANNOTATION_RE = re.compile(r"@([A-Z]\w*)")
4647

48+
_JVM_STDLIB_TYPES: frozenset[str] = frozenset(
49+
{
50+
"String",
51+
"Integer",
52+
"Long",
53+
"Double",
54+
"Float",
55+
"Boolean",
56+
"Byte",
57+
"Short",
58+
"Character",
59+
"Object",
60+
"Class",
61+
"System",
62+
"Math",
63+
"Collections",
64+
"Arrays",
65+
"Optional",
66+
"HashMap",
67+
"ArrayList",
68+
"LinkedList",
69+
"Iterator",
70+
"Iterable",
71+
"Comparable",
72+
"Runnable",
73+
"Thread",
74+
"Exception",
75+
"RuntimeException",
76+
"IllegalArgumentException",
77+
"IllegalStateException",
78+
"NullPointerException",
79+
"IndexOutOfBoundsException",
80+
"IOException",
81+
"InputStream",
82+
"OutputStream",
83+
"StringBuilder",
84+
"StringBuffer",
85+
"Number",
86+
"Enum",
87+
"Void",
88+
"Override",
89+
"Unit",
90+
"Any",
91+
"AnyVal",
92+
"AnyRef",
93+
"Nothing",
94+
"Option",
95+
"Some",
96+
"Either",
97+
"Left",
98+
"Right",
99+
"Try",
100+
"Success",
101+
"Failure",
102+
"Future",
103+
"Promise",
104+
"Seq",
105+
"Vector",
106+
"Map",
107+
"Set",
108+
"Tuple",
109+
"Function",
110+
"Product",
111+
"Serializable",
112+
"Pair",
113+
"Triple",
114+
"Sequence",
115+
}
116+
)
117+
47118

48119
def _is_jvm_file(path: Path) -> bool:
49120
return path.suffix.lower() in _JVM_EXTS
@@ -262,17 +333,22 @@ def _link_refs(
262333
class_to_frags: dict[str, list[FragmentId]],
263334
edges: EdgeDict,
264335
) -> None:
265-
ref_weights = [
266-
(_extract_inheritance(jf.content, jf.path), self.inheritance_weight),
267-
(_extract_type_refs(jf.content), self.type_weight),
268-
(_extract_annotations(jf.content), self.annotation_weight),
269-
]
270-
271-
for refs, weight in ref_weights:
272-
for ref in refs:
273-
for fid in class_to_frags.get(ref.lower(), []):
274-
if fid != jf.id:
275-
self.add_edge(edges, jf.id, fid, weight)
336+
for inh_ref in _extract_inheritance(jf.content, jf.path):
337+
for fid in class_to_frags.get(inh_ref.lower(), []):
338+
if fid != jf.id:
339+
self.add_edge(edges, jf.id, fid, self.inheritance_weight)
340+
341+
for type_ref in _extract_type_refs(jf.content):
342+
if type_ref in _JVM_STDLIB_TYPES:
343+
continue
344+
for fid in class_to_frags.get(type_ref.lower(), []):
345+
if fid != jf.id:
346+
self.add_edge(edges, jf.id, fid, self.type_weight)
347+
348+
for ann_ref in _extract_annotations(jf.content):
349+
for fid in class_to_frags.get(ann_ref.lower(), []):
350+
if fid != jf.id:
351+
self.add_edge(edges, jf.id, fid, self.annotation_weight)
276352

277353
def _link_same_package(
278354
self,

src/treemapper/diffctx/edges/semantic/nim.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212

1313
_NIM_IMPORT_RE = re.compile(r"^\s*import\s+([\w/]+)", re.MULTILINE)
1414
_NIM_FROM_IMPORT_RE = re.compile(r"^\s*from\s+([\w/]+)\s+import", re.MULTILINE)
15-
_NIM_INCLUDE_RE = re.compile(r"^\s*include\s+([\w/]+)", re.MULTILINE)
1615

1716
_PROC_RE = re.compile(
1817
r"^\s*(?:proc|func|method|iterator|converter|template|macro)\s+([a-zA-Z_]\w*)\s*(?:\*\s*)?[(\[]",
@@ -168,8 +167,6 @@ def _extract_refs(content: str) -> set[str]:
168167
refs.add(m.group(1))
169168
for m in _NIM_FROM_IMPORT_RE.finditer(content):
170169
refs.add(m.group(1))
171-
for m in _NIM_INCLUDE_RE.finditer(content):
172-
refs.add(m.group(1))
173170
return refs
174171

175172

src/treemapper/diffctx/edges/semantic/rust.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ def _link_same_crate(
378378
rust_frags: list[Fragment],
379379
edges: EdgeDict,
380380
) -> None:
381-
if rf.path.stem.lower() not in {"lib", "main", "mod"}:
381+
if rf.path.stem.lower() not in {"lib", "mod"}:
382382
return
383383
parent_dir = rf.path.parent
384384
for f in rust_frags:

src/treemapper/diffctx/edges/semantic/swift.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,8 @@ def _add_dot_call_edges(self, sf_id: FragmentId, dot_calls: set[tuple[str, str]]
290290

291291
def _add_same_module_edges(self, sf: Fragment, idx: _SwiftIndex, edges: EdgeDict) -> None:
292292
current_module = sf.path.parent.name.lower()
293+
if not current_module:
294+
return
293295
for fid in idx.module_to_frags.get(current_module, []):
294296
if fid != sf.id:
295297
self.add_edge(edges, sf.id, fid, self.same_module_weight)

src/treemapper/diffctx/embeddings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def _get_embed_model() -> SentenceTransformer | None:
4747
_EMBED_AVAILABLE = False
4848
return None
4949
except Exception as e:
50-
logger.debug("diffctx: failed to load embedding model: %s", e)
50+
logger.warning("diffctx: failed to load embedding model %s: %s", _EMBED_MODEL_NAME, e)
5151
_EMBED_AVAILABLE = False
5252
return None
5353
return _EMBED_MODEL

0 commit comments

Comments
 (0)