Skip to content

Commit 58f5377

Browse files
committed
fix(diffctx): correctness audit — 16 fixes across parsers, edges, and filters
1 parent e85287c commit 58f5377

16 files changed

Lines changed: 160 additions & 65 deletions

File tree

src/treemapper/diffctx/config/weights.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ class EdgeWeightConfig:
1515
"test_direct": EdgeWeightConfig(0.60, 0.50),
1616
"test_naming": EdgeWeightConfig(0.50, 0.50),
1717
"test_reverse": EdgeWeightConfig(0.30, 1.0),
18-
"config_code": EdgeWeightConfig(0.45, 0.70),
18+
"config_code": EdgeWeightConfig(0.35, 0.50),
1919
"sibling": EdgeWeightConfig(0.05, 1.0),
2020
"cochange": EdgeWeightConfig(0.40, 1.0),
2121
"doc_structure": EdgeWeightConfig(0.30, 0.83),

src/treemapper/diffctx/edges/config/generic.py

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import re
44
from pathlib import Path
55

6+
from ...config.weights import EDGE_WEIGHTS
67
from ...constants import CODE_EXTENSIONS, expand_config_key
78
from ...types import Fragment
89
from ..base import EdgeBuilder, EdgeDict
@@ -74,16 +75,22 @@
7475
_TOML_KEY_RE = re.compile(r"^\s*([a-zA-Z_][a-zA-Z0-9_-]*)\s*=", re.MULTILINE)
7576
_INI_KEY_RE = re.compile(r"^\s*([a-zA-Z_][a-zA-Z0-9_-]*)\s*=", re.MULTILINE)
7677
_ENV_KEY_RE = re.compile(r"^([A-Za-z_]\w*)\s*=", re.MULTILINE)
78+
_PROPERTIES_KEY_RE = re.compile(r"^\s*([a-zA-Z_][a-zA-Z0-9_./-]*)\s*[=:]", re.MULTILINE)
79+
_XML_ATTR_RE = re.compile(r"<([a-zA-Z_][\w.-]*)[>\s/]")
7780

7881

7982
def _get_patterns_for_suffix(suffix: str) -> list[re.Pattern[str]]:
80-
patterns_map = {
83+
patterns_map: dict[str, list[re.Pattern[str]]] = {
8184
".yaml": [_CONFIG_KEY_RE],
8285
".yml": [_CONFIG_KEY_RE],
8386
".json": [_JSON_KEY_RE],
8487
".toml": [_TOML_KEY_RE],
8588
".ini": [_INI_KEY_RE],
8689
".env": [_ENV_KEY_RE],
90+
".cfg": [_INI_KEY_RE],
91+
".conf": [_INI_KEY_RE],
92+
".properties": [_PROPERTIES_KEY_RE],
93+
".xml": [_XML_ATTR_RE],
8794
}
8895
return patterns_map.get(suffix, [])
8996

@@ -107,8 +114,8 @@ def _is_code_file(path: Path) -> bool:
107114

108115

109116
class ConfigToCodeEdgeBuilder(EdgeBuilder):
110-
weight = 0.45
111-
reverse_weight_factor = 0.70
117+
weight = EDGE_WEIGHTS["config_code"].forward
118+
reverse_weight_factor = EDGE_WEIGHTS["config_code"].reverse_factor
112119
category = "config_generic"
113120

114121
def discover_related_files(

src/treemapper/diffctx/edges/config/kubernetes.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99

1010
_K8S_API_VERSION_RE = re.compile(r"^apiVersion:\s?([^\s#]{1,100})", re.MULTILINE)
1111
_K8S_KIND_RE = re.compile(r"^kind:\s?(\w{1,100})", re.MULTILINE)
12+
_K8S_METADATA_NAME_RE = re.compile(r"^metadata:\s*\n\s{2,4}name:\s?['\"]?([^'\"#\n]{1,200})", re.MULTILINE)
1213
_K8S_NAME_RE = re.compile(r"^\s{1,20}name:\s?['\"]?([^'\"#\n]{1,200})", re.MULTILINE)
1314
_K8S_NAMESPACE_RE = re.compile(r"^\s{1,20}namespace:\s?['\"]?([^'\"#\n]{1,200})", re.MULTILINE)
1415

@@ -82,7 +83,7 @@ def _is_kubernetes_manifest(path: Path, content: str | None = None) -> bool:
8283

8384
def _extract_resource_info(content: str) -> tuple[str | None, str | None, str | None]:
8485
kind_match = _K8S_KIND_RE.search(content)
85-
name_match = _K8S_NAME_RE.search(content)
86+
name_match = _K8S_METADATA_NAME_RE.search(content) or _K8S_NAME_RE.search(content)
8687
namespace_match = _K8S_NAMESPACE_RE.search(content)
8788

8889
kind = kind_match.group(1).strip() if kind_match else None

src/treemapper/diffctx/edges/config/terraform.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -202,7 +202,7 @@ def _add_module_edges(self, f: Fragment, module_defs: dict[str, list[FragmentId]
202202
self.add_edge(edges, f.id, def_id, self.weight)
203203

204204
def _add_resource_edges(self, f: Fragment, resource_defs: dict[str, list[FragmentId]], edges: EdgeDict) -> None:
205-
skip_types = {"var", "local", "data", "module", "path", "terraform"}
205+
skip_types = {"var", "local", "data", "module", "path", "terraform", "each", "self", "count"}
206206
for match in _TF_RESOURCE_REF_RE.finditer(f.content):
207207
res_type, res_name, _ = match.groups()
208208
if res_type in skip_types:

src/treemapper/diffctx/edges/history/cochange.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -34,8 +34,7 @@ def build(self, fragments: list[Fragment], repo_root: Path | None = None) -> Edg
3434
for fid2 in path_to_frags.get(p2, []):
3535
if fid1 == fid2:
3636
continue
37-
edges[(fid1, fid2)] = max(edges.get((fid1, fid2), 0.0), weight)
38-
edges[(fid2, fid1)] = max(edges.get((fid2, fid1), 0.0), weight)
37+
self.add_edge(edges, fid1, fid2, weight)
3938

4039
return edges
4140

src/treemapper/diffctx/edges/semantic/go.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010

1111
_GO_IMPORT_SINGLE_RE = re.compile(r'^\s*import\s+"([^"]+)"', re.MULTILINE)
1212
_GO_IMPORT_BLOCK_RE = re.compile(r"import\s*\((.*?)\)", re.DOTALL)
13-
_GO_IMPORT_LINE_RE = re.compile(r'^\s*(?:\w+\s+)?"([^"]+)"', re.MULTILINE)
13+
_GO_IMPORT_LINE_RE = re.compile(r'^\s*(?:(?:\w+|\.)\s+)?"([^"]+)"', re.MULTILINE)
1414

1515
_GO_FUNC_RE = re.compile(r"^func\s+(?:\([^)]+\)\s+)?(\w+)\s*\(", re.MULTILINE)
1616
_GO_TYPE_RE = re.compile(r"^type\s+(\w+)\s+", re.MULTILINE)
@@ -324,7 +324,7 @@ def _link_import_by_path(
324324
edges: EdgeDict,
325325
) -> None:
326326
for path_str, frag_ids in path_to_frags.items():
327-
if f"/{path_str}" in imp or imp == path_str or imp.endswith(f"/{path_str}"):
327+
if imp == path_str or imp.endswith(f"/{path_str}") or f"/{path_str}/" in imp:
328328
self.add_edges_from_ids(gf_id, frag_ids, self.import_weight, edges)
329329

330330
def _link_refs(

src/treemapper/diffctx/edges/semantic/javascript_semantics.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -313,6 +313,10 @@ def _extract_namespace_imports(code: str, sources: set[str], names: set[str]) ->
313313

314314
def _extract_side_effect_imports(code: str, sources: set[str]) -> None:
    """Collect module specifiers of side-effect imports found in *code*.

    Every match of ``_SIDE_EFFECT_IMPORT_RE`` contributes its first capture
    group to *sources* (mutated in place), unless the text that precedes the
    match on the same line starts, after leading whitespace, with ``//`` or
    ``*``.  Those two prefixes are treated as comment markers so that import
    statements quoted inside comments are not recorded.

    Args:
        code: Source text to scan.
        sources: Set that receives the captured module specifiers.
    """
    for hit in _SIDE_EFFECT_IMPORT_RE.finditer(code):
        begin = hit.start()
        # rfind returns -1 when there is no earlier newline, which makes the
        # beginning of the line index 0.
        bol = code.rfind("\n", 0, begin) + 1
        leading = code[bol:begin].lstrip()
        if not leading.startswith(("//", "*")):
            sources.add(hit.group(1))
317321

318322

@@ -505,7 +509,7 @@ def analyze_javascript_fragment(code: str) -> JsFragmentInfo:
505509
if not code.strip():
506510
return _EMPTY_INFO
507511

508-
import_sources, _imported_names = _extract_imports_full(code)
512+
import_sources, _ = _extract_imports_full(code)
509513
exports = _extract_exports(code)
510514
defines = _extract_defines(code)
511515
calls = _extract_calls(code)

src/treemapper/diffctx/edges/semantic/python.py

Lines changed: 27 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,9 @@
1717
_SYMBOL_REF_WEIGHT = _PY_WEIGHTS.symbol_ref
1818
_TYPE_REF_WEIGHT = _PY_WEIGHTS.type_ref
1919

20-
_PY_IMPORT_RE = re.compile(r"(?:from\s{1,20}(\.{0,3}[\w.]{0,200})\s{1,20}import|import\s{1,20}([\w.]{1,200}))")
20+
_PY_IMPORT_RE = re.compile(
21+
r"(?:from\s{1,20}(\.{0,3}[\w.]{0,200})\s{1,20}import|import\s{1,20}([\w.]{1,200}(?:\s*,\s*[\w.]{1,200})*))"
22+
)
2123

2224

2325
def _is_python_file(path: Path) -> bool:
@@ -56,24 +58,33 @@ def _resolve_relative_import(imported: str, source_path: Path, repo_root: Path |
5658
return ".".join(parent_parts) if parent_parts else None
5759

5860

61+
def _add_import_with_prefixes(imports: set[str], imported: str) -> None:
62+
imports.add(imported)
63+
parts = imported.split(".")
64+
for i in range(1, len(parts) + 1):
65+
imports.add(".".join(parts[:i]))
66+
67+
5968
def _extract_imports_from_content(content: str, source_path: Path | None = None, repo_root: Path | None = None) -> set[str]:
6069
imports: set[str] = set()
6170
for match in _PY_IMPORT_RE.finditer(content):
62-
imported = match.group(1) or match.group(2)
63-
if not imported:
64-
continue
65-
66-
if imported.startswith(".") and source_path:
67-
resolved = _resolve_relative_import(imported, source_path, repo_root)
68-
if resolved:
69-
imported = resolved
70-
else:
71-
continue
72-
73-
imports.add(imported)
74-
parts = imported.split(".")
75-
for i in range(1, len(parts) + 1):
76-
imports.add(".".join(parts[:i]))
71+
from_module = match.group(1)
72+
bare_imports = match.group(2)
73+
74+
if from_module:
75+
imported = from_module
76+
if imported.startswith(".") and source_path:
77+
resolved = _resolve_relative_import(imported, source_path, repo_root)
78+
if resolved:
79+
imported = resolved
80+
else:
81+
continue
82+
_add_import_with_prefixes(imports, imported)
83+
elif bare_imports:
84+
for name in bare_imports.split(","):
85+
name = name.strip()
86+
if name:
87+
_add_import_with_prefixes(imports, name)
7788
return imports
7889

7990

src/treemapper/diffctx/edges/semantic/rust.py

Lines changed: 53 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,7 @@
88
from ...types import Fragment, FragmentId
99
from ..base import EdgeBuilder, EdgeDict
1010

11-
_RUST_USE_RE = re.compile(r"^\s*use\s+(?:crate::)?([a-zA-Z_]\w*(?:::[a-zA-Z_]\w*)*)", re.MULTILINE)
12-
_RUST_USE_BRACED_RE = re.compile(r"use\s+(?:crate::)?([\w:]+)::\{([^}]+)\}", re.MULTILINE)
11+
_RUST_USE_STMT_RE = re.compile(r"^\s*use\s+(.+?)\s*;", re.MULTILINE)
1312
_RUST_MOD_RE = re.compile(r"^\s*(?:pub(?:\([^)]*\))?\s+)?mod\s+([a-z_][a-z0-9_]*)\s*[;{]", re.MULTILINE)
1413

1514
_RUST_FN_RE = re.compile(r"^\s*(?:pub(?:\([^)]*\))?\s+)?(?:async\s+)?fn\s+([a-z_][a-z0-9_]*)", re.MULTILINE)
@@ -124,24 +123,60 @@ def _is_rust_file(path: Path) -> bool:
124123
return path.suffix.lower() == ".rs"
125124

126125

126+
_MAX_USE_TREE_DEPTH = 10
127+
128+
129+
def _parse_use_tree(text: str, _depth: int = 0) -> list[str]:
130+
if _depth > _MAX_USE_TREE_DEPTH:
131+
return []
132+
text = re.sub(r"^(?:crate|self|super)::", "", text.strip())
133+
if "{" not in text:
134+
return [text] if text else []
135+
brace_pos = text.index("{")
136+
prefix = text[:brace_pos].rstrip(":")
137+
inner = text[brace_pos + 1 :]
138+
depth = 1
139+
end = 0
140+
for i, ch in enumerate(inner):
141+
if ch == "{":
142+
depth += 1
143+
elif ch == "}":
144+
depth -= 1
145+
if depth == 0:
146+
end = i
147+
break
148+
items_str = inner[:end]
149+
results: list[str] = []
150+
current: list[str] = []
151+
d = 0
152+
for ch in items_str:
153+
if ch == "{":
154+
d += 1
155+
current.append(ch)
156+
elif ch == "}":
157+
d -= 1
158+
current.append(ch)
159+
elif ch == "," and d == 0:
160+
item = "".join(current).strip()
161+
if item and item != "self":
162+
results.extend(_parse_use_tree(f"{prefix}::{item}" if prefix else item, _depth + 1))
163+
current = []
164+
else:
165+
current.append(ch)
166+
item = "".join(current).strip()
167+
if item and item != "self":
168+
results.extend(_parse_use_tree(f"{prefix}::{item}" if prefix else item, _depth + 1))
169+
return results
170+
171+
127172
def _extract_uses(content: str) -> set[str]:
128173
uses: set[str] = set()
129-
for match in _RUST_USE_RE.finditer(content):
130-
path = match.group(1)
131-
uses.add(path)
132-
parts = path.split("::")
133-
if len(parts) > 1:
134-
uses.add(parts[0])
135-
for match in _RUST_USE_BRACED_RE.finditer(content):
136-
base_path = match.group(1)
137-
uses.add(base_path)
138-
base_parts = base_path.split("::")
139-
if len(base_parts) > 1:
140-
uses.add(base_parts[0])
141-
for name in match.group(2).split(","):
142-
name = name.strip()
143-
if name:
144-
uses.add(name)
174+
for match in _RUST_USE_STMT_RE.finditer(content):
175+
for path in _parse_use_tree(match.group(1)):
176+
uses.add(path)
177+
parts = path.split("::")
178+
if len(parts) > 1:
179+
uses.add(parts[0])
145180
return uses
146181

147182

src/treemapper/diffctx/edges/structural/sibling.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -57,5 +57,4 @@ def _add_sibling_edges_for_dir(self, files: set[Path], file_to_rep: dict[Path, F
5757
f1_id = file_to_rep.get(f1_path)
5858
f2_id = file_to_rep.get(f2_path)
5959
if f1_id and f2_id:
60-
edges[(f1_id, f2_id)] = max(edges.get((f1_id, f2_id), 0.0), self.weight)
61-
edges[(f2_id, f1_id)] = max(edges.get((f2_id, f1_id), 0.0), self.weight)
60+
self.add_edge(edges, f1_id, f2_id, self.weight)

0 commit comments

Comments
 (0)