fix(diffctx): correctness audit — 16 fixes across parsers, edges, and filters

nikolay-e · nikolay-e · commit 58f53772eb51 · 2026-04-06T21:30:07.000+02:00
diff --git a/src/treemapper/diffctx/config/weights.py b/src/treemapper/diffctx/config/weights.py
@@ -15,7 +15,7 @@ class EdgeWeightConfig:
     "test_direct": EdgeWeightConfig(0.60, 0.50),
     "test_naming": EdgeWeightConfig(0.50, 0.50),
     "test_reverse": EdgeWeightConfig(0.30, 1.0),
-    "config_code": EdgeWeightConfig(0.45, 0.70),
+    "config_code": EdgeWeightConfig(0.35, 0.50),
     "sibling": EdgeWeightConfig(0.05, 1.0),
     "cochange": EdgeWeightConfig(0.40, 1.0),
     "doc_structure": EdgeWeightConfig(0.30, 0.83),
diff --git a/src/treemapper/diffctx/edges/config/generic.py b/src/treemapper/diffctx/edges/config/generic.py
@@ -3,6 +3,7 @@
 import re
 from pathlib import Path
 
+from ...config.weights import EDGE_WEIGHTS
 from ...constants import CODE_EXTENSIONS, expand_config_key
 from ...types import Fragment
 from ..base import EdgeBuilder, EdgeDict
@@ -74,16 +75,22 @@
 _TOML_KEY_RE = re.compile(r"^\s*([a-zA-Z_][a-zA-Z0-9_-]*)\s*=", re.MULTILINE)
 _INI_KEY_RE = re.compile(r"^\s*([a-zA-Z_][a-zA-Z0-9_-]*)\s*=", re.MULTILINE)
 _ENV_KEY_RE = re.compile(r"^([A-Za-z_]\w*)\s*=", re.MULTILINE)
+_PROPERTIES_KEY_RE = re.compile(r"^\s*([a-zA-Z_][a-zA-Z0-9_./-]*)\s*[=:]", re.MULTILINE)
+_XML_ATTR_RE = re.compile(r"<([a-zA-Z_][\w.-]*)[>\s/]")
 
 
 def _get_patterns_for_suffix(suffix: str) -> list[re.Pattern[str]]:
-    patterns_map = {
+    patterns_map: dict[str, list[re.Pattern[str]]] = {
         ".yaml": [_CONFIG_KEY_RE],
         ".yml": [_CONFIG_KEY_RE],
         ".json": [_JSON_KEY_RE],
         ".toml": [_TOML_KEY_RE],
         ".ini": [_INI_KEY_RE],
         ".env": [_ENV_KEY_RE],
+        ".cfg": [_INI_KEY_RE],
+        ".conf": [_INI_KEY_RE],
+        ".properties": [_PROPERTIES_KEY_RE],
+        ".xml": [_XML_ATTR_RE],
     }
     return patterns_map.get(suffix, [])
 
@@ -107,8 +114,8 @@ def _is_code_file(path: Path) -> bool:
 
 
 class ConfigToCodeEdgeBuilder(EdgeBuilder):
-    weight = 0.45
-    reverse_weight_factor = 0.70
+    weight = EDGE_WEIGHTS["config_code"].forward
+    reverse_weight_factor = EDGE_WEIGHTS["config_code"].reverse_factor
     category = "config_generic"
 
     def discover_related_files(
diff --git a/src/treemapper/diffctx/edges/config/kubernetes.py b/src/treemapper/diffctx/edges/config/kubernetes.py
@@ -9,6 +9,7 @@
 
 _K8S_API_VERSION_RE = re.compile(r"^apiVersion:\s?([^\s#]{1,100})", re.MULTILINE)
 _K8S_KIND_RE = re.compile(r"^kind:\s?(\w{1,100})", re.MULTILINE)
+_K8S_METADATA_NAME_RE = re.compile(r"^metadata:\s*\n\s{2,4}name:\s?['\"]?([^'\"#\n]{1,200})", re.MULTILINE)
 _K8S_NAME_RE = re.compile(r"^\s{1,20}name:\s?['\"]?([^'\"#\n]{1,200})", re.MULTILINE)
 _K8S_NAMESPACE_RE = re.compile(r"^\s{1,20}namespace:\s?['\"]?([^'\"#\n]{1,200})", re.MULTILINE)
 
@@ -82,7 +83,7 @@ def _is_kubernetes_manifest(path: Path, content: str | None = None) -> bool:
 
 def _extract_resource_info(content: str) -> tuple[str | None, str | None, str | None]:
     kind_match = _K8S_KIND_RE.search(content)
-    name_match = _K8S_NAME_RE.search(content)
+    name_match = _K8S_METADATA_NAME_RE.search(content) or _K8S_NAME_RE.search(content)
     namespace_match = _K8S_NAMESPACE_RE.search(content)
 
     kind = kind_match.group(1).strip() if kind_match else None
diff --git a/src/treemapper/diffctx/edges/config/terraform.py b/src/treemapper/diffctx/edges/config/terraform.py
@@ -202,7 +202,7 @@ def _add_module_edges(self, f: Fragment, module_defs: dict[str, list[FragmentId]
                     self.add_edge(edges, f.id, def_id, self.weight)
 
     def _add_resource_edges(self, f: Fragment, resource_defs: dict[str, list[FragmentId]], edges: EdgeDict) -> None:
-        skip_types = {"var", "local", "data", "module", "path", "terraform"}
+        skip_types = {"var", "local", "data", "module", "path", "terraform", "each", "self", "count"}
         for match in _TF_RESOURCE_REF_RE.finditer(f.content):
             res_type, res_name, _ = match.groups()
             if res_type in skip_types:
diff --git a/src/treemapper/diffctx/edges/history/cochange.py b/src/treemapper/diffctx/edges/history/cochange.py
@@ -34,8 +34,7 @@ def build(self, fragments: list[Fragment], repo_root: Path | None = None) -> Edg
                 for fid2 in path_to_frags.get(p2, []):
                     if fid1 == fid2:
                         continue
-                    edges[(fid1, fid2)] = max(edges.get((fid1, fid2), 0.0), weight)
-                    edges[(fid2, fid1)] = max(edges.get((fid2, fid1), 0.0), weight)
+                    self.add_edge(edges, fid1, fid2, weight)
 
         return edges
 
diff --git a/src/treemapper/diffctx/edges/semantic/go.py b/src/treemapper/diffctx/edges/semantic/go.py
@@ -10,7 +10,7 @@
 
 _GO_IMPORT_SINGLE_RE = re.compile(r'^\s*import\s+"([^"]+)"', re.MULTILINE)
 _GO_IMPORT_BLOCK_RE = re.compile(r"import\s*\((.*?)\)", re.DOTALL)
-_GO_IMPORT_LINE_RE = re.compile(r'^\s*(?:\w+\s+)?"([^"]+)"', re.MULTILINE)
+_GO_IMPORT_LINE_RE = re.compile(r'^\s*(?:(?:\w+|\.)\s+)?"([^"]+)"', re.MULTILINE)
 
 _GO_FUNC_RE = re.compile(r"^func\s+(?:\([^)]+\)\s+)?(\w+)\s*\(", re.MULTILINE)
 _GO_TYPE_RE = re.compile(r"^type\s+(\w+)\s+", re.MULTILINE)
@@ -324,7 +324,7 @@ def _link_import_by_path(
         edges: EdgeDict,
     ) -> None:
         for path_str, frag_ids in path_to_frags.items():
-            if f"/{path_str}" in imp or imp == path_str or imp.endswith(f"/{path_str}"):
+            if imp == path_str or imp.endswith(f"/{path_str}") or f"/{path_str}/" in imp:
                 self.add_edges_from_ids(gf_id, frag_ids, self.import_weight, edges)
 
     def _link_refs(
diff --git a/src/treemapper/diffctx/edges/semantic/javascript_semantics.py b/src/treemapper/diffctx/edges/semantic/javascript_semantics.py
@@ -313,6 +313,10 @@ def _extract_namespace_imports(code: str, sources: set[str], names: set[str]) ->
 
 def _extract_side_effect_imports(code: str, sources: set[str]) -> None:
     for match in _SIDE_EFFECT_IMPORT_RE.finditer(code):
+        line_start = code.rfind("\n", 0, match.start()) + 1
+        line_prefix = code[line_start : match.start()].lstrip()
+        if line_prefix.startswith("//") or line_prefix.startswith("*"):
+            continue
         sources.add(match.group(1))
 
 
@@ -505,7 +509,7 @@ def analyze_javascript_fragment(code: str) -> JsFragmentInfo:
     if not code.strip():
         return _EMPTY_INFO
 
-    import_sources, _imported_names = _extract_imports_full(code)
+    import_sources, _ = _extract_imports_full(code)
     exports = _extract_exports(code)
     defines = _extract_defines(code)
     calls = _extract_calls(code)
diff --git a/src/treemapper/diffctx/edges/semantic/python.py b/src/treemapper/diffctx/edges/semantic/python.py
@@ -17,7 +17,9 @@
 _SYMBOL_REF_WEIGHT = _PY_WEIGHTS.symbol_ref
 _TYPE_REF_WEIGHT = _PY_WEIGHTS.type_ref
 
-_PY_IMPORT_RE = re.compile(r"(?:from\s{1,20}(\.{0,3}[\w.]{0,200})\s{1,20}import|import\s{1,20}([\w.]{1,200}))")
+_PY_IMPORT_RE = re.compile(
+    r"(?:from\s{1,20}(\.{0,3}[\w.]{0,200})\s{1,20}import|import\s{1,20}([\w.]{1,200}(?:\s*,\s*[\w.]{1,200})*))"
+)
 
 
 def _is_python_file(path: Path) -> bool:
@@ -56,24 +58,33 @@ def _resolve_relative_import(imported: str, source_path: Path, repo_root: Path |
     return ".".join(parent_parts) if parent_parts else None
 
 
+def _add_import_with_prefixes(imports: set[str], imported: str) -> None:
+    imports.add(imported)
+    parts = imported.split(".")
+    for i in range(1, len(parts) + 1):
+        imports.add(".".join(parts[:i]))
+
+
 def _extract_imports_from_content(content: str, source_path: Path | None = None, repo_root: Path | None = None) -> set[str]:
     imports: set[str] = set()
     for match in _PY_IMPORT_RE.finditer(content):
-        imported = match.group(1) or match.group(2)
-        if not imported:
-            continue
-
-        if imported.startswith(".") and source_path:
-            resolved = _resolve_relative_import(imported, source_path, repo_root)
-            if resolved:
-                imported = resolved
-            else:
-                continue
-
-        imports.add(imported)
-        parts = imported.split(".")
-        for i in range(1, len(parts) + 1):
-            imports.add(".".join(parts[:i]))
+        from_module = match.group(1)
+        bare_imports = match.group(2)
+
+        if from_module:
+            imported = from_module
+            if imported.startswith(".") and source_path:
+                resolved = _resolve_relative_import(imported, source_path, repo_root)
+                if resolved:
+                    imported = resolved
+                else:
+                    continue
+            _add_import_with_prefixes(imports, imported)
+        elif bare_imports:
+            for name in bare_imports.split(","):
+                name = name.strip()
+                if name:
+                    _add_import_with_prefixes(imports, name)
     return imports
 
 
diff --git a/src/treemapper/diffctx/edges/semantic/rust.py b/src/treemapper/diffctx/edges/semantic/rust.py
@@ -8,8 +8,7 @@
 from ...types import Fragment, FragmentId
 from ..base import EdgeBuilder, EdgeDict
 
-_RUST_USE_RE = re.compile(r"^\s*use\s+(?:crate::)?([a-zA-Z_]\w*(?:::[a-zA-Z_]\w*)*)", re.MULTILINE)
-_RUST_USE_BRACED_RE = re.compile(r"use\s+(?:crate::)?([\w:]+)::\{([^}]+)\}", re.MULTILINE)
+_RUST_USE_STMT_RE = re.compile(r"^\s*use\s+(.+?)\s*;", re.MULTILINE)
 _RUST_MOD_RE = re.compile(r"^\s*(?:pub(?:\([^)]*\))?\s+)?mod\s+([a-z_][a-z0-9_]*)\s*[;{]", re.MULTILINE)
 
 _RUST_FN_RE = re.compile(r"^\s*(?:pub(?:\([^)]*\))?\s+)?(?:async\s+)?fn\s+([a-z_][a-z0-9_]*)", re.MULTILINE)
@@ -124,24 +123,60 @@ def _is_rust_file(path: Path) -> bool:
     return path.suffix.lower() == ".rs"
 
 
+_MAX_USE_TREE_DEPTH = 10
+
+
+def _parse_use_tree(text: str, _depth: int = 0) -> list[str]:
+    if _depth > _MAX_USE_TREE_DEPTH:
+        return []
+    text = re.sub(r"^(?:crate|self|super)::", "", text.strip())
+    if "{" not in text:
+        return [text] if text else []
+    brace_pos = text.index("{")
+    prefix = text[:brace_pos].rstrip(":")
+    inner = text[brace_pos + 1 :]
+    depth = 1
+    end = 0
+    for i, ch in enumerate(inner):
+        if ch == "{":
+            depth += 1
+        elif ch == "}":
+            depth -= 1
+            if depth == 0:
+                end = i
+                break
+    items_str = inner[:end]
+    results: list[str] = []
+    current: list[str] = []
+    d = 0
+    for ch in items_str:
+        if ch == "{":
+            d += 1
+            current.append(ch)
+        elif ch == "}":
+            d -= 1
+            current.append(ch)
+        elif ch == "," and d == 0:
+            item = "".join(current).strip()
+            if item and item != "self":
+                results.extend(_parse_use_tree(f"{prefix}::{item}" if prefix else item, _depth + 1))
+            current = []
+        else:
+            current.append(ch)
+    item = "".join(current).strip()
+    if item and item != "self":
+        results.extend(_parse_use_tree(f"{prefix}::{item}" if prefix else item, _depth + 1))
+    return results
+
+
 def _extract_uses(content: str) -> set[str]:
     uses: set[str] = set()
-    for match in _RUST_USE_RE.finditer(content):
-        path = match.group(1)
-        uses.add(path)
-        parts = path.split("::")
-        if len(parts) > 1:
-            uses.add(parts[0])
-    for match in _RUST_USE_BRACED_RE.finditer(content):
-        base_path = match.group(1)
-        uses.add(base_path)
-        base_parts = base_path.split("::")
-        if len(base_parts) > 1:
-            uses.add(base_parts[0])
-        for name in match.group(2).split(","):
-            name = name.strip()
-            if name:
-                uses.add(name)
+    for match in _RUST_USE_STMT_RE.finditer(content):
+        for path in _parse_use_tree(match.group(1)):
+            uses.add(path)
+            parts = path.split("::")
+            if len(parts) > 1:
+                uses.add(parts[0])
     return uses
 
 
diff --git a/src/treemapper/diffctx/edges/structural/sibling.py b/src/treemapper/diffctx/edges/structural/sibling.py
@@ -57,5 +57,4 @@ def _add_sibling_edges_for_dir(self, files: set[Path], file_to_rep: dict[Path, F
                 f1_id = file_to_rep.get(f1_path)
                 f2_id = file_to_rep.get(f2_path)
                 if f1_id and f2_id:
-                    edges[(f1_id, f2_id)] = max(edges.get((f1_id, f2_id), 0.0), self.weight)
-                    edges[(f2_id, f1_id)] = max(edges.get((f2_id, f1_id), 0.0), self.weight)
+                    self.add_edge(edges, f1_id, f2_id, self.weight)
diff --git a/src/treemapper/diffctx/embeddings.py b/src/treemapper/diffctx/embeddings.py
@@ -21,7 +21,6 @@
 
 _TOP_K_NEIGHBORS = 10
 _MIN_SIMILARITY = 0.1
-_BACKWARD_WEIGHT_FACTOR = 0.7
 
 _EMBED_MODEL: SentenceTransformer | None = None
 _EMBED_LOCK = threading.Lock()
@@ -92,6 +91,5 @@ def _build_embedding_edges(
                 f2 = fragments[k_idx]
                 weight = clamp_weight_fn(score, f1.path, f2.path) * _EMBED_WEIGHT
                 edges[(f1.id, f2.id)] = max(edges.get((f1.id, f2.id), 0.0), weight)
-                edges[(f2.id, f1.id)] = max(edges.get((f2.id, f1.id), 0.0), weight * _BACKWARD_WEIGHT_FACTOR)
 
     return edges
diff --git a/src/treemapper/diffctx/filtering.py b/src/treemapper/diffctx/filtering.py
@@ -10,7 +10,7 @@
 
 logger = logging.getLogger(__name__)
 
-_PROXIMITY_FLOOR_MAX = 0.01
+_PROXIMITY_FLOOR_MAX = 0.04
 _PROXIMITY_HALF_DECAY = 50
 _DEFINITION_PROXIMITY_HALF_DECAY = 5
 _HUB_REVERSE_THRESHOLD = 2
diff --git a/src/treemapper/diffctx/git.py b/src/treemapper/diffctx/git.py
@@ -37,6 +37,8 @@ def run_git(repo_root: Path, args: list[str]) -> str:
         return result.stdout
     except subprocess.CalledProcessError as e:
         raise GitError(f"git {' '.join(args)} failed: {e.stderr.strip()}") from e
+    except subprocess.TimeoutExpired as e:
+        raise GitError(f"git {' '.join(args)} timed out after 60s") from e
     except FileNotFoundError as e:
         raise GitError("git is not installed or not in PATH") from e
 
@@ -71,15 +73,50 @@ def _parse_hunk_header(match: re.Match[str], path: Path) -> DiffHunk:
     )
 
 
+_C_ESCAPE_MAP = {"t": "\t", "n": "\n", "r": "\r", "b": "\b", "f": "\f", "v": "\v", "a": "\a", "\\": "\\", '"': '"'}
+
+
+def _unquote_c_style(quoted: str) -> str:
+    if not (quoted.startswith('"') and quoted.endswith('"')):
+        return quoted
+    raw = quoted[1:-1]
+    chars: list[str] = []
+    i = 0
+    while i < len(raw):
+        if raw[i] == "\\" and i + 1 < len(raw):
+            nxt = raw[i + 1]
+            if nxt in _C_ESCAPE_MAP:
+                chars.append(_C_ESCAPE_MAP[nxt])
+                i += 2
+            elif nxt in "01234567" and i + 3 < len(raw) and all(c in "01234567" for c in raw[i + 1 : i + 4]):
+                chars.append(chr(int(raw[i + 1 : i + 4], 8)))
+                i += 4
+            else:
+                chars.append("\\")
+                i += 1
+        else:
+            chars.append(raw[i])
+            i += 1
+    result = "".join(chars)
+    try:
+        return result.encode("latin-1").decode("utf-8")
+    except (UnicodeDecodeError, UnicodeEncodeError):
+        return result
+
+
 def _parse_path_line(line: str, repo_root: Path) -> tuple[str, Path | None]:
-    if line.startswith("--- a/"):
-        return "old", repo_root / line.removeprefix("--- a/").strip()
     if line.startswith("--- /dev/null"):
         return "old", None
-    if line.startswith("+++ b/"):
-        return "new", repo_root / line.removeprefix("+++ b/").strip()
     if line.startswith("+++ /dev/null"):
         return "new", None
+    if line.startswith("--- a/"):
+        return "old", repo_root / line.removeprefix("--- a/").strip()
+    if line.startswith("+++ b/"):
+        return "new", repo_root / line.removeprefix("+++ b/").strip()
+    if line.startswith('--- "a/'):
+        return "old", repo_root / _unquote_c_style(line.removeprefix("--- ").strip()).removeprefix("a/")
+    if line.startswith('+++ "b/'):
+        return "new", repo_root / _unquote_c_style(line.removeprefix("+++ ").strip()).removeprefix("b/")
     return "", None
 
 
diff --git a/src/treemapper/diffctx/parsers/tree_sitter.py b/src/treemapper/diffctx/parsers/tree_sitter.py
@@ -208,11 +208,14 @@ def _handle_definition_node(
 
         sym_name = self._extract_symbol_name(node)
 
-        parent = node.parent
-        if parent is not None and parent.type in ("export_statement", "decorated_definition"):
-            parent_start = parent.start_point[0] + 1
-            if parent_start < start:
-                start = parent_start
+        ancestor = node.parent
+        if ancestor is not None and ancestor.type not in ("export_statement", "decorated_definition"):
+            if ancestor.parent is not None and ancestor.parent.type in ("export_statement", "decorated_definition"):
+                ancestor = ancestor.parent
+        if ancestor is not None and ancestor.type in ("export_statement", "decorated_definition"):
+            ancestor_start = ancestor.start_point[0] + 1
+            if ancestor_start < start:
+                start = ancestor_start
 
         if kind in _CONTAINER_KINDS and self._try_container_split(
             node, code_bytes, path, lines, definition_types, fragments, covered, added_ends, depth, start, end, kind, sym_name
diff --git a/src/treemapper/diffctx/render.py b/src/treemapper/diffctx/render.py
@@ -86,7 +86,9 @@ def _create_fragment_entry(frag: Fragment, path_str: str) -> dict[str, Any]:
         "lines": f"{frag.start_line}-{frag.end_line}",
         "kind": frag.kind,
     }
-    symbol = frag.symbol_name or _extract_symbol(frag)
+    symbol = frag.symbol_name
+    if not symbol and frag.kind in _SYMBOL_PATTERNS:
+        symbol = _extract_symbol(frag)
     if symbol:
         entry["symbol"] = symbol
     if frag.content:
diff --git a/src/treemapper/diffctx/signatures.py b/src/treemapper/diffctx/signatures.py