Skip to content

Commit 047d462

Browse files
committed
fix: deep audit — correctness, dead code, perf, and edge builder fixes
1 parent 5970a1f commit 047d462

29 files changed

Lines changed: 563 additions & 294 deletions

.pre-commit-config.yaml

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,8 @@ repos:
7979
name: Lint GitHub Actions
8080

8181
# ============================================================================
82-
# CODE FORMATTING (Order: isort → black to avoid conflicts)
82+
# CODE FORMATTING
8383
# ============================================================================
84-
- repo: https://github.com/pycqa/isort
85-
rev: 7.0.0
86-
hooks:
87-
- id: isort
88-
8984
- repo: https://github.com/psf/black
9085
rev: 26.1.0
9186
hooks:
@@ -97,7 +92,7 @@ repos:
9792
hooks:
9893
- id: pyupgrade
9994
name: pyupgrade (auto-upgrade Python syntax)
100-
args: ['--py312-plus']
95+
args: ['--py310-plus']
10196
files: ^src/.*\.py$
10297

10398
# ============================================================================

.secrets.baseline

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,9 +139,9 @@
139139
"filename": ".pre-commit-config.yaml",
140140
"hashed_secret": "861d9d607b1513c2096a6c1495043e2cc31a8f41",
141141
"is_verified": false,
142-
"line_number": 161
142+
"line_number": 156
143143
}
144144
]
145145
},
146-
"generated_at": "2026-03-15T15:14:51Z"
146+
"generated_at": "2026-03-21T11:42:35Z"
147147
}

pyproject.toml

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -52,45 +52,44 @@ classifiers = [
5252
]
5353
dynamic = [ "version" ] # Version is still managed in version.py
5454
dependencies = [
55-
"lxml>=5.0,<7.0",
56-
"mistune>=3.0,<4.0",
5755
"pathspec>=0.11,<2.0",
58-
"pyyaml>=6.0.2,<8.0",
59-
"ruamel-yaml>=0.18,<1.0",
6056
"tiktoken>=0.7,<1.0",
6157
]
6258
optional-dependencies.dev = [
63-
"autoflake>=2.0,<3.0",
6459
"black>=23.0.0,<27.0",
6560
# Build and release
6661
"build>=0.10,<2.0",
6762
"coverage>=7.0,<8.0",
6863
"hypothesis>=6.0,<7.0",
6964
"import-linter>=2.0,<3.0",
7065
"isort>=5.12,<9.0",
66+
"lxml>=5.0,<7.0",
67+
"mistune>=3.0,<4.0",
7168
"mutmut>=3.5,<4.0",
7269
"mypy>=1.0,<2.0",
7370
"pre-commit>=3.0,<5.0",
7471
"pyinstaller>=5.0,<7.0",
7572
"pytest>=7.0,<10.0",
7673
"pytest-cov>=3.0,<8.0",
7774
"pytest-xdist>=3.0,<4.0",
75+
"pyyaml>=6.0.2,<8.0",
7876
"radon>=6.0,<7.0",
77+
"ruamel-yaml>=0.18,<1.0",
7978
# Quality checks
8079
"ruff>=0.4,<1.0",
8180
"treemapper[tree-sitter]",
82-
"twine>=4.0,<7.0",
8381
# Type stubs
8482
"types-pyyaml>=6.0,<7.0",
85-
# Security fixes
86-
"urllib3>=2.6.0",
8783
]
8884
optional-dependencies.embeddings = [
8985
# Semantic code embeddings for diffctx
9086
"sentence-transformers>=3.0,<6.0",
9187
]
9288
optional-dependencies.full = [
89+
"lxml>=5.0,<7.0",
90+
"mistune>=3.0,<4.0",
9391
"pysbd>=0.3,<1.0",
92+
"ruamel-yaml>=0.18,<1.0",
9493
"treemapper[tree-sitter]",
9594
]
9695
optional-dependencies.nlp = [

src/treemapper/diffctx/__init__.py

Lines changed: 27 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ def _truncate_generated_fragments(file_frags: list[Fragment]) -> list[Fragment]:
205205
truncated_content = "\n".join(lines) + f"\n# ... [{remaining} more lines]"
206206
truncated.append(
207207
Fragment(
208-
id=FragmentId(frag.path, frag.start_line, frag.start_line + len(lines)),
208+
id=FragmentId(frag.path, frag.start_line, frag.start_line + len(lines) - 1),
209209
kind=frag.kind,
210210
content=truncated_content,
211211
identifiers=extract_identifiers(truncated_content),
@@ -871,16 +871,19 @@ def _count_brackets_outside_strings(line: str) -> tuple[int, int, int, int]:
871871
open_braces = 0
872872
close_braces = 0
873873
in_string: str | None = None
874-
prev = ""
874+
escaped = False
875875
for ch in line:
876876
if in_string is not None:
877-
if ch == in_string and prev != "\\":
877+
if escaped:
878+
escaped = False
879+
elif ch == "\\":
880+
escaped = True
881+
elif ch == in_string:
878882
in_string = None
879-
prev = ch
880883
continue
881884
if ch in ("'", '"', "`"):
882885
in_string = ch
883-
prev = ch
886+
escaped = False
884887
continue
885888
if ch == "(":
886889
open_parens += 1
@@ -890,7 +893,6 @@ def _count_brackets_outside_strings(line: str) -> tuple[int, int, int, int]:
890893
open_braces += 1
891894
elif ch == "}":
892895
close_braces += 1
893-
prev = ch
894896
return open_parens, close_parens, open_braces, close_braces
895897

896898

@@ -965,16 +967,12 @@ def _add_container_headers(core_ids: set[FragmentId], frags_by_path: dict[Path,
965967

966968

967969
def _log_full_mode(selected: list[Fragment]) -> None:
968-
try:
969-
used = sum(f.token_count for f in selected)
970-
logger.info(
971-
"diffctx: full mode selected=%d from changed files used=%d tokens",
972-
len(selected),
973-
used,
974-
)
975-
except (TypeError, AttributeError) as e:
976-
# nosemgrep: python-logger-credential-disclosure
977-
logger.debug("diffctx: failed to compute token count: %s", e)
970+
used = sum(f.token_count for f in selected)
971+
logger.info(
972+
"diffctx: full mode selected=%d from changed files used=%d tokens",
973+
len(selected),
974+
used,
975+
)
978976

979977

980978
def _log_ppr_mode(
@@ -985,23 +983,19 @@ def _log_ppr_mode(
985983
alpha: float,
986984
tau: float,
987985
) -> None:
988-
try:
989-
used = sum(f.token_count for f in selected)
990-
budget_str = str(budget_tokens) if budget_tokens is not None else "unlimited"
991-
logger.info(
992-
"diffctx: selected=%d core=%d used=%d/%s reason=%s utility=%.4f alpha=%.3f tau=%.3f",
993-
len(selected),
994-
len(core_ids),
995-
used,
996-
budget_str,
997-
result.reason,
998-
result.utility,
999-
alpha,
1000-
tau,
1001-
)
1002-
except (TypeError, AttributeError) as e:
1003-
# nosemgrep: python-logger-credential-disclosure
1004-
logger.debug("diffctx: failed to compute token count: %s", e)
986+
used = sum(f.token_count for f in selected)
987+
budget_str = str(budget_tokens) if budget_tokens is not None else "unlimited"
988+
logger.info(
989+
"diffctx: selected=%d core=%d used=%d/%s reason=%s utility=%.4f alpha=%.3f tau=%.3f",
990+
len(selected),
991+
len(core_ids),
992+
used,
993+
budget_str,
994+
result.reason,
995+
result.utility,
996+
alpha,
997+
tau,
998+
)
1005999

10061000

10071001
_MAX_FILE_SIZE = LIMITS.max_file_size

src/treemapper/diffctx/edges/base.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,7 @@
1010
EdgeDict = dict[tuple[FragmentId, FragmentId], float]
1111

1212

13-
_STRIP_EXTENSIONS = CODE_EXTENSIONS | frozenset(
14-
{
15-
".sc",
16-
".fs",
17-
".fsi",
18-
".fsx",
19-
}
20-
)
13+
_STRIP_EXTENSIONS = CODE_EXTENSIONS
2114

2215
_INDEX_FILE_STEMS = frozenset({"__init__", "index", "mod"})
2316

src/treemapper/diffctx/edges/config/build.py

Lines changed: 8 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,11 @@
99
_MAKEFILE_NAMES = {"makefile", "gnumakefile"}
1010
_MAKEFILE_EXTS = {".mk", ".mak", ".make"}
1111

12-
_MAKE_TARGET_RE = re.compile(r"^([a-zA-Z_][a-zA-Z0-9_.-]{0,100})\s{0,20}:(?!=)", re.MULTILINE)
1312
_MAKE_INCLUDE_RE = re.compile(r"^(?:-)?include\s+([^\n]{1,500})$", re.MULTILINE)
14-
_MAKE_VAR_RE = re.compile(r"^\s{0,20}([A-Z_][A-Z0-9_]{0,100})\s{0,20}[:?]?=", re.MULTILINE)
1513
_MAKE_RECIPE_RE = re.compile(r"^\t([^\n]{1,1000})$", re.MULTILINE)
1614

17-
_CMAKE_ADD_EXE_RE = re.compile(r"add_executable\s{0,10}\(\s{0,10}(\w{1,100})", re.IGNORECASE)
18-
_CMAKE_ADD_LIB_RE = re.compile(r"add_library\s{0,10}\(\s{0,10}(\w{1,100})", re.IGNORECASE)
19-
_CMAKE_TARGET_LINK_RE = re.compile(
20-
r"target_link_libraries\s{0,10}\(\s{0,10}(\w{1,100})\s{1,20}(?:PUBLIC|PRIVATE|INTERFACE)?\s{0,10}([^)]{1,500})\)",
21-
re.IGNORECASE,
22-
)
2315
_CMAKE_INCLUDE_RE = re.compile(r"include\s{0,10}\(\s{0,10}([^)]{1,300})\)", re.IGNORECASE)
2416
_CMAKE_ADD_SUBDIR_RE = re.compile(r"add_subdirectory\s{0,10}\(\s{0,10}([^\)\s]{1,200})", re.IGNORECASE)
25-
_CMAKE_FIND_PKG_RE = re.compile(r"find_package\s{0,10}\(\s{0,10}(\w{1,100})", re.IGNORECASE)
26-
_CMAKE_SET_RE = re.compile(r"set\s{0,10}\(\s{0,10}([A-Z_][A-Z0-9_]{0,100})", re.IGNORECASE)
2717

2818
_SCRIPT_CALL_RE = re.compile(r"(?:bash|sh|python|python3|\.\/scripts\/|\.\/bin\/)([a-zA-Z0-9_.-]+)")
2919
_SOURCE_FILE_RE = re.compile(r"\b([a-zA-Z_]\w*\.(?:c|cpp|cc|cxx|h|hpp|hxx|py|sh|go|rs|java))\b")
@@ -39,13 +29,9 @@ def _is_cmake(path: Path) -> bool:
3929
return name == "cmakelists.txt" or path.suffix.lower() == ".cmake"
4030

4131

42-
def _extract_make_refs(content: str) -> tuple[set[str], set[str]]:
43-
targets: set[str] = set()
32+
def _extract_make_refs(content: str) -> set[str]:
4433
file_refs: set[str] = set()
4534

46-
for match in _MAKE_TARGET_RE.finditer(content):
47-
targets.add(match.group(1))
48-
4935
for match in _MAKE_INCLUDE_RE.finditer(content):
5036
includes = match.group(1).split()
5137
for inc in includes:
@@ -60,22 +46,12 @@ def _extract_make_refs(content: str) -> tuple[set[str], set[str]]:
6046

6147
file_refs.update(_SOURCE_FILE_RE.findall(content))
6248

63-
return targets, file_refs
49+
return file_refs
6450

6551

66-
def _extract_cmake_refs(content: str) -> tuple[set[str], set[str]]:
67-
targets: set[str] = set()
52+
def _extract_cmake_refs(content: str) -> set[str]:
6853
file_refs: set[str] = set()
6954

70-
for pattern in [_CMAKE_ADD_EXE_RE, _CMAKE_ADD_LIB_RE]:
71-
for match in pattern.finditer(content):
72-
targets.add(match.group(1))
73-
74-
for match in _CMAKE_TARGET_LINK_RE.finditer(content):
75-
targets.add(match.group(1))
76-
deps = match.group(2).split()
77-
targets.update(d for d in deps if d and not d.startswith("$"))
78-
7955
for match in _CMAKE_INCLUDE_RE.finditer(content):
8056
file_refs.add(match.group(1).strip())
8157

@@ -86,7 +62,7 @@ def _extract_cmake_refs(content: str) -> tuple[set[str], set[str]]:
8662

8763
file_refs.update(_SOURCE_FILE_RE.findall(content))
8864

89-
return targets, file_refs
65+
return file_refs
9066

9167

9268
def _collect_build_refs(make_files: list[Path], cmake_files: list[Path]) -> set[str]:
@@ -95,16 +71,14 @@ def _collect_build_refs(make_files: list[Path], cmake_files: list[Path]) -> set[
9571
for mf in make_files:
9672
try:
9773
content = mf.read_text(encoding="utf-8")
98-
_, file_refs = _extract_make_refs(content)
99-
refs.update(file_refs)
74+
refs.update(_extract_make_refs(content))
10075
except (OSError, UnicodeDecodeError):
10176
continue
10277

10378
for cf in cmake_files:
10479
try:
10580
content = cf.read_text(encoding="utf-8")
106-
_, file_refs = _extract_cmake_refs(content)
107-
refs.update(file_refs)
81+
refs.update(_extract_cmake_refs(content))
10882
except (OSError, UnicodeDecodeError):
10983
continue
11084

@@ -150,7 +124,7 @@ def build(self, fragments: list[Fragment], repo_root: Path | None = None) -> Edg
150124
return edges
151125

152126
def _add_makefile_edges(self, mf: Fragment, cmake_frags: list[Fragment], idx: FragmentIndex, edges: EdgeDict) -> None:
153-
_, file_refs = _extract_make_refs(mf.content)
127+
file_refs = _extract_make_refs(mf.content)
154128

155129
for ref in file_refs:
156130
self._link_ref(mf.id, ref, idx, edges)
@@ -160,7 +134,7 @@ def _add_makefile_edges(self, mf: Fragment, cmake_frags: list[Fragment], idx: Fr
160134
self.add_edge(edges, mf.id, cf.id, self.weight * 0.7)
161135

162136
def _add_cmake_edges(self, cf: Fragment, fragments: list[Fragment], idx: FragmentIndex, edges: EdgeDict) -> None:
163-
_, file_refs = _extract_cmake_refs(cf.content)
137+
file_refs = _extract_cmake_refs(cf.content)
164138

165139
for ref in file_refs:
166140
self._link_ref(cf.id, ref, idx, edges)

src/treemapper/diffctx/edges/semantic/go.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414

1515
_GO_FUNC_RE = re.compile(r"^func\s+(?:\([^)]+\)\s+)?(\w+)\s*\(", re.MULTILINE)
1616
_GO_TYPE_RE = re.compile(r"^type\s+(\w+)\s+", re.MULTILINE)
17-
_GO_CONST_VAR_RE = re.compile(r"^(?:const|var)\s+(\w+)\s+", re.MULTILINE)
1817

1918
_GO_FUNC_CALL_RE = re.compile(r"\b([a-zA-Z_]\w*)\s*\(")
2019
_GO_KEYWORDS = frozenset(
@@ -49,6 +48,7 @@
4948
_GO_TYPE_REF_RE = re.compile(r"\*?([A-Z]\w*)\b")
5049
_GO_PKG_CALL_RE = re.compile(r"\b(\w+)\.([A-Z]\w*)")
5150
_GO_EMBED_RE = re.compile(r"//go:embed\s+(\S+)", re.MULTILINE)
51+
_GO_PKG_DECL_RE = re.compile(r"^package\s+(\w+)", re.MULTILINE)
5252

5353

5454
def _extract_imports(content: str) -> set[str]:
@@ -65,11 +65,10 @@ def _extract_imports(content: str) -> set[str]:
6565
return imports
6666

6767

68-
def _extract_definitions(content: str) -> tuple[set[str], set[str], set[str]]:
68+
def _extract_definitions(content: str) -> tuple[set[str], set[str]]:
6969
funcs = {m.group(1) for m in _GO_FUNC_RE.finditer(content)}
7070
types = {m.group(1) for m in _GO_TYPE_RE.finditer(content)}
71-
consts_vars = {m.group(1) for m in _GO_CONST_VAR_RE.finditer(content)}
72-
return funcs, types, consts_vars
71+
return funcs, types
7372

7473

7574
def _extract_references(content: str) -> tuple[set[str], set[str], set[tuple[str, str]]]:
@@ -83,7 +82,10 @@ def _is_go_file(path: Path) -> bool:
8382
return path.suffix.lower() == ".go"
8483

8584

86-
def _get_package_name(path: Path) -> str:
85+
def _get_package_name_from_content(content: str, path: Path) -> str:
86+
match = _GO_PKG_DECL_RE.search(content)
87+
if match:
88+
return match.group(1)
8789
return path.parent.name
8890

8991

@@ -207,7 +209,7 @@ def _build_indices(
207209
func_defs: dict[str, list[FragmentId]] = defaultdict(list)
208210

209211
for f in go_frags:
210-
pkg = _get_package_name(f.path).lower()
212+
pkg = _get_package_name_from_content(f.content, f.path).lower()
211213
pkg_to_frags[pkg].append(f.id)
212214

213215
if repo_root:
@@ -217,7 +219,7 @@ def _build_indices(
217219
except ValueError:
218220
pass
219221

220-
funcs, types, _ = _extract_definitions(f.content)
222+
funcs, types = _extract_definitions(f.content)
221223
for t in types:
222224
type_defs[t.lower()].append(f.id)
223225
for fn in funcs:
@@ -275,7 +277,7 @@ def _link_import_by_path(
275277
edges: EdgeDict,
276278
) -> None:
277279
for path_str, frag_ids in path_to_frags.items():
278-
if path_str in imp or imp.endswith(path_str):
280+
if f"/{path_str}" in imp or imp == path_str or imp.endswith(f"/{path_str}"):
279281
self.add_edges_from_ids(gf_id, frag_ids, self.import_weight, edges)
280282

281283
def _link_refs(
@@ -309,7 +311,7 @@ def _link_same_package(
309311
pkg_to_frags: dict[str, list[FragmentId]],
310312
edges: EdgeDict,
311313
) -> None:
312-
current_pkg = _get_package_name(gf.path).lower()
314+
current_pkg = _get_package_name_from_content(gf.content, gf.path).lower()
313315
for fid in pkg_to_frags.get(current_pkg, []):
314316
if fid != gf.id:
315317
self.add_edge(edges, gf.id, fid, self.same_package_weight)

0 commit comments

Comments
 (0)