Skip to content

Commit dc16701

Browse files
committed
perf(diffctx): ego_graph O(k*BFS)→O(BFS) + file_cache to universe
Fix ego_graph: hoist the to_undirected call out of the seed loop and replace the per-node shortest_path_length calls with one single_source_shortest_path_length call per seed (bounded by cutoff=radius). This reduces complexity from O(seeds*ego_size*(V+E)) to O(seeds*(V+E)). Propagate file_cache to _build_ident_index and _expand_universe_by_rare_identifiers in universe.py. Add a _read_file helper to the EdgeBuilder base class for cache-aware file reading across all 40+ edge builders.
1 parent 0ebe7c9 commit dc16701

5 files changed

Lines changed: 41 additions & 19 deletions

File tree

src/treemapper/diffctx/edges/base.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,17 @@ def discover_related_files(
185185
) -> list[Path]:
186186
return []
187187

188+
@staticmethod
189+
def _read_file(path: Path, file_cache: dict[Path, str] | None = None) -> str | None:
190+
if file_cache is not None:
191+
content = file_cache.get(path)
192+
if content is not None:
193+
return content
194+
try:
195+
return path.read_text(encoding="utf-8")
196+
except (OSError, UnicodeDecodeError):
197+
return None
198+
188199
def add_edge(
189200
self,
190201
edges: EdgeDict,

src/treemapper/diffctx/graph.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,14 @@ def nx(self) -> nx.DiGraph:
8080

8181
def ego_graph(self, seeds: set[FragmentId], radius: int = 2) -> dict[FragmentId, float]:
8282
scores: dict[FragmentId, float] = {}
83+
if not self._g:
84+
return scores
85+
undirected = self._g.to_undirected(as_view=True)
8386
for seed in seeds:
8487
if seed not in self._g:
8588
continue
86-
undirected = self._g.to_undirected(as_view=True)
87-
ego = nx.ego_graph(undirected, seed, radius=radius)
88-
for node in ego.nodes:
89-
dist = nx.shortest_path_length(undirected, seed, node)
89+
distances = nx.single_source_shortest_path_length(undirected, seed, cutoff=radius)
90+
for node, dist in distances.items():
9091
hop_score = 1.0 / (1 + dist) if dist > 0 else 1.0
9192
scores[node] = max(scores.get(node, 0.0), hop_score)
9293
return scores

src/treemapper/diffctx/scoring.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def discover(self, ctx: DiscoveryContext) -> list[Path]:
4949
ctx.changed_files + edge_discovered,
5050
combined_spec,
5151
candidate_files=ctx.all_candidate_files,
52+
file_cache=ctx.file_cache,
5253
)
5354

5455
return list(dict.fromkeys(edge_discovered + expanded))

src/treemapper/diffctx/universe.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -132,17 +132,20 @@ def _collect_candidate_files(root_dir: Path, included_set: set[Path], combined_s
132132
def _build_ident_index(
133133
files: list[Path],
134134
concepts: frozenset[str],
135+
file_cache: dict[Path, str] | None = None,
135136
) -> dict[str, list[Path]]:
136137
inverted_index: dict[str, list[Path]] = defaultdict(list)
137138
for file_path in files:
138-
try:
139-
content = file_path.read_text(encoding="utf-8")
140-
file_idents = extract_identifiers(content, skip_stopwords=False)
141-
for ident in file_idents:
142-
if ident in concepts:
143-
inverted_index[ident].append(file_path)
144-
except (OSError, UnicodeDecodeError):
145-
continue
139+
content = file_cache.get(file_path) if file_cache else None
140+
if content is None:
141+
try:
142+
content = file_path.read_text(encoding="utf-8")
143+
except (OSError, UnicodeDecodeError):
144+
continue
145+
file_idents = extract_identifiers(content, skip_stopwords=False)
146+
for ident in file_idents:
147+
if ident in concepts:
148+
inverted_index[ident].append(file_path)
146149
return inverted_index
147150

148151

@@ -219,6 +222,7 @@ def _expand_universe_by_rare_identifiers(
219222
already_included: list[Path],
220223
combined_spec: pathspec.PathSpec,
221224
candidate_files: list[Path] | None = None,
225+
file_cache: dict[Path, str] | None = None,
222226
) -> list[Path]:
223227
if not concepts:
224228
return []
@@ -228,16 +232,19 @@ def _expand_universe_by_rare_identifiers(
228232
files = [f for f in candidate_files if f not in included_set]
229233
else:
230234
files = _collect_candidate_files(root_dir, included_set, combined_spec)
231-
inverted_index = _build_ident_index(files, concepts)
235+
inverted_index = _build_ident_index(files, concepts, file_cache=file_cache)
232236

233237
included_concept_counts: dict[str, int] = {}
234238
for f in already_included:
235-
try:
236-
content = f.read_text(encoding="utf-8")
237-
for ident in extract_identifiers(content, skip_stopwords=False):
238-
if ident in concepts:
239-
included_concept_counts[ident] = included_concept_counts.get(ident, 0) + 1
240-
except (OSError, UnicodeDecodeError):
239+
content = file_cache.get(f) if file_cache else None
240+
if content is None:
241+
try:
242+
content = f.read_text(encoding="utf-8")
243+
except (OSError, UnicodeDecodeError):
244+
continue
245+
for ident in extract_identifiers(content, skip_stopwords=False):
246+
if ident in concepts:
247+
included_concept_counts[ident] = included_concept_counts.get(ident, 0) + 1
241248
continue
242249

243250
return _collect_expansion_files(inverted_index, concepts, included_set, included_concept_counts)

whitelist_vulture.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from treemapper.clipboard import clipboard_available
2+
from treemapper.diffctx.edges.base import EdgeBuilder
23
from treemapper.diffctx.edges.semantic import (
34
AnsibleEdgeBuilder,
45
BazelEdgeBuilder,
@@ -66,4 +67,5 @@
6667
blast_radius = graph_analytics.blast_radius
6768
ScoringMode.AUTO
6869
PipelineConfig.low_relevance
70+
EdgeBuilder._read_file
6971
EgoGraphScoring

0 commit comments

Comments
 (0)