Skip to content

Commit 203d670

Browse files
authored
perf: optimize cycles and stats for large codebases (#781)
* perf: optimize cycles and stats commands for large codebases

  findCycles: build adjacency list directly from DB rows and run Tarjan inline — eliminates intermediate CodeGraph construction, toEdgeArray serialization, and redundant label CodeGraph rebuild.

  statsData/moduleMapData: replace O(V×E) correlated subqueries in findHotspots and moduleMapData with pre-aggregated LEFT JOINs. Replace full-table-load + JS filtering in countNodesByKind, countEdgesByKind, and countRoles with SQL WHERE clauses using testFilterSQL. Remove buildTestFileIds (no longer needed).

  Closes #763

* fix: deduplicate edges in findCycles and filter edge kinds in findHotspots (#781)

  Restore edge deduplication that existed in the old buildDependencyGraph path — without it, duplicate (source, target) pairs from multiple call sites inflated the edge array passed to both native and JS engines.

  Also align findHotspots fan-in/fan-out subqueries with moduleMapData by excluding contains, parameter_of, and receiver edges so hotspot ranking reflects inter-file coupling rather than internal structure edges.
1 parent d9ed474 commit 203d670

2 files changed

Lines changed: 168 additions & 115 deletions

File tree

src/domain/analysis/module-map.ts

Lines changed: 58 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,13 @@
11
import path from 'node:path';
22
import { openReadonlyOrFail, openReadonlyWithNative, testFilterSQL } from '../../db/index.js';
3-
import { cachedStmt } from '../../db/repository/cached-stmt.js';
43
import { loadConfig } from '../../infrastructure/config.js';
54
import { debug } from '../../infrastructure/logger.js';
65
import { isTestFile } from '../../infrastructure/test-filter.js';
76
import { DEAD_ROLE_PREFIX } from '../../shared/kinds.js';
8-
import type { BetterSqlite3Database, StmtCache } from '../../types.js';
7+
import type { BetterSqlite3Database } from '../../types.js';
98
import { findCycles } from '../graph/cycles.js';
109
import { LANGUAGE_REGISTRY } from '../parser.js';
1110

12-
// ---------------------------------------------------------------------------
13-
// Statement caches (one prepared statement per db instance)
14-
// ---------------------------------------------------------------------------
15-
16-
const _fileNodesStmtCache: StmtCache<{ id: number; file: string }> = new WeakMap();
17-
const _allNodesStmtCache: StmtCache<{ id: number; file: string }> = new WeakMap();
18-
1911
export const FALSE_POSITIVE_NAMES = new Set([
2012
'run',
2113
'get',
@@ -52,48 +44,11 @@ export const FALSE_POSITIVE_CALLER_THRESHOLD = 20;
5244
// Section helpers
5345
// ---------------------------------------------------------------------------
5446

55-
const _fileNodesStmt: StmtCache<{ id: number; file: string }> = new WeakMap();
56-
const _allNodesIdFileStmt: StmtCache<{ id: number; file: string }> = new WeakMap();
57-
58-
function buildTestFileIds(db: BetterSqlite3Database): Set<number> {
59-
const allFileNodes = cachedStmt(
60-
_fileNodesStmt,
61-
db,
62-
"SELECT id, file FROM nodes WHERE kind = 'file'",
63-
).all();
64-
const testFileIds = new Set<number>();
65-
const testFiles = new Set<string>();
66-
for (const n of allFileNodes) {
67-
if (isTestFile(n.file)) {
68-
testFileIds.add(n.id);
69-
testFiles.add(n.file);
70-
}
71-
}
72-
const allNodes = cachedStmt(_allNodesIdFileStmt, db, 'SELECT id, file FROM nodes').all();
73-
for (const n of allNodes) {
74-
if (testFiles.has(n.file)) testFileIds.add(n.id);
75-
}
76-
return testFileIds;
77-
}
78-
79-
function countNodesByKind(db: BetterSqlite3Database, testFileIds: Set<number> | null) {
80-
let nodeRows: Array<{ kind: string; c: number }>;
81-
if (testFileIds) {
82-
const allNodes = db.prepare('SELECT id, kind, file FROM nodes').all() as Array<{
83-
id: number;
84-
kind: string;
85-
file: string;
86-
}>;
87-
const filtered = allNodes.filter((n) => !testFileIds.has(n.id));
88-
const counts: Record<string, number> = {};
89-
for (const n of filtered) counts[n.kind] = (counts[n.kind] || 0) + 1;
90-
nodeRows = Object.entries(counts).map(([kind, c]) => ({ kind, c }));
91-
} else {
92-
nodeRows = db.prepare('SELECT kind, COUNT(*) as c FROM nodes GROUP BY kind').all() as Array<{
93-
kind: string;
94-
c: number;
95-
}>;
96-
}
47+
function countNodesByKind(db: BetterSqlite3Database, noTests: boolean) {
48+
const testFilter = testFilterSQL('file', noTests);
49+
const nodeRows = db
50+
.prepare(`SELECT kind, COUNT(*) as c FROM nodes WHERE 1=1 ${testFilter} GROUP BY kind`)
51+
.all() as Array<{ kind: string; c: number }>;
9752
const byKind: Record<string, number> = {};
9853
let total = 0;
9954
for (const r of nodeRows) {
@@ -103,20 +58,21 @@ function countNodesByKind(db: BetterSqlite3Database, testFileIds: Set<number> |
10358
return { total, byKind };
10459
}
10560

106-
function countEdgesByKind(db: BetterSqlite3Database, testFileIds: Set<number> | null) {
61+
function countEdgesByKind(db: BetterSqlite3Database, noTests: boolean) {
10762
let edgeRows: Array<{ kind: string; c: number }>;
108-
if (testFileIds) {
109-
const allEdges = db.prepare('SELECT source_id, target_id, kind FROM edges').all() as Array<{
110-
source_id: number;
111-
target_id: number;
112-
kind: string;
113-
}>;
114-
const filtered = allEdges.filter(
115-
(e) => !testFileIds.has(e.source_id) && !testFileIds.has(e.target_id),
116-
);
117-
const counts: Record<string, number> = {};
118-
for (const e of filtered) counts[e.kind] = (counts[e.kind] || 0) + 1;
119-
edgeRows = Object.entries(counts).map(([kind, c]) => ({ kind, c }));
63+
if (noTests) {
64+
// Join edges with source node to filter out test files in SQL
65+
const srcFilter = testFilterSQL('ns.file', true);
66+
const tgtFilter = testFilterSQL('nt.file', true);
67+
edgeRows = db
68+
.prepare(`
69+
SELECT e.kind, COUNT(*) as c FROM edges e
70+
JOIN nodes ns ON e.source_id = ns.id
71+
JOIN nodes nt ON e.target_id = nt.id
72+
WHERE 1=1 ${srcFilter} ${tgtFilter}
73+
GROUP BY e.kind
74+
`)
75+
.all() as Array<{ kind: string; c: number }>;
12076
} else {
12177
edgeRows = db.prepare('SELECT kind, COUNT(*) as c FROM edges GROUP BY kind').all() as Array<{
12278
kind: string;
@@ -157,16 +113,25 @@ function findHotspots(db: BetterSqlite3Database, noTests: boolean, limit: number
157113
const hotspotRows = db
158114
.prepare(`
159115
SELECT n.file,
160-
(SELECT COUNT(*) FROM edges WHERE target_id = n.id) as fan_in,
161-
(SELECT COUNT(*) FROM edges WHERE source_id = n.id) as fan_out
116+
COALESCE(fi.cnt, 0) as fan_in,
117+
COALESCE(fo.cnt, 0) as fan_out
162118
FROM nodes n
119+
LEFT JOIN (
120+
SELECT target_id, COUNT(*) AS cnt FROM edges
121+
WHERE kind NOT IN ('contains', 'parameter_of', 'receiver')
122+
GROUP BY target_id
123+
) fi ON fi.target_id = n.id
124+
LEFT JOIN (
125+
SELECT source_id, COUNT(*) AS cnt FROM edges
126+
WHERE kind NOT IN ('contains', 'parameter_of', 'receiver')
127+
GROUP BY source_id
128+
) fo ON fo.source_id = n.id
163129
WHERE n.kind = 'file' ${testFilter}
164-
ORDER BY (SELECT COUNT(*) FROM edges WHERE target_id = n.id)
165-
+ (SELECT COUNT(*) FROM edges WHERE source_id = n.id) DESC
130+
ORDER BY COALESCE(fi.cnt, 0) + COALESCE(fo.cnt, 0) DESC
131+
LIMIT ?
166132
`)
167-
.all() as Array<{ file: string; fan_in: number; fan_out: number }>;
168-
const filtered = noTests ? hotspotRows.filter((r) => !isTestFile(r.file)) : hotspotRows;
169-
return filtered.slice(0, limit).map((r) => ({
133+
.all(limit) as Array<{ file: string; fan_in: number; fan_out: number }>;
134+
return hotspotRows.map((r) => ({
170135
file: r.file,
171136
fanIn: r.fan_in,
172137
fanOut: r.fan_out,
@@ -275,20 +240,12 @@ function computeQualityMetrics(
275240
}
276241

277242
function countRoles(db: BetterSqlite3Database, noTests: boolean) {
278-
let roleRows: Array<{ role: string; c: number }>;
279-
if (noTests) {
280-
const allRoleNodes = db
281-
.prepare('SELECT role, file FROM nodes WHERE role IS NOT NULL')
282-
.all() as Array<{ role: string; file: string }>;
283-
const filtered = allRoleNodes.filter((n) => !isTestFile(n.file));
284-
const counts: Record<string, number> = {};
285-
for (const n of filtered) counts[n.role] = (counts[n.role] || 0) + 1;
286-
roleRows = Object.entries(counts).map(([role, c]) => ({ role, c }));
287-
} else {
288-
roleRows = db
289-
.prepare('SELECT role, COUNT(*) as c FROM nodes WHERE role IS NOT NULL GROUP BY role')
290-
.all() as Array<{ role: string; c: number }>;
291-
}
243+
const testFilter = testFilterSQL('file', noTests);
244+
const roleRows = db
245+
.prepare(
246+
`SELECT role, COUNT(*) as c FROM nodes WHERE role IS NOT NULL ${testFilter} GROUP BY role`,
247+
)
248+
.all() as Array<{ role: string; c: number }>;
292249
const roles: Record<string, number> & { dead?: number } = {};
293250
let deadTotal = 0;
294251
for (const r of roleRows) {
@@ -344,13 +301,23 @@ export function moduleMapData(customDbPath: string, limit = 20, opts: { noTests?
344301

345302
const nodes = db
346303
.prepare(`
347-
SELECT n.*,
348-
(SELECT COUNT(*) FROM edges WHERE source_id = n.id AND kind NOT IN ('contains', 'parameter_of', 'receiver')) as out_edges,
349-
(SELECT COUNT(*) FROM edges WHERE target_id = n.id AND kind NOT IN ('contains', 'parameter_of', 'receiver')) as in_edges
304+
SELECT n.file,
305+
COALESCE(fo.cnt, 0) as out_edges,
306+
COALESCE(fi.cnt, 0) as in_edges
350307
FROM nodes n
308+
LEFT JOIN (
309+
SELECT source_id, COUNT(*) AS cnt FROM edges
310+
WHERE kind NOT IN ('contains', 'parameter_of', 'receiver')
311+
GROUP BY source_id
312+
) fo ON fo.source_id = n.id
313+
LEFT JOIN (
314+
SELECT target_id, COUNT(*) AS cnt FROM edges
315+
WHERE kind NOT IN ('contains', 'parameter_of', 'receiver')
316+
GROUP BY target_id
317+
) fi ON fi.target_id = n.id
351318
WHERE n.kind = 'file'
352319
${testFilter}
353-
ORDER BY (SELECT COUNT(*) FROM edges WHERE target_id = n.id AND kind NOT IN ('contains', 'parameter_of', 'receiver')) DESC
320+
ORDER BY COALESCE(fi.cnt, 0) DESC
354321
LIMIT ?
355322
`)
356323
.all(limit) as Array<{ file: string; in_edges: number; out_edges: number }>;
@@ -486,10 +453,9 @@ export function statsData(customDbPath: string, opts: { noTests?: boolean; confi
486453

487454
// ── JS fallback ───────────────────────────────────────────────────
488455
const testFilter = testFilterSQL('n.file', noTests);
489-
const testFileIds = noTests ? buildTestFileIds(db) : null;
490456

491-
const { total: totalNodes, byKind: nodesByKind } = countNodesByKind(db, testFileIds);
492-
const { total: totalEdges, byKind: edgesByKind } = countEdgesByKind(db, testFileIds);
457+
const { total: totalNodes, byKind: nodesByKind } = countNodesByKind(db, noTests);
458+
const { total: totalEdges, byKind: edgesByKind } = countEdgesByKind(db, noTests);
493459

494460
const hotspots = findHotspots(db, noTests, 5);
495461
const embeddings = getEmbeddingsInfo(db);

src/domain/graph/cycles.ts

Lines changed: 110 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,137 @@
1-
import { tarjan } from '../../graph/algorithms/tarjan.js';
2-
import { buildDependencyGraph } from '../../graph/builders/dependency.js';
3-
import { CodeGraph } from '../../graph/model.js';
1+
import { getCallableNodes, getCallEdges, getFileNodesAll, getImportEdges } from '../../db/index.js';
42
import { loadNative } from '../../infrastructure/native.js';
3+
import { isTestFile } from '../../infrastructure/test-filter.js';
54
import type { BetterSqlite3Database } from '../../types.js';
65

6+
/**
7+
* Find cycles using Tarjan's SCC algorithm.
8+
*
9+
* Builds a label-based adjacency list directly from DB rows — no intermediate
10+
* CodeGraph construction. This is O(V + E) with minimal memory overhead.
11+
*/
712
export function findCycles(
813
db: BetterSqlite3Database,
914
opts: { fileLevel?: boolean; noTests?: boolean } = {},
1015
): string[][] {
1116
const fileLevel = opts.fileLevel !== false;
1217
const noTests = opts.noTests || false;
1318

14-
const graph = buildDependencyGraph(db, { fileLevel, noTests });
19+
const edges: Array<{ source: string; target: string }> = [];
20+
const seen = new Set<string>();
1521

16-
const idToLabel = new Map<string, string>();
17-
for (const [id, attrs] of graph.nodes()) {
18-
if (fileLevel) {
19-
idToLabel.set(id, attrs.file as string);
20-
} else {
21-
idToLabel.set(id, `${attrs.label}|${attrs.file}`);
22+
if (fileLevel) {
23+
let nodes = getFileNodesAll(db);
24+
if (noTests) nodes = nodes.filter((n) => !isTestFile(n.file));
25+
const nodeIds = new Set<number>();
26+
const idToFile = new Map<number, string>();
27+
for (const n of nodes) {
28+
nodeIds.add(n.id);
29+
idToFile.set(n.id, n.file);
30+
}
31+
for (const e of getImportEdges(db)) {
32+
if (!nodeIds.has(e.source_id) || !nodeIds.has(e.target_id)) continue;
33+
if (e.source_id === e.target_id) continue;
34+
const src = idToFile.get(e.source_id)!;
35+
const tgt = idToFile.get(e.target_id)!;
36+
const key = `${src}\0${tgt}`;
37+
if (seen.has(key)) continue;
38+
seen.add(key);
39+
edges.push({ source: src, target: tgt });
40+
}
41+
} else {
42+
let nodes = getCallableNodes(db);
43+
if (noTests) nodes = nodes.filter((n) => !isTestFile(n.file));
44+
const nodeIds = new Set<number>();
45+
const idToLabel = new Map<number, string>();
46+
for (const n of nodes) {
47+
nodeIds.add(n.id);
48+
idToLabel.set(n.id, `${n.name}|${n.file}`);
49+
}
50+
for (const e of getCallEdges(db)) {
51+
if (!nodeIds.has(e.source_id) || !nodeIds.has(e.target_id)) continue;
52+
if (e.source_id === e.target_id) continue;
53+
const src = idToLabel.get(e.source_id)!;
54+
const tgt = idToLabel.get(e.target_id)!;
55+
const key = `${src}\0${tgt}`;
56+
if (seen.has(key)) continue;
57+
seen.add(key);
58+
edges.push({ source: src, target: tgt });
2259
}
2360
}
2461

25-
const edges = graph.toEdgeArray().map((e) => ({
26-
source: idToLabel.get(e.source) ?? e.source,
27-
target: idToLabel.get(e.target) ?? e.target,
28-
}));
29-
3062
const native = loadNative();
3163
if (native) {
3264
return native.detectCycles(edges) as string[][];
3365
}
3466

35-
const labelGraph = new CodeGraph();
36-
for (const { source, target } of edges) {
37-
labelGraph.addEdge(source, target);
38-
}
39-
return tarjan(labelGraph);
67+
return tarjanFromEdges(edges);
4068
}
4169

4270
export function findCyclesJS(edges: Array<{ source: string; target: string }>): string[][] {
43-
const graph = new CodeGraph();
71+
return tarjanFromEdges(edges);
72+
}
73+
74+
/**
75+
* Run Tarjan's SCC on a flat edge list. Returns SCCs with length > 1 (cycles).
76+
* Uses a simple adjacency-list Map instead of a full CodeGraph.
77+
*/
78+
function tarjanFromEdges(edges: Array<{ source: string; target: string }>): string[][] {
79+
const adj = new Map<string, string[]>();
80+
const allNodes = new Set<string>();
4481
for (const { source, target } of edges) {
45-
graph.addEdge(source, target);
82+
allNodes.add(source);
83+
allNodes.add(target);
84+
let list = adj.get(source);
85+
if (!list) {
86+
list = [];
87+
adj.set(source, list);
88+
}
89+
list.push(target);
4690
}
47-
return tarjan(graph);
91+
92+
let index = 0;
93+
const stack: string[] = [];
94+
const onStack = new Set<string>();
95+
const indices = new Map<string, number>();
96+
const lowlinks = new Map<string, number>();
97+
const sccs: string[][] = [];
98+
99+
function strongconnect(v: string): void {
100+
indices.set(v, index);
101+
lowlinks.set(v, index);
102+
index++;
103+
stack.push(v);
104+
onStack.add(v);
105+
106+
const successors = adj.get(v);
107+
if (successors) {
108+
for (const w of successors) {
109+
if (!indices.has(w)) {
110+
strongconnect(w);
111+
lowlinks.set(v, Math.min(lowlinks.get(v)!, lowlinks.get(w)!));
112+
} else if (onStack.has(w)) {
113+
lowlinks.set(v, Math.min(lowlinks.get(v)!, indices.get(w)!));
114+
}
115+
}
116+
}
117+
118+
if (lowlinks.get(v) === indices.get(v)) {
119+
const scc: string[] = [];
120+
let w: string | undefined;
121+
do {
122+
w = stack.pop()!;
123+
onStack.delete(w);
124+
scc.push(w);
125+
} while (w !== v);
126+
if (scc.length > 1) sccs.push(scc);
127+
}
128+
}
129+
130+
for (const id of allNodes) {
131+
if (!indices.has(id)) strongconnect(id);
132+
}
133+
134+
return sccs;
48135
}
49136

50137
export function formatCycles(cycles: string[][]): string {

0 commit comments

Comments
 (0)