Skip to content

Commit a724a09

Browse files
authored
perf(native): scope incremental rebuild to truly-changed files (#1012) (#1027)
* perf(native): scope incremental rebuild to truly-changed files (#1012) Native 1-file incremental was 4.3x slower than WASM (876ms vs 203ms) because `run_pipeline` re-parsed the full reverse-dep cone (47 files for a 1-file change), which cascaded into insert/structure/roles/analysis phases. Adopt the WASM save+reconnect strategy: save reverse-dep → changed-file edges before purge, reconnect to new node IDs after Stage 5. Reverse-dep files are no longer re-parsed — only their affected edges are reconstructed. Result: 876ms → 43ms (95% faster, 0.78x WASM). - insertMs: 185ms → 0.2ms - structureMs: 73ms → 3.1ms - rolesMs: 311ms → 18.3ms Edge counts identical between full and 1-file rebuild (37134), confirming no edge loss or duplication. The existing `1-file incremental ratio` gate in `benchmark-parity-gate.mjs` (limit 1.5) enforces the ceiling. * fix(native): only count actual inserts in reconnect counter (#1027) INSERT OR IGNORE returns Ok(0) for duplicate-row no-ops, so the previous `is_ok()` check inflated `reconnected` whenever two saved entries resolved to the same (source_id, new_target_id, kind). Match on the rows-affected count and only increment when n > 0; leave the dropped counter for genuine errors. Diagnostic-only — no functional change to edge data. Impact: 1 functions changed, 0 affected * fix(native): log dropped reverse-dep edges during reconnect (#1027) `reconnect_reverse_dep_edges` returns `(reconnected, dropped)` but the caller previously discarded the result. Dropped edges occur when a saved target node can't be re-located after re-insert (e.g. the symbol was renamed or deleted), and silently swallowing them makes partial edge loss during incremental rebuild hard to diagnose. Surface the count via eprintln when dropped > 0 — same observability pattern other native diagnostics use. Impact: 1 functions changed, 0 affected * fix(native): reclassify reverse-deps of removed files (#1027) When a file is removed during incremental build, its nodes are purged along with edges pointing at them. Files that imported the removed file were never re-parsed (correct, by design) but their role records became stale: fan-out dropped silently because do_classify_incremental was only seeded with parsed files. For removal-only builds, classification was skipped entirely. Compute reverse-deps of removed entries before purge (find_reverse_deps with change_result.removed as seed, mirroring WASM/JS) and union them into the seed passed to do_classify_incremental. No re-parse needed — the classifier reads fan-in/fan-out from the post-purge edges table. Impact: 1 functions changed, 0 affected
1 parent 57a6df4 commit a724a09

3 files changed

Lines changed: 224 additions & 56 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/codegraph-core/src/build_pipeline.rs

Lines changed: 72 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -180,59 +180,61 @@ pub fn run_pipeline(
180180
});
181181
}
182182

183-
// Track reverse-dep files that need re-parsing for edge reconstruction.
184-
// Also track their relative paths so we can exclude them from analysis_scope —
185-
// reverse-dep files are re-parsed for edge rebuilding but their content didn't
186-
// change, so running AST/complexity/CFG/dataflow on them is wasted work (#761).
187-
let mut reverse_dep_abs_paths: Vec<String> = Vec::new();
188-
let mut reverse_dep_rel_paths: HashSet<String> = HashSet::new();
183+
// Save reverse-dep → changed-file edges before purge so we can reconnect
184+
// them to new node IDs after Stage 5 (#1012). This matches the WASM/JS
185+
// strategy and lets us skip re-parsing reverse-dep files entirely:
186+
// parse/insert/structure/roles/analysis all scope to truly-changed files.
187+
let mut saved_reverse_dep_edges: Vec<change_detection::SavedReverseDepEdge> = Vec::new();
188+
// Files that import a removed file. Save+reconnect doesn't apply (the
189+
// target node is gone for good), but their role records go stale because
190+
// edges to the deleted file's nodes get purged in Stage 3. Reclassify them
191+
// in Stage 8 so fan-out reflects reality. (#1027 review)
192+
let mut removal_reverse_deps: Vec<String> = Vec::new();
189193

190194
// Handle full build: clear all graph data
191195
if change_result.is_full_build {
192196
let has_embeddings = change_detection::has_embeddings(conn);
193197
change_detection::clear_all_graph_data(conn, has_embeddings);
194198
} else {
195-
// Incremental: find reverse deps and purge changed files
196-
let changed_rel_paths: HashSet<String> = parse_changes
197-
.iter()
198-
.map(|c| c.rel_path.clone())
199-
.chain(change_result.removed.iter().cloned())
200-
.collect();
201-
202-
let reverse_deps = if opts.no_reverse_deps.unwrap_or(false) {
203-
HashSet::new()
204-
} else {
205-
change_detection::find_reverse_dependencies(conn, &changed_rel_paths, root_dir)
206-
};
199+
// Incremental: save reverse-dep edges (if reverse-dep tracking is enabled),
200+
// then purge changed files only.
201+
let changed_paths: Vec<String> =
202+
parse_changes.iter().map(|c| c.rel_path.clone()).collect();
203+
204+
if !opts.no_reverse_deps.unwrap_or(false) {
205+
saved_reverse_dep_edges =
206+
change_detection::save_reverse_dep_edges(conn, &changed_paths);
207+
208+
if !change_result.removed.is_empty() {
209+
let removed_set: HashSet<String> =
210+
change_result.removed.iter().cloned().collect();
211+
removal_reverse_deps =
212+
change_detection::find_reverse_dependencies(conn, &removed_set, root_dir)
213+
.into_iter()
214+
.collect();
215+
}
216+
}
207217

208218
let files_to_purge: Vec<String> = change_result
209219
.removed
210220
.iter()
211221
.chain(parse_changes.iter().map(|c| &c.rel_path))
212222
.cloned()
213223
.collect();
214-
let reverse_dep_list: Vec<String> = reverse_deps.iter().cloned().collect();
215-
change_detection::purge_changed_files(conn, &files_to_purge, &reverse_dep_list);
216-
217-
// Track reverse-dep absolute paths so we can re-parse them for edge
218-
// rebuilding. Their nodes are still in the DB (only edges were purged),
219-
// but we need fresh FileSymbols so Stage 7 can reconstruct their
220-
// import and call edges.
221-
for rdep in &reverse_dep_list {
222-
let abs = Path::new(root_dir).join(rdep);
223-
if abs.exists() {
224-
reverse_dep_abs_paths.push(abs.to_str().unwrap_or("").to_string());
225-
reverse_dep_rel_paths.insert(rdep.clone());
226-
}
227-
}
224+
// Pass empty reverse_dep_files: purge already deletes both directions
225+
// for changed files (which removes the saved reverse-dep → changed-file
226+
// edges from the live table), and other outgoing edges from reverse-dep
227+
// files remain valid and must NOT be deleted — they will be reconnected
228+
// to new target IDs after insert.
229+
change_detection::purge_changed_files(conn, &files_to_purge, &[]);
228230
}
229231

230232
// ── Stage 4: Parse files ───────────────────────────────────────────
233+
// Only truly-changed files are parsed. Reverse-dep files are not re-parsed —
234+
// their edges to changed files are reconstructed via save+reconnect (#1012).
231235
let t0 = Instant::now();
232-
let mut files_to_parse: Vec<String> =
236+
let files_to_parse: Vec<String> =
233237
parse_changes.iter().map(|c| c.abs_path.clone()).collect();
234-
// Include reverse-dep files so their edges are rebuilt after purging
235-
files_to_parse.extend(reverse_dep_abs_paths);
236238
let parsed =
237239
parallel::parse_files_parallel(&files_to_parse, root_dir, include_dataflow, include_ast);
238240

@@ -330,32 +332,36 @@ pub fn run_pipeline(
330332
// internal logic. We load nodes from DB and pass to the edge builder.
331333
build_and_insert_call_edges(conn, &file_symbols, &import_ctx, !change_result.is_full_build);
332334

335+
// Reconnect saved reverse-dep edges to new node IDs (#1012). Mirrors
336+
// `reconnectReverseDepEdges` in build-edges.ts — for each saved edge,
337+
// look up the new target node and recreate the edge with the original
338+
// source_id (still valid; reverse-dep nodes were never purged).
339+
if !saved_reverse_dep_edges.is_empty() {
340+
let (reconnected, dropped) =
341+
change_detection::reconnect_reverse_dep_edges(conn, &saved_reverse_dep_edges);
342+
if dropped > 0 {
343+
eprintln!(
344+
"[codegraph] reconnect_reverse_dep_edges: {reconnected} reconnected, {dropped} dropped (target nodes not found)"
345+
);
346+
}
347+
}
348+
333349
timing.edges_ms = t0.elapsed().as_secs_f64() * 1000.0;
334350

335351
// ── Stage 8: Structure + roles ─────────────────────────────────────
336352
let t0 = Instant::now();
337353
let line_count_map = structure::build_line_count_map(&file_symbols, root_dir);
354+
// file_symbols only contains truly-changed files (reverse-deps are not
355+
// re-parsed; their edges are reconnected via save+reconnect — #1012), so
356+
// analysis_scope == changed_files.
338357
let changed_files: Vec<String> = file_symbols.keys().cloned().collect();
339-
// Build analysis_scope excluding reverse-dep files — they were re-parsed for
340-
// edge reconstruction but their content didn't change, so AST/complexity/CFG/
341-
// dataflow analysis would be redundant (#761). This matches the JS pipeline's
342-
// _reverseDepOnly filtering in run-analyses.ts.
343358
let analysis_scope: Option<Vec<String>> = if change_result.is_full_build {
344359
None
345360
} else {
346-
Some(
347-
changed_files
348-
.iter()
349-
.filter(|f| !reverse_dep_rel_paths.contains(f.as_str()))
350-
.cloned()
351-
.collect(),
352-
)
361+
Some(changed_files.clone())
353362
};
354363

355364
let existing_file_count = structure::get_existing_file_count(conn);
356-
// Use parse_changes.len() for the threshold — changed_files includes
357-
// reverse-dep files added for edge rebuilding, which inflates the count
358-
// and would skip the fast path even for single-file incremental builds.
359365
let use_fast_path =
360366
!change_result.is_full_build && parse_changes.len() <= FAST_PATH_MAX_CHANGED_FILES && existing_file_count > FAST_PATH_MIN_EXISTING_FILES;
361367

@@ -385,15 +391,26 @@ pub fn run_pipeline(
385391
timing.structure_ms = t0.elapsed().as_secs_f64() * 1000.0;
386392

387393
let t0 = Instant::now();
388-
// Role classification intentionally uses the full `changed_files` list
389-
// (including reverse-dep files), not `analysis_scope`. Reverse-dep files
390-
// had their edges rebuilt, which can change fan-in/fan-out and therefore
391-
// role assignments — so they must be re-classified even though their
392-
// content didn't change and they are excluded from AST analysis.
394+
// Role classification needs the truly-changed files plus reverse-deps of
395+
// any removed files. `do_classify_incremental` expands to neighbours via
396+
// the edges table, so reverse-deps of *changed* files are picked up
397+
// automatically when their fan-in/fan-out is affected. Reverse-deps of
398+
// *removed* files have to be added explicitly — the deleted file's nodes
399+
// are gone, so neighbour expansion can't reach the importer. Without this
400+
// seed, removal-only builds skip role classification entirely. (#1027)
393401
let changed_file_list: Option<Vec<String>> = if change_result.is_full_build {
394402
None
395403
} else {
396-
Some(changed_files)
404+
let mut files = changed_files;
405+
if !removal_reverse_deps.is_empty() {
406+
let existing: HashSet<String> = files.iter().cloned().collect();
407+
for f in removal_reverse_deps {
408+
if !existing.contains(&f) {
409+
files.push(f);
410+
}
411+
}
412+
}
413+
Some(files)
397414
};
398415
if let Some(ref files) = changed_file_list {
399416
if !files.is_empty() {

crates/codegraph-core/src/change_detection.rs

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,157 @@ fn file_mtime_size(path: &str) -> (i64, i64) {
345345
}
346346
}
347347

348+
/// A reverse-dep edge captured before purge so it can be reconnected to the
349+
/// new target node ID after the changed file's nodes are re-inserted.
350+
#[derive(Debug, Clone)]
351+
pub struct SavedReverseDepEdge {
352+
pub source_id: i64,
353+
pub tgt_name: String,
354+
pub tgt_kind: String,
355+
pub tgt_file: String,
356+
pub tgt_line: i64,
357+
pub edge_kind: String,
358+
pub confidence: f64,
359+
pub dynamic: i64,
360+
}
361+
362+
/// Save edges from reverse-dep files → changed files BEFORE purge so they
363+
/// can be reconnected to new target node IDs after node insertion (#1012).
364+
///
365+
/// Mirrors the JS `purgeAndAddReverseDeps` path in `detect-changes.ts`. By
366+
/// saving the edge topology and reconnecting after insert, we avoid the need
367+
/// to re-parse every reverse-dep file just to rebuild its edges. That re-parse
368+
/// is what made the native pipeline scale parse/insert/structure/roles with
369+
/// the full reverse-dep cone (47 files for a 1-file change) instead of just
370+
/// the truly-changed files (1 file).
371+
pub fn save_reverse_dep_edges(
372+
conn: &Connection,
373+
changed_paths: &[String],
374+
) -> Vec<SavedReverseDepEdge> {
375+
let mut saved = Vec::new();
376+
if changed_paths.is_empty() {
377+
return saved;
378+
}
379+
let changed_set: HashSet<&str> = changed_paths.iter().map(|s| s.as_str()).collect();
380+
381+
let mut stmt = match conn.prepare(
382+
"SELECT e.source_id, n_tgt.name, n_tgt.kind, n_tgt.file, n_tgt.line, \
383+
e.kind, e.confidence, e.dynamic, n_src.file \
384+
FROM edges e \
385+
JOIN nodes n_src ON e.source_id = n_src.id \
386+
JOIN nodes n_tgt ON e.target_id = n_tgt.id \
387+
WHERE n_tgt.file = ?1 AND n_src.file != n_tgt.file",
388+
) {
389+
Ok(s) => s,
390+
Err(_) => return saved,
391+
};
392+
393+
for changed in changed_paths {
394+
let rows = match stmt.query_map([changed], |row| {
395+
Ok((
396+
row.get::<_, i64>(0)?,
397+
row.get::<_, String>(1)?,
398+
row.get::<_, String>(2)?,
399+
row.get::<_, String>(3)?,
400+
row.get::<_, i64>(4)?,
401+
row.get::<_, String>(5)?,
402+
row.get::<_, f64>(6)?,
403+
row.get::<_, i64>(7)?,
404+
row.get::<_, String>(8)?,
405+
))
406+
}) {
407+
Ok(r) => r,
408+
Err(_) => continue,
409+
};
410+
for row in rows.flatten() {
411+
// Skip edges whose source is itself being purged — buildEdges will
412+
// re-emit them with correct new IDs.
413+
if changed_set.contains(row.8.as_str()) {
414+
continue;
415+
}
416+
saved.push(SavedReverseDepEdge {
417+
source_id: row.0,
418+
tgt_name: row.1,
419+
tgt_kind: row.2,
420+
tgt_file: row.3,
421+
tgt_line: row.4,
422+
edge_kind: row.5,
423+
confidence: row.6,
424+
dynamic: row.7,
425+
});
426+
}
427+
}
428+
saved
429+
}
430+
431+
/// Reconnect saved reverse-dep edges to the new target node IDs.
432+
///
433+
/// The source node ID is still valid (reverse-dep nodes were never purged).
434+
/// The target was deleted and re-inserted with a new ID — look it up by
435+
/// (name, kind, file) using nearest-line matching, and recreate the edge.
436+
/// Mirrors `reconnectReverseDepEdges` in `build-edges.ts`.
437+
///
438+
/// Returns (reconnected, dropped) counts.
439+
pub fn reconnect_reverse_dep_edges(
440+
conn: &Connection,
441+
saved: &[SavedReverseDepEdge],
442+
) -> (usize, usize) {
443+
if saved.is_empty() {
444+
return (0, 0);
445+
}
446+
let tx = match conn.unchecked_transaction() {
447+
Ok(tx) => tx,
448+
Err(_) => return (0, 0),
449+
};
450+
451+
let mut reconnected = 0usize;
452+
let mut dropped = 0usize;
453+
{
454+
let mut find_stmt = match tx.prepare(
455+
"SELECT id FROM nodes WHERE name = ?1 AND kind = ?2 AND file = ?3 \
456+
ORDER BY ABS(line - ?4) LIMIT 1",
457+
) {
458+
Ok(s) => s,
459+
Err(_) => return (0, 0),
460+
};
461+
let mut insert_stmt = match tx.prepare(
462+
"INSERT OR IGNORE INTO edges (source_id, target_id, kind, confidence, dynamic) \
463+
VALUES (?1, ?2, ?3, ?4, ?5)",
464+
) {
465+
Ok(s) => s,
466+
Err(_) => return (0, 0),
467+
};
468+
for s in saved {
469+
match find_stmt.query_row(
470+
rusqlite::params![&s.tgt_name, &s.tgt_kind, &s.tgt_file, s.tgt_line],
471+
|row| row.get::<_, i64>(0),
472+
) {
473+
Ok(new_id) => {
474+
// INSERT OR IGNORE silently swallows duplicate-row constraint
475+
// errors and returns Ok(0). Only count rows that actually
476+
// inserted so the diagnostic counter isn't inflated by no-ops.
477+
match insert_stmt.execute(rusqlite::params![
478+
s.source_id,
479+
new_id,
480+
&s.edge_kind,
481+
s.confidence,
482+
s.dynamic,
483+
]) {
484+
Ok(n) if n > 0 => reconnected += 1,
485+
Ok(_) => {} // duplicate skipped by INSERT OR IGNORE
486+
Err(_) => dropped += 1,
487+
}
488+
}
489+
Err(_) => {
490+
dropped += 1;
491+
}
492+
}
493+
}
494+
}
495+
let _ = tx.commit();
496+
(reconnected, dropped)
497+
}
498+
348499
/// Find files that import from changed files (reverse dependencies).
349500
pub fn find_reverse_dependencies(
350501
conn: &Connection,

0 commit comments

Comments
 (0)