diff --git a/Cargo.lock b/Cargo.lock index 757dfb80..f1ba9935 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1103,7 +1103,7 @@ dependencies = [ [[package]] name = "gather-step" -version = "4.2.1" +version = "4.3.1" dependencies = [ "anyhow", "blake3", @@ -1145,7 +1145,7 @@ dependencies = [ [[package]] name = "gather-step-analysis" -version = "4.2.1" +version = "4.3.1" dependencies = [ "gather-step-core", "gather-step-parser", @@ -1160,7 +1160,7 @@ dependencies = [ [[package]] name = "gather-step-bench" -version = "4.2.1" +version = "4.3.1" dependencies = [ "anyhow", "chrono", @@ -1190,7 +1190,7 @@ dependencies = [ [[package]] name = "gather-step-core" -version = "4.2.1" +version = "4.3.1" dependencies = [ "bitcode", "blake3", @@ -1204,7 +1204,7 @@ dependencies = [ [[package]] name = "gather-step-deploy" -version = "4.2.1" +version = "4.3.1" dependencies = [ "gather-step-core", "pretty_assertions", @@ -1216,7 +1216,7 @@ dependencies = [ [[package]] name = "gather-step-git" -version = "4.2.1" +version = "4.3.1" dependencies = [ "blake3", "gather-step-core", @@ -1233,7 +1233,7 @@ dependencies = [ [[package]] name = "gather-step-mcp" -version = "4.2.1" +version = "4.3.1" dependencies = [ "blake3", "criterion", @@ -1264,7 +1264,7 @@ dependencies = [ [[package]] name = "gather-step-output" -version = "4.2.1" +version = "4.3.1" dependencies = [ "chrono", "gather-step-analysis", @@ -1276,7 +1276,7 @@ dependencies = [ [[package]] name = "gather-step-parser" -version = "4.2.1" +version = "4.3.1" dependencies = [ "aho-corasick", "blake3", @@ -1318,7 +1318,7 @@ dependencies = [ [[package]] name = "gather-step-storage" -version = "4.2.1" +version = "4.3.1" dependencies = [ "bitcode", "blake3", diff --git a/Cargo.toml b/Cargo.toml index 2b5e0f21..a90acb56 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ members = [ ] [workspace.package] -version = "4.2.1" +version = "4.3.1" authors = ["JJ Adonis"] edition = "2024" rust-version = "1.96" @@ -24,15 +24,15 @@ homepage = "https://github.com/thedoublejay/gather-step" description = "High-performance multi-repo codebase intelligence engine" [workspace.dependencies] -gather-step = { path = "crates/gather-step-cli", version = "4.2.1" } -gather-step-analysis = { path = "crates/gather-step-analysis", version = "4.2.1" } -gather-step-core = { path = "crates/gather-step-core", version = "4.2.1" } -gather-step-deploy = { path = "crates/gather-step-deploy", version = "4.2.1" } -gather-step-git = { path = "crates/gather-step-git", version = "4.2.1" } -gather-step-mcp = { path = "crates/gather-step-mcp", version = "4.2.1" } -gather-step-output = { path = "crates/gather-step-output", version = "4.2.1" } -gather-step-parser = { path = "crates/gather-step-parser", version = "4.2.1" } -gather-step-storage = { path = "crates/gather-step-storage", version = "4.2.1" } +gather-step = { path = "crates/gather-step-cli", version = "4.3.1" } +gather-step-analysis = { path = "crates/gather-step-analysis", version = "4.3.1" } +gather-step-core = { path = "crates/gather-step-core", version = "4.3.1" } +gather-step-deploy = { path = "crates/gather-step-deploy", version = "4.3.1" } +gather-step-git = { path = "crates/gather-step-git", version = "4.3.1" } +gather-step-mcp = { path = "crates/gather-step-mcp", version = "4.3.1" } +gather-step-output = { path = "crates/gather-step-output", version = "4.3.1" } +gather-step-parser = { path = "crates/gather-step-parser", version = "4.3.1" } +gather-step-storage = { path = "crates/gather-step-storage", version = "4.3.1" } tree-sitter = "=0.26.9" tree-sitter-typescript = "0.23.2" diff --git a/crates/gather-step-analysis/src/anchor.rs b/crates/gather-step-analysis/src/anchor.rs index c68cb0e3..8c42c99b 100644 --- a/crates/gather-step-analysis/src/anchor.rs +++ b/crates/gather-step-analysis/src/anchor.rs @@ -148,7 +148,7 @@ fn score_candidate( if boundary_bonus > 0.0 { let downstream = outgoing .iter() - .filter(|edge| !matches!(edge.kind, EdgeKind::Defines | EdgeKind::Imports)) + .filter(|edge| edge.kind.is_consumer_edge()) .count(); rationale.push(AnchorRationale::ControllerService { downstream_nodes: downstream, diff --git a/crates/gather-step-analysis/src/cycles.rs b/crates/gather-step-analysis/src/cycles.rs new file mode 100644 index 00000000..9a6f89d4 --- /dev/null +++ b/crates/gather-step-analysis/src/cycles.rs @@ -0,0 +1,286 @@ +use gather_step_core::{EdgeKind, NodeKind}; +use gather_step_storage::{GraphStore, GraphStoreError}; +use rustc_hash::FxHashMap; +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum CycleError { + #[error(transparent)] + Store(#[from] GraphStoreError), +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Cycle { + pub nodes: Vec, + pub repos: Vec, + pub cross_repo: bool, +} + +pub fn find_cycles( + store: &S, + edge_kinds: Option<&[EdgeKind]>, +) -> Result, CycleError> { + let mut index_of: FxHashMap<[u8; 16], usize> = FxHashMap::default(); + let mut labels: Vec = Vec::new(); + let mut repos: Vec = Vec::new(); + let mut node_ids = Vec::new(); + for kind in NodeKind::all() { + for node in store.nodes_by_type(*kind)? { + let key = node.id.as_bytes(); + if index_of.contains_key(&key) { + continue; + } + index_of.insert(key, labels.len()); + labels.push( + node.qualified_name + .clone() + .unwrap_or_else(|| format!("{}::{}", node.repo, node.name)), + ); + repos.push(node.repo.clone()); + node_ids.push(node.id); + } + } + + let count = node_ids.len(); + let mut adjacency: Vec> = vec![Vec::new(); count]; + let mut self_loop = vec![false; count]; + for (source_index, node_id) in node_ids.iter().enumerate() { + for edge in store.get_outgoing(*node_id)? { + if let Some(kinds) = edge_kinds + && !kinds.contains(&edge.kind) + { + continue; + } + let Some(&target_index) = index_of.get(&edge.target.as_bytes()) else { + continue; + }; + if target_index == source_index { + self_loop[source_index] = true; + } + adjacency[source_index].push(target_index); + } + } + + let sccs = tarjan(&adjacency); + + let mut cycles = Vec::new(); + for scc in sccs { + let is_cycle = scc.len() > 1 || (scc.len() == 1 && self_loop[scc[0]]); + if !is_cycle { + continue; + } + let mut nodes: Vec = scc.iter().map(|&i| labels[i].clone()).collect(); + nodes.sort(); + let mut cycle_repos: Vec = scc.iter().map(|&i| repos[i].clone()).collect(); + cycle_repos.sort(); + cycle_repos.dedup(); + let cross_repo = cycle_repos.len() > 1; + cycles.push(Cycle { + nodes, + repos: cycle_repos, + cross_repo, + }); + } + cycles.sort_by(|left, right| left.nodes.cmp(&right.nodes)); + Ok(cycles) +} + +fn tarjan(adjacency: &[Vec]) -> Vec> { + let count = adjacency.len(); + let mut indices = vec![usize::MAX; count]; + let mut lowlink = vec![0_usize; count]; + let mut on_stack = vec![false; count]; + let mut stack: Vec = Vec::new(); + let mut sccs: Vec> = Vec::new(); + let mut next_index = 0_usize; + + for root in 0..count { + if indices[root] != usize::MAX { + continue; + } + let mut call_stack: Vec<(usize, usize)> = vec![(root, 0)]; + while let Some(&(node, child)) = call_stack.last() { + if child == 0 { + indices[node] = next_index; + lowlink[node] = next_index; + next_index += 1; + stack.push(node); + on_stack[node] = true; + } + + if child < adjacency[node].len() { + let top = call_stack.len() - 1; + call_stack[top].1 += 1; + let target = adjacency[node][child]; + if indices[target] == usize::MAX { + call_stack.push((target, 0)); + } else if on_stack[target] { + lowlink[node] = lowlink[node].min(indices[target]); + } + } else { + if lowlink[node] == indices[node] { + let mut scc = Vec::new(); + while let Some(member) = stack.pop() { + on_stack[member] = false; + scc.push(member); + if member == node { + break; + } + } + sccs.push(scc); + } + call_stack.pop(); + if let Some(&(parent, _)) = call_stack.last() { + lowlink[parent] = lowlink[parent].min(lowlink[node]); + } + } + } + } + sccs +} + +#[cfg(test)] +mod tests { + use std::{ + env, fs, + path::{Path, PathBuf}, + process, + sync::atomic::{AtomicU64, Ordering}, + }; + + use gather_step_core::{EdgeData, EdgeKind, EdgeMetadata, NodeData, NodeId, NodeKind, node_id}; + use gather_step_storage::{GraphStore, GraphStoreDb}; + + use super::find_cycles; + + static TEMP_COUNTER: AtomicU64 = AtomicU64::new(0); + + struct TempDb { + path: PathBuf, + } + + impl TempDb { + fn new(name: &str) -> Self { + let id = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed); + let path = env::temp_dir().join(format!( + "gather-step-cycles-{name}-{}-{id}.redb", + process::id() + )); + Self { path } + } + + fn path(&self) -> &Path { + &self.path + } + } + + impl Drop for TempDb { + fn drop(&mut self) { + let _ = fs::remove_file(&self.path); + } + } + + fn func(repo: &str, name: &str) -> NodeData { + NodeData { + id: node_id(repo, "src/a.ts", NodeKind::Function, name), + kind: NodeKind::Function, + repo: repo.to_owned(), + file_path: "src/a.ts".to_owned(), + name: name.to_owned(), + qualified_name: Some(format!("{repo}::{name}")), + external_id: None, + signature: None, + visibility: None, + span: None, + is_virtual: false, + } + } + + fn calls(source: NodeId, target: NodeId, owner: NodeId) -> EdgeData { + EdgeData { + source, + target, + kind: EdgeKind::Calls, + metadata: EdgeMetadata::default(), + owner_file: owner, + is_cross_file: false, + } + } + + #[test] + fn detects_a_simple_cycle_and_ignores_acyclic_chains() { + let temp = TempDb::new("simple"); + let store = GraphStoreDb::open(temp.path()).expect("store"); + let file = NodeData { + kind: NodeKind::File, + ..func("web", "src/a.ts") + }; + let a = func("web", "a"); + let b = func("web", "b"); + let c = func("web", "c"); + let d = func("web", "d"); + store + .bulk_insert( + &[file.clone(), a.clone(), b.clone(), c.clone(), d.clone()], + &[ + calls(a.id, b.id, file.id), + calls(b.id, c.id, file.id), + calls(c.id, a.id, file.id), + calls(c.id, d.id, file.id), + ], + ) + .expect("write"); + + let cycles = find_cycles(&store, Some(&[EdgeKind::Calls])).expect("cycles"); + assert_eq!(cycles.len(), 1, "expected one cycle, got {cycles:?}"); + assert_eq!(cycles[0].nodes, vec!["web::a", "web::b", "web::c"]); + assert!(!cycles[0].cross_repo); + } + + #[test] + fn acyclic_graph_has_no_cycles() { + let temp = TempDb::new("acyclic"); + let store = GraphStoreDb::open(temp.path()).expect("store"); + let file = NodeData { + kind: NodeKind::File, + ..func("web", "src/a.ts") + }; + let a = func("web", "a"); + let b = func("web", "b"); + store + .bulk_insert( + &[file.clone(), a.clone(), b.clone()], + &[calls(a.id, b.id, file.id)], + ) + .expect("write"); + + assert!( + find_cycles(&store, Some(&[EdgeKind::Calls])) + .expect("cycles") + .is_empty() + ); + } + + #[test] + fn flags_cross_repo_cycle() { + let temp = TempDb::new("cross-repo"); + let store = GraphStoreDb::open(temp.path()).expect("store"); + let file = NodeData { + kind: NodeKind::File, + ..func("web", "src/a.ts") + }; + let a = func("web", "a"); + let b = func("api", "b"); + store + .bulk_insert( + &[file.clone(), a.clone(), b.clone()], + &[calls(a.id, b.id, file.id), calls(b.id, a.id, file.id)], + ) + .expect("write"); + + let cycles = find_cycles(&store, Some(&[EdgeKind::Calls])).expect("cycles"); + assert_eq!(cycles.len(), 1); + assert!(cycles[0].cross_repo); + assert_eq!(cycles[0].repos, vec!["api", "web"]); + } +} diff --git a/crates/gather-step-analysis/src/lib.rs b/crates/gather-step-analysis/src/lib.rs index 3ac29fdb..07517af6 100644 --- a/crates/gather-step-analysis/src/lib.rs +++ b/crates/gather-step-analysis/src/lib.rs @@ -9,17 +9,21 @@ pub mod contract_drift; pub mod conventions; pub mod cross_repo; pub mod crud_trace; +pub mod cycles; pub mod dead_code; pub mod deployment_topology; pub mod event_topology; pub mod evidence; pub mod impact; +pub mod mock_leakage; +pub mod mongo_query_safety; pub mod overview; pub mod pack_assembly; pub mod projection_impact; pub mod proofs; pub mod query; pub mod semantic_health; +pub mod shared_component_usage; pub mod shared_contract; pub mod transport; @@ -36,6 +40,7 @@ pub use cross_repo::{ pub use crud_trace::{ CrudTrace, CrudTraceEntry, CrudTraceError, CrudTraceRole, trace_crud_route, trace_crud_symbol, }; +pub use cycles::{Cycle, CycleError, find_cycles}; pub use dead_code::{ ConfidenceBand, DeadCodeError, DeadCodeFinding, DeadCodeReport, DetectorBasis, find_dead_code, find_dead_code_with_manifest, @@ -54,6 +59,11 @@ pub use event_topology::{ pub use impact::{ BoundaryRole, EvidenceBand, ImpactError, ImpactMap, ImpactedFile, shared_contract_impact, }; +pub use mock_leakage::{MockLeakage, MockLeakageError, find_mock_leakage, is_mock_path}; +pub use mongo_query_safety::{ + MongoQueryFinding, RULE_ATLAS_INDEX_DRIFT, RULE_INDEX_DEFEAT, RULE_NULL_PARENT_PATH, + RULE_UNSAFE_COERCION, analyze_atlas_index_drift, analyze_mongo_value, +}; pub use overview::{ModuleSummary, OverviewError, RepoOverview, build_overview}; pub use pack_assembly::{ CandidateKey, Pack, PackAssembler, PackItem, PackMode, QueryShape, SimplePackAssembler, @@ -68,11 +78,14 @@ pub use proofs::{ MAX_PROOFS_PER_REPO, ProofCaller, ProofEngineError, ProofEngineOptions, ProofEngineOutput, build_pack_proofs, derive_repo_sets, finalize_proofs, proof_strength, }; -pub use query::{GraphQuery, QueryError, TraversalStep}; +pub use query::{GraphQuery, QueryError, TraversalOutcome, TraversalStep}; pub use semantic_health::{ SemanticHealthError, SemanticHealthReport, SemanticLinkHealth, semantic_health_for_repo, semantic_health_for_workspace, }; +pub use shared_component_usage::{ + ReuseOpportunity, SharedComponentError, analyze_shared_component_reuse, is_design_system_path, +}; pub use shared_contract::{ guard_class_name_for_anchor, looks_like_guard_entrypoint, peer_matches_guard_class_name, shared_contract_candidate_ids, diff --git a/crates/gather-step-analysis/src/mock_leakage.rs b/crates/gather-step-analysis/src/mock_leakage.rs new file mode 100644 index 00000000..d2d41bdc --- /dev/null +++ b/crates/gather-step-analysis/src/mock_leakage.rs @@ -0,0 +1,207 @@ +use gather_step_core::EdgeKind; +use gather_step_storage::{GraphStore, GraphStoreError}; +use rustc_hash::FxHashMap; +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum MockLeakageError { + #[error(transparent)] + Store(#[from] GraphStoreError), +} + +const MOCK_MARKERS: &[&str] = &[ + "__mocks__", + ".mock.", + "/mocks/", + ".fixture.", + "/fixtures/", + ".stub.", +]; + +const TEST_MARKERS: &[&str] = &[".test.", ".spec.", "/__tests__/", "/tests/", "/test/"]; + +#[must_use] +pub fn is_mock_path(file_path: &str) -> bool { + MOCK_MARKERS.iter().any(|marker| file_path.contains(marker)) +} + +#[must_use] +pub fn is_test_path(file_path: &str) -> bool { + TEST_MARKERS.iter().any(|marker| file_path.contains(marker)) +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct MockLeakage { + pub repo: String, + pub importer_file: String, + pub mock_file: String, +} + +pub fn find_mock_leakage( + store: &S, + repo: &str, +) -> Result, MockLeakageError> { + let nodes = store.nodes_by_repo(repo)?; + let file_of: FxHashMap<[u8; 16], String> = nodes + .iter() + .map(|node| (node.id.as_bytes(), node.file_path.clone())) + .collect(); + + let mut leaks = Vec::new(); + for node in &nodes { + if is_test_path(&node.file_path) || is_mock_path(&node.file_path) { + continue; + } + for edge in store.get_outgoing(node.id)? { + if edge.kind != EdgeKind::Imports { + continue; + } + let Some(target_file) = file_of.get(&edge.target.as_bytes()) else { + continue; + }; + if is_mock_path(target_file) { + leaks.push(MockLeakage { + repo: repo.to_owned(), + importer_file: node.file_path.clone(), + mock_file: target_file.clone(), + }); + } + } + } + + leaks.sort_by(|left, right| { + left.importer_file + .cmp(&right.importer_file) + .then(left.mock_file.cmp(&right.mock_file)) + }); + leaks.dedup(); + Ok(leaks) +} + +#[cfg(test)] +mod tests { + use std::{ + env, fs, + path::{Path, PathBuf}, + process, + sync::atomic::{AtomicU64, Ordering}, + }; + + use gather_step_core::{EdgeData, EdgeKind, EdgeMetadata, NodeData, NodeId, NodeKind, node_id}; + use gather_step_storage::{GraphStore, GraphStoreDb}; + + use super::find_mock_leakage; + + static TEMP_COUNTER: AtomicU64 = AtomicU64::new(0); + + struct TempDb { + path: PathBuf, + } + + impl TempDb { + fn new(name: &str) -> Self { + let id = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed); + let path = env::temp_dir().join(format!( + "gather-step-mock-leakage-{name}-{}-{id}.redb", + process::id() + )); + Self { path } + } + + fn path(&self) -> &Path { + &self.path + } + } + + impl Drop for TempDb { + fn drop(&mut self) { + let _ = fs::remove_file(&self.path); + } + } + + fn module(repo: &str, file_path: &str) -> NodeData { + NodeData { + id: node_id(repo, file_path, NodeKind::File, file_path), + kind: NodeKind::File, + repo: repo.to_owned(), + file_path: file_path.to_owned(), + name: file_path.to_owned(), + qualified_name: Some(file_path.to_owned()), + external_id: None, + signature: None, + visibility: None, + span: None, + is_virtual: false, + } + } + + fn imports(source: NodeId, target: NodeId, owner: NodeId) -> EdgeData { + EdgeData { + source, + target, + kind: EdgeKind::Imports, + metadata: EdgeMetadata::default(), + owner_file: owner, + is_cross_file: true, + } + } + + #[test] + fn flags_prod_module_importing_a_mock() { + let temp = TempDb::new("leak"); + let store = GraphStoreDb::open(temp.path()).expect("store"); + let prod = module("web", "src/features/OrderList.tsx"); + let mock = module("web", "src/features/__mocks__/orders.mock.ts"); + store + .bulk_insert( + &[prod.clone(), mock.clone()], + &[imports(prod.id, mock.id, prod.id)], + ) + .expect("write"); + + let leaks = find_mock_leakage(&store, "web").expect("analyze"); + assert_eq!(leaks.len(), 1); + assert_eq!(leaks[0].importer_file, "src/features/OrderList.tsx"); + assert_eq!(leaks[0].mock_file, "src/features/__mocks__/orders.mock.ts"); + } + + #[test] + fn test_file_importing_a_mock_is_not_flagged() { + let temp = TempDb::new("test-import"); + let store = GraphStoreDb::open(temp.path()).expect("store"); + let spec = module("web", "src/features/OrderList.test.tsx"); + let mock = module("web", "src/features/__mocks__/orders.mock.ts"); + store + .bulk_insert( + &[spec.clone(), mock.clone()], + &[imports(spec.id, mock.id, spec.id)], + ) + .expect("write"); + + assert!( + find_mock_leakage(&store, "web") + .expect("analyze") + .is_empty() + ); + } + + #[test] + fn prod_importing_prod_is_not_flagged() { + let temp = TempDb::new("clean"); + let store = GraphStoreDb::open(temp.path()).expect("store"); + let prod = module("web", "src/features/OrderList.tsx"); + let helper = module("web", "src/features/format.ts"); + store + .bulk_insert( + &[prod.clone(), helper.clone()], + &[imports(prod.id, helper.id, prod.id)], + ) + .expect("write"); + + assert!( + find_mock_leakage(&store, "web") + .expect("analyze") + .is_empty() + ); + } +} diff --git a/crates/gather-step-analysis/src/mongo_query_safety.rs b/crates/gather-step-analysis/src/mongo_query_safety.rs new file mode 100644 index 00000000..3bd741d9 --- /dev/null +++ b/crates/gather-step-analysis/src/mongo_query_safety.rs @@ -0,0 +1,297 @@ +//! Structural safety detectors for Mongo queries, aggregations, and Atlas +//! search-index definitions. Pure over a [`serde_json::Value`] so they are +//! testable without a live parser. + +use serde_json::Value; + +pub const RULE_INDEX_DEFEAT: &str = "GS-MONGO-INDEX-DEFEAT"; +pub const RULE_UNSAFE_COERCION: &str = "GS-MONGO-UNSAFE-COERCION"; +pub const RULE_NULL_PARENT_PATH: &str = "GS-MONGO-NULL-PARENT-PATH"; +pub const RULE_ATLAS_INDEX_DRIFT: &str = "GS-MONGO-ATLAS-INDEX-DRIFT"; + +#[derive(Clone, Debug, PartialEq)] +pub struct MongoQueryFinding { + pub rule_id: &'static str, + pub confidence: f64, + pub message: String, + pub path: String, +} + +#[must_use] +pub fn analyze_mongo_value(value: &Value) -> Vec { + let mut findings = Vec::new(); + walk(value, "$", &mut findings); + findings.sort_by(|left, right| { + left.rule_id + .cmp(right.rule_id) + .then_with(|| left.path.cmp(&right.path)) + }); + findings +} + +#[must_use] +pub fn analyze_atlas_index_drift( + index_def: &Value, + referenced_fields: &[&str], +) -> Vec { + let mappings = index_def.get("mappings"); + let dynamic = mappings + .and_then(|m| m.get("dynamic")) + .and_then(Value::as_bool) + .unwrap_or(true); + if dynamic { + return Vec::new(); + } + let mapped: Vec<&str> = mappings + .and_then(|m| m.get("fields")) + .and_then(Value::as_object) + .map(|fields| fields.keys().map(String::as_str).collect()) + .unwrap_or_default(); + + let mut findings: Vec = referenced_fields + .iter() + .filter(|field| !mapped.contains(*field)) + .map(|field| MongoQueryFinding { + rule_id: RULE_ATLAS_INDEX_DRIFT, + confidence: 0.75, + message: format!( + "Field `{field}` is queried but absent from the `dynamic:false` Atlas index \ + mapping; the search silently matches nothing on it. Add it to the mapping." + ), + path: (*field).to_owned(), + }) + .collect(); + findings.sort_by(|left, right| left.path.cmp(&right.path)); + findings +} + +fn walk(value: &Value, path: &str, findings: &mut Vec) { + match value { + Value::Object(map) => { + for (key, child) in map { + let child_path = format!("{path}.{key}"); + match key.as_str() { + "$lookup" => detect_index_defeat(child, &child_path, findings), + "$toObjectId" => findings.push(MongoQueryFinding { + rule_id: RULE_UNSAFE_COERCION, + confidence: 0.8, + message: "Bare `$toObjectId` throws on a malformed id; use `$convert` \ + with an `onError` fallback for untrusted input." + .to_owned(), + path: child_path.clone(), + }), + "$set" => detect_null_parent_path(child, &child_path, findings), + _ => {} + } + walk(child, &child_path, findings); + } + } + Value::Array(items) => { + for (index, item) in items.iter().enumerate() { + walk(item, &format!("{path}[{index}]"), findings); + } + } + _ => {} + } +} + +fn detect_index_defeat(lookup: &Value, path: &str, findings: &mut Vec) { + if mentions_operator(lookup, "$toString") || mentions_operator(lookup, "$toObjectId") { + findings.push(MongoQueryFinding { + rule_id: RULE_INDEX_DEFEAT, + confidence: 0.7, + message: "`$lookup` coerces its join key (`$toString`/`$toObjectId`), defeating the \ + index on the join field; align field types or pre-store the join key." + .to_owned(), + path: path.to_owned(), + }); + } +} + +fn detect_null_parent_path(set_doc: &Value, path: &str, findings: &mut Vec) { + let Value::Object(map) = set_doc else { + return; + }; + for (field, expr) in map { + let Some((parent, _)) = field.rsplit_once('.') else { + continue; + }; + if value_guards_parent(expr) || sibling_assigns_parent(map, parent) { + continue; + } + findings.push(MongoQueryFinding { + rule_id: RULE_NULL_PARENT_PATH, + confidence: 0.6, + message: format!( + "`$set` on dotted path `{field}` has no existence/`$type:object` guard on parent \ + `{parent}`; a null or scalar parent will be clobbered or error." + ), + path: format!("{path}.{field}"), + }); + } +} + +fn value_guards_parent(expr: &Value) -> bool { + mentions_operator(expr, "$ifNull") || mentions_operator(expr, "$cond") +} + +fn sibling_assigns_parent(map: &serde_json::Map, parent: &str) -> bool { + map.keys() + .any(|key| key == parent || parent.starts_with(&format!("{key}."))) +} + +fn mentions_operator(value: &Value, op: &str) -> bool { + match value { + Value::Object(map) => map + .iter() + .any(|(key, child)| key == op || mentions_operator(child, op)), + Value::Array(items) => items.iter().any(|item| mentions_operator(item, op)), + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::{ + RULE_INDEX_DEFEAT, RULE_NULL_PARENT_PATH, RULE_UNSAFE_COERCION, analyze_mongo_value, + }; + use serde_json::json; + + fn rule_ids(value: &serde_json::Value) -> Vec<&'static str> { + analyze_mongo_value(value) + .into_iter() + .map(|finding| finding.rule_id) + .collect() + } + + #[test] + fn flags_lookup_that_coerces_join_key_but_not_index_aligned_lookup() { + let coercing = json!([{ + "$lookup": { + "from": "users", + "let": { "uid": "$userId" }, + "pipeline": [ + { "$match": { "$expr": { "$eq": [{ "$toString": "$_id" }, "$$uid"] } } } + ], + "as": "user" + } + }]); + assert!(rule_ids(&coercing).contains(&RULE_INDEX_DEFEAT)); + + let aligned = json!([{ + "$lookup": { + "from": "users", + "localField": "userId", + "foreignField": "_id", + "as": "user" + } + }]); + assert!(!rule_ids(&aligned).contains(&RULE_INDEX_DEFEAT)); + } + + #[test] + fn flags_bare_to_object_id_but_not_convert_with_on_error() { + let bare = json!({ "$match": { "_id": { "$toObjectId": "$$req.id" } } }); + assert!(rule_ids(&bare).contains(&RULE_UNSAFE_COERCION)); + + let safe = json!({ + "$match": { + "_id": { "$convert": { "input": "$$req.id", "to": "objectId", "onError": null } } + } + }); + assert!(!rule_ids(&safe).contains(&RULE_UNSAFE_COERCION)); + } + + #[test] + fn flags_unguarded_dotted_set_but_not_guarded_sibling() { + let unguarded = json!({ "$set": { "meta.flags.active": true } }); + assert!(rule_ids(&unguarded).contains(&RULE_NULL_PARENT_PATH)); + + let guarded_value = + json!({ "$set": { "meta.flags.active": { "$ifNull": ["$meta.flags.active", true] } } }); + assert!(!rule_ids(&guarded_value).contains(&RULE_NULL_PARENT_PATH)); + + let guarded_sibling = + json!({ "$set": { "meta": { "flags": {} }, "meta.flags.active": true } }); + assert!(!rule_ids(&guarded_sibling).contains(&RULE_NULL_PARENT_PATH)); + } + + #[test] + fn flags_unmapped_field_on_dynamic_false_index_but_not_mapped_field() { + use super::{RULE_ATLAS_INDEX_DRIFT, analyze_atlas_index_drift}; + + let index = json!({ + "mappings": { "dynamic": false, "fields": { "title": {}, "body": {} } } + }); + let findings = analyze_atlas_index_drift(&index, &["title", "entity"]); + let ids: Vec<_> = findings.iter().map(|f| f.rule_id).collect(); + assert!(ids.contains(&RULE_ATLAS_INDEX_DRIFT)); + assert!(findings.iter().any(|f| f.path == "entity")); + assert!(!findings.iter().any(|f| f.path == "title")); + + let dynamic_index = json!({ "mappings": { "dynamic": true } }); + assert!(analyze_atlas_index_drift(&dynamic_index, &["anything"]).is_empty()); + } + + #[test] + fn trap_detectors_cover_each_pattern_and_clean_siblings() { + use super::{ + RULE_ATLAS_INDEX_DRIFT, RULE_INDEX_DEFEAT, RULE_NULL_PARENT_PATH, RULE_UNSAFE_COERCION, + analyze_atlas_index_drift, + }; + + let trap_a = json!([{ "$lookup": { + "from": "u", "let": { "k": "$userId" }, + "pipeline": [{ "$match": { "$expr": { "$eq": [{ "$toString": "$_id" }, "$$k"] } } }], + "as": "u" + }}]); + let clean_a = json!([{ "$lookup": { + "from": "u", "localField": "userId", "foreignField": "_id", "as": "u" + }}]); + assert!(rule_ids(&trap_a).contains(&RULE_INDEX_DEFEAT)); + assert!(!rule_ids(&clean_a).contains(&RULE_INDEX_DEFEAT)); + + let trap_b = json!({ "_id": { "$toObjectId": "$$req.id" } }); + let clean_b = json!({ "_id": { "$convert": { "input": "$$req.id", "to": "objectId", "onError": null } } }); + assert!(rule_ids(&trap_b).contains(&RULE_UNSAFE_COERCION)); + assert!(!rule_ids(&clean_b).contains(&RULE_UNSAFE_COERCION)); + + let trap_c = json!({ "$set": { "meta.flags.active": true } }); + let clean_c = json!({ "$set": { "meta": { "flags": {} }, "meta.flags.active": true } }); + assert!(rule_ids(&trap_c).contains(&RULE_NULL_PARENT_PATH)); + assert!(!rule_ids(&clean_c).contains(&RULE_NULL_PARENT_PATH)); + + let index = json!({ "mappings": { "dynamic": false, "fields": { "title": {} } } }); + let trap_d = analyze_atlas_index_drift(&index, &["title", "entity"]); + assert!( + trap_d + .iter() + .any(|f| f.rule_id == RULE_ATLAS_INDEX_DRIFT && f.path == "entity") + ); + assert!(!trap_d.iter().any(|f| f.path == "title")); + } + + #[test] + fn clean_pipeline_yields_no_findings() { + let clean = json!([ + { "$match": { "status": "active" } }, + { "$project": { "name": 1, "email": 1 } } + ]); + assert!(analyze_mongo_value(&clean).is_empty()); + } + + #[test] + fn findings_are_deterministically_sorted() { + let value = json!({ + "$set": { "a.b": true }, + "stage": { "$toObjectId": "$x" } + }); + let first = analyze_mongo_value(&value); + let second = analyze_mongo_value(&value); + assert_eq!(first, second); + let ids: Vec<_> = first.iter().map(|f| f.rule_id).collect(); + let mut sorted = ids.clone(); + sorted.sort_unstable(); + assert_eq!(ids, sorted); + } +} diff --git a/crates/gather-step-analysis/src/query.rs b/crates/gather-step-analysis/src/query.rs index 697d6ebb..d259ee2e 100644 --- a/crates/gather-step-analysis/src/query.rs +++ b/crates/gather-step-analysis/src/query.rs @@ -2,9 +2,11 @@ use std::collections::{BTreeMap, VecDeque}; use gather_step_core::{EdgeData, EdgeKind, NodeData, NodeId, NodeKind}; use gather_step_storage::{GraphStore, GraphStoreError}; -use rustc_hash::FxHashSet; +use rustc_hash::{FxHashMap, FxHashSet}; use thiserror::Error; +const DEFAULT_FANOUT_CAP: usize = 256; + #[derive(Debug, Error)] pub enum QueryError { #[error(transparent)] @@ -16,6 +18,14 @@ pub struct TraversalStep { pub node_id: NodeId, pub edge_kinds: Vec, pub depth: usize, + pub in_paths: Vec>, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct TraversalOutcome { + pub steps: Vec, + pub depth_capped: bool, + pub truncated: bool, } pub struct GraphQuery<'a, S> { @@ -40,11 +50,13 @@ impl<'a, S: GraphStore> GraphQuery<'a, S> { &self, source: NodeId, edge_kind: Option, + min_confidence: Option, ) -> Result, QueryError> { let mut edges = self.store.get_outgoing(source)?; if let Some(edge_kind) = edge_kind { edges.retain(|edge| edge.kind == edge_kind); } + edges.retain(|edge| edge.metadata.passes_confidence(min_confidence)); Ok(edges) } @@ -52,11 +64,13 @@ impl<'a, S: GraphStore> GraphQuery<'a, S> { &self, target: NodeId, edge_kind: Option, + min_confidence: Option, ) -> Result, QueryError> { let mut edges = self.store.get_incoming(target)?; if let Some(edge_kind) = edge_kind { edges.retain(|edge| edge.kind == edge_kind); } + edges.retain(|edge| edge.metadata.passes_confidence(min_confidence)); Ok(edges) } @@ -65,34 +79,88 @@ impl<'a, S: GraphStore> GraphQuery<'a, S> { start: NodeId, edge_kinds: &[EdgeKind], max_depth: usize, + min_confidence: Option, ) -> Result, QueryError> { + Ok(self + .traverse_with_provenance(start, edge_kinds, max_depth, min_confidence)? + .steps) + } + + pub fn traverse_with_provenance( + &self, + start: NodeId, + edge_kinds: &[EdgeKind], + max_depth: usize, + min_confidence: Option, + ) -> Result { let mut queue = VecDeque::from([(start, Vec::::new(), 0_usize)]); - let mut seen = FxHashSet::from_iter([start.as_bytes()]); - let mut steps = Vec::new(); + let mut enqueued = FxHashSet::from_iter([start.as_bytes()]); + let mut order: Vec = Vec::new(); + let mut primary: FxHashMap<[u8; 16], (Vec, usize)> = FxHashMap::default(); + let mut in_paths: FxHashMap<[u8; 16], Vec>> = FxHashMap::default(); + let mut depth_capped = false; + let mut truncated = false; while let Some((node_id, path, depth)) = queue.pop_front() { + let outgoing: Vec = self + .store + .get_outgoing(node_id)? + .into_iter() + .filter(|edge| edge_kinds.is_empty() || edge_kinds.contains(&edge.kind)) + .filter(|edge| edge.metadata.passes_confidence(min_confidence)) + .collect(); + if depth >= max_depth { + if !outgoing.is_empty() { + depth_capped = true; + } continue; } - for edge in self.store.get_outgoing(node_id)? { - if !edge_kinds.is_empty() && !edge_kinds.contains(&edge.kind) { - continue; + for (index, edge) in outgoing.iter().enumerate() { + if index >= DEFAULT_FANOUT_CAP { + truncated = true; + break; } let mut next_path = path.clone(); next_path.push(edge.kind); - if seen.insert(edge.target.as_bytes()) { - steps.push(TraversalStep { - node_id: edge.target, - edge_kinds: next_path.clone(), - depth: depth + 1, - }); + let key = edge.target.as_bytes(); + + let recorded = in_paths.entry(key).or_default(); + if recorded.len() < DEFAULT_FANOUT_CAP { + recorded.push(next_path.clone()); + } else { + truncated = true; + } + + if enqueued.insert(key) { + order.push(edge.target); + primary.insert(key, (next_path.clone(), depth + 1)); queue.push_back((edge.target, next_path, depth + 1)); } } } - Ok(steps) + let steps = order + .into_iter() + .map(|node_id| { + let key = node_id.as_bytes(); + let (edges, depth) = primary.get(&key).cloned().unwrap_or_default(); + let paths = in_paths.get(&key).cloned().unwrap_or_default(); + TraversalStep { + node_id, + edge_kinds: edges, + depth, + in_paths: paths, + } + }) + .collect(); + + Ok(TraversalOutcome { + steps, + depth_capped, + truncated, + }) } pub fn count_by_kind(&self) -> Result, QueryError> { @@ -113,6 +181,45 @@ impl<'a, S: GraphStore> GraphQuery<'a, S> { } Ok(counts) } + + pub fn resolution_fingerprint(&self) -> Result, QueryError> { + let mut labels: FxHashMap<[u8; 16], String> = FxHashMap::default(); + let mut node_ids: Vec = Vec::new(); + for kind in NodeKind::all() { + for node in self.store.nodes_by_type(*kind)? { + let label = node + .qualified_name + .clone() + .unwrap_or_else(|| format!("{}::{}", node.repo, node.name)); + labels.insert(node.id.as_bytes(), label); + node_ids.push(node.id); + } + } + + let mut lines = Vec::new(); + for node_id in node_ids { + for edge in self.store.get_outgoing(node_id)? { + let source = labels + .get(&edge.source.as_bytes()) + .map_or("", String::as_str); + let target = labels + .get(&edge.target.as_bytes()) + .map_or("", String::as_str); + let resolver = edge.metadata.resolver.as_deref().unwrap_or("none"); + let confidence = edge + .metadata + .confidence + .map_or_else(|| "none".to_owned(), |value| value.to_string()); + lines.push(format!( + "{source}\t{}\t{target}\t{resolver}\t{confidence}", + edge.kind + )); + } + } + lines.sort(); + lines.dedup(); + Ok(lines) + } } #[cfg(test)] @@ -221,13 +328,13 @@ mod tests { ); assert_eq!( query - .get_edges(a.id, Some(EdgeKind::Calls)) + .get_edges(a.id, Some(EdgeKind::Calls), None) .expect("edges should load") .len(), 1 ); let traversed = query - .traverse(a.id, &[EdgeKind::Calls], 2) + .traverse(a.id, &[EdgeKind::Calls], 2, None) .expect("traversal should succeed"); assert_eq!(traversed.len(), 2); assert!( @@ -242,6 +349,172 @@ mod tests { ); } + #[test] + fn min_confidence_filters_low_confidence_edges_but_keeps_structural() { + let temp_db = TempDb::new("query-confidence"); + let store = test_store(temp_db.path()); + let file = node("service-a", "src/a.ts", NodeKind::File, "src/a.ts", 0); + let a = node("service-a", "src/a.ts", NodeKind::Function, "a", 0); + let trusted = node("service-a", "src/a.ts", NodeKind::Function, "trusted", 1); + let guessed = node("service-a", "src/a.ts", NodeKind::Function, "guessed", 2); + + // a -> trusted has no confidence (a definite structural edge); + // a -> guessed is a low-confidence heuristic resolution. + let trusted_edge = edge(a.id, trusted.id, file.id); + let mut guessed_edge = edge(a.id, guessed.id, file.id); + guessed_edge.metadata.confidence = Some(300); + + store + .bulk_insert( + &[file.clone(), a.clone(), trusted.clone(), guessed.clone()], + &[trusted_edge, guessed_edge], + ) + .expect("graph should write"); + + let query = GraphQuery::new(&store); + + // No threshold: both edges traversed. + let all = query + .traverse(a.id, &[EdgeKind::Calls], 1, None) + .expect("traversal should succeed"); + assert_eq!(all.len(), 2); + + // Threshold above the heuristic edge: the low-confidence edge is + // dropped, but the structural (None) edge is kept. + let filtered = query + .traverse(a.id, &[EdgeKind::Calls], 1, Some(500)) + .expect("traversal should succeed"); + assert_eq!(filtered.len(), 1); + assert_eq!(filtered[0].node_id, trusted.id); + } + + #[test] + fn traverse_with_provenance_flags_depth_capping() { + let temp_db = TempDb::new("query-depth-cap"); + let store = test_store(temp_db.path()); + let file = node("service-a", "src/a.ts", NodeKind::File, "src/a.ts", 0); + let a = node("service-a", "src/a.ts", NodeKind::Function, "a", 0); + let b = node("service-a", "src/a.ts", NodeKind::Function, "b", 1); + let c = node("service-a", "src/a.ts", NodeKind::Function, "c", 2); + let d = node("service-a", "src/a.ts", NodeKind::Function, "d", 3); + store + .bulk_insert( + &[file.clone(), a.clone(), b.clone(), c.clone(), d.clone()], + &[ + edge(a.id, b.id, file.id), + edge(b.id, c.id, file.id), + edge(c.id, d.id, file.id), + ], + ) + .expect("graph should write"); + + let query = GraphQuery::new(&store); + + let capped = query + .traverse_with_provenance(a.id, &[EdgeKind::Calls], 2, None) + .expect("traversal should succeed"); + assert_eq!(capped.steps.len(), 2); + assert!( + capped.depth_capped, + "deeper edge beyond max_depth not flagged" + ); + assert!(!capped.truncated); + + let full = query + .traverse_with_provenance(a.id, &[EdgeKind::Calls], 8, None) + .expect("traversal should succeed"); + assert_eq!(full.steps.len(), 3); + assert!(!full.depth_capped); + } + + #[test] + fn traverse_with_provenance_records_every_in_path() { + let temp_db = TempDb::new("query-provenance"); + let store = test_store(temp_db.path()); + let file = node("service-a", "src/a.ts", NodeKind::File, "src/a.ts", 0); + let root = node("service-a", "src/a.ts", NodeKind::Function, "root", 0); + let a = node("service-a", "src/a.ts", NodeKind::Function, "a", 1); + let b = node("service-a", "src/a.ts", NodeKind::Function, "b", 2); + let c = node("service-a", "src/a.ts", NodeKind::Function, "c", 3); + let sink = node("service-a", "src/a.ts", NodeKind::Function, "sink", 4); + store + .bulk_insert( + &[ + file.clone(), + root.clone(), + a.clone(), + b.clone(), + c.clone(), + sink.clone(), + ], + &[ + edge(root.id, a.id, file.id), + edge(root.id, b.id, file.id), + edge(root.id, c.id, file.id), + edge(a.id, sink.id, file.id), + edge(b.id, sink.id, file.id), + edge(c.id, sink.id, file.id), + ], + ) + .expect("graph should write"); + + let query = GraphQuery::new(&store); + let outcome = query + .traverse_with_provenance(root.id, &[EdgeKind::Calls], 4, None) + .expect("traversal should succeed"); + + let sink_step = outcome + .steps + .iter() + .find(|step| step.node_id == sink.id) + .expect("sink should be reached"); + assert_eq!( + sink_step.in_paths.len(), + 3, + "a node reachable three ways should report three caller paths" + ); + } + + #[test] + fn traverse_with_provenance_flags_fan_out_truncation() { + let temp_db = TempDb::new("query-fan-out"); + let store = test_store(temp_db.path()); + let file = node("service-a", "src/a.ts", NodeKind::File, "src/a.ts", 0); + let hub = node("service-a", "src/a.ts", NodeKind::Function, "hub", 0); + + let fan_out = 300_u16; + let mut nodes = vec![file.clone(), hub.clone()]; + let mut edges = Vec::new(); + for ordinal in 1..=fan_out { + let leaf = node( + "service-a", + "src/a.ts", + NodeKind::Function, + &format!("leaf{ordinal}"), + ordinal, + ); + edges.push(edge(hub.id, leaf.id, file.id)); + nodes.push(leaf); + } + store + .bulk_insert(&nodes, &edges) + .expect("graph should write"); + + let query = GraphQuery::new(&store); + let outcome = query + .traverse_with_provenance(hub.id, &[EdgeKind::Calls], 2, None) + .expect("traversal should succeed"); + assert!( + outcome.truncated, + "fan-out above the cap must flag truncated" + ); + assert!( + outcome.steps.len() <= 256, + "truncated traversal must not exceed the fan-out cap: {}", + outcome.steps.len() + ); + } + #[test] fn counts_nodes_and_edges_by_kind() { let temp_db = TempDb::new("counts"); diff --git a/crates/gather-step-analysis/src/shared_component_usage.rs b/crates/gather-step-analysis/src/shared_component_usage.rs new file mode 100644 index 00000000..e69b5599 --- /dev/null +++ b/crates/gather-step-analysis/src/shared_component_usage.rs @@ -0,0 +1,192 @@ +use std::collections::BTreeMap; + +use gather_step_core::{NodeData, NodeKind}; +use gather_step_storage::{GraphStore, GraphStoreError}; +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum SharedComponentError { + #[error(transparent)] + Store(#[from] GraphStoreError), +} + +const DESIGN_SYSTEM_MARKERS: &[&str] = &[ + "shared/", + "design-system", + "packages/", + "/ui/components", + "@shared", + "common/", + "/lib/", + "libs/", + "internal/", + "/pkg/", +]; + +#[must_use] +pub fn is_design_system_path(file_path: &str) -> bool { + DESIGN_SYSTEM_MARKERS + .iter() + .any(|marker| file_path.contains(marker)) +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ReuseOpportunity { + pub repo: String, + pub local_symbol: String, + pub local_file: String, + pub shared_symbol: String, + pub shared_file: String, +} + +fn is_component_like(node: &NodeData) -> bool { + matches!(node.kind, NodeKind::Function | NodeKind::Class) +} + +pub fn analyze_shared_component_reuse( + store: &S, + repo: &str, +) -> Result, SharedComponentError> { + let nodes = store.nodes_by_repo(repo)?; + + let mut shared: BTreeMap = BTreeMap::new(); + for node in &nodes { + if is_component_like(node) && is_design_system_path(&node.file_path) { + shared + .entry(node.name.clone()) + .or_insert_with(|| node.file_path.clone()); + } + } + + let mut opportunities = Vec::new(); + for node in &nodes { + if !is_component_like(node) || is_design_system_path(&node.file_path) { + continue; + } + if let Some(shared_file) = shared.get(&node.name) { + opportunities.push(ReuseOpportunity { + repo: repo.to_owned(), + local_symbol: node.name.clone(), + local_file: node.file_path.clone(), + shared_symbol: node.name.clone(), + shared_file: shared_file.clone(), + }); + } + } + + opportunities.sort_by(|left, right| { + left.local_file + .cmp(&right.local_file) + .then(left.local_symbol.cmp(&right.local_symbol)) + }); + opportunities.dedup(); + Ok(opportunities) +} + +#[cfg(test)] +mod tests { + use std::{ + env, fs, + path::{Path, PathBuf}, + process, + sync::atomic::{AtomicU64, Ordering}, + }; + + use gather_step_core::{NodeData, NodeKind, SourceSpan, Visibility, node_id}; + use gather_step_storage::{GraphStore, GraphStoreDb}; + + use super::{analyze_shared_component_reuse, is_design_system_path}; + + static TEMP_COUNTER: AtomicU64 = AtomicU64::new(0); + + struct TempDb { + path: PathBuf, + } + + impl TempDb { + fn new(name: &str) -> Self { + let id = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed); + let path = env::temp_dir().join(format!( + "gather-step-shared-component-{name}-{}-{id}.redb", + process::id() + )); + Self { path } + } + + fn path(&self) -> &Path { + &self.path + } + } + + impl Drop for TempDb { + fn drop(&mut self) { + let _ = fs::remove_file(&self.path); + } + } + + fn node(repo: &str, file_path: &str, name: &str, ordinal: u32) -> NodeData { + NodeData { + id: node_id(repo, file_path, NodeKind::Function, name), + kind: NodeKind::Function, + repo: repo.to_owned(), + file_path: file_path.to_owned(), + name: name.to_owned(), + qualified_name: Some(format!("{repo}::{name}")), + external_id: None, + signature: None, + visibility: Some(Visibility::Public), + span: Some(SourceSpan { + line_start: ordinal, + line_len: 0, + column_start: 0, + column_len: 1, + }), + is_virtual: false, + } + } + + #[test] + fn design_system_path_markers() { + assert!(is_design_system_path("packages/ui/components/Button.tsx")); + assert!(is_design_system_path("src/shared/Card.tsx")); + assert!(!is_design_system_path("src/features/orders/Button.tsx")); + } + + #[test] + fn flags_local_fork_of_a_shared_component() { + let temp = TempDb::new("fork"); + let store = GraphStoreDb::open(temp.path()).expect("store"); + let shared = node("web", "packages/ui/components/Button.tsx", "Button", 0); + let fork = node("web", "src/features/orders/Button.tsx", "Button", 1); + let unique = node("web", "src/features/orders/OrderList.tsx", "OrderList", 2); + store + .bulk_insert(&[shared.clone(), fork.clone(), unique.clone()], &[]) + .expect("write"); + + let opportunities = analyze_shared_component_reuse(&store, "web").expect("analyze"); + assert_eq!(opportunities.len(), 1); + assert_eq!( + opportunities[0].local_file, + "src/features/orders/Button.tsx" + ); + assert_eq!( + opportunities[0].shared_file, + "packages/ui/components/Button.tsx" + ); + } + + #[test] + fn shared_only_or_no_duplicate_yields_nothing() { + let temp = TempDb::new("clean"); + let store = GraphStoreDb::open(temp.path()).expect("store"); + let shared = node("web", "packages/ui/components/Modal.tsx", "Modal", 0); + let local = node("web", "src/features/orders/OrderRow.tsx", "OrderRow", 1); + store.bulk_insert(&[shared, local], &[]).expect("write"); + + assert!( + analyze_shared_component_reuse(&store, "web") + .expect("analyze") + .is_empty() + ); + } +} diff --git a/crates/gather-step-cli/src/commands/doctor.rs b/crates/gather-step-cli/src/commands/doctor.rs index 38c2e651..ea85eaa6 100644 --- a/crates/gather-step-cli/src/commands/doctor.rs +++ b/crates/gather-step-cli/src/commands/doctor.rs @@ -1,7 +1,11 @@ use anyhow::{Context, Result, bail}; use clap::Args; use comfy_table::{Cell, ContentArrangement, Table, presets::UTF8_BORDERS_ONLY}; -use gather_step_analysis::{SemanticHealthReport, semantic_health_for_repo}; +use gather_step_analysis::{ + SemanticHealthReport, analyze_shared_component_reuse, find_cycles, find_mock_leakage, + semantic_health_for_repo, +}; +use gather_step_core::EdgeKind; use gather_step_core::NodeKind; use gather_step_core::RegistryStore; use gather_step_parser::resolve::{ResolutionInput, is_non_actionable_unresolved_call}; @@ -24,6 +28,8 @@ struct DoctorOutput { issue_count: usize, pack_metrics: PackDoctorOutput, repos: Vec, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + quality_advisories: Vec, } #[derive(Debug, Serialize)] @@ -90,12 +96,16 @@ pub(crate) fn execute( let issue_count = repos.iter().map(|repo| repo.issues.len()).sum(); let pack_metrics = pack_metrics(storage.metadata()).context("computing pack diagnostics")?; + let repo_names: Vec = repos.iter().map(|repo| repo.repo.clone()).collect(); + let quality_advisories = collect_quality_advisories(storage, &repo_names) + .context("collecting code-quality advisories")?; let payload = DoctorOutput { event: "doctor_completed", ok: issue_count == 0, issue_count, pack_metrics, repos, + quality_advisories, }; let mut lines = Vec::new(); @@ -136,6 +146,15 @@ pub(crate) fn execute( payload.pack_metrics.unresolved_packs )); } + if !payload.quality_advisories.is_empty() { + lines.push(format!( + "Code-quality advisories ({}):", + payload.quality_advisories.len() + )); + for advisory in &payload.quality_advisories { + lines.push(format!(" - {advisory}")); + } + } let payload_json = json!(payload); Ok(if issue_count == 0 { RenderedCommand::success(payload_json, lines) @@ -148,6 +167,75 @@ pub(crate) fn execute( }) } +const MAX_ADVISORIES_PER_CATEGORY: usize = 50; + +fn collect_quality_advisories( + storage: &StorageCoordinator, + repos: &[String], +) -> Result> { + let graph = storage.graph(); + let mut advisories = Vec::new(); + + let cycles = find_cycles(graph, Some(&[EdgeKind::Imports, EdgeKind::Calls])) + .context("detecting dependency cycles")?; + push_capped( + &mut advisories, + cycles.iter().map(|cycle| { + let scope = if cycle.cross_repo { + " (cross-repo)" + } else { + "" + }; + format!("Dependency cycle{scope}: {}", cycle.nodes.join(" -> ")) + }), + "dependency cycle", + ); + + for repo in repos { + let leaks = find_mock_leakage(graph, repo) + .with_context(|| format!("detecting mock leakage in `{repo}`"))?; + push_capped( + &mut advisories, + leaks.iter().map(|leak| { + format!( + "Mock import in production (`{repo}`): {} imports {}", + leak.importer_file, leak.mock_file + ) + }), + "mock import in production", + ); + + let forks = analyze_shared_component_reuse(graph, repo) + .with_context(|| format!("auditing shared-component reuse in `{repo}`"))?; + push_capped( + &mut advisories, + forks.iter().map(|fork| { + format!( + "Reuse opportunity (`{repo}`): {} duplicates shared `{}` ({})", + fork.local_file, fork.shared_symbol, fork.shared_file + ) + }), + "reuse opportunity", + ); + } + + Ok(advisories) +} + +fn push_capped(advisories: &mut Vec, items: impl Iterator, label: &str) { + let collected: Vec = items.collect(); + let total = collected.len(); + for item in collected.into_iter().take(MAX_ADVISORIES_PER_CATEGORY) { + advisories.push(item); + } + if total > MAX_ADVISORIES_PER_CATEGORY { + advisories.push(format!( + "... and {} more {label} finding(s) not shown", + total - MAX_ADVISORIES_PER_CATEGORY + )); + } +} + fn inspect_repo( repo: &str, registered: &gather_step_core::RegisteredRepo, diff --git a/crates/gather-step-cli/src/commands/status.rs b/crates/gather-step-cli/src/commands/status.rs index 38d9ffe6..f05f24f0 100644 --- a/crates/gather-step-cli/src/commands/status.rs +++ b/crates/gather-step-cli/src/commands/status.rs @@ -6,8 +6,9 @@ use gather_step_analysis::{ GraphQuery, SemanticHealthReport, semantic_health_for_repo, semantic_health_for_workspace, }; use gather_step_core::{DepthLevel, RegistryStore}; +use gather_step_git::{GitHistoryIndexer, GitRepoSource, IndexFreshness}; use gather_step_mcp::output::redact::relativize_to_workspace; -use gather_step_storage::{ContextPackStats, GraphStore, StorageCoordinator}; +use gather_step_storage::{ContextPackStats, GraphStore, MetadataStore, StorageCoordinator}; use serde::Serialize; use serde_json::{Value, json}; @@ -37,6 +38,7 @@ struct RepoStatusOutput { path: String, path_exists: bool, depth_level: String, + freshness: String, last_indexed_at: Option, registry_file_count: u64, registry_symbol_count: u64, @@ -190,12 +192,18 @@ pub(crate) fn execute( unresolved_inputs, ) .with_context(|| format!("computing semantic health for `{repo}`"))?; + let indexed_sha = storage + .metadata() + .get_last_commit_sha(repo) + .with_context(|| format!("loading indexed commit SHA for `{repo}`"))?; + let freshness = repo_freshness(repo, ®istered.path, indexed_sha.as_deref()); Ok(RepoStatusOutput { repo: repo.clone(), path: relativize_to_workspace(®istered.path, workspace_path), path_exists: registered.path.exists(), depth_level: depth_label(registered.depth_level).to_owned(), + freshness, last_indexed_at: registered.last_indexed_at.clone(), registry_file_count: registered.file_count, registry_symbol_count: registered.symbol_count, @@ -268,6 +276,7 @@ pub(crate) fn execute( repo_table.set_header(vec![ "Repo", "Depth", + "Freshness", "Indexed", "Files", "Symbols", @@ -280,6 +289,7 @@ pub(crate) fn execute( repo_table.add_row(vec![ Cell::new(&repo.repo), Cell::new(&repo.depth_level), + Cell::new(&repo.freshness), Cell::new(repo.last_indexed_at.as_deref().unwrap_or("never")), Cell::new(format!( "{}/{}", @@ -330,6 +340,22 @@ fn format_semantic_summary(health: &SemanticHealthReport) -> String { ) } +fn repo_freshness(repo: &str, path: &std::path::Path, indexed_sha: Option<&str>) -> String { + let indexer = GitHistoryIndexer::new(GitRepoSource::from_path(path), repo); + match indexer.index_freshness(indexed_sha) { + Ok(freshness) => freshness_label(&freshness).to_owned(), + Err(_) => "unknown".to_owned(), + } +} + +fn freshness_label(freshness: &IndexFreshness) -> &'static str { + match freshness { + IndexFreshness::Fresh { .. } => "fresh", + IndexFreshness::Stale { .. } => "stale", + IndexFreshness::NeverIndexed { .. } => "never_indexed", + } +} + fn depth_label(depth: DepthLevel) -> &'static str { match depth { DepthLevel::Level1 => "level1", @@ -470,5 +496,39 @@ mod tests { Some("status_completed"), "status payload should have event=status_completed" ); + for repo in repos { + let freshness = repo["freshness"] + .as_str() + .expect("each repo should carry a freshness verdict"); + assert!( + matches!(freshness, "fresh" | "stale" | "never_indexed" | "unknown"), + "unexpected freshness verdict: {freshness}" + ); + } + } + + #[test] + fn freshness_label_maps_every_variant() { + use gather_step_git::IndexFreshness; + + assert_eq!( + super::freshness_label(&IndexFreshness::Fresh { + head_sha: "abc".to_owned() + }), + "fresh" + ); + assert_eq!( + super::freshness_label(&IndexFreshness::Stale { + indexed_sha: "old".to_owned(), + head_sha: "new".to_owned() + }), + "stale" + ); + assert_eq!( + super::freshness_label(&IndexFreshness::NeverIndexed { + head_sha: "abc".to_owned() + }), + "never_indexed" + ); } } diff --git a/crates/gather-step-cli/src/errors.rs b/crates/gather-step-cli/src/errors.rs index c7643de9..631379b0 100644 --- a/crates/gather-step-cli/src/errors.rs +++ b/crates/gather-step-cli/src/errors.rs @@ -6,6 +6,36 @@ use gather_step_storage::{GraphStoreError, MetadataStoreError, SearchStoreError} const SCHEMA_VERSION_MISMATCH_MESSAGE: &str = "Index schema version mismatch — built by a different gather-step release. Next step: run `gather-step index --auto-recover` to rebuild, or `gather-step clean && gather-step index`."; +pub const GRAPH_LOCKED_EXIT_CODE: u8 = 75; + +#[must_use] +pub fn graph_lock_contention(error: &Error) -> bool { + for cause in error.chain() { + if let Some(graph_error) = cause.downcast_ref::() + && matches!( + graph_error, + GraphStoreError::StorageHeld { .. } | GraphStoreError::StorageHeldByDaemon { .. } + ) + { + return true; + } + } + let full = error_chain_text(error); + contains_ascii_case_insensitive(&full, "locked by gather-step pid") + || contains_ascii_case_insensitive(&full, "already locked by another gather-step process") + || contains_ascii_case_insensitive(&full, "database already open") +} + +#[must_use] +pub fn graph_locked_json_disclosure(error: &Error) -> String { + serde_json::json!({ + "event": "command_failed", + "degraded": "graph_locked", + "message": format_operator_error(error), + }) + .to_string() +} + #[must_use] pub fn format_operator_error(error: &Error) -> String { let full = error_chain_text(error); @@ -177,6 +207,45 @@ mod tests { ); } + #[test] + fn graph_lock_contention_detects_typed_lock_errors() { + use super::{GRAPH_LOCKED_EXIT_CODE, graph_lock_contention, graph_locked_json_disclosure}; + + let held = gather_step_storage::GraphStoreError::StorageHeld { + path: PathBuf::from("/tmp/graph.redb"), + }; + let err: anyhow::Error = anyhow::Error::new(held); + assert!(graph_lock_contention(&err), "StorageHeld must be detected"); + + let by_daemon = gather_step_storage::GraphStoreError::StorageHeldByDaemon { + path: PathBuf::from("/tmp/graph.redb"), + pid: 4242, + started_at_epoch_ms: 1, + workspace_root: "/ws".to_owned(), + }; + let err: anyhow::Error = anyhow::Error::new(by_daemon); + assert!( + graph_lock_contention(&err), + "StorageHeldByDaemon must be detected" + ); + + let disclosure: serde_json::Value = + serde_json::from_str(&graph_locked_json_disclosure(&err)).expect("valid json"); + assert_eq!(disclosure["degraded"], "graph_locked"); + assert_eq!(disclosure["event"], "command_failed"); + + assert_ne!(GRAPH_LOCKED_EXIT_CODE, 0); + assert_ne!(GRAPH_LOCKED_EXIT_CODE, 1); + } + + #[test] + fn graph_lock_contention_ignores_unrelated_errors() { + use super::graph_lock_contention; + + let err: anyhow::Error = anyhow::Error::msg("read /tmp/foo: permission denied"); + assert!(!graph_lock_contention(&err)); + } + #[test] fn unhandled_error_preserves_full_cause_chain() { // Wrap an inner error with anyhow::Context so the chain has two links. diff --git a/crates/gather-step-cli/src/main.rs b/crates/gather-step-cli/src/main.rs index f15135cb..5fd3192f 100644 --- a/crates/gather-step-cli/src/main.rs +++ b/crates/gather-step-cli/src/main.rs @@ -9,7 +9,10 @@ use clap::Parser; use dhat::Alloc; use gather_step::app::{self, AppContext}; use gather_step::commands::{self, Cli}; -use gather_step::errors::format_operator_error; +use gather_step::errors::{ + GRAPH_LOCKED_EXIT_CODE, format_operator_error, graph_lock_contention, + graph_locked_json_disclosure, +}; #[cfg(not(feature = "dhat-heap"))] use mimalloc::MiMalloc; @@ -49,14 +52,27 @@ async fn run_main() -> Result { commands::run(cli, app).await } -/// Print the operator-facing error to stderr and return exit code 1. +/// Print the operator-facing error to stderr and return an exit code. /// /// Returning `ExitCode` rather than calling `std::process::exit(1)` lets /// tokio's runtime tear down cleanly and lets stdio buffers flush — important /// for `pr-review` (which prints a structured report on stdout) and any /// command that emits trailing tracing lines. fn print_operator_error_and_code(error: &Error) -> ExitCode { + if graph_lock_contention(error) { + if wants_json_output() { + let mut stdout = std::io::stdout().lock(); + let _ = writeln!(stdout, "{}", graph_locked_json_disclosure(error)); + } + let mut stderr = std::io::stderr().lock(); + let _ = writeln!(stderr, "{}", format_operator_error(error)); + return ExitCode::from(GRAPH_LOCKED_EXIT_CODE); + } let mut stderr = std::io::stderr().lock(); let _ = writeln!(stderr, "{}", format_operator_error(error)); ExitCode::from(1) } + +fn wants_json_output() -> bool { + std::env::args().any(|arg| arg == "--json") +} diff --git a/crates/gather-step-cli/tests/cli_commands.rs b/crates/gather-step-cli/tests/cli_commands.rs index f38a90a2..f11bf51e 100644 --- a/crates/gather-step-cli/tests/cli_commands.rs +++ b/crates/gather-step-cli/tests/cli_commands.rs @@ -348,6 +348,72 @@ fn write_qa_reference_v4_workspace(root: &Path) { .expect("notifications fixture source"); } +#[test] +fn resolution_snapshot_matches_golden() { + use gather_step_analysis::GraphQuery; + + let temp = TempDir::new("resolution-snapshot"); + write_fixture_workspace(temp.path()); + run_ok(temp.path(), &["init"]); + run_ok(temp.path(), &["--json", "index"]); + + let graph = + GraphStoreDb::open(temp.path().join(".gather-step/storage/graph.redb")).expect("graph"); + let actual = GraphQuery::new(&graph) + .resolution_fingerprint() + .expect("fingerprint") + .join("\n"); + + let golden_path = + Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/golden/resolution_snapshot.golden"); + if env::var_os("GATHER_STEP_BLESS_SNAPSHOT").is_some() { + fs::create_dir_all(golden_path.parent().expect("golden parent")).expect("golden dir"); + fs::write(&golden_path, &actual).expect("write golden"); + } + + let expected = fs::read_to_string(&golden_path).unwrap_or_else(|_| { + panic!( + "missing golden {}; regenerate with GATHER_STEP_BLESS_SNAPSHOT=1", + golden_path.display() + ) + }); + assert_eq!( + actual, expected, + "resolution snapshot drifted; if the change is intended, regenerate with GATHER_STEP_BLESS_SNAPSHOT=1" + ); +} + +#[test] +fn read_command_during_graph_lock_succeeds_via_snapshot() { + let temp = TempDir::new("graph-lock"); + write_fixture_workspace(temp.path()); + run_ok(temp.path(), &["init"]); + run_ok(temp.path(), &["--json", "index"]); + + let graph_path = temp + .path() + .join(".gather-step") + .join("storage") + .join("graph.redb"); + let held = GraphStoreDb::open(&graph_path).expect("should hold the graph lock"); + + let output = run_ok(temp.path(), &["search", "OrderList", "--json"]); + let search_json = stdout_json(&output); + assert_eq!(search_json["event"], "search_completed"); + assert!( + search_json["hits"] + .as_array() + .expect("hits array") + .iter() + .any(|item| item["symbol_name"] == "OrderList"), + "snapshot read should return the indexed hit while the store is held" + ); + + drop(held); + let recovered = run_ok(temp.path(), &["search", "OrderList", "--json"]); + assert_eq!(stdout_json(&recovered)["event"], "search_completed"); +} + #[test] fn cli_commands_work_on_indexed_fixture_workspace() { let temp = TempDir::new("cli-commands"); @@ -439,12 +505,17 @@ fn cli_commands_work_on_indexed_fixture_workspace() { let status = run_ok(temp.path(), &["status", "--json"]); let status_json = stdout_json(&status); assert_eq!(status_json["event"], "status_completed"); - assert!( - !status_json["repos"] - .as_array() - .expect("repos array") - .is_empty() - ); + let status_repos = status_json["repos"].as_array().expect("repos array"); + assert!(!status_repos.is_empty()); + for repo in status_repos { + let freshness = repo["freshness"] + .as_str() + .expect("each repo should carry a freshness verdict"); + assert!( + matches!(freshness, "fresh" | "stale" | "never_indexed" | "unknown"), + "unexpected freshness verdict in status output: {freshness}" + ); + } let search = run_ok(temp.path(), &["search", "OrderList", "--json"]); let search_json = stdout_json(&search); @@ -1164,7 +1235,7 @@ fn metadata_schema_user_version_mismatch_reports_recovery_hint() { } #[test] -fn concurrent_graph_open_reports_stable_process_error() { +fn concurrent_graph_open_reads_via_snapshot() { let temp = TempDir::new("concurrent-open"); write_fixture_workspace(temp.path()); run_ok(temp.path(), &["init"]); @@ -1172,10 +1243,8 @@ fn concurrent_graph_open_reports_stable_process_error() { let _held_graph = GraphStoreDb::open(temp.path().join(".gather-step/storage/graph.redb")) .expect("graph should open and hold the redb lock"); - let output = run_fail(temp.path(), &["status", "--json"]); - let stderr = String::from_utf8_lossy(&output.stderr); - assert!(stderr.contains("Another gather-step process is using this workspace")); - assert!(stderr.contains("Stop `gather-step watch`")); + let output = run_ok(temp.path(), &["status", "--json"]); + assert_eq!(stdout_json(&output)["event"], "status_completed"); } #[test] diff --git a/crates/gather-step-cli/tests/golden/resolution_snapshot.golden b/crates/gather-step-cli/tests/golden/resolution_snapshot.golden new file mode 100644 index 00000000..8f6c4ae9 --- /dev/null +++ b/crates/gather-step-cli/tests/golden/resolution_snapshot.golden @@ -0,0 +1,34 @@ +ServiceAController DependsOn __di__EventBusClient none none +ServiceAController UsesDecorator ServiceAController::@Controller none none +ServiceAController.constructor UsesDecorator ServiceAController.constructor::@Controller none none +ServiceAController.handleOrderCreated Consumes __event__kafka__order.created none none +ServiceAController.handleOrderCreated UsesDecorator ServiceAController.handleOrderCreated::@Controller none none +ServiceAController.handleOrderCreated UsesDecorator ServiceAController.handleOrderCreated::@MessagePattern none none +ServiceAController.handleOrderCreated UsesEventFrom __event__kafka__order.created none none +ServiceAController.listOrders ProducesEventFor __event__kafka__order.created none none +ServiceAController.listOrders Publishes __event__kafka__order.created none none +ServiceAController.listOrders Serves __route__GET__/orders none none +ServiceAController.listOrders UsesDecorator ServiceAController.listOrders::@Controller none none +ServiceAController.listOrders UsesDecorator ServiceAController.listOrders::@Get none none +backend_standard::package.json Defines backend_standard::__repo__ none none +backend_standard::src/controller.ts Defines ServiceAController none none +backend_standard::src/controller.ts Defines ServiceAController.constructor none none +backend_standard::src/controller.ts Defines ServiceAController.handleOrderCreated none none +backend_standard::src/controller.ts Defines ServiceAController.listOrders none none +backend_standard::src/controller.ts Defines module::backend_standard::src/controller.ts none none +backend_standard::src/controller.ts Defines src/controller.ts::Controller none none +backend_standard::src/controller.ts Defines src/controller.ts::Get none none +backend_standard::src/controller.ts Defines src/controller.ts::MessagePattern none none +backend_standard::src/controller.ts Imports module-import::@nestjs/common none none +backend_standard::src/controller.ts Imports module-import::@nestjs/microservices none none +frontend_standard::package.json Defines frontend_standard::__repo__ none none +frontend_standard::src/OrderList.test.tsx Defines module::frontend_standard::src/OrderList.test.tsx none none +frontend_standard::src/OrderList.test.tsx Defines src/OrderList.test.tsx::OrderList none none +frontend_standard::src/OrderList.test.tsx Imports module-import::./OrderList none none +frontend_standard::src/OrderList.tsx Defines OrderList none none +frontend_standard::src/OrderList.tsx Defines module::frontend_standard::src/OrderList.tsx none none +module::backend_standard::src/controller.ts Exports ServiceAController none none +module::backend_standard::src/controller.ts Exports ServiceAController.constructor none none +module::backend_standard::src/controller.ts Exports ServiceAController.handleOrderCreated none none +module::backend_standard::src/controller.ts Exports ServiceAController.listOrders none none +module::frontend_standard::src/OrderList.tsx Exports OrderList none none \ No newline at end of file diff --git a/crates/gather-step-core/src/graph.rs b/crates/gather-step-core/src/graph.rs index b040a17d..70c83744 100644 --- a/crates/gather-step-core/src/graph.rs +++ b/crates/gather-step-core/src/graph.rs @@ -194,6 +194,17 @@ impl EdgeMetadata { self.resolver_strategy() .map_or(0, crate::ResolverStrategy::strategy_weight) } + + /// Whether this edge passes a `min_confidence` traversal filter. + /// + /// `None` confidence means the edge is a definite structural fact (e.g. an + /// import-resolved call), not a heuristic guess, so it passes any threshold + /// rather than being filtered out. An explicit confidence is compared + /// against the threshold; a `None` threshold disables filtering entirely. + #[must_use] + pub fn passes_confidence(&self, min_confidence: Option) -> bool { + min_confidence.is_none_or(|min| self.confidence.is_none_or(|actual| actual >= min)) + } } #[derive( @@ -263,6 +274,25 @@ mod tests { }; use crate::schema::{EdgeKind, NodeKind}; + #[test] + fn passes_confidence_treats_none_as_trusted_and_filters_low() { + // No threshold disables filtering: everything passes. + let mut meta = EdgeMetadata::default(); + assert!(meta.passes_confidence(None)); + + // None confidence is a definite structural edge (not a heuristic + // guess) and must pass any threshold rather than be filtered out. + assert!(meta.passes_confidence(Some(500))); + + // An explicit low confidence is filtered out below the threshold. + meta.confidence = Some(300); + assert!(!meta.passes_confidence(Some(500))); + + // At or above the threshold, it passes. + meta.confidence = Some(900); + assert!(meta.passes_confidence(Some(500))); + } + #[test] fn node_id_is_deterministic_for_identical_inputs() { let first = node_id("service-a", "src/foo.ts", NodeKind::Function, "execute"); diff --git a/crates/gather-step-core/src/schema.rs b/crates/gather-step-core/src/schema.rs index 80e3caff..112fd8a7 100644 --- a/crates/gather-step-core/src/schema.rs +++ b/crates/gather-step-core/src/schema.rs @@ -227,6 +227,15 @@ impl EdgeKind { self as u8 } + /// Whether this edge represents a *consumer*/usage of the target rather + /// than a structural `Defines` (file→symbol) or `Imports` edge. Used to + /// count real consumers for reuse ranking and resolution scoring, so a + /// "consumer count" reflects callers/users, not raw inbound-edge volume. + #[must_use] + pub const fn is_consumer_edge(self) -> bool { + !matches!(self, Self::Defines | Self::Imports) + } + /// Whether this edge kind represents a semantic bridge link /// connecting a real symbol to a virtual bridge node (`Route`, `Topic`, /// `SharedSymbol`, `PayloadContract`, …) — i.e. an edge whose correctness diff --git a/crates/gather-step-git/src/history.rs b/crates/gather-step-git/src/history.rs index 30f27163..afa06c81 100644 --- a/crates/gather-step-git/src/history.rs +++ b/crates/gather-step-git/src/history.rs @@ -163,6 +163,44 @@ pub enum HistorySyncOutcome { NoChange { repo: String, head_sha: String }, } +/// Query-time freshness of an indexed repo relative to the working tree's +/// current HEAD (A13). Distinct from [`HistorySyncOutcome`], which describes an +/// indexing run; this answers "can I trust what the index says right now?" so +/// a trace/search against a stale index can be flagged instead of silently +/// answering about code that no longer matches the tree. +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum IndexFreshness { + /// The indexed commit matches the current HEAD — the index is current. + Fresh { head_sha: String }, + /// The index reflects an older commit than the working tree's HEAD. + Stale { + indexed_sha: String, + head_sha: String, + }, + /// No commit was recorded for this repo — it has never been indexed. + NeverIndexed { head_sha: String }, +} + +/// Classify index freshness by comparing the recorded indexed commit against +/// the repository's current HEAD. Pure so it is unit-testable without git or a +/// metadata store. +#[must_use] +pub fn classify_freshness(indexed_sha: Option<&str>, head_sha: &str) -> IndexFreshness { + match indexed_sha { + None => IndexFreshness::NeverIndexed { + head_sha: head_sha.to_owned(), + }, + Some(sha) if sha == head_sha => IndexFreshness::Fresh { + head_sha: head_sha.to_owned(), + }, + Some(sha) => IndexFreshness::Stale { + indexed_sha: sha.to_owned(), + head_sha: head_sha.to_owned(), + }, + } +} + impl HistorySyncOutcome { #[must_use] pub fn repo(&self) -> &str { @@ -350,6 +388,17 @@ impl GitHistoryIndexer { Ok(repo.head_id().map_err(Box::new)?.detach().to_string()) } + /// Classify how fresh an indexed repo is relative to its current HEAD (A13). + /// `indexed_sha` is the commit recorded for the repo (e.g. from + /// `MetadataStore::get_last_commit_sha`); `None` means never indexed. + pub fn index_freshness( + &self, + indexed_sha: Option<&str>, + ) -> Result { + let head = self.head_sha()?; + Ok(classify_freshness(indexed_sha, &head)) + } + /// Walks the repository, persists the new commits + per-file deltas via /// `store`, advances `repo_sync_state.last_commit_sha`, and returns the /// outcome describing which sync branch ran. @@ -1074,9 +1123,35 @@ mod tests { use super::{ CommitFact, CommitFileChangeKind, CommitFileDelta, GitHistoryIndexer, GitRepoSource, - HistorySyncOutcome, normalize_author_email_bytes, + HistorySyncOutcome, IndexFreshness, classify_freshness, normalize_author_email_bytes, }; + #[test] + fn classify_freshness_distinguishes_fresh_stale_and_never_indexed() { + // Indexed commit matches HEAD => fresh. + assert_eq!( + classify_freshness(Some("abc123"), "abc123"), + IndexFreshness::Fresh { + head_sha: "abc123".to_owned() + } + ); + // Indexed commit older than HEAD => stale, carrying both SHAs. + assert_eq!( + classify_freshness(Some("old111"), "new222"), + IndexFreshness::Stale { + indexed_sha: "old111".to_owned(), + head_sha: "new222".to_owned(), + } + ); + // No recorded commit => never indexed. + assert_eq!( + classify_freshness(None, "abc123"), + IndexFreshness::NeverIndexed { + head_sha: "abc123".to_owned() + } + ); + } + /// Minimal hand-rolled `TempDir` that deletes itself on drop. Avoids /// adding a `tempfile` dev-dependency. struct TempDir { diff --git a/crates/gather-step-git/src/lib.rs b/crates/gather-step-git/src/lib.rs index 4c6868b0..8ce1f07b 100644 --- a/crates/gather-step-git/src/lib.rs +++ b/crates/gather-step-git/src/lib.rs @@ -24,7 +24,8 @@ pub use classify::{ }; pub use history::{ CommitFact, CommitFileChangeKind, CommitFileDelta, GitHistoryError, GitHistoryIndexer, - GitHistorySyncError, GitIndexerOptions, GitRepoSource, HistorySyncOutcome, + GitHistorySyncError, GitIndexerOptions, GitRepoSource, HistorySyncOutcome, IndexFreshness, + classify_freshness, }; pub use intelligence::{ RepoIntelligenceError, RepoIntelligenceOptions, RepoIntelligenceReport, diff --git a/crates/gather-step-mcp/src/catalog.rs b/crates/gather-step-mcp/src/catalog.rs index 1ae4c72a..75fd5fac 100644 --- a/crates/gather-step-mcp/src/catalog.rs +++ b/crates/gather-step-mcp/src/catalog.rs @@ -92,7 +92,10 @@ pub const MCP_TOOLS: &[(&str, &str)] = &[ "planning_pack", "Context pack for architecture and planning tasks", ), - ("plan_change", "Alias for planning-oriented context"), + ( + "plan_change", + "Typed plan-change product (twelve planning sections)", + ), ("debug_pack", "Context pack for debugging production issues"), ("fix_pack", "Context pack scoped for a bug fix"), ("fix_surface", "Get a narrower fix-oriented surface"), diff --git a/crates/gather-step-mcp/src/server.rs b/crates/gather-step-mcp/src/server.rs index c93093cd..aaae37ed 100644 --- a/crates/gather-step-mcp/src/server.rs +++ b/crates/gather-step-mcp/src/server.rs @@ -50,11 +50,11 @@ use crate::{ }, orientation::{GraphSchemaResponse, ListReposResponse, get_graph_schema, list_repos}, packs::{ - ContextPackRequest, ContextPackResponse, ModePackRequest, + ContextPackRequest, ContextPackResponse, ModePackRequest, PlanChangeResponse, change_impact_pack_tool as run_change_impact_pack, context_pack_tool as run_context_pack, debug_pack_tool as run_debug_pack, fix_pack_tool as run_fix_pack, planning_pack_tool as run_planning_pack, - review_pack_tool as run_review_pack, + review_pack_tool as run_review_pack, run_plan_change, }, pr_review::{ PrReviewInput, PrReviewResponse, PrReviewSetInput, PrReviewSetResponse, run_pr_review, @@ -784,17 +784,17 @@ impl GatherStepMcpServer { #[tool( name = "plan_change", - description = "Return a bounded planning pack for a target symbol.", + description = "Return a typed plan-change product (twelve planning sections) for a target symbol.", annotations(read_only_hint = true) )] pub async fn plan_change_tool( &self, Parameters(request): Parameters, - ) -> Result, String> { + ) -> Result, String> { let args = serde_json::to_value(&request).unwrap_or_default(); let ctx = Arc::clone(&self.ctx); self.traced_call("plan_change", &args, move || { - run_planning_pack(&ctx, request) + run_plan_change(&ctx, request) .map(Json) .map_err(|error| error.to_string()) }) diff --git a/crates/gather-step-mcp/src/tools/composite.rs b/crates/gather-step-mcp/src/tools/composite.rs index 9717e710..94aee1be 100644 --- a/crates/gather-step-mcp/src/tools/composite.rs +++ b/crates/gather-step-mcp/src/tools/composite.rs @@ -23,7 +23,7 @@ use crate::{ orientation::{RepoSummary, list_repos}, packs::{ ContextPackRequest, ModePackRequest, change_impact_pack_tool, context_pack_tool, - debug_pack_tool, fix_pack_tool, planning_pack_tool, review_pack_tool, + debug_pack_tool, fix_pack_tool, planning_pack_tool, review_pack_tool, run_plan_change, }, repo_intelligence::{ DeadCodeRequest, RepoScopedRequest, WhoOwnsRequest, get_conventions_tool, @@ -513,11 +513,12 @@ fn execute_batch_op(ctx: &McpContext, op: BatchQueryOperation) -> BatchQueryResu context_pack_tool(ctx, args).and_then(to_value) }) } - "planning_pack" | "plan_change" => { - parse_and_run::(op.arguments, |args| { - planning_pack_tool(ctx, args).and_then(to_value) - }) - } + "planning_pack" => parse_and_run::(op.arguments, |args| { + planning_pack_tool(ctx, args).and_then(to_value) + }), + "plan_change" => parse_and_run::(op.arguments, |args| { + run_plan_change(ctx, args).and_then(to_value) + }), "debug_pack" => parse_and_run::(op.arguments, |args| { debug_pack_tool(ctx, args).and_then(to_value) }), diff --git a/crates/gather-step-mcp/src/tools/context_pack.rs b/crates/gather-step-mcp/src/tools/context_pack.rs index 34e35cf9..bb3bae48 100644 --- a/crates/gather-step-mcp/src/tools/context_pack.rs +++ b/crates/gather-step-mcp/src/tools/context_pack.rs @@ -1,5 +1,6 @@ pub use crate::tools::packs::{ ContextPackData, ContextPackMeta, ContextPackRequest, ContextPackResponse, ModePackRequest, - PackBridge, PackItem, context_pack_tool, debug_pack_tool, fix_pack_tool, planning_pack_tool, - review_pack_tool, + PackBridge, PackItem, PlanChangeContract, PlanChangeResponse, context_pack_tool, + debug_pack_tool, fix_pack_tool, planning_pack_tool, review_pack_tool, run_plan_change, + validate_plan_change_contract, }; diff --git a/crates/gather-step-mcp/src/tools/packs.rs b/crates/gather-step-mcp/src/tools/packs.rs index 52c4d843..d949aee8 100644 --- a/crates/gather-step-mcp/src/tools/packs.rs +++ b/crates/gather-step-mcp/src/tools/packs.rs @@ -207,6 +207,247 @@ pub struct ContextPackResponse { pub meta: Option, } +const PLAN_CHANGE_SCHEMA_VERSION: u8 = 3; + +const PLAN_CHANGE_SECTIONS: [&str; 12] = [ + "reuse_candidates", + "sibling_clone_targets", + "standards_to_preserve", + "integration_checks", + "cross_repo_reachability", + "display_ownership_checks", + "write_path_or_state_machine_risks", + "required_braingent_records", + "open_unknowns", + "pass_two_gap_dimensions", + "v1_completeness_checklist", + "verification_plan", +]; + +const PASS_TWO_GAP_DIMENSIONS: [&str; 8] = [ + "State machine: does this add/alter a status or lifecycle state? Enumerate transitions and conflict handling.", + "Token/URL security: are tokens, signed URLs, or IDs exposed or accepted? Validate scope, expiry, and tampering.", + "Dependent ticket: is there upstream/downstream work this depends on or unblocks? Link it.", + "Atomicity: do multi-document or multi-step writes need a transaction or idempotency guard?", + "Index-per-field: does any new query/filter field need an index? Confirm coverage.", + "Event contract: grep producers and consumers of any touched event/topic for set-difference drift.", + "Audit category: does this mutating path need an audit-log entry with the correct category?", + "Notification: should this change emit or suppress a user/system notification?", +]; + +const V1_COMPLETENESS_CHECKLIST: [&str; 19] = [ + "Scale test: does it hold at production data volume? → verify: run against a realistic-size fixture.", + "Failure-path lifecycle: are partial-failure and retry paths handled? → verify: exercise the failure branch.", + "Consumer reachability: does the output actually reach every consumer? → verify: trace producer→consumer.", + "Downstream coupling: which downstream systems depend on this shape? → verify: enumerate consumers.", + "Delete/invalidate: are deletes and cache/index invalidations handled? → verify: delete-path test.", + "Single source of truth: is the value derived once, not duplicated? → verify: grep for parallel computations.", + "Non-functional: latency/memory/throughput within budget? → verify: measure against baseline.", + "Exclusion scope: what is explicitly out of scope? → verify: state exclusions in the PR.", + "Cross-commit regression: could a later commit re-break this? → verify: add a regression test.", + "Deferred as follow-ups: are deferrals recorded as tickets? → verify: link follow-up issues.", + "Prior-release correctness/security gate: did a prior release defer hardening here? → verify: re-check the deferred item.", + "Partial-migration tail: zero residual call-sites workspace-wide? → verify: grep shows no residual callers.", + "Release-profile divergence: verify flags on the installed profile. → verify: check [profile.dist].", + "Teardown under-spec: disposable artifacts specify creation/reuse-key/safe-delete/in-flight-writer guard. → verify: all four present.", + "Schema-version forward-compat: a version-mismatch refusal/rebuild path exists. → verify: mismatch is refused, not misread.", + "Derived-field blast radius: enumerate source, projection writer, all readers, filters, search/index, backfill. → verify: all six listed.", + "Cross-service ownership: should the owner publish a snapshot/resolver before adopting a join? → verify: question answered.", + "Lossy rollback: enumerate unrecoverable rows/fields. → verify: rollback notes list them.", + "Implied-surface sweep: events/audit/notification/rate-limit/idempotency even when the spec is silent. → verify: sweep performed.", +]; + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq)] +pub struct PlanChangeContract { + pub schema_version: u8, + pub sections: Vec, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub exclusion_ledger: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq)] +pub struct PlanChangeResponse { + pub target: String, + pub found: bool, + pub reuse_candidates: Vec, + pub sibling_clone_targets: Vec, + pub standards_to_preserve: Vec, + pub integration_checks: Vec, + pub cross_repo_reachability: Vec, + pub display_ownership_checks: Vec, + pub write_path_or_state_machine_risks: Vec, + pub required_braingent_records: Vec, + pub open_unknowns: Vec, + pub pass_two_gap_dimensions: Vec, + pub v1_completeness_checklist: Vec, + pub verification_plan: Vec, + pub contract: PlanChangeContract, + #[serde(skip_serializing_if = "Option::is_none")] + pub meta: Option, +} + +fn build_plan_change( + target: &str, + found: bool, + items: &[PackItem], + unresolved_gaps: &[String], + planning_proofs: &[serde_json::Value], + next_steps: &[String], + change_impact: &ChangeImpactSummary, + meta: Option, +) -> PlanChangeResponse { + let mut reuse_candidates = Vec::new(); + let mut sibling_clone_targets = Vec::new(); + for item in items { + if item.category == "target" { + continue; + } + if is_shared_module_path(&item.file_path) { + reuse_candidates.push(item.clone()); + } else { + sibling_clone_targets.push(item.clone()); + } + } + + let verification_plan = next_steps + .iter() + .map(|step| format!("Run `{step}` and confirm the result before changing code.")) + .collect(); + + let mut integration_checks = Vec::new(); + for repo in &change_impact.confirmed_downstream_repos { + integration_checks.push(format!( + "Integration: verify confirmed downstream consumer `{repo}` still works after this change." + )); + } + for repo in &change_impact.probable_downstream_repos { + integration_checks.push(format!( + "Integration: check probable downstream `{repo}` (partial evidence) for impact." + )); + } + + let mut display_ownership_checks = Vec::new(); + for caller in &change_impact.cross_repo_callers { + display_ownership_checks.push(format!( + "Display ownership: cross-service ref `{}::{}` — confirm display fields are sourced \ + from the owner service (snapshot/API), not a direct cross-service DB lookup, and \ + that access control stays in the owner.", + caller.repo, caller.symbol_name + )); + } + + let mut write_path_or_state_machine_risks = Vec::new(); + for caller in &change_impact.cross_repo_callers { + write_path_or_state_machine_risks.push(format!( + "Cross-repo caller `{}::{}` depends on this behavior — preserve its contract.", + caller.repo, caller.symbol_name + )); + } + for unresolved in &change_impact.unresolved_possible { + write_path_or_state_machine_risks.push(format!( + "Unresolved possible impact: `{unresolved}` — confirm before changing." + )); + } + if change_impact.truncated_repos.is_some() { + write_path_or_state_machine_risks.push( + "Downstream fan-out was capped — affected repos exist beyond the listed set." + .to_owned(), + ); + } + + let mut exclusion_ledger = Vec::new(); + if let Some(truncated) = &change_impact.truncated_repos { + exclusion_ledger.push(format!( + "Downstream fan-out capped: {} repo(s) omitted ({}).", + truncated.count, + truncated.reason_codes.join(", ") + )); + } + if let Some(meta) = &meta { + for warning in &meta.warnings { + exclusion_ledger.push(format!("Planning warning: {warning}")); + } + } + exclusion_ledger + .push("standards_to_preserve: not yet computed (no standards engine).".to_owned()); + exclusion_ledger + .push("required_braingent_records: not yet computed (no record ingestion).".to_owned()); + let contract = PlanChangeContract { + schema_version: PLAN_CHANGE_SCHEMA_VERSION, + sections: PLAN_CHANGE_SECTIONS + .iter() + .map(|s| (*s).to_owned()) + .collect(), + exclusion_ledger, + }; + + PlanChangeResponse { + target: target.to_owned(), + found, + reuse_candidates, + sibling_clone_targets, + standards_to_preserve: Vec::new(), + integration_checks, + cross_repo_reachability: planning_proofs.to_vec(), + display_ownership_checks, + write_path_or_state_machine_risks, + required_braingent_records: Vec::new(), + open_unknowns: unresolved_gaps.to_vec(), + pass_two_gap_dimensions: PASS_TWO_GAP_DIMENSIONS + .iter() + .map(|item| (*item).to_owned()) + .collect(), + v1_completeness_checklist: V1_COMPLETENESS_CHECKLIST + .iter() + .map(|item| (*item).to_owned()) + .collect(), + verification_plan, + contract, + meta, + } +} + +pub fn validate_plan_change_contract(plan: &PlanChangeResponse) -> Vec { + let mut violations = Vec::new(); + if plan.contract.schema_version != PLAN_CHANGE_SCHEMA_VERSION { + violations.push(format!( + "schema_version {} != expected {PLAN_CHANGE_SCHEMA_VERSION}", + plan.contract.schema_version + )); + } + if !plan + .contract + .sections + .iter() + .map(String::as_str) + .eq(PLAN_CHANGE_SECTIONS) + { + violations.push(format!( + "section manifest {:?} != canonical {PLAN_CHANGE_SECTIONS:?}", + plan.contract.sections + )); + } + violations +} + +pub fn run_plan_change( + ctx: &McpContext, + request: ModePackRequest, +) -> Result { + let pack = planning_pack_tool(ctx, request)?; + let data = &pack.data; + Ok(build_plan_change( + &data.target, + data.found, + &data.items, + &data.unresolved_gaps, + &data.planning_proofs, + &data.next_steps, + &data.change_impact, + pack.meta.clone(), + )) +} + #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq)] pub struct ContextPackData { pub mode: String, @@ -1084,22 +1325,39 @@ fn assemble_context_pack_for_symbol( .map(|item| item.repo.clone()) .collect::>(); items.extend(contract_impact_items); - items.sort_by(|left, right| { - right - .score - .cmp(&left.score) - .then(left.repo.cmp(&right.repo)) - .then(left.file_path.cmp(&right.file_path)) - .then(left.line_start.cmp(&right.line_start)) - .then(left.symbol_name.cmp(&right.symbol_name)) - .then(left.symbol_id.cmp(&right.symbol_id)) - }); + sort_pack_items(&mut items); items.dedup_by(|left, right| { left.symbol_id == right.symbol_id && left.category == right.category }); if let Some(repo) = options.repo_filter { items.retain(|item| item.repo == repo); } + + if mode == PackMode::Planning { + // KNOWN CEILING: the reuse boost re-ranks the top `limit*5` base-ranked + // candidates; a canonical symbol ranked below that window for a very + // high-fan-out target is not rescued. The window keeps the boost (and + // the bounded re-sort below) cheap. + let window = options.limit.saturating_mul(5).min(items.len()); + let graph = ctx.graph(); + // Precompute distinct consumer counts in one pre-pass so the boost loop + // does map lookups instead of per-item graph I/O, deduped by symbol_id. + let mut consumer_counts: std::collections::BTreeMap = + std::collections::BTreeMap::new(); + for item in &items[..window] { + consumer_counts + .entry(item.symbol_id.clone()) + .or_insert_with(|| consumer_count(graph, &item.symbol_id)); + } + apply_reuse_boost_with(&mut items[..window], |symbol_id| { + consumer_counts.get(symbol_id).copied().unwrap_or(0) + }); + // The boost only raised window items' scores, and the window holds the + // top base-ranked candidates, so every boosted item still outranks the + // untouched tail — re-sorting just the window keeps global order. + sort_pack_items(&mut items[..window]); + } + items.truncate(options.limit); let evidence_populated = items.len() <= 20; @@ -2497,12 +2755,82 @@ fn populate_evidence_chain( /// Items with a confirmed evidence chain (graph-proven path from anchor) are /// boosted by 50 score points so they appear before lexical-only hits at /// equivalent depth. The list is re-sorted by the updated scores. -fn apply_planning_evidence_ranking(items: &mut [PackItem]) { +/// Whether a file path lives in a shared / design-system module — a strong +/// signal that a symbol there is the canonical reusable thing rather than a +/// local one-off. Markers are matched as-is (paths are conventionally +/// lowercase), avoiding an allocation. +fn is_shared_module_path(file_path: &str) -> bool { + // A path heuristic spanning common JS, Rust, Go, and Python shared-module + // layouts. NOTE: this is a heuristic; the pipeline-wide canonical signal + // (pack_assembly's `Canonical`) is the longer-term source of truth, pending + // threading it onto `PackItem`. + const MARKERS: &[&str] = &[ + "shared/", + "design-system", + "packages/", + "/ui/components", + "@shared", + "common/", + "/lib/", + "libs/", + "internal/", + "/pkg/", + ]; + MARKERS.iter().any(|marker| file_path.contains(marker)) +} + +/// S3 graph-derived reuse boost. A symbol in a shared/design-system path or +/// consumed by many modules is more likely the canonical reusable surface, so +/// it should rank above local candidates. Consumer influence is capped so a +/// single hub node cannot dominate the ranking. +fn reuse_evidence_boost(file_path: &str, consumer_count: usize) -> u16 { + let mut boost = 0_u16; + if is_shared_module_path(file_path) { + boost = boost.saturating_add(40); + } + // Each consumer adds weight, capped at 30 so a single hub node cannot + // dominate the ranking; 3 points per consumer => up to +90. + let consumers = u16::try_from(consumer_count.min(30)).unwrap_or(u16::MAX); + boost.saturating_add(consumers.saturating_mul(3)) +} + +/// Distinct reuse consumers of a symbol: the source nodes of inbound edges, +/// excluding structural `Defines`/`Imports` edges that do not represent a +/// caller/user. Mirrors the edge-kind filter used by the resolution scorer so +/// the count reflects real consumers, not raw inbound-edge volume. +fn consumer_count(graph: &S, symbol_id: &str) -> usize { + let Ok(node_id) = decode_node_id(symbol_id) else { + return 0; + }; + let Ok(incoming) = graph.get_incoming(node_id) else { + return 0; + }; + incoming + .iter() + .filter(|edge| edge.kind.is_consumer_edge()) + .map(|edge| edge.source) + .collect::>() + .len() +} + +/// Apply the graph-derived reuse boost to planning-pack items via a +/// consumer-count lookup. Generic over the lookup so the boost is unit-testable +/// without a graph store. Must run BEFORE truncation so a canonical reusable +/// symbol can be promoted into the pack, not merely reordered within an +/// already-cut set. +fn apply_reuse_boost_with(items: &mut [PackItem], consumer_count: impl Fn(&str) -> usize) { for item in items.iter_mut() { - if item.evidence_chain.is_some() { - item.score = item.score.saturating_add(50); + if item.category == "target" { + continue; } + let boost = reuse_evidence_boost(&item.file_path, consumer_count(&item.symbol_id)); + item.score = item.score.saturating_add(boost); } +} + +/// Canonical ordering for planning-pack items: descending score, then stable +/// lexical tie-breaks for deterministic output. +fn sort_pack_items(items: &mut [PackItem]) { items.sort_by(|left, right| { right .score @@ -2515,6 +2843,15 @@ fn apply_planning_evidence_ranking(items: &mut [PackItem]) { }); } +fn apply_planning_evidence_ranking(items: &mut [PackItem]) { + for item in items.iter_mut() { + if item.evidence_chain.is_some() { + item.score = item.score.saturating_add(50); + } + } + sort_pack_items(items); +} + fn mode_pack_tool( ctx: &McpContext, request: ModePackRequest, @@ -3525,12 +3862,12 @@ fn pack_resolution_edge_score(ctx: &McpContext, symbol_id: &str) -> Result 0); + + // More consumers => larger boost (the canonical reusable thing). + assert!(reuse_evidence_boost("src/x.ts", 10) > reuse_evidence_boost("src/x.ts", 1)); + + // A shared + widely-consumed symbol outranks a local one-off. + assert!( + reuse_evidence_boost("packages/ui/Button.tsx", 8) + > reuse_evidence_boost("src/features/orders/local-button.tsx", 0) + ); + } + + #[test] + fn reuse_boost_promotes_canonical_symbol_past_truncation() { + use super::{apply_reuse_boost_with, sort_pack_items}; + + // The local one-off has the higher base score; the shared, widely + // consumed component has a lower base score and would be cut by a + // base-score truncate to 1 — unless reuse ranking runs before the cut. + let mut items = vec![ + make_pack_item(100, "src/features/orders/local.tsx", false), + make_pack_item(80, "packages/ui/components/Button.tsx", false), + ]; + + apply_reuse_boost_with(&mut items, |symbol_id| { + if symbol_id.contains("Button") { 12 } else { 0 } + }); + sort_pack_items(&mut items); + items.truncate(1); + + assert_eq!(items.len(), 1); + assert_eq!(items[0].file_path, "packages/ui/components/Button.tsx"); + } + + #[test] + fn build_plan_change_projects_pack_into_typed_sections() { + use super::build_plan_change; + + let mut target_item = make_pack_item(200, "src/orders/create-order.ts", false); + target_item.category = "target".to_owned(); + let items = vec![ + make_pack_item(120, "packages/ui/components/Button.tsx", false), + make_pack_item(90, "src/features/orders/local-helper.ts", false), + target_item, + ]; + let gaps = vec!["Who emits status FINALIZED?".to_owned()]; + let proofs = vec![serde_json::json!({"repo": "alert", "edge": "ConsumesApiFrom"})]; + let next_steps = vec!["crud_trace".to_owned()]; + let change_impact = super::ChangeImpactSummary { + confirmed_downstream_repos: vec!["alert".to_owned()], + cross_repo_callers: vec![super::CrossRepoCaller { + file_path: "src/consumer.ts".to_owned(), + line_start: None, + repo: "report".to_owned(), + symbol_id: "id_consumer".to_owned(), + symbol_kind: "function".to_owned(), + symbol_name: "useOrderTotals".to_owned(), + evidence: None, + }], + ..Default::default() + }; + + let plan = build_plan_change( + "createOrder", + true, + &items, + &gaps, + &proofs, + &next_steps, + &change_impact, + None, + ); + + assert!(plan.found); + // Shared component => reuse candidate; local helper => sibling target; + // the target item is excluded from both. + assert_eq!(plan.reuse_candidates.len(), 1); + assert_eq!( + plan.reuse_candidates[0].file_path, + "packages/ui/components/Button.tsx" + ); + assert_eq!(plan.sibling_clone_targets.len(), 1); + assert_eq!( + plan.sibling_clone_targets[0].file_path, + "src/features/orders/local-helper.ts" + ); + assert_eq!(plan.open_unknowns, gaps); + assert_eq!(plan.cross_repo_reachability.len(), 1); + assert_eq!(plan.verification_plan.len(), 1); + assert_eq!(plan.integration_checks.len(), 1); + assert!(plan.integration_checks[0].contains("alert")); + assert_eq!(plan.write_path_or_state_machine_risks.len(), 1); + assert!(plan.write_path_or_state_machine_risks[0].contains("report::useOrderTotals")); + // Contract: sections awaiting later work still exist (empty). + assert!(plan.standards_to_preserve.is_empty()); + assert!(plan.required_braingent_records.is_empty()); + } + + #[test] + fn plan_change_includes_pass_two_and_v1_checklists() { + use super::{ChangeImpactSummary, build_plan_change}; + + let plan = build_plan_change( + "anyTarget", + true, + &[], + &[], + &[], + &[], + &ChangeImpactSummary::default(), + None, + ); + + assert_eq!( + plan.pass_two_gap_dimensions.len(), + 8, + "all 8 pass-2 dimensions must be present: {:?}", + plan.pass_two_gap_dimensions + ); + + assert!(!plan.v1_completeness_checklist.is_empty()); + assert!( + plan.v1_completeness_checklist + .iter() + .all(|item| item.contains("→ verify:")), + "every v1-completeness item must carry a verify step" + ); + assert!( + plan.v1_completeness_checklist + .iter() + .any(|item| item.contains("Derived-field blast radius")), + "deviation checklist item missing" + ); + + for section in ["pass_two_gap_dimensions", "v1_completeness_checklist"] { + assert!( + plan.contract.sections.iter().any(|s| s == section), + "missing section in manifest: {section}" + ); + } + } + + #[test] + fn plan_change_surfaces_display_ownership_for_cross_service_refs() { + use super::{ChangeImpactSummary, build_plan_change}; + + let change_impact = ChangeImpactSummary { + cross_repo_callers: vec![super::CrossRepoCaller { + file_path: "src/action_panel.ts".to_owned(), + line_start: None, + repo: "reporting".to_owned(), + symbol_id: "id_hub".to_owned(), + symbol_kind: "function".to_owned(), + symbol_name: "renderActionPanel".to_owned(), + evidence: None, + }], + ..Default::default() + }; + let plan = build_plan_change( + "getOrderDisplayName", + true, + &[], + &[], + &[], + &[], + &change_impact, + None, + ); + + assert!( + plan.display_ownership_checks + .iter() + .any(|check| check.contains("reporting")), + "display ownership not surfaced for cross-service ref: {:?}", + plan.display_ownership_checks + ); + assert!( + plan.contract + .sections + .iter() + .any(|s| s == "display_ownership_checks") + ); + } + + #[test] + fn plan_change_omits_display_ownership_without_cross_service_refs() { + use super::{ChangeImpactSummary, build_plan_change}; + + let plan = build_plan_change( + "localOnly", + true, + &[], + &[], + &[], + &[], + &ChangeImpactSummary::default(), + None, + ); + assert!( + plan.display_ownership_checks.is_empty(), + "no cross-service refs should mean no display-ownership prompts" + ); + } + + #[test] + fn plan_change_contract_gate_validates_deterministic_metadata() { + use super::{ + ChangeImpactSummary, PLAN_CHANGE_SCHEMA_VERSION, PLAN_CHANGE_SECTIONS, + build_plan_change, validate_plan_change_contract, + }; + + let mut plan = build_plan_change( + "createOrder", + true, + &[], + &[], + &[], + &[], + &ChangeImpactSummary::default(), + None, + ); + + // A freshly built product satisfies the contract: current schema + // version and the exact canonical section manifest, deterministically. + assert_eq!(plan.contract.schema_version, PLAN_CHANGE_SCHEMA_VERSION); + assert!( + plan.contract + .sections + .iter() + .map(String::as_str) + .eq(PLAN_CHANGE_SECTIONS) + ); + assert!(validate_plan_change_contract(&plan).is_empty()); + + // The gate fails on a stale schema version. + plan.contract.schema_version = 99; + assert!(!validate_plan_change_contract(&plan).is_empty()); + plan.contract.schema_version = PLAN_CHANGE_SCHEMA_VERSION; + + // The gate fails on a mangled section manifest (missing/renamed section). + plan.contract.sections.pop(); + assert!(!validate_plan_change_contract(&plan).is_empty()); + } + + #[test] + fn plan_change_exclusion_ledger_records_capped_fan_out() { + use super::{ChangeImpactSummary, TruncatedRepos, build_plan_change}; + + let change_impact = ChangeImpactSummary { + truncated_repos: Some(TruncatedRepos { + count: 4, + names: vec!["a".to_owned(), "b".to_owned()], + reason_codes: vec!["fan_out_cap".to_owned()], + }), + ..Default::default() + }; + let plan = build_plan_change( + "createOrder", + true, + &[], + &[], + &[], + &[], + &change_impact, + None, + ); + + assert!( + plan.contract + .exclusion_ledger + .iter() + .any(|entry| entry.contains("capped") && entry.contains('4')), + "ledger should record the capped fan-out: {:?}", + plan.contract.exclusion_ledger + ); + + // Sections that are not yet computed are recorded, so an empty value is + // never silently read as "nothing applies". + assert!( + plan.contract + .exclusion_ledger + .iter() + .any(|entry| entry.contains("standards_to_preserve")), + "ledger should record not-yet-computed sections: {:?}", + plan.contract.exclusion_ledger + ); + } + #[test] fn items_without_evidence_are_not_boosted() { let mut items = vec![make_pack_item(80, "a.rs", false)]; diff --git a/crates/gather-step-mcp/tests/batch_plan_change.rs b/crates/gather-step-mcp/tests/batch_plan_change.rs new file mode 100644 index 00000000..4c13a4ee --- /dev/null +++ b/crates/gather-step-mcp/tests/batch_plan_change.rs @@ -0,0 +1,62 @@ +//! Regression: `batch_query` must route `plan_change` to the typed +//! twelve-section product (`PlanChangeResponse`), not the legacy planning-pack +//! `ContextPackResponse`. The direct MCP route and the batch route must agree. + +use std::sync::Arc; + +use gather_step_mcp::{ + config::{McpContext, McpServerConfig}, + server::GatherStepMcpServer, + tools::composite::{BatchQueryOperation, BatchQueryRequest}, +}; +use gather_step_storage::WorkspaceStores; +use rmcp::handler::server::wrapper::Parameters; + +#[tokio::test] +async fn batch_query_plan_change_returns_typed_product() { + let tmp = tempfile::tempdir().expect("tempdir"); + let root = tmp.path(); + let stores = Arc::new(WorkspaceStores::open(root).expect("workspace stores should open")); + let config = McpServerConfig::new(root.join("registry.json"), root.join("graph.redb")); + let ctx = McpContext::from_workspace_stores(config, stores); + let server = GatherStepMcpServer::new(ctx); + + let request = BatchQueryRequest { + ops: vec![BatchQueryOperation { + tool: "plan_change".to_owned(), + arguments: serde_json::json!({ "target": "anySymbol" }), + }], + }; + + let response = server + .batch_query_tool(Parameters(request)) + .await + .expect("batch_query should succeed"); + + // These keys exist only on the typed PlanChangeResponse, never on the + // legacy ContextPackResponse — so their presence proves the batch route + // now matches the direct route. + let body = serde_json::to_string(&response.0).expect("serialize batch response"); + assert!( + body.contains("reuse_candidates"), + "batch plan_change must return the typed product (missing reuse_candidates): {body}" + ); + assert!( + body.contains("verification_plan") && body.contains("\"sections\""), + "batch plan_change must include the contract section manifest: {body}" + ); + for section in [ + "display_ownership_checks", + "pass_two_gap_dimensions", + "v1_completeness_checklist", + ] { + assert!( + body.contains(section), + "batch plan_change missing section `{section}`: {body}" + ); + } + assert!( + body.contains("\"schema_version\":3"), + "batch plan_change must carry the current contract schema version: {body}" + ); +} diff --git a/crates/gather-step-storage/src/graph_store.rs b/crates/gather-step-storage/src/graph_store.rs index e0a002d4..1085cdb2 100644 --- a/crates/gather-step-storage/src/graph_store.rs +++ b/crates/gather-step-storage/src/graph_store.rs @@ -1,7 +1,8 @@ use std::{ - fs, + env, fs, io::ErrorKind, path::{Path, PathBuf}, + sync::atomic::{AtomicU64, Ordering}, }; use gather_step_core::{ @@ -551,6 +552,7 @@ pub trait GraphStore { pub struct GraphStoreDb { db: Database, path: PathBuf, + snapshot_cleanup: Option, /// Active-`BulkModeGuard` reference count. When > 0, write /// transactions use `Durability::None` to skip fsync. The counter /// (rather than a flat `AtomicBool`) is what makes @@ -623,6 +625,14 @@ impl GraphStoreError { /// once the working set fits. const DEFAULT_GRAPH_CACHE_BYTES: usize = 256 * 1024 * 1024; +impl Drop for GraphStoreDb { + fn drop(&mut self) { + if let Some(snapshot) = self.snapshot_cleanup.take() { + let _ = fs::remove_file(snapshot); + } + } +} + impl GraphStoreDb { fn is_missing_table_error(error: &impl core::fmt::Display) -> bool { error.to_string().contains("does not exist") @@ -660,6 +670,7 @@ impl GraphStoreDb { let store = Self { db, path, + snapshot_cleanup: None, bulk_mode: std::sync::atomic::AtomicUsize::new(0), }; if is_new { @@ -670,6 +681,59 @@ impl GraphStoreDb { Ok(store) } + /// Open the graph store for reading without contending on the writer lock. + /// + /// The fast path is a normal [`Self::open`]. When the store is held by + /// another process (an in-progress index or watch) it falls back to a + /// point-in-time copy of the database file and opens that, so a read during + /// a write returns the last committed state instead of blocking. The copy is + /// removed when the returned store is dropped. + pub fn open_read_only(path: impl AsRef) -> Result { + let path = path.as_ref().to_path_buf(); + match Self::open(&path) { + Err( + GraphStoreError::StorageHeld { .. } | GraphStoreError::StorageHeldByDaemon { .. }, + ) => Self::open_snapshot(&path), + other => other, + } + } + + fn open_snapshot(path: &Path) -> Result { + let snapshot_path = Self::snapshot_path(path); + fs::copy(path, &snapshot_path)?; + let mut builder = Database::builder(); + builder.set_cache_size(DEFAULT_GRAPH_CACHE_BYTES); + let db = builder.open(&snapshot_path).map_err(|error| { + let _ = fs::remove_file(&snapshot_path); + Self::map_open_error(path, error) + })?; + let store = Self { + db, + path: path.to_path_buf(), + snapshot_cleanup: Some(snapshot_path), + bulk_mode: std::sync::atomic::AtomicUsize::new(0), + }; + store.validate_schema_version()?; + Ok(store) + } + + fn snapshot_path(path: &Path) -> PathBuf { + static COUNTER: AtomicU64 = AtomicU64::new(0); + let id = COUNTER.fetch_add(1, Ordering::Relaxed); + let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("graph"); + env::temp_dir().join(format!( + "gather-step-snapshot-{stem}-{}-{id}.redb", + std::process::id() + )) + } + + /// Whether this store is backed by a read-only snapshot copy rather than the + /// live database file. + #[must_use] + pub fn is_snapshot(&self) -> bool { + self.snapshot_cleanup.is_some() + } + fn write_schema_version(&self) -> Result<(), GraphStoreError> { let write_txn = self.begin_write_txn()?; { @@ -3922,6 +3986,34 @@ mod tests { )); } + #[test] + fn open_read_only_falls_back_to_snapshot_when_held() { + let (_workspace, _storage, graph_path) = temp_workspace_graph_path("held-snapshot"); + let function = node("service-a", "src/a.ts", NodeKind::Function, "execute", 0); + + let holder = GraphStoreDb::open(&graph_path).expect("first open should succeed"); + holder + .insert_node(&function) + .expect("committed write before the snapshot"); + + let reader = + GraphStoreDb::open_read_only(&graph_path).expect("read-only open should snapshot"); + assert!(reader.is_snapshot(), "held read must use a snapshot copy"); + let loaded = reader + .get_node(function.id) + .expect("snapshot read should succeed") + .expect("committed node should be visible in the snapshot"); + assert_eq!(loaded.name, "execute"); + drop(reader); + drop(holder); + + let direct = GraphStoreDb::open_read_only(&graph_path).expect("uncontended read-only open"); + assert!( + !direct.is_snapshot(), + "an uncontended read must use the live file, not a snapshot" + ); + } + #[test] fn open_stamps_fresh_schema_version_zero() { let graph_path = temp_db_path("fresh-graph-schema"); diff --git a/crates/gather-step-storage/src/search_store.rs b/crates/gather-step-storage/src/search_store.rs index 372dd968..8b8dece3 100644 --- a/crates/gather-step-storage/src/search_store.rs +++ b/crates/gather-step-storage/src/search_store.rs @@ -686,6 +686,77 @@ impl TantivySearchStore { file_path_stored, }) } + + /// Disjunction fallback for multi-**word** capability queries: each + /// whitespace-separated word becomes a `Should` clause and a hit must match + /// at least a majority of them (`ceil(n/2)`). + /// + /// Gated on the *original* query containing two or more words. A single + /// identifier such as `createOrderUseCase` is one word — even though it + /// camelCase-splits into several tokens — and must not fuzzily match + /// siblings that merely share tokens (e.g. `updateOrderUseCase`). Only a + /// genuine phrase like `email notification delivery` should relax to OR. + fn execute_search_disjunctive( + &self, + query_text: &str, + limit: usize, + filters: SearchFilters<'_>, + ) -> Result, SearchStoreError> { + let words: Vec<&str> = query_text.split_whitespace().collect(); + if words.len() < 2 { + return Ok(Vec::new()); + } + + let reader = self.reader.lock(); + let searcher = reader.searcher(); + // Fuzzy on: this is the last-resort recall pass. + let parser = self.build_query_parser(true); + let clauses: Vec<(Occur, Box)> = words + .iter() + .map(|word| { + // Expand each word into (word OR synonyms) so capability phrases + // bridge to the symbol vocabulary (e.g. "login" -> authenticate). + let mut sub: Vec<(Occur, Box)> = Vec::new(); + let (word_query, _) = parser.parse_query_lenient(word); + sub.push((Occur::Should, word_query)); + for synonym in synonym_terms(word) { + let (synonym_query, _) = parser.parse_query_lenient(synonym); + sub.push((Occur::Should, synonym_query)); + } + let clause: Box = if sub.len() == 1 { + sub.pop().expect("clause exists").1 + } else { + Box::new(BooleanQuery::from(sub)) + }; + (Occur::Should, clause) + }) + .collect(); + // INTENTIONAL RECALL BEHAVIOR: ceil(n/2) is 1 for a 2-word query, so + // the last-resort fallback can match a symbol covering a single token + // (possibly via fuzzy/synonym). This trades exact-phrase precision for + // recall and only fires after both conjunctive passes returned empty; + // callers must not treat a non-empty fallback result as an exact match. + let floor = words.len().div_ceil(2).max(1); + let query: Box = + Box::new(BooleanQuery::with_minimum_required_clauses(clauses, floor)); + let query = self.apply_filters(query, filters); + + let fetch_limit = limit.max(1).saturating_mul(5).min(MAX_RESULT_WINDOW); + let collector = TopDocs::with_limit(fetch_limit).order_by_score(); + let docs: Vec<(f32, DocAddress)> = searcher.search(&query, &collector)?; + + let mut hits = docs + .into_iter() + .map(|(score, address)| self.decode_hit(&searcher, address, score, false)) + .collect::, _>>()?; + rerank_hits(&mut hits, query_text); + + Ok(hits + .into_iter() + .take(limit.max(1)) + .map(|scored| scored.hit) + .collect()) + } } impl SearchStore for TantivySearchStore { @@ -718,7 +789,17 @@ impl SearchStore for TantivySearchStore { return Ok(exact_hits); } - self.execute_search(trimmed, limit, false, true, filters) + let fuzzy_hits = self.execute_search(trimmed, limit, false, true, filters)?; + if !fuzzy_hits.is_empty() { + return Ok(fuzzy_hits); + } + + // Both conjunctive passes require every query term to match, so a + // multi-word capability query ("email notification delivery") returns + // nothing when a symbol covers only some terms. Fall back to a + // disjunction with a majority floor so reuse-discovery queries still + // surface partial matches instead of an empty result. + self.execute_search_disjunctive(trimmed, limit, filters) } fn delete_by_files(&self, files: &[(&str, &str)]) -> Result<(), SearchStoreError> { @@ -1071,6 +1152,62 @@ fn infra_repo_penalty(symbol_exact_boost: f32, is_exported: bool, repo: &str) -> if is_infra { 0.85 } else { 1.0 } } +/// Curated concept→vocabulary synonym map (S4). +/// +/// Returns extra terms to OR into a query word's clause in the disjunction +/// fallback so capability phrases bridge to the identifiers code actually uses +/// (e.g. "login" → authenticate). Intentionally small and high-signal; the +/// original word is always searched too, so an empty list is a safe no-op. +fn synonym_terms(word: &str) -> &'static [&'static str] { + // Symmetric concept groups: any member expands to the whole group, so + // recall does not depend on which side of a synonym pair the user typed + // (e.g. "login" and "authenticate" reach each other). The word's own clause + // is also added by the caller; a duplicate OR clause is harmless. + const GROUPS: &[&[&str]] = &[ + &["login", "signin", "auth", "authenticate", "authentication"], + &["logout", "signout", "deauthenticate"], + &[ + "paginate", + "pagination", + "paging", + "cursor", + "offset", + "page", + ], + &["delete", "remove", "destroy"], + &["create", "add", "insert"], + &["update", "edit", "modify"], + &["list", "fetch", "get", "find"], + ]; + for group in GROUPS { + if group.iter().any(|member| word.eq_ignore_ascii_case(member)) { + return group; + } + } + &[] +} + +/// Term-coverage boost (S2). +/// +/// Rewards hits whose symbol-name tokens cover a larger fraction of the query's +/// tokens, so a higher-coverage match outranks a lower-coverage one at equal +/// base score. Scales linearly up to +25% at full coverage. Callers apply this +/// only for multi-word queries; for a single identifier it is meaningless. +fn query_term_coverage_boost(query_tokens: &[String], symbol_name: &str) -> f32 { + if query_tokens.is_empty() { + return 1.0; + } + let symbol_tokens = tokenize_camel_case(symbol_name); + let covered = query_tokens + .iter() + .filter(|qt| symbol_tokens.iter().any(|st| st == *qt)) + .count(); + // Token counts are tiny; u16 conversion avoids the cast-precision-loss lint. + let coverage = f32::from(u16::try_from(covered).unwrap_or(u16::MAX)) + / f32::from(u16::try_from(query_tokens.len()).unwrap_or(u16::MAX)); + 1.0 + coverage * 0.25 +} + fn rerank_hits(hits: &mut [ScoredSearchHit], query: &str) { let newest_timestamp = hits .iter() @@ -1086,6 +1223,10 @@ fn rerank_hits(hits: &mut [ScoredSearchHit], query: &str) { let query_is_pascal = query.chars().next().is_some_and(|c| c.is_ascii_uppercase()); // Tokenize the query once for the file-path token match boost. let query_tokens = tokenize_camel_case(query); + // Coverage boost only applies to genuine multi-word capability queries, + // mirroring the disjunction fallback — a single identifier must not be + // re-ranked by partial token overlap. + let multiword_query = query.split_whitespace().count() >= 2; for scored in hits.iter_mut() { let export_boost = if scored.hit.is_exported { 1.10 } else { 1.0 }; @@ -1110,6 +1251,10 @@ fn rerank_hits(hits: &mut [ScoredSearchHit], query: &str) { // Raised from 1.4× to break ties between multiple same-name hits where // secondary factors (BM25, recency) were pulling unrelated-repo symbols // above the user's likely target. + // NOTE: for a multi-word query (which contains spaces) this exact-match + // boost is always neutral — symbol names have no spaces, so neither + // comparison can hit. Term-coverage is the ordering signal for + // multi-word queries; this boost only differentiates single-token ones. let symbol_name_matches = scored.hit.symbol_name.eq_ignore_ascii_case(query); let symbol_token_matches = !symbol_name_matches && first_code_token(&scored.hit.symbol_name).eq_ignore_ascii_case(query); @@ -1154,6 +1299,13 @@ fn rerank_hits(hits: &mut [ScoredSearchHit], query: &str) { let infra_penalty = infra_repo_penalty(symbol_exact_boost, scored.hit.is_exported, &scored.hit.repo); + // term-coverage boost (S2) — higher query-term coverage ranks higher. + let coverage_boost = if multiword_query { + query_term_coverage_boost(&query_tokens, &scored.hit.symbol_name) + } else { + 1.0 + }; + scored.hit.adjusted_score = scored.base_score * export_boost * exact_boost @@ -1163,7 +1315,8 @@ fn rerank_hits(hits: &mut [ScoredSearchHit], query: &str) { * pascal_type_boost * path_boost * hook_boost - * infra_penalty; + * infra_penalty + * coverage_boost; } hits.sort_by(|left, right| { @@ -1172,6 +1325,11 @@ fn rerank_hits(hits: &mut [ScoredSearchHit], query: &str) { .adjusted_score .partial_cmp(&left.hit.adjusted_score) .unwrap_or(Ordering::Equal) + // Deterministic tie-breaks so equal-score hits have a stable order + // regardless of the underlying Tantivy doc-id ordering. + .then_with(|| left.hit.symbol_name.cmp(&right.hit.symbol_name)) + .then_with(|| left.file_path_stored.cmp(&right.file_path_stored)) + .then_with(|| left.hit.node_id.cmp(&right.hit.node_id)) }); } @@ -1550,6 +1708,170 @@ mod tests { assert!(results[0].exact_match); } + #[test] + fn partial_multiword_query_recovers_via_disjunction_fallback() { + let store = + TantivySearchStore::open(temp_search_dir("or-fallback")).expect("store should open"); + let mut symbol = node("dispatchEmailNotification", "src/notifications/dispatch.ts"); + // Keep `content` empty so the match can only come from the symbol-name + // tokens (dispatch / email / notification), making the assertion precise. + symbol.signature = None; + store + .index_symbol(&SearchDocument::from_node(&symbol, 1)) + .expect("document should index"); + + // Capability-style query: two of three terms ("email", "notification") + // match the symbol's camelCase tokens; "delivery" appears nowhere. The + // conjunctive (AND) default returns nothing, so a disjunction fallback + // with a majority floor must surface the symbol for reuse discovery. + let results = store + .search("email notification delivery", 10) + .expect("search should succeed"); + + assert_eq!( + results.len(), + 1, + "partial multi-word query should match via OR fallback" + ); + assert_eq!(results[0].symbol_name, "dispatchEmailNotification"); + } + + #[test] + fn single_identifier_miss_does_not_fall_back_to_token_sharing_sibling() { + let store = + TantivySearchStore::open(temp_search_dir("or-precision")).expect("store should open"); + let mut existing = node("updateOrderUseCase", "src/b.ts"); + existing.signature = None; + store + .index_symbol(&SearchDocument::from_node(&existing, 1)) + .expect("document should index"); + + // No `createOrderUseCase` exists. Because the query is a single + // identifier (one word), the disjunction fallback must NOT fire and + // resurrect the token-sharing sibling `updateOrderUseCase` on the + // shared Order/Use/Case tokens. Adversarial guard for the OR fallback. + let results = store + .search("createOrderUseCase", 10) + .expect("search should succeed"); + + assert!( + results.is_empty(), + "single-identifier miss must not OR-match a token-sharing sibling" + ); + } + + #[test] + fn rerank_prefers_higher_query_term_coverage_on_equal_base_score() { + use super::{ScoredSearchHit, SearchHit, rerank_hits}; + + fn scored(symbol: &str, base: f32) -> ScoredSearchHit { + ScoredSearchHit { + hit: SearchHit { + node_id: node_id("service-a", "src/x.ts", NodeKind::Function, symbol), + repo: "service-a".to_owned(), + file_path: String::new(), + symbol_name: symbol.to_owned(), + node_kind: NodeKind::Function, + adjusted_score: 0.0, + exact_match: false, + is_exported: true, + lang: "typescript".to_owned(), + }, + base_score: base, + last_modified: 1, + file_path_stored: String::new(), + } + } + + // Equal base score and identical flags. Against "email notification + // delivery", `dispatchEmailNotification` covers two query terms while + // `deliveryService` covers one. The higher-coverage hit must rank first + // even though it is listed second in the input. + let mut hits = vec![ + scored("deliveryService", 1.0), + scored("dispatchEmailNotification", 1.0), + ]; + rerank_hits(&mut hits, "email notification delivery"); + + assert_eq!(hits[0].hit.symbol_name, "dispatchEmailNotification"); + } + + #[test] + fn rerank_breaks_score_ties_deterministically_by_symbol_name() { + use super::{ScoredSearchHit, SearchHit, rerank_hits}; + + fn scored(symbol: &str) -> ScoredSearchHit { + ScoredSearchHit { + hit: SearchHit { + node_id: node_id("service-a", "src/x.ts", NodeKind::Function, symbol), + repo: "service-a".to_owned(), + file_path: String::new(), + symbol_name: symbol.to_owned(), + node_kind: NodeKind::Function, + adjusted_score: 0.0, + exact_match: false, + is_exported: true, + lang: "typescript".to_owned(), + }, + base_score: 1.0, + last_modified: 1, + file_path_stored: String::new(), + } + } + + // Single-word query (no coverage boost): both hits resolve to the same + // adjusted score, so the symbol-name tie-break must impose a stable + // order regardless of input order. + let mut hits = vec![scored("zebra"), scored("alpha")]; + rerank_hits(&mut hits, "unrelated"); + + assert_eq!(hits[0].hit.symbol_name, "alpha"); + assert_eq!(hits[1].hit.symbol_name, "zebra"); + } + + #[test] + fn disjunction_fallback_bridges_query_synonyms_to_symbol_vocabulary() { + let store = TantivySearchStore::open(temp_search_dir("synonym-fallback")) + .expect("store should open"); + let mut symbol = node("authenticateUser", "src/auth/authenticate-user.ts"); + symbol.signature = None; + store + .index_symbol(&SearchDocument::from_node(&symbol, 1)) + .expect("document should index"); + + // "login" is a concept synonym of the symbol's "authenticate" token; + // "workflow" matches nothing. Without synonym expansion the disjunction + // fallback finds no matching term and returns empty; the login → + // authenticate bridge must surface the symbol for capability search. + let results = store + .search("login workflow", 10) + .expect("search should succeed"); + + assert_eq!(results.len(), 1); + assert_eq!(results[0].symbol_name, "authenticateUser"); + } + + #[test] + fn disjunction_fallback_synonyms_are_symmetric() { + let store = TantivySearchStore::open(temp_search_dir("synonym-symmetric")) + .expect("store should open"); + let mut symbol = node("loginUser", "src/auth/login-user.ts"); + symbol.signature = None; + store + .index_symbol(&SearchDocument::from_node(&symbol, 1)) + .expect("document should index"); + + // Reverse of the previous test: the symbol uses "login" and the query + // uses "authenticate". Symmetric synonym groups must bridge either + // direction, so recall does not depend on which term the user typed. + let results = store + .search("authenticate workflow", 10) + .expect("search should succeed"); + + assert_eq!(results.len(), 1); + assert_eq!(results[0].symbol_name, "loginUser"); + } + #[test] fn search_filters_apply_before_collecting_hits() { let store = TantivySearchStore::open(temp_search_dir("filtered-search")) diff --git a/crates/gather-step-storage/src/stores.rs b/crates/gather-step-storage/src/stores.rs index 4865cc2e..ac7e90a4 100644 --- a/crates/gather-step-storage/src/stores.rs +++ b/crates/gather-step-storage/src/stores.rs @@ -52,7 +52,7 @@ impl WorkspaceStores { pub fn open_read_only_search(root: impl AsRef) -> Result { let root = root.as_ref().to_path_buf(); - let graph = Arc::new(GraphStoreDb::open(root.join("graph.redb"))?); + let graph = Arc::new(GraphStoreDb::open_read_only(root.join("graph.redb"))?); let search = Arc::new(TantivySearchStore::open_read_only(root.join("search"))?); let metadata = Arc::new(MetadataStoreDb::open(root.join("metadata.sqlite"))?); Ok(Self { diff --git a/website/package.json b/website/package.json index 9b136814..0b3f3e14 100644 --- a/website/package.json +++ b/website/package.json @@ -1,7 +1,7 @@ { "name": "gather-step-website", "type": "module", - "version": "4.2.1", + "version": "4.3.1", "private": true, "packageManager": "bun@1.3.12", "scripts": { diff --git a/website/src/components/landing/Benchmark.astro b/website/src/components/landing/Benchmark.astro index c84ba87d..78ff8c1e 100644 --- a/website/src/components/landing/Benchmark.astro +++ b/website/src/components/landing/Benchmark.astro @@ -69,7 +69,7 @@ const speedup = (totalManual / totalGatherStep).toFixed(1); on the same machine; medians of three runs.

- v4.2.1 · 2026-06-02 + v4.3.1 · 2026-06-02
@@ -141,7 +141,7 @@ const speedup = (totalManual / totalGatherStep).toFixed(1);
- Planning oracle (v4.2.1) + Planning oracle (v4.3.1)

Full 25-scenario suite, every metric at the HIGH ceiling, latency well under release-gate thresholds (p50 ≤ 50 ms, p95 ≤ 300 ms, p99 ≤ 1000 ms).

diff --git a/website/src/components/landing/Hero.astro b/website/src/components/landing/Hero.astro index ed4622e2..7135b601 100644 --- a/website/src/components/landing/Hero.astro +++ b/website/src/components/landing/Hero.astro @@ -6,7 +6,7 @@ const stats = [ }, { label: 'Spec · Release', - value: 'v4.2.1 · dependency refresh', + value: 'v4.3.1 · planning & reuse quality', }, { label: 'Spec · Deployment', diff --git a/website/src/components/landing/Topology.astro b/website/src/components/landing/Topology.astro index e158e2e2..2bc84d23 100644 --- a/website/src/components/landing/Topology.astro +++ b/website/src/components/landing/Topology.astro @@ -72,6 +72,6 @@ const consumers = [ CONSUMERS · N=4 - Approved · v4.2.1 + Approved · v4.3.1 diff --git a/website/src/content/docs/changelog.md b/website/src/content/docs/changelog.md index 0c39768f..d6a5f5cf 100644 --- a/website/src/content/docs/changelog.md +++ b/website/src/content/docs/changelog.md @@ -5,6 +5,37 @@ description: "User-visible changes to gather-step, listed by release. Updated ma This changelog lists significant user-visible changes. The latest release is shown in full at the top; earlier releases are collapsed under [Earlier releases](#earlier-releases) at the bottom of the page. +## v4.3.1 (2026-06-02) + +Release status: **prepared**. + +Planning- and reuse-quality release. Fixes retrieval recall so reuse search stops returning empty, ranks reuse candidates using the graph, ships a typed `plan_change` product with a stable section contract, and adds the first batch of v4.3.1 gap-closure work (lock-contention disclosure, a display-ownership planning dimension, and mongo/Atlas structural detectors). Partial against the full v4.3.0/v4.3.1 backlog — see the PR for the done/deferred ledger. + +### Added + +- **`gather-step doctor` code-quality advisories** — non-gating findings over the indexed graph: dependency cycles (incl. cross-repo, via Tarjan SCC), mock/fixture imports leaking into production modules, and local forks of shared/design-system components that should be reused. +- **Graph-ranked reuse evidence** in planning packs: reuse candidates are ranked by sibling-consumer count, shared/design-system membership, and cross-repo proof strength before truncation, so a blessed shared component ranks above a bespoke fork (S3). +- **Typed `plan_change` product** with a fixed, contract-checked section set (E1). Sections are always present (possibly empty), with an exclusion ledger recording what was dropped so a capped result is never read as exhaustive. +- **Display-ownership planning dimension** (`display_ownership_checks`): every cross-service reference surfaces the question of whether display fields come from the owner service (snapshot/API) rather than a direct cross-service DB lookup (DSO1). +- **Mongo/Atlas structural safety detectors** with stable rule IDs and confidence: `$lookup` join-key coercion that defeats an index (`GS-MONGO-INDEX-DEFEAT`), bare `$toObjectId` on untrusted input (`GS-MONGO-UNSAFE-COERCION`), unguarded dotted-path `$set` (`GS-MONGO-NULL-PARENT-PATH`), and `dynamic:false` Atlas index↔doc-field drift (`GS-MONGO-ATLAS-INDEX-DRIFT`) (MQS1–MQS3, AIX1). +- **Query-time index freshness** (`fresh`/`stale`/`never_indexed`) is now classified against the working tree's HEAD and surfaced per repo in `gather-step status` (A13). +- **Multi-path traversal provenance**: graph traversals now report every distinct path into a node plus `depth_capped`/`truncated` signals when a walk is cut short by depth or fan-out bounds (B8/B9). + +### Changed + +- **Multi-word search recall**: a conjunctive query that returns nothing now falls back to a disjunction with a min-should-match floor, so a capability query sharing most of its terms still finds the target symbol (S1). Hits are re-ranked by query-term coverage (S2) and expanded through a curated synonym map (S4). +- **Unified `min_confidence` edge filter** across trace/impact/pack traversal, with `None`-confidence edges treated as trusted (A14). +- The `plan_change` contract gate is now **evidentiary** — it asserts schema version, the exact section manifest, and the exclusion ledger, not just section presence (G7). + +### Fixed + +- `batch_query` now routes `plan_change` requests to the typed product instead of the raw planning pack. +- Read commands no longer block or fail when the graph store is held by an in-progress index or watch: they route to the holding daemon when one is serving, otherwise read a point-in-time snapshot of the graph. When neither is possible the command exits with a distinct, documented code and (under `--json`) a `degraded: graph_locked` disclosure, so a blocked read can never be mistaken for an empty-but-successful result. + +### Release-wide + +- Bumped the app, Cargo workspace, internal crate dependency versions, landing-page release stamps, and website package metadata to `4.3.1`. + ## v4.2.1 (2026-06-02) Release status: **prepared**.