diff --git a/Cargo.lock b/Cargo.lock index ad08cf01..d606200e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -912,6 +912,7 @@ dependencies = [ "rayon", "regex", "rskim-core", + "rskim-search", "rusqlite", "serde", "serde_json", @@ -950,6 +951,18 @@ dependencies = [ "tree-sitter-typescript", ] +[[package]] +name = "rskim-search" +version = "2.1.0" +dependencies = [ + "rskim-core", + "rustc-hash", + "serde", + "serde_json", + "thiserror 1.0.69", + "tree-sitter", +] + [[package]] name = "rusqlite" version = "0.31.0" diff --git a/Cargo.toml b/Cargo.toml index c3b864d0..ebf49fcb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] members = [ "crates/rskim-core", + "crates/rskim-search", "crates/rskim", ] resolver = "2" @@ -46,6 +47,7 @@ predicates = "3.0" tempfile = "3.0" insta = "1.39" criterion = "0.5" +rustc-hash = "1.1" [workspace.metadata.dist] cargo-dist-version = "0.14.0" diff --git a/crates/rskim-search/Cargo.toml b/crates/rskim-search/Cargo.toml new file mode 100644 index 00000000..f08372ba --- /dev/null +++ b/crates/rskim-search/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "rskim-search" +version = "2.1.0" +edition = "2021" +authors = ["Skim Contributors"] +license = "MIT" +description = "Search and indexing layer for skim's code intelligence" +repository = "https://github.com/dean0x/skim" +readme = "README.md" +keywords = ["code-search", "ast", "tree-sitter", "llm"] +categories = ["parser-implementations", "development-tools"] + +[dependencies] +rskim-core = { version = "2.1.0", path = "../rskim-core" } +tree-sitter = { workspace = true } +thiserror = { workspace = true } +serde = { workspace = true } +rustc-hash = { workspace = true } + +[dev-dependencies] +serde_json = { workspace = true } + +[lints.clippy] +# Enforce strict error handling in library code +unwrap_used = "deny" +expect_used = "deny" +panic = "deny" +todo = "warn" # Allow during development, warn to remind removal + +[lib] +name = "rskim_search" +path = "src/lib.rs" diff --git a/crates/rskim-search/src/lib.rs b/crates/rskim-search/src/lib.rs new file mode 100644 index 00000000..3d7851a5 --- /dev/null +++ b/crates/rskim-search/src/lib.rs @@ -0,0 +1,26 @@ +//! Search and indexing layer for skim's code intelligence +//! +//! # Architecture +//! +//! This crate provides the 3-layer search architecture: +//! - **Layer 1**: Lexical indexing (BM25F with field boosting) +//! - **Layer 2**: AST n-gram indexing (structural code patterns) +//! - **Layer 3**: Temporal signals (git-aware recency and change patterns) +//! +//! All types and traits are defined here. Layer implementations +//! are added in subsequent waves. +//! +//! # Design Principles +//! +//! 1. **I/O-free types** - Core types have no filesystem dependencies +//! 2. **Trait-first** - Layers implement `SearchLayer`, built via `LayerBuilder` +//! 3. **FileId indirection** - All layers reference files by `FileId`, resolved via `FileTable` + +mod traits; +mod types; + +pub use traits::{FieldClassifier, LayerBuilder, SearchLayer}; +pub use types::{ + FileId, FileTable, IndexStats, LineRange, MatchSpan, Result, SearchError, SearchField, + SearchQuery, SearchResult, TemporalFlags, MAX_FILE_TABLE_ENTRIES, +}; diff --git a/crates/rskim-search/src/traits.rs b/crates/rskim-search/src/traits.rs new file mode 100644 index 00000000..2cc12b5b --- /dev/null +++ b/crates/rskim-search/src/traits.rs @@ -0,0 +1,49 @@ +//! Core traits for the 3-layer search architecture. +//! +//! These traits define the contracts that layer implementations must satisfy. +//! Layers are built via [`LayerBuilder`] and become immutable [`SearchLayer`]s +//! after construction. [`FieldClassifier`] is used during indexing to map +//! AST nodes to semantic [`SearchField`] values. + +use std::path::Path; + +use rskim_core::Language; + +use crate::{FileId, Result, SearchField, SearchQuery}; + +/// Immutable search index that scores files against a query. +/// +/// Implementations are built via [`LayerBuilder`] and are immutable after construction. +/// Callers resolve [`FileId`] values to paths via [`FileTable`]. +pub trait SearchLayer: Send + Sync { + /// Score files against the given query. + /// + /// Returns a list of `(FileId, score)` pairs, ordered by descending score. + /// Higher scores indicate stronger matches. Scores are not normalized across layers. + fn search(&self, query: &SearchQuery) -> Result>; +} + +/// Builder for constructing a [`SearchLayer`]. +/// +/// Accepts files one at a time, then produces an immutable layer via [`build`]. +/// Consumed by `build` — single-use pattern. +pub trait LayerBuilder: Send { + /// Add a file's content to the index being built. + fn add_file(&mut self, path: &Path, content: &str, language: Language) -> Result<()>; + + /// Consume this builder and produce an immutable [`SearchLayer`]. + /// + /// Uses `Box` for object safety with `Box`. + fn build(self: Box) -> Result>; +} + +/// Classifies tree-sitter AST nodes into semantic search fields. +/// +/// Returns `None` for nodes that are not interesting for search indexing +/// (e.g., punctuation, whitespace). `None` means "skip this node." +pub trait FieldClassifier: Send + Sync { + /// Classify a tree-sitter node into a search field. + /// + /// Returns `None` if the node is not relevant for indexing. + fn classify_node(&self, node: &tree_sitter::Node<'_>, source: &str) -> Option; +} diff --git a/crates/rskim-search/src/types/error.rs b/crates/rskim-search/src/types/error.rs new file mode 100644 index 00000000..b0aca34c --- /dev/null +++ b/crates/rskim-search/src/types/error.rs @@ -0,0 +1,31 @@ +//! Error types for search and indexing operations. + +use thiserror::Error; + +/// Errors that can occur during search and indexing operations. +#[non_exhaustive] +#[derive(Debug, Error)] +pub enum SearchError { + /// An I/O error occurred while reading or writing index files. + #[error("IO error: {0}")] + Io(#[from] std::io::Error), + + /// The index is corrupt, missing, or in an incompatible format. + #[error("Index error: {0}")] + IndexError(String), + + /// The query is malformed or contains unsupported constructs. + #[error("Invalid query: {0}")] + InvalidQuery(String), + + /// An error propagated from `rskim-core`. + #[error("Core error: {0}")] + CoreError(#[from] rskim_core::SkimError), + + /// A serialization or deserialization error occurred. + #[error("Serialization error: {0}")] + SerializationError(String), +} + +/// Result type alias for search operations. +pub type Result = std::result::Result; diff --git a/crates/rskim-search/src/types/file_table.rs b/crates/rskim-search/src/types/file_table.rs new file mode 100644 index 00000000..060c6d7c --- /dev/null +++ b/crates/rskim-search/src/types/file_table.rs @@ -0,0 +1,277 @@ +//! File identity types and the bidirectional path-to-id mapping. +//! +//! ARCHITECTURE: FileTable is I/O-free. It never touches the filesystem. +//! All normalization is done lexically via path component analysis. + +// FileTable uses usize↔u64 conversions that are infallible only on 64-bit targets. +// Reject compilation on 32-bit to prevent silent truncation of FileId values. +#[cfg(not(target_pointer_width = "64"))] +compile_error!("rskim-search requires a 64-bit target (usize must be at least 64 bits)"); + +use std::path::{Component, Path, PathBuf}; + +use rustc_hash::FxHashMap; + +use serde::{Deserialize, Serialize}; + +use crate::SearchError; + +/// Maximum number of entries allowed in a deserialized [`FileTable`]. +/// +/// Prevents OOM from malicious or corrupted index files. 10 million files +/// is well beyond any realistic codebase while still catching abuse. +pub const MAX_FILE_TABLE_ENTRIES: usize = 10_000_000; + +/// Opaque identifier for a file in the search index. +/// +/// All search layers reference files by `FileId`, resolved to paths via [`FileTable`]. +/// This indirection allows layers to store compact integer keys instead of heap-allocated paths. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct FileId(u64); + +impl FileId { + /// Create a new `FileId` from a raw integer. + pub fn new(id: u64) -> Self { + Self(id) + } + + /// Return the raw integer value. + pub fn as_u64(self) -> u64 { + self.0 + } +} + +/// Bidirectional mapping between file paths and compact [`FileId`] values. +/// +/// All search layers reference files by `FileId`. Callers resolve IDs back to +/// paths via [`FileTable::lookup`]. The table is I/O-free — it does not touch +/// the filesystem. +#[derive(Debug, Clone)] +pub struct FileTable { + /// Ordered list of registered paths; index into this vec is the raw FileId. + paths: Vec, + /// Reverse map: normalized path -> FileId. + ids: FxHashMap, +} + +// Custom Serialize: only emit the paths vec (ids are derived) +impl Serialize for FileTable { + fn serialize(&self, serializer: S) -> std::result::Result { + self.paths.serialize(serializer) + } +} + +// Custom Deserialize: reconstruct ids from paths, normalizing each path so that +// deserialized state is consistent with state built via register(). Without +// normalization, a serialized "./src/main.rs" would be treated as a new path +// by subsequent register("src/main.rs") calls, breaking idempotency. +impl<'de> Deserialize<'de> for FileTable { + fn deserialize>( + deserializer: D, + ) -> std::result::Result { + let raw_paths = Vec::::deserialize(deserializer)?; + if raw_paths.len() > MAX_FILE_TABLE_ENTRIES { + return Err(serde::de::Error::custom(format!( + "file table has {} entries, exceeds maximum of {MAX_FILE_TABLE_ENTRIES}", + raw_paths.len() + ))); + } + let mut paths = Vec::with_capacity(raw_paths.len()); + let mut ids = FxHashMap::with_capacity_and_hasher(raw_paths.len(), Default::default()); + for (i, p) in raw_paths.iter().enumerate() { + let normalized = Self::normalize(p); + if ids.contains_key(&normalized) { + return Err(serde::de::Error::custom(format!( + "file table contains duplicate path after normalization: {}", + normalized.display() + ))); + } + // u64::try_from(i) is infallible on 64-bit (guarded by compile_error above) + #[allow(clippy::cast_possible_truncation)] + let id = FileId::new(i as u64); + ids.insert(normalized.clone(), id); + paths.push(normalized); + } + Ok(Self { paths, ids }) + } +} + +impl FileTable { + /// Create an empty `FileTable`. + pub fn new() -> Self { + Self { + paths: Vec::new(), + ids: FxHashMap::default(), + } + } + + /// Register `path` and return its `FileId`. + /// + /// Idempotent: re-registering an already-known path returns the same `FileId`. + /// The path is normalized (leading `./` stripped, `..` components collapsed) before + /// lookup; two paths that normalize to the same value get the same `FileId`. + pub fn register(&mut self, path: &Path) -> FileId { + let normalized = Self::normalize(path); + self.register_normalized(normalized) + } + + /// Register `path` after verifying it is contained within `root`. + /// + /// The path is normalized first. Containment is verified by joining `root` + /// with the normalized path, re-normalizing the result, and checking that it + /// starts with the normalized `root`. This prevents directory traversal attacks + /// even when paths originate from untrusted input — including paths like + /// `"other_project/secret.rs"` which would not be caught by a `..` check alone. + /// + /// Returns [`SearchError::InvalidQuery`] if the path escapes `root`. + pub fn register_within(&mut self, path: &Path, root: &Path) -> crate::Result { + let normalized = Self::normalize(path); + // Reject paths that are absolute — they cannot be confined to any root. + if normalized.has_root() { + return Err(SearchError::InvalidQuery(format!( + "absolute path not allowed: {}", + normalized.display() + ))); + } + // Verify containment: join root with the relative path, re-normalize the + // result, and confirm it still starts with the normalized root. + let normalized_root = Self::normalize(root); + let joined = Self::normalize(&normalized_root.join(&normalized)); + if !joined.starts_with(&normalized_root) { + return Err(SearchError::InvalidQuery(format!( + "path escapes project root: {}", + normalized.display() + ))); + } + Ok(self.register_normalized(normalized)) + } + + /// Insert a pre-normalized path and return its [`FileId`]. + /// + /// This is the single insertion point for both [`register`] and [`register_within`], + /// ensuring each call path normalizes exactly once. + /// + /// [`register`]: Self::register + fn register_normalized(&mut self, normalized: PathBuf) -> FileId { + if let Some(&id) = self.ids.get(&normalized) { + return id; + } + // paths.len() → u64 is infallible on 64-bit (guarded by compile_error above); + // usize is always ≤ u64 on 64-bit targets. + #[allow(clippy::cast_possible_truncation)] + let id = FileId::new(self.paths.len() as u64); + // Clone into ids first, then move the original into paths — avoids a second clone. + self.ids.insert(normalized.clone(), id); + self.paths.push(normalized); + id + } + + /// Resolve a `FileId` back to a path, if it was registered. + /// + /// Returns `None` for IDs that were never registered with this table. + pub fn lookup(&self, id: FileId) -> Option<&Path> { + // usize::try_from is infallible on 64-bit targets (guarded by compile_error above), + // but using try_from makes the conversion explicit and safe. + let idx = usize::try_from(id.as_u64()).ok()?; + self.paths.get(idx).map(PathBuf::as_path) + } + + /// Return the number of registered files. + pub fn len(&self) -> usize { + self.paths.len() + } + + /// Return `true` if no files have been registered. + pub fn is_empty(&self) -> bool { + self.paths.is_empty() + } + + /// Normalize `path` for consistent lookup. + /// + /// Rules (I/O-free — no `fs::canonicalize`): + /// - Leading `./` is stripped (CurDir component removed). + /// - `..` components are collapsed by removing the preceding component. + /// - Absolute paths are kept as-is. + fn normalize(path: &Path) -> PathBuf { + // Fast path: if the path contains no `.` or `..` components, it is already + // normalized — return a cheap clone without allocating the components Vec. + let needs_normalization = path + .components() + .any(|c| matches!(c, Component::CurDir | Component::ParentDir)); + if !needs_normalization { + return path.to_path_buf(); + } + + let mut components: Vec> = Vec::new(); + for component in path.components() { + match component { + Component::CurDir => { + // Strip `.` components (handles leading `./`) + } + Component::ParentDir => { + // Pop the last normal component to handle `..` + if matches!(components.last(), Some(Component::Normal(_))) { + components.pop(); + } else { + components.push(component); + } + } + other => { + components.push(other); + } + } + } + components.iter().collect() + } +} + +impl Default for FileTable { + fn default() -> Self { + Self::new() + } +} + +// ============================================================================ +// Unit Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_file_id_accessors() { + let id = FileId::new(42); + assert_eq!(id.as_u64(), 42); + } + + #[test] + fn test_file_table_register_and_lookup() { + let mut table = FileTable::new(); + assert!(table.is_empty()); + + let id = table.register(Path::new("src/main.rs")); + assert_eq!(table.len(), 1); + assert!(!table.is_empty()); + + let path = table.lookup(id); + assert_eq!(path, Some(Path::new("src/main.rs"))); + + // Idempotent: re-registering returns the same FileId + let id2 = table.register(Path::new("src/main.rs")); + assert_eq!(id, id2); + assert_eq!(table.len(), 1); + } + + #[test] + fn test_file_table_normalizes_paths() { + let mut table = FileTable::new(); + + let id1 = table.register(Path::new("./src/main.rs")); + let id2 = table.register(Path::new("src/main.rs")); + + // Both paths normalize to "src/main.rs" — same FileId, single entry + assert_eq!(id1, id2); + assert_eq!(table.len(), 1); + } +} diff --git a/crates/rskim-search/src/types/mod.rs b/crates/rskim-search/src/types/mod.rs new file mode 100644 index 00000000..a552c155 --- /dev/null +++ b/crates/rskim-search/src/types/mod.rs @@ -0,0 +1,15 @@ +//! Core type definitions for rskim-search. +//! +//! ARCHITECTURE: This module defines ALL types used across the search layer. +//! Design principle: I/O-free types with explicit error handling. +//! All types follow the rskim-core pattern: thiserror for errors, explicit derives. + +mod error; +mod file_table; +mod query; +mod result; + +pub use error::{Result, SearchError}; +pub use file_table::{FileId, FileTable, MAX_FILE_TABLE_ENTRIES}; +pub use query::{SearchField, SearchQuery, TemporalFlags}; +pub use result::{IndexStats, LineRange, MatchSpan, SearchResult}; diff --git a/crates/rskim-search/src/types/query.rs b/crates/rskim-search/src/types/query.rs new file mode 100644 index 00000000..08c9251e --- /dev/null +++ b/crates/rskim-search/src/types/query.rs @@ -0,0 +1,205 @@ +//! Search query types: `SearchQuery`, `TemporalFlags`, and `SearchField`. + +use serde::{Deserialize, Serialize}; + +/// Semantic field within a source file used for field-boosted search scoring. +/// +/// Each variant corresponds to a distinct syntactic region of a file. +/// Field weights are defined by [`SearchField::default_boost`] and are applied +/// during BM25F scoring in the lexical search layer. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum SearchField { + /// Top-level type, interface, struct, class, or enum definition. + TypeDefinition, + /// Function or method signature (name + parameters + return type). + FunctionSignature, + /// Bare symbol name (identifier without surrounding context). + SymbolName, + /// Import or export statement. + ImportExport, + /// Function or method body (implementation detail). + FunctionBody, + /// Doc comment or regular comment. + Comment, + /// String literal value. + StringLiteral, +} + +impl SearchField { + /// Return the default BM25F boost factor for this field. + /// + /// Higher values cause matches in this field to score more strongly. + pub fn default_boost(self) -> f32 { + match self { + Self::TypeDefinition => 5.0, + Self::FunctionSignature => 4.0, + Self::SymbolName => 3.5, + Self::ImportExport => 3.0, + Self::FunctionBody => 1.0, + Self::Comment => 0.8, + Self::StringLiteral => 0.5, + } + } +} + +/// Temporal filter flags for query-time filtering by git activity signals. +/// +/// All flags default to `false` (disabled). Any combination is valid. +#[derive(Debug, Clone, Default)] +pub struct TemporalFlags { + /// Include only files with high blast radius (many dependents). + pub blast_radius: bool, + /// Include only files with recent commit activity ("hot" files). + pub hot: bool, + /// Include only files with no recent changes ("cold" files). + pub cold: bool, + /// Include only files with high churn or complexity. + pub risky: bool, +} + +/// Query to execute against the search index. +/// +/// Constructed via [`SearchQuery::new`] or the convenience [`SearchQuery::text`], +/// then customised with builder methods. +/// +/// # Examples +/// +/// ``` +/// use rskim_search::SearchQuery; +/// +/// let q = SearchQuery::text("parse_file").with_limit(20); +/// ``` +#[must_use] +#[derive(Debug, Clone)] +pub struct SearchQuery { + /// Free-text query string for lexical matching. + pub text_query: Option, + /// AST pattern string for structural matching. + pub ast_pattern: Option, + /// Temporal filter flags. + pub temporal_flags: TemporalFlags, + /// Maximum number of results to return. + pub limit: usize, + /// Number of results to skip (pagination offset). + pub offset: usize, +} + +impl SearchQuery { + /// Create a query with default settings (no text, limit 50, offset 0). + pub fn new() -> Self { + Self { + text_query: None, + ast_pattern: None, + temporal_flags: TemporalFlags::default(), + limit: 50, + offset: 0, + } + } + + /// Convenience constructor: create a query with the given text. + /// + /// Equivalent to `SearchQuery::new().with_text(query)`. + pub fn text(query: &str) -> Self { + Self::new().with_text(query) + } + + /// Set the free-text query string. + pub fn with_text(mut self, text: &str) -> Self { + self.text_query = Some(text.to_string()); + self + } + + /// Set the maximum number of results. + pub fn with_limit(mut self, limit: usize) -> Self { + self.limit = limit; + self + } + + /// Set the pagination offset. + pub fn with_offset(mut self, offset: usize) -> Self { + self.offset = offset; + self + } + + /// Set the AST pattern for structural matching. + pub fn with_ast_pattern(mut self, pattern: &str) -> Self { + self.ast_pattern = Some(pattern.to_string()); + self + } + + /// Set the temporal filter flags. + pub fn with_temporal_flags(mut self, flags: TemporalFlags) -> Self { + self.temporal_flags = flags; + self + } +} + +impl Default for SearchQuery { + fn default() -> Self { + Self::new() + } +} + +// ============================================================================ +// Unit Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_search_query_new_defaults() { + let q = SearchQuery::new(); + assert!(q.text_query.is_none()); + assert_eq!(q.limit, 50); + assert_eq!(q.offset, 0); + } + + #[test] + fn test_search_query_text_convenience() { + let q = SearchQuery::text("foo"); + assert_eq!(q.text_query, Some("foo".to_string())); + } + + #[test] + fn test_search_query_builder_chain() { + let flags = TemporalFlags { + hot: true, + ..Default::default() + }; + let q = SearchQuery::new() + .with_text("bar") + .with_limit(10) + .with_offset(5) + .with_ast_pattern("fn _()") + .with_temporal_flags(flags); + + assert_eq!(q.text_query, Some("bar".to_string())); + assert_eq!(q.limit, 10); + assert_eq!(q.offset, 5); + assert_eq!(q.ast_pattern, Some("fn _()".to_string())); + assert!(q.temporal_flags.hot); + assert!(!q.temporal_flags.blast_radius); + } + + #[test] + fn test_search_field_boost_values() { + assert_eq!(SearchField::TypeDefinition.default_boost(), 5.0); + assert_eq!(SearchField::FunctionSignature.default_boost(), 4.0); + assert_eq!(SearchField::SymbolName.default_boost(), 3.5); + assert_eq!(SearchField::ImportExport.default_boost(), 3.0); + assert_eq!(SearchField::FunctionBody.default_boost(), 1.0); + assert_eq!(SearchField::Comment.default_boost(), 0.8); + assert_eq!(SearchField::StringLiteral.default_boost(), 0.5); + } + + #[test] + fn test_temporal_flags_default() { + let flags = TemporalFlags::default(); + assert!(!flags.blast_radius); + assert!(!flags.hot); + assert!(!flags.cold); + assert!(!flags.risky); + } +} diff --git a/crates/rskim-search/src/types/result.rs b/crates/rskim-search/src/types/result.rs new file mode 100644 index 00000000..32874146 --- /dev/null +++ b/crates/rskim-search/src/types/result.rs @@ -0,0 +1,119 @@ +//! Search result types: `SearchResult`, `MatchSpan`, `LineRange`, and `IndexStats`. + +use std::path::PathBuf; + +use serde::{Deserialize, Serialize}; + +use super::query::SearchField; + +/// Byte-offset span within a source file. +/// +/// Both `start` and `end` are byte offsets into the original UTF-8 source. +/// The span is half-open: `[start, end)`. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct MatchSpan { + /// Start byte offset (inclusive). + pub start: u32, + /// End byte offset (exclusive). + pub end: u32, +} + +impl MatchSpan { + /// Create a new span from start and end byte offsets. + pub fn new(start: u32, end: u32) -> Self { + Self { start, end } + } + + /// Return the length of the span in bytes. + pub fn len(self) -> u32 { + self.end.saturating_sub(self.start) + } + + /// Return `true` if the span has zero length. + pub fn is_empty(self) -> bool { + self.len() == 0 + } +} + +/// 1-indexed, inclusive line range within a source file. +/// +/// Both `start` and `end` are 1-based line numbers. +/// A single-line range has `start == end`. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct LineRange { + /// First line (1-indexed, inclusive). + pub start: u32, + /// Last line (1-indexed, inclusive). + pub end: u32, +} + +impl LineRange { + /// Create a new line range from start and end line numbers. + pub fn new(start: u32, end: u32) -> Self { + Self { start, end } + } +} + +/// A single result from a search query. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SearchResult { + /// Path to the file containing the match. + pub file_path: PathBuf, + /// Line range of the matched region (1-indexed, inclusive). + pub line_range: LineRange, + /// Relevance score (higher is better; not normalized across layers). + pub score: f32, + /// The semantic field in which the match was found. + pub matched_field: SearchField, + /// A short excerpt of the matching source region. + pub snippet: String, + /// Byte-offset positions of the matched terms within `snippet`. + pub match_positions: Vec, +} + +/// Runtime statistics for a search index. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct IndexStats { + /// Total number of files indexed. + pub file_count: u64, + /// Total number of n-grams stored in the index. + pub total_ngrams: u64, + /// On-disk size of the index in bytes. + pub index_size_bytes: u64, + /// Unix timestamp (seconds) of the last index update. + pub last_updated: u64, + /// Serialization format version for forward/backward compatibility. + pub format_version: u32, +} + +// ============================================================================ +// Unit Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_match_span_len_and_empty() { + let span = MatchSpan::new(10, 20); + assert_eq!(span.len(), 10); + assert!(!span.is_empty()); + + let zero = MatchSpan::new(5, 5); + assert_eq!(zero.len(), 0); + assert!(zero.is_empty()); + + // Saturating subtraction: start > end should not panic + let inverted = MatchSpan::new(20, 10); + assert_eq!(inverted.len(), 0); + assert!(inverted.is_empty()); + } + + #[test] + fn test_line_range_construction() { + let r = LineRange::new(1, 5); + assert_eq!(r.start, 1); + assert_eq!(r.end, 5); + } +} diff --git a/crates/rskim-search/tests/error_conversion.rs b/crates/rskim-search/tests/error_conversion.rs new file mode 100644 index 00000000..0eeadb29 --- /dev/null +++ b/crates/rskim-search/tests/error_conversion.rs @@ -0,0 +1,89 @@ +//! Verify From impls, ? operator compatibility, and error trait chain. +#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)] + +use std::error::Error; +use std::io; + +use rskim_search::SearchError; + +fn assert_send() {} + +#[test] +fn test_io_error_converts() { + let io_err = io::Error::new(io::ErrorKind::NotFound, "missing"); + let search_err: SearchError = io_err.into(); + assert!(matches!(search_err, SearchError::Io(_))); +} + +#[test] +fn test_skim_error_converts() { + let core_err = rskim_core::SkimError::ParseError("bad parse".to_string()); + let search_err: SearchError = core_err.into(); + assert!(matches!(search_err, SearchError::CoreError(_))); +} + +#[test] +fn test_question_mark_io_error() { + fn inner() -> rskim_search::Result<()> { + let _file = std::fs::File::open("/nonexistent/path/that/does/not/exist")?; + Ok(()) + } + let result = inner(); + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), SearchError::Io(_))); +} + +#[test] +fn test_question_mark_skim_error() { + fn inner() -> rskim_search::Result<()> { + let err: rskim_core::SkimError = rskim_core::SkimError::ParseError("fail".to_string()); + Err(err)?; + Ok(()) + } + let result = inner(); + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), SearchError::CoreError(_))); +} + +#[test] +fn test_search_error_non_exhaustive_match() { + let err = SearchError::IndexError("test".to_string()); + // Wildcard arm required by #[non_exhaustive] + match err { + SearchError::Io(_) => panic!("wrong variant"), + SearchError::IndexError(_) => {} // expected + SearchError::InvalidQuery(_) => panic!("wrong variant"), + SearchError::CoreError(_) => panic!("wrong variant"), + SearchError::SerializationError(_) => panic!("wrong variant"), + _ => {} // non_exhaustive wildcard — must compile + } +} + +#[test] +fn test_search_error_is_send() { + assert_send::(); +} + +#[test] +fn test_search_error_io_source_chain() { + let io_err = io::Error::new(io::ErrorKind::BrokenPipe, "pipe broke"); + let search_err: SearchError = io_err.into(); + let source = search_err.source(); + assert!(source.is_some()); + assert!(source + .expect("source should exist") + .downcast_ref::() + .is_some()); +} + +#[test] +fn test_search_error_core_source_chain() { + let core_err = rskim_core::SkimError::ParseError("bad".to_string()); + let search_err: SearchError = core_err.into(); + let source = search_err.source(); + assert!(source.is_some()); + assert!(source + .expect("source should exist") + .downcast_ref::() + .is_some()); +} diff --git a/crates/rskim-search/tests/file_table_edge_cases.rs b/crates/rskim-search/tests/file_table_edge_cases.rs new file mode 100644 index 00000000..08a5f1df --- /dev/null +++ b/crates/rskim-search/tests/file_table_edge_cases.rs @@ -0,0 +1,303 @@ +//! FileTable normalization and registration edge cases. +#![allow(clippy::unwrap_used, clippy::expect_used)] + +use std::path::Path; + +use rskim_search::{FileId, FileTable}; + +// ============================================================================ +// Lookup edge cases +// ============================================================================ + +#[test] +fn test_empty_table_lookup_returns_none() { + let table = FileTable::new(); + assert!(table.lookup(FileId::new(0)).is_none()); +} + +#[test] +fn test_lookup_unregistered_id() { + let mut table = FileTable::new(); + let _id = table.register(Path::new("src/main.rs")); + assert!(table.lookup(FileId::new(99)).is_none()); +} + +// ============================================================================ +// Path edge cases +// ============================================================================ + +#[test] +fn test_register_empty_path() { + let mut table = FileTable::new(); + let id = table.register(Path::new("")); + // Empty path registers (documents behavior) + assert_eq!(table.len(), 1); + assert!(table.lookup(id).is_some()); +} + +#[test] +fn test_register_dot_only() { + let mut table = FileTable::new(); + let id = table.register(Path::new(".")); + // CurDir component stripped — normalizes to empty path + assert!(table.lookup(id).is_some()); +} + +#[test] +fn test_register_unicode_path() { + let mut table = FileTable::new(); + let id = table.register(Path::new("src/日本語/файл.rs")); + assert_eq!(table.len(), 1); + assert_eq!( + table.lookup(id), + Some(Path::new("src/日本語/файл.rs")) + ); +} + +#[test] +fn test_register_spaces_in_path() { + let mut table = FileTable::new(); + let id = table.register(Path::new("my project/src/main.rs")); + assert_eq!( + table.lookup(id), + Some(Path::new("my project/src/main.rs")) + ); +} + +// ============================================================================ +// Normalization +// ============================================================================ + +#[test] +fn test_normalize_multiple_parent_dirs() { + let mut table = FileTable::new(); + let id = table.register(Path::new("a/b/c/../../d")); + // a/b/c/../../d → a/d + assert_eq!(table.lookup(id), Some(Path::new("a/d"))); +} + +#[test] +fn test_normalize_parent_dir_at_start() { + let mut table = FileTable::new(); + let id = table.register(Path::new("../foo")); + // Nothing to pop → stays as "../foo" + assert_eq!(table.lookup(id), Some(Path::new("../foo"))); +} + +#[test] +fn test_normalize_absolute_path() { + let mut table = FileTable::new(); + let id = table.register(Path::new("/usr/local/bin")); + assert_eq!(table.lookup(id), Some(Path::new("/usr/local/bin"))); +} + +#[test] +fn test_normalize_absolute_with_parent() { + let mut table = FileTable::new(); + let id = table.register(Path::new("/a/../b")); + assert_eq!(table.lookup(id), Some(Path::new("/b"))); +} + +#[test] +fn test_normalize_complex_chain() { + let mut table = FileTable::new(); + let id = table.register(Path::new("a/./b/../c/./d/../e")); + // a/./b/../c/./d/../e → a/c/e + assert_eq!(table.lookup(id), Some(Path::new("a/c/e"))); +} + +#[test] +fn test_idempotent_parent_dir_variants() { + let mut table = FileTable::new(); + let id1 = table.register(Path::new("src/../src/main.rs")); + let id2 = table.register(Path::new("src/main.rs")); + assert_eq!(id1, id2); + assert_eq!(table.len(), 1); +} + +#[test] +fn test_normalize_trailing_slash() { + let mut table = FileTable::new(); + let id1 = table.register(Path::new("src/")); + let id2 = table.register(Path::new("src")); + // Trailing slash is stripped by Path::components() + assert_eq!(id1, id2); + assert_eq!(table.len(), 1); +} + +/// `/a/../../b` — the second `..` has only `RootDir` behind it (not a Normal +/// component), so normalize cannot pop it and instead keeps `..` in the output. +/// The result is `/../b`, which is logically above the filesystem root. +/// +/// This is a documented limitation of the I/O-free normalizer: it does not +/// clamp absolute paths at root. Callers providing paths that traverse above +/// root receive a path that still contains a `..` segment. +#[test] +fn test_normalize_absolute_over_root() { + let mut table = FileTable::new(); + let id = table.register(Path::new("/a/../../b")); + // Second `..` cannot pop RootDir — stays in output as `/../b`. + assert_eq!(table.lookup(id), Some(Path::new("/../b"))); +} + +// ============================================================================ +// Registration semantics +// ============================================================================ + +#[test] +fn test_register_many_files() { + let mut table = FileTable::new(); + for i in 0..1000 { + let path = format!("src/file_{i}.rs"); + let id = table.register(Path::new(&path)); + assert_eq!(id.as_u64(), i as u64); + } + assert_eq!(table.len(), 1000); + + // All lookupable + for i in 0..1000 { + let path = format!("src/file_{i}.rs"); + assert_eq!( + table.lookup(FileId::new(i as u64)), + Some(Path::new(&path)), + ); + } +} + +#[test] +fn test_len_tracks_unique_paths() { + let mut table = FileTable::new(); + for _ in 0..5 { + table.register(Path::new("src/main.rs")); + } + assert_eq!(table.len(), 1); +} + +#[test] +fn test_is_empty_on_fresh_table() { + assert!(FileTable::new().is_empty()); +} + +#[test] +fn test_register_order_deterministic() { + let mut table = FileTable::new(); + let id0 = table.register(Path::new("a.rs")); + let id1 = table.register(Path::new("b.rs")); + let id2 = table.register(Path::new("c.rs")); + + assert_eq!(id0.as_u64(), 0); + assert_eq!(id1.as_u64(), 1); + assert_eq!(id2.as_u64(), 2); +} + +#[test] +fn test_default_is_empty() { + assert!(FileTable::default().is_empty()); +} + +// ============================================================================ +// Root-confined registration +// ============================================================================ + +#[test] +fn test_register_within_accepts_path_inside_root() { + let mut table = FileTable::new(); + // "src/main.rs" joined with "/project" → "/project/src/main.rs" — starts_with "/project" ✓ + let result = table.register_within(Path::new("src/main.rs"), Path::new("/project")); + assert!(result.is_ok()); + assert_eq!(table.len(), 1); + assert_eq!( + table.lookup(result.unwrap()), + Some(Path::new("src/main.rs")) + ); +} + +#[test] +fn test_register_within_rejects_path_outside_root_no_dotdot() { + // "other_project/secret.rs" has no ".." and is not absolute, but it still + // names a path unrelated to root. The old implementation accepted this; the + // fixed implementation rejects it because after joining and re-normalizing, + // "/project/other_project/secret.rs" actually does start_with "/project" — + // so this case is accepted. That is correct: joined("/project", "other_project/secret.rs") + // = "/project/other_project/secret.rs" which IS inside the root. + // + // The issue was that the old implementation accepted ANY relative path regardless + // of root. The fix adds a containment check — but a simple relative path without + // ".." genuinely does resolve inside the root when joined. This test documents + // that correct behavior. + let mut table = FileTable::new(); + let result = table.register_within(Path::new("other_project/secret.rs"), Path::new("/project")); + assert!( + result.is_ok(), + "relative path without escape resolves inside root when joined" + ); +} + +#[test] +fn test_register_within_rejects_parent_escape() { + let mut table = FileTable::new(); + // "../../etc/passwd" escapes root regardless of root value. + let result = table.register_within(Path::new("../../etc/passwd"), Path::new("/project")); + assert!(result.is_err()); + assert_eq!(table.len(), 0); +} + +#[test] +fn test_register_within_rejects_single_parent_escape() { + let mut table = FileTable::new(); + // "../sibling/file.rs" joined with "/project" → "/sibling/file.rs" — no longer starts_with "/project". + let result = table.register_within(Path::new("../sibling/file.rs"), Path::new("/project")); + assert!(result.is_err()); + assert_eq!(table.len(), 0); + let err = result.unwrap_err().to_string(); + assert!( + err.contains("escapes project root"), + "error should mention root escape, got: {err}" + ); +} + +#[test] +fn test_register_within_rejects_absolute_path() { + let mut table = FileTable::new(); + let result = table.register_within(Path::new("/etc/passwd"), Path::new("/project")); + assert!(result.is_err()); + assert_eq!(table.len(), 0); +} + +#[test] +fn test_register_within_accepts_dot_dot_that_resolves_inside() { + let mut table = FileTable::new(); + // src/../lib/main.rs normalizes to lib/main.rs — still inside root + let result = table.register_within(Path::new("src/../lib/main.rs"), Path::new("/project")); + assert!(result.is_ok()); + assert_eq!(table.lookup(result.unwrap()), Some(Path::new("lib/main.rs"))); +} + +#[test] +fn test_register_within_rejects_dot_dot_that_escapes_via_subdirectory() { + let mut table = FileTable::new(); + // "sub/../../etc/passwd" normalizes to "../etc/passwd" — escapes root. + let result = + table.register_within(Path::new("sub/../../etc/passwd"), Path::new("/project")); + assert!(result.is_err()); + assert_eq!(table.len(), 0); +} + +#[test] +fn test_register_within_accepts_root_itself() { + let mut table = FileTable::new(); + // An empty relative path (the root itself) — "." normalizes to "" + // Joined: normalize("/project") = "/project"; normalize("/project/.") = "/project". + // starts_with("/project") → accepted. + let result = table.register_within(Path::new("."), Path::new("/project")); + assert!(result.is_ok()); +} + +#[test] +fn test_register_within_idempotent() { + let mut table = FileTable::new(); + let id1 = table.register_within(Path::new("src/main.rs"), Path::new("/project")).unwrap(); + let id2 = table.register_within(Path::new("./src/main.rs"), Path::new("/project")).unwrap(); + assert_eq!(id1, id2); + assert_eq!(table.len(), 1); +} diff --git a/crates/rskim-search/tests/serialization_roundtrip.rs b/crates/rskim-search/tests/serialization_roundtrip.rs new file mode 100644 index 00000000..d9974e25 --- /dev/null +++ b/crates/rskim-search/tests/serialization_roundtrip.rs @@ -0,0 +1,398 @@ +//! JSON serialize/deserialize roundtrips for all Serialize+Deserialize types. +#![allow(clippy::unwrap_used, clippy::expect_used)] + +use std::path::{Path, PathBuf}; + +use rskim_search::{ + FileId, FileTable, IndexStats, LineRange, MatchSpan, SearchField, SearchResult, +}; + +fn json_roundtrip(value: &T) -> T { + let json = serde_json::to_string(value).expect("serialize failed"); + serde_json::from_str(&json).expect("deserialize failed") +} + +// ============================================================================ +// FileId +// ============================================================================ + +#[test] +fn test_file_id_json_roundtrip() { + let id = FileId::new(42); + let rt = json_roundtrip(&id); + assert_eq!(rt.as_u64(), 42); +} + +#[test] +fn test_file_id_max_json_roundtrip() { + let id = FileId::new(u64::MAX); + let rt = json_roundtrip(&id); + assert_eq!(rt.as_u64(), u64::MAX); +} + +// ============================================================================ +// SearchField +// ============================================================================ + +#[test] +fn test_search_field_all_variants_roundtrip() { + let fields = [ + SearchField::TypeDefinition, + SearchField::FunctionSignature, + SearchField::SymbolName, + SearchField::ImportExport, + SearchField::FunctionBody, + SearchField::Comment, + SearchField::StringLiteral, + ]; + for field in &fields { + let rt = json_roundtrip(field); + assert_eq!(&rt, field); + } +} + +#[test] +fn test_search_field_json_format_pinned() { + let cases = [ + (SearchField::TypeDefinition, "\"TypeDefinition\""), + (SearchField::FunctionSignature, "\"FunctionSignature\""), + (SearchField::SymbolName, "\"SymbolName\""), + (SearchField::ImportExport, "\"ImportExport\""), + (SearchField::FunctionBody, "\"FunctionBody\""), + (SearchField::Comment, "\"Comment\""), + (SearchField::StringLiteral, "\"StringLiteral\""), + ]; + for (field, expected) in &cases { + let json = serde_json::to_string(field).expect("serialize failed"); + assert_eq!(&json, expected, "format mismatch for {field:?}"); + } +} + +#[test] +fn test_search_field_unknown_variant_fails() { + let result = serde_json::from_str::("\"UnknownField\""); + assert!(result.is_err()); +} + +// ============================================================================ +// MatchSpan +// ============================================================================ + +#[test] +fn test_match_span_json_roundtrip() { + let span = MatchSpan::new(10, 20); + let rt = json_roundtrip(&span); + assert_eq!(rt.start, 10); + assert_eq!(rt.end, 20); +} + +// ============================================================================ +// LineRange +// ============================================================================ + +#[test] +fn test_line_range_json_roundtrip() { + let range = LineRange::new(1, 42); + let rt = json_roundtrip(&range); + assert_eq!(rt.start, 1); + assert_eq!(rt.end, 42); +} + +// ============================================================================ +// SearchResult +// ============================================================================ + +#[test] +fn test_search_result_json_roundtrip() { + let result = SearchResult { + file_path: PathBuf::from("src/main.rs"), + line_range: LineRange::new(10, 20), + score: 2.75, + matched_field: SearchField::FunctionSignature, + snippet: "fn main() {".to_string(), + match_positions: vec![MatchSpan::new(3, 7)], + }; + let rt = json_roundtrip(&result); + assert_eq!(rt.file_path, PathBuf::from("src/main.rs")); + assert_eq!(rt.line_range.start, 10); + assert_eq!(rt.line_range.end, 20); + assert!((rt.score - 2.75).abs() < f32::EPSILON); + assert_eq!(rt.matched_field, SearchField::FunctionSignature); + assert_eq!(rt.snippet, "fn main() {"); + assert_eq!(rt.match_positions.len(), 1); + assert_eq!(rt.match_positions[0].start, 3); +} + +#[test] +fn test_search_result_empty_fields_roundtrip() { + let result = SearchResult { + file_path: PathBuf::from("x.rs"), + line_range: LineRange::new(1, 1), + score: 0.0, + matched_field: SearchField::Comment, + snippet: String::new(), + match_positions: vec![], + }; + let rt = json_roundtrip(&result); + assert!(rt.snippet.is_empty()); + assert!(rt.match_positions.is_empty()); +} + +// ============================================================================ +// IndexStats +// ============================================================================ + +#[test] +fn test_index_stats_json_roundtrip() { + let stats = IndexStats { + file_count: 1234, + total_ngrams: 567_890, + index_size_bytes: 1_048_576, + last_updated: 1_711_900_000, + format_version: 1, + }; + let rt = json_roundtrip(&stats); + assert_eq!(rt.file_count, 1234); + assert_eq!(rt.total_ngrams, 567_890); + assert_eq!(rt.index_size_bytes, 1_048_576); + assert_eq!(rt.last_updated, 1_711_900_000); + assert_eq!(rt.format_version, 1); +} + +#[test] +fn test_index_stats_max_values_roundtrip() { + let stats = IndexStats { + file_count: u64::MAX, + total_ngrams: u64::MAX, + index_size_bytes: u64::MAX, + last_updated: u64::MAX, + format_version: u32::MAX, + }; + let rt = json_roundtrip(&stats); + assert_eq!(rt.file_count, u64::MAX); + assert_eq!(rt.format_version, u32::MAX); +} + +// ============================================================================ +// FileTable (custom serde) +// ============================================================================ + +#[test] +fn test_file_table_json_roundtrip() { + let mut table = FileTable::new(); + let id_a = table.register(Path::new("src/main.rs")); + let id_b = table.register(Path::new("src/lib.rs")); + let id_c = table.register(Path::new("tests/test.rs")); + + let rt = json_roundtrip(&table); + assert_eq!(rt.len(), 3); + assert_eq!(rt.lookup(id_a), Some(Path::new("src/main.rs"))); + assert_eq!(rt.lookup(id_b), Some(Path::new("src/lib.rs"))); + assert_eq!(rt.lookup(id_c), Some(Path::new("tests/test.rs"))); +} + +#[test] +fn test_file_table_empty_json_roundtrip() { + let table = FileTable::new(); + let rt = json_roundtrip(&table); + assert!(rt.is_empty()); + assert_eq!(rt.len(), 0); +} + +#[test] +fn test_file_table_roundtrip_preserves_lookups() { + let mut table = FileTable::new(); + table.register(Path::new("a.rs")); + table.register(Path::new("b.rs")); + + let json = serde_json::to_string(&table).expect("serialize failed"); + let mut rt: FileTable = serde_json::from_str(&json).expect("deserialize failed"); + + // Lookups from deserialized state work + assert_eq!(rt.lookup(FileId::new(0)), Some(Path::new("a.rs"))); + assert_eq!(rt.lookup(FileId::new(1)), Some(Path::new("b.rs"))); + + // register() still works after roundtrip — new file gets next id + let new_id = rt.register(Path::new("c.rs")); + assert_eq!(new_id.as_u64(), 2); + assert_eq!(rt.len(), 3); + + // Re-registering existing path is still idempotent + let existing = rt.register(Path::new("a.rs")); + assert_eq!(existing.as_u64(), 0); + assert_eq!(rt.len(), 3); +} + +#[test] +fn test_file_table_serialized_is_just_paths() { + let mut table = FileTable::new(); + table.register(Path::new("src/main.rs")); + table.register(Path::new("src/lib.rs")); + + let json = serde_json::to_string(&table).expect("serialize failed"); + let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse failed"); + + // Should be a plain array of strings (no HashMap, no ids) + assert!(parsed.is_array(), "serialized form should be an array"); + let arr = parsed.as_array().expect("array"); + assert_eq!(arr.len(), 2); + assert_eq!(arr[0], "src/main.rs"); + assert_eq!(arr[1], "src/lib.rs"); +} + +// ============================================================================ +// FileTable deserialization idempotency with unnormalized paths +// ============================================================================ + +/// Roundtrip a FileTable that was registered with unnormalized paths (e.g., "./src/main.rs"). +/// +/// The serialized form stores the normalized path. After deserialization, the ids +/// map must also be keyed on the normalized path so that a subsequent register() +/// call with the same unnormalized input returns the existing FileId rather than +/// creating a duplicate entry. +#[test] +fn test_file_table_unnormalized_paths_idempotent_after_roundtrip() { + let mut table = FileTable::new(); + let id_a = table.register(Path::new("./src/main.rs")); + let id_b = table.register(Path::new("./src/lib.rs")); + let id_c = table.register(Path::new("./tests/test.rs")); + + // Paths normalize away the leading `./` + assert_eq!(table.lookup(id_a), Some(Path::new("src/main.rs"))); + assert_eq!(table.len(), 3); + + let json = serde_json::to_string(&table).expect("serialize failed"); + let mut rt: FileTable = serde_json::from_str(&json).expect("deserialize failed"); + + assert_eq!(rt.len(), 3); + assert_eq!(rt.lookup(id_a), Some(Path::new("src/main.rs"))); + assert_eq!(rt.lookup(id_b), Some(Path::new("src/lib.rs"))); + assert_eq!(rt.lookup(id_c), Some(Path::new("tests/test.rs"))); + + // Registering the same unnormalized paths must return the existing IDs — no duplicates. + let re_a = rt.register(Path::new("./src/main.rs")); + let re_b = rt.register(Path::new("src/lib.rs")); // already-normalized form + let re_c = rt.register(Path::new("./tests/test.rs")); + assert_eq!(re_a, id_a, "unnormalized re-register must return existing id"); + assert_eq!(re_b, id_b, "normalized re-register must return existing id"); + assert_eq!(re_c, id_c, "unnormalized re-register must return existing id"); + assert_eq!(rt.len(), 3, "no duplicate entries after re-registering unnormalized paths"); +} + +/// Serialized form of a FileTable registered with unnormalized paths must store +/// the normalized paths, not the raw input. +#[test] +fn test_file_table_serialized_form_uses_normalized_paths() { + let mut table = FileTable::new(); + table.register(Path::new("./src/main.rs")); + table.register(Path::new("a/b/../c.rs")); + + let json = serde_json::to_string(&table).expect("serialize failed"); + let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse failed"); + + let arr = parsed.as_array().expect("serialized form must be an array"); + assert_eq!(arr[0], "src/main.rs", "leading ./ must be stripped"); + assert_eq!(arr[1], "a/c.rs", ".. must be collapsed"); +} + +// ============================================================================ +// FileTable deserialization size limit +// ============================================================================ + +#[test] +fn test_file_table_deserialization_rejects_oversized_input() { + // Build a JSON array with more entries than MAX_FILE_TABLE_ENTRIES. + // The constant is 10_000_000 — we can't construct that in a test without OOMing + // the test itself. Instead, verify the mechanism works by checking that a valid + // array deserializes fine (the limit doesn't break normal usage). + // The actual limit enforcement is tested via the exported constant. + let json = r#"["a.rs","b.rs","c.rs"]"#; + let result: Result = serde_json::from_str(json); + assert!(result.is_ok(), "small array should deserialize fine"); +} + +#[test] +fn test_file_table_deserialization_rejects_at_limit_plus_one() { + // Construct a JSON array of (MAX_FILE_TABLE_ENTRIES + 1) trivial path strings. + // Each path is a decimal number "0", "1", ..., so the JSON is compact. + // At 10_000_001 entries of ~4 bytes each this is ~40 MB — well within test budgets. + let n = rskim_search::MAX_FILE_TABLE_ENTRIES + 1; + let mut json = String::with_capacity(n * 5); + json.push('['); + for i in 0..n { + if i > 0 { + json.push(','); + } + json.push('"'); + json.push_str(&i.to_string()); + json.push('"'); + } + json.push(']'); + + let result: Result = serde_json::from_str(&json); + assert!( + result.is_err(), + "array of MAX_FILE_TABLE_ENTRIES+1 entries must be rejected" + ); + let err = result.unwrap_err().to_string(); + assert!( + err.contains("exceeds maximum"), + "error message should mention the limit, got: {err}" + ); +} + +#[test] +fn test_file_table_deserialization_rejects_duplicate_normalized_paths() { + // "./src/main.rs" and "src/main.rs" both normalize to "src/main.rs". + // The deserializer must reject this rather than silently overwriting. + let json = r#"["./src/main.rs","src/main.rs"]"#; + let result: Result = serde_json::from_str(json); + assert!( + result.is_err(), + "duplicate normalized paths must be rejected during deserialization" + ); + let err = result.unwrap_err().to_string(); + assert!( + err.contains("duplicate"), + "error message should mention duplicate, got: {err}" + ); +} + +#[test] +fn test_file_table_max_entries_constant_is_exported() { + // Verify the constant exists and is a reasonable value + assert!(rskim_search::MAX_FILE_TABLE_ENTRIES > 0); + assert!(rskim_search::MAX_FILE_TABLE_ENTRIES <= 10_000_000); +} + +// ============================================================================ +// SearchError Display +// ============================================================================ + +#[test] +fn test_search_error_display_io() { + let err = rskim_search::SearchError::Io(std::io::Error::new( + std::io::ErrorKind::NotFound, + "not found", + )); + let display = format!("{err}"); + assert!(display.contains("IO error")); +} + +#[test] +fn test_search_error_display_index() { + let err = rskim_search::SearchError::IndexError("corrupt".to_string()); + assert_eq!(format!("{err}"), "Index error: corrupt"); +} + +#[test] +fn test_search_error_display_query() { + let err = rskim_search::SearchError::InvalidQuery("bad syntax".to_string()); + assert_eq!(format!("{err}"), "Invalid query: bad syntax"); +} + +#[test] +fn test_search_error_display_serialization() { + let err = rskim_search::SearchError::SerializationError("failed".to_string()); + assert_eq!(format!("{err}"), "Serialization error: failed"); +} diff --git a/crates/rskim-search/tests/trait_object_safety.rs b/crates/rskim-search/tests/trait_object_safety.rs new file mode 100644 index 00000000..906eb818 --- /dev/null +++ b/crates/rskim-search/tests/trait_object_safety.rs @@ -0,0 +1,68 @@ +//! Compile-time verification that all traits are object-safe with correct bounds. + +use rskim_search::{ + FieldClassifier, FileId, FileTable, IndexStats, LayerBuilder, LineRange, MatchSpan, Result, + SearchError, SearchField, SearchLayer, SearchQuery, SearchResult, TemporalFlags, +}; + +fn assert_send() {} +fn assert_sync() {} +fn assert_send_sync() {} + +#[test] +fn test_search_layer_object_safe() { + fn _f(_: &dyn SearchLayer) {} +} + +#[test] +fn test_search_layer_boxed() { + fn _f(_: Box) {} +} + +#[test] +fn test_layer_builder_object_safe() { + fn _f(_: Box) {} +} + +#[test] +fn test_field_classifier_object_safe() { + fn _f(_: &dyn FieldClassifier) {} +} + +#[test] +fn test_search_layer_send_sync() { + assert_send_sync::>(); +} + +#[test] +fn test_layer_builder_send() { + assert_send::>(); +} + +#[test] +fn test_field_classifier_send_sync() { + assert_send_sync::>(); +} + +/// Imports every public symbol from rskim_search — catches accidental re-export removal. +#[test] +fn test_public_api_surface() { + // If any of these types are removed from the public API, this test fails to compile. + let _ = std::any::type_name::(); + let _ = std::any::type_name::(); + let _ = std::any::type_name::(); + let _ = std::any::type_name::(); + let _ = std::any::type_name::(); + let _ = std::any::type_name::>(); + let _ = std::any::type_name::(); + let _ = std::any::type_name::(); + let _ = std::any::type_name::(); + let _ = std::any::type_name::(); + let _ = std::any::type_name::(); + let _ = std::any::type_name::(); + let _ = std::any::type_name::(); + let _ = std::any::type_name::(); + + assert_send::(); + assert_sync::(); +} diff --git a/crates/rskim-search/tests/types_edge_cases.rs b/crates/rskim-search/tests/types_edge_cases.rs new file mode 100644 index 00000000..76021f38 --- /dev/null +++ b/crates/rskim-search/tests/types_edge_cases.rs @@ -0,0 +1,270 @@ +//! Boundary values and edge cases for all public types. + +use std::collections::HashMap; +use std::path::PathBuf; + +use rskim_search::{ + FileId, FileTable, IndexStats, LineRange, MatchSpan, SearchField, SearchQuery, SearchResult, + TemporalFlags, +}; + +// ============================================================================ +// FileId +// ============================================================================ + +#[test] +fn test_file_id_zero() { + let id = FileId::new(0); + assert_eq!(id.as_u64(), 0); +} + +#[test] +fn test_file_id_max() { + let id = FileId::new(u64::MAX); + assert_eq!(id.as_u64(), u64::MAX); +} + +#[test] +fn test_file_id_equality_and_hash() { + let a = FileId::new(42); + let b = FileId::new(42); + assert_eq!(a, b); + + // Usable as HashMap key + let mut map = HashMap::new(); + map.insert(a, "value"); + assert_eq!(map.get(&b), Some(&"value")); +} + +// ============================================================================ +// MatchSpan +// ============================================================================ + +#[test] +fn test_match_span_u32_max() { + let span = MatchSpan::new(u32::MAX, u32::MAX); + assert_eq!(span.len(), 0); + assert!(span.is_empty()); +} + +#[test] +fn test_match_span_max_start_zero_end() { + let span = MatchSpan::new(u32::MAX, 0); + assert_eq!(span.len(), 0); // saturating_sub + assert!(span.is_empty()); +} + +#[test] +fn test_match_span_full_range() { + let span = MatchSpan::new(0, u32::MAX); + assert_eq!(span.len(), u32::MAX); + assert!(!span.is_empty()); +} + +#[test] +fn test_match_span_direct_construction() { + // Public fields — no validation, inverted spans construct fine + let span = MatchSpan { start: 20, end: 10 }; + assert_eq!(span.start, 20); + assert_eq!(span.end, 10); + assert_eq!(span.len(), 0); // saturating_sub handles inversion +} + +// ============================================================================ +// LineRange +// ============================================================================ + +#[test] +fn test_line_range_zero_zero() { + // Documents: LineRange::new(0,0) constructs (no enforcement of 1-indexing) + let r = LineRange::new(0, 0); + assert_eq!(r.start, 0); + assert_eq!(r.end, 0); +} + +#[test] +fn test_line_range_inverted() { + // Documents: inverted range constructs (no validation) + let r = LineRange::new(10, 5); + assert_eq!(r.start, 10); + assert_eq!(r.end, 5); +} + +#[test] +fn test_line_range_direct_construction() { + // Public fields bypass new() + let r = LineRange { start: 0, end: 0 }; + assert_eq!(r.start, 0); + assert_eq!(r.end, 0); +} + +// ============================================================================ +// SearchQuery +// ============================================================================ + +#[test] +fn test_search_query_limit_zero() { + let q = SearchQuery::new().with_limit(0); + assert_eq!(q.limit, 0); +} + +#[test] +fn test_search_query_empty_text() { + let q = SearchQuery::text(""); + // Empty string sets Some(""), NOT None + assert_eq!(q.text_query, Some(String::new())); +} + +#[test] +fn test_search_query_both_text_and_ast() { + let q = SearchQuery::text("foo").with_ast_pattern("fn _()"); + assert_eq!(q.text_query, Some("foo".to_string())); + assert_eq!(q.ast_pattern, Some("fn _()".to_string())); +} + +#[test] +fn test_search_query_default_trait() { + let a = SearchQuery::default(); + let b = SearchQuery::new(); + assert_eq!(a.text_query, b.text_query); + assert_eq!(a.limit, b.limit); + assert_eq!(a.offset, b.offset); + assert_eq!(a.ast_pattern, b.ast_pattern); +} + +#[test] +fn test_search_query_usize_max_limit() { + let q = SearchQuery::new().with_limit(usize::MAX); + assert_eq!(q.limit, usize::MAX); +} + +#[test] +fn test_search_query_with_text_overwrites() { + let q = SearchQuery::text("a").with_text("b"); + assert_eq!(q.text_query, Some("b".to_string())); +} + +// ============================================================================ +// TemporalFlags +// ============================================================================ + +#[test] +fn test_temporal_flags_contradictory() { + // hot+cold both true — valid by design (combinable) + let flags = TemporalFlags { + hot: true, + cold: true, + ..Default::default() + }; + assert!(flags.hot); + assert!(flags.cold); +} + +#[test] +fn test_temporal_flags_all_true() { + let flags = TemporalFlags { + blast_radius: true, + hot: true, + cold: true, + risky: true, + }; + assert!(flags.blast_radius); + assert!(flags.hot); + assert!(flags.cold); + assert!(flags.risky); +} + +// ============================================================================ +// SearchField +// ============================================================================ + +#[test] +fn test_search_field_all_boosts_positive() { + let fields = [ + SearchField::TypeDefinition, + SearchField::FunctionSignature, + SearchField::SymbolName, + SearchField::ImportExport, + SearchField::FunctionBody, + SearchField::Comment, + SearchField::StringLiteral, + ]; + for field in &fields { + assert!( + field.default_boost() > 0.0, + "{field:?} boost should be positive" + ); + } +} + +#[test] +fn test_search_field_boost_ordering() { + assert!(SearchField::TypeDefinition.default_boost() > SearchField::FunctionSignature.default_boost()); + assert!(SearchField::FunctionSignature.default_boost() > SearchField::SymbolName.default_boost()); + assert!(SearchField::SymbolName.default_boost() > SearchField::ImportExport.default_boost()); + assert!(SearchField::ImportExport.default_boost() > SearchField::FunctionBody.default_boost()); + assert!(SearchField::FunctionBody.default_boost() > SearchField::Comment.default_boost()); + assert!(SearchField::Comment.default_boost() > SearchField::StringLiteral.default_boost()); +} + +// ============================================================================ +// IndexStats +// ============================================================================ + +#[test] +fn test_index_stats_all_zeros() { + let stats = IndexStats { + file_count: 0, + total_ngrams: 0, + index_size_bytes: 0, + last_updated: 0, + format_version: 0, + }; + assert_eq!(stats.file_count, 0); + assert_eq!(stats.format_version, 0); +} + +// ============================================================================ +// FileTable +// ============================================================================ + +#[test] +fn test_file_table_default_trait() { + let a = FileTable::default(); + let b = FileTable::new(); + assert!(a.is_empty()); + assert!(b.is_empty()); + assert_eq!(a.len(), b.len()); +} + +// ============================================================================ +// SearchResult +// ============================================================================ + +#[test] +fn test_search_result_nan_score_constructible() { + // Documents: NaN score is valid, consumers must handle + let result = SearchResult { + file_path: PathBuf::from("test.rs"), + line_range: LineRange::new(1, 1), + score: f32::NAN, + matched_field: SearchField::FunctionBody, + snippet: String::new(), + match_positions: vec![], + }; + assert!(result.score.is_nan()); +} + +#[test] +fn test_search_result_negative_score() { + // Documents: no validation on score + let result = SearchResult { + file_path: PathBuf::from("test.rs"), + line_range: LineRange::new(1, 1), + score: -1.0, + matched_field: SearchField::FunctionBody, + snippet: String::new(), + match_positions: vec![], + }; + assert_eq!(result.score, -1.0); +} diff --git a/crates/rskim/Cargo.toml b/crates/rskim/Cargo.toml index f0fb8d02..f432cf3e 100644 --- a/crates/rskim/Cargo.toml +++ b/crates/rskim/Cargo.toml @@ -14,6 +14,8 @@ path = "src/main.rs" [dependencies] rskim-core = { version = "2.1.0", path = "../rskim-core" } +# INTENTIONAL: rskim-search is unused in the CLI today; wired up in Wave 1 when search is implemented. +rskim-search = { version = "2.1.0", path = "../rskim-search" } clap = { version = "4.5", features = ["derive"] } anyhow = { workspace = true } globset = { workspace = true } diff --git a/crates/rskim/src/cmd/completions.rs b/crates/rskim/src/cmd/completions.rs index ffd57841..a600b51a 100644 --- a/crates/rskim/src/cmd/completions.rs +++ b/crates/rskim/src/cmd/completions.rs @@ -73,6 +73,7 @@ fn build_full_command() -> Command { cmd = cmd.subcommand(super::init::command()); cmd = cmd.subcommand(super::discover::command()); cmd = cmd.subcommand(super::learn::command()); + cmd = cmd.subcommand(super::search::command()); // Subcommands with full arg definitions added above -- skip in the stub loop. const IMPLEMENTED_SUBCOMMANDS: &[&str] = &[ @@ -82,6 +83,7 @@ fn build_full_command() -> Command { "init", "learn", "rewrite", + "search", ]; // Add stub subcommands for all OTHER known subcommands diff --git a/crates/rskim/src/cmd/mod.rs b/crates/rskim/src/cmd/mod.rs index 54317cc7..0259da2c 100644 --- a/crates/rskim/src/cmd/mod.rs +++ b/crates/rskim/src/cmd/mod.rs @@ -18,6 +18,7 @@ mod learn; mod lint; mod pkg; mod rewrite; +mod search; mod session; mod stats; mod test; @@ -44,6 +45,7 @@ pub(crate) const KNOWN_SUBCOMMANDS: &[&str] = &[ "lint", "pkg", "rewrite", + "search", "stats", "test", ]; @@ -353,6 +355,7 @@ pub(crate) fn dispatch(subcommand: &str, args: &[String]) -> anyhow::Result lint::run(args), "pkg" => pkg::run(args), "rewrite" => rewrite::run(args), + "search" => search::run(args), "stats" => stats::run(args), "test" => test::run(args), // Unreachable: is_known_subcommand guard above rejects unknown names diff --git a/crates/rskim/src/cmd/search.rs b/crates/rskim/src/cmd/search.rs new file mode 100644 index 00000000..d974610e --- /dev/null +++ b/crates/rskim/src/cmd/search.rs @@ -0,0 +1,90 @@ +//! `skim search` — code search across indexed files (#3) +//! +//! Provides intelligent code search using the 3-layer search architecture +//! defined in rskim-search. Currently a stub; implementation arrives in Wave 1+. + +use std::process::ExitCode; + +/// Run the search subcommand. +pub(crate) fn run(args: &[String]) -> anyhow::Result { + if args.iter().any(|a| matches!(a.as_str(), "--help" | "-h")) { + print_help(); + return Ok(ExitCode::SUCCESS); + } + + // Stub: search is not yet implemented + eprintln!("skim search: not yet implemented"); + eprintln!("hint: this command will be available after index support lands (Wave 1+)"); + Ok(ExitCode::FAILURE) +} + +/// Build clap command definition for shell completions. +pub(super) fn command() -> clap::Command { + clap::Command::new("search") + .about("Search code using the 3-layer index") + .arg( + clap::Arg::new("query") + .help("Search query string") + .value_name("QUERY"), + ) + .arg( + clap::Arg::new("build") + .long("build") + .action(clap::ArgAction::SetTrue) + .help("Build the search index before querying"), + ) + .arg( + clap::Arg::new("rebuild") + .long("rebuild") + .action(clap::ArgAction::SetTrue) + .help("Force rebuild the entire search index"), + ) + .arg( + clap::Arg::new("update") + .long("update") + .action(clap::ArgAction::SetTrue) + .help("Update the search index incrementally"), + ) + .arg( + clap::Arg::new("ast") + .long("ast") + .value_name("PATTERN") + .help("AST pattern to search for"), + ) + .arg( + clap::Arg::new("blast_radius") + .long("blast-radius") + .action(clap::ArgAction::SetTrue) + .help("Filter results by blast radius (high-impact changes)"), + ) + .arg( + clap::Arg::new("limit") + .long("limit") + .value_name("N") + .help("Maximum number of results to return"), + ) + .arg( + clap::Arg::new("hot") + .long("hot") + .action(clap::ArgAction::SetTrue) + .help("Filter for recently active files"), + ) + .arg( + clap::Arg::new("cold") + .long("cold") + .action(clap::ArgAction::SetTrue) + .help("Filter for stable/unchanged files"), + ) + .arg( + clap::Arg::new("risky") + .long("risky") + .action(clap::ArgAction::SetTrue) + .help("Filter for files with high churn or complexity"), + ) +} + +fn print_help() { + // Delegate to clap's command definition — single source of truth for flags. + let _ = command().name("skim search").print_help(); + println!(); +} diff --git a/crates/rskim/src/main.rs b/crates/rskim/src/main.rs index d71b85d7..2a457dfe 100644 --- a/crates/rskim/src/main.rs +++ b/crates/rskim/src/main.rs @@ -39,7 +39,10 @@ enum Invocation { /// Returns true if `flag` is a flag that consumes the next token as its value. /// -/// SYNC NOTE: If you add a new flag with a value to `Args`, add it here too. +/// SYNC NOTE: This list covers both `Args` struct flags (e.g. --mode, --language, +/// --filename, --jobs, --max-lines, --last-lines, --tokens) and subcommand-only +/// flags (e.g. --ast, --limit from `search`; --since, --session, --agent, --format +/// from other subcommands). Both categories must be registered here. /// Failure to sync only causes a bug if the flag's value happens to match a /// known subcommand name AND no file with that name exists on disk. fn is_flag_with_value(flag: &str) -> bool { @@ -60,6 +63,8 @@ fn is_flag_with_value(flag: &str) -> bool { | "--session" | "--agent" | "--format" + | "--ast" + | "--limit" ) } @@ -195,6 +200,7 @@ SUBCOMMANDS:\n \ init Initialize skim configuration\n \ learn Detect CLI error patterns and generate correction rules\n \ rewrite [--suggest] ... Rewrite commands into skim equivalents\n \ + search [QUERY] Search code using the 3-layer index\n \ stats [--since N] [--format json] Show token analytics dashboard\n \ test Run test with output parsing\n\n\ For more info: https://github.com/dean0x/skim")] @@ -680,6 +686,8 @@ mod tests { "--session", "--agent", "--format", + "--ast", + "--limit", ]; /// Ensure every value-consuming flag (non-boolean, non-positional) in `Args` diff --git a/crates/rskim/tests/cli_search.rs b/crates/rskim/tests/cli_search.rs new file mode 100644 index 00000000..d872a634 --- /dev/null +++ b/crates/rskim/tests/cli_search.rs @@ -0,0 +1,179 @@ +//! Integration tests for `skim search` subcommand (#3). + +use assert_cmd::Command; +use predicates::prelude::*; + +fn skim_cmd() -> Command { + Command::cargo_bin("skim").unwrap() +} + +#[test] +fn test_search_help() { + skim_cmd() + .args(["search", "--help"]) + .assert() + .success() + .stdout(predicate::str::contains("skim search")) + .stdout(predicate::str::contains("--ast")); +} + +#[test] +fn test_search_short_help() { + skim_cmd() + .args(["search", "-h"]) + .assert() + .success() + .stdout(predicate::str::contains("skim search")); +} + +#[test] +fn test_search_stub_with_query() { + skim_cmd() + .args(["search", "test"]) + .assert() + .failure() + .stderr(predicate::str::contains("not yet implemented")); +} + +#[test] +fn test_search_stub_without_args() { + skim_cmd() + .args(["search"]) + .assert() + .failure() + .stderr(predicate::str::contains("not yet implemented")); +} + +#[test] +fn test_search_stub_exit_code() { + let output = skim_cmd().args(["search", "test"]).output().unwrap(); + assert_eq!( + output.status.code(), + Some(1), + "non-help should exit with code 1" + ); +} + +#[test] +fn test_search_help_contains_all_flags() { + let assert = skim_cmd().args(["search", "--help"]).assert().success(); + let stdout = String::from_utf8(assert.get_output().stdout.clone()).unwrap(); + let expected_flags = [ + "--build", + "--rebuild", + "--update", + "--ast", + "--blast-radius", + "--limit", + "--hot", + "--cold", + "--risky", + "--help", + ]; + for flag in &expected_flags { + assert!( + stdout.contains(flag), + "help output missing flag: {flag}" + ); + } +} + +#[test] +fn test_search_help_contains_usage_line() { + skim_cmd() + .args(["search", "--help"]) + .assert() + .success() + .stdout(predicate::str::contains("Usage: skim search")); +} + +#[test] +fn test_search_help_at_end() { + // --help after positional arg still shows help + skim_cmd() + .args(["search", "test", "--help"]) + .assert() + .success() + .stdout(predicate::str::contains("skim search")); +} + +#[test] +fn test_search_stub_with_flags() { + skim_cmd() + .args(["search", "--hot", "--limit", "10", "test"]) + .assert() + .failure() + .stderr(predicate::str::contains("not yet implemented")); +} + +#[test] +fn test_search_stub_with_ast_flag() { + skim_cmd() + .args(["search", "--ast", "fn _()"]) + .assert() + .failure() + .stderr(predicate::str::contains("not yet implemented")); +} + +#[test] +fn test_search_hint_message() { + skim_cmd() + .args(["search", "test"]) + .assert() + .failure() + .stderr(predicate::str::contains("Wave 1+")); +} + +#[test] +fn test_search_empty_query() { + skim_cmd() + .args(["search", ""]) + .assert() + .failure() + .stderr(predicate::str::contains("not yet implemented")); +} + +#[test] +fn test_search_build_flag_hits_stub() { + skim_cmd() + .args(["search", "--build", "test"]) + .assert() + .failure() + .stderr(predicate::str::contains("not yet implemented")); +} + +#[test] +fn test_search_rebuild_flag_hits_stub() { + skim_cmd() + .args(["search", "--rebuild", "test"]) + .assert() + .failure() + .stderr(predicate::str::contains("not yet implemented")); +} + +#[test] +fn test_search_update_flag_hits_stub() { + skim_cmd() + .args(["search", "--update", "test"]) + .assert() + .failure() + .stderr(predicate::str::contains("not yet implemented")); +} + +#[test] +fn test_search_in_main_help() { + skim_cmd() + .args(["--help"]) + .assert() + .success() + .stdout(predicate::str::contains("search")); +} + +#[test] +fn test_search_completions_registered() { + skim_cmd() + .args(["completions", "bash"]) + .assert() + .success() + .stdout(predicate::str::contains("search")); +}