|
| 1 | +//! Tantivy full-text search backend — optional feature. |
| 2 | +//! |
| 3 | +//! Provides a PyO3-exposed `TantivyIndex` class wrapping a Tantivy |
| 4 | +//! index for BM25-quality full-text search with sub-100ms query latency. |
| 5 | +//! Documents are code chunks (file_path, content, language, line range). |
| 6 | +
|
| 7 | +#[cfg(feature = "tantivy-backend")] |
| 8 | +use pyo3::prelude::*; |
| 9 | + |
| 10 | +#[cfg(feature = "tantivy-backend")] |
| 11 | +use tantivy::{ |
| 12 | + collector::TopDocs, |
| 13 | + doc, |
| 14 | + query::QueryParser, |
| 15 | + schema::{Field, Schema, STORED, TEXT}, |
| 16 | + Index, IndexReader, IndexWriter, ReloadPolicy, |
| 17 | +}; |
| 18 | + |
| 19 | +#[cfg(feature = "tantivy-backend")] |
| 20 | +use std::path::PathBuf; |
| 21 | + |
/// Full-text search index over code chunks, backed by Tantivy and exposed to
/// Python as a class through PyO3.
///
/// Constructed from Python as `TantivyIndex(directory)`. The index files live
/// in that directory, so the contents persist across instances.
#[cfg(feature = "tantivy-backend")]
#[pyclass]
pub struct TantivyIndex {
    // --- Tantivy resources -------------------------------------------------
    /// The underlying Tantivy index; writers are created from it.
    index: Index,
    /// Reader used to obtain searchers for queries and document counts.
    reader: IndexReader,

    // --- Bookkeeping -------------------------------------------------------
    /// Schema the index was opened or created with.
    schema: Schema,
    /// Filesystem directory that holds the index files.
    index_dir: PathBuf,

    // --- Schema field handles (one per stored chunk attribute) -------------
    /// Handle for the `file_path` field.
    f_file_path: Field,
    /// Handle for the `content` field (the chunk text).
    f_content: Field,
    /// Handle for the `language` field.
    f_language: Field,
    /// Handle for the `start_line` field.
    f_start_line: Field,
    /// Handle for the `end_line` field.
    f_end_line: Field,
    /// Handle for the `chunk_index` field.
    f_chunk_index: Field,
}
| 40 | + |
#[cfg(feature = "tantivy-backend")]
#[pymethods]
impl TantivyIndex {
    /// Create or open a Tantivy index at the given directory.
    ///
    /// The directory is created if it does not exist yet.
    ///
    /// Schema: `content` is tokenized, indexed and stored; `file_path` is
    /// indexed as a single untokenized term and stored, so that
    /// `remove_file` can delete by exact path; all remaining fields are
    /// stored only. Line numbers and the chunk index are stored as decimal
    /// text.
    ///
    /// # Errors
    ///
    /// * `IOError` if the directory cannot be created or opened.
    /// * `RuntimeError` if the index or its reader cannot be opened. This
    ///   includes the case where the directory already holds an index built
    ///   with a different schema (for example one where `file_path` was not
    ///   indexed); such an index has to be rebuilt.
    #[new]
    fn new(directory: String) -> PyResult<Self> {
        let dir = PathBuf::from(&directory);
        std::fs::create_dir_all(&dir).map_err(|e| {
            pyo3::exceptions::PyIOError::new_err(format!("Cannot create index dir: {e}"))
        })?;

        let mut schema_builder = Schema::builder();
        // `file_path` must be indexed: deleting by term silently matches
        // nothing on a stored-only field. STRING keeps the path as one exact,
        // untokenized term.
        let f_file_path =
            schema_builder.add_text_field("file_path", tantivy::schema::STRING | STORED);
        let f_content = schema_builder.add_text_field("content", TEXT | STORED);
        let f_language = schema_builder.add_text_field("language", STORED);
        let f_start_line = schema_builder.add_text_field("start_line", STORED);
        let f_end_line = schema_builder.add_text_field("end_line", STORED);
        let f_chunk_index = schema_builder.add_text_field("chunk_index", STORED);
        let schema = schema_builder.build();

        let mmap_dir = tantivy::directory::MmapDirectory::open(&dir).map_err(|e| {
            pyo3::exceptions::PyIOError::new_err(format!("Tantivy dir error: {e}"))
        })?;

        let index = Index::open_or_create(mmap_dir, schema.clone())
            .map_err(|e| Self::runtime_err("Tantivy index error", e))?;

        let reader = index
            .reader_builder()
            .reload_policy(ReloadPolicy::OnCommitWithDelay)
            .try_into()
            .map_err(|e| Self::runtime_err("Reader error", e))?;

        Ok(Self {
            index,
            reader,
            f_file_path,
            f_content,
            f_language,
            f_start_line,
            f_end_line,
            f_chunk_index,
            schema,
            index_dir: dir,
        })
    }

    /// Add a batch of code chunks to the index.
    ///
    /// Each chunk is a tuple:
    /// `(file_path, content, language, start_line, end_line, chunk_index)`.
    ///
    /// The whole batch is committed at once and the reader is reloaded, so
    /// the new documents are visible to `search` and `num_docs` as soon as
    /// this method returns. An empty batch returns `0` without opening a
    /// writer.
    ///
    /// Returns the number of chunks added.
    ///
    /// # Errors
    ///
    /// `RuntimeError` if the writer cannot be opened, a document cannot be
    /// added, or the commit / reader reload fails. Nothing is committed when
    /// adding a document fails part-way through the batch.
    fn add_chunks(
        &self,
        chunks: Vec<(String, String, String, usize, usize, usize)>,
    ) -> PyResult<u64> {
        if chunks.is_empty() {
            return Ok(0);
        }

        let mut writer = self.open_writer()?;

        let mut count = 0u64;
        for (fp, content, lang, sl, el, ci) in chunks {
            writer
                .add_document(doc!(
                    self.f_file_path => fp,
                    self.f_content => content,
                    self.f_language => lang,
                    self.f_start_line => sl.to_string(),
                    self.f_end_line => el.to_string(),
                    self.f_chunk_index => ci.to_string(),
                ))
                .map_err(|e| Self::runtime_err("Add doc error", e))?;
            count += 1;
        }

        self.commit_and_reload(&mut writer)?;

        Ok(count)
    }

    /// Search the index for a query string, returning up to `top_k` results.
    ///
    /// The query is parsed with Tantivy's query parser and matched against
    /// the `content` field only.
    ///
    /// Returns a list of
    /// `(file_path, content, language, start_line, end_line, chunk_index, score)`.
    /// `top_k == 0` yields an empty list without parsing the query. A stored
    /// field that is missing, or a numeric field that does not parse, falls
    /// back to `""` / `0` instead of failing the whole search.
    ///
    /// # Errors
    ///
    /// * `ValueError` if the query string cannot be parsed.
    /// * `RuntimeError` if the search or a document fetch fails.
    fn search(
        &self,
        query: &str,
        top_k: usize,
    ) -> PyResult<Vec<(String, String, String, usize, usize, usize, f32)>> {
        // NOTE(review): in the Tantivy versions that provide
        // `TantivyDocument`, `as_str` on a stored value appears to come from
        // the `Value` trait, which the file-level imports do not bring in.
        // Importing it locally keeps this method self-contained; confirm
        // against the pinned Tantivy version.
        use tantivy::schema::Value as _;

        // `TopDocs::with_limit(0)` panics, so handle the empty request here.
        if top_k == 0 {
            return Ok(Vec::new());
        }

        let searcher = self.reader.searcher();
        let query_parser = QueryParser::for_index(&self.index, vec![self.f_content]);
        let parsed = query_parser.parse_query(query).map_err(|e| {
            pyo3::exceptions::PyValueError::new_err(format!("Query parse error: {e}"))
        })?;

        let top_docs = searcher
            .search(&parsed, &TopDocs::with_limit(top_k))
            .map_err(|e| Self::runtime_err("Search error", e))?;

        let mut results = Vec::with_capacity(top_docs.len());
        for (score, doc_address) in top_docs {
            let doc = searcher
                .doc::<tantivy::TantivyDocument>(doc_address)
                .map_err(|e| Self::runtime_err("Doc fetch error", e))?;

            // First stored value of `field` as an owned string ("" if absent).
            let get_text = |field: Field| -> String {
                doc.get_first(field)
                    .and_then(|v| v.as_str())
                    .unwrap_or("")
                    .to_string()
            };

            let file_path = get_text(self.f_file_path);
            let content = get_text(self.f_content);
            let language = get_text(self.f_language);
            let start_line: usize = get_text(self.f_start_line).parse().unwrap_or(0);
            let end_line: usize = get_text(self.f_end_line).parse().unwrap_or(0);
            let chunk_index: usize = get_text(self.f_chunk_index).parse().unwrap_or(0);

            results.push((
                file_path,
                content,
                language,
                start_line,
                end_line,
                chunk_index,
                score,
            ));
        }

        Ok(results)
    }

    /// Remove all documents for a given file path.
    ///
    /// The path must match the indexed `file_path` exactly. Returns the
    /// number of documents removed, measured as the drop in the live document
    /// count across the commit.
    ///
    /// # Errors
    ///
    /// `RuntimeError` if the writer cannot be opened or the commit / reader
    /// reload fails.
    fn remove_file(&self, file_path: &str) -> PyResult<u64> {
        // Open the writer first so the before/after counts are taken while
        // this instance is the one holding the index for writing.
        let mut writer = self.open_writer()?;
        self.reload_reader()?;
        let before = self.reader.searcher().num_docs();

        let term = tantivy::Term::from_field_text(self.f_file_path, file_path);
        writer.delete_term(term);
        self.commit_and_reload(&mut writer)?;

        let after = self.reader.searcher().num_docs();
        Ok(before.saturating_sub(after))
    }

    /// Clear the entire index.
    ///
    /// # Errors
    ///
    /// `RuntimeError` if the writer cannot be opened, the delete fails, or
    /// the commit / reader reload fails.
    fn clear(&self) -> PyResult<()> {
        let mut writer = self.open_writer()?;
        writer
            .delete_all_documents()
            .map_err(|e| Self::runtime_err("Clear error", e))?;
        self.commit_and_reload(&mut writer)
    }

    /// Return the number of documents in the index.
    fn num_docs(&self) -> u64 {
        self.reader.searcher().num_docs()
    }
}

/// Private helpers; kept outside `#[pymethods]` so they are not exposed to
/// Python.
#[cfg(feature = "tantivy-backend")]
impl TantivyIndex {
    /// Memory budget, in bytes, handed to each short-lived index writer.
    const WRITER_HEAP_BYTES: usize = 50_000_000;

    /// Build a Python `RuntimeError` with the message `"<context>: <err>"`.
    fn runtime_err(context: &str, err: impl std::fmt::Display) -> PyErr {
        pyo3::exceptions::PyRuntimeError::new_err(format!("{context}: {err}"))
    }

    /// Open a writer on the index with the standard memory budget.
    fn open_writer(&self) -> PyResult<IndexWriter> {
        self.index
            .writer(Self::WRITER_HEAP_BYTES)
            .map_err(|e| Self::runtime_err("Writer error", e))
    }

    /// Make the reader pick up the most recently committed state right away.
    fn reload_reader(&self) -> PyResult<()> {
        self.reader
            .reload()
            .map_err(|e| Self::runtime_err("Reader reload error", e))
    }

    /// Commit pending changes, then reload the reader.
    ///
    /// The reader is configured with `OnCommitWithDelay`, which refreshes
    /// only some time after a commit; reloading explicitly makes the change
    /// visible to the very next `search` / `num_docs` call.
    fn commit_and_reload(&self, writer: &mut IndexWriter) -> PyResult<()> {
        writer
            .commit()
            .map_err(|e| Self::runtime_err("Commit error", e))?;
        self.reload_reader()
    }
}
0 commit comments