From 1749d2aa507da5f8779c7b07baba3e0f5dbe45c5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 20 Oct 2025 03:03:38 +0000 Subject: [PATCH 1/3] Initial plan From 527a7b3fbed9a98acdf9ed51e0e1b7bf92e543ed Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 20 Oct 2025 03:19:59 +0000 Subject: [PATCH 2/3] Add NLP features: NER, sentiment analysis, and text summarization Co-authored-by: wesleyscholl <128409641+wesleyscholl@users.noreply.github.com> --- README.md | 79 +++++++++++++- src/main.rs | 220 ++++++++++++++++++++++++++++++++++++++ src/nlp/mod.rs | 8 ++ src/nlp/ner.rs | 221 +++++++++++++++++++++++++++++++++++++++ src/nlp/sentiment.rs | 197 ++++++++++++++++++++++++++++++++++ src/nlp/summarization.rs | 174 ++++++++++++++++++++++++++++++ 6 files changed, 898 insertions(+), 1 deletion(-) create mode 100644 src/nlp/mod.rs create mode 100644 src/nlp/ner.rs create mode 100644 src/nlp/sentiment.rs create mode 100644 src/nlp/summarization.rs diff --git a/README.md b/README.md index 396755a..a3da965 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,10 @@ BoltAI is a compact, local-first AI agent implemented in Rust with a companion m - Query CLI — simple commands for search, summarization, and diagnostic output - macOS SwiftUI front-end — drag/drop indexing, chat-style interface, and previewed document snippets - PDF extraction support and safety measures to avoid dumping raw documents in prompts or UI + - **NLP capabilities** — Named Entity Recognition, Sentiment Analysis, and Text Summarization + - Pattern-based NER for extracting names, locations, organizations, dates, emails, and monetary values + - Lexicon-based sentiment analysis (positive, neutral, negative) + - Extractive text summarization using sentence scoring ## Quickstart @@ -78,6 +82,49 @@ BoltAI is a compact, local-first AI agent implemented in Rust with a companion m # or open in Xcode to run the app target and inspect the UI ``` + ### Use NLP features + + #### Named Entity Recognition (NER) + + Extract entities like names, locations, organizations, dates, emails, and monetary values from text: + + ```bash + # Analyze a single file + ./target/release/boltai ner -i document.txt + + # Analyze all files in a directory + ./target/release/boltai ner -i /path/to/docs + + # Save results to a file + ./target/release/boltai ner -i document.txt -o entities.txt + ``` + + #### Sentiment Analysis + + Determine sentiment (positive, neutral, or negative) of text: + + ```bash + # Analyze a single file + ./target/release/boltai sentiment -i review.txt + + # Batch analyze multiple files + ./target/release/boltai sentiment -i /path/to/reviews -o sentiment_results.txt + ``` + + #### Text Summarization + + Generate extractive summaries of documents: + + ```bash + # Summarize a single file + ./target/release/boltai summarize -i article.txt + + # Summarize multiple files in a directory + ./target/release/boltai summarize -i /path/to/articles -o summaries.txt + ``` + + **Supported file formats**: `.txt`, `.md`, `.csv`, `.json`, `.pdf` + ## Example output (CLI) After indexing, `boltai query` returns top-k similar documents and a short summary. Example (truncated): @@ -93,15 +140,45 @@ BoltAI is a compact, local-first AI agent implemented in Rust with a companion m BoltAI demonstrates a privacy-first local retrieval pipeline that indexes developer documentation and supports fast summarization and search. It uses TF-IDF for initial vectorization and provides clear extension points for embeddings and LLM-based abstraction. ``` + ### NLP Feature Examples + + **Named Entity Recognition output:** + ``` + Named Entities found in document.txt: + - John Smith (PERSON): score 0.750 + - john.smith@example.com (EMAIL): score 0.950 + - New York (LOCATION): score 0.850 + - Microsoft Corporation (ORGANIZATION): score 0.800 + - $150,000 (MONEY): score 0.900 + - Jan 15, 2024 (DATE): score 0.900 + ``` + + **Sentiment Analysis output:** + ``` + Sentiment analysis for review.txt: + - Label: Positive, Score: 0.857 + ``` + + **Text Summarization output:** + ``` + Summary of article.txt: + Artificial intelligence has become one of the most transformative technologies. + Deep learning has achieved remarkable breakthroughs in computer vision and natural + language processing. Machine learning algorithms optimize trading strategies and + detect fraudulent transactions. + ``` + ## Project architecture - Rust CLI (`src/main.rs`): walks directories, extracts text (including PDFs), computes TF-IDF vectors, and writes `boltai_index.json`. + - NLP module (`src/nlp/`): provides pattern-based NER, lexicon-based sentiment analysis, and extractive text summarization. - mac-ui SwiftUI: orchestrates indexing runs, loads a capped preview of index docs (to avoid huge JSON parsing on the main thread), and sends queries to the CLI. - - Extensibility: The CLI prompt layer is isolated to make it easy to swap the query strategy (keywords → embeddings → hybrid retrieval-augmented generation). + - Extensibility: The CLI prompt layer is isolated to make it easy to swap the query strategy (keywords → embeddings → hybrid retrieval-augmented generation). NLP features use lightweight rule-based approaches but can be upgraded to ML models (rust-bert) when libtorch is available. ## Design decisions & trade-offs - TF-IDF first: fast to compute, explainable, and sufficient for small-to-medium corpora. Replacing TF-IDF with dense embeddings is an intended next step for semantic search. + - Rule-based NLP: Uses regex patterns and lexicons for NER and sentiment analysis. Fast, no external dependencies, but less accurate than ML models. Can be upgraded to rust-bert/transformers when libtorch is available in the environment. - Local-first: prioritizes data privacy and low-latency responses at the expense of requiring local compute resources. - Safety: the UI and CLI avoid including full raw documents in prompts and no longer print raw text as a fallback. The project logs prompts to a local debug file for reproducible tuning. diff --git a/src/main.rs b/src/main.rs index 8cea5f8..dc11e79 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,6 +15,8 @@ use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; use walkdir::WalkDir; +mod nlp; + static WORD_RE: Lazy = Lazy::new(|| Regex::new(r"[a-zA-Z0-9']+").unwrap()); #[derive(Parser)] @@ -43,6 +45,27 @@ enum Commands { #[arg(short = 'm', long = "model")] model: Option, }, + /// Extract named entities from text files + Ner { + #[arg(short, long, help = "Input file path or directory")] + input: PathBuf, + #[arg(short, long, help = "Output file for results (optional)")] + output: Option, + }, + /// Analyze sentiment of text files + Sentiment { + #[arg(short, long, help = "Input file path or directory")] + input: PathBuf, + #[arg(short, long, help = "Output file for results (optional)")] + output: Option, + }, + /// Summarize text from files + Summarize { + #[arg(short, long, help = "Input file path or directory")] + input: PathBuf, + #[arg(short, long, help = "Output file for results (optional)")] + output: Option, + }, } #[derive(Serialize, Deserialize, Debug)] @@ -403,11 +426,208 @@ fn query_with_ollama(index_file: &Path, q: &str, k: usize, model_override: Optio } } +fn handle_ner(input: &Path, output: Option<&Path>) -> Result<()> { + use std::io::Write; + + if input.is_file() { + println!("Analyzing file: {}", input.display()); + let entities = nlp::extract_entities(input)?; + + let mut result = format!("Named Entities found in {}:\n", input.display()); + for entity in &entities { + result.push_str(&format!(" - {} ({}): score {:.3}\n", + entity.word, entity.label, entity.score)); + } + + if let Some(out_path) = output { + let mut file = File::create(out_path)?; + file.write_all(result.as_bytes())?; + println!("Results written to {}", out_path.display()); + } else { + print!("{}", result); + } + } else if input.is_dir() { + let allowed_exts = ["txt", "md", "csv", "json", "pdf"]; + let files: Vec = WalkDir::new(input) + .into_iter() + .filter_map(|e| e.ok()) + .filter(|e| e.file_type().is_file()) + .filter(|e| { + e.path() + .extension() + .and_then(|s| s.to_str()) + .map(|ext| allowed_exts.contains(&ext)) + .unwrap_or(false) + }) + .map(|e| e.path().to_path_buf()) + .collect(); + + let mut all_results = String::new(); + for file_path in files { + println!("Analyzing: {}", file_path.display()); + match nlp::extract_entities(&file_path) { + Ok(entities) => { + all_results.push_str(&format!("\nFile: {}\n", file_path.display())); + for entity in &entities { + all_results.push_str(&format!(" - {} ({}): score {:.3}\n", + entity.word, entity.label, entity.score)); + } + } + Err(e) => { + eprintln!("Error processing {}: {}", file_path.display(), e); + } + } + } + + if let Some(out_path) = output { + let mut file = File::create(out_path)?; + file.write_all(all_results.as_bytes())?; + println!("Results written to {}", out_path.display()); + } else { + print!("{}", all_results); + } + } else { + return Err(anyhow!("Input path does not exist or is not a file/directory")); + } + + Ok(()) +} + +fn handle_sentiment(input: &Path, output: Option<&Path>) -> Result<()> { + use std::io::Write; + + if input.is_file() { + println!("Analyzing sentiment of file: {}", input.display()); + let sentiments = nlp::analyze_sentiment(input)?; + + let mut result = format!("Sentiment analysis for {}:\n", input.display()); + for sentiment in &sentiments { + result.push_str(&format!(" - Label: {}, Score: {:.3}\n", + sentiment.label, sentiment.score)); + } + + if let Some(out_path) = output { + let mut file = File::create(out_path)?; + file.write_all(result.as_bytes())?; + println!("Results written to {}", out_path.display()); + } else { + print!("{}", result); + } + } else if input.is_dir() { + let allowed_exts = ["txt", "md", "csv", "json", "pdf"]; + let files: Vec = WalkDir::new(input) + .into_iter() + .filter_map(|e| e.ok()) + .filter(|e| e.file_type().is_file()) + .filter(|e| { + e.path() + .extension() + .and_then(|s| s.to_str()) + .map(|ext| allowed_exts.contains(&ext)) + .unwrap_or(false) + }) + .map(|e| e.path().to_path_buf()) + .collect(); + + let mut all_results = String::new(); + for file_path in files { + println!("Analyzing: {}", file_path.display()); + match nlp::analyze_sentiment(&file_path) { + Ok(sentiments) => { + all_results.push_str(&format!("\nFile: {}\n", file_path.display())); + for sentiment in &sentiments { + all_results.push_str(&format!(" - Label: {}, Score: {:.3}\n", + sentiment.label, sentiment.score)); + } + } + Err(e) => { + eprintln!("Error processing {}: {}", file_path.display(), e); + } + } + } + + if let Some(out_path) = output { + let mut file = File::create(out_path)?; + file.write_all(all_results.as_bytes())?; + println!("Results written to {}", out_path.display()); + } else { + print!("{}", all_results); + } + } else { + return Err(anyhow!("Input path does not exist or is not a file/directory")); + } + + Ok(()) +} + +fn handle_summarize(input: &Path, output: Option<&Path>) -> Result<()> { + use std::io::Write; + + if input.is_file() { + println!("Summarizing file: {}", input.display()); + let summary = nlp::summarize_text(input)?; + + let result = format!("Summary of {}:\n{}\n", input.display(), summary); + + if let Some(out_path) = output { + let mut file = File::create(out_path)?; + file.write_all(result.as_bytes())?; + println!("Summary written to {}", out_path.display()); + } else { + print!("{}", result); + } + } else if input.is_dir() { + let allowed_exts = ["txt", "md", "csv", "json", "pdf"]; + let files: Vec = WalkDir::new(input) + .into_iter() + .filter_map(|e| e.ok()) + .filter(|e| e.file_type().is_file()) + .filter(|e| { + e.path() + .extension() + .and_then(|s| s.to_str()) + .map(|ext| allowed_exts.contains(&ext)) + .unwrap_or(false) + }) + .map(|e| e.path().to_path_buf()) + .collect(); + + let mut all_results = String::new(); + for file_path in files { + println!("Summarizing: {}", file_path.display()); + match nlp::summarize_text(&file_path) { + Ok(summary) => { + all_results.push_str(&format!("\nFile: {}\nSummary: {}\n", + file_path.display(), summary)); + } + Err(e) => { + eprintln!("Error processing {}: {}", file_path.display(), e); + } + } + } + + if let Some(out_path) = output { + let mut file = File::create(out_path)?; + file.write_all(all_results.as_bytes())?; + println!("Summaries written to {}", out_path.display()); + } else { + print!("{}", all_results); + } + } else { + return Err(anyhow!("Input path does not exist or is not a file/directory")); + } + + Ok(()) +} + fn main() -> Result<()> { let cli = Cli::parse(); match cli.command { Commands::Index { dir, out } => index_dir(&dir, &out)?, Commands::Query { index, q, k, model } => query_with_ollama(&index, &q, k, model)?, + Commands::Ner { input, output } => handle_ner(&input, output.as_deref())?, + Commands::Sentiment { input, output } => handle_sentiment(&input, output.as_deref())?, + Commands::Summarize { input, output } => handle_summarize(&input, output.as_deref())?, } Ok(()) } diff --git a/src/nlp/mod.rs b/src/nlp/mod.rs new file mode 100644 index 0000000..2a251f5 --- /dev/null +++ b/src/nlp/mod.rs @@ -0,0 +1,8 @@ +// NLP module for BoltAI +pub mod ner; +pub mod sentiment; +pub mod summarization; + +pub use ner::extract_entities; +pub use sentiment::analyze_sentiment; +pub use summarization::summarize_text; diff --git a/src/nlp/ner.rs b/src/nlp/ner.rs new file mode 100644 index 0000000..ce2949c --- /dev/null +++ b/src/nlp/ner.rs @@ -0,0 +1,221 @@ +// Named Entity Recognition module using pattern-based approach +// This is a lightweight implementation that uses regex patterns to identify entities. +// For production use, consider integrating rust-bert when libtorch is available. +use anyhow::{anyhow, Result}; +use regex::Regex; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::fs::File; +use std::io::Read; +use std::path::Path; +use once_cell::sync::Lazy; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Entity { + pub word: String, + pub label: String, + pub score: f32, + pub start: usize, + pub end: usize, +} + +// Regex patterns for common entity types +static PERSON_PATTERN: Lazy = Lazy::new(|| { + Regex::new(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b").unwrap() +}); + +static ORGANIZATION_PATTERN: Lazy = Lazy::new(|| { + Regex::new(r"\b([A-Z][a-z]+(?:\s+(?:Inc|LLC|Corp|Corporation|Ltd|Limited|Company|Co|Group|Institute|University|College)\.?))\b").unwrap() +}); + +static LOCATION_PATTERN: Lazy = Lazy::new(|| { + Regex::new(r"\b((?:United States|USA|UK|United Kingdom|New York|California|Texas|London|Paris|Tokyo|Beijing|Washington|Chicago|Los Angeles|San Francisco|Boston|Seattle|Miami|Austin|Denver|Portland|Atlanta))\b").unwrap() +}); + +static EMAIL_PATTERN: Lazy = Lazy::new(|| { + Regex::new(r"\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b").unwrap() +}); + +static DATE_PATTERN: Lazy = Lazy::new(|| { + Regex::new(r"\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4})\b").unwrap() +}); + +static MONEY_PATTERN: Lazy = Lazy::new(|| { + Regex::new(r"\$\s*\d+(?:,\d{3})*(?:\.\d{2})?|\d+(?:,\d{3})*(?:\.\d{2})?\s*(?:USD|EUR|GBP|dollars?|euros?|pounds?)").unwrap() +}); + +pub fn extract_entities(file_path: &Path) -> Result> { + let text = read_file(file_path)?; + extract_entities_from_text(&text) +} + +pub fn extract_entities_from_text(text: &str) -> Result> { + let mut entities = Vec::new(); + + // Track seen entities to avoid duplicates + let mut seen: HashMap = HashMap::new(); + + // Extract emails + for cap in EMAIL_PATTERN.captures_iter(text) { + if let Some(m) = cap.get(1) { + let word = m.as_str().to_string(); + if !seen.contains_key(&word) { + seen.insert(word.clone(), true); + entities.push(Entity { + word, + label: "EMAIL".to_string(), + score: 0.95, + start: m.start(), + end: m.end(), + }); + } + } + } + + // Extract dates + for cap in DATE_PATTERN.captures_iter(text) { + if let Some(m) = cap.get(0) { + let word = m.as_str().to_string(); + if !seen.contains_key(&word) { + seen.insert(word.clone(), true); + entities.push(Entity { + word, + label: "DATE".to_string(), + score: 0.90, + start: m.start(), + end: m.end(), + }); + } + } + } + + // Extract money + for cap in MONEY_PATTERN.captures_iter(text) { + if let Some(m) = cap.get(0) { + let word = m.as_str().to_string(); + if !seen.contains_key(&word) { + seen.insert(word.clone(), true); + entities.push(Entity { + word, + label: "MONEY".to_string(), + score: 0.90, + start: m.start(), + end: m.end(), + }); + } + } + } + + // Extract locations + for cap in LOCATION_PATTERN.captures_iter(text) { + if let Some(m) = cap.get(1) { + let word = m.as_str().to_string(); + if !seen.contains_key(&word) { + seen.insert(word.clone(), true); + entities.push(Entity { + word, + label: "LOCATION".to_string(), + score: 0.85, + start: m.start(), + end: m.end(), + }); + } + } + } + + // Extract organizations + for cap in ORGANIZATION_PATTERN.captures_iter(text) { + if let Some(m) = cap.get(1) { + let word = m.as_str().to_string(); + if !seen.contains_key(&word) { + seen.insert(word.clone(), true); + entities.push(Entity { + word, + label: "ORGANIZATION".to_string(), + score: 0.80, + start: m.start(), + end: m.end(), + }); + } + } + } + + // Extract person names (after organizations to avoid false positives) + for cap in PERSON_PATTERN.captures_iter(text) { + if let Some(m) = cap.get(1) { + let word = m.as_str().to_string(); + // Filter out likely organizations and other false positives + if !word.contains("Inc") && !word.contains("Corp") && + !word.contains("LLC") && !word.contains("Ltd") && + !word.contains("University") && !word.contains("College") && + !seen.contains_key(&word) { + seen.insert(word.clone(), true); + entities.push(Entity { + word, + label: "PERSON".to_string(), + score: 0.75, + start: m.start(), + end: m.end(), + }); + } + } + } + + // Sort by position in text + entities.sort_by_key(|e| e.start); + + Ok(entities) +} + +fn read_file(path: &Path) -> Result { + let ext = path.extension().and_then(|s| s.to_str()).unwrap_or(""); + + match ext { + "txt" | "md" | "csv" | "json" => { + let mut file = File::open(path)?; + let mut content = String::new(); + file.read_to_string(&mut content)?; + Ok(content) + } + "pdf" => { + pdf_extract::extract_text(path) + .map_err(|e| anyhow!("PDF extraction failed: {}", e)) + } + _ => Err(anyhow!("Unsupported file format: {}", ext)), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_extract_entities_from_text() { + let text = "Barack Obama was born in Hawaii. He worked in Chicago and later became the 44th President of the United States."; + let result = extract_entities_from_text(text); + assert!(result.is_ok()); + let entities = result.unwrap(); + // Should find entities + assert!(!entities.is_empty()); + // Should find locations + assert!(entities.iter().any(|e| e.label == "LOCATION")); + } + + #[test] + fn test_extract_email() { + let text = "Contact us at support@example.com for more information."; + let result = extract_entities_from_text(text); + assert!(result.is_ok()); + let entities = result.unwrap(); + assert!(entities.iter().any(|e| e.label == "EMAIL")); + } + + #[test] + fn test_extract_date() { + let text = "The meeting is scheduled for Jan 15, 2024."; + let result = extract_entities_from_text(text); + assert!(result.is_ok()); + let entities = result.unwrap(); + assert!(entities.iter().any(|e| e.label == "DATE")); + } +} diff --git a/src/nlp/sentiment.rs b/src/nlp/sentiment.rs new file mode 100644 index 0000000..cd9b79f --- /dev/null +++ b/src/nlp/sentiment.rs @@ -0,0 +1,197 @@ +// Sentiment Analysis module using lexicon-based approach +// This is a lightweight implementation that uses word lists to determine sentiment. +// For production use, consider integrating rust-bert when libtorch is available. +use anyhow::{anyhow, Result}; +use serde::{Deserialize, Serialize}; +use std::collections::HashSet; +use std::fs::File; +use std::io::Read; +use std::path::Path; +use once_cell::sync::Lazy; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Sentiment { + pub label: String, + pub score: f32, +} + +// Positive words lexicon +static POSITIVE_WORDS: Lazy> = Lazy::new(|| { + [ + "good", "great", "excellent", "wonderful", "fantastic", "amazing", "awesome", + "love", "happy", "joy", "pleased", "delighted", "satisfied", "perfect", + "beautiful", "brilliant", "outstanding", "superb", "magnificent", "marvelous", + "terrific", "fabulous", "exceptional", "impressive", "remarkable", "best", + "better", "positive", "advantage", "benefit", "success", "successful", + "win", "winner", "winning", "accomplished", "achievement", "triumph", + "enjoy", "pleasant", "comfortable", "excited", "exciting", "thrilled", + "approve", "approved", "approval", "like", "liked", "favorite", "prefer" + ].iter().copied().collect() +}); + +// Negative words lexicon +static NEGATIVE_WORDS: Lazy> = Lazy::new(|| { + [ + "bad", "terrible", "awful", "horrible", "poor", "worst", "worse", + "hate", "angry", "sad", "upset", "disappointed", "dissatisfied", "unhappy", + "fail", "failure", "failed", "problem", "issue", "wrong", "error", + "difficult", "hard", "tough", "struggle", "struggling", "broken", + "pain", "painful", "hurt", "hurting", "damage", "damaged", "disaster", + "negative", "loss", "lose", "losing", "lost", "defeat", "defeated", + "reject", "rejected", "rejection", "dislike", "disliked", "unpleasant", + "uncomfortable", "disappointing", "frustrate", "frustrated", "frustrating" + ].iter().copied().collect() +}); + +// Intensifiers +static INTENSIFIERS: Lazy> = Lazy::new(|| { + ["very", "extremely", "absolutely", "really", "incredibly", "highly", "totally"] + .iter().copied().collect() +}); + +// Negation words +static NEGATIONS: Lazy> = Lazy::new(|| { + ["not", "no", "never", "nothing", "nobody", "nowhere", "neither", "nor", "none"] + .iter().copied().collect() +}); + +pub fn analyze_sentiment(file_path: &Path) -> Result> { + let text = read_file(file_path)?; + analyze_sentiment_text(&text) +} + +pub fn analyze_sentiment_text(text: &str) -> Result> { + let words: Vec = text + .to_lowercase() + .split(|c: char| !c.is_alphanumeric() && c != '\'') + .filter(|s| !s.is_empty()) + .map(String::from) + .collect(); + + let mut positive_score = 0.0; + let mut negative_score = 0.0; + + let mut i = 0; + while i < words.len() { + let word = &words[i]; + let mut multiplier = 1.0; + + // Check for intensifiers in the previous word + if i > 0 && INTENSIFIERS.contains(words[i - 1].as_str()) { + multiplier = 1.5; + } + + // Check for negation in previous 1-2 words + let is_negated = (i > 0 && NEGATIONS.contains(words[i - 1].as_str())) || + (i > 1 && NEGATIONS.contains(words[i - 2].as_str())); + + if POSITIVE_WORDS.contains(word.as_str()) { + if is_negated { + negative_score += 1.0 * multiplier; + } else { + positive_score += 1.0 * multiplier; + } + } else if NEGATIVE_WORDS.contains(word.as_str()) { + if is_negated { + positive_score += 1.0 * multiplier; + } else { + negative_score += 1.0 * multiplier; + } + } + + i += 1; + } + + // Determine overall sentiment + let total_score = positive_score + negative_score; + let sentiment = if total_score == 0.0 { + Sentiment { + label: "Neutral".to_string(), + score: 0.5, + } + } else { + let pos_ratio = positive_score / total_score; + let neg_ratio = negative_score / total_score; + + if pos_ratio > neg_ratio + 0.1 { + Sentiment { + label: "Positive".to_string(), + score: pos_ratio, + } + } else if neg_ratio > pos_ratio + 0.1 { + Sentiment { + label: "Negative".to_string(), + score: neg_ratio, + } + } else { + Sentiment { + label: "Neutral".to_string(), + score: 0.5, + } + } + }; + + Ok(vec![sentiment]) +} + +fn read_file(path: &Path) -> Result { + let ext = path.extension().and_then(|s| s.to_str()).unwrap_or(""); + + match ext { + "txt" | "md" | "csv" | "json" => { + let mut file = File::open(path)?; + let mut content = String::new(); + file.read_to_string(&mut content)?; + Ok(content) + } + "pdf" => { + pdf_extract::extract_text(path) + .map_err(|e| anyhow!("PDF extraction failed: {}", e)) + } + _ => Err(anyhow!("Unsupported file format: {}", ext)), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_analyze_sentiment_positive() { + let positive_text = "This is a wonderful day! I'm feeling great and happy!"; + let result = analyze_sentiment_text(positive_text); + assert!(result.is_ok()); + let sentiments = result.unwrap(); + assert!(!sentiments.is_empty()); + assert_eq!(sentiments[0].label, "Positive"); + } + + #[test] + fn test_analyze_sentiment_negative() { + let negative_text = "This is terrible and awful. I hate it!"; + let result = analyze_sentiment_text(negative_text); + assert!(result.is_ok()); + let sentiments = result.unwrap(); + assert!(!sentiments.is_empty()); + assert_eq!(sentiments[0].label, "Negative"); + } + + #[test] + fn test_analyze_sentiment_neutral() { + let neutral_text = "The sky is blue. The grass is green."; + let result = analyze_sentiment_text(neutral_text); + assert!(result.is_ok()); + let sentiments = result.unwrap(); + assert!(!sentiments.is_empty()); + assert_eq!(sentiments[0].label, "Neutral"); + } + + #[test] + fn test_negation_handling() { + let negated_text = "This is not good at all."; + let result = analyze_sentiment_text(negated_text); + assert!(result.is_ok()); + let sentiments = result.unwrap(); + assert_eq!(sentiments[0].label, "Negative"); + } +} diff --git a/src/nlp/summarization.rs b/src/nlp/summarization.rs new file mode 100644 index 0000000..17214f4 --- /dev/null +++ b/src/nlp/summarization.rs @@ -0,0 +1,174 @@ +// Text Summarization module using extractive approach +// This is a lightweight implementation that uses sentence scoring to extract key sentences. +// For production use, consider integrating rust-bert when libtorch is available. +use anyhow::{anyhow, Result}; +use regex::Regex; +use std::collections::HashMap; +use std::fs::File; +use std::io::Read; +use std::path::Path; +use once_cell::sync::Lazy; + +static SENTENCE_PATTERN: Lazy = Lazy::new(|| { + Regex::new(r"[^.!?]+[.!?]+").unwrap() +}); + +static WORD_PATTERN: Lazy = Lazy::new(|| { + Regex::new(r"[a-zA-Z0-9']+").unwrap() +}); + +// Common stop words to filter out when scoring sentences +static STOP_WORDS: Lazy> = Lazy::new(|| { + [ + "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", + "has", "he", "in", "is", "it", "its", "of", "on", "that", "the", + "to", "was", "will", "with", "the", "this", "but", "they", "have", + "had", "what", "when", "where", "who", "which", "why", "how" + ].iter().copied().collect() +}); + +pub fn summarize_text(file_path: &Path) -> Result { + let text = read_file(file_path)?; + summarize_text_content(&text) +} + +pub fn summarize_text_content(text: &str) -> Result { + // Extract sentences + let sentences: Vec<&str> = SENTENCE_PATTERN + .find_iter(text) + .map(|m| m.as_str().trim()) + .filter(|s| !s.is_empty()) + .collect(); + + if sentences.is_empty() { + return Ok("(No content to summarize)".to_string()); + } + + // If text is short, return it as is + if sentences.len() <= 3 { + return Ok(text.to_string()); + } + + // Calculate word frequencies (excluding stop words) + let mut word_freq: HashMap = HashMap::new(); + for sentence in &sentences { + for word in WORD_PATTERN.find_iter(sentence) { + let word_str = word.as_str().to_lowercase(); + if !STOP_WORDS.contains(word_str.as_str()) && word_str.len() > 2 { + *word_freq.entry(word_str).or_insert(0) += 1; + } + } + } + + // Find the maximum frequency + let max_freq = word_freq.values().max().copied().unwrap_or(1); + + // Normalize frequencies + for freq in word_freq.values_mut() { + *freq = (*freq * 100) / max_freq; + } + + // Score sentences based on word frequencies + let mut sentence_scores: Vec<(usize, usize)> = Vec::new(); + for (idx, sentence) in sentences.iter().enumerate() { + let mut score = 0; + let words: Vec<_> = WORD_PATTERN.find_iter(sentence).collect(); + + for word in &words { + let word_str = word.as_str().to_lowercase(); + if let Some(&freq) = word_freq.get(&word_str) { + score += freq; + } + } + + // Normalize by sentence length to avoid bias toward long sentences + if !words.is_empty() { + score /= words.len(); + } + + // Boost score for sentences at the beginning (often contain key info) + if idx == 0 { + score = (score as f32 * 1.5) as usize; + } + + sentence_scores.push((idx, score)); + } + + // Sort by score and select top sentences + sentence_scores.sort_by(|a, b| b.1.cmp(&a.1)); + + // Select top 30% of sentences (minimum 2, maximum 5) + let num_summary_sentences = (sentences.len() * 30 / 100).max(2).min(5); + let mut selected_indices: Vec = sentence_scores + .iter() + .take(num_summary_sentences) + .map(|(idx, _)| *idx) + .collect(); + + // Sort selected sentences by their original order + selected_indices.sort(); + + // Build the summary + let summary: Vec = selected_indices + .iter() + .map(|&idx| sentences[idx].to_string()) + .collect(); + + Ok(summary.join(" ")) +} + +fn read_file(path: &Path) -> Result { + let ext = path.extension().and_then(|s| s.to_str()).unwrap_or(""); + + match ext { + "txt" | "md" | "csv" | "json" => { + let mut file = File::open(path)?; + let mut content = String::new(); + file.read_to_string(&mut content)?; + Ok(content) + } + "pdf" => { + pdf_extract::extract_text(path) + .map_err(|e| anyhow!("PDF extraction failed: {}", e)) + } + _ => Err(anyhow!("Unsupported file format: {}", ext)), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_summarize_text_content() { + let long_text = "Natural language processing is a field of artificial intelligence \ + that focuses on the interaction between computers and humans through \ + natural language. The ultimate objective of NLP is to read, decipher, \ + understand, and make sense of the human languages in a manner that is valuable. \ + NLP combines computational linguistics with statistical models and machine learning. \ + Applications include translation, sentiment analysis, and chatbots."; + let result = summarize_text_content(long_text); + assert!(result.is_ok()); + let summary = result.unwrap(); + assert!(!summary.is_empty()); + assert!(summary.len() < long_text.len()); + } + + #[test] + fn test_summarize_short_text() { + let short_text = "This is a short text."; + let result = summarize_text_content(short_text); + assert!(result.is_ok()); + let summary = result.unwrap(); + assert_eq!(summary, short_text); + } + + #[test] + fn test_summarize_empty_text() { + let empty_text = ""; + let result = summarize_text_content(empty_text); + assert!(result.is_ok()); + let summary = result.unwrap(); + assert_eq!(summary, "(No content to summarize)"); + } +} From 4e97eef98b24bce14485c548def4086197e37a82 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 20 Oct 2025 03:24:24 +0000 Subject: [PATCH 3/3] Fix code review issues: use HashSet for duplicates, remove duplicate stop word Co-authored-by: wesleyscholl <128409641+wesleyscholl@users.noreply.github.com> --- src/nlp/ner.rs | 28 ++++++++++++++-------------- src/nlp/summarization.rs | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/nlp/ner.rs b/src/nlp/ner.rs index ce2949c..32b057b 100644 --- a/src/nlp/ner.rs +++ b/src/nlp/ner.rs @@ -4,7 +4,7 @@ use anyhow::{anyhow, Result}; use regex::Regex; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; +use std::collections::HashSet; use std::fs::File; use std::io::Read; use std::path::Path; @@ -53,14 +53,14 @@ pub fn extract_entities_from_text(text: &str) -> Result> { let mut entities = Vec::new(); // Track seen entities to avoid duplicates - let mut seen: HashMap = HashMap::new(); + let mut seen: HashSet = HashSet::new(); // Extract emails for cap in EMAIL_PATTERN.captures_iter(text) { if let Some(m) = cap.get(1) { let word = m.as_str().to_string(); - if !seen.contains_key(&word) { - seen.insert(word.clone(), true); + if !seen.contains(&word) { + seen.insert(word.clone()); entities.push(Entity { word, label: "EMAIL".to_string(), @@ -76,8 +76,8 @@ pub fn extract_entities_from_text(text: &str) -> Result> { for cap in DATE_PATTERN.captures_iter(text) { if let Some(m) = cap.get(0) { let word = m.as_str().to_string(); - if !seen.contains_key(&word) { - seen.insert(word.clone(), true); + if !seen.contains(&word) { + seen.insert(word.clone()); entities.push(Entity { word, label: "DATE".to_string(), @@ -93,8 +93,8 @@ pub fn extract_entities_from_text(text: &str) -> Result> { for cap in MONEY_PATTERN.captures_iter(text) { if let Some(m) = cap.get(0) { let word = m.as_str().to_string(); - if !seen.contains_key(&word) { - seen.insert(word.clone(), true); + if !seen.contains(&word) { + seen.insert(word.clone()); entities.push(Entity { word, label: "MONEY".to_string(), @@ -110,8 +110,8 @@ pub fn extract_entities_from_text(text: &str) -> Result> { for cap in LOCATION_PATTERN.captures_iter(text) { if let Some(m) = cap.get(1) { let word = m.as_str().to_string(); - if !seen.contains_key(&word) { - seen.insert(word.clone(), true); + if !seen.contains(&word) { + seen.insert(word.clone()); entities.push(Entity { word, label: "LOCATION".to_string(), @@ -127,8 +127,8 @@ pub fn extract_entities_from_text(text: &str) -> Result> { for cap in ORGANIZATION_PATTERN.captures_iter(text) { if let Some(m) = cap.get(1) { let word = m.as_str().to_string(); - if !seen.contains_key(&word) { - seen.insert(word.clone(), true); + if !seen.contains(&word) { + seen.insert(word.clone()); entities.push(Entity { word, label: "ORGANIZATION".to_string(), @@ -148,8 +148,8 @@ pub fn extract_entities_from_text(text: &str) -> Result> { if !word.contains("Inc") && !word.contains("Corp") && !word.contains("LLC") && !word.contains("Ltd") && !word.contains("University") && !word.contains("College") && - !seen.contains_key(&word) { - seen.insert(word.clone(), true); + !seen.contains(&word) { + seen.insert(word.clone()); entities.push(Entity { word, label: "PERSON".to_string(), diff --git a/src/nlp/summarization.rs b/src/nlp/summarization.rs index 17214f4..1be3392 100644 --- a/src/nlp/summarization.rs +++ b/src/nlp/summarization.rs @@ -22,7 +22,7 @@ static STOP_WORDS: Lazy> = Lazy::new(|| [ "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in", "is", "it", "its", "of", "on", "that", "the", - "to", "was", "will", "with", "the", "this", "but", "they", "have", + "to", "was", "will", "with", "this", "but", "they", "have", "had", "what", "when", "where", "who", "which", "why", "how" ].iter().copied().collect() });