From 1749d2aa507da5f8779c7b07baba3e0f5dbe45c5 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 20 Oct 2025 03:03:38 +0000
Subject: [PATCH 1/3] Initial plan


From 527a7b3fbed9a98acdf9ed51e0e1b7bf92e543ed Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 20 Oct 2025 03:19:59 +0000
Subject: [PATCH 2/3] Add NLP features: NER, sentiment analysis, and text
 summarization

Co-authored-by: wesleyscholl <128409641+wesleyscholl@users.noreply.github.com>
---
 README.md                |  79 +++++++++++++-
 src/main.rs              | 220 ++++++++++++++++++++++++++++++++++++++
 src/nlp/mod.rs           |   8 ++
 src/nlp/ner.rs           | 221 +++++++++++++++++++++++++++++++++++++++
 src/nlp/sentiment.rs     | 197 ++++++++++++++++++++++++++++++++++
 src/nlp/summarization.rs | 174 ++++++++++++++++++++++++++++++
 6 files changed, 898 insertions(+), 1 deletion(-)
 create mode 100644 src/nlp/mod.rs
 create mode 100644 src/nlp/ner.rs
 create mode 100644 src/nlp/sentiment.rs
 create mode 100644 src/nlp/summarization.rs

diff --git a/README.md b/README.md
index 396755a..a3da965 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,10 @@ BoltAI is a compact, local-first AI agent implemented in Rust with a companion m
  - Query CLI — simple commands for search, summarization, and diagnostic output
  - macOS SwiftUI front-end — drag/drop indexing, chat-style interface, and previewed document snippets
  - PDF extraction support and safety measures to avoid dumping raw documents in prompts or UI
+ - **NLP capabilities** — Named Entity Recognition, Sentiment Analysis, and Text Summarization
+   - Pattern-based NER for extracting names, locations, organizations, dates, emails, and monetary values
+   - Lexicon-based sentiment analysis (positive, neutral, negative)
+   - Extractive text summarization using sentence scoring
 
  ## Quickstart
 
@@ -78,6 +82,49 @@ BoltAI is a compact, local-first AI agent implemented in Rust with a companion m
  # or open in Xcode to run the app target and inspect the UI
  ```
 
+ ### Use NLP features
+
+ #### Named Entity Recognition (NER)
+
+ Extract entities like names, locations, organizations, dates, emails, and monetary values from text:
+
+ ```bash
+ # Analyze a single file
+ ./target/release/boltai ner -i document.txt
+
+ # Analyze all files in a directory
+ ./target/release/boltai ner -i /path/to/docs
+
+ # Save results to a file
+ ./target/release/boltai ner -i document.txt -o entities.txt
+ ```
+
+ #### Sentiment Analysis
+
+ Determine sentiment (positive, neutral, or negative) of text:
+
+ ```bash
+ # Analyze a single file
+ ./target/release/boltai sentiment -i review.txt
+
+ # Batch analyze multiple files
+ ./target/release/boltai sentiment -i /path/to/reviews -o sentiment_results.txt
+ ```
+
+ #### Text Summarization
+
+ Generate extractive summaries of documents:
+
+ ```bash
+ # Summarize a single file
+ ./target/release/boltai summarize -i article.txt
+
+ # Summarize multiple files in a directory
+ ./target/release/boltai summarize -i /path/to/articles -o summaries.txt
+ ```
+
+ **Supported file formats**: `.txt`, `.md`, `.csv`, `.json`, `.pdf`
+
  ## Example output (CLI)
 
  After indexing, `boltai query` returns top-k similar documents and a short summary. Example (truncated):
@@ -93,15 +140,45 @@ BoltAI is a compact, local-first AI agent implemented in Rust with a companion m
  BoltAI demonstrates a privacy-first local retrieval pipeline that indexes developer documentation and supports fast summarization and search. It uses TF-IDF for initial vectorization and provides clear extension points for embeddings and LLM-based abstraction.
  ```
 
+ ### NLP Feature Examples
+
+ **Named Entity Recognition output:**
+ ```
+ Named Entities found in document.txt:
+   - John Smith (PERSON): score 0.750
+   - john.smith@example.com (EMAIL): score 0.950
+   - New York (LOCATION): score 0.850
+   - Microsoft Corporation (ORGANIZATION): score 0.800
+   - $150,000 (MONEY): score 0.900
+   - Jan 15, 2024 (DATE): score 0.900
+ ```
+
+ **Sentiment Analysis output:**
+ ```
+ Sentiment analysis for review.txt:
+   - Label: Positive, Score: 0.857
+ ```
+
+ **Text Summarization output:**
+ ```
+ Summary of article.txt:
+ Artificial intelligence has become one of the most transformative technologies. 
+ Deep learning has achieved remarkable breakthroughs in computer vision and natural 
+ language processing. Machine learning algorithms optimize trading strategies and 
+ detect fraudulent transactions.
+ ```
+
  ## Project architecture
 
  - Rust CLI (`src/main.rs`): walks directories, extracts text (including PDFs), computes TF-IDF vectors, and writes `boltai_index.json`.
+ - NLP module (`src/nlp/`): provides pattern-based NER, lexicon-based sentiment analysis, and extractive text summarization.
  - mac-ui SwiftUI: orchestrates indexing runs, loads a capped preview of index docs (to avoid huge JSON parsing on the main thread), and sends queries to the CLI.
- - Extensibility: The CLI prompt layer is isolated to make it easy to swap the query strategy (keywords → embeddings → hybrid retrieval-augmented generation).
+ - Extensibility: The CLI prompt layer is isolated to make it easy to swap the query strategy (keywords → embeddings → hybrid retrieval-augmented generation). NLP features use lightweight rule-based approaches but can be upgraded to ML models (rust-bert) when libtorch is available.
 
  ## Design decisions & trade-offs
 
  - TF-IDF first: fast to compute, explainable, and sufficient for small-to-medium corpora. Replacing TF-IDF with dense embeddings is an intended next step for semantic search.
+ - Rule-based NLP: NER and sentiment analysis use regex patterns and word lexicons. This approach is fast and needs no external dependencies, but it is less accurate than ML models. It can be upgraded to rust-bert/transformers when libtorch is available in the environment.
  - Local-first: prioritizes data privacy and low-latency responses at the expense of requiring local compute resources.
  - Safety: the UI and CLI avoid including full raw documents in prompts and no longer print raw text as a fallback. The project logs prompts to a local debug file for reproducible tuning.
 
diff --git a/src/main.rs b/src/main.rs
index 8cea5f8..dc11e79 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,6 +15,8 @@ use once_cell::sync::Lazy;
 use serde::{Deserialize, Serialize};
 use walkdir::WalkDir;
 
+mod nlp;
+
 static WORD_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[a-zA-Z0-9']+").unwrap());
 
 #[derive(Parser)]
@@ -43,6 +45,27 @@ enum Commands {
         #[arg(short = 'm', long = "model")]
         model: Option<String>,
     },
+    /// Extract named entities from text files
+    Ner {
+        #[arg(short, long, help = "Input file path or directory")]
+        input: PathBuf,
+        #[arg(short, long, help = "Output file for results (optional)")]
+        output: Option<PathBuf>,
+    },
+    /// Analyze sentiment of text files
+    Sentiment {
+        #[arg(short, long, help = "Input file path or directory")]
+        input: PathBuf,
+        #[arg(short, long, help = "Output file for results (optional)")]
+        output: Option<PathBuf>,
+    },
+    /// Summarize text from files
+    Summarize {
+        #[arg(short, long, help = "Input file path or directory")]
+        input: PathBuf,
+        #[arg(short, long, help = "Output file for results (optional)")]
+        output: Option<PathBuf>,
+    },
 }
 
 #[derive(Serialize, Deserialize, Debug)]
@@ -403,11 +426,208 @@ fn query_with_ollama(index_file: &Path, q: &str, k: usize, model_override: Optio
     }
 }
 
+fn handle_ner(input: &Path, output: Option<&Path>) -> Result<()> {
+    use std::io::Write;
+    // Report named entities for one file or a directory tree: to `output` if given, else stdout.
+    if input.is_file() {
+        println!("Analyzing file: {}", input.display());
+        let entities = nlp::extract_entities(input)?;
+        // Single-file mode: an extraction failure is returned to the caller by the `?`.
+        let mut result = format!("Named Entities found in {}:\n", input.display());
+        for entity in &entities {
+            result.push_str(&format!("  - {} ({}): score {:.3}\n", 
+                entity.word, entity.label, entity.score));
+        }
+        // Write the report to the output file when one was given, otherwise print it.
+        if let Some(out_path) = output {
+            let mut file = File::create(out_path)?;
+            file.write_all(result.as_bytes())?;
+            println!("Results written to {}", out_path.display());
+        } else {
+            print!("{}", result);
+        }
+    } else if input.is_dir() {
+        let allowed_exts = ["txt", "md", "csv", "json", "pdf"];
+        let files: Vec<PathBuf> = WalkDir::new(input)
+            .into_iter()
+            .filter_map(|e| e.ok())
+            .filter(|e| e.file_type().is_file())
+            .filter(|e| {
+                e.path()
+                    .extension()
+                    .and_then(|s| s.to_str())
+                    .map(|ext| allowed_exts.contains(&ext))
+                    .unwrap_or(false)
+            })
+            .map(|e| e.path().to_path_buf())
+            .collect();
+        // Walk errors were dropped by `e.ok()`; the extension match is exact, so `.TXT` is skipped.
+        let mut all_results = String::new();
+        for file_path in files {
+            println!("Analyzing: {}", file_path.display());
+            match nlp::extract_entities(&file_path) {
+                Ok(entities) => {
+                    all_results.push_str(&format!("\nFile: {}\n", file_path.display()));
+                    for entity in &entities {
+                        all_results.push_str(&format!("  - {} ({}): score {:.3}\n", 
+                            entity.word, entity.label, entity.score));
+                    }
+                }
+                Err(e) => {
+                    eprintln!("Error processing {}: {}", file_path.display(), e);
+                }
+            }
+        }
+        // Directory mode is best-effort: failed files were logged to stderr and left out.
+        if let Some(out_path) = output {
+            let mut file = File::create(out_path)?;
+            file.write_all(all_results.as_bytes())?;
+            println!("Results written to {}", out_path.display());
+        } else {
+            print!("{}", all_results);
+        }
+    } else {
+        return Err(anyhow!("Input path does not exist or is not a file/directory"));
+    }
+    
+    Ok(())
+}
+
+fn handle_sentiment(input: &Path, output: Option<&Path>) -> Result<()> {
+    use std::io::Write;
+    // Report sentiment for one file or a directory tree: to `output` if given, else stdout.
+    if input.is_file() {
+        println!("Analyzing sentiment of file: {}", input.display());
+        let sentiments = nlp::analyze_sentiment(input)?;
+        // Single-file mode: an analysis failure is returned to the caller by the `?`.
+        let mut result = format!("Sentiment analysis for {}:\n", input.display());
+        for sentiment in &sentiments {
+            result.push_str(&format!("  - Label: {}, Score: {:.3}\n", 
+                sentiment.label, sentiment.score));
+        }
+        // Write the report to the output file when one was given, otherwise print it.
+        if let Some(out_path) = output {
+            let mut file = File::create(out_path)?;
+            file.write_all(result.as_bytes())?;
+            println!("Results written to {}", out_path.display());
+        } else {
+            print!("{}", result);
+        }
+    } else if input.is_dir() {
+        let allowed_exts = ["txt", "md", "csv", "json", "pdf"];
+        let files: Vec<PathBuf> = WalkDir::new(input)
+            .into_iter()
+            .filter_map(|e| e.ok())
+            .filter(|e| e.file_type().is_file())
+            .filter(|e| {
+                e.path()
+                    .extension()
+                    .and_then(|s| s.to_str())
+                    .map(|ext| allowed_exts.contains(&ext))
+                    .unwrap_or(false)
+            })
+            .map(|e| e.path().to_path_buf())
+            .collect();
+        // Walk errors were dropped by `e.ok()`; the extension match is exact, so `.TXT` is skipped.
+        let mut all_results = String::new();
+        for file_path in files {
+            println!("Analyzing: {}", file_path.display());
+            match nlp::analyze_sentiment(&file_path) {
+                Ok(sentiments) => {
+                    all_results.push_str(&format!("\nFile: {}\n", file_path.display()));
+                    for sentiment in &sentiments {
+                        all_results.push_str(&format!("  - Label: {}, Score: {:.3}\n", 
+                            sentiment.label, sentiment.score));
+                    }
+                }
+                Err(e) => {
+                    eprintln!("Error processing {}: {}", file_path.display(), e);
+                }
+            }
+        }
+        // Directory mode is best-effort: failed files were logged to stderr and left out.
+        if let Some(out_path) = output {
+            let mut file = File::create(out_path)?;
+            file.write_all(all_results.as_bytes())?;
+            println!("Results written to {}", out_path.display());
+        } else {
+            print!("{}", all_results);
+        }
+    } else {
+        return Err(anyhow!("Input path does not exist or is not a file/directory"));
+    }
+    
+    Ok(())
+}
+
+fn handle_summarize(input: &Path, output: Option<&Path>) -> Result<()> {
+    use std::io::Write;
+    // Summarize one file or every supported file in a tree: to `output` if given, else stdout.
+    if input.is_file() {
+        println!("Summarizing file: {}", input.display());
+        let summary = nlp::summarize_text(input)?;
+        // Single-file mode: a summarization failure is returned to the caller by the `?`.
+        let result = format!("Summary of {}:\n{}\n", input.display(), summary);
+        // Write the summary to the output file when one was given, otherwise print it.
+        if let Some(out_path) = output {
+            let mut file = File::create(out_path)?;
+            file.write_all(result.as_bytes())?;
+            println!("Summary written to {}", out_path.display());
+        } else {
+            print!("{}", result);
+        }
+    } else if input.is_dir() {
+        let allowed_exts = ["txt", "md", "csv", "json", "pdf"];
+        let files: Vec<PathBuf> = WalkDir::new(input)
+            .into_iter()
+            .filter_map(|e| e.ok())
+            .filter(|e| e.file_type().is_file())
+            .filter(|e| {
+                e.path()
+                    .extension()
+                    .and_then(|s| s.to_str())
+                    .map(|ext| allowed_exts.contains(&ext))
+                    .unwrap_or(false)
+            })
+            .map(|e| e.path().to_path_buf())
+            .collect();
+        // Walk errors were dropped by `e.ok()`; the extension match is exact, so `.TXT` is skipped.
+        let mut all_results = String::new();
+        for file_path in files {
+            println!("Summarizing: {}", file_path.display());
+            match nlp::summarize_text(&file_path) {
+                Ok(summary) => {
+                    all_results.push_str(&format!("\nFile: {}\nSummary: {}\n", 
+                        file_path.display(), summary));
+                }
+                Err(e) => {
+                    eprintln!("Error processing {}: {}", file_path.display(), e);
+                }
+            }
+        }
+        // Directory mode is best-effort: failed files were logged to stderr and left out.
+        if let Some(out_path) = output {
+            let mut file = File::create(out_path)?;
+            file.write_all(all_results.as_bytes())?;
+            println!("Summaries written to {}", out_path.display());
+        } else {
+            print!("{}", all_results);
+        }
+    } else {
+        return Err(anyhow!("Input path does not exist or is not a file/directory"));
+    }
+    
+    Ok(())
+}
+
 fn main() -> Result<()> {
     let cli = Cli::parse();
     match cli.command {
         Commands::Index { dir, out } => index_dir(&dir, &out)?,
         Commands::Query { index, q, k, model } => query_with_ollama(&index, &q, k, model)?,
+        Commands::Ner { input, output } => handle_ner(&input, output.as_deref())?,
+        Commands::Sentiment { input, output } => handle_sentiment(&input, output.as_deref())?,
+        Commands::Summarize { input, output } => handle_summarize(&input, output.as_deref())?,
     }
     Ok(())
 }
diff --git a/src/nlp/mod.rs b/src/nlp/mod.rs
new file mode 100644
index 0000000..2a251f5
--- /dev/null
+++ b/src/nlp/mod.rs
@@ -0,0 +1,8 @@
+// NLP module for BoltAI
+pub mod ner;
+pub mod sentiment;
+pub mod summarization;
+
+pub use ner::extract_entities;
+pub use sentiment::analyze_sentiment;
+pub use summarization::summarize_text;
diff --git a/src/nlp/ner.rs b/src/nlp/ner.rs
new file mode 100644
index 0000000..ce2949c
--- /dev/null
+++ b/src/nlp/ner.rs
@@ -0,0 +1,221 @@
+// Named Entity Recognition module using pattern-based approach
+// This is a lightweight implementation that uses regex patterns to identify entities.
+// For production use, consider integrating rust-bert when libtorch is available.
+use anyhow::{anyhow, Result};
+use regex::Regex;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::Read;
+use std::path::Path;
+use once_cell::sync::Lazy;
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Entity {
+    pub word: String,
+    pub label: String,
+    pub score: f32,
+    pub start: usize,
+    pub end: usize,
+}
+
+// Regex patterns for common entity types
+static PERSON_PATTERN: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b").unwrap()
+});
+
+static ORGANIZATION_PATTERN: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"\b([A-Z][a-z]+(?:\s+(?:Inc|LLC|Corp|Corporation|Ltd|Limited|Company|Co|Group|Institute|University|College)\.?))\b").unwrap()
+});
+
+static LOCATION_PATTERN: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"\b((?:United States|USA|UK|United Kingdom|New York|California|Texas|London|Paris|Tokyo|Beijing|Washington|Chicago|Los Angeles|San Francisco|Boston|Seattle|Miami|Austin|Denver|Portland|Atlanta))\b").unwrap()
+});
+
+static EMAIL_PATTERN: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b").unwrap()
+});
+
+static DATE_PATTERN: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4})\b").unwrap()
+});
+
+static MONEY_PATTERN: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"\$\s*\d+(?:,\d{3})*(?:\.\d{2})?|\d+(?:,\d{3})*(?:\.\d{2})?\s*(?:USD|EUR|GBP|dollars?|euros?|pounds?)").unwrap()
+});
+
+pub fn extract_entities(file_path: &Path) -> Result<Vec<Entity>> {
+    let text = read_file(file_path)?;
+    extract_entities_from_text(&text)
+}
+
+pub fn extract_entities_from_text(text: &str) -> Result<Vec<Entity>> {
+    let mut entities = Vec::new();
+    // Patterns run in a fixed order below; results are returned sorted by start offset.
+    // Strings already emitted under ANY label; a repeated string is skipped, so first match wins
+    let mut seen: HashMap<String, bool> = HashMap::new();
+    
+    // Extract emails (capture group 1; fixed score 0.95)
+    for cap in EMAIL_PATTERN.captures_iter(text) {
+        if let Some(m) = cap.get(1) {
+            let word = m.as_str().to_string();
+            if !seen.contains_key(&word) {
+                seen.insert(word.clone(), true);
+                entities.push(Entity {
+                    word,
+                    label: "EMAIL".to_string(),
+                    score: 0.95,
+                    start: m.start(),
+                    end: m.end(),
+                });
+            }
+        }
+    }
+    
+    // Extract dates (whole match: numeric forms with / or -, or forms like "Jan 15, 2024")
+    for cap in DATE_PATTERN.captures_iter(text) {
+        if let Some(m) = cap.get(0) {
+            let word = m.as_str().to_string();
+            if !seen.contains_key(&word) {
+                seen.insert(word.clone(), true);
+                entities.push(Entity {
+                    word,
+                    label: "DATE".to_string(),
+                    score: 0.90,
+                    start: m.start(),
+                    end: m.end(),
+                });
+            }
+        }
+    }
+    
+    // Extract money (whole match: "$" amounts, or amounts followed by a currency code or word)
+    for cap in MONEY_PATTERN.captures_iter(text) {
+        if let Some(m) = cap.get(0) {
+            let word = m.as_str().to_string();
+            if !seen.contains_key(&word) {
+                seen.insert(word.clone(), true);
+                entities.push(Entity {
+                    word,
+                    label: "MONEY".to_string(),
+                    score: 0.90,
+                    start: m.start(),
+                    end: m.end(),
+                });
+            }
+        }
+    }
+    
+    // Extract locations (only the fixed list of place names hard-coded in LOCATION_PATTERN)
+    for cap in LOCATION_PATTERN.captures_iter(text) {
+        if let Some(m) = cap.get(1) {
+            let word = m.as_str().to_string();
+            if !seen.contains_key(&word) {
+                seen.insert(word.clone(), true);
+                entities.push(Entity {
+                    word,
+                    label: "LOCATION".to_string(),
+                    score: 0.85,
+                    start: m.start(),
+                    end: m.end(),
+                });
+            }
+        }
+    }
+    
+    // Extract organizations (a single capitalized word followed by a suffix such as Inc or Corp)
+    for cap in ORGANIZATION_PATTERN.captures_iter(text) {
+        if let Some(m) = cap.get(1) {
+            let word = m.as_str().to_string();
+            if !seen.contains_key(&word) {
+                seen.insert(word.clone(), true);
+                entities.push(Entity {
+                    word,
+                    label: "ORGANIZATION".to_string(),
+                    score: 0.80,
+                    start: m.start(),
+                    end: m.end(),
+                });
+            }
+        }
+    }
+    
+    // Extract person names last: any run of two or more capitalized words not yet claimed in `seen`
+    for cap in PERSON_PATTERN.captures_iter(text) {
+        if let Some(m) = cap.get(1) {
+            let word = m.as_str().to_string();
+            // Skip runs containing an organization marker (case-sensitive substring) or already seen
+            if !word.contains("Inc") && !word.contains("Corp") && 
+               !word.contains("LLC") && !word.contains("Ltd") &&
+               !word.contains("University") && !word.contains("College") &&
+               !seen.contains_key(&word) {
+                seen.insert(word.clone(), true);
+                entities.push(Entity {
+                    word,
+                    label: "PERSON".to_string(),
+                    score: 0.75,
+                    start: m.start(),
+                    end: m.end(),
+                });
+            }
+        }
+    }
+    
+    // Sort by the match's start offset so the result follows document order
+    entities.sort_by_key(|e| e.start);
+    
+    Ok(entities)
+}
+
+fn read_file(path: &Path) -> Result<String> {
+    let ext = path.extension().and_then(|s| s.to_str()).unwrap_or("");
+    // Dispatch on the exact (case-sensitive) extension; a missing or non-UTF-8 one becomes "".
+    match ext {
+        "txt" | "md" | "csv" | "json" => {
+            let mut file = File::open(path)?;
+            let mut content = String::new();
+            file.read_to_string(&mut content)?;
+            Ok(content)
+        }
+        "pdf" => {
+            pdf_extract::extract_text(path)
+                .map_err(|e| anyhow!("PDF extraction failed: {}", e))
+        }
+        _ => Err(anyhow!("Unsupported file format: {}", ext)),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_extract_entities_from_text() {
+        let text = "Barack Obama was born in Hawaii. He worked in Chicago and later became the 44th President of the United States.";
+        let result = extract_entities_from_text(text);
+        assert!(result.is_ok());
+        let entities = result.unwrap();
+        // Should find entities
+        assert!(!entities.is_empty());
+        // Should find locations
+        assert!(entities.iter().any(|e| e.label == "LOCATION"));
+    }
+
+    #[test]
+    fn test_extract_email() {
+        let text = "Contact us at support@example.com for more information.";
+        let result = extract_entities_from_text(text);
+        assert!(result.is_ok());
+        let entities = result.unwrap();
+        assert!(entities.iter().any(|e| e.label == "EMAIL"));
+    }
+
+    #[test]
+    fn test_extract_date() {
+        let text = "The meeting is scheduled for Jan 15, 2024.";
+        let result = extract_entities_from_text(text);
+        assert!(result.is_ok());
+        let entities = result.unwrap();
+        assert!(entities.iter().any(|e| e.label == "DATE"));
+    }
+}
diff --git a/src/nlp/sentiment.rs b/src/nlp/sentiment.rs
new file mode 100644
index 0000000..cd9b79f
--- /dev/null
+++ b/src/nlp/sentiment.rs
@@ -0,0 +1,197 @@
+// Sentiment Analysis module using lexicon-based approach
+// This is a lightweight implementation that uses word lists to determine sentiment.
+// For production use, consider integrating rust-bert when libtorch is available.
+use anyhow::{anyhow, Result};
+use serde::{Deserialize, Serialize};
+use std::collections::HashSet;
+use std::fs::File;
+use std::io::Read;
+use std::path::Path;
+use once_cell::sync::Lazy;
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Sentiment {
+    pub label: String,
+    pub score: f32,
+}
+
+// Positive words lexicon
+static POSITIVE_WORDS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+    [
+        "good", "great", "excellent", "wonderful", "fantastic", "amazing", "awesome",
+        "love", "happy", "joy", "pleased", "delighted", "satisfied", "perfect",
+        "beautiful", "brilliant", "outstanding", "superb", "magnificent", "marvelous",
+        "terrific", "fabulous", "exceptional", "impressive", "remarkable", "best",
+        "better", "positive", "advantage", "benefit", "success", "successful",
+        "win", "winner", "winning", "accomplished", "achievement", "triumph",
+        "enjoy", "pleasant", "comfortable", "excited", "exciting", "thrilled",
+        "approve", "approved", "approval", "like", "liked", "favorite", "prefer"
+    ].iter().copied().collect()
+});
+
+// Negative words lexicon
+static NEGATIVE_WORDS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+    [
+        "bad", "terrible", "awful", "horrible", "poor", "worst", "worse",
+        "hate", "angry", "sad", "upset", "disappointed", "dissatisfied", "unhappy",
+        "fail", "failure", "failed", "problem", "issue", "wrong", "error",
+        "difficult", "hard", "tough", "struggle", "struggling", "broken",
+        "pain", "painful", "hurt", "hurting", "damage", "damaged", "disaster",
+        "negative", "loss", "lose", "losing", "lost", "defeat", "defeated",
+        "reject", "rejected", "rejection", "dislike", "disliked", "unpleasant",
+        "uncomfortable", "disappointing", "frustrate", "frustrated", "frustrating"
+    ].iter().copied().collect()
+});
+
+// Intensifiers
+static INTENSIFIERS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+    ["very", "extremely", "absolutely", "really", "incredibly", "highly", "totally"]
+        .iter().copied().collect()
+});
+
+// Negation words
+static NEGATIONS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+    ["not", "no", "never", "nothing", "nobody", "nowhere", "neither", "nor", "none"]
+        .iter().copied().collect()
+});
+
+pub fn analyze_sentiment(file_path: &Path) -> Result<Vec<Sentiment>> {
+    let text = read_file(file_path)?;
+    analyze_sentiment_text(&text)
+}
+
+pub fn analyze_sentiment_text(text: &str) -> Result<Vec<Sentiment>> {
+    let words: Vec<String> = text
+        .to_lowercase()
+        .split(|c: char| !c.is_alphanumeric() && c != '\'')
+        .filter(|s| !s.is_empty())
+        .map(String::from)
+        .collect();
+    // `words` holds the lowercased tokens: maximal runs of alphanumerics and apostrophes.
+    let mut positive_score = 0.0;
+    let mut negative_score = 0.0;
+    
+    let mut i = 0;
+    while i < words.len() {
+        let word = &words[i];
+        let mut multiplier = 1.0;
+        
+        // An intensifier immediately before this word raises its weight from 1.0 to 1.5
+        if i > 0 && INTENSIFIERS.contains(words[i - 1].as_str()) {
+            multiplier = 1.5;
+        }
+        
+        // A negation one or two words back swaps which side this word's weight is added to
+        let is_negated = (i > 0 && NEGATIONS.contains(words[i - 1].as_str())) ||
+                        (i > 1 && NEGATIONS.contains(words[i - 2].as_str()));
+        
+        if POSITIVE_WORDS.contains(word.as_str()) {
+            if is_negated {
+                negative_score += 1.0 * multiplier;
+            } else {
+                positive_score += 1.0 * multiplier;
+            }
+        } else if NEGATIVE_WORDS.contains(word.as_str()) {
+            if is_negated {
+                positive_score += 1.0 * multiplier;
+            } else {
+                negative_score += 1.0 * multiplier;
+            }
+        }
+        
+        i += 1;
+    }
+    
+    // No lexicon hits means Neutral; otherwise a side wins only if its share leads by more than 0.1
+    let total_score = positive_score + negative_score;
+    let sentiment = if total_score == 0.0 {
+        Sentiment {
+            label: "Neutral".to_string(),
+            score: 0.5,
+        }
+    } else {
+        let pos_ratio = positive_score / total_score;
+        let neg_ratio = negative_score / total_score;
+        // The reported score is the winning side's share of the total; Neutral always reports 0.5.
+        if pos_ratio > neg_ratio + 0.1 {
+            Sentiment {
+                label: "Positive".to_string(),
+                score: pos_ratio,
+            }
+        } else if neg_ratio > pos_ratio + 0.1 {
+            Sentiment {
+                label: "Negative".to_string(),
+                score: neg_ratio,
+            }
+        } else {
+            Sentiment {
+                label: "Neutral".to_string(),
+                score: 0.5,
+            }
+        }
+    };
+    // Always a single-element vector: one overall sentiment for the whole text.
+    Ok(vec![sentiment])
+}
+
+fn read_file(path: &Path) -> Result<String> {
+    let ext = path.extension().and_then(|s| s.to_str()).unwrap_or("");
+    // Dispatch on the exact (case-sensitive) extension; a missing or non-UTF-8 one becomes "".
+    match ext {
+        "txt" | "md" | "csv" | "json" => {
+            let mut file = File::open(path)?;
+            let mut content = String::new();
+            file.read_to_string(&mut content)?;
+            Ok(content)
+        }
+        "pdf" => {
+            pdf_extract::extract_text(path)
+                .map_err(|e| anyhow!("PDF extraction failed: {}", e))
+        }
+        _ => Err(anyhow!("Unsupported file format: {}", ext)),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_analyze_sentiment_positive() {
+        let positive_text = "This is a wonderful day! I'm feeling great and happy!";
+        let result = analyze_sentiment_text(positive_text);
+        assert!(result.is_ok());
+        let sentiments = result.unwrap();
+        assert!(!sentiments.is_empty());
+        assert_eq!(sentiments[0].label, "Positive");
+    }
+
+    #[test]
+    fn test_analyze_sentiment_negative() {
+        let negative_text = "This is terrible and awful. I hate it!";
+        let result = analyze_sentiment_text(negative_text);
+        assert!(result.is_ok());
+        let sentiments = result.unwrap();
+        assert!(!sentiments.is_empty());
+        assert_eq!(sentiments[0].label, "Negative");
+    }
+
+    #[test]
+    fn test_analyze_sentiment_neutral() {
+        let neutral_text = "The sky is blue. The grass is green.";
+        let result = analyze_sentiment_text(neutral_text);
+        assert!(result.is_ok());
+        let sentiments = result.unwrap();
+        assert!(!sentiments.is_empty());
+        assert_eq!(sentiments[0].label, "Neutral");
+    }
+
+    #[test]
+    fn test_negation_handling() {
+        let negated_text = "This is not good at all.";
+        let result = analyze_sentiment_text(negated_text);
+        assert!(result.is_ok());
+        let sentiments = result.unwrap();
+        assert_eq!(sentiments[0].label, "Negative");
+    }
+}
diff --git a/src/nlp/summarization.rs b/src/nlp/summarization.rs
new file mode 100644
index 0000000..17214f4
--- /dev/null
+++ b/src/nlp/summarization.rs
@@ -0,0 +1,174 @@
+// Text summarization module using an extractive approach.
+// This lightweight implementation scores sentences and extracts the highest-ranked ones.
+// For production use, consider integrating rust-bert when libtorch is available.
+use anyhow::{anyhow, Result};
+use regex::Regex;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::Read;
+use std::path::Path;
+use once_cell::sync::Lazy;
+
+/// Matches one sentence: a run of characters other than `.`, `!` and `?`,
+/// followed by one or more of those terminators.
+static SENTENCE_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"[^.!?]+[.!?]+").unwrap());
+
+/// Matches one word: a run of ASCII letters, ASCII digits or apostrophes.
+/// Any other character acts as a word separator.
+static WORD_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"[a-zA-Z0-9']+").unwrap());
+
+/// Common English words that are ignored when word frequencies are counted for sentence scoring.
+static STOP_WORDS: Lazy<std::collections::HashSet<&'static str>> = Lazy::new(|| {
+    [
+        "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
+        "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
+        "to", "was", "will", "with", "the", "this", "but", "they", "have",
+        "had", "what", "when", "where", "who", "which", "why", "how"
+    ].iter().copied().collect()
+});
+
+/// Reads the file at `file_path` and returns an extractive summary of its text.
+pub fn summarize_text(file_path: &Path) -> Result<String> {
+    read_file(file_path).and_then(|contents| summarize_text_content(&contents))
+}
+
+/// Builds an extractive summary of `text` by ranking sentences on word frequency.
+///
+/// Sentences end at `.`, `!` or `?`; trailing text without a terminator still
+/// counts as a sentence when it contains a word. Input without any sentence
+/// yields `"(No content to summarize)"`, and input with three or fewer
+/// sentences is returned unchanged.
+///
+/// Otherwise every sentence is scored by the mean weight of its words, where a
+/// weight is the word's frequency rescaled to 0-100 and stop words or words
+/// shorter than three characters weigh nothing; the first sentence is boosted
+/// by 1.5x. The top 30% of sentences (at least 2, at most 5) are joined with
+/// single spaces in document order.
+///
+/// No path in this function currently returns `Err`.
+pub fn summarize_text_content(text: &str) -> Result<String> {
+    // Collect terminated sentences and remember where the last one ended.
+    let mut tail_start = 0;
+    let mut sentences: Vec<&str> = Vec::new();
+    for m in SENTENCE_PATTERN.find_iter(text) {
+        tail_start = m.end();
+        // A match always contains a terminator, so trimming cannot empty it.
+        sentences.push(m.as_str().trim());
+    }
+
+    // Keep a final unterminated sentence instead of silently dropping it.
+    let tail = text[tail_start..].trim();
+    if WORD_PATTERN.is_match(tail) {
+        sentences.push(tail);
+    }
+
+    if sentences.is_empty() {
+        return Ok("(No content to summarize)".to_string());
+    }
+
+    // Short input is already its own summary.
+    if sentences.len() <= 3 {
+        return Ok(text.to_string());
+    }
+
+    // Tokenize each sentence once; the tokens feed both counting and scoring.
+    let tokenized: Vec<Vec<String>> = sentences
+        .iter()
+        .map(|s| WORD_PATTERN.find_iter(s).map(|w| w.as_str().to_lowercase()).collect())
+        .collect();
+
+    // Count content words: no stop words, at least three characters long.
+    let mut word_freq: HashMap<&str, usize> = HashMap::new();
+    for word in tokenized.iter().flatten() {
+        if word.len() > 2 && !STOP_WORDS.contains(word.as_str()) {
+            *word_freq.entry(word.as_str()).or_insert(0) += 1;
+        }
+    }
+
+    // Rescale the counts so that the most frequent word weighs 100.
+    let max_freq = word_freq.values().max().copied().unwrap_or(1);
+    for freq in word_freq.values_mut() {
+        *freq = *freq * 100 / max_freq;
+    }
+
+    // Score = mean weight per word, so long sentences are not favoured.
+    let mut sentence_scores: Vec<(usize, usize)> = tokenized
+        .iter()
+        .enumerate()
+        .map(|(idx, words)| {
+            let total: usize = words.iter().filter_map(|w| word_freq.get(w.as_str())).sum();
+            let mean = if words.is_empty() { 0 } else { total / words.len() };
+            // Opening sentences often carry key information: boost by 1.5x.
+            (idx, if idx == 0 { mean + mean / 2 } else { mean })
+        })
+        .collect();
+
+    // Stable sort, so equally scored sentences keep their document order.
+    sentence_scores.sort_by(|a, b| b.1.cmp(&a.1));
+
+    // Keep the top 30% of sentences, but never fewer than 2 or more than 5.
+    let num_summary_sentences = (sentences.len() * 30 / 100).clamp(2, 5);
+    let mut selected_indices: Vec<usize> =
+        sentence_scores.iter().take(num_summary_sentences).map(|&(idx, _)| idx).collect();
+
+    // Present the chosen sentences in document order.
+    selected_indices.sort_unstable();
+    let summary: Vec<&str> = selected_indices.iter().map(|&idx| sentences[idx]).collect();
+    Ok(summary.join(" "))
+}
+
+/// Loads the text of `path`, dispatching on the file extension.
+///
+/// `txt`, `md`, `csv` and `json` files are read as UTF-8 text; `pdf` files go
+/// through `pdf_extract::extract_text`. Any other extension, or none, is an error.
+fn read_file(path: &Path) -> Result<String> {
+    let ext = path.extension().and_then(|s| s.to_str()).unwrap_or("");
+    if ext == "pdf" {
+        return pdf_extract::extract_text(path)
+            .map_err(|e| anyhow!("PDF extraction failed: {}", e));
+    }
+    if !matches!(ext, "txt" | "md" | "csv" | "json") {
+        return Err(anyhow!("Unsupported file format: {}", ext));
+    }
+    let mut content = String::new();
+    File::open(path)?.read_to_string(&mut content)?;
+    Ok(content)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Four sentences are more than the three that are passed through untouched,
+    /// so the result must be a non-empty summary strictly shorter than the input.
+    #[test]
+    fn test_summarize_text_content() {
+        let long_text = "Natural language processing is a field of artificial intelligence \
+                        that focuses on the interaction between computers and humans through \
+                        natural language. The ultimate objective of NLP is to read, decipher, \
+                        understand, and make sense of the human languages in a manner that is valuable. \
+                        NLP combines computational linguistics with statistical models and machine learning. \
+                        Applications include translation, sentiment analysis, and chatbots.";
+        let summary = summarize_text_content(long_text).unwrap();
+        assert!(!summary.is_empty());
+        assert!(summary.len() < long_text.len());
+    }
+
+    /// A single sentence is within the pass-through limit, so it is expected
+    /// to come back exactly as it went in.
+    #[test]
+    fn test_summarize_short_text() {
+        let short_text = "This is a short text.";
+        let summary = summarize_text_content(short_text).unwrap();
+        assert_eq!(summary, short_text);
+    }
+
+    /// Empty input contains no sentence and is expected to produce the fixed
+    /// placeholder message.
+    #[test]
+    fn test_summarize_empty_text() {
+        let empty_text = "";
+        let summary = summarize_text_content(empty_text).unwrap();
+        assert_eq!(summary, "(No content to summarize)");
+    }
+}

From 4e97eef98b24bce14485c548def4086197e37a82 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 20 Oct 2025 03:24:24 +0000
Subject: [PATCH 3/3] Fix code review issues: use HashSet for duplicates,
 remove duplicate stop word

Co-authored-by: wesleyscholl <128409641+wesleyscholl@users.noreply.github.com>
---
 src/nlp/ner.rs           | 28 ++++++++++++++--------------
 src/nlp/summarization.rs |  2 +-
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/nlp/ner.rs b/src/nlp/ner.rs
index ce2949c..32b057b 100644
--- a/src/nlp/ner.rs
+++ b/src/nlp/ner.rs
@@ -4,7 +4,7 @@
 use anyhow::{anyhow, Result};
 use regex::Regex;
 use serde::{Deserialize, Serialize};
-use std::collections::HashMap;
+use std::collections::HashSet;
 use std::fs::File;
 use std::io::Read;
 use std::path::Path;
@@ -53,14 +53,14 @@ pub fn extract_entities_from_text(text: &str) -> Result<Vec<Entity>> {
     let mut entities = Vec::new();
     
     // Track seen entities to avoid duplicates
-    let mut seen: HashMap<String, bool> = HashMap::new();
+    let mut seen: HashSet<String> = HashSet::new();
     
     // Extract emails
     for cap in EMAIL_PATTERN.captures_iter(text) {
         if let Some(m) = cap.get(1) {
             let word = m.as_str().to_string();
-            if !seen.contains_key(&word) {
-                seen.insert(word.clone(), true);
+            if !seen.contains(&word) {
+                seen.insert(word.clone());
                 entities.push(Entity {
                     word,
                     label: "EMAIL".to_string(),
@@ -76,8 +76,8 @@ pub fn extract_entities_from_text(text: &str) -> Result<Vec<Entity>> {
     for cap in DATE_PATTERN.captures_iter(text) {
         if let Some(m) = cap.get(0) {
             let word = m.as_str().to_string();
-            if !seen.contains_key(&word) {
-                seen.insert(word.clone(), true);
+            if !seen.contains(&word) {
+                seen.insert(word.clone());
                 entities.push(Entity {
                     word,
                     label: "DATE".to_string(),
@@ -93,8 +93,8 @@ pub fn extract_entities_from_text(text: &str) -> Result<Vec<Entity>> {
     for cap in MONEY_PATTERN.captures_iter(text) {
         if let Some(m) = cap.get(0) {
             let word = m.as_str().to_string();
-            if !seen.contains_key(&word) {
-                seen.insert(word.clone(), true);
+            if !seen.contains(&word) {
+                seen.insert(word.clone());
                 entities.push(Entity {
                     word,
                     label: "MONEY".to_string(),
@@ -110,8 +110,8 @@ pub fn extract_entities_from_text(text: &str) -> Result<Vec<Entity>> {
     for cap in LOCATION_PATTERN.captures_iter(text) {
         if let Some(m) = cap.get(1) {
             let word = m.as_str().to_string();
-            if !seen.contains_key(&word) {
-                seen.insert(word.clone(), true);
+            if !seen.contains(&word) {
+                seen.insert(word.clone());
                 entities.push(Entity {
                     word,
                     label: "LOCATION".to_string(),
@@ -127,8 +127,8 @@ pub fn extract_entities_from_text(text: &str) -> Result<Vec<Entity>> {
     for cap in ORGANIZATION_PATTERN.captures_iter(text) {
         if let Some(m) = cap.get(1) {
             let word = m.as_str().to_string();
-            if !seen.contains_key(&word) {
-                seen.insert(word.clone(), true);
+            if !seen.contains(&word) {
+                seen.insert(word.clone());
                 entities.push(Entity {
                     word,
                     label: "ORGANIZATION".to_string(),
@@ -148,8 +148,8 @@ pub fn extract_entities_from_text(text: &str) -> Result<Vec<Entity>> {
             if !word.contains("Inc") && !word.contains("Corp") && 
                !word.contains("LLC") && !word.contains("Ltd") &&
                !word.contains("University") && !word.contains("College") &&
-               !seen.contains_key(&word) {
-                seen.insert(word.clone(), true);
+               !seen.contains(&word) {
+                seen.insert(word.clone());
                 entities.push(Entity {
                     word,
                     label: "PERSON".to_string(),
diff --git a/src/nlp/summarization.rs b/src/nlp/summarization.rs
index 17214f4..1be3392 100644
--- a/src/nlp/summarization.rs
+++ b/src/nlp/summarization.rs
@@ -22,7 +22,7 @@ static STOP_WORDS: Lazy<std::collections::HashSet<&'static str>> = Lazy::new(||
     [
         "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
         "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
-        "to", "was", "will", "with", "the", "this", "but", "they", "have",
+        "to", "was", "will", "with", "this", "but", "they", "have",
         "had", "what", "when", "where", "who", "which", "why", "how"
     ].iter().copied().collect()
 });