diff --git a/src/cli.rs b/src/cli.rs index c3a6400..20dd7d9 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -4,7 +4,7 @@ use url::Url; #[derive(Debug, Clone)] pub struct CliArgs { pub domain: String, - pub prep: bool, + pub prep: Option, } impl CliArgs { @@ -22,10 +22,11 @@ impl CliArgs { .arg( Arg::new("prep") .long("prep") + .value_name("FILE") .help( - "Enable preparation mode to discover template patterns across domain pages", + "Enable preparation mode to discover template patterns across domain pages. Saves detected template paths to the specified JSON file", ) - .action(clap::ArgAction::SetTrue), + .required(false), ) .get_matches(); @@ -34,7 +35,7 @@ impl CliArgs { .ok_or("Domain argument is required")?; let validated_domain = Self::extract_domain(domain_input)?; - let prep = matches.get_flag("prep"); + let prep = matches.get_one::("prep").cloned(); Ok(CliArgs { domain: validated_domain, @@ -74,11 +75,11 @@ mod tests { // Test that single domain parsing works correctly let args = CliArgs { domain: "example.com".to_string(), - prep: false, + prep: None, }; assert_eq!(args.domain, "example.com"); - assert!(!args.prep); + assert!(args.prep.is_none()); } #[test] @@ -124,10 +125,11 @@ mod tests { // since we can't easily test the full CLI parsing in unit tests) let args = CliArgs { domain: "example.com".to_string(), - prep: true, + prep: Some("templates.json".to_string()), }; - assert!(args.prep); + assert!(args.prep.is_some()); + assert_eq!(args.prep.unwrap(), "templates.json"); assert_eq!(args.domain, "example.com"); } } diff --git a/src/main.rs b/src/main.rs index 7cfb33d..f84efbb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -57,7 +57,7 @@ async fn main() { // Phase 1: URL Discovery - find additional URLs for each domain info!("Starting URL discovery for domains"); - let max_urls_per_domain = if args.prep { 10 } else { 3 }; + let max_urls_per_domain = if args.prep.is_some() { 10 } else { 3 }; // Discover additional URLs for the domain let domain = &args.domain; @@ -135,7 +135,7 @@ async fn main() { } // Phase 3: Template analysis (prep mode) or standard duplicate analysis - if args.prep { + if let Some(prep_file) = &args.prep { info!("Running template detection analysis in prep mode"); let mut combined_store = TemplatePathStore::new(); let template_detector = TemplateDetector::new(); @@ -151,10 +151,18 @@ async fn main() { } } + let validated_paths = combined_store.get_validated_paths(); info!( - "Template analysis complete, found {} unique template paths", - combined_store.get_paths().len() + "Template analysis complete, found {} total template paths, {} validated (>=2 elements)", + combined_store.get_paths().len(), + validated_paths.len() ); + + // Write template data to JSON file + match std::fs::write(prep_file, combined_store.to_validated_serialized_string()) { + Ok(_) => info!("Template data written to {}", prep_file), + Err(e) => error!("Failed to write template data to {}: {}", prep_file, e), + } } else { info!("Running standard duplicate analysis"); @@ -178,8 +186,8 @@ async fn main() { let _ = browser.close().await; - if args.prep { - // In prep mode, output detected template paths in serialized format + if args.prep.is_some() { + // In prep mode, output template count summary to terminal println!("\n=== Template Path Detection Results ==="); let mut combined_store = TemplatePathStore::new(); @@ -210,8 +218,17 @@ async fn main() { } } - println!("\nDetected Template Paths (Rust-serializable format):"); - println!("{}", combined_store.to_serialized_string()); + let validated_paths = combined_store.get_validated_paths(); + println!("\nTemplate Detection Summary:"); + println!( + " Total template paths found: {}", + combined_store.get_paths().len() + ); + println!( + " Validated template paths (>=2 elements): {}", + validated_paths.len() + ); + println!(" Template data saved to: {}", args.prep.as_ref().unwrap()); } } else { // Regular mode - show crawling results diff --git a/src/template_detection.rs b/src/template_detection.rs index d3d9516..e5e1887 100644 --- a/src/template_detection.rs +++ b/src/template_detection.rs @@ -20,16 +20,30 @@ pub struct ElementPath { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct TemplatePathStore { pub detected_paths: HashSet, + // Track how many HTML elements each template pattern appears in + template_counts: HashMap, +} + +/// Simplified store for JSON output containing only detected paths +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ValidatedTemplatePathStore { + pub detected_paths: HashSet, } impl TemplatePathStore { pub fn new() -> Self { Self { detected_paths: HashSet::new(), + template_counts: HashMap::new(), } } pub fn add_path(&mut self, path: ElementPath) { + // Track the count for this template pattern + *self + .template_counts + .entry(path.template_pattern.clone()) + .or_insert(0) += 1; self.detected_paths.insert(path); } @@ -37,9 +51,30 @@ impl TemplatePathStore { &self.detected_paths } + /// Get validated paths that appear in at least 2 HTML elements + pub fn get_validated_paths(&self) -> HashSet { + self.detected_paths + .iter() + .filter(|path| { + self.template_counts + .get(&path.template_pattern) + .is_some_and(|&count| count >= 2) + }) + .cloned() + .collect() + } + pub fn to_serialized_string(&self) -> String { serde_json::to_string_pretty(self).unwrap_or_default() } + + /// Get serialized string with only validated paths (for prep mode) + pub fn to_validated_serialized_string(&self) -> String { + let validated_store = ValidatedTemplatePathStore { + detected_paths: self.get_validated_paths(), + }; + serde_json::to_string_pretty(&validated_store).unwrap_or_default() + } } impl Default for TemplatePathStore { @@ -268,6 +303,11 @@ impl TemplateDetector { return false; } + // Limit templates to 5 parts maximum (words/numbers/floats) + if words.len() > 5 { + return false; + } + // Check for known patterns for word in &words { let lowercase = word.to_lowercase(); @@ -456,6 +496,38 @@ mod tests { } } + #[test] + fn test_five_part_limit() { + let detector = TemplateDetector::new(); + + // Valid: 5 parts or less + let valid_patterns = vec![ + "42 comments", // 2 parts + "Posted 2 hours ago", // 3 parts + "Page 5 of 100 items", // 5 parts + ]; + + for input in valid_patterns { + assert!( + detector.detect_template(input).is_some(), + "Should detect pattern for: {input}" + ); + } + + // Invalid: More than 5 parts + let invalid_patterns = vec![ + "This is a very long sentence with 42 comments here", // 10 parts + "Posted by user 123 about 2 hours ago today", // 8 parts + ]; + + for input in invalid_patterns { + assert!( + detector.detect_template(input).is_none(), + "Should NOT detect pattern for long sentence: {input}" + ); + } + } + #[test] fn test_apply_template() { let detector = TemplateDetector::new(); @@ -469,15 +541,18 @@ mod tests { fn test_edge_cases() { let detector = TemplateDetector::new(); - // Multiple numbers - should pick the first one that makes sense - let template = detector - .detect_template("Posted 2 hours ago by user123") - .unwrap(); - assert_eq!(template.pattern, "Posted {time} hours ago by user123"); + // Multiple numbers within 5-part limit + let template = detector.detect_template("Posted 2 hours ago").unwrap(); + assert_eq!(template.pattern, "Posted {time} hours ago"); - // Complex patterns + // Complex patterns within limit let template = detector.detect_template("Page 5 of 100").unwrap(); assert_eq!(template.pattern, "Page {count} of 100"); + + // Test that patterns exceeding 5 parts are rejected + assert!(detector + .detect_template("Posted 2 hours ago by user123") + .is_none()); } #[test] @@ -746,4 +821,101 @@ mod tests { assert_eq!(body1.children[0].content, body2.children[0].content); assert_eq!(body1.children[1].content, body2.children[1].content); } + + #[test] + fn test_template_count_validation() { + let mut store = TemplatePathStore::new(); + + // Add the same template pattern multiple times (simulating multiple HTML elements) + let template_pattern = "{count} comments".to_string(); + + // Add template path 1 time (should not be validated) + let path = ElementPath { + components: vec![ElementPathComponent { + tag: "div".to_string(), + classes: vec!["comment-0".to_string()], + }], + template_pattern: template_pattern.clone(), + }; + store.add_path(path); + + // Should not be validated (only 1 element) + assert_eq!(store.get_validated_paths().len(), 0); + + // Add 1 more time (total 2, should be validated) + let path = ElementPath { + components: vec![ElementPathComponent { + tag: "div".to_string(), + classes: vec!["comment-1".to_string()], + }], + template_pattern: template_pattern.clone(), + }; + store.add_path(path); + + // Should now be validated (>=2 elements) + assert_eq!(store.get_validated_paths().len(), 2); + + // Add 2 more times (total 4, should still be validated) + for i in 2..4 { + let path = ElementPath { + components: vec![ElementPathComponent { + tag: "div".to_string(), + classes: vec![format!("comment-{}", i)], + }], + template_pattern: template_pattern.clone(), + }; + store.add_path(path); + } + + // Should still be validated (4 elements >= 2) + assert_eq!(store.get_validated_paths().len(), 4); + + // Test that different template patterns are counted separately + let different_pattern = "{time} hours ago".to_string(); + let path = ElementPath { + components: vec![ElementPathComponent { + tag: "span".to_string(), + classes: vec!["timestamp".to_string()], + }], + template_pattern: different_pattern, + }; + store.add_path(path); + + // Still only 4 validated paths (the new pattern appears only once, count < 2) + assert_eq!(store.get_validated_paths().len(), 4); + } + + #[test] + fn test_validated_json_output_structure() { + let mut store = TemplatePathStore::new(); + + // Add some template paths + let template_pattern = "{count} comments".to_string(); + for i in 0..3 { + let path = ElementPath { + components: vec![ElementPathComponent { + tag: "div".to_string(), + classes: vec![format!("comment-{}", i)], + }], + template_pattern: template_pattern.clone(), + }; + store.add_path(path); + } + + // Get the JSON output + let json_output = store.to_validated_serialized_string(); + + // Parse the JSON to verify structure + let parsed: serde_json::Value = serde_json::from_str(&json_output).unwrap(); + + // Should have "detected_paths" field + assert!(parsed.get("detected_paths").is_some()); + + // Should NOT have "template_counts" field + assert!(parsed.get("template_counts").is_none()); + + // Should have the validated paths (3 elements >= 2) + let detected_paths = parsed["detected_paths"].as_array().unwrap(); + assert_eq!(detected_paths.len(), 3); + } }