From 30d0b4f753c5b1e810340be49ce56c7c46d2614d Mon Sep 17 00:00:00 2001 From: Sumit Datta Date: Wed, 9 Jul 2025 12:12:20 +0530 Subject: [PATCH 1/4] feat: improve template detection with stricter validation rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Limit templates to maximum 5 parts (words/numbers/floats) for better accuracy - Require a template pattern to appear in more than 3 HTML elements in prep stage to ensure template patterns are statistically significant - Update TemplatePathStore to track element counts per template pattern - Add get_validated_paths() method for templates appearing in >3 elements - Modify prep mode output to show only validated template paths - Add comprehensive tests for 5-part limit and element count validation - Update edge case tests to accommodate new stricter validation rules 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- src/main.rs | 10 +-- src/template_detection.rs | 134 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 134 insertions(+), 10 deletions(-) diff --git a/src/main.rs b/src/main.rs index 7cfb33d..1492647 100644 --- a/src/main.rs +++ b/src/main.rs @@ -151,9 +151,11 @@ async fn main() { } } + let validated_paths = combined_store.get_validated_paths(); info!( - "Template analysis complete, found {} unique template paths", - combined_store.get_paths().len() + "Template analysis complete, found {} total template paths, {} validated (>3 elements)", + combined_store.get_paths().len(), + validated_paths.len() ); } else { info!("Running standard duplicate analysis"); @@ -210,8 +212,8 @@ async fn main() { } } - println!("\nDetected Template Paths (Rust-serializable format):"); - println!("{}", combined_store.to_serialized_string()); + println!("\nValidated Template Paths (>3 elements, Rust-serializable format):"); + println!("{}", combined_store.to_validated_serialized_string()); } } else { // Regular mode - show crawling results diff --git 
a/src/template_detection.rs b/src/template_detection.rs index d3d9516..a5fe61c 100644 --- a/src/template_detection.rs +++ b/src/template_detection.rs @@ -20,16 +20,24 @@ pub struct ElementPath { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct TemplatePathStore { pub detected_paths: HashSet, + // Track how many HTML elements each template pattern appears in + template_counts: HashMap, } impl TemplatePathStore { pub fn new() -> Self { Self { detected_paths: HashSet::new(), + template_counts: HashMap::new(), } } pub fn add_path(&mut self, path: ElementPath) { + // Track the count for this template pattern + *self + .template_counts + .entry(path.template_pattern.clone()) + .or_insert(0) += 1; self.detected_paths.insert(path); } @@ -37,9 +45,31 @@ impl TemplatePathStore { &self.detected_paths } + /// Get validated paths that appear in more than 3 HTML elements + pub fn get_validated_paths(&self) -> HashSet { + self.detected_paths + .iter() + .filter(|path| { + self.template_counts + .get(&path.template_pattern) + .is_some_and(|&count| count > 3) + }) + .cloned() + .collect() + } + pub fn to_serialized_string(&self) -> String { serde_json::to_string_pretty(self).unwrap_or_default() } + + /// Get serialized string with only validated paths (for prep mode) + pub fn to_validated_serialized_string(&self) -> String { + let validated_store = TemplatePathStore { + detected_paths: self.get_validated_paths(), + template_counts: self.template_counts.clone(), + }; + serde_json::to_string_pretty(&validated_store).unwrap_or_default() + } } impl Default for TemplatePathStore { @@ -268,6 +298,11 @@ impl TemplateDetector { return false; } + // Limit templates to 5 parts maximum (words/numbers/floats) + if words.len() > 5 { + return false; + } + // Check for known patterns for word in &words { let lowercase = word.to_lowercase(); @@ -456,6 +491,38 @@ mod tests { } } + #[test] + fn test_five_part_limit() { + let detector = TemplateDetector::new(); + + // Valid: 5 parts or less 
+ let valid_patterns = vec![ + "42 comments", // 2 parts + "Posted 2 hours ago", // 3 parts + "Page 5 of 100 items", // 5 parts + ]; + + for input in valid_patterns { + assert!( + detector.detect_template(input).is_some(), + "Should detect pattern for: {input}" + ); + } + + // Invalid: More than 5 parts + let invalid_patterns = vec![ + "This is a very long sentence with 42 comments here", // 10 parts + "Posted by user 123 about 2 hours ago today", // 8 parts + ]; + + for input in invalid_patterns { + assert!( + detector.detect_template(input).is_none(), + "Should NOT detect pattern for long sentence: {input}" + ); + } + } + #[test] fn test_apply_template() { let detector = TemplateDetector::new(); @@ -469,15 +536,18 @@ mod tests { fn test_edge_cases() { let detector = TemplateDetector::new(); - // Multiple numbers - should pick the first one that makes sense - let template = detector - .detect_template("Posted 2 hours ago by user123") - .unwrap(); - assert_eq!(template.pattern, "Posted {time} hours ago by user123"); + // Multiple numbers within 5-part limit + let template = detector.detect_template("Posted 2 hours ago").unwrap(); + assert_eq!(template.pattern, "Posted {time} hours ago"); - // Complex patterns + // Complex patterns within limit let template = detector.detect_template("Page 5 of 100").unwrap(); assert_eq!(template.pattern, "Page {count} of 100"); + + // Test that patterns exceeding 5 parts are rejected + assert!(detector + .detect_template("Posted 2 hours ago by user123") + .is_none()); } #[test] @@ -746,4 +816,56 @@ mod tests { assert_eq!(body1.children[0].content, body2.children[0].content); assert_eq!(body1.children[1].content, body2.children[1].content); } + + #[test] + fn test_template_count_validation() { + let mut store = TemplatePathStore::new(); + + // Add the same template pattern multiple times (simulating multiple HTML elements) + let template_pattern = "{count} comments".to_string(); + + // Add template path 2 times (should not be 
validated) + for i in 0..2 { + let path = ElementPath { + components: vec![ElementPathComponent { + tag: "div".to_string(), + classes: vec![format!("comment-{}", i)], + }], + template_pattern: template_pattern.clone(), + }; + store.add_path(path); + } + + // Should not be validated (only 2 elements) + assert_eq!(store.get_validated_paths().len(), 0); + + // Add 2 more times (total 4, should be validated) + for i in 2..4 { + let path = ElementPath { + components: vec![ElementPathComponent { + tag: "div".to_string(), + classes: vec![format!("comment-{}", i)], + }], + template_pattern: template_pattern.clone(), + }; + store.add_path(path); + } + + // Should now be validated (>3 elements) + assert_eq!(store.get_validated_paths().len(), 4); + + // Test that different template patterns are counted separately + let different_pattern = "{time} hours ago".to_string(); + let path = ElementPath { + components: vec![ElementPathComponent { + tag: "span".to_string(), + classes: vec!["timestamp".to_string()], + }], + template_pattern: different_pattern, + }; + store.add_path(path); + + // Still only 4 validated paths (the new pattern appears only once) + assert_eq!(store.get_validated_paths().len(), 4); + } } From 4085c9b3ad1a0be792414bb6c4936b44912e011e Mon Sep 17 00:00:00 2001 From: Sumit Datta Date: Wed, 9 Jul 2025 12:20:05 +0530 Subject: [PATCH 2/4] fix: update template validation to require minimum 2 elements instead of 3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Change get_validated_paths() to filter templates with count >= 2 instead of > 3 - Update logging and output messages to reflect new minimum requirement - Update test_template_count_validation() to test the new 2-element minimum - Ensures prep stage only includes templates that appear at least twice 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- src/main.rs | 4 ++-- src/template_detection.rs | 45 
++++++++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/src/main.rs b/src/main.rs index 1492647..96644e4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -153,7 +153,7 @@ async fn main() { let validated_paths = combined_store.get_validated_paths(); info!( - "Template analysis complete, found {} total template paths, {} validated (>3 elements)", + "Template analysis complete, found {} total template paths, {} validated (>=2 elements)", combined_store.get_paths().len(), validated_paths.len() ); @@ -212,7 +212,7 @@ async fn main() { } } - println!("\nValidated Template Paths (>3 elements, Rust-serializable format):"); + println!("\nValidated Template Paths (>=2 elements, Rust-serializable format):"); println!("{}", combined_store.to_validated_serialized_string()); } } else { diff --git a/src/template_detection.rs b/src/template_detection.rs index a5fe61c..319fa94 100644 --- a/src/template_detection.rs +++ b/src/template_detection.rs @@ -45,14 +45,14 @@ impl TemplatePathStore { &self.detected_paths } - /// Get validated paths that appear in more than 3 HTML elements + /// Get validated paths that appear in at least 2 HTML elements pub fn get_validated_paths(&self) -> HashSet { self.detected_paths .iter() .filter(|path| { self.template_counts .get(&path.template_pattern) - .is_some_and(|&count| count > 3) + .is_some_and(|&count| count >= 2) }) .cloned() .collect() @@ -824,22 +824,33 @@ mod tests { // Add the same template pattern multiple times (simulating multiple HTML elements) let template_pattern = "{count} comments".to_string(); - // Add template path 2 times (should not be validated) - for i in 0..2 { - let path = ElementPath { - components: vec![ElementPathComponent { - tag: "div".to_string(), - classes: vec![format!("comment-{}", i)], - }], - template_pattern: template_pattern.clone(), - }; - store.add_path(path); - } + // Add template path 1 time (should not be validated) + let path = ElementPath { + components: 
vec![ElementPathComponent { + tag: "div".to_string(), + classes: vec!["comment-0".to_string()], + }], + template_pattern: template_pattern.clone(), + }; + store.add_path(path); - // Should not be validated (only 2 elements) + // Should not be validated (only 1 element) assert_eq!(store.get_validated_paths().len(), 0); - // Add 2 more times (total 4, should be validated) + // Add 1 more time (total 2, should be validated) + let path = ElementPath { + components: vec![ElementPathComponent { + tag: "div".to_string(), + classes: vec!["comment-1".to_string()], + }], + template_pattern: template_pattern.clone(), + }; + store.add_path(path); + + // Should now be validated (>=2 elements) + assert_eq!(store.get_validated_paths().len(), 2); + + // Add 2 more times (total 4, should still be validated) for i in 2..4 { let path = ElementPath { components: vec![ElementPathComponent { @@ -851,7 +862,7 @@ mod tests { store.add_path(path); } - // Should now be validated (>3 elements) + // Should still be validated (4 elements >= 2) assert_eq!(store.get_validated_paths().len(), 4); // Test that different template patterns are counted separately @@ -865,7 +876,7 @@ mod tests { }; store.add_path(path); - // Still only 4 validated paths (the new pattern appears only once) + // Still only 4 validated paths (the new pattern appears only once, count < 2) assert_eq!(store.get_validated_paths().len(), 4); } } From 49df4a116362f840a0a16ce473882157abeadd18 Mon Sep 17 00:00:00 2001 From: Sumit Datta Date: Wed, 9 Jul 2025 13:56:25 +0530 Subject: [PATCH 3/4] feat: change --prep to accept file path and write JSON to file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update CLI --prep option to accept a file path instead of boolean flag - Change CliArgs.prep from bool to Option to store file path - Write validated template paths to specified JSON file during prep mode - Print template count summary to terminal instead of full JSON output - Update all 
main.rs logic to handle new prep file parameter - Update CLI tests to reflect new structure - Keep normal (non-prep) mode unchanged; --prep itself now takes a FILE value instead of acting as a boolean flag Usage examples: - Normal mode: smart-crawler --domain example.com - Prep mode: smart-crawler --domain example.com --prep templates.json 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- src/cli.rs | 18 ++++++++++-------- src/main.rs | 27 +++++++++++++++++++++------ 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index c3a6400..20dd7d9 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -4,7 +4,7 @@ use url::Url; #[derive(Debug, Clone)] pub struct CliArgs { pub domain: String, - pub prep: bool, + pub prep: Option, } impl CliArgs { @@ -22,10 +22,11 @@ impl CliArgs { .arg( Arg::new("prep") .long("prep") + .value_name("FILE") .help( - "Enable preparation mode to discover template patterns across domain pages", + "Enable preparation mode to discover template patterns across domain pages. 
Saves detected template paths to the specified JSON file", ) - .action(clap::ArgAction::SetTrue), + .required(false), ) .get_matches(); @@ -34,7 +35,7 @@ impl CliArgs { .ok_or("Domain argument is required")?; let validated_domain = Self::extract_domain(domain_input)?; - let prep = matches.get_flag("prep"); + let prep = matches.get_one::("prep").cloned(); Ok(CliArgs { domain: validated_domain, @@ -74,11 +75,11 @@ mod tests { // Test that single domain parsing works correctly let args = CliArgs { domain: "example.com".to_string(), - prep: false, + prep: None, }; assert_eq!(args.domain, "example.com"); - assert!(!args.prep); + assert!(args.prep.is_none()); } #[test] @@ -124,10 +125,11 @@ mod tests { // since we can't easily test the full CLI parsing in unit tests) let args = CliArgs { domain: "example.com".to_string(), - prep: true, + prep: Some("templates.json".to_string()), }; - assert!(args.prep); + assert!(args.prep.is_some()); + assert_eq!(args.prep.unwrap(), "templates.json"); assert_eq!(args.domain, "example.com"); } } diff --git a/src/main.rs b/src/main.rs index 96644e4..f84efbb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -57,7 +57,7 @@ async fn main() { // Phase 1: URL Discovery - find additional URLs for each domain info!("Starting URL discovery for domains"); - let max_urls_per_domain = if args.prep { 10 } else { 3 }; + let max_urls_per_domain = if args.prep.is_some() { 10 } else { 3 }; // Discover additional URLs for the domain let domain = &args.domain; @@ -135,7 +135,7 @@ async fn main() { } // Phase 3: Template analysis (prep mode) or standard duplicate analysis - if args.prep { + if let Some(prep_file) = &args.prep { info!("Running template detection analysis in prep mode"); let mut combined_store = TemplatePathStore::new(); let template_detector = TemplateDetector::new(); @@ -157,6 +157,12 @@ async fn main() { combined_store.get_paths().len(), validated_paths.len() ); + + // Write template data to JSON file + match std::fs::write(prep_file, 
combined_store.to_validated_serialized_string()) { + Ok(_) => info!("Template data written to {}", prep_file), + Err(e) => error!("Failed to write template data to {}: {}", prep_file, e), + } } else { info!("Running standard duplicate analysis"); @@ -180,8 +186,8 @@ async fn main() { let _ = browser.close().await; - if args.prep { - // In prep mode, output detected template paths in serialized format + if args.prep.is_some() { + // In prep mode, output template count summary to terminal println!("\n=== Template Path Detection Results ==="); let mut combined_store = TemplatePathStore::new(); @@ -212,8 +218,17 @@ async fn main() { } } - println!("\nValidated Template Paths (>=2 elements, Rust-serializable format):"); - println!("{}", combined_store.to_validated_serialized_string()); + let validated_paths = combined_store.get_validated_paths(); + println!("\nTemplate Detection Summary:"); + println!( + " Total template paths found: {}", + combined_store.get_paths().len() + ); + println!( + " Validated template paths (>=2 elements): {}", + validated_paths.len() + ); + println!(" Template data saved to: {}", args.prep.as_ref().unwrap()); } } else { // Regular mode - show crawling results From 4daab569f2067973c183e45c1e7fe3a80b574fb1 Mon Sep 17 00:00:00 2001 From: Sumit Datta Date: Wed, 9 Jul 2025 17:30:50 +0530 Subject: [PATCH 4/4] feat: clean up JSON output to only include detected_paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Create ValidatedTemplatePathStore struct for JSON output containing only detected_paths - Remove template_counts from JSON output to keep it clean and focused - Update to_validated_serialized_string() to use the new simplified structure - Add test to verify JSON output structure excludes template_counts - Maintains internal template_counts for validation logic while hiding from output JSON output now contains only: { "detected_paths": [...] 
} 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- src/template_detection.rs | 43 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/src/template_detection.rs b/src/template_detection.rs index 319fa94..e5e1887 100644 --- a/src/template_detection.rs +++ b/src/template_detection.rs @@ -24,6 +24,12 @@ pub struct TemplatePathStore { template_counts: HashMap, } +/// Simplified store for JSON output containing only detected paths +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ValidatedTemplatePathStore { + pub detected_paths: HashSet, +} + impl TemplatePathStore { pub fn new() -> Self { Self { @@ -64,9 +70,8 @@ impl TemplatePathStore { /// Get serialized string with only validated paths (for prep mode) pub fn to_validated_serialized_string(&self) -> String { - let validated_store = TemplatePathStore { + let validated_store = ValidatedTemplatePathStore { detected_paths: self.get_validated_paths(), - template_counts: self.template_counts.clone(), }; serde_json::to_string_pretty(&validated_store).unwrap_or_default() } @@ -879,4 +884,38 @@ mod tests { // Still only 4 validated paths (the new pattern appears only once, count < 2) assert_eq!(store.get_validated_paths().len(), 4); } + + #[test] + fn test_validated_json_output_structure() { + let mut store = TemplatePathStore::new(); + + // Add some template paths + let template_pattern = "{count} comments".to_string(); + for i in 0..3 { + let path = ElementPath { + components: vec![ElementPathComponent { + tag: "div".to_string(), + classes: vec![format!("comment-{}", i)], + }], + template_pattern: template_pattern.clone(), + }; + store.add_path(path); + } + + // Get the JSON output + let json_output = store.to_validated_serialized_string(); + + // Parse the JSON to verify structure + let parsed: serde_json::Value = serde_json::from_str(&json_output).unwrap(); + + // Should have "detected_paths" field + 
assert!(parsed.get("detected_paths").is_some()); + + // Should NOT have "template_counts" field + assert!(parsed.get("template_counts").is_none()); + + // Should have the validated paths (3 elements >= 2) + let detected_paths = parsed["detected_paths"].as_array().unwrap(); + assert_eq!(detected_paths.len(), 3); + } }