Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use url::Url;
#[derive(Debug, Clone)]
pub struct CliArgs {
pub domain: String,
pub prep: bool,
pub prep: Option<String>,
}

impl CliArgs {
Expand All @@ -22,10 +22,11 @@ impl CliArgs {
.arg(
Arg::new("prep")
.long("prep")
.value_name("FILE")
.help(
"Enable preparation mode to discover template patterns across domain pages",
"Enable preparation mode to discover template patterns across domain pages. Saves detected template paths to the specified JSON file",
)
.action(clap::ArgAction::SetTrue),
.required(false),
)
.get_matches();

Expand All @@ -34,7 +35,7 @@ impl CliArgs {
.ok_or("Domain argument is required")?;

let validated_domain = Self::extract_domain(domain_input)?;
let prep = matches.get_flag("prep");
let prep = matches.get_one::<String>("prep").cloned();

Ok(CliArgs {
domain: validated_domain,
Expand Down Expand Up @@ -74,11 +75,11 @@ mod tests {
// Test that single domain parsing works correctly
let args = CliArgs {
domain: "example.com".to_string(),
prep: false,
prep: None,
};

assert_eq!(args.domain, "example.com");
assert!(!args.prep);
assert!(args.prep.is_none());
}

#[test]
Expand Down Expand Up @@ -124,10 +125,11 @@ mod tests {
// since we can't easily test the full CLI parsing in unit tests)
let args = CliArgs {
domain: "example.com".to_string(),
prep: true,
prep: Some("templates.json".to_string()),
};

assert!(args.prep);
assert!(args.prep.is_some());
assert_eq!(args.prep.unwrap(), "templates.json");
assert_eq!(args.domain, "example.com");
}
}
33 changes: 25 additions & 8 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ async fn main() {
// Phase 1: URL Discovery - find additional URLs for each domain
info!("Starting URL discovery for domains");

let max_urls_per_domain = if args.prep { 10 } else { 3 };
let max_urls_per_domain = if args.prep.is_some() { 10 } else { 3 };

// Discover additional URLs for the domain
let domain = &args.domain;
Expand Down Expand Up @@ -135,7 +135,7 @@ async fn main() {
}

// Phase 3: Template analysis (prep mode) or standard duplicate analysis
if args.prep {
if let Some(prep_file) = &args.prep {
info!("Running template detection analysis in prep mode");
let mut combined_store = TemplatePathStore::new();
let template_detector = TemplateDetector::new();
Expand All @@ -151,10 +151,18 @@ async fn main() {
}
}

let validated_paths = combined_store.get_validated_paths();
info!(
"Template analysis complete, found {} unique template paths",
combined_store.get_paths().len()
"Template analysis complete, found {} total template paths, {} validated (>=2 elements)",
combined_store.get_paths().len(),
validated_paths.len()
);

// Write template data to JSON file
match std::fs::write(prep_file, combined_store.to_validated_serialized_string()) {
Ok(_) => info!("Template data written to {}", prep_file),
Err(e) => error!("Failed to write template data to {}: {}", prep_file, e),
}
} else {
info!("Running standard duplicate analysis");

Expand All @@ -178,8 +186,8 @@ async fn main() {

let _ = browser.close().await;

if args.prep {
// In prep mode, output detected template paths in serialized format
if args.prep.is_some() {
// In prep mode, output template count summary to terminal
println!("\n=== Template Path Detection Results ===");

let mut combined_store = TemplatePathStore::new();
Expand Down Expand Up @@ -210,8 +218,17 @@ async fn main() {
}
}

println!("\nDetected Template Paths (Rust-serializable format):");
println!("{}", combined_store.to_serialized_string());
let validated_paths = combined_store.get_validated_paths();
println!("\nTemplate Detection Summary:");
println!(
" Total template paths found: {}",
combined_store.get_paths().len()
);
println!(
" Validated template paths (>=2 elements): {}",
validated_paths.len()
);
println!(" Template data saved to: {}", args.prep.as_ref().unwrap());
}
} else {
// Regular mode - show crawling results
Expand Down
184 changes: 178 additions & 6 deletions src/template_detection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,61 @@ pub struct ElementPath {
/// Accumulates the element paths at which a template pattern was detected,
/// together with a per-pattern counter of how often each pattern was added.
///
/// The counter is what `get_validated_paths` consults to decide whether a
/// pattern has been seen often enough (at least twice) to be kept.
// NOTE(review): `template_counts` takes part in the derived (de)serialization,
// so JSON produced before this field existed presumably no longer deserializes
// into this type without a serde default — confirm whether that matters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TemplatePathStore {
/// Every distinct element path that has been added to the store.
pub detected_paths: HashSet<ElementPath>,
// Number of `add_path` calls seen per template pattern, keyed by the
// path's `template_pattern` string (one call per matching HTML element).
template_counts: HashMap<String, usize>,
}

/// Simplified store for JSON output containing only detected paths.
///
/// Built by `TemplatePathStore::to_validated_serialized_string` so that the
/// written JSON carries the validated paths without the internal
/// `template_counts` bookkeeping.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ValidatedTemplatePathStore {
/// Element paths to be serialized under the `detected_paths` key.
pub detected_paths: HashSet<ElementPath>,
}

impl TemplatePathStore {
    /// Creates a store with no detected paths and no recorded pattern counts.
    pub fn new() -> Self {
        let detected_paths = HashSet::new();
        let template_counts = HashMap::new();
        Self {
            detected_paths,
            template_counts,
        }
    }

    /// Records `path` in the store.
    ///
    /// The occurrence counter for the path's `template_pattern` is bumped on
    /// every call, even when an equal path is already present in the set.
    pub fn add_path(&mut self, path: ElementPath) {
        let occurrences = self
            .template_counts
            .entry(path.template_pattern.clone())
            .or_default();
        *occurrences += 1;

        self.detected_paths.insert(path);
    }

    /// Returns a reference to every path added so far, validated or not.
    pub fn get_paths(&self) -> &HashSet<ElementPath> {
        &self.detected_paths
    }

    /// Returns clones of the paths whose template pattern has been added at
    /// least twice (i.e. was seen in at least 2 HTML elements).
    pub fn get_validated_paths(&self) -> HashSet<ElementPath> {
        let mut validated = HashSet::new();
        for candidate in &self.detected_paths {
            // A pattern with no counter entry is treated as never seen.
            let seen = self
                .template_counts
                .get(&candidate.template_pattern)
                .copied()
                .unwrap_or(0);
            if seen >= 2 {
                validated.insert(candidate.clone());
            }
        }
        validated
    }

    /// Serializes the whole store as pretty-printed JSON.
    ///
    /// Returns an empty string if serialization fails.
    pub fn to_serialized_string(&self) -> String {
        match serde_json::to_string_pretty(self) {
            Ok(json) => json,
            Err(_) => String::new(),
        }
    }

    /// Serializes only the validated paths as pretty-printed JSON (used by
    /// prep mode), wrapped in a `ValidatedTemplatePathStore`.
    ///
    /// Returns an empty string if serialization fails.
    pub fn to_validated_serialized_string(&self) -> String {
        let detected_paths = self.get_validated_paths();
        let output = ValidatedTemplatePathStore { detected_paths };
        match serde_json::to_string_pretty(&output) {
            Ok(json) => json,
            Err(_) => String::new(),
        }
    }
}

impl Default for TemplatePathStore {
Expand Down Expand Up @@ -268,6 +303,11 @@ impl TemplateDetector {
return false;
}

// Limit templates to 5 parts maximum (words/numbers/floats)
if words.len() > 5 {
return false;
}

// Check for known patterns
for word in &words {
let lowercase = word.to_lowercase();
Expand Down Expand Up @@ -456,6 +496,38 @@ mod tests {
}
}

#[test]
fn test_five_part_limit() {
    let detector = TemplateDetector::new();

    // Inputs with at most 5 whitespace-separated parts must yield a template.
    let accepted = [
        "42 comments",         // 2 parts
        "Posted 2 hours ago",  // 4 parts
        "Page 5 of 100 items", // 5 parts
    ];
    for &input in accepted.iter() {
        let detected = detector.detect_template(input);
        assert!(detected.is_some(), "Should detect pattern for: {input}");
    }

    // Inputs with more than 5 whitespace-separated parts must be rejected.
    let rejected = [
        "This is a very long sentence with 42 comments here", // 10 parts
        "Posted by user 123 about 2 hours ago today",         // 9 parts
    ];
    for &input in rejected.iter() {
        let detected = detector.detect_template(input);
        assert!(
            detected.is_none(),
            "Should NOT detect pattern for long sentence: {input}"
        );
    }
}

#[test]
fn test_apply_template() {
let detector = TemplateDetector::new();
Expand All @@ -469,15 +541,18 @@ mod tests {
fn test_edge_cases() {
let detector = TemplateDetector::new();

// Multiple numbers - should pick the first one that makes sense
let template = detector
.detect_template("Posted 2 hours ago by user123")
.unwrap();
assert_eq!(template.pattern, "Posted {time} hours ago by user123");
// Multiple numbers within 5-part limit
let template = detector.detect_template("Posted 2 hours ago").unwrap();
assert_eq!(template.pattern, "Posted {time} hours ago");

// Complex patterns
// Complex patterns within limit
let template = detector.detect_template("Page 5 of 100").unwrap();
assert_eq!(template.pattern, "Page {count} of 100");

// Test that patterns exceeding 5 parts are rejected
assert!(detector
.detect_template("Posted 2 hours ago by user123")
.is_none());
}

#[test]
Expand Down Expand Up @@ -746,4 +821,101 @@ mod tests {
assert_eq!(body1.children[0].content, body2.children[0].content);
assert_eq!(body1.children[1].content, body2.children[1].content);
}

#[test]
fn test_template_count_validation() {
    // Builds a single-component path; each call stands for one HTML element.
    let make_path = |tag: &str, class: String, pattern: &str| ElementPath {
        components: vec![ElementPathComponent {
            tag: tag.to_string(),
            classes: vec![class],
        }],
        template_pattern: pattern.to_string(),
    };

    let mut store = TemplatePathStore::new();
    let comments_pattern = "{count} comments";

    // One occurrence of the pattern: below the threshold, nothing validated.
    store.add_path(make_path("div", "comment-0".to_string(), comments_pattern));
    assert_eq!(store.get_validated_paths().len(), 0);

    // Second occurrence: the pattern reaches the threshold, so both paths
    // carrying it become validated.
    store.add_path(make_path("div", "comment-1".to_string(), comments_pattern));
    assert_eq!(store.get_validated_paths().len(), 2);

    // Two further occurrences (4 in total) remain validated.
    for idx in 2..4 {
        store.add_path(make_path(
            "div",
            format!("comment-{}", idx),
            comments_pattern,
        ));
    }
    assert_eq!(store.get_validated_paths().len(), 4);

    // A different pattern is counted on its own; seen once, it is excluded,
    // so the validated total stays at 4.
    store.add_path(make_path(
        "span",
        "timestamp".to_string(),
        "{time} hours ago",
    ));
    assert_eq!(store.get_validated_paths().len(), 4);
}

#[test]
fn test_validated_json_output_structure() {
    let mut store = TemplatePathStore::new();

    // Three distinct paths sharing one pattern, so all of them validate.
    (0..3)
        .map(|idx| ElementPath {
            components: vec![ElementPathComponent {
                tag: "div".to_string(),
                classes: vec![format!("comment-{}", idx)],
            }],
            template_pattern: "{count} comments".to_string(),
        })
        .for_each(|path| store.add_path(path));

    // Round-trip the validated output through a generic JSON value so the
    // shape can be inspected without relying on the store types.
    let serialized = store.to_validated_serialized_string();
    let parsed: serde_json::Value = serde_json::from_str(&serialized).unwrap();

    // Only the path list is exposed; the internal counters must not leak.
    assert!(parsed.get("detected_paths").is_some());
    assert!(parsed.get("template_counts").is_none());

    // All three paths are present (3 elements >= 2).
    let entries = parsed["detected_paths"].as_array().unwrap();
    assert_eq!(entries.len(), 3);
}
}
Loading