Skip to content

Commit 36a3ba0

Browse files
authored
Merge pull request #67 from pixlie/feature/cli-usability-improvements
feat: implement CLI usability improvements with domain-based crawling…
2 parents 4b4f808 + 8bfd4ab commit 36a3ba0

3 files changed

Lines changed: 313 additions & 277 deletions

File tree

src/cli.rs

Lines changed: 81 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1,81 +1,67 @@
11
use clap::{Arg, Command};
2-
use std::collections::HashSet;
32
use url::Url;
43

54
#[derive(Debug, Clone)]
65
pub struct CliArgs {
7-
pub links: Vec<String>,
8-
pub verbose: bool,
9-
pub template: bool,
6+
pub domain: String,
7+
pub prep: bool,
108
}
119

1210
impl CliArgs {
1311
pub fn parse() -> Result<Self, String> {
1412
let matches = Command::new("smart-crawler")
15-
.version("0.3.2")
13+
.version("0.4.1")
1614
.about("A web crawler that uses WebDriver to extract and parse HTML content")
1715
.arg(
18-
Arg::new("link")
19-
.long("link")
20-
.value_name("URL")
21-
.help("URL to crawl (can be specified multiple times)")
22-
.action(clap::ArgAction::Append)
16+
Arg::new("domain")
17+
.long("domain")
18+
.value_name("DOMAIN")
19+
.help("Domain to crawl. Can be a URL or domain name")
2320
.required(true),
2421
)
2522
.arg(
26-
Arg::new("verbose")
27-
.long("verbose")
28-
.help("Enable verbose output showing filtered HTML node tree")
29-
.action(clap::ArgAction::SetTrue),
30-
)
31-
.arg(
32-
Arg::new("template")
33-
.long("template")
34-
.help("Enable template detection mode to identify patterns like '{count} comments' in HTML content")
23+
Arg::new("prep")
24+
.long("prep")
25+
.help(
26+
"Enable preparation mode to discover template patterns across domain pages",
27+
)
3528
.action(clap::ArgAction::SetTrue),
3629
)
3730
.get_matches();
3831

39-
let links: Vec<String> = matches
40-
.get_many::<String>("link")
41-
.unwrap_or_default()
42-
.cloned()
43-
.collect();
32+
let domain_input = matches
33+
.get_one::<String>("domain")
34+
.ok_or("Domain argument is required")?;
4435

45-
let validated_links = Self::validate_and_deduplicate_links(links)?;
46-
let verbose = matches.get_flag("verbose");
47-
let template = matches.get_flag("template");
36+
let validated_domain = Self::extract_domain(domain_input)?;
37+
let prep = matches.get_flag("prep");
4838

4939
Ok(CliArgs {
50-
links: validated_links,
51-
verbose,
52-
template,
40+
domain: validated_domain,
41+
prep,
5342
})
5443
}
5544

56-
fn validate_and_deduplicate_links(links: Vec<String>) -> Result<Vec<String>, String> {
57-
let mut seen_urls = HashSet::new();
58-
let mut validated_links = Vec::new();
59-
60-
for link in links {
61-
match Url::parse(&link) {
62-
Ok(url) => {
63-
let normalized_url = url.to_string();
64-
if seen_urls.insert(normalized_url.clone()) {
65-
validated_links.push(normalized_url);
66-
}
67-
}
68-
Err(_) => {
69-
return Err(format!("Invalid URL: {link}"));
45+
fn extract_domain(input: &str) -> Result<String, String> {
46+
let trimmed = input.trim();
47+
48+
// Always parse the input as a URL so the host is validated; a bare domain gets an "https://" prefix first
49+
let url_str = if trimmed.starts_with("http://") || trimmed.starts_with("https://") {
50+
trimmed.to_string()
51+
} else {
52+
format!("https://{trimmed}")
53+
};
54+
55+
match Url::parse(&url_str) {
56+
Ok(url) => {
57+
if let Some(domain) = url.host_str() {
58+
Ok(domain.to_string())
59+
} else {
60+
Err(format!("Could not extract domain from: {input}"))
7061
}
7162
}
63+
Err(_) => Err(format!("Invalid domain or URL: {input}")),
7264
}
73-
74-
if validated_links.is_empty() {
75-
return Err("No valid URLs provided".to_string());
76-
}
77-
78-
Ok(validated_links)
7965
}
8066
}
8167

@@ -84,47 +70,64 @@ mod tests {
8470
use super::*;
8571

8672
#[test]
87-
fn test_validate_and_deduplicate_links() {
88-
let links = vec![
89-
"https://example.com".to_string(),
90-
"https://example.org".to_string(),
91-
"https://example.com".to_string(), // duplicate
92-
];
93-
94-
let result = CliArgs::validate_and_deduplicate_links(links).unwrap();
95-
assert_eq!(result.len(), 2);
96-
assert!(result.contains(&"https://example.com/".to_string()));
97-
assert!(result.contains(&"https://example.org/".to_string()));
73+
fn test_single_domain_parsing() {
74+
// Build CliArgs directly and check that its fields hold the given values (no CLI parsing is exercised here)
75+
let args = CliArgs {
76+
domain: "example.com".to_string(),
77+
prep: false,
78+
};
79+
80+
assert_eq!(args.domain, "example.com");
81+
assert!(!args.prep);
9882
}
9983

10084
#[test]
101-
fn test_validate_invalid_url() {
102-
let links = vec!["invalid-url".to_string()];
103-
let result = CliArgs::validate_and_deduplicate_links(links);
104-
assert!(result.is_err());
105-
assert!(result.unwrap_err().contains("Invalid URL"));
85+
fn test_extract_domain() {
86+
// Test URL with protocol
87+
assert_eq!(
88+
CliArgs::extract_domain("https://example.com").unwrap(),
89+
"example.com"
90+
);
91+
assert_eq!(
92+
CliArgs::extract_domain("http://example.com/path").unwrap(),
93+
"example.com"
94+
);
95+
96+
// Test domain without protocol
97+
assert_eq!(
98+
CliArgs::extract_domain("example.com").unwrap(),
99+
"example.com"
100+
);
101+
assert_eq!(
102+
CliArgs::extract_domain(" example.com ").unwrap(),
103+
"example.com"
104+
);
105+
106+
// Edge case: pins the url crate's behavior of accepting consecutive dots in a host, so this input is returned as-is instead of an error
107+
assert_eq!(
108+
CliArgs::extract_domain("invalid..domain").unwrap(),
109+
"invalid..domain"
110+
);
106111
}
107112

108113
#[test]
109-
fn test_validate_empty_links() {
110-
let links = vec![];
111-
let result = CliArgs::validate_and_deduplicate_links(links);
114+
fn test_extract_domain_error() {
115+
// An input that still fails URL parsing after the "https://" prefix is added must return an error
116+
let result = CliArgs::extract_domain("://invalid");
112117
assert!(result.is_err());
113-
assert!(result.unwrap_err().contains("No valid URLs provided"));
118+
assert!(result.unwrap_err().contains("Invalid domain or URL"));
114119
}
115120

116121
#[test]
117-
fn test_cli_template_flag() {
118-
// Test that template flag is properly parsed (this is a simplified test
122+
fn test_cli_prep_flag() {
123+
// Check that the prep flag is stored on CliArgs (a simplified test that builds the struct directly,
119124
// since the full CLI parsing is not easy to exercise in unit tests)
120125
let args = CliArgs {
121-
links: vec!["https://example.com".to_string()],
122-
verbose: true,
123-
template: true,
126+
domain: "example.com".to_string(),
127+
prep: true,
124128
};
125129

126-
assert!(args.template);
127-
assert!(args.verbose);
128-
assert_eq!(args.links.len(), 1);
130+
assert!(args.prep);
131+
assert_eq!(args.domain, "example.com");
129132
}
130133
}

0 commit comments

Comments
 (0)