11use clap:: { Arg , Command } ;
2- use std:: collections:: HashSet ;
32use url:: Url ;
43
// Parsed command-line arguments for the crawler.
// This listing is a diff: a line whose number prefix ends in `-` belongs to the
// old version, one ending in `+` to the new version, and a line with two fused
// numbers is unchanged context.
// The change replaces the `links`, `verbose` and `template` fields with
// `domain` and `prep`.
54#[ derive( Debug , Clone ) ]
65pub struct CliArgs {
7- pub links : Vec < String > ,
8- pub verbose : bool ,
9- pub template : bool ,
// Host name that `extract_domain` derives from the `--domain` argument.
6+ pub domain : String ,
// True when the `--prep` flag (preparation mode) was passed.
7+ pub prep : bool ,
108}
119
// Construction and validation helpers for `CliArgs`.
1210impl CliArgs {
// Builds the `smart-crawler` clap command, reads the process arguments and
// turns them into a `CliArgs`.
// New version: one required `--domain` value plus an optional `--prep` switch.
// Old version (lines marked `-`): repeatable `--link`, plus `--verbose` and
// `--template` switches.
// Returns `Err(String)` when `extract_domain` rejects the `--domain` value.
1311 pub fn parse ( ) -> Result < Self , String > {
1412 let matches = Command :: new ( "smart-crawler" )
// NOTE(review): the space before the closing quote in the version strings is
// presumably an artifact of how this listing was rendered - confirm against
// the real file.
15- . version ( "0.3.2 " )
13+ . version ( "0.4.1 " )
1614 . about ( "A web crawler that uses WebDriver to extract and parse HTML content" )
1715 . arg (
18- Arg :: new ( "link" )
19- . long ( "link" )
20- . value_name ( "URL" )
21- . help ( "URL to crawl (can be specified multiple times)" )
22- . action ( clap:: ArgAction :: Append )
// `--domain` takes a single value; the shared `required ( true )` line below
// applies to it in the new version.
16+ Arg :: new ( "domain" )
17+ . long ( "domain" )
18+ . value_name ( "DOMAIN" )
19+ . help ( "Domain to crawl. Can be a URL or domain name" )
2320 . required ( true ) ,
2421 )
2522 . arg (
26- Arg :: new ( "verbose" )
27- . long ( "verbose" )
28- . help ( "Enable verbose output showing filtered HTML node tree" )
29- . action ( clap:: ArgAction :: SetTrue ) ,
30- )
31- . arg (
32- Arg :: new ( "template" )
33- . long ( "template" )
34- . help ( "Enable template detection mode to identify patterns like '{count} comments' in HTML content" )
// `--prep` is a boolean switch: the shared `SetTrue` action below stores true
// when the flag is present.
23+ Arg :: new ( "prep" )
24+ . long ( "prep" )
25+ . help (
26+ "Enable preparation mode to discover template patterns across domain pages" ,
27+ )
3528 . action ( clap:: ArgAction :: SetTrue ) ,
3629 )
3730 . get_matches ( ) ;
3831
39- let links: Vec < String > = matches
40- . get_many :: < String > ( "link" )
41- . unwrap_or_default ( )
42- . cloned ( )
43- . collect ( ) ;
// NOTE(review): `--domain` is declared `required ( true )`, so this `ok_or`
// is presumably a defensive fallback - confirm that clap rejects a missing
// argument before this point is reached.
32+ let domain_input = matches
33+ . get_one :: < String > ( "domain" )
34+ . ok_or ( "Domain argument is required" ) ?;
4435
45- let validated_links = Self :: validate_and_deduplicate_links ( links) ?;
46- let verbose = matches. get_flag ( "verbose" ) ;
47- let template = matches. get_flag ( "template" ) ;
// Reduce the raw input to a host name; `?` forwards the helper error string.
36+ let validated_domain = Self :: extract_domain ( domain_input) ?;
37+ let prep = matches. get_flag ( "prep" ) ;
4838
4939 Ok ( CliArgs {
50- links : validated_links,
51- verbose,
52- template,
40+ domain : validated_domain,
41+ prep,
5342 } )
5443 }
5544
// Removed helper (all of its lines are marked `-`, some appear further down
// interleaved with the new code): it parsed every link with `Url::parse`,
// kept the first occurrence of each normalized URL using a `HashSet`,
// returned an error for an unparsable link, and returned an error when the
// resulting list was empty.
56- fn validate_and_deduplicate_links ( links : Vec < String > ) -> Result < Vec < String > , String > {
57- let mut seen_urls = HashSet :: new ( ) ;
58- let mut validated_links = Vec :: new ( ) ;
59-
60- for link in links {
61- match Url :: parse ( & link) {
62- Ok ( url) => {
63- let normalized_url = url. to_string ( ) ;
64- if seen_urls. insert ( normalized_url. clone ( ) ) {
65- validated_links. push ( normalized_url) ;
66- }
67- }
68- Err ( _) => {
69- return Err ( format ! ( "Invalid URL: {link}" ) ) ;
// Added helper: turns user input (a bare domain or an http/https URL) into a
// host name.
// Steps: trim surrounding whitespace, prepend `https://` unless the text
// already starts with `http://` or `https://`, parse with `Url::parse`, and
// return the host reported by `host_str`.
// Errors: a message starting with `Could not extract domain from` when the
// parsed URL has no host, and one starting with `Invalid domain or URL` when
// parsing fails. Both messages embed the untrimmed `input`.
45+ fn extract_domain ( input : & str ) -> Result < String , String > {
46+ let trimmed = input. trim ( ) ;
47+
48+ // Always try to parse as URL to validate the domain
// NOTE(review): the prefix test is a case-sensitive `starts_with`, so input
// with any other scheme, or an upper-case `HTTP://`, also gets `https://`
// prepended. The host parsed from that combined string is presumably not the
// one the user meant - TODO confirm and consider rejecting such input.
49+ let url_str = if trimmed. starts_with ( "http://" ) || trimmed. starts_with ( "https://" ) {
50+ trimmed. to_string ( )
51+ } else {
52+ format ! ( "https://{trimmed}" )
53+ } ;
54+
55+ match Url :: parse ( & url_str) {
56+ Ok ( url) => {
// Only the host is kept; the rest of the parsed URL is discarded.
57+ if let Some ( domain) = url. host_str ( ) {
58+ Ok ( domain. to_string ( ) )
59+ } else {
60+ Err ( format ! ( "Could not extract domain from: {input}" ) )
7061 }
7162 }
63+ Err ( _) => Err ( format ! ( "Invalid domain or URL: {input}" ) ) ,
7264 }
73-
74- if validated_links. is_empty ( ) {
75- return Err ( "No valid URLs provided" . to_string ( ) ) ;
76- }
77-
78- Ok ( validated_links)
7965 }
8066}
8167
@@ -84,47 +70,64 @@ mod tests {
8470 use super :: * ;
8571
// Diff of the first unit test: the old de-duplication test (lines marked `-`)
// is replaced by `test_single_domain_parsing` (lines marked `+`).
// NOTE(review): the new test builds `CliArgs` directly and reads the fields
// back; it does not call `parse` or `extract_domain`, so it only checks that
// the struct stores what it was given.
8672 #[ test]
87- fn test_validate_and_deduplicate_links ( ) {
88- let links = vec ! [
89- "https://example.com" . to_string( ) ,
90- "https://example.org" . to_string( ) ,
91- "https://example.com" . to_string( ) , // duplicate
92- ] ;
93-
94- let result = CliArgs :: validate_and_deduplicate_links ( links) . unwrap ( ) ;
95- assert_eq ! ( result. len( ) , 2 ) ;
96- assert ! ( result. contains( & "https://example.com/" . to_string( ) ) ) ;
97- assert ! ( result. contains( & "https://example.org/" . to_string( ) ) ) ;
73+ fn test_single_domain_parsing ( ) {
74+ // Test that single domain parsing works correctly
75+ let args = CliArgs {
76+ domain : "example.com" . to_string ( ) ,
77+ prep : false ,
78+ } ;
79+
80+ assert_eq ! ( args. domain, "example.com" ) ;
81+ assert ! ( !args. prep) ;
9882 }
9983
// Diff of the second unit test: the old invalid-URL test (lines marked `-`)
// is replaced by `test_extract_domain` (lines marked `+`), which checks the
// success paths of `extract_domain`: input with a scheme, with a path, without
// a scheme, with surrounding spaces, and with consecutive dots.
10084 #[ test]
101- fn test_validate_invalid_url ( ) {
102- let links = vec ! [ "invalid-url" . to_string( ) ] ;
103- let result = CliArgs :: validate_and_deduplicate_links ( links) ;
104- assert ! ( result. is_err( ) ) ;
105- assert ! ( result. unwrap_err( ) . contains( "Invalid URL" ) ) ;
85+ fn test_extract_domain ( ) {
86+ // Test URL with protocol
87+ assert_eq ! (
88+ CliArgs :: extract_domain( "https://example.com" ) . unwrap( ) ,
89+ "example.com"
90+ ) ;
// The path component is expected to be dropped from the result.
91+ assert_eq ! (
92+ CliArgs :: extract_domain( "http://example.com/path" ) . unwrap( ) ,
93+ "example.com"
94+ ) ;
95+
96+ // Test domain without protocol
97+ assert_eq ! (
98+ CliArgs :: extract_domain( "example.com" ) . unwrap( ) ,
99+ "example.com"
100+ ) ;
// Surrounding whitespace is removed by the `trim` call in `extract_domain`.
101+ assert_eq ! (
102+ CliArgs :: extract_domain( " example.com " ) . unwrap( ) ,
103+ "example.com"
104+ ) ;
105+
106+ // Test edge case - the URL crate behavior with multiple dots
// NOTE(review): this pins the fact that a host with an empty label is
// accepted as-is; no extra validation is applied on top of `Url::parse`.
107+ assert_eq ! (
108+ CliArgs :: extract_domain( "invalid..domain" ) . unwrap( ) ,
109+ "invalid..domain"
110+ ) ;
106111 }
107112
// Diff of the third unit test: the old empty-list test (lines marked `-`) is
// replaced by `test_extract_domain_error` (lines marked `+`), which checks the
// parse-failure path of `extract_domain`.
// The input does not start with `http://` or `https://`, so the helper
// prepends `https://` before parsing; the test expects that parse to fail.
// NOTE(review): the asserted substring is shown with a space before the
// closing quote, while the message built in `extract_domain` has a colon
// right after `URL`. The space is presumably a rendering artifact of this
// listing - confirm against the real file, since a literal trailing space
// would make the assertion fail.
108113 #[ test]
109- fn test_validate_empty_links ( ) {
110- let links = vec ! [ ] ;
111- let result = CliArgs :: validate_and_deduplicate_links ( links ) ;
114+ fn test_extract_domain_error ( ) {
115+ // Test that invalid domain extraction returns error
116+ let result = CliArgs :: extract_domain ( "://invalid" ) ;
112117 assert ! ( result. is_err( ) ) ;
113- assert ! ( result. unwrap_err( ) . contains( "No valid URLs provided " ) ) ;
118+ assert ! ( result. unwrap_err( ) . contains( "Invalid domain or URL " ) ) ;
114119 }
115120
// Diff of the fourth unit test: `test_cli_template_flag` (lines marked `-`) is
// renamed to `test_cli_prep_flag` (lines marked `+`) and updated for the new
// `domain` and `prep` fields.
// As the in-test comment says, this does not run clap: it builds `CliArgs`
// by hand with `prep` set to true and checks the stored values.
116121 #[ test]
117- fn test_cli_template_flag ( ) {
118- // Test that template flag is properly parsed (this is a simplified test
122+ fn test_cli_prep_flag ( ) {
123+ // Test that prep flag is properly parsed (this is a simplified test
119124 // since we can't easily test the full CLI parsing in unit tests)
120125 let args = CliArgs {
121- links : vec ! [ "https://example.com" . to_string( ) ] ,
122- verbose : true ,
123- template : true ,
126+ domain : "example.com" . to_string ( ) ,
127+ prep : true ,
124128 } ;
125129
126- assert ! ( args. template) ;
127- assert ! ( args. verbose) ;
128- assert_eq ! ( args. links. len( ) , 1 ) ;
130+ assert ! ( args. prep) ;
131+ assert_eq ! ( args. domain, "example.com" ) ;
129132 }
130133}
0 commit comments