import json
import urllib.parse
from typing import Dict, List, Optional, Set
+from concurrent.futures import ThreadPoolExecutor, as_completed

+import time
import requests
import validators
from bs4 import BeautifulSoup
from colorama import Fore, Style, init

init(autoreset=True)

-
DEFAULT_SCHEME: str = 'http://'


@@ -27,7 +28,13 @@ class Spider():
        save_to_file (Optional[str]): The file path to save the crawl results.
    """

-    def __init__(self, root_url: str, max_links: int = 5, save_to_file: Optional[str] = None) -> None:
+    def __init__(self,
+                 root_url: str,
+                 max_links: int = 5,
+                 save_to_file: Optional[str] = None,
+                 max_workers: int = 1,
+                 delay: float = 0.5,
+                 verbose: bool = True) -> None:
        """
        Initializes the Spider class.

@@ -43,6 +50,9 @@ def __init__(self, root_url: str, max_links: int = 5, save_to_file: Optional[str
        self.link_count: int = 0
        self.save_to_file: Optional[str] = save_to_file
        self.scheme: str = DEFAULT_SCHEME
+        self.max_workers: int = max_workers
+        self.delay: float = delay
+        self.verbose: bool = verbose

    def fetch_url(self, url: str) -> Optional[BeautifulSoup]:
        """
@@ -54,7 +64,6 @@ def fetch_url(self, url: str) -> Optional[BeautifulSoup]:
        Returns:
            Optional[BeautifulSoup]: A BeautifulSoup object if the URL is fetched successfully, None otherwise.
        """
-
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
@@ -83,6 +92,10 @@ def is_valid_url(url: str) -> bool:
        """
        return bool(validators.url(url))

+    def verbose_print(self, content: str) -> None:
+        if self.verbose:
+            print(content)
+
    def save_results(self) -> None:
        """
        Saves the crawl results into a JSON file.
@@ -127,37 +140,38 @@ def crawl(self, url: str) -> None:
            url (str): The URL to crawl.
        """
        if not self.is_valid_url(url):
-            print(Fore.RED + f"Invalid url to crawl: {url}")
+            self.verbose_print(Fore.RED + f"Invalid url to crawl: {url}")
            return

        if url in self.crawl_result:
-            print(Fore.YELLOW + f"URL already crawled: {url}")
+            self.verbose_print(Fore.YELLOW + f"URL already crawled: {url}")
            return

-        print(Fore.GREEN + f"Crawling: {url}")
+        self.verbose_print(Fore.GREEN + f"Crawling: {url}")
        soup = self.fetch_url(url)
        if not soup:
            return

-        links = soup.body.find_all('a', href=True)
+        links = soup.body.find_all('a', href=True) if soup.body else []
        self.crawl_result[url] = {'urls': []}

        for link in links:
            pretty_url = self.format_url(link['href'].lstrip(), url)
            if not self.is_valid_url(pretty_url):
-                print(Fore.RED + f"Invalid url: {pretty_url}")
+                self.verbose_print(Fore.RED + f"Invalid url: {pretty_url}")
                continue

            if pretty_url in self.crawl_result[url]['urls']:
                continue

            self.crawl_result[url]['urls'].append(pretty_url)
            self.crawl_set.add(pretty_url)
-            print(Fore.BLUE + f"Link found: {pretty_url}")
+            self.verbose_print(Fore.BLUE + f"Link found: {pretty_url}")

        if self.link_count < self.max_links:
            self.link_count += 1
-            print(Fore.GREEN + f"Links crawled: {self.link_count}")
+            self.verbose_print(
+                Fore.GREEN + f"Links crawled: {self.link_count}")

    def start(self) -> Dict[str, Dict[str, List[str]]]:
        """
@@ -166,25 +180,35 @@ def start(self) -> Dict[str, Dict[str, List[str]]]:
        Returns:
            Dict[str, Dict[str, List[str]]]: The crawl results.
        """
-        self.crawl(self.root_url)
-
-        while self.crawl_set and self.link_count < self.max_links:
-            self.crawl(self.crawl_set.pop())
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            futures = {executor.submit(self.crawl, self.root_url)}
+
+            while self.link_count < self.max_links and futures:
+                for future in as_completed(futures):
+                    futures.remove(future)
+                    if future.exception() is None:
+                        while self.link_count < self.max_links and self.crawl_set:
+                            url = self.crawl_set.pop()
+                            if url not in self.crawl_result:
+                                futures.add(executor.submit(self.crawl, url))
+                                time.sleep(self.delay)
+                    break  # Break to check the next future

        if self.save_to_file:
            self.save_results()
-        print(Style.BRIGHT + Fore.MAGENTA + "Exiting....")
+        self.verbose_print(Style.BRIGHT + Fore.MAGENTA + "Exiting....")
        return self.crawl_result


def main() -> None:
    """
    The main function to initialize and start the crawler.
    """
-    root_url = 'http://github.com'
-    max_links = 2
+    root_url = 'https://pypi.org/'
+    max_links = 5

    crawler = Spider(root_url, max_links, save_to_file='out.json')
+    print(Fore.GREEN + f"Crawling: {root_url}")
    crawler.start()


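For context, here is a minimal usage sketch of the constructor parameters this diff introduces (max_workers, delay, verbose). It is illustrative only and assumes the file is importable as a module named spider; that module name and the surrounding script are assumptions, not part of the diff.

# Minimal usage sketch (assumed module name: spider).
from spider import Spider

crawler = Spider(
    root_url='https://pypi.org/',
    max_links=5,
    save_to_file='out.json',   # crawl results are written here as JSON
    max_workers=4,             # size of the ThreadPoolExecutor used in start()
    delay=0.5,                 # seconds slept between submitting crawl tasks
    verbose=False,             # silence the per-link console output
)
results = crawler.start()      # Dict[str, Dict[str, List[str]]]
print(f"Crawled {len(results)} pages")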