
Commit 3e581e2

Merge pull request #7 from indrajithi/feature/concurrent-workers
add support for concurrent workers, custom delay and optional verbose
2 parents e65ef69 + 9d5a905 commit 3e581e2

5 files changed

Lines changed: 61 additions & 25 deletions

File tree

.github/workflows/ci.yml
.pylintrc
README.md
pyproject.toml
tiny_web_crawler/crawler.py

.github/workflows/ci.yml

Lines changed: 4 additions & 1 deletion
@@ -18,9 +18,12 @@ jobs:
       - name: Install dependencies
         run: |
           poetry install
-      - name: Run pylint
+      - name: Run linter :pylint
         run: |
           poetry run pylint tiny_web_crawler
+      - name: Run mypy :type_checking
+        run: |
+          poetry run mypy --install-types --non-interactive tiny_web_crawler
 
   test:
     needs: lint

.pylintrc

Lines changed: 2 additions & 2 deletions
@@ -285,10 +285,10 @@ exclude-too-few-public-methods=
 ignored-parents=
 
 # Maximum number of arguments for function / method.
-max-args=5
+max-args=10
 
 # Maximum number of attributes for a class (see R0902).
-max-attributes=7
+max-attributes=15
 
 # Maximum number of boolean expressions in an if statement (see R0916).
 max-bool-expr=5

README.md

Lines changed: 12 additions & 4 deletions
@@ -6,10 +6,10 @@ A simple and efficient web crawler in Python.
 
 ## Features
 
-- Crawl web pages and extract links starting from a root URL and extract all the links found on each page
+- Crawl web pages and extract links starting from a root URL recursively
 - Handle relative and absolute URLs
-- Save the results of your crawl in a structured JSON format for easy analysis and processing
 - Designed with simplicity in mind, making it easy to use and extend for various web crawling tasks
+- Set concurrent workers and custom delay
 
 ## Installation
 
@@ -27,8 +27,16 @@ from tiny_web_crawler.crawler import Spider
 root_url = 'http://github.com'
 max_links = 2
 
-spider = Spider(root_url, max_links)
-spider.start()
+crawl = Spider(root_url, max_links)
+crawl.start()
+
+
+# Set workers and delay (default: delay is 0.5 sec and verbose is True)
+# If you do not want delay, set delay=0
+
+crawl = Spider(root_url='https://github.com', max_links=5, max_workers=5, delay=1, verbose=False)
+crawl.start()
+
 ```
 
 
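The new keyword arguments shown in the README example can be combined with the pre-existing `save_to_file` option (documented in `tiny_web_crawler/crawler.py` below). A minimal sketch, assuming the package from this commit is installed; the filename `links.json` is illustrative:

```python
from tiny_web_crawler.crawler import Spider

# Five concurrent workers, a 1-second delay between task submissions,
# quiet output, and a JSON copy of the results written to disk.
spider = Spider(
    root_url='https://github.com',
    max_links=5,
    max_workers=5,
    delay=1,
    verbose=False,
    save_to_file='links.json',  # illustrative filename
)

results = spider.start()  # Dict[str, Dict[str, List[str]]]
for page, data in results.items():
    print(page, '->', len(data['urls']), 'links found')
```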
pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "tiny-web-crawler"
-version = "0.2.0"
+version = "0.3.0"
 description = "A simple and efficient web crawler in Python."
 authors = ["Indrajith Indraprastham <indr4jith@gmail.com>"]
 license = "GPL-3.0-or-later"
@@ -21,6 +21,7 @@ requests = "^2.32.3"
 pytest = "^6.2"
 responses = "^0.13.4"
 pylint = "^2.7"
+mypy = "^1.10.0"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]

tiny_web_crawler/crawler.py

Lines changed: 41 additions & 17 deletions
@@ -2,15 +2,16 @@
 import json
 import urllib.parse
 from typing import Dict, List, Optional, Set
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
+import time
 import requests
 import validators
 from bs4 import BeautifulSoup
 from colorama import Fore, Style, init
 
 init(autoreset=True)
 
-
 DEFAULT_SCHEME: str = 'http://'
 
 
@@ -27,7 +28,13 @@ class Spider():
         save_to_file (Optional[str]): The file path to save the crawl results.
     """
 
-    def __init__(self, root_url: str, max_links: int = 5, save_to_file: Optional[str] = None) -> None:
+    def __init__(self,
+                 root_url: str,
+                 max_links: int = 5,
+                 save_to_file: Optional[str] = None,
+                 max_workers: int = 1,
+                 delay: float = 0.5,
+                 verbose: bool = True) -> None:
         """
         Initializes the Spider class.
 
@@ -43,6 +50,9 @@ def __init__(self, root_url: str, max_links: int = 5, save_to_file: Optional[str
         self.link_count: int = 0
         self.save_to_file: Optional[str] = save_to_file
         self.scheme: str = DEFAULT_SCHEME
+        self.max_workers: int = max_workers
+        self.delay: float = delay
+        self.verbose: bool = verbose
 
     def fetch_url(self, url: str) -> Optional[BeautifulSoup]:
         """
@@ -54,7 +64,6 @@ def fetch_url(self, url: str) -> Optional[BeautifulSoup]:
         Returns:
             Optional[BeautifulSoup]: A BeautifulSoup object if the URL is fetched successfully, None otherwise.
         """
-
         try:
             response = requests.get(url, timeout=10)
             response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
@@ -83,6 +92,10 @@ def is_valid_url(url: str) -> bool:
         """
         return bool(validators.url(url))
 
+    def verbose_print(self, content: str) -> None:
+        if self.verbose:
+            print(content)
+
     def save_results(self) -> None:
         """
         Saves the crawl results into a JSON file.
@@ -127,37 +140,38 @@ def crawl(self, url: str) -> None:
             url (str): The URL to crawl.
         """
         if not self.is_valid_url(url):
-            print(Fore.RED + f"Invalid url to crawl: {url}")
+            self.verbose_print(Fore.RED + f"Invalid url to crawl: {url}")
             return
 
         if url in self.crawl_result:
-            print(Fore.YELLOW + f"URL already crawled: {url}")
+            self.verbose_print(Fore.YELLOW + f"URL already crawled: {url}")
             return
 
-        print(Fore.GREEN + f"Crawling: {url}")
+        self.verbose_print(Fore.GREEN + f"Crawling: {url}")
         soup = self.fetch_url(url)
         if not soup:
             return
 
-        links = soup.body.find_all('a', href=True)
+        links = soup.body.find_all('a', href=True) if soup.body else []
        self.crawl_result[url] = {'urls': []}
 
         for link in links:
             pretty_url = self.format_url(link['href'].lstrip(), url)
             if not self.is_valid_url(pretty_url):
-                print(Fore.RED + f"Invalid url: {pretty_url}")
+                self.verbose_print(Fore.RED + f"Invalid url: {pretty_url}")
                 continue
 
             if pretty_url in self.crawl_result[url]['urls']:
                 continue
 
             self.crawl_result[url]['urls'].append(pretty_url)
             self.crawl_set.add(pretty_url)
-            print(Fore.BLUE + f"Link found: {pretty_url}")
+            self.verbose_print(Fore.BLUE + f"Link found: {pretty_url}")
 
         if self.link_count < self.max_links:
             self.link_count += 1
-            print(Fore.GREEN + f"Links crawled: {self.link_count}")
+            self.verbose_print(
+                Fore.GREEN + f"Links crawled: {self.link_count}")
 
     def start(self) -> Dict[str, Dict[str, List[str]]]:
         """
@@ -166,25 +180,35 @@ def start(self) -> Dict[str, Dict[str, List[str]]]:
         Returns:
             Dict[str, Dict[str, List[str]]]: The crawl results.
         """
-        self.crawl(self.root_url)
-
-        while self.crawl_set and self.link_count < self.max_links:
-            self.crawl(self.crawl_set.pop())
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            futures = {executor.submit(self.crawl, self.root_url)}
+
+            while self.link_count < self.max_links and futures:
+                for future in as_completed(futures):
+                    futures.remove(future)
+                    if future.exception() is None:
+                        while self.link_count < self.max_links and self.crawl_set:
+                            url = self.crawl_set.pop()
+                            if url not in self.crawl_result:
+                                futures.add(executor.submit(self.crawl, url))
+                                time.sleep(self.delay)
+                                break  # Break to check the next future
 
         if self.save_to_file:
             self.save_results()
-        print(Style.BRIGHT + Fore.MAGENTA + "Exiting....")
+        self.verbose_print(Style.BRIGHT + Fore.MAGENTA + "Exiting....")
         return self.crawl_result
 
 
 def main() -> None:
     """
     The main function to initialize and start the crawler.
     """
-    root_url = 'http://github.com'
-    max_links = 2
+    root_url = 'https://pypi.org/'
+    max_links = 5
 
     crawler = Spider(root_url, max_links, save_to_file='out.json')
+    print(Fore.GREEN + f"Crawling: {root_url}")
     crawler.start()
 
 
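The reworked `start()` drives the crawl through a `ThreadPoolExecutor`, resubmitting work from `crawl_set` each time a future completes and sleeping for `delay` seconds between submissions. For readers unfamiliar with that dispatch pattern, here is a standalone sketch of the same submit/`as_completed` loop; the `fetch` task and the URL set are hypothetical placeholders, not part of the crawler:

```python
import time
from concurrent.futures import ThreadPoolExecutor, as_completed


def fetch(url: str) -> str:
    # Hypothetical stand-in for Spider.crawl(); a real task would do network I/O.
    return url.upper()


pending = {'https://example.com/a', 'https://example.com/b', 'https://example.com/c'}
delay = 0.5  # mirrors the Spider's default delay between submissions

with ThreadPoolExecutor(max_workers=2) as executor:
    futures = {executor.submit(fetch, pending.pop())}
    while futures:
        for future in as_completed(futures):
            futures.remove(future)               # this future is finished
            if future.exception() is None and pending:
                futures.add(executor.submit(fetch, pending.pop()))
                time.sleep(delay)                # throttle new submissions
            break  # re-enter as_completed() with the updated future set
```

Breaking out of the `for` loop after each completed future lets the outer `while` re-evaluate the mutated `futures` set, which is the same trick the new `start()` uses to keep resubmitting freshly discovered URLs until `max_links` is reached.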