-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathurl_frontier.py
More file actions
77 lines (59 loc) · 2.4 KB
/
url_frontier.py
File metadata and controls
77 lines (59 loc) · 2.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from urllib.parse import urlparse
from typing import Set
import asyncio
from product_url_analyser import is_dead_end_url, is_product_url
from work_tracker import WorkTracker
# URL suffixes for static assets and downloads.
# NOTE(review): not referenced in the visible part of this module; presumably
# consumed by link-filtering code elsewhere -- confirm before removing.
SKIP_EXTENSIONS = (
    ".jpg",
    ".png",
    ".svg",
    ".js",
    ".css",
    ".ico",
    ".woff",
    ".ttf",
    ".mp4",
    ".pdf",
    ".zip",
)

# Path fragments for account/legal/utility pages.
# NOTE(review): also unused in the visible code; presumably a skip-list for
# the crawler -- confirm against callers.
SKIP_PATH_KEYWORDS = (
    "/login",
    "/signup",
    "/cart",
    "/help",
    "/terms",
    "/privacy",
    "/account",
)
class URLFrontier:
    """Priority-ordered frontier of URLs waiting to be crawled.

    URLs are scored by ``score_url`` and stored in an
    ``asyncio.PriorityQueue`` as ``(priority, url, depth)`` tuples, so the
    lowest score is dequeued first. A URL is recorded in ``visited`` at the
    moment it is enqueued and is therefore never queued twice. Consumers
    block in ``next_url`` until a URL is available or ``finish`` marks the
    frontier inactive.
    """

    def __init__(self, seed_url: str, tracker: "WorkTracker", max_depth: int = 3):
        """Create an empty frontier.

        Args:
            seed_url: URL whose network location is stored as
                ``allowed_domain``. The seed itself is NOT enqueued here.
            tracker: Object whose awaitable ``add(count)`` is called with the
                number of URLs newly enqueued by ``add_urls``.
            max_depth: Largest depth value a queued URL may carry.
        """
        self.queue = asyncio.PriorityQueue()
        self.visited: Set[str] = set()
        # Stored for callers; no method of this class consults it.
        self.allowed_domain = urlparse(seed_url).netloc
        self.max_depth = max_depth
        # Guards queue/visited updates and lets consumers sleep in next_url().
        self.condition = asyncio.Condition()
        self.tracker = tracker
        self.active = True

    def has_next(self) -> bool:
        """Return True when at least one URL is queued."""
        # asyncio queues do not implement __len__, so len(self.queue) raises
        # TypeError; empty() is the supported way to test for content.
        return not self.queue.empty()

    async def add_urls(self, urls: set[str], current_depth: int):
        """Enqueue every not-yet-seen URL at depth ``current_depth + 1``.

        Nothing is enqueued when the resulting depth would exceed
        ``max_depth``. When at least one URL is added, the frontier is marked
        active, waiting consumers are notified and ``tracker.add`` is awaited
        with the number of URLs added.

        Args:
            urls: Candidate URLs discovered on a page.
            current_depth: Depth of the page the URLs were found on.
        """
        next_depth = current_depth + 1
        # The depth limit is the same for every URL in the batch.
        if next_depth > self.max_depth:
            return

        added_count = 0
        async with self.condition:
            for url in urls:
                if url in self.visited:
                    continue
                self.visited.add(url)
                added_count += 1
                priority = self.score_url(url)
                await self.queue.put((priority, url, next_depth))
            if added_count > 0:
                self.active = True
                self.condition.notify_all()

        if added_count > 0:
            # NOTE(review): the tracker is updated after the URLs are already
            # visible to consumers -- confirm WorkTracker tolerates that order.
            await self.tracker.add(added_count)

    async def next_url(self):
        """Wait for and return the best queued URL as ``(url, depth)``.

        Raises:
            StopAsyncIteration: If the queue is empty and the frontier has
                been marked inactive by ``finish``.
        """
        async with self.condition:
            while self.queue.empty():
                if not self.active:
                    raise StopAsyncIteration
                await self.condition.wait()
            _priority, url, depth = await self.queue.get()
            return url, depth

    def is_empty(self):
        """Return True when no URL is currently queued."""
        return self.queue.empty()

    async def finish(self):
        """Mark the frontier inactive and wake every waiting consumer."""
        async with self.condition:
            self.active = False
            self.condition.notify_all()

    def score_url(self, url: str) -> int:
        """Return the queue priority for ``url`` (lower is dequeued sooner).

        Returns:
            1 if ``is_product_url`` flags the URL with a score above 1,
            3 if it flags the URL with a score from 0 to 1,
            100 if ``is_dead_end_url`` matches,
            10 otherwise.
        """
        is_prod_url, score, _explanation = is_product_url(url)
        if is_prod_url:
            # High-confidence product URL.
            if score > 1:
                return 1
            # Lower-confidence product URL; negative scores fall through.
            if score >= 0:
                return 3
        # Checked after the product tests, so a URL already classified as a
        # product above is never demoted here.
        if is_dead_end_url(url):
            return 100
        return 10