-
Notifications
You must be signed in to change notification settings - Fork 757
Expand file tree
/
Copy pathrobots.py
More file actions
149 lines (119 loc) · 5.97 KB
/
Copy pathrobots.py
File metadata and controls
149 lines (119 loc) · 5.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
from __future__ import annotations
from logging import getLogger
from typing import TYPE_CHECKING
from protego import Protego
from yarl import URL
from crawlee._utils.sitemap import Sitemap
from crawlee._utils.urls import filter_url
from crawlee._utils.web import is_status_code_client_error
if TYPE_CHECKING:
from typing_extensions import Self
from crawlee._types import EnqueueStrategy
from crawlee.http_clients import HttpClient
from crawlee.proxy_configuration import ProxyInfo
logger = getLogger(__name__)
class RobotsTxtFile:
def __init__(
self,
url: str,
robots: Protego,
http_client: HttpClient | None = None,
proxy_info: ProxyInfo | None = None,
) -> None:
self._robots = robots
self._original_url = URL(url).origin()
self._http_client = http_client
self._proxy_info = proxy_info
@classmethod
async def from_content(cls, url: str, content: str) -> Self:
"""Create a `RobotsTxtFile` instance from the given content.
Args:
url: The URL associated with the robots.txt file.
content: The raw string content of the robots.txt file to be parsed.
"""
robots = Protego.parse(content)
return cls(url, robots)
@classmethod
async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
"""Load the robots.txt file for a given URL.
Args:
url: The direct URL of the robots.txt file to be loaded.
http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
"""
try:
response = await http_client.send_request(url, proxy_info=proxy_info)
body = (
b'User-agent: *\nAllow: /'
if is_status_code_client_error(response.status_code)
else await response.read()
)
robots = Protego.parse(body.decode('utf-8'))
except Exception as e:
logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')
robots = Protego.parse('User-agent: *\nAllow: /')
return cls(url, robots, http_client=http_client, proxy_info=proxy_info)
@classmethod
async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
"""Determine the location of a robots.txt file for a URL and fetch it.
Args:
url: The URL whose domain will be used to find the corresponding robots.txt file.
http_client: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
proxy_info: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
"""
robots_url = URL(url).with_path('/robots.txt')
return await cls.load(str(robots_url), http_client, proxy_info)
def is_allowed(self, url: str, user_agent: str = '*') -> bool:
"""Check if the given URL is allowed for the given user agent.
Args:
url: The URL to check against the robots.txt rules.
user_agent: The user-agent string to check permissions for. Defaults to '*' which matches any user-agent.
"""
check_url = URL(url)
if check_url.origin() != self._original_url:
return True
return bool(self._robots.can_fetch(str(check_url), user_agent))
def get_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
"""Get the list of sitemap URLs from the robots.txt file, filtered by enqueue strategy.
Args:
enqueue_strategy: Strategy used to filter sitemap entries relative to the robots.txt URL's host.
Pass `'same-hostname'` to match the sitemap protocol's same-host expectation, or `'all'` to
disable host filtering. Regardless of the strategy, entries with non-`http(s)` schemes are
always filtered out.
"""
sitemaps: list[str] = []
for sitemap_url in self._robots.sitemaps:
ok, reason = filter_url(target=sitemap_url, strategy=enqueue_strategy, origin=self._original_url)
if not ok:
logger.warning(
f'Skipping sitemap {sitemap_url!r} listed in robots.txt at {str(self._original_url)!r}: {reason}.'
)
continue
sitemaps.append(sitemap_url)
return sitemaps
def get_crawl_delay(self, user_agent: str = '*') -> int | None:
"""Get the crawl delay for the given user agent.
Args:
user_agent: The user-agent string to check the crawl delay for. Defaults to '*' which matches any
user-agent.
"""
crawl_delay = self._robots.crawl_delay(user_agent)
return int(crawl_delay) if crawl_delay is not None else None
async def parse_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> Sitemap:
"""Parse the sitemaps from the robots.txt file and return a `Sitemap` instance.
Args:
enqueue_strategy: Forwarded to `get_sitemaps`; see that method for details.
"""
sitemaps = self.get_sitemaps(enqueue_strategy=enqueue_strategy)
if not self._http_client:
raise ValueError('HTTP client is required to parse sitemaps.')
return await Sitemap.load(
sitemaps, self._http_client, self._proxy_info, parse_sitemap_options={'enqueue_strategy': enqueue_strategy}
)
async def parse_urls_from_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
"""Parse the sitemaps in the robots.txt file and return a list URLs.
Args:
enqueue_strategy: Forwarded to `get_sitemaps`; see that method for details.
"""
sitemap = await self.parse_sitemaps(enqueue_strategy=enqueue_strategy)
return sitemap.urls