Skip to content

Commit 96d0d8a

Browse files
committed
docs: use Request.crawl_depth for depth tracking in BeautifulSoup and Parsel examples
1 parent 4c681ac commit 96d0d8a

2 files changed

Lines changed: 13 additions & 23 deletions

File tree

docs/03_guides/code/01_beautifulsoup_httpx.py

Lines changed: 7 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -23,23 +23,20 @@ async def main() -> None:
2323
# Open the default request queue for handling URLs to be processed.
2424
request_queue = await Actor.open_request_queue()
2525

26-
# Enqueue the start URLs with an initial crawl depth of 0.
26+
# Enqueue the start URLs. Their crawl depth defaults to 0.
2727
for start_url in start_urls:
2828
url = start_url.get('url')
2929
Actor.log.info(f'Enqueuing {url} ...')
30-
new_request = Request.from_url(url, user_data={'depth': 0})
31-
await request_queue.add_request(new_request)
30+
await request_queue.add_request(Request.from_url(url))
3231

3332
# Create an HTTPX client to fetch the HTML content of the URLs.
3433
async with httpx.AsyncClient() as client:
3534
# Process the URLs from the request queue.
3635
while request := await request_queue.fetch_next_request():
3736
url = request.url
3837

39-
if not isinstance(request.user_data['depth'], (str, int)):
40-
raise TypeError('Request.depth is an unexpected type.')
41-
42-
depth = int(request.user_data['depth'])
38+
# Read the crawl depth tracked by the request itself.
39+
depth = request.crawl_depth
4340
Actor.log.info(f'Scraping {url} (depth={depth}) ...')
4441

4542
try:
@@ -58,10 +55,8 @@ async def main() -> None:
5855

5956
if link_url.startswith(('http://', 'https://')):
6057
Actor.log.info(f'Enqueuing {link_url} ...')
61-
new_request = Request.from_url(
62-
link_url,
63-
user_data={'depth': depth + 1},
64-
)
58+
new_request = Request.from_url(link_url)
59+
new_request.crawl_depth = depth + 1
6560
await request_queue.add_request(new_request)
6661

6762
# Extract the desired data.
@@ -81,7 +76,7 @@ async def main() -> None:
8176

8277
finally:
8378
# Mark the request as handled to ensure it is not processed again.
84-
await request_queue.mark_request_as_handled(new_request)
79+
await request_queue.mark_request_as_handled(request)
8580

8681

8782
if __name__ == '__main__':

docs/03_guides/code/02_parsel_impit.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,23 +23,20 @@ async def main() -> None:
2323
# Open the default request queue for handling URLs to be processed.
2424
request_queue = await Actor.open_request_queue()
2525

26-
# Enqueue the start URLs with an initial crawl depth of 0.
26+
# Enqueue the start URLs. Their crawl depth defaults to 0.
2727
for start_url in start_urls:
2828
url = start_url.get('url')
2929
Actor.log.info(f'Enqueuing {url} ...')
30-
new_request = Request.from_url(url, user_data={'depth': 0})
31-
await request_queue.add_request(new_request)
30+
await request_queue.add_request(Request.from_url(url))
3231

3332
# Create an Impit client to fetch the HTML content of the URLs.
3433
async with impit.AsyncClient() as client:
3534
# Process the URLs from the request queue.
3635
while request := await request_queue.fetch_next_request():
3736
url = request.url
3837

39-
if not isinstance(request.user_data['depth'], (str, int)):
40-
raise TypeError('Request.depth is an unexpected type.')
41-
42-
depth = int(request.user_data['depth'])
38+
# Read the crawl depth tracked by the request itself.
39+
depth = request.crawl_depth
4340
Actor.log.info(f'Scraping {url} (depth={depth}) ...')
4441

4542
try:
@@ -59,10 +56,8 @@ async def main() -> None:
5956

6057
if link_url.startswith(('http://', 'https://')):
6158
Actor.log.info(f'Enqueuing {link_url} ...')
62-
new_request = Request.from_url(
63-
link_url,
64-
user_data={'depth': depth + 1},
65-
)
59+
new_request = Request.from_url(link_url)
60+
new_request.crawl_depth = depth + 1
6661
await request_queue.add_request(new_request)
6762

6863
# Extract the desired data using Parsel selectors.

0 commit comments

Comments
 (0)