@@ -23,23 +23,20 @@ async def main() -> None:
2323 # Open the default request queue for handling URLs to be processed.
2424 request_queue = await Actor .open_request_queue ()
2525
26- # Enqueue the start URLs with an initial crawl depth of 0.
26+ # Enqueue the start URLs. Their crawl depth defaults to 0.
2727 for start_url in start_urls :
2828 url = start_url .get ('url' )
2929 Actor .log .info (f'Enqueuing { url } ...' )
30- new_request = Request .from_url (url , user_data = {'depth' : 0 })
31- await request_queue .add_request (new_request )
30+ await request_queue .add_request (Request .from_url (url ))
3231
3332 # Create an HTTPX client to fetch the HTML content of the URLs.
3433 async with httpx .AsyncClient () as client :
3534 # Process the URLs from the request queue.
3635 while request := await request_queue .fetch_next_request ():
3736 url = request .url
3837
39- if not isinstance (request .user_data ['depth' ], (str , int )):
40- raise TypeError ('Request.depth is an unexpected type.' )
41-
42- depth = int (request .user_data ['depth' ])
38+ # Read the crawl depth tracked by the request itself.
39+ depth = request .crawl_depth
4340 Actor .log .info (f'Scraping { url } (depth={ depth } ) ...' )
4441
4542 try :
@@ -58,10 +55,8 @@ async def main() -> None:
5855
5956 if link_url .startswith (('http://' , 'https://' )):
6057 Actor .log .info (f'Enqueuing { link_url } ...' )
61- new_request = Request .from_url (
62- link_url ,
63- user_data = {'depth' : depth + 1 },
64- )
58+ new_request = Request .from_url (link_url )
59+ new_request .crawl_depth = depth + 1
6560 await request_queue .add_request (new_request )
6661
6762 # Extract the desired data.
@@ -81,7 +76,7 @@ async def main() -> None:
8176
8277 finally :
8378 # Mark the request as handled to ensure it is not processed again.
84- await request_queue .mark_request_as_handled (new_request )
79+ await request_queue .mark_request_as_handled (request )
8580
8681
8782if __name__ == '__main__' :
0 commit comments