import os
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import parse_qs, urljoin, urlparse

from bs4 import BeautifulSoup
from bs4.element import Tag
from dotenv import load_dotenv
from requests import Response
from scrapingbee import ScrapingBeeClient


TARGET_URL: str = "https://www.google.com/search?q=web+scraping&hl=en&gl=us"
HTML_FILE: Path = Path("google_search_rendered.html")
ERROR_FILE: Path = Path("scrapingbee_error.html")
SCREENSHOT_FILE: Path = Path("google_search_screenshot.png")


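# One organic result row: a 1-based rank plus the heading text and the
# cleaned outbound URL extracted from the result's <a href>.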
@dataclass(frozen=True)
class SearchResult:
    position: int
    title: str
    link: str


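# The key is read from a .env file in the working directory. A minimal
# .env looks like this (the value is a placeholder, not a real key):
#
#   SCRAPINGBEE_API_KEY=your-scrapingbee-api-key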
def get_api_key() -> str:
    load_dotenv()
    api_key = os.getenv("SCRAPINGBEE_API_KEY")

    if not api_key:
        raise ValueError("SCRAPINGBEE_API_KEY not found in .env file")

    return api_key


def create_client(api_key: str) -> ScrapingBeeClient:
    return ScrapingBeeClient(api_key=api_key)


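# On a failed request, dump the raw response body to disk so the error
# page ScrapingBee/Google returned can be inspected in a browser.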
def save_response_debug(response: Response, output_path: Path) -> None:
    output_path.write_bytes(response.content)

    print("\nScrapingBee request failed.")
    print(f"Status code: {response.status_code}")
    print(f"Saved response body to {output_path}")
    print("Open that file to see what ScrapingBee/Google returned.")


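# Fetch the search page through ScrapingBee. The options used here are
# ScrapingBee request parameters: custom_google routes the request through
# its Google-specific infrastructure (required for Google domains),
# render_js runs the page's JavaScript in a headless browser,
# wait_browser="load" waits for the browser's load event,
# block_resources="false" keeps images/CSS so the page renders fully, and
# premium_proxy uses the premium proxy pool. retries=3 re-issues the
# request on transient failures.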
def fetch_rendered_google_html(
    client: ScrapingBeeClient,
    url: str,
) -> str:
    response = client.get(
        url,
        params={
            "custom_google": "true",
            "render_js": "true",
            "wait_browser": "load",
            "block_resources": "false",
            "premium_proxy": "true",
        },
        retries=3,
    )

    if not response.ok:
        save_response_debug(response, ERROR_FILE)
        response.raise_for_status()

    return response.text


def save_html(html: str, output_path: Path) -> None:
    output_path.write_text(html, encoding="utf-8")
    print(f"Saved rendered HTML to {output_path}")


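# Same request as the HTML fetch, but with screenshot=true and
# screenshot_full_page=true ScrapingBee returns image bytes rather than
# HTML, so the response body is written straight to a PNG file.
# wait_browser="networkidle2" waits until network activity has mostly
# stopped, so the page has finished painting before the capture.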
def save_full_page_screenshot(
    client: ScrapingBeeClient,
    url: str,
    output_path: Path,
) -> None:
    response = client.get(
        url,
        params={
            "custom_google": "true",
            "render_js": "true",
            "wait_browser": "networkidle2",
            "screenshot": "true",
            "screenshot_full_page": "true",
            "block_resources": "false",
            "premium_proxy": "true",
        },
        retries=3,
    )

    if not response.ok:
        save_response_debug(response, ERROR_FILE)
        response.raise_for_status()

    output_path.write_bytes(response.content)
    print(f"Saved screenshot to {output_path}")


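# Google often wraps organic links in a redirect of the form
# /url?q=<real-url>&sa=..., with the real destination in the "q" query
# parameter. A hypothetical example href:
#
#   /url?q=https://example.com/page&sa=U
#
# Anything else (e.g. a relative href) is resolved against google.com.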
def clean_google_link(raw_link: str) -> str:
    if raw_link.startswith("/url?"):
        parsed_url = urlparse(raw_link)
        query_params = parse_qs(parsed_url.query)

        if "q" in query_params:
            return query_params["q"][0]

    return urljoin("https://www.google.com", raw_link)


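# A link with no netloc is relative, i.e. it points back into Google
# itself. The substring check also treats country-level domains such as
# google.co.uk or google.de as internal.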
def is_google_internal_link(link: str) -> bool:
    parsed_url = urlparse(link)

    if not parsed_url.netloc:
        return True

    return "google." in parsed_url.netloc


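# Heuristic for organic results: Google renders each organic hit as an
# <a> that wraps an <h3> heading. Ads and navigation links generally lack
# that structure, and whatever survives is still filtered for non-http
# schemes, Google-internal hosts, and duplicates.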
def parse_search_results(html: str) -> list[SearchResult]:
    soup = BeautifulSoup(html, "html.parser")
    results: list[SearchResult] = []
    seen_links: set[str] = set()

    for link_tag in soup.find_all("a", href=True):
        if not isinstance(link_tag, Tag):
            continue

        # Only anchors that wrap an <h3> heading count as organic results.
        heading = link_tag.find("h3")

        if not isinstance(heading, Tag):
            continue

        raw_link = link_tag.get("href")

        if not isinstance(raw_link, str):
            continue

        title = heading.get_text(strip=True)
        link = clean_google_link(raw_link)

        # Skip empty titles, non-http(s) schemes, Google-internal hosts,
        # and links already seen earlier on the page.
        if not title or not link.startswith("http"):
            continue

        if is_google_internal_link(link):
            continue

        if link in seen_links:
            continue

        seen_links.add(link)

        results.append(
            SearchResult(
                position=len(results) + 1,
                title=title,
                link=link,
            )
        )

    return results


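# Prints the results as a numbered list. A hypothetical output shape:
#
#   Search results:
#
#   1. Some Page Title
#   https://example.com/page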
def print_search_results(results: list[SearchResult]) -> None:
    if not results:
        print("No search results found in the rendered HTML.")
        return

    print("\nSearch results:")

    for result in results:
        print(f"\n{result.position}. {result.title}")
        print(result.link)


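# End-to-end flow: load the key, fetch and save the rendered HTML, parse
# and print the organic results, then capture a full-page screenshot.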
def main() -> None:
    api_key = get_api_key()
    client = create_client(api_key)

    html = fetch_rendered_google_html(client, TARGET_URL)
    save_html(html, HTML_FILE)

    results = parse_search_results(html)
    print_search_results(results)

    save_full_page_screenshot(client, TARGET_URL, SCREENSHOT_FILE)


if __name__ == "__main__":
    main()