
Commit f228ddb

refactors
1 parent 4259fc1 commit f228ddb

13 files changed

Lines changed: 826 additions & 472 deletions

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+.venv/
+.env

.python-version

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+3.14

google_scraper/data.csv

Lines changed: 4 additions & 3 deletions
@@ -1,3 +1,4 @@
-tripadvisor.com,bbc.com,foodnetwork.com
-4,13,10
-6,8,11
+the-carboholic.com,reddit.com,yelp.com
+2,3,4
+1,2,7
+1,3,5
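
A guess at how this file is consumed downstream (the column layout is an assumption, not confirmed by the diff): the first row names the tracked domains, and each later row records their Google result positions for one scrape run. A minimal reader sketch:

import csv
from pathlib import Path

# Hypothetical reader for data.csv: row 0 = domains, rows 1..n = positions.
with Path("google_scraper/data.csv").open(newline="") as f:
    rows = list(csv.reader(f))

sites, runs = rows[0], rows[1:]
for site, positions in zip(sites, zip(*runs)):
    print(site, "->", positions)
# the-carboholic.com -> ('2', '1', '1')
# reddit.com -> ('3', '2', '3')
# yelp.com -> ('4', '7', '5')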

google_scraper/scrape_diy.py

Lines changed: 116 additions & 37 deletions
@@ -1,51 +1,130 @@
+from dataclasses import dataclass
+from urllib.parse import parse_qs, urljoin, urlparse
+
 import requests
 from bs4 import BeautifulSoup
-from scrapingbee import ScrapingBeeClient
+from bs4.element import Tag
+
+GOOGLE_SEARCH_URL: str = "https://www.google.com/search"
+SEARCH_QUERY: str = "web scraping"
+
+CONSENT_COOKIE: str = "YES+cb.20220419-08-p0.cs+FX+111"
+
+USER_AGENT: str = (
+    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) "
+    "Gecko/20100101 Firefox/118.0"
+)
+
+
+@dataclass(frozen=True)
+class SearchResult:
+    position: int
+    title: str
+    link: str
+
+
+def fetch_google_html(query: str) -> str:
+    response = requests.get(
+        GOOGLE_SEARCH_URL,
+        params={
+            "q": query,
+            "hl": "en",
+            "gl": "us",
+        },
+        headers={
+            "User-Agent": USER_AGENT,
+        },
+        cookies={
+            "CONSENT": CONSENT_COOKIE,
+        },
+        timeout=10,
+    )
+
+    response.raise_for_status()
+    return response.text
+
+
+def create_soup(html: str) -> BeautifulSoup:
+    return BeautifulSoup(html, "html.parser")
+
+
+def find_result_headings(soup: BeautifulSoup) -> list[Tag]:
+    headings = soup.select("#search h3")
+
+    return [
+        heading
+        for heading in headings
+        if isinstance(heading, Tag)
+    ]
+
+
+def clean_google_link(raw_link: str) -> str:
+    if raw_link.startswith("/url?"):
+        parsed_url = urlparse(raw_link)
+        query_params = parse_qs(parsed_url.query)
+
+        if "q" in query_params:
+            return query_params["q"][0]
+
+    return urljoin("https://www.google.com", raw_link)
+
+
+def parse_search_results(soup: BeautifulSoup) -> list[SearchResult]:
+    results: list[SearchResult] = []
+    seen_links: set[str] = set()
+
+    for heading in find_result_headings(soup):
+        link_tag = heading.find_parent("a", href=True)
+
+        if not isinstance(link_tag, Tag):
+            continue
+
+        raw_link = link_tag.get("href")
+
+        if not isinstance(raw_link, str):
+            continue
 
-text = "web scraping"
-url = "https://google.com/search?q=" + text
+        title = heading.get_text(strip=True)
+        link = clean_google_link(raw_link)
 
-cookies = {"CONSENT": "YES+cb.20220419-08-p0.cs+FX+111"}
+        if not title or not link.startswith("http"):
+            continue
 
-headers = {
-    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0"
-}
+        if link in seen_links:
+            continue
 
-response = requests.get(url, headers=headers, cookies=cookies)
+        seen_links.add(link)
 
-soup = BeautifulSoup(response.content, "html.parser")
+        results.append(
+            SearchResult(
+                position=len(results) + 1,
+                title=title,
+                link=link,
+            )
+        )
 
-heading_object = soup.select("#search h3")
+    return results
+
 
-for i, result in enumerate(heading_object):
-    if "href" in result.parent.attrs:
-        print(i + 1)
-        print(result.string)
-        print(result.parent.attrs["href"])
-        print("------")
+def print_search_results(results: list[SearchResult]) -> None:
+    if not results:
+        print("No search results found.")
+        return
 
+    print("\nSearch results:")
 
-# OR
+    for result in results:
+        print(f"\n{result.position}. {result.title}")
+        print(result.link)
+
+
+def main() -> None:
+    html = fetch_google_html(SEARCH_QUERY)
+    soup = create_soup(html)
+    results = parse_search_results(soup)
 
-# client = ScrapingBeeClient(
-#     api_key=""
-# )
+    print_search_results(results)
 
-# response = client.get(
-#     "https://www.google.com/search?q=Best+Laptops+in+Europe&tbm=shop",
-#     params={
-#         "custom_google": "true",
-#         # 'premium_proxy': 'true',
-#         # 'country_code':'lv',
-#         "block_resources": "false",
-#         "wait": "1500",  # Waiting for the content to load (1.5 seconds)
-#         "screenshot": True,
-#         # Specify that we need the full height
-#         "screenshot_full_page": True,
-#         "forward_headers": True,
-#     },
-#     cookies=cookies,
-#     headers=headers,
-# )
 
-# soup = BeautifulSoup(response.content, "html.parser")
+if __name__ == "__main__":
+    main()
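
A minimal sketch exercising the refactored pipeline on canned HTML. The fragment and the import path scrape_diy are assumptions for illustration; real Google markup is far messier:

from scrape_diy import create_soup, parse_search_results, print_search_results

# Two fake results: one wrapped in Google's /url?q=... redirect, one direct.
html = """
<div id="search">
  <a href="/url?q=https://example.com/guide&amp;sa=U"><h3>A scraping guide</h3></a>
  <a href="https://example.org/tutorial"><h3>Another tutorial</h3></a>
</div>
"""

results = parse_search_results(create_soup(html))
print_search_results(results)
# clean_google_link unwraps the first href to https://example.com/guide;
# the second absolute URL passes through urljoin unchanged.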
Lines changed: 194 additions & 0 deletions
@@ -0,0 +1,194 @@
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from urllib.parse import parse_qs, urljoin, urlparse
+
+from bs4 import BeautifulSoup
+from bs4.element import Tag
+from dotenv import load_dotenv
+from requests import Response
+from scrapingbee import ScrapingBeeClient
+
+
+TARGET_URL: str = "https://www.google.com/search?q=web+scraping&hl=en&gl=us"
+HTML_FILE: Path = Path("google_search_rendered.html")
+ERROR_FILE: Path = Path("scrapingbee_error.html")
+SCREENSHOT_FILE: Path = Path("google_search_screenshot.png")
+
+
+@dataclass(frozen=True)
+class SearchResult:
+    position: int
+    title: str
+    link: str
+
+
+def get_api_key() -> str:
+    load_dotenv()
+    api_key = os.getenv("SCRAPINGBEE_API_KEY")
+
+    if not api_key:
+        raise ValueError("SCRAPINGBEE_API_KEY not found in .env file")
+
+    return api_key
+
+
+def create_client(api_key: str) -> ScrapingBeeClient:
+    return ScrapingBeeClient(api_key=api_key)
+
+
+def save_response_debug(response: Response, output_path: Path) -> None:
+    output_path.write_bytes(response.content)
+
+    print("\nScrapingBee request failed.")
+    print(f"Status code: {response.status_code}")
+    print(f"Saved response body to {output_path}")
+    print("Open that file to see what ScrapingBee/Google returned.")
+
+
+def fetch_rendered_google_html(
+    client: ScrapingBeeClient,
+    url: str,
+) -> str:
+    response = client.get(
+        url,
+        params={
+            "custom_google": "true",
+            "render_js": "true",
+            "wait_browser": "load",
+            "block_resources": "false",
+            "premium_proxy": "true",
+        },
+        retries=3,
+    )
+
+    if not response.ok:
+        save_response_debug(response, ERROR_FILE)
+        response.raise_for_status()
+
+    return response.text
+
+
+def save_html(html: str, output_path: Path) -> None:
+    output_path.write_text(html, encoding="utf-8")
+    print(f"Saved rendered HTML to {output_path}")
+
+
+def save_full_page_screenshot(
+    client: ScrapingBeeClient,
+    url: str,
+    output_path: Path,
+) -> None:
+    response = client.get(
+        url,
+        params={
+            "custom_google": "true",
+            "render_js": "true",
+            "wait_browser": "networkidle2",
+            "screenshot": "true",
+            "screenshot_full_page": "true",
+            "block_resources": "false",
+            "premium_proxy": "true",
+        },
+        retries=3,
+    )
+
+    if not response.ok:
+        save_response_debug(response, ERROR_FILE)
+        response.raise_for_status()
+
+    output_path.write_bytes(response.content)
+    print(f"Saved screenshot to {output_path}")
+
+
+def clean_google_link(raw_link: str) -> str:
+    if raw_link.startswith("/url?"):
+        parsed_url = urlparse(raw_link)
+        query_params = parse_qs(parsed_url.query)
+
+        if "q" in query_params:
+            return query_params["q"][0]
+
+    return urljoin("https://www.google.com", raw_link)
+
+
+def is_google_internal_link(link: str) -> bool:
+    parsed_url = urlparse(link)
+
+    if not parsed_url.netloc:
+        return True
+
+    return "google." in parsed_url.netloc
+
+
+def parse_search_results(html: str) -> list[SearchResult]:
+    soup = BeautifulSoup(html, "html.parser")
+    results: list[SearchResult] = []
+    seen_links: set[str] = set()
+
+    for link_tag in soup.find_all("a", href=True):
+        if not isinstance(link_tag, Tag):
+            continue
+
+        heading = link_tag.find("h3")
+
+        if not isinstance(heading, Tag):
+            continue
+
+        raw_link = link_tag.get("href")
+
+        if not isinstance(raw_link, str):
+            continue
+
+        title = heading.get_text(strip=True)
+        link = clean_google_link(raw_link)
+
+        if not title or not link.startswith("http"):
+            continue
+
+        if is_google_internal_link(link):
+            continue
+
+        if link in seen_links:
+            continue
+
+        seen_links.add(link)
+
+        results.append(
+            SearchResult(
+                position=len(results) + 1,
+                title=title,
+                link=link,
+            )
+        )
+
+    return results
+
+
+def print_search_results(results: list[SearchResult]) -> None:
+    if not results:
+        print("No search results found in the rendered HTML.")
+        return
+
+    print("\nSearch results:")
+
+    for result in results:
+        print(f"\n{result.position}. {result.title}")
+        print(result.link)
+
+
+def main() -> None:
+    api_key = get_api_key()
+    client = create_client(api_key)
+
+    html = fetch_rendered_google_html(client, TARGET_URL)
+    save_html(html, HTML_FILE)
+
+    results = parse_search_results(html)
+    print_search_results(results)
+
+    save_full_page_screenshot(client, TARGET_URL, SCREENSHOT_FILE)
+
+
+if __name__ == "__main__":
+    main()
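
This script fails fast without credentials: get_api_key() expects SCRAPINGBEE_API_KEY in a .env file loaded via python-dotenv. A placeholder .env would be a single line, SCRAPINGBEE_API_KEY=your-key-here, and a pre-flight check (a sketch, assuming the same working directory) might look like:

import os
from dotenv import load_dotenv

# Verify the key resolves before spending ScrapingBee credits;
# the variable name matches the lookup in get_api_key().
load_dotenv()
print("API key configured:", bool(os.getenv("SCRAPINGBEE_API_KEY")))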
