
Commit f228ddb

refactors
1 parent 4259fc1 commit f228ddb

13 files changed

Lines changed: 826 additions & 472 deletions

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+.venv/
+.env

.python-version

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+3.14

google_scraper/data.csv

Lines changed: 4 additions & 3 deletions
@@ -1,3 +1,4 @@
-tripadvisor.com,bbc.com,foodnetwork.com
-4,13,10
-6,8,11
+the-carboholic.com,reddit.com,yelp.com
+2,3,4
+1,2,7
+1,3,5
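
A guess at how this file is consumed downstream (the column layout is an assumption, not confirmed by the diff): the first row names the tracked domains, and each later row records their Google result positions for one scrape run. A minimal reader sketch:

import csv
from pathlib import Path

# Hypothetical reader for data.csv: row 0 = domains, rows 1..n = positions.
with Path("google_scraper/data.csv").open(newline="") as f:
    rows = list(csv.reader(f))

sites, runs = rows[0], rows[1:]
for site, positions in zip(sites, zip(*runs)):
    print(site, "->", positions)
# the-carboholic.com -> ('2', '1', '1')
# reddit.com -> ('3', '2', '3')
# yelp.com -> ('4', '7', '5')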

google_scraper/scrape_diy.py

Lines changed: 116 additions & 37 deletions
@@ -1,51 +1,130 @@
+from dataclasses import dataclass
+from urllib.parse import parse_qs, urljoin, urlparse
+
 import requests
 from bs4 import BeautifulSoup
-from scrapingbee import ScrapingBeeClient
+from bs4.element import Tag
+
+GOOGLE_SEARCH_URL: str = "https://www.google.com/search"
+SEARCH_QUERY: str = "web scraping"
+
+CONSENT_COOKIE: str = "YES+cb.20220419-08-p0.cs+FX+111"
+
+USER_AGENT: str = (
+    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) "
+    "Gecko/20100101 Firefox/118.0"
+)
+
+
+@dataclass(frozen=True)
+class SearchResult:
+    position: int
+    title: str
+    link: str
+
+
+def fetch_google_html(query: str) -> str:
+    response = requests.get(
+        GOOGLE_SEARCH_URL,
+        params={
+            "q": query,
+            "hl": "en",
+            "gl": "us",
+        },
+        headers={
+            "User-Agent": USER_AGENT,
+        },
+        cookies={
+            "CONSENT": CONSENT_COOKIE,
+        },
+        timeout=10,
+    )
+
+    response.raise_for_status()
+    return response.text
+
+
+def create_soup(html: str) -> BeautifulSoup:
+    return BeautifulSoup(html, "html.parser")
+
+
+def find_result_headings(soup: BeautifulSoup) -> list[Tag]:
+    headings = soup.select("#search h3")
+
+    return [
+        heading
+        for heading in headings
+        if isinstance(heading, Tag)
+    ]
+
+
+def clean_google_link(raw_link: str) -> str:
+    if raw_link.startswith("/url?"):
+        parsed_url = urlparse(raw_link)
+        query_params = parse_qs(parsed_url.query)
+
+        if "q" in query_params:
+            return query_params["q"][0]
+
+    return urljoin("https://www.google.com", raw_link)
+
+
+def parse_search_results(soup: BeautifulSoup) -> list[SearchResult]:
+    results: list[SearchResult] = []
+    seen_links: set[str] = set()
+
+    for heading in find_result_headings(soup):
+        link_tag = heading.find_parent("a", href=True)
+
+        if not isinstance(link_tag, Tag):
+            continue
+
+        raw_link = link_tag.get("href")
+
+        if not isinstance(raw_link, str):
+            continue
 
-text = "web scraping"
-url = "https://google.com/search?q=" + text
+        title = heading.get_text(strip=True)
+        link = clean_google_link(raw_link)
 
-cookies = {"CONSENT": "YES+cb.20220419-08-p0.cs+FX+111"}
+        if not title or not link.startswith("http"):
+            continue
 
-headers = {
-    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0"
-}
+        if link in seen_links:
+            continue
 
-response = requests.get(url, headers=headers, cookies=cookies)
+        seen_links.add(link)
 
-soup = BeautifulSoup(response.content, "html.parser")
+        results.append(
+            SearchResult(
+                position=len(results) + 1,
+                title=title,
+                link=link,
+            )
+        )
 
-heading_object = soup.select("#search h3")
+    return results
+
 
-for i, result in enumerate(heading_object):
-    if "href" in result.parent.attrs:
-        print(i + 1)
-        print(result.string)
-        print(result.parent.attrs["href"])
-        print("------")
+def print_search_results(results: list[SearchResult]) -> None:
+    if not results:
+        print("No search results found.")
+        return
 
+    print("\nSearch results:")
 
-# OR
+    for result in results:
+        print(f"\n{result.position}. {result.title}")
+        print(result.link)
+
+
+def main() -> None:
+    html = fetch_google_html(SEARCH_QUERY)
+    soup = create_soup(html)
+    results = parse_search_results(soup)
 
-# client = ScrapingBeeClient(
-#     api_key=""
-# )
+    print_search_results(results)
 
-# response = client.get(
-#     "https://www.google.com/search?q=Best+Laptops+in+Europe&tbm=shop",
-#     params={
-#         "custom_google": "true",
-#         # 'premium_proxy': 'true',
-#         # 'country_code':'lv',
-#         "block_resources": "false",
-#         "wait": "1500",  # Waiting for the content to load (1.5 seconds)
-#         "screenshot": True,
-#         # Specify that we need the full height
-#         "screenshot_full_page": True,
-#         "forward_headers": True,
-#     },
-#     cookies=cookies,
-#     headers=headers,
-# )
 
-# soup = BeautifulSoup(response.content, "html.parser")
+if __name__ == "__main__":
+    main()
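
A minimal sketch exercising the refactored pipeline on canned HTML. The fragment and the import path scrape_diy are assumptions for illustration; real Google markup is far messier:

from scrape_diy import create_soup, parse_search_results, print_search_results

# Two fake results: one wrapped in Google's /url?q=... redirect, one direct.
html = """
<div id="search">
  <a href="/url?q=https://example.com/guide&amp;sa=U"><h3>A scraping guide</h3></a>
  <a href="https://example.org/tutorial"><h3>Another tutorial</h3></a>
</div>
"""

results = parse_search_results(create_soup(html))
print_search_results(results)
# clean_google_link unwraps the first href to https://example.com/guide;
# the second absolute URL passes through urljoin unchanged.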
Lines changed: 194 additions & 0 deletions
@@ -0,0 +1,194 @@
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from urllib.parse import parse_qs, urljoin, urlparse
+
+from bs4 import BeautifulSoup
+from bs4.element import Tag
+from dotenv import load_dotenv
+from requests import Response
+from scrapingbee import ScrapingBeeClient
+
+
+TARGET_URL: str = "https://www.google.com/search?q=web+scraping&hl=en&gl=us"
+HTML_FILE: Path = Path("google_search_rendered.html")
+ERROR_FILE: Path = Path("scrapingbee_error.html")
+SCREENSHOT_FILE: Path = Path("google_search_screenshot.png")
+
+
+@dataclass(frozen=True)
+class SearchResult:
+    position: int
+    title: str
+    link: str
+
+
+def get_api_key() -> str:
+    load_dotenv()
+    api_key = os.getenv("SCRAPINGBEE_API_KEY")
+
+    if not api_key:
+        raise ValueError("SCRAPINGBEE_API_KEY not found in .env file")
+
+    return api_key
+
+
+def create_client(api_key: str) -> ScrapingBeeClient:
+    return ScrapingBeeClient(api_key=api_key)
+
+
+def save_response_debug(response: Response, output_path: Path) -> None:
+    output_path.write_bytes(response.content)
+
+    print("\nScrapingBee request failed.")
+    print(f"Status code: {response.status_code}")
+    print(f"Saved response body to {output_path}")
+    print("Open that file to see what ScrapingBee/Google returned.")
+
+
+def fetch_rendered_google_html(
+    client: ScrapingBeeClient,
+    url: str,
+) -> str:
+    response = client.get(
+        url,
+        params={
+            "custom_google": "true",
+            "render_js": "true",
+            "wait_browser": "load",
+            "block_resources": "false",
+            "premium_proxy": "true",
+        },
+        retries=3,
+    )
+
+    if not response.ok:
+        save_response_debug(response, ERROR_FILE)
+        response.raise_for_status()
+
+    return response.text
+
+
+def save_html(html: str, output_path: Path) -> None:
+    output_path.write_text(html, encoding="utf-8")
+    print(f"Saved rendered HTML to {output_path}")
+
+
+def save_full_page_screenshot(
+    client: ScrapingBeeClient,
+    url: str,
+    output_path: Path,
+) -> None:
+    response = client.get(
+        url,
+        params={
+            "custom_google": "true",
+            "render_js": "true",
+            "wait_browser": "networkidle2",
+            "screenshot": "true",
+            "screenshot_full_page": "true",
+            "block_resources": "false",
+            "premium_proxy": "true",
+        },
+        retries=3,
+    )
+
+    if not response.ok:
+        save_response_debug(response, ERROR_FILE)
+        response.raise_for_status()
+
+    output_path.write_bytes(response.content)
+    print(f"Saved screenshot to {output_path}")
+
+
+def clean_google_link(raw_link: str) -> str:
+    if raw_link.startswith("/url?"):
+        parsed_url = urlparse(raw_link)
+        query_params = parse_qs(parsed_url.query)
+
+        if "q" in query_params:
+            return query_params["q"][0]
+
+    return urljoin("https://www.google.com", raw_link)
+
+
+def is_google_internal_link(link: str) -> bool:
+    parsed_url = urlparse(link)
+
+    if not parsed_url.netloc:
+        return True
+
+    return "google." in parsed_url.netloc
+
+
+def parse_search_results(html: str) -> list[SearchResult]:
+    soup = BeautifulSoup(html, "html.parser")
+    results: list[SearchResult] = []
+    seen_links: set[str] = set()
+
+    for link_tag in soup.find_all("a", href=True):
+        if not isinstance(link_tag, Tag):
+            continue
+
+        heading = link_tag.find("h3")
+
+        if not isinstance(heading, Tag):
+            continue
+
+        raw_link = link_tag.get("href")
+
+        if not isinstance(raw_link, str):
+            continue
+
+        title = heading.get_text(strip=True)
+        link = clean_google_link(raw_link)
+
+        if not title or not link.startswith("http"):
+            continue
+
+        if is_google_internal_link(link):
+            continue
+
+        if link in seen_links:
+            continue
+
+        seen_links.add(link)
+
+        results.append(
+            SearchResult(
+                position=len(results) + 1,
+                title=title,
+                link=link,
+            )
+        )
+
+    return results
+
+
+def print_search_results(results: list[SearchResult]) -> None:
+    if not results:
+        print("No search results found in the rendered HTML.")
+        return
+
+    print("\nSearch results:")
+
+    for result in results:
+        print(f"\n{result.position}. {result.title}")
+        print(result.link)
+
+
+def main() -> None:
+    api_key = get_api_key()
+    client = create_client(api_key)
+
+    html = fetch_rendered_google_html(client, TARGET_URL)
+    save_html(html, HTML_FILE)
+
+    results = parse_search_results(html)
+    print_search_results(results)
+
+    save_full_page_screenshot(client, TARGET_URL, SCREENSHOT_FILE)
+
+
+if __name__ == "__main__":
+    main()
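
This script fails fast without credentials: get_api_key() expects SCRAPINGBEE_API_KEY in a .env file loaded via python-dotenv. A placeholder .env would be a single line, SCRAPINGBEE_API_KEY=your-key-here, and a pre-flight check (a sketch, assuming the same working directory) might look like:

import os
from dotenv import load_dotenv

# Verify the key resolves before spending ScrapingBee credits;
# the variable name matches the lookup in get_api_key().
load_dotenv()
print("API key configured:", bool(os.getenv("SCRAPINGBEE_API_KEY")))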
