-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweb_scraping.py
More file actions
123 lines (93 loc) · 3.44 KB
/
web_scraping.py
File metadata and controls
123 lines (93 loc) · 3.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import requests
from bs4 import BeautifulSoup
from gspread import auth
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
import pickle
import os
from dataclasses import dataclass
import re
@dataclass(frozen=True)
class Config:
    """Configuration constants for the script."""
    # OAuth scopes: spreadsheet read/write plus Drive access (needed by
    # gspread to open a spreadsheet by name).
    SCOPES: tuple[str, ...] = (
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive",
    )
    # Cached OAuth credentials (pickled) from a previous interactive login.
    TOKEN_PATH: str = "token.pickle"
    # OAuth client-secrets JSON downloaded from the Google Cloud console.
    CREDENTIALS_PATH: str = "creds.json"
    # Spreadsheet name as it appears in Google Drive.
    SPREADSHEET_NAME: str = "Test"
    # Page whose <p> elements are scraped.
    TARGET_URL: str = "https://datatracker.ietf.org/doc/html/rfc1918"
def scrape_website(url):
"""Scrape data from the given website and return a list of formatted strings."""
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
# Find all <a> elements
elements = soup.find_all("p")
if not elements:
print("No <p> elements found on the webpage.")
return []
# Extract and clean text
raw_data = [element.get_text(strip=True) for element in elements]
# Ensure spaces after specific keywords
return [
re.sub(
r"(Adress:|Telefon:|E-post:)(\S)",
r"\1 \2",
text,
)
for text in raw_data
]
def authenticate_google_sheets():
    """Authenticate with the Google Sheets API and return a gspread client.

    Loads cached OAuth credentials from Config.TOKEN_PATH when present and
    refreshes them if expired. If there is no usable token — including the
    case where the refresh itself fails because the refresh token was
    revoked or expired — falls back to the interactive installed-app flow
    using Config.CREDENTIALS_PATH, then re-caches the credentials so later
    runs skip the browser prompt.
    """
    creds = None
    if os.path.exists(Config.TOKEN_PATH):
        # NOTE: unpickling is only safe because this token file is produced
        # locally by this script; never load a token from an untrusted source.
        with open(Config.TOKEN_PATH, "rb") as token:
            creds = pickle.load(token)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())
            except Exception:
                # Refresh token revoked/expired/invalid: discard it and fall
                # through to a fresh interactive login instead of crashing.
                creds = None
        if not creds or not creds.valid:
            flow = InstalledAppFlow.from_client_secrets_file(
                Config.CREDENTIALS_PATH, Config.SCOPES
            )
            creds = flow.run_local_server(port=0)
        # Cache the new or refreshed credentials for the next run.
        with open(Config.TOKEN_PATH, "wb") as token:
            pickle.dump(creds, token)
    return auth.authorize(creds)
def write_to_google_sheets(client, data):
    """Append unique, formatted rows from *data* to column A of the sheet.

    Each entry is formatted (a space enforced after '.' and ',', outer
    whitespace stripped) BEFORE the duplicate check, so membership is tested
    against the form actually stored in the sheet. The original compared the
    raw scraped text against the already-formatted cells, which never
    matched and re-appended the same rows on every run.
    """
    if not data:
        print("No data to write.")
        return
    spreadsheet = client.open(Config.SPREADSHEET_NAME)
    worksheet = spreadsheet.sheet1  # first worksheet of the spreadsheet
    # Existing column-A values; a set gives O(1) membership tests.
    existing_values = set(worksheet.col_values(1))
    # Format first, then sort, so dedup compares stored-form to stored-form.
    formatted = sorted(
        row.replace(".", ". ").replace(",", ", ").strip() for row in data
    )
    new_data = [[row] for row in formatted if row not in existing_values]
    if not new_data:
        print("No new data to add. All entries are duplicates.")
        return
    # Single batched append instead of one API round-trip per row.
    worksheet.append_rows(new_data)
    print(f"Added {len(new_data)} new rows to Google Sheets!")
def main():
    """Entry point: scrape the configured page and push results to Sheets."""
    target = Config.TARGET_URL
    print("Scraping data from:", target)
    scraped_rows = scrape_website(target)
    if scraped_rows:
        print("Authenticating with Google Sheets...")
        sheets_client = authenticate_google_sheets()
        print("Writing data to Google Sheets...")
        write_to_google_sheets(sheets_client, scraped_rows)
    else:
        print("No data scraped. Exiting.")


if __name__ == "__main__":
    main()