-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathextract_data.py
More file actions
122 lines (93 loc) · 3.32 KB
/
Copy pathextract_data.py
File metadata and controls
122 lines (93 loc) · 3.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tenacity import (
    retry,
    retry_if_exception,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)
# CSV file that lists the pages to scrape (read by load_urls via main).
INPUT_CSV = Path("urls.csv")
# Text file that receives the extracted page text.
OUTPUT_FILE = Path("extracted_texts.txt")
# Name of the CSV column that holds the URLs.
URL_COLUMN = "loc"
# Timeout value passed to requests.get for every request.
REQUEST_TIMEOUT = 30
# Number of worker threads used to fetch pages concurrently.
MAX_WORKERS = 5
def load_urls(csv_path: Path) -> list[str]:
    """Load URLs from the sitemap CSV file.

    Args:
        csv_path: Path to a CSV file that contains a ``URL_COLUMN`` column.

    Returns:
        The whitespace-stripped, non-empty URLs in file order. Missing cells
        and ``n/a`` placeholders (in any letter case) are skipped. When the
        file cannot be read or parsed, or the column is missing, a message is
        printed and an empty list is returned.
    """
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Input file not found: {csv_path}")
        return []
    except pd.errors.EmptyDataError:
        print(f"Input file is empty: {csv_path}")
        return []
    except pd.errors.ParserError as e:
        print(f"Failed to parse CSV file {csv_path}: {e}")
        return []
    except (OSError, UnicodeDecodeError) as e:
        # Unreadable input (permission denied, a directory, a file that is not
        # valid UTF-8) is reported like every other input problem instead of
        # aborting the script with a traceback.
        print(f"Failed to read CSV file {csv_path}: {e}")
        return []

    if URL_COLUMN not in df.columns:
        print(f"Missing required column: {URL_COLUMN}")
        return []

    # Normalize each cell once, then drop blanks and "n/a" placeholders.
    return [
        url
        for value in df[URL_COLUMN].dropna()
        if (url := str(value).strip()) and url.lower() != "n/a"
    ]
def _is_transient_failure(exc: BaseException) -> bool:
    """Return True when repeating the request could plausibly succeed."""
    if isinstance(exc, requests.HTTPError) and exc.response is not None:
        status = exc.response.status_code
        # Client errors such as 404 or 403 are permanent; only request
        # timeouts, rate limiting and server-side errors deserve a retry.
        return status >= 500 or status in (408, 429)
    # Connection errors, timeouts and similar failures carry no status code.
    return True


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    retry=(
        retry_if_exception_type(requests.RequestException)
        & retry_if_exception(_is_transient_failure)
    ),
    reraise=True,
)
def fetch_with_retry(url: str) -> requests.Response:
    """Fetch a URL, retrying transient failures with exponential backoff.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The response, guaranteed to have a non-error status code.

    Raises:
        requests.RequestException: Immediately for permanent HTTP client
            errors, or after three attempts for transient failures. The
            original exception is re-raised (``reraise=True``).
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response
def fetch_page_text(url: str) -> str | None:
    """Fetch a page and extract readable text from its HTML.

    Args:
        url: Address of the page to download.

    Returns:
        The page text with one line per text node, or ``None`` when the
        request fails or the response is not declared as HTML. A message is
        printed in both ``None`` cases.
    """
    try:
        response = fetch_with_retry(url)
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    # Media types are case-insensitive, so compare in lower case.
    content_type = response.headers.get("Content-Type", "").lower()
    if content_type and "text/html" not in content_type:
        print(f"Skipping non-HTML page: {url}")
        return None

    # requests falls back to ISO-8859-1 for text/* responses that declare no
    # charset, which garbles UTF-8 pages when read through response.text.
    # Hand the raw bytes to BeautifulSoup so it can detect the encoding
    # (including <meta charset>), trusting the header only when it names one.
    declared_encoding = response.encoding if "charset" in content_type else None
    soup = BeautifulSoup(
        response.content,
        "html.parser",
        from_encoding=declared_encoding,
    )

    # Remove elements that usually do not contain useful training text.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    return soup.get_text(separator="\n", strip=True)
def save_texts(texts: list[str], output_path: Path) -> bool:
    """Write the extracted texts to a UTF-8 file, one blank line after each.

    Args:
        texts: Text chunks to store, written in the given order.
        output_path: Destination file; existing content is overwritten.

    Returns:
        ``True`` when everything was written, ``False`` (after printing the
        reason) when the file could not be opened or written.
    """
    try:
        with output_path.open("w", encoding="utf-8") as handle:
            # Each chunk is terminated by an empty line to separate pages.
            handle.writelines(chunk + "\n\n" for chunk in texts)
    except OSError as err:
        print(f"Failed to write output file {output_path}: {err}")
        return False
    else:
        return True
def main() -> None:
    """Download every URL from the input CSV and save the extracted text.

    Pages are fetched concurrently, but the texts are written in the order
    their URLs appear in the CSV so that repeated runs produce the same
    output file layout. Pages that fail or yield no text are skipped.
    """
    urls = load_urls(INPUT_CSV)
    if not urls:
        print("No URLs found. Nothing to extract.")
        return

    # Results are keyed by input position because as_completed yields futures
    # in completion order, which varies from run to run.
    texts_by_index: dict[int, str] = {}
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {
            executor.submit(fetch_page_text, url): (index, url)
            for index, url in enumerate(urls)
        }
        for future in as_completed(futures):
            index, url = futures[future]
            try:
                text = future.result()
            except Exception as e:
                # Top-level boundary: one bad page must not abort the batch.
                print(f"Unexpected error processing {url}: {e}")
                continue
            if text:
                texts_by_index[index] = text

    all_texts = [texts_by_index[index] for index in sorted(texts_by_index)]
    if save_texts(all_texts, OUTPUT_FILE):
        print("Text extraction completed successfully!")
# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    main()