-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathumn-scrape.py
More file actions
167 lines (135 loc) · 7.4 KB
/
umn-scrape.py
File metadata and controls
167 lines (135 loc) · 7.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import asyncio
from playwright.async_api import async_playwright
import os
from urllib.parse import urljoin
import time
# Root of the site; relative links found while scraping are resolved against it
BASE_URL = "https://open.umn.edu"

# Subject slugs whose listing pages are crawled, in crawl order
_SUBJECT_SLUGS = (
    "computer-science-information-systems",
    "business",
    "education",
    "engineering",
    "humanities",
    "journalism-media-studies-communications",
    "law",
    "mathematics",
    "medicine",
    "natural-sciences",
    "social-sciences",
)

# Full subject-listing URLs to scrape (one per slug above)
CATEGORIES = [
    f"{BASE_URL}/opentextbooks/subjects/{slug}" for slug in _SUBJECT_SLUGS
]

# PDFs are only fetched when their link points at one of these domains
ALLOWED_DOWNLOAD_DOMAINS = ["milneopentextbooks.org", "open.umn.edu"]

# Folder that receives the downloaded PDF files
DOWNLOAD_DIR = "./textbook_pdfs"
async def setup_download_dir():
    """Create the download directory if it doesn't exist.

    Uses ``exist_ok=True`` so the call is idempotent and there is no gap
    between checking for the directory and creating it.
    """
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
async def is_allowed_domain(url, allowed_domains=None):
    """Check whether *url* is hosted on an allowed download domain.

    The URL's hostname must equal an allowed domain or be a subdomain of
    one (``www.open.umn.edu`` matches ``open.umn.edu``). A domain that only
    appears in the path, query, or as a prefix of a different host does not
    count.

    Args:
        url: Absolute URL to check. ``None``, an empty string, or a URL
            without a hostname is rejected.
        allowed_domains: Optional iterable of domain names to check
            against. Defaults to the module-level
            ``ALLOWED_DOWNLOAD_DOMAINS``.

    Returns:
        ``True`` if the hostname matches an allowed domain, else ``False``.
    """
    # Local import: the file's top-level imports only bring in ``urljoin``.
    from urllib.parse import urlparse

    if not url:
        return False

    try:
        host = urlparse(url).hostname
    except ValueError:
        # urlparse raises on malformed netlocs (e.g. an unclosed IPv6 bracket)
        return False
    if not host:
        return False

    # ``hostname`` is already lower-cased; drop a trailing root dot
    host = host.rstrip(".")
    domains = ALLOWED_DOWNLOAD_DOMAINS if allowed_domains is None else allowed_domains
    return any(
        host == domain.lower() or host.endswith("." + domain.lower())
        for domain in domains
    )
def _safe_pdf_filename(book_title, fallback):
    """Return a filesystem-safe ``<title>.pdf`` name for *book_title*.

    Keeps only alphanumerics, spaces, hyphens and underscores, limits the
    stem to 100 characters, and uses *fallback* when no characters survive
    so the result is never just ``.pdf``.
    """
    stem = "".join(c for c in book_title if c.isalnum() or c in (' ', '-', '_')).strip()
    stem = stem[:100]  # Limit length
    return f"{stem or fallback}.pdf"


async def _collect_book_urls(page):
    """Return absolute URLs of every "READ MORE" link on the current page."""
    read_more_links = await page.locator('a:has-text("READ MORE")').all()
    print(f"Found {len(read_more_links)} books in this category")

    book_urls = []
    for link in read_more_links:
        href = await link.get_attribute('href')
        if href:
            # Convert relative URLs to absolute URLs
            book_urls.append(urljoin(BASE_URL, href))
    return book_urls


async def _download_book_pdf(page, book_url, idx):
    """Open one book page and try to download its PDF.

    Args:
        page: Playwright page used for navigation.
        book_url: Absolute URL of the book's detail page.
        idx: 1-based position of the book in its category, used for the
            ``book_<idx>`` fallback title.

    Returns:
        ``"downloaded"``, ``"skipped"`` or ``"error"``. Navigation errors
        are not caught here; they propagate to the caller.
    """
    await page.goto(book_url, wait_until='networkidle', timeout=60000)
    await page.wait_for_timeout(2000)

    # Get book title for filename; fall back to a positional name
    fallback_title = f"book_{idx}"
    try:
        heading_text = await page.locator('h1, h2').first.text_content()
        book_title = heading_text.strip() if heading_text else fallback_title
    except Exception:
        # Narrower than a bare ``except``: lets task cancellation and
        # KeyboardInterrupt propagate instead of being swallowed here.
        book_title = fallback_title

    pdf_button = page.locator('a:has-text("PDF")').first
    if await pdf_button.count() == 0:
        print(" ✗ No PDF button found")
        return "skipped"

    pdf_url = await pdf_button.get_attribute('href')
    if not pdf_url:
        print(" ✗ No PDF URL found")
        return "skipped"

    # Convert relative URL to absolute if needed
    pdf_url = urljoin(book_url, pdf_url)
    print(f" PDF URL: {pdf_url}")

    if not await is_allowed_domain(pdf_url):
        print(" ✗ Skipped: External site (not in allowed domains)")
        return "skipped"

    print(" ✓ Allowed domain detected, attempting download...")
    try:
        async with page.expect_download(timeout=30000) as download_info:
            await pdf_button.click()
            # NOTE(review): kept from the original ("wait for download to
            # start"); it may be redundant with expect_download - confirm
            # before removing.
            await page.wait_for_timeout(5000)
        download = await download_info.value

        filename = _safe_pdf_filename(book_title, fallback_title)
        await download.save_as(os.path.join(DOWNLOAD_DIR, filename))
        print(f" ✓ Downloaded: {filename}")
        return "downloaded"
    except Exception as e:
        print(f" ✗ Download failed: {e}")
        return "error"


async def scrape_books():
    """Crawl every category in ``CATEGORIES`` and download allowed PDFs.

    For each category page, collects the "READ MORE" book links, visits
    each book page, and downloads the linked PDF into ``DOWNLOAD_DIR``
    when its URL passes ``is_allowed_domain``. A failure on one book or
    category is logged and counted without stopping the run. Prints a
    summary of downloaded / skipped / error counts at the end.
    """
    async with async_playwright() as p:
        # Launch browser in headless mode
        browser = await p.chromium.launch(headless=True)
        tally = {"downloaded": 0, "skipped": 0, "error": 0}
        try:
            context = await browser.new_context(accept_downloads=True)
            page = await context.new_page()
            await setup_download_dir()

            for category_url in CATEGORIES:
                print(f"\n{'='*60}")
                print(f"Processing category: {category_url}")
                print(f"{'='*60}")
                try:
                    await page.goto(category_url, wait_until='networkidle', timeout=60000)
                    await page.wait_for_timeout(2000)

                    # Gather all URLs first: navigating to a book page
                    # leaves the listing, so links are read up front.
                    book_urls = await _collect_book_urls(page)

                    for idx, book_url in enumerate(book_urls, 1):
                        print(f"\n[{idx}/{len(book_urls)}] Processing book: {book_url}")
                        try:
                            outcome = await _download_book_pdf(page, book_url, idx)
                        except Exception as e:
                            print(f" ✗ Error processing book: {e}")
                            outcome = "error"
                        tally[outcome] += 1
                except Exception as e:
                    print(f"Error processing category {category_url}: {e}")
                    tally["error"] += 1
        finally:
            # Close the browser even if an unexpected error escapes the loops
            await browser.close()

        print(f"\n{'='*60}")
        print("SUMMARY")
        print(f"{'='*60}")
        print(f"Total downloaded: {tally['downloaded']}")
        print(f"Total skipped: {tally['skipped']}")
        print(f"Total errors: {tally['error']}")
        print(f"Download directory: {DOWNLOAD_DIR}")
# Script entry point: run the async scraper to completion when executed directly
if __name__ == "__main__":
    asyncio.run(scrape_books())