Skip to content

Commit 920acf1

Browse files
Merge pull request #1 from PoCInnovation/scrape/medium
Add HTML and Medium scrapers for article extraction
2 parents 1210135 + 62aca8c commit 920acf1

2 files changed

Lines changed: 372 additions & 0 deletions

File tree

html_scrapper.py

Lines changed: 258 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,258 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
from typing import Dict, Optional, List
4+
from datetime import datetime
5+
import time
6+
7+
8+
class HTMLScraper:
    """Generic HTML scraper for extracting full article content.

    Pages are downloaded through a shared ``requests.Session`` and parsed with
    BeautifulSoup's built-in ``html.parser``.  Download and parse failures are
    reported on stdout and signalled to the caller by returning ``None``
    instead of raising.
    """

    # Sentinel returned by ``_extract_full_content`` when no text was found.
    CONTENT_NOT_FOUND = "Content not found"

    # Tags whose text is collected as article content.
    _CONTENT_TAGS = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'blockquote']

    # Normalized fragments must be longer than this to be kept; shorter ones
    # are dropped as likely captions, buttons or navigation labels.
    _MIN_FRAGMENT_LENGTH = 20

    def __init__(self, user_agent: Optional[str] = None):
        """Initialize the scraper with optional custom user agent.

        Args:
            user_agent: Value for the ``User-Agent`` header; a desktop Chrome
                string is used when omitted or empty.
        """
        self.user_agent = user_agent or (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': self.user_agent})

    def fetch_html(self, url: str, timeout: int = 30) -> Optional[str]:
        """Fetch HTML content from URL.

        Args:
            url: Address of the page to download.
            timeout: Timeout in seconds passed to ``requests``.

        Returns:
            The response body as text, or ``None`` when the request fails or
            the server answers with an HTTP error status.
        """
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"❌ Error fetching {url}: {e}")
            return None
        return response.text

    def scrape_medium_article(self, url: str) -> Optional[Dict]:
        """Scrape a Medium article and extract all content.

        Args:
            url: Address of the article.

        Returns:
            A dict with the keys ``url``, ``title``, ``author``,
            ``publish_date``, ``content``, ``top_image``, ``tags``,
            ``scraped_at`` (local ISO timestamp) and ``word_count``, or
            ``None`` when the page could not be fetched or parsed.
        """
        html = self.fetch_html(url)
        if not html:
            return None

        try:
            soup = BeautifulSoup(html, 'html.parser')
            content = self._extract_full_content(soup)
            return {
                'url': url,
                'title': self._extract_title(soup),
                'author': self._extract_author(soup),
                'publish_date': self._extract_publish_date(soup),
                'content': content,
                'top_image': self._extract_top_image(soup),
                'tags': self._extract_tags(soup),
                'scraped_at': datetime.now().isoformat(),
                'word_count': self._count_words(content),
            }
        except Exception as e:
            # Boundary handler: one malformed page must not abort a batch.
            print(f"❌ Error parsing {url}: {e}")
            return None

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Collapse every run of whitespace in *text* to a single space."""
        return ' '.join(text.split())

    @classmethod
    def _count_words(cls, content: Optional[str]) -> int:
        """Count whitespace-separated words, treating the sentinel as empty."""
        if not content or content == cls.CONTENT_NOT_FOUND:
            return 0
        return len(content.split())

    @staticmethod
    def _split_keywords(raw: Optional[str]) -> List[str]:
        """Split a comma-separated keyword string into unique, non-empty tags.

        The order of first appearance is preserved.
        """
        if not raw:
            return []
        stripped = (keyword.strip() for keyword in raw.split(','))
        return list(dict.fromkeys(keyword for keyword in stripped if keyword))

    def _first_match(self, soup: BeautifulSoup, selectors: List[str],
                     attr: Optional[str] = None) -> Optional[str]:
        """Return the first non-empty value produced by *selectors*, in order.

        ``meta`` selectors yield the element's ``content`` attribute.  Other
        selectors yield attribute *attr* when given, otherwise the element's
        whitespace-normalized text.  A selector whose element is missing or
        whose value is empty is skipped so later selectors still get a chance.
        """
        for selector in selectors:
            element = soup.select_one(selector)
            if element is None:
                continue
            if selector.startswith('meta'):
                value = element.get('content')
            elif attr is not None:
                value = element.get(attr)
            else:
                # get_text(strip=True) would join the text of inline children
                # without any separator, gluing words together; take the raw
                # text and normalize the whitespace instead.
                value = self._normalize_text(element.get_text())
            if value:
                return value
        return None

    def _collect_fragments(self, containers, tag_names: List[str]) -> List[str]:
        """Gather unique text fragments from *tag_names* inside *containers*.

        Fragments are whitespace-normalized, kept only when longer than
        ``_MIN_FRAGMENT_LENGTH`` characters, and returned in document order.
        """
        fragments: List[str] = []
        seen = set()
        for container in containers:
            for element in container.find_all(tag_names):
                text = self._normalize_text(element.get_text())
                if len(text) > self._MIN_FRAGMENT_LENGTH and text not in seen:
                    seen.add(text)
                    fragments.append(text)
        return fragments

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract article title.

        Selectors run from most to least specific so that a site-wide ``<h1>``
        cannot shadow the article's own heading.

        Returns:
            The title, or ``"Title not found"``.
        """
        selectors = [
            '[data-testid="storyTitle"]',
            'article h1',
            'h1',
            'meta[property="og:title"]',
        ]
        return self._first_match(soup, selectors) or "Title not found"

    def _extract_author(self, soup: BeautifulSoup) -> str:
        """Extract article author.

        Returns:
            The author name, or ``"Author not found"``.
        """
        selectors = [
            'meta[name="author"]',
            'meta[property="article:author"]',
            'a[rel="author"]',
            '[data-testid="authorName"]',
        ]
        return self._first_match(soup, selectors) or "Author not found"

    def _extract_publish_date(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract publication date.

        Returns:
            The raw date string found in the page metadata or in the first
            ``<time datetime=...>`` element, or ``None`` when none is present.
        """
        selectors = [
            'meta[property="article:published_time"]',
            'meta[name="publish_date"]',
            'time[datetime]',
        ]
        return self._first_match(soup, selectors, attr='datetime')

    def _extract_full_content(self, soup: BeautifulSoup) -> str:
        """Extract all article content using multiple approaches.

        Three sources are tried in order, each one only when the previous one
        produced no usable fragment: the first ``<article>`` element, then any
        ``<div>``/``<section>`` whose class mentions content/article/post/story,
        then every ``<p>`` in the document.

        Returns:
            The fragments joined by blank lines, or ``CONTENT_NOT_FOUND``.
        """
        fragments: List[str] = []

        article = soup.find('article')
        if article is not None:
            fragments = self._collect_fragments([article], self._CONTENT_TAGS + ['pre'])

        if not fragments:
            containers = soup.find_all(['div', 'section'], class_=lambda x: x and any(
                keyword in str(x).lower() for keyword in ['content', 'article', 'post', 'story']
            ))
            fragments = self._collect_fragments(containers, self._CONTENT_TAGS)

        if not fragments:
            fragments = self._collect_fragments([soup], ['p'])

        return '\n\n'.join(fragments) if fragments else self.CONTENT_NOT_FOUND

    def _extract_top_image(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract main article image.

        Returns:
            The image URL from the Open Graph or Twitter metadata, else the
            ``src`` of the first image inside ``<article>``, else ``None``.
        """
        selectors = [
            'meta[property="og:image"]',
            'meta[name="twitter:image"]',
            'article img',
        ]
        return self._first_match(soup, selectors, attr='src')

    def _extract_tags(self, soup: BeautifulSoup) -> List[str]:
        """Extract article tags and categories.

        Tags come from the ``keywords`` meta element followed by the text of
        every link whose ``href`` contains ``/tag/``.

        Returns:
            Unique, non-empty tags in order of first appearance.
        """
        tags: List[str] = []

        meta_keywords = soup.select_one('meta[name="keywords"]')
        if meta_keywords is not None:
            tags.extend(self._split_keywords(meta_keywords.get('content')))

        for link in soup.find_all('a', href=lambda x: x and '/tag/' in str(x)):
            tag_text = self._normalize_text(link.get_text())
            if tag_text and tag_text not in tags:
                tags.append(tag_text)

        return tags

    def scrape_multiple_articles(self, urls: List[str], delay: int = 2) -> List[Dict]:
        """Scrape multiple articles with delay between requests.

        Args:
            urls: Article addresses, scraped in the given order.
            delay: Seconds to sleep between two consecutive requests.

        Returns:
            The successfully scraped articles; failed URLs are skipped.
        """
        articles = []
        total = len(urls)

        for i, url in enumerate(urls, 1):
            print(f"\n[{i}/{total}] Scraping: {url}")

            article = self.scrape_medium_article(url)
            if article:
                articles.append(article)
                print(f"✅ Article scraped: {article['title']}")
                print(f"   Words: {article['word_count']}")

            # No need to wait once the last URL has been processed.
            if i < total:
                time.sleep(delay)

        print(f"\n✨ Total: {len(articles)} articles scraped")
        return articles

    def save_to_file(self, articles: List[Dict], filename: str = "scraped_articles.txt"):
        """Save articles to text file.

        Args:
            articles: Dicts as returned by ``scrape_medium_article``.
            filename: Destination path; an existing file is overwritten.
        """
        with open(filename, 'w', encoding='utf-8') as f:
            for article in articles:
                f.write(f"\n{'='*80}\n")
                f.write(f"TITLE: {article['title']}\n")
                f.write(f"AUTHOR: {article['author']}\n")
                f.write(f"DATE: {article['publish_date']}\n")
                f.write(f"URL: {article['url']}\n")
                f.write(f"TAGS: {', '.join(article['tags'])}\n")
                f.write(f"WORD COUNT: {article['word_count']}\n")
                f.write(f"\n{'-'*80}\n")
                f.write(f"CONTENT:\n\n{article['content']}\n")

        print(f"\n💾 Articles saved to {filename}")
236+
237+
238+
if __name__ == "__main__":
    # Manual smoke test: scrape one known article, save it, and show a preview.
    html_scraper = HTMLScraper()
    sample_urls = [
        "https://medium.com/@satvik.jain.kht/bert-and-its-tokenization-explained-intuitively-a986f952c491"
    ]
    scraped = html_scraper.scrape_multiple_articles(sample_urls, delay=3)

    # A non-empty result list drives both the save and the preview.
    if scraped:
        html_scraper.save_to_file(scraped, "medium_full_articles.txt")

        divider = "=" * 60
        preview = scraped[0]
        print("\n" + divider)
        print("FIRST ARTICLE PREVIEW:")
        print(divider)
        print(f"Title: {preview['title']}")
        print(f"Author: {preview['author']}")
        print(f"Words: {preview['word_count']}")
        print(f"\nContent (excerpt):\n{preview['content'][:300]}...")

medium_scraping.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import feedparser
2+
from html_scrapper import HTMLScraper
3+
from typing import List, Dict, Optional
4+
import time
5+
6+
7+
class MediumScraper:
    """Scraper for Medium AI articles.

    Article links are discovered through the tag RSS feeds listed in
    ``RSS_FEEDS``; the full text of each article is then scraped with
    ``HTMLScraper``.
    """

    RSS_FEEDS = [
        "https://medium.com/feed/tag/artificial-intelligence",
        "https://medium.com/feed/tag/machine-learning",
        "https://medium.com/feed/tag/deep-learning",
        "https://medium.com/feed/tag/ai",
    ]

    def __init__(self):
        """Create the scraper with an empty result list and an HTML scraper."""
        self.articles_data = []
        self.html_scraper = HTMLScraper()

    @staticmethod
    def _unique_links(entries: List[Dict], limit: int) -> List[str]:
        """Return up to *limit* distinct, non-empty ``link`` values.

        Feed order is preserved so the selection is the same on every run
        (going through a ``set`` would make it arbitrary), and entries whose
        link is empty are dropped because they cannot be scraped.
        """
        links = (entry['link'] for entry in entries if entry['link'])
        return list(dict.fromkeys(links))[:limit]

    def fetch_rss_feeds(self, max_articles_per_feed: int = 10) -> List[Dict]:
        """Fetch articles from Medium RSS feeds.

        Args:
            max_articles_per_feed: Maximum number of entries taken per feed.

        Returns:
            One dict per entry with the keys ``title``, ``link``,
            ``published``, ``summary``, ``author`` and ``tags``.  Feeds that
            fail are reported on stdout and skipped.
        """
        all_entries = []
        feed_count = len(self.RSS_FEEDS)

        for index, feed_url in enumerate(self.RSS_FEEDS, 1):
            print(f"📡 Fetching RSS: {feed_url}")
            try:
                feed = feedparser.parse(feed_url)
                entries = feed.entries[:max_articles_per_feed]
                if feed.get('bozo') and not entries:
                    # feedparser signals download/parse failures through the
                    # bozo flag rather than by raising an exception.
                    reason = feed.get('bozo_exception', 'unknown error')
                    print(f"❌ Error fetching {feed_url}: {reason}")
                else:
                    for entry in entries:
                        all_entries.append({
                            'title': entry.get('title', 'N/A'),
                            'link': entry.get('link', ''),
                            'published': entry.get('published', 'N/A'),
                            'summary': entry.get('summary', 'N/A'),
                            'author': entry.get('author', 'N/A'),
                            # A tag without a term is skipped instead of
                            # aborting the whole feed.
                            'tags': [
                                tag.get('term')
                                for tag in entry.get('tags', [])
                                if tag.get('term')
                            ],
                        })
                    print(f"✅ {len(entries)} articles fetched")
            except Exception as e:
                # Boundary handler: one broken feed must not stop the others.
                print(f"❌ Error fetching {feed_url}: {e}")

            # Pause between feeds only; nothing follows the last one.
            if index < feed_count:
                time.sleep(1)

        print(f"\n📊 Total: {len(all_entries)} articles from all feeds")
        return all_entries

    def scrape_article_content(self, url: str) -> Optional[Dict]:
        """Scrape full HTML content of a Medium article.

        Returns:
            Whatever ``HTMLScraper.scrape_medium_article`` returns for *url*.
        """
        return self.html_scraper.scrape_medium_article(url)

    def scrape_articles_from_rss(self, max_articles: int = 20, delay: int = 2) -> List[Dict]:
        """Fetch RSS feeds and scrape full content of each article.

        Args:
            max_articles: Maximum number of entries requested per feed and
                maximum number of distinct articles scraped overall.
            delay: Seconds to sleep between two consecutive article requests.

        Returns:
            The successfully scraped articles, also stored in
            ``self.articles_data``.
        """
        print("=" * 60)
        print("STEP 1: Fetching RSS feeds")
        print("=" * 60)
        rss_entries = self.fetch_rss_feeds(max_articles_per_feed=max_articles)

        unique_urls = self._unique_links(rss_entries, max_articles)

        print("\n" + "=" * 60)
        print(f"STEP 2: Scraping {len(unique_urls)} articles")
        print("=" * 60)

        scraped_articles = []
        for i, url in enumerate(unique_urls, 1):
            print(f"\n[{i}/{len(unique_urls)}] Scraping: {url}")
            content = self.scrape_article_content(url)
            if content:
                scraped_articles.append(content)
                print(f"✅ Article scraped: {content['title']}")
            if i < len(unique_urls):
                time.sleep(delay)

        self.articles_data = scraped_articles

        print("\n" + "=" * 60)
        print(f"✨ Scraping done: {len(scraped_articles)} articles")
        print("=" * 60)

        return scraped_articles

    def save_to_file(self, filename: str = "medium_articles.txt"):
        """Save articles to text file.

        Writes every article in ``self.articles_data`` to *filename*,
        overwriting an existing file.
        """
        with open(filename, 'w', encoding='utf-8') as f:
            for article in self.articles_data:
                f.write(f"\n{'='*80}\n")
                f.write(f"TITLE: {article['title']}\n")
                f.write(f"AUTHOR: {article['author']}\n")
                f.write(f"DATE: {article['publish_date']}\n")
                f.write(f"URL: {article['url']}\n")
                f.write(f"TAGS: {', '.join(article['tags'])}\n")
                f.write(f"WORD COUNT: {article['word_count']}\n")
                f.write(f"\n{article['content']}\n")

        print(f"💾 Articles saved to {filename}")
96+
97+
98+
if __name__ == "__main__":
    # Manual smoke test: scrape two articles from the feeds, save and preview.
    medium = MediumScraper()
    results = medium.scrape_articles_from_rss(max_articles=2, delay=5)

    # A non-empty result list drives both the save and the preview.
    if results:
        medium.save_to_file("medium_ai_articles.txt")

        divider = "=" * 60
        preview = results[0]
        print("\n" + divider)
        print("FIRST ARTICLE PREVIEW:")
        print(divider)
        print(f"Title: {preview['title']}")
        print(f"Author: {preview['author']}")
        print(f"Date: {preview['publish_date']}")
        print(f"Words: {preview['word_count']}")
        print(f"Content (excerpt): {preview['content'][:300]}...")

0 commit comments

Comments
 (0)