Skip to content

Commit f15c721

Browse files
authored
Merge pull request #42 from cuappdev/claire/dailySunImages
Added Daily Sun Images
2 parents 8ff59fd + 580a3f3 commit f15c721

2 files changed

Lines changed: 21 additions & 2 deletions

File tree

src/scrapers/daily_sun_scrape.py

Lines changed: 20 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,10 @@
33
from datetime import datetime, timedelta
44
from dotenv import load_dotenv
55
from ..services import ArticleService
6+
from ..utils.constants import ARTICLE_IMG_TAG
67
import logging
8+
from bs4 import BeautifulSoup
9+
import base64
710

811
load_dotenv()
912

@@ -36,16 +39,32 @@ def fetch_news():
3639
)
3740
article_url = f"https://cornellsun.com/article/{article['slug']}"
3841

42+
article_image = None
43+
try:
44+
response = requests.get(
45+
article_url,
46+
headers={
47+
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
48+
}
49+
)
50+
response.raise_for_status()
51+
soup = BeautifulSoup(response.content, 'html.parser')
52+
img_tag = soup.select_one(ARTICLE_IMG_TAG)
53+
if img_tag and img_tag.get('src'):
54+
article_image=img_tag.get('src')
55+
except Exception as e:
56+
logging.error(f"Error fetching news: {str(e)}")
3957
article_doc = {
4058
"title": article["headline"],
41-
"image": article["dominantMedia"]["title"] if article["dominantMedia"] else None,
59+
"image": article_image,
4260
"sports_type": sports_type,
4361
"published_at": published_at,
4462
"url": article_url,
4563
"slug": article["slug"],
4664
"created_at": datetime.now()
4765
}
4866
articles_to_store.append(article_doc)
67+
4968

5069
if articles_to_store:
5170
ArticleService.create_articles_bulk(articles_to_store)

src/utils/constants.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -130,4 +130,4 @@
130130
# The maximum number of videos to retrieve
131131
VIDEO_LIMIT = 20
132132

133-
133+
# CSS selector for <img> elements inside ".dom-art-container"; the Daily Sun
# scraper passes it to BeautifulSoup's select_one to find an article's image
ARTICLE_IMG_TAG = ".dom-art-container img"

0 commit comments

Comments
 (0)