Merge pull request #19 from cuappdev/maw346/add-daily-sun

TushigBili · web-flow · commit 74e0070415e3 · 2025-09-30T17:19:21.000-04:00
implement articles from daily sun
diff --git a/.env_template b/.env_template
@@ -2,3 +2,4 @@ YOUTUBE_API_KEY=
 MONGO_URI=
 MONGO_DB=
 STAGE=
+DAILY_SUN_URL=
diff --git a/README.md b/README.md
@@ -22,4 +22,6 @@ To start the project, run the following command in the terminal
 
 ## Setting up the database
 
-Add /graphql to the url to access the interactive GraphQL platform
+Create a Mongo database named `score_db` and another named `daily_sun_db`. A partnership with the Daily Sun has given us access to their articles, which we copy and paginate for the frontend.
+
+Add /graphql to the url to access the interactive GraphQL platform
diff --git a/app.py b/app.py
@@ -5,6 +5,10 @@
 from flask_graphql import GraphQLView
 from graphene import Schema
 from src.schema import Query, Mutation
+from src.scrapers.games_scraper import fetch_game_schedule
+from src.scrapers.youtube_stats import fetch_videos
+from src.scrapers.daily_sun_scrape import fetch_news
+from src.services.article_service import ArticleService
 from src.utils.team_loader import TeamLoader
 import signal
 import sys
@@ -83,6 +87,22 @@ def create_context():
     ),
 )
 
+# Setup command line arguments
+def parse_args():
+    """
+    Parse the development flags understood by this module.
+
+    Flags:
+        --no-scrape: skip every scheduled scraping task.
+        --no-daily-sun: skip the Daily Sun tasks only.
+
+    Returns:
+        argparse.Namespace with the booleans `no_scrape` and `no_daily_sun`.
+    """
+    parser = argparse.ArgumentParser(description="Skip scraping tasks, for dev purposes.")
+    parser.add_argument(
+        "--no-scrape",
+        action="store_true",
+        help="Skips scraping tasks if set, useful for frontend development.",
+    )
+    parser.add_argument(
+        "--no-daily-sun",
+        action="store_true",
+        help="Skips using the Daily Sun page for alerts",
+    )
+    # This function is called at import time, so sys.argv may belong to a
+    # different launcher (a WSGI server, `flask run`, a test runner).
+    # parse_known_args() ignores arguments it does not recognise instead of
+    # terminating the process on them.
+    known_args, _unrecognised = parser.parse_known_args()
+    return known_args
+
+args = parse_args()
 
 def signal_handler(sig, frame):
     sys.exit(0)
@@ -91,5 +111,40 @@ def signal_handler(sig, frame):
 signal.signal(signal.SIGINT, signal_handler)
 signal.signal(signal.SIGTERM, signal_handler)
 
+# Background jobs are opt-out: --no-scrape turns all of them off, while
+# --no-daily-sun turns off only the Daily Sun jobs.
+if not args.no_scrape:
+    from flask_apscheduler import APScheduler
+    scheduler = APScheduler()
+    scheduler.init_app(app)
+    scheduler.start()
+
+    @scheduler.task("interval", id="scrape_schedules", seconds=12 * 60 * 60)
+    def scrape_schedules():
+        logging.info("Scraping game schedules...")
+        fetch_game_schedule()
+
+    @scheduler.task("interval", id="scrape_videos", seconds=12 * 60 * 60)
+    def scrape_videos():
+        logging.info("Scraping YouTube videos...")
+        fetch_videos()
+
+    # Run each job once right away rather than waiting for the first interval
+    scrape_schedules()
+    scrape_videos()
+
+    if not args.no_daily_sun:
+        # Hourly refresh of the Daily Sun articles
+        @scheduler.task("interval", id="scrape_daily_sun", seconds=60 * 60)
+        def scrape_daily_sun():
+            logging.info("Getting Daily Sun Sports News...")
+            fetch_news()
+
+        # Weekly removal of old articles
+        @scheduler.task("interval", id="cleanse_daily_sun_db", seconds=7 * 24 * 60 * 60)
+        def cleanse_daily_sun_db():
+            logging.info("Cleaning the Daily Sun database from old articles...")
+            ArticleService.cleanse_old_articles()
+
+        scrape_daily_sun()
+        cleanse_daily_sun_db()
+
+
 if __name__ == "__main__":
     app.run(debug=True, host="0.0.0.0", port=8000)
diff --git a/src/database.py b/src/database.py
@@ -48,6 +48,7 @@ def keep_connection_alive():
 
 # Access the database
 db = client[os.getenv("MONGO_DB", "score_db")]
+daily_sun_db = client[os.getenv("DAILY_SUN_DB", "daily_sun_db")]  # separate DB for Daily Sun data; name overridable via DAILY_SUN_DB
 
 
 def setup_database_indexes():
diff --git a/src/models/__init__.py b/src/models/__init__.py
@@ -1,3 +1,4 @@
 from .game import Game
 from .team import Team
-from .youtube_video import YoutubeVideo
+from .youtube_video import YoutubeVideo
+from .article import Article
diff --git a/src/models/article.py b/src/models/article.py
@@ -0,0 +1,56 @@
+from bson.objectid import ObjectId
+from datetime import datetime
+
+class Article:
+    """
+    In-memory representation of one news article.
+
+    Attributes:
+        - id: Identifier; a new ObjectId string is generated when none is given
+        - title: The title of the article
+        - image: The filename of the article's main image (optional)
+        - sports_type: The specific sport category
+        - published_at: The publication date
+        - url: The URL to the full article
+        - slug: Unique identifier from the source
+        - created_at: When the article was added to our DB (defaults to now)
+    """
+    # Attributes stored under a document key of the same name, in the order
+    # they are written to the document. `id` is handled apart because it is
+    # stored under "_id".
+    _DOCUMENT_FIELDS = ("title", "image", "sports_type", "published_at", "url", "slug", "created_at")
+
+    def __init__(self, title, sports_type, published_at, url, slug, image=None, id=None, created_at=None):
+        # A falsy id / created_at is replaced by a generated value
+        self.id = id or str(ObjectId())
+        self.title = title
+        self.image = image
+        self.sports_type = sports_type
+        self.published_at = published_at
+        self.url = url
+        self.slug = slug
+        self.created_at = created_at or datetime.now()
+
+    def to_dict(self):
+        """
+        Build the dictionary form of this article used for MongoDB storage.
+        """
+        document = {"_id": self.id}
+        for field in Article._DOCUMENT_FIELDS:
+            document[field] = getattr(self, field)
+        return document
+
+    @staticmethod
+    def from_dict(data):
+        """
+        Build an Article from a MongoDB document; missing keys become None.
+        """
+        field_values = {field: data.get(field) for field in Article._DOCUMENT_FIELDS}
+        return Article(id=data.get("_id"), **field_values)
diff --git a/src/mutations/__init__.py b/src/mutations/__init__.py
@@ -1,3 +1,4 @@
 from .create_game import CreateGame
 from .create_team import CreateTeam
-from .create_youtube_video import CreateYoutubeVideo
+from .create_youtube_video import CreateYoutubeVideo
+from .create_article import CreateArticle
diff --git a/src/mutations/create_article.py b/src/mutations/create_article.py
@@ -0,0 +1,27 @@
+from graphene import Mutation, String, Field
+from src.types import ArticleType
+from src.services.article_service import ArticleService
+
+# GraphQL mutation that creates one article through ArticleService.
+# (Kept as a comment, not a class docstring, so the schema is unchanged.)
+class CreateArticle(Mutation):
+    class Arguments:
+        title = String(required=True)
+        sports_type = String(required=True)
+        published_at = String(required=True)
+        url = String(required=True)
+        slug = String(required=True)
+        image = String(required=False)
+
+    article = Field(lambda: ArticleType)
+
+    def mutate(self, info, title, sports_type, published_at, url, slug, image=None):
+        """
+        Create an article from the mutation arguments.
+
+        published_at arrives as a string and is parsed with
+        datetime.fromisoformat before being handed to ArticleService.
+        """
+        from datetime import datetime
+
+        parsed_published_at = datetime.fromisoformat(published_at)
+        created = ArticleService.create_article(
+            dict(
+                title=title,
+                sports_type=sports_type,
+                published_at=parsed_published_at,
+                url=url,
+                slug=slug,
+                image=image,
+            )
+        )
+        return CreateArticle(article=created)
diff --git a/src/queries/__init__.py b/src/queries/__init__.py
@@ -1,3 +1,4 @@
 from .game_query import GameQuery
 from .team_query import TeamQuery
 from .youtube_video_query import YoutubeVideoQuery
+from .article_query import ArticleQuery
diff --git a/src/queries/article_query.py b/src/queries/article_query.py
@@ -0,0 +1,12 @@
+from graphene import ObjectType, List, String
+from src.services.article_service import ArticleService
+from src.types import ArticleType
+
+# GraphQL query fields for news articles.
+# (Kept as a comment, not a class docstring, so the schema is unchanged.)
+class ArticleQuery(ObjectType):
+    articles = List(ArticleType, sports_type=String())
+
+    def resolve_articles(self, info, sports_type=None):
+        """
+        Resolve the `articles` field.
+
+        sports_type is forwarded unchanged to ArticleService.get_articles;
+        it is None when the client does not supply the argument.
+        """
+        found_articles = ArticleService.get_articles(sports_type)
+        return found_articles
diff --git a/src/repositories/__init__.py b/src/repositories/__init__.py
@@ -1,3 +1,4 @@
 from .game_repository import GameRepository
 from .team_repository import TeamRepository
 from .youtube_video_repository import YoutubeVideoRepository
+from .article_repository import ArticleRepository
diff --git a/src/repositories/article_repository.py b/src/repositories/article_repository.py
@@ -0,0 +1,79 @@
+from src.database import daily_sun_db
+from src.models.article import Article
+from pymongo import UpdateOne
+from datetime import datetime, timedelta
+
+class ArticleRepository:
+    """
+    Read/write access to the 'news_articles' collection of the Daily Sun
+    database. Stored articles are matched by their slug.
+    """
+
+    @staticmethod
+    def _collection():
+        """Return the 'news_articles' collection."""
+        return daily_sun_db["news_articles"]
+
+    @staticmethod
+    def _cutoff(limit_days):
+        """Return the datetime `limit_days` days before now."""
+        return datetime.now() - timedelta(days=limit_days)
+
+    @staticmethod
+    def _upsert_spec(article):
+        """
+        Build the (filter, update) pair that upserts `article` by slug.
+
+        created_at is written with $setOnInsert so that upserting an article
+        that is already stored keeps the time it was first added; every other
+        field is refreshed with $set.
+        """
+        fields = article.to_dict()
+        # Remove _id from the update to avoid MongoDB error
+        fields.pop("_id", None)
+        # A field may not appear in both $set and $setOnInsert
+        created_at = fields.pop("created_at", None)
+
+        update = {"$set": fields}
+        if created_at is not None:
+            update["$setOnInsert"] = {"created_at": created_at}
+        return {"slug": article.slug}, update
+
+    @staticmethod
+    def upsert(article):
+        """
+        Upsert an article into the 'news_articles' collection in MongoDB.
+        """
+        query, update = ArticleRepository._upsert_spec(article)
+        ArticleRepository._collection().update_one(query, update, upsert=True)
+
+    @staticmethod
+    def bulk_upsert(articles):
+        """
+        Bulk upsert articles into the 'news_articles' collection based on slug.
+        Does nothing when there are no articles.
+        """
+        if not articles:
+            return
+
+        operations = [
+            UpdateOne(*ArticleRepository._upsert_spec(article), upsert=True)
+            for article in articles
+        ]
+        # bulk_write rejects an empty operation list (possible when `articles`
+        # is a non-list iterable that yields nothing)
+        if operations:
+            ArticleRepository._collection().bulk_write(operations)
+
+    @staticmethod
+    def find_recent(limit_days=3):
+        """
+        Retrieve articles from the last N days, sorted by published_at descending.
+        """
+        query = {"published_at": {"$gte": ArticleRepository._cutoff(limit_days)}}
+        cursor = ArticleRepository._collection().find(query).sort("published_at", -1)
+        return [Article.from_dict(document) for document in cursor]
+
+    @staticmethod
+    def find_by_sports_type(sports_type, limit_days=3):
+        """
+        Retrieve articles by sports_type from the last N days, sorted by published_at descending.
+        """
+        query = {
+            "sports_type": sports_type,
+            "published_at": {"$gte": ArticleRepository._cutoff(limit_days)},
+        }
+        cursor = ArticleRepository._collection().find(query).sort("published_at", -1)
+        return [Article.from_dict(document) for document in cursor]
+
+    @staticmethod
+    def delete_not_recent(limit_days=3):
+        """
+        Delete articles whose published_at is older than N days.
+        """
+        query = {"published_at": {"$lt": ArticleRepository._cutoff(limit_days)}}
+        ArticleRepository._collection().delete_many(query)
diff --git a/src/schema.py b/src/schema.py
@@ -1,16 +1,17 @@
 from graphene import ObjectType, Schema, Mutation
-from src.mutations import CreateGame, CreateTeam, CreateYoutubeVideo
-from src.queries import GameQuery, TeamQuery, YoutubeVideoQuery
+from src.mutations import CreateGame, CreateTeam, CreateYoutubeVideo, CreateArticle
+from src.queries import GameQuery, TeamQuery, YoutubeVideoQuery, ArticleQuery
 
 
-class Query(TeamQuery, GameQuery, YoutubeVideoQuery, ObjectType):
+class Query(TeamQuery, GameQuery, YoutubeVideoQuery, ArticleQuery, ObjectType):
     pass
 
 
 class Mutation(ObjectType):
     create_game = CreateGame.Field(description="Creates a new game.")
     create_team = CreateTeam.Field(description="Creates a new team.")
     create_youtube_video = CreateYoutubeVideo.Field(description="Creates a new youtube video.")
+    create_article = CreateArticle.Field(description="Creates a new article.")
 
 
 schema = Schema(query=Query, mutation=Mutation)
diff --git a/src/scrapers/daily_sun_scrape.py b/src/scrapers/daily_sun_scrape.py
@@ -0,0 +1,59 @@
+import os
+import requests
+from datetime import datetime, timedelta
+from dotenv import load_dotenv
+from ..services import ArticleService
+import logging
+
+load_dotenv()
+
+
+def _build_article_doc(article, published_at):
+    """Map one article from the Daily Sun response to the dict handed to ArticleService."""
+    # The first tag that is not a generic section name is used as the sport
+    sports_type = next(
+        (tag["name"] for tag in (article.get("tags") or []) if tag["name"] not in ["Sports", "Top Stories"]),
+        "General"
+    )
+    # dominantMedia may be missing or empty; the image is optional
+    dominant_media = article.get("dominantMedia") or {}
+    return {
+        "title": article["headline"],
+        "image": dominant_media.get("title"),
+        "sports_type": sports_type,
+        "published_at": published_at,
+        "url": f"https://cornellsun.com/article/{article['slug']}",
+        "slug": article["slug"],
+        "created_at": datetime.now()
+    }
+
+
+def fetch_news(timeout=30):
+    """
+    Fetch the Daily Sun response and store the articles published within the
+    last 3 days through ArticleService.create_articles_bulk.
+
+    Args:
+        timeout: Seconds to wait for the Daily Sun server before giving up.
+            Without a timeout a stalled connection would block the caller
+            indefinitely.
+
+    Returns:
+        True when the response was processed (even if no article was recent
+        enough), False when an error occurred. Errors are logged, not raised.
+    """
+    url = os.getenv("DAILY_SUN_URL")
+    if not url:
+        logging.error("Error fetching news: DAILY_SUN_URL is not set")
+        return False
+
+    try:
+        response = requests.get(
+            url,
+            headers={
+                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+            },
+            timeout=timeout
+        )
+        response.raise_for_status()
+        data = response.json()
+
+        # NOTE(review): both sides of the comparison below are naive datetimes;
+        # this assumes the published_at values use the server's local time - TODO confirm
+        three_days_ago = datetime.now() - timedelta(days=3)
+
+        # Process articles
+        articles_to_store = []
+        for article in data.get("articles", []):
+            published_at = datetime.strptime(article["published_at"], "%Y-%m-%d %H:%M:%S")
+            if published_at < three_days_ago:
+                continue
+            articles_to_store.append(_build_article_doc(article, published_at))
+
+        if articles_to_store:
+            ArticleService.create_articles_bulk(articles_to_store)
+            logging.info(f"Stored/Updated {len(articles_to_store)} recent articles")
+        else:
+            logging.info("No recent articles to store")
+        return True
+
+    except Exception as e:
+        # Runs inside a scheduled job: report the failure and let the next run retry
+        logging.error(f"Error fetching news: {str(e)}")
+        return False
diff --git a/src/services/__init__.py b/src/services/__init__.py
@@ -1,3 +1,4 @@
 from .game_service import GameService
 from .team_service import TeamService
-from .youtube_video_service import YoutubeVideoService
+from .youtube_video_service import YoutubeVideoService
+from .article_service import ArticleService
diff --git a/src/services/article_service.py b/src/services/article_service.py
diff --git a/src/types.py b/src/types.py