diff --git a/README.md b/README.md index 338a864..86c8abc 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,4 @@ -> If you see this section, you've just created a repository using [PoC Innovation's Open-Source project template](https://github.com/PoCInnovation/open-source-project-template). Check the [getting started guide](./.github/getting-started.md). - -# [PROJECT'S NAME] +# EarlyTech [Project's description] diff --git a/html_scrapper.py b/html_scrapper.py deleted file mode 100644 index 6a061c0..0000000 --- a/html_scrapper.py +++ /dev/null @@ -1,258 +0,0 @@ -import requests -from bs4 import BeautifulSoup -from typing import Dict, Optional, List -from datetime import datetime -import time - - -class HTMLScraper: - """Generic HTML scraper for extracting full article content.""" - - def __init__(self, user_agent: Optional[str] = None): - """Initialize the scraper with optional custom user agent.""" - self.user_agent = user_agent or ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/120.0.0.0 Safari/537.36" - ) - self.session = requests.Session() - self.session.headers.update({'User-Agent': self.user_agent}) - - def fetch_html(self, url: str, timeout: int = 30) -> Optional[str]: - """Fetch HTML content from URL.""" - try: - response = self.session.get(url, timeout=timeout) - response.raise_for_status() - return response.text - except Exception as e: - print(f"❌ Error fetching {url}: {e}") - return None - - def scrape_medium_article(self, url: str) -> Optional[Dict]: - """Scrape a Medium article and extract all content.""" - html = self.fetch_html(url) - if not html: - return None - - try: - soup = BeautifulSoup(html, 'html.parser') - - title = self._extract_title(soup) - author = self._extract_author(soup) - publish_date = self._extract_publish_date(soup) - content = self._extract_full_content(soup) - top_image = self._extract_top_image(soup) - tags = self._extract_tags(soup) - - return { - 'url': url, - 'title': title, - 'author': author, - 'publish_date': publish_date, - 'content': content, - 'top_image': top_image, - 'tags': tags, - 'scraped_at': datetime.now().isoformat(), - 'word_count': len(content.split()) if content else 0 - } - - except Exception as e: - print(f"❌ Error parsing {url}: {e}") - return None - - def _extract_title(self, soup: BeautifulSoup) -> str: - """Extract article title.""" - selectors = [ - 'h1', - 'article h1', - '[data-testid="storyTitle"]', - 'meta[property="og:title"]', - ] - - for selector in selectors: - if selector.startswith('meta'): - element = soup.select_one(selector) - if element and element.get('content'): - return element.get('content') - else: - element = soup.select_one(selector) - if element: - return element.get_text(strip=True) - - return "Title not found" - - def _extract_author(self, soup: BeautifulSoup) -> str: - """Extract article author.""" - selectors = [ - 'meta[name="author"]', - 'meta[property="article:author"]', - 'a[rel="author"]', - '[data-testid="authorName"]', - ] - - for selector in selectors: - if selector.startswith('meta'): - element = soup.select_one(selector) - if element and element.get('content'): - return element.get('content') - else: - element = soup.select_one(selector) - if element: - return element.get_text(strip=True) - - return "Author not found" - - def _extract_publish_date(self, soup: BeautifulSoup) -> Optional[str]: - """Extract publication date.""" - selectors = [ - 'meta[property="article:published_time"]', - 'meta[name="publish_date"]', - 'time[datetime]', 
- ] - - for selector in selectors: - element = soup.select_one(selector) - if element: - if selector.startswith('meta'): - return element.get('content') - elif selector == 'time[datetime]': - return element.get('datetime') - - return None - - def _extract_full_content(self, soup: BeautifulSoup) -> str: - """Extract all article content using multiple approaches.""" - content_parts = [] - - article = soup.find('article') - if article: - paragraphs = article.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'blockquote', 'pre']) - for para in paragraphs: - text = para.get_text(strip=True) - if text and len(text) > 10: - content_parts.append(text) - - if not content_parts: - content_divs = soup.find_all(['div', 'section'], class_=lambda x: x and any( - keyword in str(x).lower() for keyword in ['content', 'article', 'post', 'story'] - )) - - for div in content_divs: - paragraphs = div.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'blockquote']) - for para in paragraphs: - text = para.get_text(strip=True) - if text and len(text) > 10: - content_parts.append(text) - - if not content_parts: - all_paragraphs = soup.find_all('p') - for para in all_paragraphs: - text = para.get_text(strip=True) - if text and len(text) > 20: - content_parts.append(text) - - cleaned_parts = [] - seen = set() - for part in content_parts: - cleaned = ' '.join(part.split()) - if cleaned and cleaned not in seen and len(cleaned) > 20: - cleaned_parts.append(cleaned) - seen.add(cleaned) - - full_content = '\n\n'.join(cleaned_parts) - return full_content if full_content else "Content not found" - - def _extract_top_image(self, soup: BeautifulSoup) -> Optional[str]: - """Extract main article image.""" - selectors = [ - 'meta[property="og:image"]', - 'meta[name="twitter:image"]', - 'article img', - ] - - for selector in selectors: - if selector.startswith('meta'): - element = soup.select_one(selector) - if element and element.get('content'): - return element.get('content') - else: - element = soup.select_one(selector) - if element and element.get('src'): - return element.get('src') - - return None - - def _extract_tags(self, soup: BeautifulSoup) -> List[str]: - """Extract article tags and categories.""" - tags = [] - - meta_keywords = soup.select_one('meta[name="keywords"]') - if meta_keywords and meta_keywords.get('content'): - tags.extend([tag.strip() for tag in meta_keywords.get('content').split(',')]) - - tag_links = soup.find_all('a', href=lambda x: x and '/tag/' in str(x)) - for link in tag_links: - tag_text = link.get_text(strip=True) - if tag_text and tag_text not in tags: - tags.append(tag_text) - - return tags - - def scrape_multiple_articles(self, urls: List[str], delay: int = 2) -> List[Dict]: - """Scrape multiple articles with delay between requests.""" - articles = [] - total = len(urls) - - for i, url in enumerate(urls, 1): - print(f"\n[{i}/{total}] Scraping: {url}") - - article = self.scrape_medium_article(url) - if article: - articles.append(article) - print(f"✅ Article scraped: {article['title']}") - print(f" Words: {article['word_count']}") - - if i < total: - time.sleep(delay) - - print(f"\n✨ Total: {len(articles)} articles scraped") - return articles - - def save_to_file(self, articles: List[Dict], filename: str = "scraped_articles.txt"): - """Save articles to text file.""" - with open(filename, 'w', encoding='utf-8') as f: - for article in articles: - f.write(f"\n{'='*80}\n") - f.write(f"TITLE: {article['title']}\n") - f.write(f"AUTHOR: {article['author']}\n") - f.write(f"DATE: 
{article['publish_date']}\n") - f.write(f"URL: {article['url']}\n") - f.write(f"TAGS: {', '.join(article['tags'])}\n") - f.write(f"WORD COUNT: {article['word_count']}\n") - f.write(f"\n{'-'*80}\n") - f.write(f"CONTENT:\n\n{article['content']}\n") - - print(f"\n💾 Articles saved to {filename}") - - -if __name__ == "__main__": - scraper = HTMLScraper() - - test_urls = [ - "https://medium.com/@satvik.jain.kht/bert-and-its-tokenization-explained-intuitively-a986f952c491" - ] - - articles = scraper.scrape_multiple_articles(test_urls, delay=3) - - if articles: - scraper.save_to_file(articles, "medium_full_articles.txt") - - if len(articles) > 0: - print("\n" + "="*60) - print("FIRST ARTICLE PREVIEW:") - print("="*60) - first = articles[0] - print(f"Title: {first['title']}") - print(f"Author: {first['author']}") - print(f"Words: {first['word_count']}") - print(f"\nContent (excerpt):\n{first['content'][:300]}...") diff --git a/medium_scraping.py b/medium_scraping.py deleted file mode 100644 index 971ffda..0000000 --- a/medium_scraping.py +++ /dev/null @@ -1,114 +0,0 @@ -import feedparser -from html_scrapper import HTMLScraper -from typing import List, Dict, Optional -import time - - -class MediumScraper: - """Scraper for Medium AI articles.""" - - RSS_FEEDS = [ - "https://medium.com/feed/tag/artificial-intelligence", - "https://medium.com/feed/tag/machine-learning", - "https://medium.com/feed/tag/deep-learning", - "https://medium.com/feed/tag/ai", - ] - - def __init__(self): - self.articles_data = [] - self.html_scraper = HTMLScraper() - - def fetch_rss_feeds(self, max_articles_per_feed: int = 10) -> List[Dict]: - """Fetch articles from Medium RSS feeds.""" - all_entries = [] - - for feed_url in self.RSS_FEEDS: - print(f"📡 Fetching RSS: {feed_url}") - try: - feed = feedparser.parse(feed_url) - for entry in feed.entries[:max_articles_per_feed]: - article_info = { - 'title': entry.get('title', 'N/A'), - 'link': entry.get('link', ''), - 'published': entry.get('published', 'N/A'), - 'summary': entry.get('summary', 'N/A'), - 'author': entry.get('author', 'N/A'), - 'tags': [tag.term for tag in entry.get('tags', [])] if 'tags' in entry else [] - } - all_entries.append(article_info) - print(f"✅ {len(feed.entries[:max_articles_per_feed])} articles fetched") - except Exception as e: - print(f"❌ Error fetching {feed_url}: {e}") - time.sleep(1) - - print(f"\n📊 Total: {len(all_entries)} articles from all feeds") - return all_entries - - def scrape_article_content(self, url: str) -> Optional[Dict]: - """Scrape full HTML content of a Medium article.""" - return self.html_scraper.scrape_medium_article(url) - - def scrape_articles_from_rss(self, max_articles: int = 20, delay: int = 2) -> List[Dict]: - """Fetch RSS feeds and scrape full content of each article.""" - print("=" * 60) - print("STEP 1: Fetching RSS feeds") - print("=" * 60) - rss_entries = self.fetch_rss_feeds(max_articles_per_feed=max_articles) - - unique_urls = list({entry['link'] for entry in rss_entries})[:max_articles] - - print("\n" + "=" * 60) - print(f"STEP 2: Scraping {len(unique_urls)} articles") - print("=" * 60) - - scraped_articles = [] - for i, url in enumerate(unique_urls, 1): - print(f"\n[{i}/{len(unique_urls)}] Scraping: {url}") - content = self.scrape_article_content(url) - if content: - scraped_articles.append(content) - print(f"✅ Article scraped: {content['title']}") - if i < len(unique_urls): - time.sleep(delay) - - self.articles_data = scraped_articles - - print("\n" + "=" * 60) - print(f"✨ Scraping done: {len(scraped_articles)} 
articles") - print("=" * 60) - - return scraped_articles - - def save_to_file(self, filename: str = "medium_articles.txt"): - """Save articles to text file.""" - with open(filename, 'w', encoding='utf-8') as f: - for article in self.articles_data: - f.write(f"\n{'='*80}\n") - f.write(f"TITLE: {article['title']}\n") - f.write(f"AUTHOR: {article['author']}\n") - f.write(f"DATE: {article['publish_date']}\n") - f.write(f"URL: {article['url']}\n") - f.write(f"TAGS: {', '.join(article['tags'])}\n") - f.write(f"WORD COUNT: {article['word_count']}\n") - f.write(f"\n{article['content']}\n") - - print(f"💾 Articles saved to {filename}") - - -if __name__ == "__main__": - scraper = MediumScraper() - articles = scraper.scrape_articles_from_rss(max_articles=2, delay=5) - - if articles: - scraper.save_to_file("medium_ai_articles.txt") - - if len(articles) > 0: - print("\n" + "="*60) - print("FIRST ARTICLE PREVIEW:") - print("="*60) - first = articles[0] - print(f"Title: {first['title']}") - print(f"Author: {first['author']}") - print(f"Date: {first['publish_date']}") - print(f"Words: {first['word_count']}") - print(f"Content (excerpt): {first['content'][:300]}...") diff --git a/scrap/arxiv/arxiv_papers.db b/scrap/arxiv/arxiv_papers.db deleted file mode 100644 index 526e51f..0000000 Binary files a/scrap/arxiv/arxiv_papers.db and /dev/null differ diff --git a/scrap/arxiv/category.md b/scrap/arxiv/category.md deleted file mode 100644 index ccdc1db..0000000 --- a/scrap/arxiv/category.md +++ /dev/null @@ -1,8 +0,0 @@ -| Code | Domaine | -| -------------- | --------------------------------------- | -| cs.AI | Artificial Intelligence | -| cs.CL | Computation and Language | -| cs.CV | Computer Vision and Pattern Recognition | -| math.PR | Probability | -| stat.ML | Machine Learning (Statistics) | -| physics.gen-ph | General Physics | diff --git a/scrap/arxiv/scrap_arxiv.py b/scrap/arxiv/scrap_arxiv.py deleted file mode 100644 index 3a10db7..0000000 --- a/scrap/arxiv/scrap_arxiv.py +++ /dev/null @@ -1,61 +0,0 @@ -import time -import os -import sqlite3 -import arxiv - -CATEGORY = "cs.LG" # check in category.md -INTERVAL = 300 # secondes -DB_FILE = os.path.join(os.path.dirname(__file__), "arxiv_papers.db") - -conn = sqlite3.connect(DB_FILE) -cursor = conn.cursor() -cursor.execute(""" -CREATE TABLE IF NOT EXISTS papers ( - id TEXT PRIMARY KEY, - title TEXT, - authors TEXT, - published TEXT, - summary TEXT, - link TEXT -) -""") -conn.commit() - -def save_paper(paper): - cursor.execute(""" - INSERT OR IGNORE INTO papers (id, title, authors, published, summary, link) - VALUES (?, ?, ?, ?, ?, ?) 
- """, ( - paper.entry_id, - paper.title, - ", ".join([a.name for a in paper.authors]), - paper.published.isoformat(), - paper.summary, - paper.entry_id - )) - conn.commit() - -cursor.execute("SELECT id FROM papers") -seen_ids = set(row[0] for row in cursor.fetchall()) - -while True: - search = arxiv.Search( - query=f"cat:{CATEGORY}", - max_results=10, - sort_by=arxiv.SortCriterion.SubmittedDate, - sort_order=arxiv.SortOrder.Descending - ) - - for result in search.results(): - if result.entry_id not in seen_ids: - print("NOUVEAU PAPIER !") - print("Title:", result.title) - print("Authors:", ", ".join([author.name for author in result.authors])) - print("Published:", result.published) - print("Link:", result.entry_id) - print("="*80) - - save_paper(result) - seen_ids.add(result.entry_id) - - time.sleep(INTERVAL) diff --git a/scrap/le_monde/lemonde_articles.db b/scrap/le_monde/lemonde_articles.db deleted file mode 100644 index a04385a..0000000 Binary files a/scrap/le_monde/lemonde_articles.db and /dev/null differ diff --git a/scrap/le_monde/scrap_le_monde.py b/scrap/le_monde/scrap_le_monde.py deleted file mode 100644 index beae010..0000000 --- a/scrap/le_monde/scrap_le_monde.py +++ /dev/null @@ -1,95 +0,0 @@ -import time -import os -import sqlite3 -import feedparser -from datetime import datetime - -FEEDS = [ - # Flux par catégorie - "https://www.lemonde.fr/bresil/rss_full.xml", - "https://www.lemonde.fr/international/rss_full.xml", - "https://www.lemonde.fr/actualite-medias/rss_full.xml", - # Flux "en continu" - "https://www.lemonde.fr/en_continu/rss_full.xml" -] - -DB_FILE = os.path.join(os.path.dirname(__file__), "lemonde_articles.db") -INTERVAL = 300 - -conn = sqlite3.connect(DB_FILE, detect_types=sqlite3.PARSE_DECLTYPES|sqlite3.PARSE_COLNAMES) -cur = conn.cursor() -cur.execute(""" -CREATE TABLE IF NOT EXISTS articles ( - id TEXT PRIMARY KEY, - title TEXT, - published TIMESTAMP, - summary TEXT, - link TEXT, - feed TEXT -) -""") -conn.commit() - -def save_article(entry, feed_url): - """ - entry: objet feedparser entry - """ - entry_id = getattr(entry, "id", None) or getattr(entry, "link", None) - title = getattr(entry, "title", "") - link = getattr(entry, "link", "") - summary = getattr(entry, "summary", "") - published = None - if getattr(entry, "published_parsed", None): - published = datetime.fromtimestamp(time.mktime(entry.published_parsed)) - elif getattr(entry, "updated_parsed", None): - published = datetime.fromtimestamp(time.mktime(entry.updated_parsed)) - else: - published = datetime.utcnow() - - cur.execute(""" - INSERT OR IGNORE INTO articles (id, title, published, summary, link, feed) - VALUES (?, ?, ?, ?, ?, ?) 
- """, (entry_id, title, published, summary, link, feed_url)) - conn.commit() - -def load_seen_ids(): - cur.execute("SELECT id FROM articles") - return set(row[0] for row in cur.fetchall()) - -def fetch_feed(feed_url): - return feedparser.parse(feed_url) - -def main(): - print("Initialisation...") - seen_ids = load_seen_ids() - print(f"{len(seen_ids)} articles déjà en base.") - try: - while True: - for feed in FEEDS: - try: - d = fetch_feed(feed) - if d.bozo: - print(f"[WARN] Problème lecture flux {feed}: {getattr(d, 'bozo_exception', '')}") - continue - - for entry in d.entries: - entry_id = getattr(entry, "id", None) or getattr(entry, "link", None) - if entry_id is None: - continue - if entry_id not in seen_ids: - print(f"[NOUVEAU] {entry.get('title','(no title)')}") - print(" ->", entry.get("link","")) - save_article(entry, feed) - seen_ids.add(entry_id) - except Exception as e: - print(f"[ERREUR] du fetch du feed {feed}: {e}") - - print(f"Attente {INTERVAL}s avant prochaine vérification...") - time.sleep(INTERVAL) - except KeyboardInterrupt: - print("Arrêt par l'utilisateur.") - finally: - conn.close() - -if __name__ == "__main__": - main() diff --git a/scrap/medium_scraping.py b/scrap/medium_scraping.py deleted file mode 100644 index 47405de..0000000 --- a/scrap/medium_scraping.py +++ /dev/null @@ -1,65 +0,0 @@ -import feedparser -from datetime import datetime -from typing import List, Dict, Optional -import time - -SOURCE_SITE = "medium" - -RSS_FEEDS = [ - "https://medium.com/feed/tag/artificial-intelligence", - "https://medium.com/feed/tag/machine-learning", - "https://medium.com/feed/tag/deep-learning", - "https://medium.com/feed/tag/ai", -] - -def normalize_medium_entry(entry: feedparser.FeedParserDict) -> Dict: - """Normalise une entrée RSS Medium dans le format unifié.""" - entry_id = entry.get('link', '') - - published_date = datetime.utcnow().isoformat() - if getattr(entry, "published_parsed", None): - published_date = datetime.fromtimestamp(time.mktime(entry.published_parsed)).isoformat() - - keywords = [tag.term for tag in entry.get('tags', [])] if 'tags' in entry else [] - - return { - "id": entry_id, - "source_site": SOURCE_SITE, - "title": entry.get('title', 'N/A'), - "description": entry.get('summary', 'N/A'), - "author_info": entry.get('author', 'N/A'), - "keywords": ", ".join(keywords), - "content_url": entry_id, - "published_date": published_date, - "item_type": "article", - } - -def scrape_medium(max_articles_per_feed: int = 10) -> List[Dict]: - """Scrape les flux RSS Medium et retourne les éléments unifiés.""" - all_items = [] - unique_links = set() - - for feed_url in RSS_FEEDS: - print(f"📡 Fetching RSS: {feed_url}") - try: - feed = feedparser.parse(feed_url) - - for entry in feed.entries[:max_articles_per_feed]: - link = entry.get('link') - if link and link not in unique_links: - all_items.append(normalize_medium_entry(entry)) - unique_links.add(link) - - except Exception as e: - print(f"❌ Error fetching {feed_url}: {e}") - time.sleep(1) - - return all_items - -if __name__ == "__main__": - results = scrape_medium(max_articles_per_feed=2) - print(f"Total Medium items scraped: {len(results)}") - if results: - print("\nExemple d'élément unifié:") - import json - print(json.dumps(results[0], indent=2)) \ No newline at end of file diff --git a/scrap/scrap_arxiv.py b/scrap/scrap_arxiv.py deleted file mode 100644 index 868b857..0000000 --- a/scrap/scrap_arxiv.py +++ /dev/null @@ -1,59 +0,0 @@ -import arxiv -from datetime import datetime -from typing import List, Dict - 
-# Constantes de l'outil de veille -SOURCE_SITE = "arxiv" -CATEGORY = "cs.LG" - -def normalize_arxiv_result(paper: arxiv.Result) -> Dict: - """Normalise un résultat arXiv dans le format unifié.""" - - authors = ", ".join([a.name for a in paper.authors]) - - link = paper.entry_id - - keywords_list = [paper.primary_category] - if paper.categories: - keywords_list.extend(paper.categories) - - return { - "id": link, - "source_site": SOURCE_SITE, - "title": paper.title.replace('\n', ' '), - "description": paper.summary.replace('\n', ' '), - "author_info": authors, - "keywords": ", ".join(keywords_list), - "content_url": link, - "published_date": paper.published.isoformat(), - "item_type": "paper", - } - -def scrape_arxiv(category: str = CATEGORY, max_results: int = 10) -> List[Dict]: - """Scrape arXiv pour une catégorie et retourne les éléments unifiés.""" - - try: - search = arxiv.Search( - query=f"cat:{category}", - max_results=max_results, - sort_by=arxiv.SortCriterion.SubmittedDate, - sort_order=arxiv.SortOrder.Descending - ) - - normalized_results = [] - for result in search.results(): - normalized_results.append(normalize_arxiv_result(result)) - - return normalized_results - - except Exception as e: - print(f"[ERREUR] arXiv Search: {e}") - return [] - -if __name__ == "__main__": - results = scrape_arxiv(max_results=5) - print(f"Total arXiv items scraped: {len(results)}") - if results: - print("\nExemple d'élément unifié:") - import json - print(json.dumps(results[0], indent=2)) \ No newline at end of file diff --git a/scrap/scrap_le_monde.py b/scrap/scrap_le_monde.py deleted file mode 100644 index baf74d7..0000000 --- a/scrap/scrap_le_monde.py +++ /dev/null @@ -1,71 +0,0 @@ -import feedparser -import time -from datetime import datetime -from typing import List, Dict - -SOURCE_SITE = "le_monde" - -FEEDS = [ - "https://www.lemonde.fr/international/rss_full.xml", - "https://www.lemonde.fr/actualite-medias/rss_full.xml", - "https://www.lemonde.fr/en_continu/rss_full.xml" -] - -def normalize_lemonde_entry(entry: feedparser.FeedParserDict, feed_url: str) -> Dict: - """Normalise une entrée RSS Le Monde dans le format unifié.""" - entry_id = getattr(entry, "id", None) or getattr(entry, "link", None) - - published_date = datetime.utcnow().isoformat() - if getattr(entry, "published_parsed", None): - published_date = datetime.fromtimestamp(time.mktime(entry.published_parsed)).isoformat() - elif getattr(entry, "updated_parsed", None): - published_date = datetime.fromtimestamp(time.mktime(entry.updated_parsed)).isoformat() - - category = "actualité générale" - if "international" in feed_url: - category = "international" - elif "medias" in feed_url: - category = "actualité médias" - elif "continu" in feed_url: - category = "en continu" - - return { - "id": entry_id, - "source_site": SOURCE_SITE, - "title": getattr(entry, "title", ""), - "description": getattr(entry, "summary", ""), - "author_info": getattr(entry, "author", "Le Monde"), - "keywords": category, - "content_url": getattr(entry, "link", ""), - "published_date": published_date, - "item_type": "article", - } - -def scrape_lemonde(feeds: List[str] = FEEDS) -> List[Dict]: - """Scrape les flux RSS Le Monde et retourne les éléments unifiés.""" - all_items = [] - unique_ids = set() - - for feed_url in feeds: - try: - d = feedparser.parse(feed_url) - - for entry in d.entries: - entry_id = getattr(entry, "id", None) or getattr(entry, "link", None) - if entry_id and entry_id not in unique_ids: - all_items.append(normalize_lemonde_entry(entry, 
feed_url)) - unique_ids.add(entry_id) - - except Exception as e: - print(f"[ERREUR] du fetch du feed {feed_url}: {e}") - time.sleep(1) - - return all_items - -if __name__ == "__main__": - results = scrape_lemonde(feeds=FEEDS[:1]) - print(f"Total Le Monde items scraped: {len(results)}") - if results: - print("\nExemple d'élément unifié:") - import json - print(json.dumps(results[0], indent=2)) \ No newline at end of file diff --git a/scrap/scrape_github.py b/scrap/scrape_github.py deleted file mode 100644 index b775dbe..0000000 --- a/scrap/scrape_github.py +++ /dev/null @@ -1,130 +0,0 @@ -import os -import requests -from datetime import datetime, UTC -from typing import List, Dict - -SOURCE_SITE = "github" -GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") - - -THEMES = [ - "large-language-model", "llm", "transformer", "text-generation", "retrieval-augmented-generation", - "rag", "agents", "chatbot", "fine-tuning", "quantization", "lora", "peft", - "diffusion", "stable-diffusion", "image-generation", "multimodal", - "speech-to-text", "speech-synthesis", "audio", "reinforcement-learning", - "computer-vision", -] - -HEADERS = { - "Accept": "application/vnd.github+json", - "User-Agent": "github-ai-theme-watcher/1.0" -} -if GITHUB_TOKEN: - HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}" - -class RateLimitError(Exception): - def __init__(self, retry_after=None): - self.retry_after = retry_after - super().__init__("Rate limit hit on GitHub API. Retry after: {}".format(retry_after)) - -def sanitize_text(s): - return str(s) if s is not None else "" - -def normalize_github_repo(repo: Dict, theme: str) -> Dict: - full_name = repo.get("full_name") - keywords_list = [theme, repo.get("language") or ""] - if repo.get("topics"): - keywords_list.extend(repo.get("topics")) - updated_at = repo.get("updated_at") or repo.get("pushed_at") or datetime.now(UTC).isoformat() - return { - "id": full_name, "source_site": SOURCE_SITE, "title": repo.get("name"), - "description": sanitize_text(repo.get("description")), "author_info": repo.get("owner", {}).get("login", ""), - "keywords": ", ".join(filter(None, keywords_list)), "content_url": repo.get("html_url") or f"https://github.com/{full_name}", - "published_date": updated_at, "item_type": "repository", - } - -def build_query_for_theme(theme: str) -> str: - theme_token = theme.replace(" ", "+") - q = f"{theme_token} in:name,description,readme stars:>50" - return q - - -def search_github_repos(query: str, per_page: int = 20) -> List[Dict]: - """ - Recherche des repositories GitHub. - Lève RateLimitError ou retourne List[Dict] (vide ou pleine). 
- """ - url = "https://api.github.com/search/repositories" - params = { - "q": query, - "sort": "stars", - "order": "desc", - "per_page": per_page - } - - try: - resp = requests.get(url, headers=HEADERS, params=params, timeout=20) - - if resp.status_code == 403: - retry_after = resp.headers.get("Retry-After") - raise RateLimitError(retry_after=int(retry_after) if retry_after and retry_after.isdigit() else None) - - if resp.status_code != 200: - print(f"[WARN] HTTP Status {resp.status_code} for query: {query}") - return [] - - data = resp.json() - return data.get("items", []) - - except RateLimitError: - raise - except requests.exceptions.RequestException as e: - print(f"[ERREUR CONNEXION/HTTP] GitHub Search: {e}") - return [] - except Exception as e: - print(f"[ERREUR INCONNUE/JSON] GitHub Search: {e}") - return [] - - -def scrape_github(themes: List[str] = THEMES, limit_per_theme: int = 20) -> List[Dict]: - """Scrape GitHub pour les thèmes donnés et retourne les éléments unifiés.""" - - all_items = [] - stop_scraping = False - - try: - for theme in themes: - if stop_scraping: - break - - q = build_query_for_theme(theme) - print(f"-> Recherche thème '{theme}' (q={q})") - - try: - items = search_github_repos(q, limit_per_theme) - - if not isinstance(items, list): - print(f"[FATAL WARN] search_github_repos a retourné {type(items)} au lieu de list. Arrêt.") - stop_scraping = True - continue - - normalized_items = [normalize_github_repo(repo, theme) for repo in items] - all_items.extend(normalized_items) - - except RateLimitError: - print(f"[RATE LIMIT] Limite atteinte. Arrêt de la veille GitHub pour cette itération.") - stop_scraping = True - except Exception as e: - print(f"[ERREUR THÈME] '{theme}': {e}") - continue - - finally: - return all_items - -if __name__ == "__main__": - results = scrape_github(themes=["llm"], limit_per_theme=5) - print(f"Total GitHub items scraped: {len(results)}") - if results: - import json - print("\nExemple d'élément unifié:") - print(json.dumps(results[0], indent=2)) \ No newline at end of file diff --git a/scrap/scrape_hf.py b/scrap/scrape_hf.py deleted file mode 100644 index 1c19c55..0000000 --- a/scrap/scrape_hf.py +++ /dev/null @@ -1,93 +0,0 @@ -import requests -from datetime import datetime, UTC -from typing import List, Dict - -SOURCE_SITE = "huggingface" - -def build_url(item: Dict, item_type: str) -> str: - """Construit l’URL publique de l’élément""" - base = "https://huggingface.co" - item_id = item.get("id") - if item_type == "model": - return f"{base}/{item.get('modelId')}" - elif item_type in ("dataset", "space", "collection", "paper"): - return f"{base}/{item_id}" - return base - -def normalize_huggingface_item(item: Dict, item_type: str) -> Dict: - """Normalise un élément Hugging Face dans le format unifié.""" - item_name = item.get("name") or item.get("modelId") or item.get("id") - item_id = item.get("id") or item.get("modelId") or item.get("name") - - author = item.get("author") or item.get("organization", "") - - description = item.get("description", item_name) - - keywords_list = [] - if item.get("tags"): - keywords_list.extend(item.get("tags")) - if item.get("pipeline_tag"): - tag = item.get("pipeline_tag") - keywords_list.append(tag if isinstance(tag, str) else ", ".join(tag)) - - last_modified = item.get("lastModified") or item.get("last_modified") or datetime.now(UTC).isoformat() - - return { - "id": item_id, - "source_site": SOURCE_SITE, - "title": item_name, - "description": description, - "author_info": author, - "keywords": ", 
".join(keywords_list), - "content_url": build_url(item, item_type), - "published_date": last_modified, - "item_type": item_type, - } - -def fetch_huggingface_api(endpoint: str, item_type: str, limit: int = 20) -> List[Dict]: - """Récupère les données d'un endpoint spécifique et les normalise.""" - url = f"https://huggingface.co/api/{endpoint}?sort=lastModified&direction=-1&limit={limit}" - - try: - r = requests.get(url, timeout=20) - - if r.status_code == 404: - return [] - - r.raise_for_status() - - items = r.json() - - normalized_items = [normalize_huggingface_item(item, item_type) for item in items] - return normalized_items - - except Exception as e: - print(f"[ERREUR] HF {item_type}: {e}") - return [] - -def scrape_huggingface(limit_per_type: int = 20) -> List[Dict]: - """Scrape le Hugging Face Hub, ignorant l'endpoint 'organizations'.""" - - fetchers = [ - ("models", "model"), - ("datasets", "dataset"), - ("spaces", "space"), - ("collections", "collection"), - ("papers", "paper"), - ] - - all_items = [] - - for endpoint, item_type in fetchers: - items = fetch_huggingface_api(endpoint, item_type, limit_per_type) - all_items.extend(items) - - return all_items - -if __name__ == "__main__": - results = scrape_huggingface(limit_per_type=5) - print(f"Total Hugging Face items scraped: {len(results)}") - if results: - print("\nExemple d'élément unifié:") - import json - print(json.dumps(results[0], indent=2)) \ No newline at end of file diff --git a/scrap/unified_scrapper_pipeline.py b/scrap/unified_scrapper_pipeline.py deleted file mode 100644 index de08f4b..0000000 --- a/scrap/unified_scrapper_pipeline.py +++ /dev/null @@ -1,145 +0,0 @@ -import sqlite3 -from datetime import datetime, UTC -from typing import List, Dict -import time -import os -from scrape_hf import scrape_huggingface -from scrape_github import scrape_github -from medium_scraping import scrape_medium -from scrap_arxiv import scrape_arxiv -from scrap_le_monde import scrape_lemonde - - -DB_FILE = "veille_technique_unified.db" - -def setup_database(): - """Initialise la base de données et crée la table unifiée.""" - conn = sqlite3.connect(DB_FILE) - cur = conn.cursor() - - cur.execute(""" - CREATE TABLE IF NOT EXISTS unified_data ( - id TEXT PRIMARY KEY, - source_site TEXT NOT NULL, - title TEXT NOT NULL, - description TEXT, - author_info TEXT, - keywords TEXT, - content_url TEXT NOT NULL, - published_date TEXT, - item_type TEXT, - created_at TIMESTAMP - ) - """) - conn.commit() - conn.close() - -def save_unified_item(item: Dict, conn: sqlite3.Connection): - """Insère un élément unifié dans la base de données.""" - cur = conn.cursor() - now = datetime.now(UTC).isoformat() - - cur.execute(""" - INSERT OR IGNORE INTO unified_data - (id, source_site, title, description, author_info, keywords, content_url, published_date, item_type, created_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, ( - item["id"], - item["source_site"], - item["title"], - item["description"], - item["author_info"], - item["keywords"], - item["content_url"], - item["published_date"], - item["item_type"], - now - )) - conn.commit() - -def run_scrapers_and_save(): - """Exécute tous les scrapers, collecte les données et les sauvegarde.""" - print("--- Démarrage du Pipeline de Veille Technique ---") - setup_database() - - conn = sqlite3.connect(DB_FILE) - - scrapers = [ - ("Hugging Face", scrape_huggingface, 10), - ("GitHub", scrape_github, 5), - ("Medium", scrape_medium, 5), - ("arXiv", scrape_arxiv, 10), - ("Le Monde", scrape_lemonde, None), - ] - - total_new_items = 0 - - for name, scraper_func, limit in scrapers: - print(f"\n🚀 Lancement du scraper : **{name}**") - - try: - items = scraper_func(limit) if limit is not None else scraper_func() - - - if items is None: - print(f" ❌ **ALERTE: Le scraper {name} a retourné None. Skipping.**") - continue - - try: - iter(items) - - except TypeError: - print(f" ❌ **ERREUR FATALE (Non-Itérable)**: Le scraper {name} a retourné un type non itérable ({type(items)}). Skipping.") - continue - - if not isinstance(items, list): - print(f" ⚠️ WARNING: Le scraper {name} a retourné un objet itérable ({type(items)}) mais pas une liste. Conversion en liste.") - items = list(items) - - print(f" -> {len(items)} éléments récupérés.") - - count_saved = 0 - for item in items: - save_unified_item(item, conn) - count_saved += 1 - - print(f" -> {count_saved} éléments insérés/vérifiés dans la base de données.") - total_new_items += count_saved - - except Exception as e: - print(f" ❌ **ERREUR FATALE** lors du scraping {name}: {e}") - - conn.close() - print(f"\n--- Pipeline Terminé. {total_new_items} éléments traités. ---") - print(f"Base de données unifiée : **{DB_FILE}**") - -def check_results(): - """Affiche les 5 premières entrées de la base de données unifiée.""" - conn = sqlite3.connect(DB_FILE) - cur = conn.cursor() - - cur.execute("SELECT * FROM unified_data LIMIT 5") - rows = cur.fetchall() - - print("\n--- Aperçu des Résultats Unifiés (5 premières lignes) ---") - if not rows: - print("La base de données est vide.") - return - - column_names = [description[0] for description in cur.description] - print(f"Colonnes: {column_names}") - print("-" * 120) - - for row in rows: - print(row) - - cur.execute("SELECT COUNT(*) FROM unified_data") - total_count = cur.fetchone()[0] - print(f"\nTotal des éléments dans la DB : **{total_count}**") - - conn.close() - - -if __name__ == "__main__": - run_scrapers_and_save() - check_results() \ No newline at end of file diff --git a/scrape_github.py b/scrape_github.py deleted file mode 100644 index 7e631e6..0000000 --- a/scrape_github.py +++ /dev/null @@ -1,214 +0,0 @@ -#!/usr/bin/env python3 -""" -github_ai_theme_watcher.py - -Veille thématique GitHub orientée IA — recherche de projets par thème (ex: "LLM", "diffusion", "RAG", ...) -Stocke des résultats synthétiques dans une base SQLite pour consommation par un dashboard / newsletter / alertes. 
- -Usage: - python github_ai_theme_watcher.py # tourne en continu (sleep INTERVAL) - python github_ai_theme_watcher.py --once # exécute une seule itération (utile pour cron/tests) - -Configure via variables en tête du fichier ou via variables d'environnement: - - GITHUB_TOKEN: token (optionnel mais recommandé) -""" - -import os -import sys -import time -import sqlite3 -import requests -import argparse -from datetime import datetime -from typing import List - - -THEMES = [ - "large-language-model", - "llm", - "transformer", - "text-generation", - "retrieval-augmented-generation", - "rag", - "agents", - "chatbot", - "fine-tuning", - "quantization", - "lora", - "peft", - "diffusion", - "stable-diffusion", - "image-generation", - "multimodal", - "speech-to-text", - "speech-synthesis", - "audio", - "reinforcement-learning", - "computer-vision", -] - -RESULTS_PER_THEME = 20 - -INTERVAL = int(os.getenv("GITHUB_WATCHER_INTERVAL", 21600)) - -DB_FILE = os.path.join(os.path.dirname(__file__), "github_ai_trending.db") - -GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") - -HEADERS = { - "Accept": "application/vnd.github+json", - "User-Agent": "github-ai-theme-watcher/1.0" -} -if GITHUB_TOKEN: - HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}" - -conn = sqlite3.connect(DB_FILE) -cur = conn.cursor() - -cur.execute(""" -CREATE TABLE IF NOT EXISTS trending_ai_projects ( - full_name TEXT PRIMARY KEY, - name TEXT, - description TEXT, - stars INTEGER, - language TEXT, - theme TEXT, - updated_at TEXT, - html_url TEXT, - last_seen TIMESTAMP -) -""") -conn.commit() - -cur.execute(""" -CREATE TABLE IF NOT EXISTS project_history ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - full_name TEXT, - stars INTEGER, - updated_at TEXT, - captured_at TIMESTAMP -) -""") -conn.commit() - - -def search_github_repos(query: str, per_page: int = RESULTS_PER_THEME) -> List[dict]: - """ - Recherche des repositories GitHub via l'API Search. - `query` doit être la Q de recherche (ex: "transformer language:python"). - """ - url = "https://api.github.com/search/repositories" - params = { - "q": query, - "sort": "stars", - "order": "desc", - "per_page": per_page - } - resp = requests.get(url, headers=HEADERS, params=params, timeout=20) - if resp.status_code == 403: - retry_after = resp.headers.get("Retry-After") - raise RateLimitError(retry_after=int(retry_after) if retry_after and retry_after.isdigit() else None) - resp.raise_for_status() - data = resp.json() - return data.get("items", []) - -def sanitize_text(s): - if s is None: - return "" - return str(s) - -def save_project(repo: dict, theme: str): - """INSERT OR REPLACE de l'enregistrement principal + ajout historique.""" - full_name = repo.get("full_name") - name = repo.get("name") - desc = sanitize_text(repo.get("description")) - stars = repo.get("stargazers_count", 0) - language = repo.get("language") or "" - updated_at = repo.get("updated_at") or repo.get("pushed_at") or datetime.utcnow().isoformat() - html_url = repo.get("html_url") or f"https://github.com/{full_name}" - now = datetime.utcnow().isoformat() - - cur.execute(""" - INSERT OR REPLACE INTO trending_ai_projects - (full_name, name, description, stars, language, theme, updated_at, html_url, last_seen) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) - """, (full_name, name, desc, stars, language, theme, updated_at, html_url, now)) - conn.commit() - - cur.execute(""" - INSERT INTO project_history (full_name, stars, updated_at, captured_at) - VALUES (?, ?, ?, ?) 
- """, (full_name, stars, updated_at, now)) - conn.commit() - - -class RateLimitError(Exception): - def __init__(self, retry_after=None): - self.retry_after = retry_after - super().__init__("Rate limit hit on GitHub API. Retry after: {}".format(retry_after)) - - -def build_query_for_theme(theme: str) -> str: - theme_token = theme.replace(" ", "+") - q = f"{theme_token} in:name,description,readme stars:>50" - - return q - -def run_once(themes=THEMES): - print(f"[{datetime.utcnow().isoformat()}] Démarrage d'une itération de veille (thèmes: {len(themes)})") - total_saved = 0 - for theme in themes: - try: - q = build_query_for_theme(theme) - print(f"-> Recherche thème '{theme}' (q={q})") - items = search_github_repos(q) - print(f" ↳ {len(items)} résultats récupérés pour '{theme}'") - for repo in items: - save_project(repo, theme) - total_saved += 1 - except RateLimitError as rle: - wait = rle.retry_after or 60 - print(f"[RATE LIMIT] Limit atteint. Pause {wait} secondes.") - time.sleep(wait) - except Exception as e: - print(f"[ERREUR] thème '{theme}': {e}") - print(f"[{datetime.utcnow().isoformat()}] Itération terminée — {total_saved} enregistrements traités.") - return total_saved - -def main_loop(interval=INTERVAL, once=False): - if once: - run_once() - return - - try: - while True: - run_once() - print(f"Attente {interval} secondes avant la prochaine vérification...") - time.sleep(interval) - except KeyboardInterrupt: - print("") - finally: - conn.close() - -def parse_args(): - p = argparse.ArgumentParser(description="Veille thématique GitHub orientée IA") - p.add_argument("--once", action="store_true", help="Exécuter une unique itération et quitter") - p.add_argument("--interval", type=int, default=INTERVAL, help="Intervalle entre itérations (secondes)") - p.add_argument("--themes", type=str, help="Liste de thèmes séparés par des virgules (remplace la config)") - return p.parse_args() - -if __name__ == "__main__": - args = parse_args() - if args.themes: - THEMES = [t.strip() for t in args.themes.split(",") if t.strip()] - print(f"Themes remplacés: {THEMES}") - - INTERVAL = args.interval - - print("Github AI Theme Watcher démarré.") - if GITHUB_TOKEN: - print("") - else: - print("") - - main_loop(interval=INTERVAL, once=args.once) diff --git a/scrape_hf.py b/scrape_hf.py deleted file mode 100644 index fc021ba..0000000 --- a/scrape_hf.py +++ /dev/null @@ -1,146 +0,0 @@ -import os -import time -import sqlite3 -import requests -from datetime import datetime - -INTERVAL = 300 - -DB_FILE = os.path.join(os.path.dirname(__file__), "huggingface_hub.db") -conn = sqlite3.connect(DB_FILE) -cur = conn.cursor() - -cur.execute(""" -CREATE TABLE IF NOT EXISTS hubs ( - id TEXT PRIMARY KEY, - name TEXT, - author TEXT, - likes INTEGER, - downloads INTEGER, - task TEXT, - last_modified TEXT, - type TEXT, - url TEXT -) -""") -conn.commit() - - -def fetch_models(): - """Récupère les modèles récents via l’API Hugging Face""" - url = "https://huggingface.co/api/models?sort=lastModified&direction=-1&limit=20" - r = requests.get(url, timeout=20) - r.raise_for_status() - return r.json() - -def fetch_datasets(): - """Récupère les datasets récents""" - url = "https://huggingface.co/api/datasets?sort=lastModified&direction=-1&limit=20" - r = requests.get(url, timeout=20) - r.raise_for_status() - return r.json() - -def fetch_spaces(): - """Récupère les Spaces récents""" - url = "https://huggingface.co/api/spaces?sort=lastModified&direction=-1&limit=20" - r = requests.get(url, timeout=20) - r.raise_for_status() - return 
r.json() - -def fetch_collections(): - """Récupère les collections récentes""" - url = "https://huggingface.co/api/collections?sort=lastModified&direction=-1&limit=20" - r = requests.get(url, timeout=20) - if r.status_code == 404: - return [] - r.raise_for_status() - return r.json() - -def fetch_organizations(): - """Récupère les organisations récentes""" - url = "https://huggingface.co/api/organizations?limit=20" - r = requests.get(url, timeout=20) - r.raise_for_status() - return r.json() - -def fetch_papers(): - """Récupère les papiers de recherche récents (si API accessible)""" - url = "https://huggingface.co/api/papers?sort=lastModified&direction=-1&limit=20" - try: - r = requests.get(url, timeout=20) - if r.status_code == 404: - return [] - r.raise_for_status() - return r.json() - except Exception: - return [] - - -def save_item(item, item_type): - cur.execute(""" - INSERT OR IGNORE INTO hubs (id, name, author, likes, downloads, task, last_modified, type, url) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) - """, ( - item.get("id"), - item.get("name") or item.get("modelId") or item.get("id"), - item.get("author") or item.get("organization", ""), - item.get("likes", 0), - item.get("downloads", 0), - ", ".join(item.get("pipeline_tag", "")) if isinstance(item.get("pipeline_tag"), list) else item.get("pipeline_tag", ""), - item.get("lastModified") or item.get("last_modified") or datetime.utcnow().isoformat(), - item_type, - build_url(item, item_type) - )) - conn.commit() - -def build_url(item, item_type): - """Construit l’URL publique de l’élément""" - base = "https://huggingface.co" - if item_type in ("model", "dataset", "space", "collection", "organization"): - return f"{base}/{item.get('id')}" - elif item_type == "paper": - return f"{base}/papers/{item.get('id')}" - return base - -def load_seen_ids(): - cur.execute("SELECT id FROM hubs") - return set(row[0] for row in cur.fetchall()) - - -def main(): - print("Initialisation Hugging Face Hub Watcher...") - seen_ids = load_seen_ids() - print(f"{len(seen_ids)} éléments déjà enregistrés.") - - fetchers = [ - ("model", fetch_models), - ("dataset", fetch_datasets), - ("space", fetch_spaces), - ("collection", fetch_collections), - ("organization", fetch_organizations), - ("paper", fetch_papers) - ] - - try: - while True: - for item_type, fetch_func in fetchers: - try: - items = fetch_func() - for item in items: - item_id = item.get("id") - if item_id and item_id not in seen_ids: - print(f"[NOUVEAU {item_type.upper()}] {item_id}") - save_item(item, item_type) - seen_ids.add(item_id) - except Exception as e: - print(f"[ERREUR] {item_type}: {e}") - - print(f"Attente {INTERVAL}s avant prochaine vérification...\n") - time.sleep(INTERVAL) - except KeyboardInterrupt: - print("Arrêt manuel.") - finally: - conn.close() - -if __name__ == "__main__": - main() diff --git a/server/.env.example b/scrapper/.env.example similarity index 100% rename from server/.env.example rename to scrapper/.env.example diff --git a/server/ARCHITECTURE.md b/scrapper/ARCHITECTURE.md similarity index 100% rename from server/ARCHITECTURE.md rename to scrapper/ARCHITECTURE.md diff --git a/server/clustering/cluster_scoring.py b/scrapper/clustering/cluster_scoring.py similarity index 100% rename from server/clustering/cluster_scoring.py rename to scrapper/clustering/cluster_scoring.py diff --git a/server/clustering/cluster_thresholds.py b/scrapper/clustering/cluster_thresholds.py similarity index 100% rename from server/clustering/cluster_thresholds.py rename to 
scrapper/clustering/cluster_thresholds.py diff --git a/server/config.py b/scrapper/config.py similarity index 53% rename from server/config.py rename to scrapper/config.py index 273392c..5c8c273 100644 --- a/server/config.py +++ b/scrapper/config.py @@ -36,37 +36,4 @@ def __post_init__(self): "lemonde": ScraperConfig(enabled=True, limit_latest=20, limit_all=100), "huggingface": ScraperConfig(enabled=True, limit_latest=20, limit_all=100), } - - @classmethod - def from_file(cls, filepath: str) -> "ServerConfig": - """Load configuration from JSON/YAML file.""" - import json - - try: - with open(filepath, 'r') as f: - data = json.load(f) - - if "scrapers" in data: - data["scrapers"] = { - name: ScraperConfig(**cfg) - for name, cfg in data["scrapers"].items() - } - - return cls(**data) - except FileNotFoundError: - print(f"Config file {filepath} not found, using default config") - return cls() - - -DEFAULT_CONFIG = ServerConfig() - -DEV_CONFIG = ServerConfig( - db_url=os.getenv("DATABASE_URL", "postgresql://postgres:postgres@localhost:5432/veille_technique_dev"), - watch_interval_seconds=60, -) -PROD_CONFIG = ServerConfig( - db_url=os.getenv("DATABASE_URL", "postgresql://postgres:postgres@localhost:5432/veille_technique"), - watch_interval_seconds=600, - log_level="WARNING", -) diff --git a/server/cross_encoder.py b/scrapper/cross_encoder.py similarity index 100% rename from server/cross_encoder.py rename to scrapper/cross_encoder.py diff --git a/server/database.py b/scrapper/database.py similarity index 100% rename from server/database.py rename to scrapper/database.py diff --git a/server/embeddings.py b/scrapper/embeddings.py similarity index 69% rename from server/embeddings.py rename to scrapper/embeddings.py index 0476a95..a2a3542 100644 --- a/server/embeddings.py +++ b/scrapper/embeddings.py @@ -34,67 +34,6 @@ def get_dimension(self) -> Optional[int]: return None -class DummyEmbeddingProvider(EmbeddingProvider): - """Dummy embedding provider for development.""" - - def __init__(self, dimension: int = 1536): - """ - Initialize dummy provider. - - Args: - dimension: Embedding dimension - """ - self.dimension = dimension - - def embed(self, text: str) -> np.ndarray: - """Generate deterministic random embedding from text hash.""" - seed = abs(hash(text)) % (2**31) - np.random.seed(seed) - return np.random.randn(self.dimension).astype(np.float32) - - def get_name(self) -> str: - """Return provider name.""" - return "dummy" - - def get_dimension(self) -> Optional[int]: - return self.dimension - - -class SentenceTransformerEmbeddingProvider(EmbeddingProvider): - """Embedding provider using sentence-transformers.""" - - def __init__(self, model_name: str = "all-MiniLM-L6-v2"): - """ - Initialize SentenceTransformers provider. - - Args: - model_name: Model name to use - """ - try: - from sentence_transformers import SentenceTransformer - except ImportError: - raise ImportError( - "sentence-transformers is required. 
Install it with: " - "pip install sentence-transformers" - ) - - self.model_name = model_name - self.model = SentenceTransformer(model_name) - self.dimension = getattr(self.model, "get_sentence_embedding_dimension", lambda: None)() - - def embed(self, text: str) -> np.ndarray: - """Generate embedding with SentenceTransformer.""" - embedding = self.model.encode(text, convert_to_numpy=True) - return embedding.astype(np.float32) - - def get_name(self) -> str: - """Return provider name.""" - return f"sentence-transformers-{self.model_name}" - - def get_dimension(self) -> Optional[int]: - return self.dimension - - class OpenAIEmbeddingProvider(EmbeddingProvider): """Embedding provider using OpenAI API.""" @@ -178,14 +117,14 @@ def _infer_dimension(self, model: str) -> Optional[int]: class EmbeddingManager: """Manage embeddings for articles.""" - def __init__(self, provider: Optional[EmbeddingProvider] = None, expected_dimension: Optional[int] = None): + def __init__(self, provider: EmbeddingProvider, expected_dimension: Optional[int] = None): """Initialize embedding manager. Args: - provider: Embedding provider to use (default: Dummy) + provider: Embedding provider to use expected_dimension: Optional enforced dimension (aligns with DB vector size) """ - self.provider = provider or DummyEmbeddingProvider() + self.provider = provider self.expected_dimension = expected_dimension or self.provider.get_dimension() def embed_text(self, text: str) -> np.ndarray: diff --git a/server/entity_llm_processor.py b/scrapper/entity_llm_processor.py similarity index 100% rename from server/entity_llm_processor.py rename to scrapper/entity_llm_processor.py diff --git a/server/examples.py b/scrapper/examples.py similarity index 99% rename from server/examples.py rename to scrapper/examples.py index 2a76c0e..606606e 100644 --- a/server/examples.py +++ b/scrapper/examples.py @@ -5,7 +5,6 @@ import asyncio import os from main import WatchServer -from config import DEV_CONFIG, PROD_CONFIG def example_backfill(): diff --git a/server/examples_new_features.py b/scrapper/examples_new_features.py similarity index 100% rename from server/examples_new_features.py rename to scrapper/examples_new_features.py diff --git a/server/main.py b/scrapper/main.py similarity index 93% rename from server/main.py rename to scrapper/main.py index 5a59f15..fdabac6 100644 --- a/server/main.py +++ b/scrapper/main.py @@ -261,15 +261,6 @@ def print_stats(self): for source, count in stats['articles_by_source'].items(): print(f" - {source}: {count}") print("=" * 60) - - def export_database(self, output_path: str) -> bool: - """Guide export for PostgreSQL deployments.""" - logger.error("Export is not handled automatically for PostgreSQL. Use pg_dump instead.") - logger.info( - "Example: pg_dump --dbname=$DATABASE_URL --format=c --file=%s", - output_path, - ) - return False def main(): @@ -279,7 +270,7 @@ def main(): ) parser.add_argument( "mode", - choices=["watch", "backfill", "stats", "export"], + choices=["watch", "backfill", "stats"], help="Execution mode" ) parser.add_argument( @@ -309,11 +300,6 @@ def main(): default=100, help="Max articles per source (backfill mode)" ) - parser.add_argument( - "--output", - default="veille_export.db", - help="Output file path for export mode" - ) args = parser.parse_args() @@ -333,12 +319,6 @@ def main(): elif args.mode == "stats": server.print_stats() - - elif args.mode == "export": - logger.info( - "Export helper: use pg_dump on your PostgreSQL instance. 
Example: pg_dump --dbname=$DATABASE_URL --file=%s", - args.output, - ) if __name__ == "__main__": main() \ No newline at end of file diff --git a/server/requirements.txt b/scrapper/requirements.txt similarity index 100% rename from server/requirements.txt rename to scrapper/requirements.txt diff --git a/server/scrapers/__init__.py b/scrapper/scrapers/__init__.py similarity index 100% rename from server/scrapers/__init__.py rename to scrapper/scrapers/__init__.py diff --git a/server/scrapers/arxiv_scraper.py b/scrapper/scrapers/arxiv_scraper.py similarity index 100% rename from server/scrapers/arxiv_scraper.py rename to scrapper/scrapers/arxiv_scraper.py diff --git a/server/scrapers/base.py b/scrapper/scrapers/base.py similarity index 100% rename from server/scrapers/base.py rename to scrapper/scrapers/base.py diff --git a/server/scrapers/github_scraper.py b/scrapper/scrapers/github_scraper.py similarity index 100% rename from server/scrapers/github_scraper.py rename to scrapper/scrapers/github_scraper.py diff --git a/server/scrapers/huggingface_scraper.py b/scrapper/scrapers/huggingface_scraper.py similarity index 100% rename from server/scrapers/huggingface_scraper.py rename to scrapper/scrapers/huggingface_scraper.py diff --git a/server/scrapers/lemonde_scraper.py b/scrapper/scrapers/lemonde_scraper.py similarity index 100% rename from server/scrapers/lemonde_scraper.py rename to scrapper/scrapers/lemonde_scraper.py diff --git a/server/scrapers/medium_scraper.py b/scrapper/scrapers/medium_scraper.py similarity index 100% rename from server/scrapers/medium_scraper.py rename to scrapper/scrapers/medium_scraper.py diff --git a/server/veille_technique.db b/scrapper/veille_technique.db similarity index 100% rename from server/veille_technique.db rename to scrapper/veille_technique.db diff --git a/server/Cargo.toml b/server/Cargo.toml new file mode 100644 index 0000000..bcf540b --- /dev/null +++ b/server/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "server" +version = "0.1.0" +edition = "2021" + +[dependencies] +axum = "0.7" +tokio = { version = "1", features = ["full"] } diff --git a/server/src/main.rs b/server/src/main.rs new file mode 100644 index 0000000..ce454b3 --- /dev/null +++ b/server/src/main.rs @@ -0,0 +1,18 @@ +use axum::{routing::get, Router}; + +#[tokio::main] +async fn main() { + let app = Router::new() + .route("/", get(|| async { "Route principale" })) + .route("/ping", get(|| async { "pong" })); + + let listener = tokio::net::TcpListener::bind("127.0.0.1:3000") + .await + .unwrap(); + + println!("Server sur http://localhost:3000"); + + axum::serve(listener, app) + .await + .unwrap(); +}
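
For quick verification of the Axum server introduced above, a minimal smoke test is sketched below. It is not part of this diff: it assumes the server has been started locally (e.g. `cargo run` inside `server/`) and is listening on `127.0.0.1:3000` as bound in `server/src/main.rs`, and it uses the `requests` library already present elsewhere in the repository.

```python
import requests

# Address bound in server/src/main.rs (assumption: server is already running locally).
BASE_URL = "http://127.0.0.1:3000"


def main() -> None:
    # The root route returns the placeholder text defined in main.rs.
    root = requests.get(f"{BASE_URL}/", timeout=5)
    root.raise_for_status()
    print("GET /     ->", root.text)

    # /ping acts as a simple health check and should answer "pong".
    ping = requests.get(f"{BASE_URL}/ping", timeout=5)
    ping.raise_for_status()
    assert ping.text == "pong", f"unexpected body: {ping.text!r}"
    print("GET /ping ->", ping.text)


if __name__ == "__main__":
    main()
```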