
Commit 716f986

Edouard Silvestre authored and committed
[FIX] unified output format for every scraper
1 parent 2780486 commit 716f986

6 files changed

Lines changed: 606 additions & 0 deletions
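All four scrapers in this commit now return records with the same fields. As a rough illustration only (the field names are taken from the normalize_* functions in the diff below, while the sample values are invented), a unified item looks like this:

# Illustrative sketch of the unified item shape shared by every scraper in this commit.
# Field names come from the normalize_* functions below; the sample values are made up.
unified_item = {
    "id": "https://example.com/some-item",        # stable identifier: link, entry id, or repo full_name
    "source_site": "medium",                      # one of "medium", "arxiv", "le_monde", "github"
    "title": "Some title",
    "description": "Short summary of the item",
    "author_info": "Author name(s) or owner login",
    "keywords": "tag1, tag2",                     # comma-separated keywords or categories
    "content_url": "https://example.com/some-item",
    "published_date": "2024-01-01T00:00:00",      # ISO 8601 timestamp
    "item_type": "article",                       # "article", "paper", or "repository"
}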


scrap/medium_scraping.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
import feedparser
from datetime import datetime
from typing import List, Dict
import time

# Constants for the monitoring tool
SOURCE_SITE = "medium"

RSS_FEEDS = [
    "https://medium.com/feed/tag/artificial-intelligence",
    "https://medium.com/feed/tag/machine-learning",
    "https://medium.com/feed/tag/deep-learning",
    "https://medium.com/feed/tag/ai",
]

def normalize_medium_entry(entry: feedparser.FeedParserDict) -> Dict:
    """Normalize a Medium RSS entry into the unified format."""
    entry_id = entry.get('link', '')

    # Convert the publication date, falling back to the current UTC time
    published_date = datetime.utcnow().isoformat()
    if getattr(entry, "published_parsed", None):
        published_date = datetime.fromtimestamp(time.mktime(entry.published_parsed)).isoformat()

    keywords = [tag.term for tag in entry.get('tags', [])] if 'tags' in entry else []

    return {
        "id": entry_id,
        "source_site": SOURCE_SITE,
        "title": entry.get('title', 'N/A'),
        "description": entry.get('summary', 'N/A'),
        "author_info": entry.get('author', 'N/A'),
        "keywords": ", ".join(keywords),
        "content_url": entry_id,
        "published_date": published_date,
        "item_type": "article",
    }

def scrape_medium(max_articles_per_feed: int = 10) -> List[Dict]:
    """Scrape the Medium RSS feeds and return unified items."""
    all_items = []
    unique_links = set()

    for feed_url in RSS_FEEDS:
        print(f"📡 Fetching RSS: {feed_url}")
        try:
            feed = feedparser.parse(feed_url)

            for entry in feed.entries[:max_articles_per_feed]:
                link = entry.get('link')
                if link and link not in unique_links:
                    all_items.append(normalize_medium_entry(entry))
                    unique_links.add(link)

        except Exception as e:
            print(f"❌ Error fetching {feed_url}: {e}")
        time.sleep(1)  # Pause between RSS calls to stay polite

    return all_items

if __name__ == "__main__":
    results = scrape_medium(max_articles_per_feed=2)
    print(f"Total Medium items scraped: {len(results)}")
    if results:
        print("\nExample of a unified item:")
        import json
        print(json.dumps(results[0], indent=2))

scrap/scrap_arxiv.py

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
import arxiv
from typing import List, Dict

# Constants for the monitoring tool
SOURCE_SITE = "arxiv"
CATEGORY = "cs.LG"

def normalize_arxiv_result(paper: arxiv.Result) -> Dict:
    """Normalize an arXiv result into the unified format."""

    authors = ", ".join([a.name for a in paper.authors])

    # Use the abstract link as the ID/URL
    link = paper.entry_id  # In the arxiv library this is usually the abstract URL

    keywords_list = [paper.primary_category]
    if paper.categories:
        keywords_list.extend(paper.categories)

    return {
        "id": link,
        "source_site": SOURCE_SITE,
        "title": paper.title.replace('\n', ' '),  # Strip line breaks from the title
        "description": paper.summary.replace('\n', ' '),
        "author_info": authors,
        "keywords": ", ".join(keywords_list),
        "content_url": link,
        "published_date": paper.published.isoformat(),
        "item_type": "paper",
    }

def scrape_arxiv(category: str = CATEGORY, max_results: int = 10) -> List[Dict]:
    """Scrape arXiv for a category and return unified items."""

    try:
        search = arxiv.Search(
            query=f"cat:{category}",
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            sort_order=arxiv.SortOrder.Descending
        )

        normalized_results = []
        # Note: recent versions of the arxiv package recommend arxiv.Client().results(search)
        for result in search.results():
            normalized_results.append(normalize_arxiv_result(result))

        return normalized_results

    except Exception as e:
        print(f"[ERROR] arXiv Search: {e}")
        return []

if __name__ == "__main__":
    results = scrape_arxiv(max_results=5)
    print(f"Total arXiv items scraped: {len(results)}")
    if results:
        print("\nExample of a unified item:")
        import json
        print(json.dumps(results[0], indent=2))

scrap/scrap_le_monde.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
import feedparser
import time
from datetime import datetime
from typing import List, Dict

# Constants for the monitoring tool
SOURCE_SITE = "le_monde"

FEEDS = [
    "https://www.lemonde.fr/international/rss_full.xml",
    "https://www.lemonde.fr/actualite-medias/rss_full.xml",
    "https://www.lemonde.fr/en_continu/rss_full.xml"
]

def normalize_lemonde_entry(entry: feedparser.FeedParserDict, feed_url: str) -> Dict:
    """Normalize a Le Monde RSS entry into the unified format."""
    entry_id = getattr(entry, "id", None) or getattr(entry, "link", None)

    # Determine the publication date, falling back to the current UTC time
    published_date = datetime.utcnow().isoformat()
    if getattr(entry, "published_parsed", None):
        published_date = datetime.fromtimestamp(time.mktime(entry.published_parsed)).isoformat()
    elif getattr(entry, "updated_parsed", None):
        published_date = datetime.fromtimestamp(time.mktime(entry.updated_parsed)).isoformat()

    # Derive the category from the feed URL when possible
    category = "actualité générale"
    if "international" in feed_url:
        category = "international"
    elif "medias" in feed_url:
        category = "actualité médias"
    elif "continu" in feed_url:
        category = "en continu"

    return {
        "id": entry_id,
        "source_site": SOURCE_SITE,
        "title": getattr(entry, "title", ""),
        "description": getattr(entry, "summary", ""),
        "author_info": getattr(entry, "author", "Le Monde"),
        "keywords": category,  # Use the feed category as the main keyword
        "content_url": getattr(entry, "link", ""),
        "published_date": published_date,
        "item_type": "article",
    }

def scrape_lemonde(feeds: List[str] = FEEDS) -> List[Dict]:
    """Scrape the Le Monde RSS feeds and return unified items."""
    all_items = []
    unique_ids = set()

    for feed_url in feeds:
        try:
            d = feedparser.parse(feed_url)

            for entry in d.entries:
                entry_id = getattr(entry, "id", None) or getattr(entry, "link", None)
                if entry_id and entry_id not in unique_ids:
                    all_items.append(normalize_lemonde_entry(entry, feed_url))
                    unique_ids.add(entry_id)

        except Exception as e:
            print(f"[ERROR] while fetching feed {feed_url}: {e}")
        time.sleep(1)  # Short pause between feeds

    return all_items

if __name__ == "__main__":
    results = scrape_lemonde(feeds=FEEDS[:1])
    print(f"Total Le Monde items scraped: {len(results)}")
    if results:
        print("\nExample of a unified item:")
        import json
        print(json.dumps(results[0], indent=2))

scrap/scrape_github.py

Lines changed: 142 additions & 0 deletions
@@ -0,0 +1,142 @@
import os
import requests
from datetime import datetime, UTC
from typing import List, Dict

# Constants for the monitoring tool
SOURCE_SITE = "github"
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

# ... (THEMES, HEADERS, RateLimitError, sanitize_text, normalize_github_repo, build_query_for_theme are unchanged) ...

THEMES = [
    "large-language-model", "llm", "transformer", "text-generation", "retrieval-augmented-generation",
    "rag", "agents", "chatbot", "fine-tuning", "quantization", "lora", "peft",
    "diffusion", "stable-diffusion", "image-generation", "multimodal",
    "speech-to-text", "speech-synthesis", "audio", "reinforcement-learning",
    "computer-vision",
]

HEADERS = {
    "Accept": "application/vnd.github+json",
    "User-Agent": "github-ai-theme-watcher/1.0"
}
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"

class RateLimitError(Exception):
    def __init__(self, retry_after=None):
        self.retry_after = retry_after
        super().__init__("Rate limit hit on GitHub API. Retry after: {}".format(retry_after))

def sanitize_text(s):
    return str(s) if s is not None else ""

def normalize_github_repo(repo: Dict, theme: str) -> Dict:
    """Normalize a GitHub repository into the unified format."""
    full_name = repo.get("full_name")
    keywords_list = [theme, repo.get("language") or ""]
    if repo.get("topics"):
        keywords_list.extend(repo.get("topics"))
    updated_at = repo.get("updated_at") or repo.get("pushed_at") or datetime.now(UTC).isoformat()
    return {
        "id": full_name,
        "source_site": SOURCE_SITE,
        "title": repo.get("name"),
        "description": sanitize_text(repo.get("description")),
        "author_info": repo.get("owner", {}).get("login", ""),
        "keywords": ", ".join(filter(None, keywords_list)),
        "content_url": repo.get("html_url") or f"https://github.com/{full_name}",
        "published_date": updated_at,
        "item_type": "repository",
    }

def build_query_for_theme(theme: str) -> str:
    theme_token = theme.replace(" ", "+")
    q = f"{theme_token} in:name,description,readme stars:>50"
    return q


def search_github_repos(query: str, per_page: int = 20) -> List[Dict]:
    """
    Search GitHub repositories.
    Raises RateLimitError or returns a List[Dict] (empty or populated).
    """
    url = "https://api.github.com/search/repositories"
    params = {
        "q": query,
        "sort": "stars",
        "order": "desc",
        "per_page": per_page
    }

    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=20)

        if resp.status_code == 403:
            retry_after = resp.headers.get("Retry-After")
            # Raise so scrape_github can break out of its theme loop
            raise RateLimitError(retry_after=int(retry_after) if retry_after and retry_after.isdigit() else None)

        # 🎯 Key fix in this block:
        # resp.raise_for_status() could be used to catch generic 4xx/5xx errors,
        # but for robustness we check the status code explicitly before parsing the JSON.

        if resp.status_code != 200:
            # For any non-403 error, log and return an empty list
            print(f"[WARN] HTTP Status {resp.status_code} for query: {query}")
            return []

        # Status is 200: parse the JSON payload
        data = resp.json()
        return data.get("items", [])

    except RateLimitError:
        raise  # Re-raise RateLimitError
    except requests.exceptions.RequestException as e:
        print(f"[CONNECTION/HTTP ERROR] GitHub Search: {e}")
        return []
    except Exception as e:
        print(f"[UNKNOWN/JSON ERROR] GitHub Search: {e}")
        return []


def scrape_github(themes: List[str] = THEMES, limit_per_theme: int = 20) -> List[Dict]:
    """Scrape GitHub for the given themes and return unified items."""

    all_items = []
    stop_scraping = False  # Control flag

    for theme in themes:
        if stop_scraping:
            break

        q = build_query_for_theme(theme)
        print(f"-> Searching theme '{theme}' (q={q})")

        try:
            items = search_github_repos(q, limit_per_theme)

            # Extra safety: stop if the helper returns something other than a list
            if not isinstance(items, list):
                print(f"[FATAL WARN] search_github_repos returned {type(items)} instead of list. Stopping.")
                stop_scraping = True
                continue

            normalized_items = [normalize_github_repo(repo, theme) for repo in items]
            all_items.extend(normalized_items)

        except RateLimitError:
            # Handle the rate-limit error specifically
            print("[RATE LIMIT] Limit reached. Stopping the GitHub watch for this run.")
            stop_scraping = True
        except Exception as e:
            # Handle any other theme-level exception (unlikely at this point)
            print(f"[THEME ERROR] '{theme}': {e}")
            continue

    return all_items

if __name__ == "__main__":
    results = scrape_github(themes=["llm"], limit_per_theme=5)
    print(f"Total GitHub items scraped: {len(results)}")
    if results:
        import json
        print("\nExample of a unified item:")
        print(json.dumps(results[0], indent=2))
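
Because every scraper returns the same unified dictionaries, the results can be aggregated with a plain list concatenation. A minimal sketch, assuming the scrap directory is importable as a package and that the feed and API calls succeed:

# Minimal aggregation sketch (assumes the scrap package is on the import path).
from scrap.medium_scraping import scrape_medium
from scrap.scrap_arxiv import scrape_arxiv
from scrap.scrap_le_monde import scrape_lemonde
from scrap.scrape_github import scrape_github

def scrape_all() -> list:
    """Run every scraper and concatenate their unified items."""
    items = []
    items.extend(scrape_medium(max_articles_per_feed=5))
    items.extend(scrape_arxiv(max_results=5))
    items.extend(scrape_lemonde())
    items.extend(scrape_github(themes=["llm"], limit_per_theme=5))
    return items

if __name__ == "__main__":
    print(f"Total unified items: {len(scrape_all())}")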
