1+ import os
2+ import requests
3+ from datetime import datetime , UTC
4+ from typing import List , Dict
5+
# Constants for the watcher tool.
SOURCE_SITE = "github"
# Optional personal access token; raises GitHub API rate limits when present.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

# AI/ML themes (GitHub topics/keywords) to watch.
THEMES = [
    "large-language-model", "llm", "transformer", "text-generation", "retrieval-augmented-generation",
    "rag", "agents", "chatbot", "fine-tuning", "quantization", "lora", "peft",
    "diffusion", "stable-diffusion", "image-generation", "multimodal",
    "speech-to-text", "speech-synthesis", "audio", "reinforcement-learning",
    "computer-vision",
]

# Base headers sent with every GitHub API request.
HEADERS = {
    "Accept": "application/vnd.github+json",
    "User-Agent": "github-ai-theme-watcher/1.0",
}
if GITHUB_TOKEN:
    # Fix: the original f-string had a stray trailing space inside the header
    # value ("Bearer <token> "), which can break token authentication.
    HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"
26+
class RateLimitError(Exception):
    """Raised when the GitHub API reports that the rate limit was hit.

    Attributes:
        retry_after: seconds to wait before retrying, or None when the
            server did not say.
    """

    def __init__(self, retry_after=None):
        message = "Rate limit hit on GitHub API. Retry after: {}".format(retry_after)
        super().__init__(message)
        self.retry_after = retry_after
31+
def sanitize_text(s):
    """Coerce *s* to str, mapping None to the empty string."""
    if s is None:
        return ""
    return str(s)
34+
def normalize_github_repo(repo: Dict, theme: str) -> Dict:
    """Map a raw GitHub repository payload to the unified watch-item schema.

    Args:
        repo: one repository dict as returned by the GitHub search API.
        theme: the theme keyword that produced this hit.

    Returns:
        A dict with the unified keys (id, source_site, title, description,
        author_info, keywords, content_url, published_date, item_type).
    """
    full_name = repo.get("full_name")

    # Keywords: the theme, the main language, then any repository topics.
    keywords_list = [theme, repo.get("language") or ""]
    keywords_list.extend(repo.get("topics") or [])

    # Prefer updated_at, then pushed_at; fall back to "now" (UTC, ISO-8601).
    updated_at = repo.get("updated_at") or repo.get("pushed_at") or datetime.now(UTC).isoformat()

    # Fix: "owner" can be present but null in API payloads; the original
    # repo.get("owner", {}) would then crash on .get("login").
    owner = repo.get("owner") or {}

    return {
        "id": full_name,
        "source_site": SOURCE_SITE,
        "title": repo.get("name"),
        "description": sanitize_text(repo.get("description")),
        "author_info": owner.get("login", ""),
        "keywords": ", ".join(filter(None, keywords_list)),
        # Fix: the original fallback URL f-string carried a trailing space.
        "content_url": repo.get("html_url") or f"https://github.com/{full_name}",
        "published_date": updated_at,
        "item_type": "repository",
    }
47+
def build_query_for_theme(theme: str) -> str:
    """Build a GitHub search query string for *theme*.

    Spaces in the theme are replaced with '+' so it stays one search token;
    results are restricted to name/description/readme matches with >50 stars.
    """
    token = theme.replace(" ", "+")
    return f"{token} in:name,description,readme stars:>50"
52+
53+
def search_github_repos(query: str, per_page: int = 20) -> List[Dict]:
    """Search GitHub repositories matching *query*.

    Args:
        query: a GitHub search query (see build_query_for_theme).
        per_page: maximum number of results to request (GitHub caps at 100).

    Returns:
        The list of raw repository dicts (possibly empty).

    Raises:
        RateLimitError: when the API answers 403 or 429 (rate limited),
            carrying the Retry-After delay in seconds when the server sent it.
    """
    url = "https://api.github.com/search/repositories"
    params = {
        "q": query,
        "sort": "stars",
        "order": "desc",
        "per_page": per_page,
    }

    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=20)

        # Fix: GitHub signals rate limiting with 403 (primary) and also with
        # 429 (secondary limits); both must trigger the caller's back-off
        # instead of being logged as a generic warning.
        if resp.status_code in (403, 429):
            retry_after = resp.headers.get("Retry-After")
            delay = int(retry_after) if retry_after and retry_after.isdigit() else None
            # Raised so scrape_github can break out of its theme loop.
            raise RateLimitError(retry_after=delay)

        if resp.status_code != 200:
            # Any other non-success status: log and return an empty result.
            print(f"[WARN] HTTP Status {resp.status_code} for query: {query}")
            return []

        # Status 200: parse the JSON payload.
        data = resp.json()
        return data.get("items", [])

    except RateLimitError:
        raise  # Propagate so the caller can stop scraping.
    except requests.exceptions.RequestException as e:
        print(f"[ERREUR CONNEXION/HTTP] GitHub Search: {e}")
        return []
    except Exception as e:
        print(f"[ERREUR INCONNUE/JSON] GitHub Search: {e}")
        return []
96+
97+
def scrape_github(themes: List[str] = THEMES, limit_per_theme: int = 20) -> List[Dict]:
    """Scrape GitHub for the given themes and return unified items.

    Args:
        themes: list of theme keywords to search for.
        limit_per_theme: maximum repositories fetched per theme.

    Returns:
        The normalized items collected so far; partial results are kept
        when a rate limit interrupts the run.
    """
    all_items: List[Dict] = []

    for theme in themes:
        q = build_query_for_theme(theme)
        print(f"-> Recherche thème '{theme}' (q={q})")

        try:
            items = search_github_repos(q, limit_per_theme)

            # Defensive check in case the helper returns something unexpected.
            if not isinstance(items, list):
                print(f"[FATAL WARN] search_github_repos a retourné {type(items)} au lieu de list. Arrêt.")
                break

            all_items.extend(normalize_github_repo(repo, theme) for repo in items)

        except RateLimitError:
            # Rate limit reached: stop this iteration, keep partial results.
            print(f"[RATE LIMIT] Limite atteinte. Arrêt de la veille GitHub pour cette itération.")
            break
        except Exception as e:
            # Any other per-theme failure: log it and move on to the next theme.
            print(f"[ERREUR THÈME] '{theme}': {e}")
            continue

    # Fix: the original wrapped the loop in try/finally with `return` inside
    # the finally block, which silently swallows every in-flight exception
    # (including KeyboardInterrupt/SystemExit). A plain return keeps the same
    # happy-path behavior without suppressing errors; the stop_scraping flag
    # is replaced by direct `break` with identical control flow.
    return all_items
135+
if __name__ == "__main__":
    # Smoke run: scrape a single theme with a small page size.
    demo_items = scrape_github(themes=["llm"], limit_per_theme=5)
    print(f"Total GitHub items scraped: {len(demo_items)}")
    if demo_items:
        import json

        print("\n Exemple d'élément unifié:")
        print(json.dumps(demo_items[0], indent=2))