11#!/usr/bin/env python3
22"""
3- Script pour générer une page de citations depuis Google Scholar
3+ Script to generate a citations page from Google Scholar
44"""
55
66import argparse
@@ -29,6 +29,59 @@ def extract_year(citation: Dict) -> int:
2929 return 0
3030
3131
def normalize_title(title: str) -> str:
    """
    Normalize a citation title for duplicate detection.

    The title is Unicode-decomposed (NFKD) and case-folded, then every
    character that is neither a letter nor a digit is dropped. Decomposing
    first means an accented letter contributes its base letter (the
    combining mark is discarded), so "Méthode" and "Methode" yield the same
    key, and titles in non-Latin scripts keep their letters instead of
    collapsing to an empty string.

    Args:
        title: Raw citation title. A falsy value (``''`` or ``None``)
            yields ``''``.

    Returns:
        The folded title reduced to its alphanumeric characters.
    """
    # Function-scope import keeps this helper usable without touching the
    # module's import block.
    import unicodedata

    if not title:
        return ''
    folded = unicodedata.normalize('NFKD', title).casefold()
    # Combining marks, punctuation and whitespace all fail isalnum().
    return ''.join(ch for ch in folded if ch.isalnum())
38+
39+
def extract_author_lastnames(authors_pub: str) -> frozenset:
    """
    Extract the set of author last names from an ``authors_pub`` string.

    Everything before the first ' - ' is taken as the author list (the whole
    string when there is no such separator). The list is split on commas and
    the last whitespace-separated word of each entry is used as the last
    name. Each name is NFKD-decomposed and case-folded and only its letters
    are kept, so "Müller" becomes "muller" rather than losing the accented
    letter altogether.

    Args:
        authors_pub: Author/publication line of a citation. A falsy value
            (``''`` or ``None``) yields an empty frozenset.

    Returns:
        A frozenset of folded last names; entries that contain no letters
        (for example a lone ellipsis) are skipped.
    """
    # Function-scope import keeps this helper usable without touching the
    # module's import block.
    import unicodedata

    if not authors_pub:
        return frozenset()

    # NOTE(review): scraped bylines may carry a non-breaking space before the
    # hyphen ("…Cots\u00a0- Journal"); treating it as a plain space lets the
    # separator below match — confirm against the scraper's actual output.
    byline = authors_pub.replace('\u00a0', ' ')
    author_part = byline.partition(' - ')[0]

    lastnames = set()
    for token in author_part.split(','):
        # Drop a trailing ellipsis left by a truncated author list.
        words = token.strip().rstrip('\u2026 ').split()
        if not words:
            continue
        folded = unicodedata.normalize('NFKD', words[-1]).casefold()
        letters = ''.join(ch for ch in folded if ch.isalpha())
        if letters:
            lastnames.add(letters)
    return frozenset(lastnames)
56+
57+
def deduplicate_citations(citations: List[Dict]) -> List[Dict]:
    """
    Remove duplicate citations, keeping the first occurrence of each.

    Two citations are duplicates when their normalized titles and their
    author last-name sets are both equal. The first occurrence is copied
    into the result with an 'alt_urls' list; each later duplicate is
    recorded there as ``{'url': ..., 'authors_pub': ...}`` instead of being
    emitted on its own.

    A citation whose normalized title is empty is never merged: with no
    title to compare, equal keys would only mean "both unknown", not "same
    paper".

    Args:
        citations: Citation dicts. The keys read here are 'title',
            'authors_pub', 'url' and an optional pre-existing 'alt_urls'.

    Returns:
        A new list of shallow-copied citation dicts in first-seen order.
        The input dicts and any 'alt_urls' list they hold are not mutated.
    """
    primaries: dict = {}
    result: List[Dict] = []
    for citation in citations:
        # `or ''` also covers a key that is present but set to None.
        title_key = normalize_title(citation.get('title') or '')
        authors_key = extract_author_lastnames(citation.get('authors_pub') or '')
        key = (title_key, authors_key)

        primary = primaries.get(key) if title_key else None
        if primary is None:
            primary = dict(citation)
            # Fresh list so later appends never reach the caller's data.
            primary['alt_urls'] = list(primary.get('alt_urls') or [])
            result.append(primary)
            if title_key:
                primaries[key] = primary
        else:
            primary['alt_urls'].append({
                'url': citation.get('url', ''),
                'authors_pub': citation.get('authors_pub', ''),
            })
    return result
83+
84+
3285def sort_citations_by_year (citations : List [Dict ]) -> List [Dict ]:
3386 """
3487 Sort citations by year (most recent first)
@@ -45,15 +98,15 @@ def sort_citations_by_year(citations: List[Dict]) -> List[Dict]:
4598def generate_web_page (citations : List [Dict ], scholar_url : str ,
4699 output_file : str = "about/citations.md" ):
47100 """
48- Génère une page web Markdown avec front matter Jekyll pour le site control-toolbox
101+ Generate a Markdown web page with Jekyll front matter for the control-toolbox site.
49102
50103 Args:
51- citations: Liste des citations
52- scholar_url: URL Google Scholar utilisée
53- output_file: Nom du fichier de sortie
104+ citations: List of citations
105+ scholar_url: Google Scholar URL used
106+ output_file: Output file name
54107 """
55108 if not citations :
56- print ("❌ Impossible de générer la page web : aucune citation trouvée ." )
109+ print ("❌ Cannot generate the web page: no citations found ." )
57110 return
58111
59112 with open (output_file , 'w' , encoding = 'utf-8' ) as f :
@@ -64,33 +117,33 @@ def generate_web_page(citations: List[Dict], scholar_url: str,
64117 f .write ("permalink: /citations/\n " )
65118 f .write ("---\n \n " )
66119
67- # Lien CSS personnalisé
120+ # Custom CSS link
68121 f .write ('<link rel="stylesheet" href="/assets/css/citations.css">\n \n ' )
69122
70- # Conteneur principal
123+ # Main container
71124 f .write ('<div class="citations-page">\n \n ' )
72125
73- # En-tête
126+ # Header
74127 f .write ('<div class="citations-header">\n ' )
75128 f .write ('<h1>📚 Citations</h1>\n ' )
76129 f .write ('<p class="subtitle">Academic papers citing OptimalControl.jl and related control-toolbox projects</p>\n ' )
77130 f .write (f'<p class="last-update">Last updated: { datetime .now ().strftime ("%B %d, %Y at %H:%M UTC" )} </p>\n ' )
78131 f .write ('</div>\n \n ' )
79132
80- # Carte de résumé
133+ # Summary cards
81134 f .write ('<div class="summary-cards">\n ' )
82135 f .write ('<div class="summary-card card-citations">\n ' )
83136 f .write ('<div class="card-label">Total Citations</div>\n ' )
84137 f .write (f'<div class="card-value">{ len (citations )} </div>\n ' )
85138 f .write ('</div>\n ' )
86139 f .write ('</div>\n \n ' )
87140
88- # Note d'information
141+ # Info note
89142 f .write ('<div class="info-box">\n ' )
90143 f .write ('<p><strong>Note:</strong> This page lists academic papers that cite OptimalControl.jl and related control-toolbox projects. Data is automatically retrieved from Google Scholar.</p>\n ' )
91144 f .write ('</div>\n \n ' )
92145
93- # Liste des citations
146+ # Citations list
94147 f .write ('<div class="citations-section">\n ' )
95148 f .write ('<h2>📖 Citations</h2>\n ' )
96149 f .write ('<div class="citations-list">\n ' )
@@ -116,32 +169,42 @@ def generate_web_page(citations: List[Dict], scholar_url: str,
116169 if snippet :
117170 f .write (f'<p class="citation-snippet">{ snippet } </p>\n ' )
118171
172+ # Alternate versions (duplicates merged)
173+ for j , alt in enumerate (citation .get ('alt_urls' , []), 2 ):
174+ alt_url = alt .get ('url' , '' )
175+ alt_authors = alt .get ('authors_pub' , '' )
176+ if alt_url :
177+ f .write (f'<p class="citation-alt"><a href="{ alt_url } " target="_blank">Version { j } </a>' )
178+ if alt_authors :
179+ f .write (f' — { alt_authors } ' )
180+ f .write ('</p>\n ' )
181+
119182 f .write ('</div>\n ' )
120183
121184 f .write ('</div>\n ' )
122185 f .write ('</div>\n \n ' )
123186
124- # Pied de page
187+ # Footer
125188 f .write ('<hr>\n ' )
126189 f .write ('<p style="text-align: center; color: #6a737d; font-size: 0.9rem;">\n ' )
127190 f .write ('<em>🤖 This page is automatically generated from Google Scholar and updated weekly.</em>\n ' )
128191 f .write ('</p>\n \n ' )
129192
130- # Fermer le conteneur
193+ # Close container
131194 f .write ('</div>\n ' )
132195
133- print (f"\n ✅ Page web générée : { output_file } " )
196+ print (f"\n ✅ Web page generated : { output_file } " )
134197 return output_file
135198
136199
137200def main ():
138- """Fonction principale """
201+ """Main function """
139202
140203 parser = argparse .ArgumentParser (
141- description = "Génère une page de citations depuis Google Scholar" ,
204+ description = "Generate a citations page from Google Scholar" ,
142205 formatter_class = argparse .RawDescriptionHelpFormatter ,
143206 epilog = """
144- Exemples d'utilisation :
207+ Usage examples :
145208 python generate_citations_page.py
146209 python generate_citations_page.py --url "https://scholar.google.com/scholar?cites=PAPER_ID"
147210 python generate_citations_page.py --max-pages 10
@@ -151,58 +214,67 @@ def main():
151214 parser .add_argument (
152215 '--url' ,
153216 type = str ,
154- help = 'URL Google Scholar pour les citations (défaut : OptimalControl.jl)'
217+ help = 'Google Scholar URL for citations (default : OptimalControl.jl)'
155218 )
156219
157220 parser .add_argument (
158221 '--max-pages' ,
159222 type = int ,
160223 default = 5 ,
161- help = 'Nombre maximum de pages à récupérer (défaut : 5)'
224+ help = 'Maximum number of pages to fetch (default : 5)'
162225 )
163226
164227 parser .add_argument (
165228 '--output' ,
166229 type = str ,
167230 default = 'about/citations.md' ,
168- help = 'Nom du fichier de sortie (défaut : about/citations.md)'
231+ help = 'Output file name (default : about/citations.md)'
169232 )
170233
171234 args = parser .parse_args ()
172235
173- # URL par défaut pour OptimalControl.jl
236+ # Default URL for OptimalControl.jl
174237 if args .url :
175238 scholar_url = args .url
176239 else :
177240 scholar_url = "https://scholar.google.com/scholar?cites=1899455738170204200"
178241
179242 print ("\n " + "=" * 70 )
180- print ("🔍 RÉCUPÉRATION DES CITATIONS" )
243+ print ("🔍 FETCHING CITATIONS" )
181244 print ("=" * 70 )
182- print (f"URL Google Scholar : { scholar_url } " )
183- print (f"Pages maximum : { args .max_pages } " )
245+ print (f"Google Scholar URL : { scholar_url } " )
246+ print (f"Maximum pages : { args .max_pages } " )
184247 print ("=" * 70 + "\n " )
185248
186- # Récupération des citations
249+ # Fetch citations
187250 scraper = ScholarCitationScraper (scholar_url )
188251 citations = scraper .get_citations (max_pages = args .max_pages )
189252
190- print (f"\n Total citations récupérées : { len (citations )} " )
253+ print (f"\n Total citations retrieved : { len (citations )} " )
191254
192255 if not citations :
193- print ("❌ Aucune citation trouvée ." )
256+ print ("❌ No citations found ." )
194257 return
195258
196- # Trier par année (plus récent en premier )
259+ # Sort by year (most recent first )
197260 citations = sort_citations_by_year (citations )
198- print ("✅ Citations triées par année (plus récent en premier)" )
261+ print ("✅ Citations sorted by year (most recent first)" )
262+
263+ # Remove duplicates
264+ citations_before = len (citations )
265+ citations = deduplicate_citations (citations )
266+ removed = citations_before - len (citations )
267+ if removed :
268+ print (f"✅ { removed } duplicate(s) removed ({ len (citations )} unique citations)" )
269+ else :
270+ print ("✅ No duplicates detected" )
199271
200- # Obtenir le répertoire racine du projet (2 niveaux au-dessus du script)
272+ # Get project root directory (2 levels above the script)
201273 script_dir = os .path .dirname (os .path .abspath (__file__ ))
202274 project_root = os .path .dirname (os .path .dirname (script_dir ))
203275 web_output = os .path .join (project_root , args .output )
204276
205- # Génération de la page web
277+ # Generate the web page
206278 generate_web_page (citations , scholar_url , web_output )
207279
208280
0 commit comments