1010import re
1111import logging
1212import subprocess
13- from typing import Dict , Tuple
14- import tempfile
13+ from typing import Dict , List , Tuple
1514
1615import asyncio
1716
@@ -196,75 +195,82 @@ def setup_and_config() -> Tuple[argparse.Namespace, Dict]:
196195 sys .exit (1 )
197196
198197
199- def transform_publisso_to_publisso_schemaorg ():
198+ def transform_publisso_to_publisso_schemaorg (
199+ input_file : Path ,
200+ original_report : Dict
201+ ) -> Tuple [List [Path ], List [Dict ]]:
200202 """
201203 Transform the Publisso metadata to schema.org format.
202204 """
203205
204206 # Archivos
205- input_file = Path ( "./output/ publisso.json"). resolve ()
206- jq_script = Path ("./ scripts/publiso_conversor.jq"). resolve ()
207+ output_file = input_file . parent / " publisso.json"
208+ jq_script = Path (__file__ ). parent / " scripts/publiso_conversor.jq"
207209
208210 if not input_file .exists ():
209- print (f"❌ Archivo de entrada no encontrado: { input_file } " )
210- return
211-
212- # Crear directorio de salida si no existe
213- input_file .parent .mkdir (parents = True , exist_ok = True )
214-
215- # Usar archivo temporal para el resultado
216- with tempfile .NamedTemporaryFile (
217- mode = "w" , delete = False , dir = input_file .parent , suffix = ".json"
218- ) as tmp_file :
219- tmp_path = Path (tmp_file .name )
211+ logging .warning ("❌ Archivo de entrada no encontrado: %s" , input_file )
212+ return ([],[])
220213
221214 try :
222215 # Ejecutar jq en memoria usando context managers so resources are cleaned up
223216 with subprocess .Popen (
224217 ["jq" , "-f" , str (jq_script ), str (input_file )],
225218 stdout = subprocess .PIPE ,
219+ stderr = subprocess .PIPE ,
226220 text = True
227221 ) as p1 :
228- with open (tmp_path , "w" , encoding = "utf-8" ) as outfile :
222+ with open (output_file , "w" , encoding = "utf-8" ) as outfile :
229223 with subprocess .Popen (
230224 ["jq" , "-s" , "." ],
231225 stdin = p1 .stdout ,
232226 stdout = outfile ,
227+ stderr = subprocess .PIPE ,
233228 text = True
234229 ) as p2 :
235230 if p1 .stdout is not None :
236231 p1 .stdout .close () # Permite que p1 reciba SIGPIPE si p2 falla
237- p2 .communicate ()
232+ _ , err2 = p2 .communicate ()
233+ _ , err1 = p1 .communicate ()
238234
239235 if p1 .wait () != 0 :
240- raise RuntimeError (f"jq script failed with exit code { p1 .returncode } " )
236+ logging .error (
237+ "jq script failed with exit code %s. stderr: %s" ,
238+ p1 .returncode , err1 )
239+ # Do not return, just log the error and continue, as it could be
240+ # that the output is still valid
241+ # return ([],[])
241242 if p2 .returncode != 0 :
242- raise RuntimeError (f"jq merge failed with exit code { p2 .returncode } " )
243-
244- # Reemplazar archivo original
245- os .remove (input_file ) # Eliminar input original
246- tmp_path .rename (input_file ) # Renombrar temp como input original
243+ logging .error (
244+ "jq merge failed with exit code %s. stderr: %s" ,
245+ p2 .returncode , err2 )
246+ # Do not return, just log the error and continue, as it could be
247+ # that the output is still valid
248+ # return ([],[])
249+ except subprocess .CalledProcessError as e :
250+ logging .error ("❌ Error al ejecutar jq: %s" , str (e ))
251+ return ([],[])
247252
248- print (
249- f"✅ Transformación completada, archivo actualizado: { input_file } " )
253+ logging .info (
254+ "✅ Transformación completada, archivo actualizado: %s" , input_file
255+ )
250256
251- except subprocess .CalledProcessError as e :
252- print (f"❌ Error al ejecutar jq: { e } " )
253- if tmp_path .exists ():
254- tmp_path .unlink ()
257+ repo_report = [{"repo_name" : "publisso" , ** original_report }]
258+ return ([output_file ], repo_report )
255259
256260
257- def extract_thunen_from_openagrar_metadata ():
261+ def extract_thunen_from_openagrar_metadata (
262+ input_file : Path
263+ ) -> Tuple [List [Path ], List [Dict ]]:
258264 """
259265 Extract Thünen metadata from OpenAgrar metadata.
260266 """
261267 # Configuration
262- input_file = Path ( "./output/openagrar .json"). resolve ()
263- output_file = Path ( "./output/thunen .json"). resolve ()
268+ thunen_out = input_file . parent / "thunen_atlas .json"
269+ openagrar_out = input_file . parent / "openagrar .json"
264270
265271 if not input_file .exists ():
266- print ( f "❌ Archivo de entrada no encontrado: { input_file } " )
267- return
272+ logging . warning ( "❌ Archivo de entrada no encontrado: %s" , input_file )
273+ return ([],[])
268274 # Regex pattern to match publisher synonyms
269275 publisher_pattern = re .compile (
270276 r"Thünen[- ]?Institut|Thuenen Institute|Thünen-Atlas" , re .IGNORECASE
@@ -273,7 +279,7 @@ def extract_thunen_from_openagrar_metadata():
273279 # Load JSON
274280 with open (input_file , "r" , encoding = "utf-8" ) as f :
275281 data = json .load (f )
276- print ( f "Found { len ( data ) } datasets in { input_file } " )
282+ logging . info ( "Found %s datasets in %s" , len ( data ), input_file )
277283 # Separate datasets
278284 filtered = []
279285 remaining = []
@@ -285,15 +291,31 @@ def extract_thunen_from_openagrar_metadata():
285291 remaining .append (dataset )
286292
287293 # Write filtered datasets to new file
288- with open (output_file , "w" , encoding = "utf-8" ) as f :
294+ with open (thunen_out , "w" , encoding = "utf-8" ) as f :
289295 json .dump (filtered , f , ensure_ascii = False , indent = 2 )
290296
291297 # Update original file with remaining datasets
292- with open (input_file , "w" , encoding = "utf-8" ) as f :
298+ with open (openagrar_out , "w" , encoding = "utf-8" ) as f :
293299 json .dump (remaining , f , ensure_ascii = False , indent = 2 )
294300
295- print (f"Extracted { len (filtered )} datasets to { output_file } " )
296- print (f"{ len (remaining )} datasets remain in { input_file } " )
301+ logging .info ("Extracted %s datasets to %s" , len (filtered ), thunen_out )
302+ logging .info ("%s datasets remain in %s" , len (remaining ), openagrar_out )
303+
304+ return (
305+ [thunen_out , openagrar_out ],
306+ [
307+ {
308+ "repo_name" : "thunen_atlas" ,
309+ "valid_entries" : len (filtered ),
310+ "failed_entries" : 0
311+ },
312+ {
313+ "repo_name" : "openagrar" ,
314+ "valid_entries" : len (remaining ),
315+ "failed_entries" : 0
316+ }
317+ ]
318+ )
297319
298320
299321async def main ():
@@ -309,11 +331,13 @@ async def main():
309331 if args .git :
310332 git_config = GitRepoConfig (** config ["git" ])
311333 git_repo = GitRepo (git_config )
312- local_path = git_repo .working_dir
334+ local_path = Path ( git_repo .working_dir )
313335 git_repo .pull ()
314336 else :
315337 git_repo = None
316- local_path = config .get ("git" , {}).get ("local_path" , "/tmp/middleware_git" )
338+ local_path = Path (
339+ config .get ("git" , {}).get ("local_path" , "/tmp/middleware_git" )
340+ )
317341 os .makedirs (local_path , exist_ok = True )
318342
319343 default_http_config = HttpSessionConfig (** config ["http_client" ])
@@ -322,18 +346,27 @@ async def main():
322346 for sitemap in config ["sitemaps" ]:
323347 scraper_config = MetadataScraperConfig (** sitemap )
324348 path , starttime , repo_report = await scrape_repo_and_write_to_file (
325- Path ( local_path ) , scraper_config , default_http_config
349+ local_path , scraper_config , default_http_config
326350 )
327- full_report += [{"repo_name" : sitemap ["name" ], ** repo_report }]
328- if sitemap ["name" ] == "publisso" :
329- transform_publisso_to_publisso_schemaorg ()
330- if sitemap ["name" ] == "openagrar" :
331- extract_thunen_from_openagrar_metadata ()
332- commit = sitemap .get ("commit" , True )
351+ # Ugly logic to perform transformations for specific repos.
352+ # This should be replaced by a more generic mechanism in the future.
353+ if "publisso" in scraper_config .name :
354+ paths , repo_reports = transform_publisso_to_publisso_schemaorg (
355+ path , repo_report )
356+ commit = True
357+ elif "openagrar" in scraper_config .name :
358+ paths , repo_reports = extract_thunen_from_openagrar_metadata (path )
359+ commit = True
360+ else :
361+ paths = [path ]
362+ repo_reports = [{"repo_name" : sitemap ["name" ], ** repo_report }]
363+ commit = sitemap .get ("commit" , True )
364+ full_report += repo_reports
333365 if git_repo and commit :
334366 # if a git repo is set, commit all files except those that are explicitly
335367 # excluded
336- commit_to_git (scraper_config .url , git_repo , path , starttime )
368+ for path in paths :
369+ commit_to_git (scraper_config .url , git_repo , path , starttime )
337370
338371 if git_repo :
339372 git_repo .push ()
0 commit comments