@@ -317,6 +317,52 @@ def extract_thunen_from_openagrar_metadata(
317317 ]
318318 )
319319
async def setup_repo(args, config: dict) -> "tuple[GitRepo | None, Path]":
    """
    Set up the git repository if requested and determine the output directory.

    Args:
        args: Parsed command line arguments; only the ``git`` attribute is
            read. A truthy value requests git support.
        config: Application configuration mapping. With git requested, the
            ``"git"`` entry is expanded into ``GitRepoConfig``. Otherwise an
            optional ``"git"`` -> ``"local_path"`` entry selects the output
            directory, falling back to ``/tmp/middleware_git``.

    Returns:
        A ``(git_repo, local_path)`` tuple. ``git_repo`` is ``None`` when git
        was not requested; ``local_path`` is the directory to write into.

    Raises:
        KeyError: If git was requested but ``config`` has no ``"git"`` entry.
    """

    if args.git:
        git_config = GitRepoConfig(**config["git"])
        git_repo = GitRepo(git_config)
        local_path = Path(git_repo.working_dir)
        # Bring the working tree up to date before new files are written.
        git_repo.pull()
    else:
        git_repo = None
        # ``config.get("git", {})`` only covers a *missing* key. An explicitly
        # empty section (``git:`` in YAML loads as ``None``) would break the
        # chained ``.get`` call, so treat ``None`` like an absent section.
        git_section = config.get("git") or {}
        local_path = Path(git_section.get("local_path", "/tmp/middleware_git"))

    # Make sure the output directory exists; this is a harmless no-op for a
    # directory that is already present (e.g. an existing git working tree).
    local_path.mkdir(parents=True, exist_ok=True)
    return git_repo, local_path
335+
336+
async def process_sitemap(sitemap, local_path, default_http_config, git_repo):
    """
    Scrape one configured sitemap and optionally commit the resulting files.

    Args:
        sitemap: Mapping describing a single sitemap; it is expanded into
            ``MetadataScraperConfig`` and its ``"name"`` and optional
            ``"commit"`` entries are read directly.
        local_path: Directory handed to ``scrape_repo_and_write_to_file``.
        default_http_config: HTTP session configuration handed to
            ``scrape_repo_and_write_to_file``.
        git_repo: Git repository passed to ``commit_to_git``, or a falsy
            value to skip committing altogether.

    Returns:
        The report entries produced for this sitemap.
    """

    scraper_config = MetadataScraperConfig(**sitemap)
    scraped_path, starttime, scrape_report = await scrape_repo_and_write_to_file(
        local_path, scraper_config, default_http_config
    )

    # Known repositories need dedicated post-processing, selected by a
    # substring of the configured name. A more generic mechanism should
    # replace these special cases in the future.
    repo_name = scraper_config.name
    if "publisso" in repo_name:
        output_paths, repo_reports = transform_publisso_to_publisso_schemaorg(
            scraped_path, scrape_report
        )
        should_commit = True
    elif "openagrar" in repo_name:
        output_paths, repo_reports = extract_thunen_from_openagrar_metadata(
            scraped_path
        )
        should_commit = True
    else:
        output_paths = [scraped_path]
        repo_reports = [{"repo_name": sitemap["name"], **scrape_report}]
        # Only ordinary sitemaps can opt out of being committed.
        should_commit = sitemap.get("commit", True)

    if not (git_repo and should_commit):
        return repo_reports

    for output_path in output_paths:
        commit_to_git(scraper_config.url, git_repo, output_path, starttime)

    return repo_reports
365+
320366
321367async def main ():
322368 """
@@ -327,54 +373,23 @@ async def main():
327373
328374 with trace .get_tracer (__name__ ).start_as_current_span ("main" ) as otel_span :
329375 try :
330- # setup git repo if desired
331- if args .git :
332- git_config = GitRepoConfig (** config ["git" ])
333- git_repo = GitRepo (git_config )
334- local_path = Path (git_repo .working_dir )
335- git_repo .pull ()
336- else :
337- git_repo = None
338- local_path = Path (
339- config .get ("git" , {}).get ("local_path" , "/tmp/middleware_git" )
340- )
341- os .makedirs (local_path , exist_ok = True )
342-
376+ git_repo , local_path = await setup_repo (args , config )
343377 default_http_config = HttpSessionConfig (** config ["http_client" ])
378+
344379 full_report = []
345- # scrape sites
346380 for sitemap in config ["sitemaps" ]:
347- scraper_config = MetadataScraperConfig (** sitemap )
348- path , starttime , repo_report = await scrape_repo_and_write_to_file (
349- local_path , scraper_config , default_http_config
381+ repo_reports = await process_sitemap (
382+ sitemap , local_path , default_http_config , git_repo
350383 )
351- # Ugly logic to perform transformations for specific repos.
352- # This should be replaced by a more generic mechanism in the future.
353- if "publisso" in scraper_config .name :
354- paths , repo_reports = transform_publisso_to_publisso_schemaorg (
355- path , repo_report )
356- commit = True
357- elif "openagrar" in scraper_config .name :
358- paths , repo_reports = extract_thunen_from_openagrar_metadata (path )
359- commit = True
360- else :
361- paths = [path ]
362- repo_reports = [{"repo_name" : sitemap ["name" ], ** repo_report }]
363- commit = sitemap .get ("commit" , True )
364- full_report += repo_reports
365- if git_repo and commit :
366- # if a git repo is set, commit all files except those that are explicitly
367- # excluded
368- for path in paths :
369- commit_to_git (scraper_config .url , git_repo , path , starttime )
384+ full_report .extend (repo_reports )
370385
371386 if git_repo :
372387 git_repo .push ()
373388
374389 print (json .dumps (full_report , indent = 2 , ensure_ascii = False , sort_keys = True ))
390+
375391 # pylint: disable-next=broad-except
376392 except Exception as e :
377- otel_span = trace .get_current_span ()
378393 otel_span .record_exception (e )
379394 msg = "Error when scraping repositories"
380395 otel_span .add_event (msg )
0 commit comments