Skip to content

Commit 5319665

Browse files
committed
Refactored the overly large main function into smaller helper functions
1 parent 4307d1b commit 5319665

1 file changed

Lines changed: 52 additions & 37 deletions

File tree

middleware/main.py

Lines changed: 52 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,52 @@ def extract_thunen_from_openagrar_metadata(
317317
]
318318
)
319319

320+
async def setup_repo(args, config):
    """
    Set up the git repository if requested and pick the local output directory.

    Args:
        args: Parsed command-line arguments; only ``args.git`` is read.
        config: Configuration mapping. ``config["git"]`` is required when
            ``args.git`` is truthy. Otherwise an optional
            ``config["git"]["local_path"]`` overrides the default directory
            ``/tmp/middleware_git``.

    Returns:
        A ``(git_repo, local_path)`` tuple. ``git_repo`` is a ``GitRepo``
        when ``args.git`` is truthy and ``None`` otherwise; ``local_path`` is
        a ``Path``.

    Raises:
        KeyError: If ``args.git`` is truthy but ``config`` has no ``"git"`` key.
    """
    if args.git:
        git_repo = GitRepo(GitRepoConfig(**config["git"]))
        # Read the working directory before pulling, as the original code did.
        local_path = Path(git_repo.working_dir)
        git_repo.pull()
        return git_repo, local_path

    # A "git" key that is present but null (e.g. an empty section in the
    # config file) must behave like a missing one; ``dict.get`` only applies
    # its default when the key is absent.
    git_section = config.get("git") or {}
    local_path = Path(git_section.get("local_path", "/tmp/middleware_git"))
    # Without a git checkout nothing else creates the output directory.
    local_path.mkdir(parents=True, exist_ok=True)
    return None, local_path
335+
336+
337+
async def process_sitemap(sitemap, local_path, default_http_config, git_repo):
    """
    Scrape one sitemap entry, post-process its output and optionally commit it.

    Args:
        sitemap: Mapping with the settings of a single sitemap. It is expanded
            into ``MetadataScraperConfig``; its ``"name"`` and optional
            ``"commit"`` keys are also read directly.
        local_path: Directory handed to ``scrape_repo_and_write_to_file``.
        default_http_config: HTTP configuration handed to the scraper.
        git_repo: Repository to commit to; a falsy value disables committing.

    Returns:
        The reports for this sitemap. For repositories without special
        handling this is a one-element list holding the scraper's report
        extended by a ``"repo_name"`` key; otherwise it is whatever the
        repository-specific transformation returns.
    """
    scraper_config = MetadataScraperConfig(**sitemap)
    scraped_file, started_at, raw_report = await scrape_repo_and_write_to_file(
        local_path, scraper_config, default_http_config
    )

    # Known repositories still need hard-coded post-processing; a generic
    # mechanism should replace these special cases eventually.
    repo_name = scraper_config.name
    if "publisso" in repo_name:
        output_files, reports = transform_publisso_to_publisso_schemaorg(
            scraped_file, raw_report
        )
        should_commit = True
    elif "openagrar" in repo_name:
        output_files, reports = extract_thunen_from_openagrar_metadata(scraped_file)
        should_commit = True
    else:
        output_files = [scraped_file]
        reports = [{"repo_name": sitemap["name"], **raw_report}]
        # Only plain repositories may opt out of committing via their config.
        should_commit = sitemap.get("commit", True)

    if not (git_repo and should_commit):
        return reports

    for output_file in output_files:
        commit_to_git(scraper_config.url, git_repo, output_file, started_at)
    return reports
365+
320366

321367
async def main():
322368
"""
@@ -327,54 +373,23 @@ async def main():
327373

328374
with trace.get_tracer(__name__).start_as_current_span("main") as otel_span:
329375
try:
330-
# setup git repo if desired
331-
if args.git:
332-
git_config = GitRepoConfig(**config["git"])
333-
git_repo = GitRepo(git_config)
334-
local_path = Path(git_repo.working_dir)
335-
git_repo.pull()
336-
else:
337-
git_repo = None
338-
local_path = Path(
339-
config.get("git", {}).get("local_path", "/tmp/middleware_git")
340-
)
341-
os.makedirs(local_path, exist_ok=True)
342-
376+
git_repo, local_path = await setup_repo(args, config)
343377
default_http_config = HttpSessionConfig(**config["http_client"])
378+
344379
full_report = []
345-
# scrape sites
346380
for sitemap in config["sitemaps"]:
347-
scraper_config = MetadataScraperConfig(**sitemap)
348-
path, starttime, repo_report = await scrape_repo_and_write_to_file(
349-
local_path, scraper_config, default_http_config
381+
repo_reports = await process_sitemap(
382+
sitemap, local_path, default_http_config, git_repo
350383
)
351-
# Ugly logic to perform transformations for specific repos.
352-
# This should be replaced by a more generic mechanism in the future.
353-
if "publisso" in scraper_config.name:
354-
paths, repo_reports = transform_publisso_to_publisso_schemaorg(
355-
path, repo_report)
356-
commit = True
357-
elif "openagrar" in scraper_config.name:
358-
paths, repo_reports = extract_thunen_from_openagrar_metadata(path)
359-
commit = True
360-
else:
361-
paths = [path]
362-
repo_reports = [{"repo_name": sitemap["name"], **repo_report}]
363-
commit = sitemap.get("commit", True)
364-
full_report += repo_reports
365-
if git_repo and commit:
366-
# if a git repo is set, commit all files except those that are explicitly
367-
# excluded
368-
for path in paths:
369-
commit_to_git(scraper_config.url, git_repo, path, starttime)
384+
full_report.extend(repo_reports)
370385

371386
if git_repo:
372387
git_repo.push()
373388

374389
print(json.dumps(full_report, indent=2, ensure_ascii=False, sort_keys=True))
390+
375391
# pylint: disable-next=broad-except
376392
except Exception as e:
377-
otel_span = trace.get_current_span()
378393
otel_span.record_exception(e)
379394
msg = "Error when scraping repositories"
380395
otel_span.add_event(msg)

0 commit comments

Comments
 (0)