Skip to content

Commit 4307d1b

Browse files
committed
integrated the functions transform_publisso_to_publisso_schemaorg and extract_thunen_from_openagrar_metadata into the rest of the software
1 parent 712a95f commit 4307d1b

2 files changed

Lines changed: 86 additions & 53 deletions

File tree

config.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@ sitemaps:
1616
url: https://doi.ipk-gatersleben.de/sitemap.xml
1717
sitemap: xml
1818
metadata: embedded_jsonld
19-
- name: openagrar
19+
- name: openagrar_native
2020
url: "https://www.openagrar.de/servlets/solr/select?core=main&q=category.top%3A%22mir_genres%3Aresearch_data%22+AND+objectType%3Amods+AND+category.top%3A%22state%3Apublished%22&rows=1000&fl=id%2Cmods.identifier&wt=json&XSL.Style=xml"
2121
sitemap: openagrar
2222
metadata: embedded_jsonld
23-
- name: publisso
23+
- name: publisso_native
2424
url: "https://frl.publisso.de/find?q=contentType:researchData&from=0&until=200&format=json"
2525
sitemap: publisso
2626
# - name: thunen_atlas
@@ -40,7 +40,7 @@ http_client:
4040
git:
4141
repo_url: git@github.com:fairagro/middleware_repo.git
4242
branch: main
43-
local_path: /middleware/output
43+
local_path: output
4444
user_name: "FAIRagro middleware"
4545
user_email: "middleware@fairagro.net"
4646

middleware/main.py

Lines changed: 83 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,7 @@
1010
import re
1111
import logging
1212
import subprocess
13-
from typing import Dict, Tuple
14-
import tempfile
13+
from typing import Dict, List, Tuple
1514

1615
import asyncio
1716

@@ -196,75 +195,82 @@ def setup_and_config() -> Tuple[argparse.Namespace, Dict]:
196195
sys.exit(1)
197196

198197

199-
def transform_publisso_to_publisso_schemaorg():
198+
def transform_publisso_to_publisso_schemaorg(
199+
input_file: Path,
200+
original_report: Dict
201+
) -> Tuple[List[Path], List[Dict]]:
200202
"""
201203
Transform the Publisso metadata to schema.org format.
202204
"""
203205

204206
# Archivos
205-
input_file = Path("./output/publisso.json").resolve()
206-
jq_script = Path("./scripts/publiso_conversor.jq").resolve()
207+
output_file = input_file.parent / "publisso.json"
208+
jq_script = Path(__file__).parent / "scripts/publiso_conversor.jq"
207209

208210
if not input_file.exists():
209-
print(f"❌ Archivo de entrada no encontrado: {input_file}")
210-
return
211-
212-
# Crear directorio de salida si no existe
213-
input_file.parent.mkdir(parents=True, exist_ok=True)
214-
215-
# Usar archivo temporal para el resultado
216-
with tempfile.NamedTemporaryFile(
217-
mode="w", delete=False, dir=input_file.parent, suffix=".json"
218-
) as tmp_file:
219-
tmp_path = Path(tmp_file.name)
211+
logging.warning("❌ Archivo de entrada no encontrado: %s", input_file)
212+
return ([],[])
220213

221214
try:
222215
# Ejecutar jq en memoria usando context managers so resources are cleaned up
223216
with subprocess.Popen(
224217
["jq", "-f", str(jq_script), str(input_file)],
225218
stdout=subprocess.PIPE,
219+
stderr=subprocess.PIPE,
226220
text=True
227221
) as p1:
228-
with open(tmp_path, "w", encoding="utf-8") as outfile:
222+
with open(output_file, "w", encoding="utf-8") as outfile:
229223
with subprocess.Popen(
230224
["jq", "-s", "."],
231225
stdin=p1.stdout,
232226
stdout=outfile,
227+
stderr=subprocess.PIPE,
233228
text=True
234229
) as p2:
235230
if p1.stdout is not None:
236231
p1.stdout.close() # Permite que p1 reciba SIGPIPE si p2 falla
237-
p2.communicate()
232+
_, err2 = p2.communicate()
233+
_, err1 = p1.communicate()
238234

239235
if p1.wait() != 0:
240-
raise RuntimeError(f"jq script failed with exit code {p1.returncode}")
236+
logging.error(
237+
"jq script failed with exit code %s. stderr: %s",
238+
p1.returncode, err1)
239+
# Do not return, just log the error and continue, as it could be
240+
# that the output is still valid
241+
# return ([],[])
241242
if p2.returncode != 0:
242-
raise RuntimeError(f"jq merge failed with exit code {p2.returncode}")
243-
244-
# Reemplazar archivo original
245-
os.remove(input_file) # Eliminar input original
246-
tmp_path.rename(input_file) # Renombrar temp como input original
243+
logging.error(
244+
"jq merge failed with exit code %s. stderr: %s",
245+
p2.returncode, err2)
246+
# Do not return, just log the error and continue, as it could be
247+
# that the output is still valid
248+
# return ([],[])
249+
except subprocess.CalledProcessError as e:
250+
logging.error("❌ Error al ejecutar jq: %s", str(e))
251+
return ([],[])
247252

248-
print(
249-
f"✅ Transformación completada, archivo actualizado: {input_file}")
253+
logging.info(
254+
"✅ Transformación completada, archivo actualizado: %s", input_file
255+
)
250256

251-
except subprocess.CalledProcessError as e:
252-
print(f"❌ Error al ejecutar jq: {e}")
253-
if tmp_path.exists():
254-
tmp_path.unlink()
257+
repo_report = [{"repo_name": "publisso", **original_report}]
258+
return ([output_file], repo_report)
255259

256260

257-
def extract_thunen_from_openagrar_metadata():
261+
def extract_thunen_from_openagrar_metadata(
262+
input_file: Path
263+
) -> Tuple[List[Path], List[Dict]]:
258264
"""
259265
Extract Thünen metadata from OpenAgrar metadata.
260266
"""
261267
# Configuration
262-
input_file = Path("./output/openagrar.json").resolve()
263-
output_file = Path("./output/thunen.json").resolve()
268+
thunen_out = input_file.parent / "thunen_atlas.json"
269+
openagrar_out = input_file.parent / "openagrar.json"
264270

265271
if not input_file.exists():
266-
print(f"❌ Archivo de entrada no encontrado: {input_file}")
267-
return
272+
logging.warning("❌ Archivo de entrada no encontrado: %s", input_file)
273+
return ([],[])
268274
# Regex pattern to match publisher synonyms
269275
publisher_pattern = re.compile(
270276
r"Thünen[- ]?Institut|Thuenen Institute|Thünen-Atlas", re.IGNORECASE
@@ -273,7 +279,7 @@ def extract_thunen_from_openagrar_metadata():
273279
# Load JSON
274280
with open(input_file, "r", encoding="utf-8") as f:
275281
data = json.load(f)
276-
print(f"Found {len(data)} datasets in {input_file}")
282+
logging.info("Found %s datasets in %s", len(data), input_file)
277283
# Separate datasets
278284
filtered = []
279285
remaining = []
@@ -285,15 +291,31 @@ def extract_thunen_from_openagrar_metadata():
285291
remaining.append(dataset)
286292

287293
# Write filtered datasets to new file
288-
with open(output_file, "w", encoding="utf-8") as f:
294+
with open(thunen_out, "w", encoding="utf-8") as f:
289295
json.dump(filtered, f, ensure_ascii=False, indent=2)
290296

291297
# Update original file with remaining datasets
292-
with open(input_file, "w", encoding="utf-8") as f:
298+
with open(openagrar_out, "w", encoding="utf-8") as f:
293299
json.dump(remaining, f, ensure_ascii=False, indent=2)
294300

295-
print(f"Extracted {len(filtered)} datasets to {output_file}")
296-
print(f"{len(remaining)} datasets remain in {input_file}")
301+
logging.info("Extracted %s datasets to %s", len(filtered), thunen_out)
302+
logging.info("%s datasets remain in %s", len(remaining), openagrar_out)
303+
304+
return (
305+
[thunen_out, openagrar_out],
306+
[
307+
{
308+
"repo_name": "thunen_atlas",
309+
"valid_entries": len(filtered),
310+
"failed_entries": 0
311+
},
312+
{
313+
"repo_name": "openagrar",
314+
"valid_entries": len(remaining),
315+
"failed_entries": 0
316+
}
317+
]
318+
)
297319

298320

299321
async def main():
@@ -309,11 +331,13 @@ async def main():
309331
if args.git:
310332
git_config = GitRepoConfig(**config["git"])
311333
git_repo = GitRepo(git_config)
312-
local_path = git_repo.working_dir
334+
local_path = Path(git_repo.working_dir)
313335
git_repo.pull()
314336
else:
315337
git_repo = None
316-
local_path = config.get("git", {}).get("local_path", "/tmp/middleware_git")
338+
local_path = Path(
339+
config.get("git", {}).get("local_path", "/tmp/middleware_git")
340+
)
317341
os.makedirs(local_path, exist_ok=True)
318342

319343
default_http_config = HttpSessionConfig(**config["http_client"])
@@ -322,18 +346,27 @@ async def main():
322346
for sitemap in config["sitemaps"]:
323347
scraper_config = MetadataScraperConfig(**sitemap)
324348
path, starttime, repo_report = await scrape_repo_and_write_to_file(
325-
Path(local_path), scraper_config, default_http_config
349+
local_path, scraper_config, default_http_config
326350
)
327-
full_report += [{"repo_name": sitemap["name"], **repo_report}]
328-
if sitemap["name"] == "publisso":
329-
transform_publisso_to_publisso_schemaorg()
330-
if sitemap["name"] == "openagrar":
331-
extract_thunen_from_openagrar_metadata()
332-
commit = sitemap.get("commit", True)
351+
# Ugly logic to perform transformations for specific repos.
352+
# This should be replaced by a more generic mechanism in the future.
353+
if "publisso" in scraper_config.name:
354+
paths, repo_reports = transform_publisso_to_publisso_schemaorg(
355+
path, repo_report)
356+
commit = True
357+
elif "openagrar" in scraper_config.name:
358+
paths, repo_reports = extract_thunen_from_openagrar_metadata(path)
359+
commit = True
360+
else:
361+
paths = [path]
362+
repo_reports = [{"repo_name": sitemap["name"], **repo_report}]
363+
commit = sitemap.get("commit", True)
364+
full_report += repo_reports
333365
if git_repo and commit:
334366
# if a git repo is set, commit all files except those that are explicitly
335367
# excluded
336-
commit_to_git(scraper_config.url, git_repo, path, starttime)
368+
for path in paths:
369+
commit_to_git(scraper_config.url, git_repo, path, starttime)
337370

338371
if git_repo:
339372
git_repo.push()

0 commit comments

Comments
 (0)