Skip to content

Commit 2d909a0

Browse files
committed
extract thunen from openagrar
1 parent 2b60080 commit 2d909a0

1 file changed

Lines changed: 40 additions & 3 deletions

File tree

middleware/main.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import datetime
99
import argparse
1010
import json
11-
import jq
11+
import re
1212
import logging
1313
import subprocess
1414
from typing import Tuple
@@ -240,12 +240,47 @@ def transform_publisso_to_publisso_schemaorg():
240240
if tmp_path.exists():
241241
tmp_path.unlink()
242242

243+
243244
def _publisher_names(dataset):
    """Return every publisher name string found on *dataset*.

    The ``publisher`` value may be a single object, a list of objects, or a
    plain string; a ``name`` may itself be a string or a list of strings.
    Anything else (missing key, ``None``, numbers, ...) yields no names.
    """
    if not isinstance(dataset, dict):
        return []

    publisher = dataset.get("publisher")
    candidates = publisher if isinstance(publisher, list) else [publisher]

    names = []
    for candidate in candidates:
        # NOTE(review): a bare string publisher is treated as the publisher
        # name itself -- confirm this matches the harvested metadata.
        value = candidate.get("name") if isinstance(candidate, dict) else candidate
        values = value if isinstance(value, list) else [value]
        names.extend(item for item in values if isinstance(item, str))
    return names


def extract_thunen_from_openagrar_metadata():
    """
    Extract Thünen metadata from OpenAgrar metadata.

    Reads the dataset list from ``./output/openagrar.json``, moves every
    dataset whose publisher name matches the Thünen pattern into
    ``./output/thunen.json`` and rewrites ``openagrar.json`` with the
    datasets that did not match. Both paths are resolved against the current
    working directory. If the input file does not exist, a message is printed
    and nothing is written.

    Records with a missing, null or otherwise unusable publisher are kept in
    ``openagrar.json`` instead of aborting the whole run.

    Raises:
        ValueError: If the input file does not contain a JSON list. Raised
            before any file is rewritten, so the input stays untouched.
    """
    # Configuration
    input_file = Path("./output/openagrar.json").resolve()
    output_file = Path("./output/thunen.json").resolve()

    if not input_file.exists():
        print(f"❌ Archivo de entrada no encontrado: {input_file}")
        return

    # Regex pattern to match publisher synonyms
    publisher_pattern = re.compile(
        r"Thünen[- ]?Institut|Thuenen Institute", re.IGNORECASE
    )

    # Load JSON
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Iterating a dict would walk its keys and then overwrite the input with
    # that key list, so refuse anything that is not a list up front.
    if not isinstance(data, list):
        raise ValueError(
            f"Expected a JSON list of datasets in {input_file}, "
            f"got {type(data).__name__}"
        )
    print(f"Found {len(data)} datasets in {input_file}")

    # Separate datasets, preserving their original order in both outputs
    filtered = []
    remaining = []
    for dataset in data:
        is_thunen = any(
            publisher_pattern.search(name) for name in _publisher_names(dataset)
        )
        (filtered if is_thunen else remaining).append(dataset)

    # Write filtered datasets to new file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(filtered, f, ensure_ascii=False, indent=2)

    # Update original file with remaining datasets
    with open(input_file, "w", encoding="utf-8") as f:
        json.dump(remaining, f, ensure_ascii=False, indent=2)

    print(f"Extracted {len(filtered)} datasets to {output_file}")
    print(f"{len(remaining)} datasets remain in {input_file}")
249284

250285

251286
async def main():
@@ -279,6 +314,8 @@ async def main():
279314
full_report += [{"repo_name": sitemap["name"], **repo_report}]
280315
if sitemap["name"] == "publisso":
281316
transform_publisso_to_publisso_schemaorg()
317+
if sitemap["name"] == "openagrar":
318+
extract_thunen_from_openagrar_metadata()
282319
if git_repo:
283320
commit_to_git(scraper_config.url, git_repo, path, starttime)
284321

0 commit comments

Comments
 (0)