|
8 | 8 | import datetime |
9 | 9 | import argparse |
10 | 10 | import json |
11 | | -import jq |
| 11 | +import re |
12 | 12 | import logging |
13 | 13 | import subprocess |
14 | 14 | from typing import Tuple |
@@ -240,12 +240,47 @@ def transform_publisso_to_publisso_schemaorg(): |
240 | 240 | if tmp_path.exists(): |
241 | 241 | tmp_path.unlink() |
242 | 242 |
|
| 243 | + |
def _thunen_publisher_names(dataset):
    """Return every publisher name found on *dataset* as a list of strings.

    Tolerates the shapes a ``publisher`` value may take in harvested JSON:
    a single object with a ``name`` key, a list of such objects, a bare
    string, ``None``, or a missing key. Anything else yields no names.
    """
    if not isinstance(dataset, dict):
        return []
    publisher = dataset.get("publisher")
    entries = publisher if isinstance(publisher, list) else [publisher]
    names = []
    for entry in entries:
        if isinstance(entry, dict):
            entry = entry.get("name")
        # Only strings can be fed to the regex; None/numbers are ignored.
        if isinstance(entry, str):
            names.append(entry)
    return names


def _thunen_write_json_via_temp(target, payload):
    """Write *payload* as JSON to *target* through a sibling temp file.

    The target is only touched by the final rename, so a failure while
    serialising never leaves a truncated or half-written target behind.
    """
    tmp_path = target.with_name(target.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
        tmp_path.replace(target)
    finally:
        # Present only if the dump or the rename failed.
        if tmp_path.exists():
            tmp_path.unlink()


def extract_thunen_from_openagrar_metadata(
    input_file="./output/openagrar.json",
    output_file="./output/thunen.json",
):
    """
    Extract Thünen metadata from OpenAgrar metadata.

    Reads the JSON array of datasets in ``input_file``, moves every dataset
    whose publisher name matches the Thünen Institute pattern into
    ``output_file``, and rewrites ``input_file`` with the datasets that did
    not match.

    Args:
        input_file: Path of the OpenAgrar JSON file (read, then rewritten).
        output_file: Path the matching datasets are written to; its parent
            directory is created when missing.

    Returns:
        None. Progress is reported on stdout. If the input file is missing
        or does not contain a JSON array, no file is modified.
    """
    input_path = Path(input_file).resolve()
    output_path = Path(output_file).resolve()

    if not input_path.exists():
        # Message is Spanish for "Input file not found".
        print(f"❌ Archivo de entrada no encontrado: {input_path}")
        return

    # Regex pattern to match publisher synonyms
    publisher_pattern = re.compile(
        r"Thünen[- ]?Institut|Thuenen Institute", re.IGNORECASE
    )

    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        # Iterating a dict or scalar would split on keys/characters; leave
        # both files alone rather than overwrite the input with garbage.
        print(f"Expected a JSON array of datasets in {input_path}; nothing extracted")
        return
    print(f"Found {len(data)} datasets in {input_path}")

    # Separate datasets, preserving their original order in both lists.
    filtered = []
    remaining = []
    for dataset in data:
        names = _thunen_publisher_names(dataset)
        if any(publisher_pattern.search(name) for name in names):
            filtered.append(dataset)
        else:
            remaining.append(dataset)

    # Write the extracted datasets first so that nothing has been removed
    # from the input yet if this step fails.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    _thunen_write_json_via_temp(output_path, filtered)

    # Update original file with remaining datasets
    _thunen_write_json_via_temp(input_path, remaining)

    print(f"Extracted {len(filtered)} datasets to {output_path}")
    print(f"{len(remaining)} datasets remain in {input_path}")
249 | 284 |
|
250 | 285 |
|
251 | 286 | async def main(): |
@@ -279,6 +314,8 @@ async def main(): |
279 | 314 | full_report += [{"repo_name": sitemap["name"], **repo_report}] |
280 | 315 | if sitemap["name"] == "publisso": |
281 | 316 | transform_publisso_to_publisso_schemaorg() |
| 317 | + if sitemap["name"] == "openagrar": |
| 318 | + extract_thunen_from_openagrar_metadata() |
282 | 319 | if git_repo: |
283 | 320 | commit_to_git(scraper_config.url, git_repo, path, starttime) |
284 | 321 |
|
|
0 commit comments