Skip to content

Commit 2b60080

Browse files
committed
implement publisso conversor for schema.org
1 parent c27340e commit 2b60080

4 files changed

Lines changed: 187 additions & 45 deletions

File tree

config.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ sitemaps:
2323
- name: publisso
2424
url: "https://frl.publisso.de/find?q=contentType:researchData&from=0&until=200&format=json"
2525
sitemap: publisso
26-
- name: thunen_atlas
27-
url: "https://atlas.thuenen.de/api/v2/resources?page_size=200&format=json"
28-
sitemap: thunen_atlas
26+
# - name: thunen_atlas
27+
# url: "https://atlas.thuenen.de/api/v2/resources?page_size=200&format=json"
28+
# sitemap: thunen_atlas
2929

3030
# Configures the HTTP client uses to download sitemaps and datasets
3131
http_client:

middleware/main.py

Lines changed: 106 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -8,19 +8,23 @@
88
import datetime
99
import argparse
1010
import json
11+
import jq
1112
import logging
13+
import subprocess
1214
from typing import Tuple
15+
import tempfile
1316

1417
import asyncio
1518
import aiofiles
1619
import pytz
1720
import yaml
18-
from opentelemetry import trace #, metrics
21+
from opentelemetry import trace # , metrics
1922
from opentelemetry.sdk.resources import Resource
2023
from opentelemetry.sdk.trace import TracerProvider
2124
from opentelemetry.sdk.trace.sampling import ALWAYS_ON
2225
from opentelemetry.sdk.trace.export import BatchSpanProcessor
2326
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
27+
2428
# from opentelemetry.sdk.metrics import MeterProvider
2529
# from opentelemetry.sdk.metrics.export import (
2630
# ConsoleMetricExporter,
@@ -59,18 +63,16 @@ def setup_opentelemetry(otlp_config: dict) -> None:
5963
opentelemetry.instrumentation.urllib.URLLibInstrumentor().instrument()
6064
opentelemetry.instrumentation.aiohttp_client.AioHttpClientInstrumentor().instrument()
6165

62-
endpoint = otlp_config.get('endpoint')
66+
endpoint = otlp_config.get("endpoint")
6367
if endpoint:
6468
# Initialize OpenTelemetry for Tracing to OTLP endpoint
6569
trace.set_tracer_provider(
6670
TracerProvider(
67-
resource=Resource.create({
68-
"service.name": "FAIRagro middleware"
69-
}),
71+
resource=Resource.create({"service.name": "FAIRagro middleware"}),
7072
active_span_processor=BatchSpanProcessor(
7173
OTLPSpanExporter(endpoint=endpoint)
7274
),
73-
sampler=ALWAYS_ON
75+
sampler=ALWAYS_ON,
7476
)
7577
)
7678
else:
@@ -88,9 +90,10 @@ def setup_opentelemetry(otlp_config: dict) -> None:
8890

8991

9092
async def scrape_repo_and_write_to_file(
91-
folder_path: str,
92-
scraper_config: MetadataScraperConfig,
93-
default_http_config: HttpSessionConfig) -> Tuple[str, datetime.datetime]:
93+
folder_path: str,
94+
scraper_config: MetadataScraperConfig,
95+
default_http_config: HttpSessionConfig,
96+
) -> Tuple[str, datetime.datetime]:
9497
"""
9598
Scrapes research repository metadata and writes it to a file.
9699
@@ -114,15 +117,17 @@ async def scrape_repo_and_write_to_file(
114117
# simple synchronous gauge values.
115118
# count_sites = len(sites)
116119
# count_metadata = len(metadata)
117-
path = os.path.join(folder_path, f'{scraper_config.name}.json')
118-
async with aiofiles.open(path, 'w', encoding='utf-8') as f:
119-
await f.write(json.dumps(metadata, indent=2, ensure_ascii=False, sort_keys=True))
120+
path = os.path.join(folder_path, f"{scraper_config.name}.json")
121+
async with aiofiles.open(path, "w", encoding="utf-8") as f:
122+
await f.write(
123+
json.dumps(metadata, indent=2, ensure_ascii=False, sort_keys=True)
124+
)
120125
return path, start_timestamp, report
121126

122-
def commit_to_git(sitemap_url: str,
123-
git_repo: GitRepo,
124-
path: str,
125-
starttime: datetime) -> None:
127+
128+
def commit_to_git(
129+
sitemap_url: str, git_repo: GitRepo, path: str, starttime: datetime
130+
) -> None:
126131
"""
127132
Create a log message and commit file to git.
128133
@@ -141,53 +146,108 @@ def commit_to_git(sitemap_url: str,
141146
-------
142147
None
143148
"""
144-
formatted_time = starttime.strftime('%Y-%m-%d %H:%M:%S.%f %Z%z')
145-
msg = (
146-
f'harvested by FAIRargo middleware at {formatted_time} from {sitemap_url}'
147-
)
149+
formatted_time = starttime.strftime("%Y-%m-%d %H:%M:%S.%f %Z%z")
150+
msg = f"harvested by FAIRargo middleware at {formatted_time} from {sitemap_url}"
148151
git_repo.add_and_commit([path], msg)
149152

153+
150154
def setup_andconfig() -> dict:
151155
"""
152156
This function will perform setup work and reads the configuration file.
153157
"""
154158

155159
try:
156160
parser = argparse.ArgumentParser(
157-
prog = 'fairagro-middleware',
158-
description= 'Extracts schema.org meta data from research data repositories.',
161+
prog="fairagro-middleware",
162+
description="Extracts schema.org meta data from research data repositories.",
159163
)
160-
parser.add_argument('--config', '-c',
161-
type=Path,
162-
default='config.yml',
163-
help='Config file for this tool.')
164164
parser.add_argument(
165-
'--git',
165+
"--config",
166+
"-c",
167+
type=Path,
168+
default="config.yml",
169+
help="Config file for this tool.",
170+
)
171+
parser.add_argument(
172+
"--git",
166173
action=argparse.BooleanOptionalAction,
167174
default=True,
168175
help=(
169-
'Specify this flag to enabled or disable git interactions.'
170-
'If disabled the outout files will nevrtheless be written to git.local_path '
171-
'as specified within the config file.'
172-
)
176+
"Specify this flag to enabled or disable git interactions."
177+
"If disabled the outout files will nevrtheless be written to git.local_path "
178+
"as specified within the config file."
179+
),
173180
)
174181
args = parser.parse_args()
175182

176183
if not os.path.isfile(args.config):
177-
raise FileNotFoundError(f'Config file {args.config} does not exist.')
184+
raise FileNotFoundError(f"Config file {args.config} does not exist.")
178185

179186
# load config
180-
with open(args.config, 'r', encoding='utf-8') as f:
187+
with open(args.config, "r", encoding="utf-8") as f:
181188
config = yaml.safe_load(f)
182189

183-
setup_opentelemetry(config['opentelemetry'])
190+
setup_opentelemetry(config["opentelemetry"])
184191

185192
return args, config
186193
# pylint: disable-next=broad-except
187194
except Exception:
188195
logging.exception("An error occured during initialization")
189196
sys.exit(1)
190197

198+
199+
def transform_publisso_to_publisso_schemaorg():
200+
"""
201+
Transform the Publisso metadata to schema.org format.
202+
"""
203+
204+
# Archivos
205+
input_file = Path("./output/publisso.json").resolve()
206+
jq_script = Path("./scripts/publiso_conversor.jq").resolve()
207+
208+
if not input_file.exists():
209+
print(f"❌ Archivo de entrada no encontrado: {input_file}")
210+
return
211+
212+
# Crear directorio de salida si no existe
213+
input_file.parent.mkdir(parents=True, exist_ok=True)
214+
215+
# Usar archivo temporal para el resultado
216+
with tempfile.NamedTemporaryFile(
217+
mode="w", delete=False, dir=input_file.parent, suffix=".json"
218+
) as tmp_file:
219+
tmp_path = Path(tmp_file.name)
220+
221+
try:
222+
# Ejecutar jq en memoria
223+
p1 = subprocess.Popen(
224+
["jq", "-f", str(jq_script), str(input_file)], stdout=subprocess.PIPE
225+
)
226+
p2 = subprocess.Popen(
227+
["jq", "-s", "."], stdin=p1.stdout, stdout=open(tmp_path, "w")
228+
)
229+
p1.stdout.close() # Permite que p1 reciba SIGPIPE si p2 falla
230+
p2.communicate() # Espera a que termine
231+
232+
# Reemplazar archivo original
233+
os.remove(input_file) # Eliminar input original
234+
tmp_path.rename(input_file) # Renombrar temp como input original
235+
236+
print(f"✅ Transformación completada, archivo actualizado: {input_file}")
237+
238+
except subprocess.CalledProcessError as e:
239+
print(f"❌ Error al ejecutar jq: {e}")
240+
if tmp_path.exists():
241+
tmp_path.unlink()
242+
243+
def extract_thunen_from_openagrar_metadata():
244+
"""
245+
Extract Thünen metadata from OpenAgrar metadata.
246+
"""
247+
with open("openagrar_metadata.json", "r", encoding="utf-8") as f:
248+
openagrar_metadata = json.load(f)
249+
250+
191251
async def main():
192252
"""
193253
The main async function of the basic middleware
@@ -199,30 +259,33 @@ async def main():
199259
try:
200260
# setup git repo if desired
201261
if args.git:
202-
git_config = GitRepoConfig(**config['git'])
262+
git_config = GitRepoConfig(**config["git"])
203263
git_repo = GitRepo(git_config)
204264
local_path = git_repo.working_dir
205265
git_repo.pull()
206266
else:
207267
git_repo = None
208-
local_path = config['git']['local_path']
268+
local_path = config["git"]["local_path"]
209269
os.makedirs(local_path, exist_ok=True)
210270

211-
default_http_config = HttpSessionConfig(**config['http_client'])
271+
default_http_config = HttpSessionConfig(**config["http_client"])
212272
full_report = []
213273
# scrape sites
214-
for sitemap in config['sitemaps']:
274+
for sitemap in config["sitemaps"]:
215275
scraper_config = MetadataScraperConfig(**sitemap)
216276
path, starttime, repo_report = await scrape_repo_and_write_to_file(
217-
local_path, scraper_config, default_http_config)
218-
full_report += [{'repo_name': sitemap['name'] , **repo_report}]
277+
local_path, scraper_config, default_http_config
278+
)
279+
full_report += [{"repo_name": sitemap["name"], **repo_report}]
280+
if sitemap["name"] == "publisso":
281+
transform_publisso_to_publisso_schemaorg()
219282
if git_repo:
220283
commit_to_git(scraper_config.url, git_repo, path, starttime)
221284

222285
if git_repo:
223286
git_repo.push()
224287

225-
print(json.dumps(full_report, indent=2, ensure_ascii=False, sort_keys=True))
288+
# print(json.dumps(full_report, indent=2, ensure_ascii=False, sort_keys=True))
226289
# pylint: disable-next=broad-except
227290
except Exception as e:
228291
otel_span = trace.get_current_span()
@@ -232,5 +295,6 @@ async def main():
232295
logging.exception(msg)
233296
sys.exit(1)
234297

235-
if __name__ == '__main__':
298+
299+
if __name__ == "__main__":
236300
asyncio.run(main())
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
..[] | {
2+
"@context": {
3+
"@language": "en",
4+
"@vocab":"https://schema.org/"},
5+
"@type": "Dataset",
6+
"@id": ("https://doi.org/" + .doi),
7+
"name": (if has("title") then (.title[]) else null end),
8+
"alternativeHeadline": (if has("alternative") then .alternative[0] else null end),
9+
"description": (if has("description") then (.description[]) else null end),
10+
"identifier": (if has("doi") then (.doi | {"@type":"PropertyValue", "value":., "propertyID": "https://registry.identifiers.org/registry/doi", "url": ("https://doi.org/" + .)}) else null end),
11+
"creator":
12+
[
13+
14+
(if has("creator") then
15+
(.creator[]
16+
| {
17+
"@type": "Person",
18+
"familyName": (.prefLabel | split(", ")[0]),
19+
"givenName": (.prefLabel | split(", ")[1: ] | join(" ")),
20+
"identifier": ."@id"
21+
}
22+
)
23+
else null end)
24+
],
25+
"contributor":
26+
[
27+
(if has("contributor") then
28+
(.contributor[]
29+
| {
30+
"@type": "Person",
31+
"familyName": (.prefLabel | split(", ")[0]),
32+
"givenName": (.prefLabel | split(", ")[1: ] | join(" ")),
33+
"identifier": ."@id"
34+
}
35+
)
36+
else null end)
37+
],
38+
"sourceOrganization":
39+
[
40+
(if has("institution") then (.institution[] | {"@type":"Organization", "@id": ."@id", "name": .prefLabel}) else null end)
41+
],
42+
"datePublished": (if has("issued") then (.issued) else null end),
43+
"copyrightYear": (if has("yearOfCopyright") then (.yearOfCopyright[0]) else null end),
44+
"keywords":
45+
[
46+
(if has("ddc") then (.ddc[] | {"@type":"DefinedTerm", "name": .prefLabel, "identifier": ."@id", "inDefinedTermSet": "https://www.oclc.org/en/dewey.html"}) else null end),
47+
(if has("subject") then (.subject[] | {"@type":"DefinedTerm", "name": .prefLabel, "identifier": ."@id"}) else null end)
48+
],
49+
"measurementTechnique":
50+
[
51+
(if has("dataOrigin") then (.dataOrigin[] | {"@type":"DefinedTerm", "name": .prefLabel, "identifier": ."@id"}) else null end)
52+
],
53+
"funder":
54+
[
55+
(if has("fundingId") then (.fundingId[] | {"@type":"Organization", "@id": ."@id", "name": .prefLabel}) else null end)
56+
],
57+
"funding":
58+
[
59+
(if has("joinedFunding") then (.joinedFunding[] | {"@type":"Grant", "name": .fundingProgramJoined, "funder": .fundingJoined | {"@type":"Organization", "@id": ."@id", "name": .prefLabel}}) else (if has("fundingProgram") then (.fundingProgram[] | {"@type":"Grant", "name": .}) else null end) end)
60+
],
61+
"distribution":
62+
[
63+
(if has("hasPart") then (.hasPart[] | {"@type":"DataDownload", "@id": ("https://repository.publisso.de/resource/" + ."@id"), "name": .prefLabel}) else null end)
64+
],
65+
"inLanguage":
66+
[
67+
(if has("language") then (.language[] | {"@type":"Language", "@id": ."@id", "name": .prefLabel}) else null end)
68+
],
69+
"license": (if has("license") then (.license[]."@id") else null end),
70+
"spatial":
71+
[
72+
(if has("recordingCoordinates") then (.recordingCoordinates[] | {"@type":"Place", "url": ."@id"}) else null end),
73+
(if has("recordingLocation") then (.recordingLocation[] | {"@type":"Place", "name": .prefLabel, "url": ."@id"}) else null end)
74+
]
75+
} | del(..|nulls)

requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ html_text==0.7.0
2020
idna==3.10
2121
importlib_metadata==8.6.1
2222
isodate==0.7.2
23+
jq==1.10.0
2324
jstyleson==0.0.2
2425
lxml==5.4.0
2526
lxml_html_clean==0.4.2
@@ -55,3 +56,5 @@ webencodings==0.5.1
5556
wrapt==1.17.2
5657
yarl==1.20.0
5758
zipp==3.21.0
59+
60+

0 commit comments

Comments
 (0)