Skip to content

Commit 649a878

Browse files
committed
fixed cyclic imports
1 parent b28a856 commit 649a878

13 files changed

Lines changed: 261 additions & 263 deletions

File tree

middleware/metadata_scraper/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010
from opentelemetry.semconv.attributes import url_attributes
1111

1212
from middleware.http_session import HttpSession, HttpSessionConfig
13-
from middleware.metadata_scraper.sitemap_parser import SitemapParser
14-
from middleware.metadata_scraper.metadata_extractor import MetadataExtractor
13+
from middleware.metadata_scraper.sitemap_parser.sitemap_parser import SitemapParser
14+
from middleware.metadata_scraper.metadata_extractor.metadata_extractor import MetadataExtractor
1515

1616

1717
class MetadataScraperConfig(NamedTuple):
Lines changed: 6 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -1,117 +1,10 @@
11
"""
2-
This module defines the abstract base class 'MetadataExtractor'
2+
Metadata scraper package.
3+
Register available implementations of SitemapParser and MetadataExtractor.
34
"""
45

5-
from abc import abstractmethod
6-
from typing import Dict, List, Optional
7-
import logging
8-
from opentelemetry import trace
6+
from .embedded_jsonld import MetadataExtractorEmbeddedJsonld
7+
from .jsonld import MetadataExtractorJsonld
98

10-
from middleware.utils.registering_abc import RegisteringABC
11-
12-
13-
class MetadataParseError(RuntimeError):
14-
"""
15-
An excpetion of this type will be thrown by implementations of MetadataExtractor
16-
if the content cannot be parsed.
17-
"""
18-
19-
def __init__(self, inner_stuff: Exception | str) -> None:
20-
super().__init__(f"Failed to parse metadata: {str(inner_stuff)}")
21-
self.inner_stuff = inner_stuff
22-
23-
24-
class MetadataExtractor(RegisteringABC):
25-
"""
26-
An abstract base class for metadata extractors (aka parsers).
27-
It will abstract away how the metadata is embedded in a webpage.
28-
29-
Methods
30-
-------
31-
metadata(content, url)
32-
An abstract method that is expected to extract metadata from the given content in
33-
terms of a nested Dict/List structure. It may throw exceptions -- of type
34-
'MetadataParseError'.
35-
raw_metadata(content)
36-
An abstract method that is expected to extract metadata from the given content in
37-
terms of a list of strings. It is expected not to raise exceptions of type
38-
'MetadataParseError.
39-
get_metadata_or_log_error(content, url)
40-
A wrapper method around 'metadata' that catches 'MetadataParseError' excpetions and
41-
logs them.
42-
"""
43-
44-
@abstractmethod
45-
def metadata(self, content: str, url: str) -> List[Dict]:
46-
"""
47-
Extracts metadata from the given content. It is expected that the content can define
48-
several sets of metadata, so a list is returned.
49-
Also it is expected that this function will raise Exceptions if something goes wrong,
50-
e.g. the content is not parsable.
51-
52-
Arguments
53-
---------
54-
content : str
55-
The dataset content to extract the metadata from.
56-
url : str
57-
The URL to extract metadata from. For some implementations it might be helpful
58-
to have this information.
59-
60-
Returns
61-
-------
62-
List[Dict]
63-
A list of dictionaries containing the extracted metadata.
64-
"""
65-
66-
@abstractmethod
67-
def raw_metadata(self, content: str) -> List[str]:
68-
"""
69-
Extracts unparsed metadata from the given content. It is expected that the content can
70-
define several sets of metadata, so a list is returned.
71-
It is expected that this function won't raise exceptions.
72-
73-
Arguments
74-
---------
75-
content : str
76-
The dataset content to extract the metadata from.
77-
78-
Returns
79-
-------
80-
List[str]
81-
A list of strings containing the unprased metadata.
82-
"""
83-
84-
def get_metadata_or_log_error(self, content: str, url: str) -> Optional[List[Dict]]:
85-
"""
86-
Tries to extract metadata from the given content that was downloaded from the given URL.
87-
If the metadata cannot be extracted, it logs an error and returns None.
88-
89-
Arguments
90-
---------
91-
content : str
92-
The content to extract metadata from.
93-
url : str
94-
The URL associated with the content.
95-
96-
Returns
97-
-------
98-
Optional[List[Dict]]
99-
The extracted metadata if successful, None otherwise.
100-
"""
101-
with trace.get_tracer(__name__).start_as_current_span(
102-
"MetadataExtractor.get_metadata_or_log_error") as otel_span:
103-
try:
104-
metadata = self.metadata(content, url)
105-
return metadata
106-
except MetadataParseError as e:
107-
suspicious_data = ''.join(self.raw_metadata(content))
108-
otel_span.set_attribute(
109-
"FAIRagro.middleware.MetadataExtractor.suspicious_data", suspicious_data)
110-
otel_span.record_exception(e)
111-
msg = "Could not extract meta data, maybe a parsing error?"
112-
otel_span.add_event(msg)
113-
logging.exception("%s, suspicious data:\n%s", msg, suspicious_data)
114-
return None
115-
116-
from .embedded_jsonld import MetadataExtractorEmbeddedJsonld # noqa: E402, F401
117-
from .jsonld import MetadataExtractorJsonld # noqa: E402, F401
9+
MetadataExtractorEmbeddedJsonld.register_implementation('embedded_jsonld')
10+
MetadataExtractorJsonld.register_implementation('jsonld')

middleware/metadata_scraper/metadata_extractor/embedded_jsonld.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from extruct import extract
1414
from bs4 import BeautifulSoup
1515

16-
from middleware.metadata_scraper.metadata_extractor import MetadataExtractor, MetadataParseError
16+
from .metadata_extractor import MetadataExtractor, MetadataParseError
1717

1818

1919
class MetadataExtractorEmbeddedJsonld(MetadataExtractor):
@@ -40,7 +40,8 @@ def metadata(self, content: str, url: str) -> List[Dict]:
4040
"""
4141
base_url = get_base_url(content, url)
4242
try:
43-
metadata = extract(content, base_url=base_url, uniform=True, syntaxes=['json-ld'])
43+
metadata = extract(content, base_url=base_url,
44+
uniform=True, syntaxes=['json-ld'])
4445
except Exception as e:
4546
raise MetadataParseError(e) from e
4647
return metadata['json-ld']
@@ -62,8 +63,5 @@ def raw_metadata(self, content: str) -> List[str]:
6263
"""
6364
soup = BeautifulSoup(content, 'html.parser')
6465
json_ld = soup.find_all('script', type='application/ld+json')
65-
metadata = [ js.text for js in json_ld ]
66+
metadata = [js.text for js in json_ld]
6667
return metadata
67-
68-
69-
MetadataExtractorEmbeddedJsonld.register_implementation('embedded_jsonld')

middleware/metadata_scraper/metadata_extractor/jsonld.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import json
1212
from typing import Dict, List
1313

14-
from middleware.metadata_scraper.metadata_extractor import MetadataExtractor, MetadataParseError
14+
from .metadata_extractor import MetadataExtractor, MetadataParseError
1515

1616

1717
class MetadataExtractorJsonld(MetadataExtractor):
@@ -63,6 +63,3 @@ def raw_metadata(self, content: str) -> List[str]:
6363
A one-element array contains the JSON-LD string.
6464
"""
6565
return [content]
66-
67-
68-
MetadataExtractorJsonld.register_implementation('jsonld')
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
"""
2+
This module defines the abstract base class 'MetadataExtractor'
3+
"""
4+
5+
from abc import abstractmethod
6+
from typing import Dict, List, Optional
7+
import logging
8+
from opentelemetry import trace
9+
10+
from middleware.utils.registering_abc import RegisteringABC
11+
12+
13+
class MetadataParseError(RuntimeError):
14+
"""
15+
An exception of this type will be thrown by implementations of MetadataExtractor
16+
if the content cannot be parsed.
17+
"""
18+
19+
def __init__(self, inner_stuff: Exception | str) -> None:
20+
super().__init__(f"Failed to parse metadata: {str(inner_stuff)}")
21+
self.inner_stuff = inner_stuff
22+
23+
24+
class MetadataExtractor(RegisteringABC):
25+
"""
26+
An abstract base class for metadata extractors (aka parsers).
27+
It will abstract away how the metadata is embedded in a webpage.
28+
29+
Methods
30+
-------
31+
metadata(content, url)
32+
An abstract method that is expected to extract metadata from the given content in
33+
terms of a nested Dict/List structure. It may throw exceptions -- of type
34+
'MetadataParseError'.
35+
raw_metadata(content)
36+
An abstract method that is expected to extract metadata from the given content in
37+
terms of a list of strings. It is expected not to raise exceptions of type
38+
'MetadataParseError'.
39+
get_metadata_or_log_error(content, url)
40+
A wrapper method around 'metadata' that catches 'MetadataParseError' exceptions and
41+
logs them.
42+
"""
43+
44+
@abstractmethod
45+
def metadata(self, content: str, url: str) -> List[Dict]:
46+
"""
47+
Extracts metadata from the given content. It is expected that the content can define
48+
several sets of metadata, so a list is returned.
49+
Also it is expected that this function will raise Exceptions if something goes wrong,
50+
e.g. the content is not parsable.
51+
52+
Arguments
53+
---------
54+
content : str
55+
The dataset content to extract the metadata from.
56+
url : str
57+
The URL to extract metadata from. For some implementations it might be helpful
58+
to have this information.
59+
60+
Returns
61+
-------
62+
List[Dict]
63+
A list of dictionaries containing the extracted metadata.
64+
"""
65+
66+
@abstractmethod
67+
def raw_metadata(self, content: str) -> List[str]:
68+
"""
69+
Extracts unparsed metadata from the given content. It is expected that the content can
70+
define several sets of metadata, so a list is returned.
71+
It is expected that this function won't raise exceptions.
72+
73+
Arguments
74+
---------
75+
content : str
76+
The dataset content to extract the metadata from.
77+
78+
Returns
79+
-------
80+
List[str]
81+
A list of strings containing the unparsed metadata.
82+
"""
83+
84+
def get_metadata_or_log_error(self, content: str, url: str) -> Optional[List[Dict]]:
85+
"""
86+
Tries to extract metadata from the given content that was downloaded from the given URL.
87+
If the metadata cannot be extracted, it logs an error and returns None.
88+
89+
Arguments
90+
---------
91+
content : str
92+
The content to extract metadata from.
93+
url : str
94+
The URL associated with the content.
95+
96+
Returns
97+
-------
98+
Optional[List[Dict]]
99+
The extracted metadata if successful, None otherwise.
100+
"""
101+
with trace.get_tracer(__name__).start_as_current_span(
102+
"MetadataExtractor.get_metadata_or_log_error") as otel_span:
103+
try:
104+
metadata = self.metadata(content, url)
105+
return metadata
106+
except MetadataParseError as e:
107+
suspicious_data = ''.join(self.raw_metadata(content))
108+
otel_span.set_attribute(
109+
"FAIRagro.middleware.MetadataExtractor.suspicious_data", suspicious_data)
110+
otel_span.record_exception(e)
111+
msg = "Could not extract meta data, maybe a parsing error?"
112+
otel_span.add_event(msg)
113+
logging.exception("%s, suspicious data:\n%s", msg, suspicious_data)
114+
return None

middleware/metadata_scraper/metadata_extractor/test/test_embedded_jsonld.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import unittest
44

55
from middleware.metadata_scraper.metadata_extractor.embedded_jsonld import MetadataExtractorEmbeddedJsonld
6-
from middleware.metadata_scraper.metadata_extractor import MetadataParseError
6+
from middleware.metadata_scraper.metadata_extractor.metadata_extractor import MetadataParseError
77
from middleware.utils.test_utils import assertListofCodesEqual
88

99

middleware/metadata_scraper/metadata_extractor/test/test_jsonld.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import unittest
44

55
from middleware.metadata_scraper.metadata_extractor.jsonld import MetadataExtractorJsonld
6-
from middleware.metadata_scraper.metadata_extractor import MetadataParseError
6+
from middleware.metadata_scraper.metadata_extractor.metadata_extractor import MetadataParseError
77
from middleware.utils.test_utils import assertListofCodesEqual
88

99

0 commit comments

Comments
 (0)