|
1 | 1 | """ |
2 | | -This module defines the abstract base class 'MetadataExtractor' |
| 2 | +Metadata scraper package. |
| 3 | +Register available implementations of SitemapParser and MetadataExtractor. |
3 | 4 | """ |
4 | 5 |
|
5 | | -from abc import abstractmethod |
6 | | -from typing import Dict, List, Optional |
7 | | -import logging |
8 | | -from opentelemetry import trace |
| 6 | +from .embedded_jsonld import MetadataExtractorEmbeddedJsonld |
| 7 | +from .jsonld import MetadataExtractorJsonld |
9 | 8 |
|
10 | | -from middleware.utils.registering_abc import RegisteringABC |
11 | | - |
12 | | - |
13 | | -class MetadataParseError(RuntimeError): |
14 | | - """ |
15 | | - An exception of this type will be thrown by implementations of MetadataExtractor |
16 | | - if the content cannot be parsed. |
17 | | - """ |
18 | | - |
19 | | - def __init__(self, inner_stuff: Exception | str) -> None: |
20 | | - super().__init__(f"Failed to parse metadata: {str(inner_stuff)}") |
21 | | - self.inner_stuff = inner_stuff |
22 | | - |
23 | | - |
24 | | -class MetadataExtractor(RegisteringABC): |
25 | | - """ |
26 | | - An abstract base class for metadata extractors (aka parsers). |
27 | | - It will abstract away how the metadata is embedded in a webpage. |
28 | | -
|
29 | | - Methods |
30 | | - ------- |
31 | | - metadata(content, url) |
32 | | - An abstract method that is expected to extract metadata from the given content in |
33 | | - terms of a nested Dict/List structure. It may throw exceptions -- of type |
34 | | - 'MetadataParseError'. |
35 | | - raw_metadata(content) |
36 | | - An abstract method that is expected to extract metadata from the given content in |
37 | | - terms of a list of strings. It is expected not to raise exceptions of type |
38 | | - 'MetadataParseError'. |
39 | | - get_metadata_or_log_error(content, url) |
40 | | - A wrapper method around 'metadata' that catches 'MetadataParseError' exceptions and |
41 | | - logs them. |
42 | | - """ |
43 | | - |
44 | | - @abstractmethod |
45 | | - def metadata(self, content: str, url: str) -> List[Dict]: |
46 | | - """ |
47 | | - Extracts metadata from the given content. It is expected that the content can define |
48 | | - several sets of metadata, so a list is returned. |
49 | | - Also it is expected that this function will raise Exceptions if something goes wrong, |
50 | | - e.g. the content is not parsable. |
51 | | -
|
52 | | - Arguments |
53 | | - --------- |
54 | | - content : str |
55 | | - The dataset content to extract the metadata from. |
56 | | - url : str |
57 | | - The URL to extract metadata from. For some implementations it might be helpful |
58 | | - to have this information. |
59 | | -
|
60 | | - Returns |
61 | | - ------- |
62 | | - List[Dict] |
63 | | - A list of dictionaries containing the extracted metadata. |
64 | | - """ |
65 | | - |
66 | | - @abstractmethod |
67 | | - def raw_metadata(self, content: str) -> List[str]: |
68 | | - """ |
69 | | - Extracts unparsed metadata from the given content. It is expected that the content can |
70 | | - define several sets of metadata, so a list is returned. |
71 | | - It is expected that this function won't raise exceptions. |
72 | | -
|
73 | | - Arguments |
74 | | - --------- |
75 | | - content : str |
76 | | - The dataset content to extract the metadata from. |
77 | | -
|
78 | | - Returns |
79 | | - ------- |
80 | | - List[str] |
81 | | - A list of strings containing the unparsed metadata. |
82 | | - """ |
83 | | - |
84 | | - def get_metadata_or_log_error(self, content: str, url: str) -> Optional[List[Dict]]: |
85 | | - """ |
86 | | - Tries to extract metadata from the given content that was downloaded from the given URL. |
87 | | - If the metadata cannot be extracted, it logs an error and returns None. |
88 | | - |
89 | | - Arguments |
90 | | - --------- |
91 | | - content : str |
92 | | - The content to extract metadata from. |
93 | | - url : str |
94 | | - The URL associated with the content. |
95 | | - |
96 | | - Returns |
97 | | - ------- |
98 | | - Optional[List[Dict]] |
99 | | - The extracted metadata if successful, None otherwise. |
100 | | - """ |
101 | | - with trace.get_tracer(__name__).start_as_current_span( |
102 | | - "MetadataExtractor.get_metadata_or_log_error") as otel_span: |
103 | | - try: |
104 | | - metadata = self.metadata(content, url) |
105 | | - return metadata |
106 | | - except MetadataParseError as e: |
107 | | - suspicious_data = ''.join(self.raw_metadata(content)) |
108 | | - otel_span.set_attribute( |
109 | | - "FAIRagro.middleware.MetadataExtractor.suspicious_data", suspicious_data) |
110 | | - otel_span.record_exception(e) |
111 | | - msg = "Could not extract meta data, maybe a parsing error?" |
112 | | - otel_span.add_event(msg) |
113 | | - logging.exception("%s, suspicious data:\n%s", msg, suspicious_data) |
114 | | - return None |
115 | | - |
116 | | -from .embedded_jsonld import MetadataExtractorEmbeddedJsonld # noqa: E402, F401 |
117 | | -from .jsonld import MetadataExtractorJsonld # noqa: E402, F401 |
| 9 | +MetadataExtractorEmbeddedJsonld.register_implementation('embedded_jsonld') |
| 10 | +MetadataExtractorJsonld.register_implementation('jsonld') |
0 commit comments