2121 SitemapParseError , SitemapParser )
2222from middleware .metadata_scraper .metadata_extractor .metadata_extractor import (
2323 MetadataExtractor , MetadataParseError )
24+ from middleware .utils .tracer import traced
2425
2526
2627class MetadataScraperConfig (NamedTuple ):
@@ -49,6 +50,7 @@ class MetadataScraperConfig(NamedTuple):
4950}
5051
5152
53+ @traced
5254async def _extract_metadata (
5355 url : str ,
5456 session : HttpSession ,
@@ -70,26 +72,26 @@ async def _extract_metadata(
7072 Optional[List[Dict]]
7173 A dictionary containing the extracted metadata.
7274 """
73- with trace .get_tracer ( __name__ ). start_as_current_span (
74- "MetadataScraper._extract_metadata" ) as otel_span :
75- otel_span . set_attribute ( url_attributes . URL_FULL , url )
76- try :
77- content = await session . get_decoded_url ( url )
78- metadata = extractor . get_metadata_or_log_error ( content , url )
79- return metadata
80- except ( HttpSessionResponseError , HttpSessionDecodeError ) as e :
81- # These exceptions are raised by get_decoded_url.
82- # Treat them as errors that only relate to single datasets and
83- # skip this dataset.
84- # (Same approach as get_metadata_or_log_error performs internally
85- # when it encounters parsing errors )
86- otel_span . record_exception ( e )
87- msg = "caught recoverable exception, omitting metadataset"
88- otel_span . add_event (msg )
89- logging . exception ( msg )
90- return None
91-
92-
75+ otel_span = trace .get_current_span ()
76+ otel_span . set_attribute ( url_attributes . URL_FULL , url )
77+ try :
78+ content = await session . get_decoded_url ( url )
79+ metadata = extractor . get_metadata_or_log_error ( content , url )
80+ return metadata
81+ except ( HttpSessionResponseError , HttpSessionDecodeError ) as e :
82+ # These exceptions are raised by get_decoded_url.
83+ # Treat them as errors that only relate to single datasets and
84+ # skip this dataset.
85+ # (Same approach as get_metadata_or_log_error performs internally
86+ # when it encounters parsing errors)
87+ otel_span . record_exception ( e )
88+ msg = "caught recoverable exception, omitting metadataset"
89+ otel_span . add_event ( msg )
90+ logging . exception (msg )
91+ return None
92+
93+
94+ @ traced
9395async def _extract_many_metadata (
9496 urls : List [str ],
9597 session : HttpSession ,
@@ -114,29 +116,28 @@ async def _extract_many_metadata(
114116 include several metadata entries or none (especially in case the
115117 metadata extraction failed).
116118 """
117- with trace .get_tracer (__name__ ).start_as_current_span (
118- "MetadataScraper.extract_metadata" ) as otel_span :
119- extractors = [_extract_metadata (
120- url , session , extractor ) for url in urls ]
121- datasets = await asyncio .gather (* extractors , return_exceptions = True )
122- for dataset in datasets :
123- if isinstance (dataset , Exception ):
124- otel_span .record_exception (dataset )
125- msg = "caught unrecoverable exception, omitting all metadata of RDI"
126- otel_span .add_event (msg )
127- logging .exception (msg )
128- return None , SKIP_RDI_REPORT
129-
130- filtered_datasets = (m for m in datasets if isinstance (m , list ))
131- result = list (itertools .chain .from_iterable (filtered_datasets ))
132- report = {
133- 'valid_entries' : len (result ),
134- 'failed_entries' : len (datasets )- len (result ),
135- 'skipped' : False
136- }
137- return result , report
119+ otel_span = trace .get_current_span ()
120+ extractors = [_extract_metadata (url , session , extractor ) for url in urls ]
121+ datasets = await asyncio .gather (* extractors , return_exceptions = True )
122+ for dataset in datasets :
123+ if isinstance (dataset , Exception ):
124+ otel_span .record_exception (dataset )
125+ msg = "caught unrecoverable exception, omitting all metadata of RDI"
126+ otel_span .add_event (msg )
127+ logging .exception (msg )
128+ return None , SKIP_RDI_REPORT
129+
130+ filtered_datasets = (m for m in datasets if isinstance (m , list ))
131+ result = list (itertools .chain .from_iterable (filtered_datasets ))
132+ report = {
133+ 'valid_entries' : len (result ),
134+ 'failed_entries' : len (datasets )- len (result ),
135+ 'skipped' : False
136+ }
137+ return result , report
138138
139139
140+ @traced
140141async def scrape_repo (
141142 config : MetadataScraperConfig ,
142143 default_session_config : HttpSessionConfig ) -> Tuple [Optional [List [Dict ]], Dict ]:
@@ -156,34 +157,33 @@ async def scrape_repo(
156157 The extracted metadata in terms of Python dictionaries.
157158 """
158159
159- with trace .get_tracer (__name__ ).start_as_current_span (
160- "MetadataScraper.scrape_repo" ) as otel_span :
161- otel_span .set_attribute (
162- "FAIRagro.middleware.MetadataScraper.repository_name" , config .name )
163- otel_span .set_attribute (
164- "FAIRagro.middleware.MetadataScraper.repository_sitemap_url" , config .url )
165- try :
166- if config .http_client :
167- http_session_config = HttpSessionConfig (** config .http_client )
168- else :
169- http_session_config = default_session_config
170- async with HttpSession (http_session_config ) as session :
171- sitemap_content = await session .get_decoded_url (config .url )
172- parser = SitemapParser .create_instance (
173- config .sitemap , sitemap_content )
174- if parser .has_metadata :
175- return parser .metadata
176-
177- urls = list (parser .datasets )
178- if config .metadata :
179- extractor = MetadataExtractor .create_instance (config .metadata )
180- metadata , report = await _extract_many_metadata (urls , session , extractor )
181- return metadata , report
182-
183- except (HttpSessionFetchError , SitemapParseError , MetadataParseError ) as e :
184- otel_span .record_exception (e )
185- msg = "Could not download or parse RDI sitemap, skipping RDI"
186- otel_span .add_event (msg )
187- logging .exception (msg )
188-
189- return None , SKIP_RDI_REPORT
160+ otel_span = trace .get_current_span ()
161+ otel_span .set_attribute (
162+ "FAIRagro.middleware.MetadataScraper.repository_name" , config .name )
163+ otel_span .set_attribute (
164+ "FAIRagro.middleware.MetadataScraper.repository_sitemap_url" , config .url )
165+ try :
166+ if config .http_client :
167+ http_session_config = HttpSessionConfig (** config .http_client )
168+ else :
169+ http_session_config = default_session_config
170+ async with HttpSession (http_session_config ) as session :
171+ sitemap_content = await session .get_decoded_url (config .url )
172+ parser = SitemapParser .create_instance (
173+ config .sitemap , sitemap_content )
174+ if parser .has_metadata :
175+ return parser .metadata
176+
177+ urls = list (parser .datasets )
178+ if config .metadata :
179+ extractor = MetadataExtractor .create_instance (config .metadata )
180+ metadata , report = await _extract_many_metadata (urls , session , extractor )
181+ return metadata , report
182+
183+ except (HttpSessionFetchError , SitemapParseError , MetadataParseError ) as e :
184+ otel_span .record_exception (e )
185+ msg = "Could not download or parse RDI sitemap, skipping RDI"
186+ otel_span .add_event (msg )
187+ logging .exception (msg )
188+
189+ return None , SKIP_RDI_REPORT
0 commit comments