fairagro
diff --git a/middleware/http_session/__init__.py b/middleware/http_session/__init__.py
Lines changed: 63 additions & 61 deletions
diff --git a/middleware/main.py b/middleware/main.py
Lines changed: 26 additions & 27 deletions
diff --git a/middleware/metadata_scraper/__init__.py b/middleware/metadata_scraper/__init__.py
Lines changed: 72 additions & 72 deletions
@@ -14,6 +14,8 @@
 import aiofiles
 import chardet
 
+from middleware.utils.tracer import traced
+
 
 class HttpSessionConfig(NamedTuple):
     """
@@ -117,6 +119,7 @@ async def __aexit__(
         """
         return await super().__aexit__(exc_type, exc_val, exc_tb)
 
+    @traced
     async def get_decoded_url(self, url: str) -> str:
         """
         Fetches the content of the given URL and decodes it using the detected encoding.
@@ -132,70 +135,69 @@ async def get_decoded_url(self, url: str) -> str:
                 The decoded content of the URL.
         """
 
-        with trace.get_tracer(__name__).start_as_current_span(
-                "HttpSession.get_decoded_url") as otel_span:
-            otel_span.set_attribute(url_attributes.URL_FULL, url)
-
-            parsed_url = urlparse(url)  # does not raise
-            if parsed_url.scheme in ["http", "https"]:
-                try:
-                    async with self.get(url) as response:
-                        # treat 5xx like a technical network error
-                        if 500 <= response.status < 600:
-                            otel_span.add_event(
-                                "server reponse 5xx, raising HttpSessionTechnicalError")
-                            raise HttpSessionTechnicalError(
-                                f"Server error {response.status} for {url}"
-                            )
-                        # treat 4xx as response error
-                        if 400 <= response.status < 500:
-                            otel_span.add_event(
-                                "server reponse 4xx, raising HttpSessionResponseError")
-                            raise HttpSessionResponseError(
-                                f"Server error {response.status} for {url}"
-                            )
-                        encoded_content = await response.read()
-                except (ClientError, asyncio.TimeoutError) as e:
-                    otel_span.record_exception(e)
-                    otel_span.add_event(
-                        "caught network-related exception, raising HttpSessionTechnicalError")
-                    raise HttpSessionTechnicalError(
-                        f"Cannot fetch {url}: {e}") from e
-            elif parsed_url.scheme == "file":
-                try:
-                    # We need to deal with the following situation:
-                    # urlparse('file://test') => netloc = 'test', path = '', joined = 'test'
-                    # urlparse('file:///test') => netloc = '', path = '/test', joined = '\test'
-                    # urlparse('file://./test') => netloc = '.', path = '/test', joined = '\test'
-                    # In the last case the path is relative, so the result is wrong. Thus this code:
-                    base_path = PurePath(parsed_url.netloc)
-                    if base_path == PurePath('.'):
-                        path = base_path / parsed_url.path.lstrip("/").lstrip("\\")
-                    else:
-                        path = base_path / parsed_url.path
-                    async with aiofiles.open(path, 'rb') as f:
-                        encoded_content = await f.read()
-                except Exception as e:
-                    otel_span.record_exception(e)
-                    otel_span.add_event(
-                        "caught exception when trying to read file, "
-                        "raising HttpSessionResponseError")
-                    raise HttpSessionResponseError(
-                        f"Cannot read file {url}: {e}") from e
-            else:
-                otel_span.add_event(
-                    "found unsupported URL protocol, raising HttpSessionArgumentError")
-                raise HttpSessionArgumentError(
-                    f"Unsupported URL scheme: {parsed_url.scheme} in URL {url}")
+        otel_span = trace.get_current_span()
+        otel_span.set_attribute(url_attributes.URL_FULL, url)
 
+        parsed_url = urlparse(url)  # does not raise
+        if parsed_url.scheme in ["http", "https"]:
+            try:
+                async with self.get(url) as response:
+                    # treat 5xx like a technical network error
+                    if 500 <= response.status < 600:
+                        otel_span.add_event(
+                            "server reponse 5xx, raising HttpSessionTechnicalError")
+                        raise HttpSessionTechnicalError(
+                            f"Server error {response.status} for {url}"
+                        )
+                    # treat 4xx as response error
+                    if 400 <= response.status < 500:
+                        otel_span.add_event(
+                            "server reponse 4xx, raising HttpSessionResponseError")
+                        raise HttpSessionResponseError(
+                            f"Server error {response.status} for {url}"
+                        )
+                    encoded_content = await response.read()
+            except (ClientError, asyncio.TimeoutError) as e:
+                otel_span.record_exception(e)
+                otel_span.add_event(
+                    "caught network-related exception, raising HttpSessionTechnicalError")
+                raise HttpSessionTechnicalError(
+                    f"Cannot fetch {url}: {e}") from e
+        elif parsed_url.scheme == "file":
             try:
-                encoding = str(chardet.detect(encoded_content)['encoding']) or 'utf-8'
-                content = encoded_content.decode(encoding)
+                # We need to deal with the following situation:
+                # urlparse('file://test') => netloc = 'test', path = '', joined = 'test'
+                # urlparse('file:///test') => netloc = '', path = '/test', joined = '\test'
+                # urlparse('file://./test') => netloc = '.', path = '/test', joined = '\test'
+                # In the last case the path is relative, so the result is wrong. Thus this code:
+                base_path = PurePath(parsed_url.netloc)
+                if base_path == PurePath('.'):
+                    path = base_path / parsed_url.path.lstrip("/").lstrip("\\")
+                else:
+                    path = base_path / parsed_url.path
+                async with aiofiles.open(path, 'rb') as f:
+                    encoded_content = await f.read()
             except Exception as e:
                 otel_span.record_exception(e)
                 otel_span.add_event(
-                    "caught exception during decoding, raising HttpSessionDecodeError")
-                raise HttpSessionDecodeError(
-                    f"cannot decode URL content from {url}: {e}") from e
+                    "caught exception when trying to read file, "
+                    "raising HttpSessionResponseError")
+                raise HttpSessionResponseError(
+                    f"Cannot read file {url}: {e}") from e
+        else:
+            otel_span.add_event(
+                "found unsupported URL protocol, raising HttpSessionArgumentError")
+            raise HttpSessionArgumentError(
+                f"Unsupported URL scheme: {parsed_url.scheme} in URL {url}")
+
+        try:
+            encoding = str(chardet.detect(encoded_content)['encoding']) or 'utf-8'
+            content = encoded_content.decode(encoding)
+        except Exception as e:
+            otel_span.record_exception(e)
+            otel_span.add_event(
+                "caught exception during decoding, raising HttpSessionDecodeError")
+            raise HttpSessionDecodeError(
+                f"cannot decode URL content from {url}: {e}") from e
 
-            return content
+        return content
@@ -31,9 +31,7 @@
 from middleware.git_repo import GitRepo, GitRepoConfig
 from middleware.http_session import HttpSessionConfig
 from middleware.metadata_scraper import MetadataScraperConfig, scrape_repo
-
-# add the script directory to the python module path
-sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+from middleware.utils.tracer import traced
 
 # Disable pylint warning that imports are not on top. But we need to adapt the import path before.
 # Is there another solution so packages next to the main script can be found?
@@ -372,37 +370,38 @@ async def process_sitemap(sitemap, local_path, default_http_config, git_repo):
     return repo_reports
 
 
+@traced
 async def main():
     """
     The main async function of the basic middleware
     """
 
     args, config = setup_and_config()
 
-    with trace.get_tracer(__name__).start_as_current_span("main") as otel_span:
-        try:
-            git_repo, local_path = await setup_repo(args, config)
-            default_http_config = HttpSessionConfig(**config["http_client"])
-
-            full_report = []
-            for sitemap in config["sitemaps"]:
-                repo_reports = await process_sitemap(
-                    sitemap, local_path, default_http_config, git_repo
-                )
-                full_report.extend(repo_reports)
-
-            if git_repo:
-                git_repo.push()
-
-            print(json.dumps(full_report, indent=2, ensure_ascii=False, sort_keys=True))
-
-        # pylint: disable-next=broad-except
-        except Exception as e:
-            otel_span.record_exception(e)
-            msg = "Error when scraping repositories"
-            otel_span.add_event(msg)
-            logging.exception(msg)
-            sys.exit(1)
+    try:
+        git_repo, local_path = await setup_repo(args, config)
+        default_http_config = HttpSessionConfig(**config["http_client"])
+
+        full_report = []
+        for sitemap in config["sitemaps"]:
+            repo_reports = await process_sitemap(
+                sitemap, local_path, default_http_config, git_repo
+            )
+            full_report.extend(repo_reports)
+
+        if git_repo:
+            git_repo.push()
+
+        print(json.dumps(full_report, indent=2, ensure_ascii=False, sort_keys=True))
+
+    # pylint: disable-next=broad-except
+    except Exception as e:
+        otel_span = trace.get_current_span()
+        otel_span.record_exception(e)
+        msg = "Error when scraping repositories"
+        otel_span.add_event(msg)
+        logging.exception(msg)
+        sys.exit(1)
 
 
 if __name__ == "__main__":
 
@@ -21,6 +21,7 @@
     SitemapParseError, SitemapParser)
 from middleware.metadata_scraper.metadata_extractor.metadata_extractor import (
     MetadataExtractor, MetadataParseError)
+from middleware.utils.tracer import traced
 
 
 class MetadataScraperConfig(NamedTuple):
@@ -49,6 +50,7 @@ class MetadataScraperConfig(NamedTuple):
 }
 
 
+@traced
 async def _extract_metadata(
         url: str,
         session: HttpSession,
@@ -70,26 +72,26 @@ async def _extract_metadata(
     Optional[List[Dict]]
         A dictionary containing the extracted metadata.
     """
-    with trace.get_tracer(__name__).start_as_current_span(
-            "MetadataScraper._extract_metadata") as otel_span:
-        otel_span.set_attribute(url_attributes.URL_FULL, url)
-        try:
-            content = await session.get_decoded_url(url)
-            metadata = extractor.get_metadata_or_log_error(content, url)
-            return metadata
-        except (HttpSessionResponseError, HttpSessionDecodeError) as e:
-            # These exceptions are raised by get_decoded_url.
-            # Treat them as errors that only relate to single datasets and
-            # skip this dataset.
-            # (Same approach as get_metadata_or_log_error performs internally
-            # when it encounters parsing errors)
-            otel_span.record_exception(e)
-            msg = "caught recoverable exception, omitting metadataset"
-            otel_span.add_event(msg)
-            logging.exception(msg)
-            return None
-
-
+    otel_span = trace.get_current_span()
+    otel_span.set_attribute(url_attributes.URL_FULL, url)
+    try:
+        content = await session.get_decoded_url(url)
+        metadata = extractor.get_metadata_or_log_error(content, url)
+        return metadata
+    except (HttpSessionResponseError, HttpSessionDecodeError) as e:
+        # These exceptions are raised by get_decoded_url.
+        # Treat them as errors that only relate to single datasets and
+        # skip this dataset.
+        # (Same approach as get_metadata_or_log_error performs internally
+        # when it encounters parsing errors)
+        otel_span.record_exception(e)
+        msg = "caught recoverable exception, omitting metadataset"
+        otel_span.add_event(msg)
+        logging.exception(msg)
+        return None
+
+
+@traced
 async def _extract_many_metadata(
         urls: List[str],
         session: HttpSession,
@@ -114,29 +116,28 @@ async def _extract_many_metadata(
             include several metadata entries or none (especially in case the
             metadata extraction failed).
     """
-    with trace.get_tracer(__name__).start_as_current_span(
-            "MetadataScraper.extract_metadata") as otel_span:
-        extractors = [_extract_metadata(
-            url, session, extractor) for url in urls]
-        datasets = await asyncio.gather(*extractors, return_exceptions=True)
-        for dataset in datasets:
-            if isinstance(dataset, Exception):
-                otel_span.record_exception(dataset)
-                msg = "caught unrecoverable exception, omitting all metadata of RDI"
-                otel_span.add_event(msg)
-                logging.exception(msg)
-                return None, SKIP_RDI_REPORT
-
-        filtered_datasets = (m for m in datasets if isinstance(m, list))
-        result = list(itertools.chain.from_iterable(filtered_datasets))
-        report = {
-            'valid_entries': len(result),
-            'failed_entries': len(datasets)-len(result),
-            'skipped': False
-        }
-        return result, report
+    otel_span = trace.get_current_span()
+    extractors = [_extract_metadata(url, session, extractor) for url in urls]
+    datasets = await asyncio.gather(*extractors, return_exceptions=True)
+    for dataset in datasets:
+        if isinstance(dataset, Exception):
+            otel_span.record_exception(dataset)
+            msg = "caught unrecoverable exception, omitting all metadata of RDI"
+            otel_span.add_event(msg)
+            logging.exception(msg)
+            return None, SKIP_RDI_REPORT
+
+    filtered_datasets = (m for m in datasets if isinstance(m, list))
+    result = list(itertools.chain.from_iterable(filtered_datasets))
+    report = {
+        'valid_entries': len(result),
+        'failed_entries': len(datasets)-len(result),
+        'skipped': False
+    }
+    return result, report
 
 
+@traced
 async def scrape_repo(
         config: MetadataScraperConfig,
         default_session_config: HttpSessionConfig) -> Tuple[Optional[List[Dict]], Dict]:
@@ -156,34 +157,33 @@ async def scrape_repo(
             The extracted metadata in terms of Python dictionaries.
     """
 
-    with trace.get_tracer(__name__).start_as_current_span(
-            "MetadataScraper.scrape_repo") as otel_span:
-        otel_span.set_attribute(
-            "FAIRagro.middleware.MetadataScraper.repository_name", config.name)
-        otel_span.set_attribute(
-            "FAIRagro.middleware.MetadataScraper.repository_sitemap_url", config.url)
-        try:
-            if config.http_client:
-                http_session_config = HttpSessionConfig(**config.http_client)
-            else:
-                http_session_config = default_session_config
-            async with HttpSession(http_session_config) as session:
-                sitemap_content = await session.get_decoded_url(config.url)
-                parser = SitemapParser.create_instance(
-                    config.sitemap, sitemap_content)
-                if parser.has_metadata:
-                    return parser.metadata
-
-                urls = list(parser.datasets)
-                if config.metadata:
-                    extractor = MetadataExtractor.create_instance(config.metadata)
-                    metadata, report = await _extract_many_metadata(urls, session, extractor)
-                    return metadata, report
-
-        except (HttpSessionFetchError, SitemapParseError, MetadataParseError) as e:
-            otel_span.record_exception(e)
-            msg = "Could not download or parse RDI sitemap, skipping RDI"
-            otel_span.add_event(msg)
-            logging.exception(msg)
-
-        return None, SKIP_RDI_REPORT
+    otel_span = trace.get_current_span()
+    otel_span.set_attribute(
+        "FAIRagro.middleware.MetadataScraper.repository_name", config.name)
+    otel_span.set_attribute(
+        "FAIRagro.middleware.MetadataScraper.repository_sitemap_url", config.url)
+    try:
+        if config.http_client:
+            http_session_config = HttpSessionConfig(**config.http_client)
+        else:
+            http_session_config = default_session_config
+        async with HttpSession(http_session_config) as session:
+            sitemap_content = await session.get_decoded_url(config.url)
+            parser = SitemapParser.create_instance(
+                config.sitemap, sitemap_content)
+            if parser.has_metadata:
+                return parser.metadata
+
+            urls = list(parser.datasets)
+            if config.metadata:
+                extractor = MetadataExtractor.create_instance(config.metadata)
+                metadata, report = await _extract_many_metadata(urls, session, extractor)
+                return metadata, report
+
+    except (HttpSessionFetchError, SitemapParseError, MetadataParseError) as e:
+        otel_span.record_exception(e)
+        msg = "Could not download or parse RDI sitemap, skipping RDI"
+        otel_span.add_event(msg)
+        logging.exception(msg)
+
+    return None, SKIP_RDI_REPORT