Skip to content

Commit 75e5e25

Browse files
committed
refactor: streamline LangchainSummarizer and implement DefaultConfluenceExtractor with improved extraction logic
1 parent 4e2c334 commit 75e5e25

2 files changed

Lines changed: 53 additions & 93 deletions

File tree

libs/admin-api-lib/src/admin_api_lib/impl/summarizer/langchain_summarizer.py

Lines changed: 1 addition & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
"""Module for the LangchainSummarizer class."""
22

3-
import asyncio
43
import logging
54
import traceback
65
from typing import Optional
@@ -28,8 +27,6 @@ class LangchainSummarizer(Summarizer):
2827
document and retries the summarization process if an error occurs.
2928
"""
3029

31-
RETRY_WAIT_TIME = 10
32-
3330
def __init__(
3431
self,
3532
langfuse_manager: LangfuseManager,
@@ -87,12 +84,7 @@ async def ainvoke(self, query: SummarizerInput, config: Optional[RunnableConfig]
8784
except Exception as e:
8885
logger.error("Error in summarizing langchain doc: %s %s", e, traceback.format_exc())
8986
config["tries_remaining"] = tries_remaining - 1
90-
if "rate limit" in str(e).lower() or "ratelimit" in str(e).lower():
91-
logger.warning(
92-
"Rate limit encountered, waiting %d seconds before retry...", self.RETRY_WAIT_TIME
93-
)
94-
await asyncio.sleep(self.RETRY_WAIT_TIME)
95-
result = await self.ainvoke(query, config)
87+
result = await self._create_chain().ainvoke({"text": langchain_document.page_content}, config)
9688
# Extract content from AIMessage if it's not already a string
9789
content = result.content if hasattr(result, "content") else str(result)
9890
outputs.append(content)
Lines changed: 52 additions & 84 deletions
Original file line number | Diff line number | Diff line change
@@ -1,104 +1,72 @@
1-
"""Module for the LangchainSummarizer class."""
1+
"""Module for the DefaultConfluenceExtractor class."""
22

33
import logging
4-
import traceback
5-
from typing import Optional
6-
7-
from langchain.text_splitter import RecursiveCharacterTextSplitter
8-
from langchain_core.documents import Document
9-
from langchain_core.runnables import Runnable, RunnableConfig, ensure_config
10-
11-
from admin_api_lib.summarizer.summarizer import (
12-
Summarizer,
13-
SummarizerInput,
14-
SummarizerOutput,
4+
from langchain_community.document_loaders import ConfluenceLoader
5+
6+
from extractor_api_lib.impl.types.extractor_types import ExtractorTypes
7+
from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece
8+
from extractor_api_lib.models.extraction_parameters import ExtractionParameters
9+
from extractor_api_lib.extractors.information_extractor import InformationExtractor
10+
from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import (
11+
ConfluenceLangchainDocument2InformationPiece,
1512
)
16-
from rag_core_lib.impl.langfuse_manager.langfuse_manager import LangfuseManager
17-
from rag_core_lib.impl.utils.async_threadsafe_semaphore import AsyncThreadsafeSemaphore
1813

1914
logger = logging.getLogger(__name__)
2015

2116

22-
class LangchainSummarizer(Summarizer):
23-
"""Is responsible for summarizing input data.
24-
25-
LangchainSummarizer is responsible for summarizing input data using the LangfuseManager,
26-
RecursiveCharacterTextSplitter, and AsyncThreadsafeSemaphore. It handles chunking of the input
27-
document and retries the summarization process if an error occurs.
28-
"""
17+
class ConfluenceExtractor(InformationExtractor):
18+
"""Implementation of the InformationExtractor interface for confluence."""
2919

3020
def __init__(
3121
self,
32-
langfuse_manager: LangfuseManager,
33-
chunker: RecursiveCharacterTextSplitter,
34-
semaphore: AsyncThreadsafeSemaphore,
22+
mapper: ConfluenceLangchainDocument2InformationPiece,
3523
):
36-
self._chunker = chunker
37-
self._langfuse_manager = langfuse_manager
38-
self._semaphore = semaphore
39-
40-
async def ainvoke(self, query: SummarizerInput, config: Optional[RunnableConfig] = None) -> SummarizerOutput:
4124
"""
42-
Asynchronously invokes the summarization process on the given query.
25+
Initialize the ConfluenceExtractor.
4326
4427
Parameters
4528
----------
46-
query : SummarizerInput
47-
The input data to be summarized.
48-
config : Optional[RunnableConfig], optional
49-
Configuration options for the summarization process, by default None.
50-
51-
Returns
52-
-------
53-
SummarizerOutput
54-
The summarized output.
55-
56-
Raises
57-
------
58-
Exception
59-
If the summary creation fails after the allowed number of tries.
60-
61-
Notes
62-
-----
63-
This method handles chunking of the input document and retries the summarization
64-
process if an error occurs, up to the number of tries specified in the config.
29+
mapper : ConfluenceLangchainDocument2InformationPiece
30+
An instance of ConfluenceLangchainDocument2InformationPiece used for mapping langchain documents
31+
to information pieces.
6532
"""
66-
assert query, "Query is empty: %s" % query # noqa S101
67-
config = ensure_config(config)
68-
tries_remaining = config.get("configurable", {}).get("tries_remaining", 3)
69-
logger.debug("Tries remaining %d" % tries_remaining)
33+
self._mapper = mapper
7034

71-
if tries_remaining < 0:
72-
raise Exception("Summary creation failed.")
73-
document = Document(page_content=query)
74-
langchain_documents = self._chunker.split_documents([document])
35+
@property
36+
def extractor_type(self) -> ExtractorTypes:
37+
return ExtractorTypes.CONFLUENCE
7538

76-
outputs = []
77-
for langchain_document in langchain_documents:
78-
async with self._semaphore:
79-
try:
80-
result = await self._create_chain().ainvoke({"text": langchain_document.page_content}, config)
81-
# Extract content from AIMessage if it's not already a string
82-
content = result.content if hasattr(result, "content") else str(result)
83-
outputs.append(content)
84-
except Exception as e:
85-
logger.error("Error in summarizing langchain doc: %s %s", e, traceback.format_exc())
86-
config["tries_remaining"] = tries_remaining - 1
87-
result = await self._create_chain().ainvoke({"text": langchain_document.page_content}, config)
88-
# Extract content from AIMessage if it's not already a string
89-
content = result.content if hasattr(result, "content") else str(result)
90-
outputs.append(content)
39+
async def aextract_content(
40+
self,
41+
extraction_parameters: ExtractionParameters,
42+
) -> list[InternalInformationPiece]:
43+
"""
44+
Asynchronously extracts information pieces from Confluence.
9145
92-
if len(outputs) == 1:
93-
return outputs[0]
94-
summary = " ".join(outputs)
95-
logger.debug(
96-
"Reduced number of chars from %d to %d"
97-
% (len("".join([x.page_content for x in langchain_documents])), len(summary))
98-
)
99-
return await self.ainvoke(summary, config)
46+
Parameters
47+
----------
48+
extraction_parameters : ExtractionParameters
49+
The parameters required to connect to and extract data from Confluence.
10050
101-
def _create_chain(self) -> Runnable:
102-
return self._langfuse_manager.get_base_prompt(self.__class__.__name__) | self._langfuse_manager.get_base_llm(
103-
self.__class__.__name__
104-
)
51+
Returns
52+
-------
53+
list[InternalInformationPiece]
54+
A list of information pieces extracted from Confluence.
55+
"""
56+
# Convert list of key value pairs to dict
57+
confluence_loader_parameters = {
58+
x.key: int(x.value) if x.value.isdigit() else x.value for x in extraction_parameters.kwargs
59+
}
60+
if not confluence_loader_parameters.get("max_pages") or isinstance(
61+
confluence_loader_parameters.get("max_pages"), str
62+
):
63+
logging.warning(
64+
"max_pages parameter is not set or invalid discarding it. ConfluenceLoader will use default value."
65+
)
66+
confluence_loader_parameters.pop("max_pages")
67+
# Drop the document_name parameter as it is not used by the ConfluenceLoader
68+
if "document_name" in confluence_loader_parameters:
69+
confluence_loader_parameters.pop("document_name", None)
70+
document_loader = ConfluenceLoader(**confluence_loader_parameters)
71+
documents = document_loader.load()
72+
return [self._mapper.map_document2informationpiece(x, extraction_parameters.document_name) for x in documents]

0 commit comments

Comments (0)