Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ ARG DEBIAN_FRONTEND=noninteractive
ARG USER=vscode

RUN DEBIAN_FRONTEND=noninteractive \
&& apt-get update \
&& apt-get update \
&& apt-get install -y build-essential --no-install-recommends make \
ca-certificates \
git \
Expand All @@ -27,7 +27,7 @@ RUN DEBIAN_FRONTEND=noninteractive \
# Python and poetry installation
USER $USER
ARG HOME="/home/$USER"
ARG PYTHON_VERSION=3.11
ARG PYTHON_VERSION=3.13

ENV PYENV_ROOT="${HOME}/.pyenv"
ENV PATH="${PYENV_ROOT}/shims:${PYENV_ROOT}/bin:${HOME}/.local/bin:$PATH"
Expand All @@ -40,4 +40,4 @@ RUN echo "done 0" \
&& pyenv global ${PYTHON_VERSION} \
&& echo "done 3" \
&& curl -sSL https://install.python-poetry.org | python3 - \
&& poetry config virtualenvs.in-project true
&& poetry config virtualenvs.in-project true
19 changes: 18 additions & 1 deletion libs/extractor-api-lib/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions libs/extractor-api-lib/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ markdownify = "^1.1.0"
langchain-core = "0.3.63"
camelot-py = {extras = ["cv"], version = "^1.0.0"}
fake-useragent = "^2.2.0"
pypandoc-binary = "^1.15"

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.5"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,21 @@
from dependency_injector.containers import DeclarativeContainer
from dependency_injector.providers import Factory, List, Singleton # noqa: WOT001

from extractor_api_lib.impl.api_endpoints.general_source_extractor import GeneralSourceExtractor
from extractor_api_lib.impl.api_endpoints.general_file_extractor import (
GeneralFileExtractor,
)
from extractor_api_lib.impl.api_endpoints.general_source_extractor import (
GeneralSourceExtractor,
)
from extractor_api_lib.impl.extractors.confluence_extractor import ConfluenceExtractor
from extractor_api_lib.impl.extractors.file_extractors.ms_docs_extractor import MSDocsExtractor
from extractor_api_lib.impl.extractors.file_extractors.epub_extractor import (
EpubExtractor,
)
from extractor_api_lib.impl.extractors.file_extractors.ms_docs_extractor import (
MSDocsExtractor,
)
from extractor_api_lib.impl.extractors.file_extractors.pdf_extractor import PDFExtractor
from extractor_api_lib.impl.extractors.file_extractors.xml_extractor import XMLExtractor
from extractor_api_lib.impl.api_endpoints.general_file_extractor import GeneralFileExtractor
from extractor_api_lib.impl.extractors.sitemap_extractor import SitemapExtractor
from extractor_api_lib.impl.file_services.s3_service import S3Service
from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import (
Expand All @@ -17,7 +26,12 @@
from extractor_api_lib.impl.mapper.internal2external_information_piece import (
Internal2ExternalInformationPiece,
)
from extractor_api_lib.impl.mapper.sitemap_document2information_piece import SitemapLangchainDocument2InformationPiece
from extractor_api_lib.impl.mapper.langchain_document2information_piece import (
LangchainDocument2InformationPiece,
)
from extractor_api_lib.impl.mapper.sitemap_document2information_piece import (
SitemapLangchainDocument2InformationPiece,
)
from extractor_api_lib.impl.settings.pdf_extractor_settings import PDFExtractorSettings
from extractor_api_lib.impl.settings.s3_settings import S3Settings
from extractor_api_lib.impl.table_converter.dataframe2markdown import DataFrame2Markdown
Expand All @@ -44,12 +58,15 @@ class DependencyContainer(DeclarativeContainer):
xml_extractor = Singleton(XMLExtractor, file_service)

intern2external = Singleton(Internal2ExternalInformationPiece)
langchain_document2information_piece = Singleton(ConfluenceLangchainDocument2InformationPiece)
confluence_langchain_document2information_piece = Singleton(ConfluenceLangchainDocument2InformationPiece)
langchain_document2information_piece = Singleton(LangchainDocument2InformationPiece)
sitemap_document2information_piece = Singleton(SitemapLangchainDocument2InformationPiece)
file_extractors = List(pdf_extractor, ms_docs_extractor, xml_extractor)
epub_extractor = Singleton(EpubExtractor, file_service, langchain_document2information_piece)

file_extractors = List(pdf_extractor, ms_docs_extractor, xml_extractor, epub_extractor)

general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors, intern2external)
confluence_extractor = Singleton(ConfluenceExtractor, mapper=langchain_document2information_piece)
confluence_extractor = Singleton(ConfluenceExtractor, mapper=confluence_langchain_document2information_piece)

sitemap_extractor = Singleton(
SitemapExtractor,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""Module containing the EpubExtractor class."""

import logging
from pathlib import Path

from langchain_community.document_loaders import UnstructuredEPubLoader

from extractor_api_lib.extractors.information_file_extractor import (
InformationFileExtractor,
)
from extractor_api_lib.file_services.file_service import FileService
from extractor_api_lib.impl.mapper.langchain_document2information_piece import (
LangchainDocument2InformationPiece,
)
from extractor_api_lib.impl.types.file_type import FileType
from extractor_api_lib.models.dataclasses.internal_information_piece import (
InternalInformationPiece,
)

logger = logging.getLogger(__name__)


class EpubExtractor(InformationFileExtractor):
    """Extractor for EPUB documents based on langchain's ``UnstructuredEPubLoader``."""

    def __init__(
        self,
        file_service: FileService,
        mapper: LangchainDocument2InformationPiece,
    ):
        """Initialize the EpubExtractor.

        Parameters
        ----------
        file_service : FileService
            File service that is handed on to the ``InformationFileExtractor`` base class.
        mapper : LangchainDocument2InformationPiece
            Mapper used to convert every loaded langchain document into an information piece.
        """
        super().__init__(file_service=file_service)
        self._mapper = mapper

    @property
    def compatible_file_types(self) -> list[FileType]:
        """
        List the file types this extractor can handle.

        Returns
        -------
        list[FileType]
            A single-element list containing ``FileType.EPUB``.
        """
        return [FileType.EPUB]

    async def aextract_content(self, file_path: Path, name: str) -> list[InternalInformationPiece]:
        """
        Load an EPUB file and map every loaded document to an information piece.

        The loader's ``load()`` call is synchronous, so it is executed in a worker
        thread to keep the event loop free while the file is being parsed.

        Parameters
        ----------
        file_path : Path
            The path to the EPUB file to be processed.
        name : str
            Name of the document; passed to the mapper as ``document_name``.

        Returns
        -------
        list[InternalInformationPiece]
            One mapped information piece per document returned by the loader.
        """
        # Function-scope import keeps this method self-contained.
        import asyncio

        loader = UnstructuredEPubLoader(file_path.as_posix())
        # Run the blocking load off the event loop; exceptions propagate unchanged.
        documents = await asyncio.to_thread(loader.load)
        return [self._mapper.map_document2informationpiece(document=doc, document_name=name) for doc in documents]
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Module for the LangchainDocument2InformationPiece class."""

from extractor_api_lib.mapper.source_langchain_document2information_piece import (
SourceLangchainDocument2InformationPiece,
)


class LangchainDocument2InformationPiece(SourceLangchainDocument2InformationPiece):
    """Mapper from a langchain document to an information piece that keeps the metadata as-is."""

    def _map_meta(self, internal: dict, document_name: str) -> dict:
        """
        Return the given metadata without modification.

        Parameters
        ----------
        internal : dict
            Metadata dictionary to map.
        document_name : str
            Name of the document; not used by this mapper.

        Returns
        -------
        dict
            The very same ``internal`` dictionary that was passed in.
        """
        return internal
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ class FileType(StrEnum):
DOCX = "DOCX"
PPTX = "PPTX"
XML = "XML"
EPUB = "EPUB"
50 changes: 50 additions & 0 deletions libs/extractor-api-lib/tests/epub_extractor_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""Comprehensive test suite for SitemapExtractor class."""

from pathlib import Path

import pytest

from extractor_api_lib.impl.extractors.file_extractors.epub_extractor import (
EpubExtractor,
)
from extractor_api_lib.impl.mapper.langchain_document2information_piece import (
LangchainDocument2InformationPiece,
)
from extractor_api_lib.impl.types.file_type import FileType
from extractor_api_lib.models.content_type import ContentType


class TestEpubExtractor:
    """Test class for EpubExtractor."""

    @pytest.fixture
    def mapper(self) -> LangchainDocument2InformationPiece:
        """Create a LangchainDocument2InformationPiece instance for testing."""
        return LangchainDocument2InformationPiece()

    @pytest.fixture
    def epub_extractor(self, mock_file_service, mapper):
        """Create an EpubExtractor instance for testing."""
        return EpubExtractor(file_service=mock_file_service, mapper=mapper)

    def test_init(self, mock_file_service, mapper):
        """Test EpubExtractor initialization."""
        extractor = EpubExtractor(file_service=mock_file_service, mapper=mapper)
        assert extractor._mapper == mapper
        assert extractor._file_service == mock_file_service

    def test_file_type(self, epub_extractor):
        """Test that compatible_file_types returns only EPUB."""
        assert epub_extractor.compatible_file_types == [FileType.EPUB]

    @pytest.mark.asyncio
    async def test_extract_content_success(self, epub_extractor):
        """Test that the sample EPUB in ``test_data`` yields exactly one text piece with the expected content."""
        page_content = "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam"

        # The fixture file sits in the ``test_data`` directory next to this test module.
        test_data_dir = Path(__file__).parent / "test_data"

        file_path = test_data_dir / "LoremIpsum.epub"
        result = await epub_extractor.aextract_content(file_path, file_path.name)

        assert len(result) == 1
        assert result[0].type == ContentType.TEXT
        assert result[0].page_content == page_content
Binary file not shown.
Loading