Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ ARG DEBIAN_FRONTEND=noninteractive
ARG USER=vscode

RUN DEBIAN_FRONTEND=noninteractive \
&& apt-get update \
&& apt-get update \
&& apt-get install -y build-essential --no-install-recommends make \
ca-certificates \
git \
Expand All @@ -27,7 +27,7 @@ RUN DEBIAN_FRONTEND=noninteractive \
# Python and poetry installation
USER $USER
ARG HOME="/home/$USER"
ARG PYTHON_VERSION=3.11
ARG PYTHON_VERSION=3.13

ENV PYENV_ROOT="${HOME}/.pyenv"
ENV PATH="${PYENV_ROOT}/shims:${PYENV_ROOT}/bin:${HOME}/.local/bin:$PATH"
Expand All @@ -40,4 +40,4 @@ RUN echo "done 0" \
&& pyenv global ${PYTHON_VERSION} \
&& echo "done 3" \
&& curl -sSL https://install.python-poetry.org | python3 - \
&& poetry config virtualenvs.in-project true
&& poetry config virtualenvs.in-project true
19 changes: 18 additions & 1 deletion libs/extractor-api-lib/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions libs/extractor-api-lib/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ markdownify = "^1.1.0"
langchain-core = "0.3.63"
camelot-py = {extras = ["cv"], version = "^1.0.0"}
fake-useragent = "^2.2.0"
pypandoc-binary = "^1.15"

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.5"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from extractor_api_lib.impl.api_endpoints.general_source_extractor import GeneralSourceExtractor
from extractor_api_lib.impl.extractors.confluence_extractor import ConfluenceExtractor
from extractor_api_lib.impl.extractors.file_extractors.epub_extractor import EpubExtractor
from extractor_api_lib.impl.extractors.file_extractors.ms_docs_extractor import MSDocsExtractor
from extractor_api_lib.impl.extractors.file_extractors.pdf_extractor import PDFExtractor
from extractor_api_lib.impl.extractors.file_extractors.xml_extractor import XMLExtractor
Expand Down Expand Up @@ -46,7 +47,9 @@ class DependencyContainer(DeclarativeContainer):
intern2external = Singleton(Internal2ExternalInformationPiece)
langchain_document2information_piece = Singleton(ConfluenceLangchainDocument2InformationPiece)
sitemap_document2information_piece = Singleton(SitemapLangchainDocument2InformationPiece)
file_extractors = List(pdf_extractor, ms_docs_extractor, xml_extractor)
epub_extractor = Singleton(EpubExtractor, file_service, langchain_document2information_piece)

file_extractors = List(pdf_extractor, ms_docs_extractor, xml_extractor, epub_extractor)

general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors, intern2external)
confluence_extractor = Singleton(ConfluenceExtractor, mapper=langchain_document2information_piece)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""Module containing the EpubExtractor class."""

import logging
from pathlib import Path


from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import (
ConfluenceLangchainDocument2InformationPiece,
)
from langchain_community.document_loaders import UnstructuredEPubLoader


from extractor_api_lib.file_services.file_service import FileService
from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor
from extractor_api_lib.impl.types.file_type import FileType
from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece

logger = logging.getLogger(__name__)


class EpubExtractor(InformationFileExtractor):
    """Extractor for EPUB documents, backed by langchain's ``UnstructuredEPubLoader``."""

    def __init__(
        self,
        file_service: FileService,
        mapper: ConfluenceLangchainDocument2InformationPiece,
    ):
        """Initialize the EpubExtractor.

        Parameters
        ----------
        file_service : FileService
            File service forwarded unchanged to the ``InformationFileExtractor`` base class.
        mapper : ConfluenceLangchainDocument2InformationPiece
            Mapper used to convert each loaded langchain document into an information piece.
        """
        super().__init__(file_service=file_service)
        self._mapper = mapper

    @property
    def compatible_file_types(self) -> list[FileType]:
        """
        List of compatible file types for the EPUB extractor.

        Returns
        -------
        list[FileType]
            A single-element list containing ``FileType.EPUB``.
        """
        return [FileType.EPUB]

    async def aextract_content(self, file_path: Path, name: str) -> list[InternalInformationPiece]:
        """
        Extract content from an EPUB file and map every loaded document to an information piece.

        Parameters
        ----------
        file_path : Path
            The path to the EPUB file to be processed.
        name : str
            Name of the document; passed to the mapper together with each loaded document.

        Returns
        -------
        list[InternalInformationPiece]
            One mapped information piece per document returned by the loader, in loader order.
        """
        # The loader is given the path as a POSIX-style string rather than a Path object.
        # NOTE(review): load() is invoked synchronously inside this coroutine, so the
        # event loop is blocked while the file is parsed - confirm that this is acceptable.
        documents = UnstructuredEPubLoader(file_path.as_posix()).load()
        return [self._mapper.map_document2informationpiece(document, name) for document in documents]
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ class FileType(StrEnum):
DOCX = "DOCX"
PPTX = "PPTX"
XML = "XML"
EPUB = "EPUB"
Loading