-
Notifications
You must be signed in to change notification settings - Fork 9
feat: adds extractor for epub #58
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 6 commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
1fd656f
adds extractor for epub
MelvinKl b2115cd
add tests for epub extractor
MelvinKl eb5f615
fix linting
MelvinKl 10704df
Merge branch 'stackitcloud:main' into feature/epub
MelvinKl 1635946
adjust README
MelvinKl e8b7413
readme adjustment
MelvinKl ad529d9
Update libs/README.md
MelvinKl File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
73 changes: 73 additions & 0 deletions
73
...extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/epub_extractor.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,73 @@ | ||
| """Module containing the EpubExtractor class.""" | ||
|
|
||
| import logging | ||
| from pathlib import Path | ||
|
|
||
| from langchain_community.document_loaders import UnstructuredEPubLoader | ||
|
|
||
| from extractor_api_lib.extractors.information_file_extractor import ( | ||
| InformationFileExtractor, | ||
| ) | ||
| from extractor_api_lib.file_services.file_service import FileService | ||
| from extractor_api_lib.impl.mapper.langchain_document2information_piece import ( | ||
| LangchainDocument2InformationPiece, | ||
| ) | ||
| from extractor_api_lib.impl.types.file_type import FileType | ||
| from extractor_api_lib.models.dataclasses.internal_information_piece import ( | ||
| InternalInformationPiece, | ||
| ) | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| class EpubExtractor(InformationFileExtractor): | ||
| """Extractor for Epub documents using unstructured library.""" | ||
|
|
||
| def __init__( | ||
| self, | ||
| file_service: FileService, | ||
| mapper: LangchainDocument2InformationPiece, | ||
| ): | ||
| """Initialize the EpubExtractor. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| file_service : FileService | ||
| Handler for downloading the file to extract content from and, if required, uploading results. | ||
| mapper : LangchainDocument2InformationPiece | ||
| An instance of LangchainDocument2InformationPiece used for mapping langchain documents | ||
| to information pieces. | ||
| """ | ||
| super().__init__(file_service=file_service) | ||
| self._mapper = mapper | ||
|
|
||
| @property | ||
| def compatible_file_types(self) -> list[FileType]: | ||
| """ | ||
| List of compatible file types for the EPUB extractor. | ||
|
|
||
| Returns | ||
| ------- | ||
| list[FileType] | ||
| A list containing the compatible file types, which in this case is EPUB. | ||
| """ | ||
| return [FileType.EPUB] | ||
|
|
||
| async def aextract_content(self, file_path: Path, name: str) -> list[InternalInformationPiece]: | ||
| """ | ||
| Extract content from an epub file and process the elements. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| file_path : Path | ||
| The path to the epub file to be processed. | ||
| name : str | ||
| Name of the document. | ||
|
|
||
| Returns | ||
| ------- | ||
| list[InternalInformationPiece] | ||
| A list of processed information pieces extracted from the epub file. | ||
| """ | ||
| elements = UnstructuredEPubLoader(file_path.as_posix()).load() | ||
| return [self._mapper.map_document2informationpiece(document=x, document_name=name) for x in elements] |
12 changes: 12 additions & 0 deletions
12
...tractor-api-lib/src/extractor_api_lib/impl/mapper/langchain_document2information_piece.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| """Module for the LangchainDocument2InformationPiece class.""" | ||
|
|
||
| from extractor_api_lib.mapper.source_langchain_document2information_piece import ( | ||
| SourceLangchainDocument2InformationPiece, | ||
| ) | ||
|
|
||
|
|
||
| class LangchainDocument2InformationPiece(SourceLangchainDocument2InformationPiece): | ||
| """A class to map a LangchainDocument to an InformationPiece.""" | ||
|
|
||
| def _map_meta(self, internal: dict, document_name: str) -> dict: | ||
| return internal |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,3 +11,4 @@ class FileType(StrEnum): | |
| DOCX = "DOCX" | ||
| PPTX = "PPTX" | ||
| XML = "XML" | ||
| EPUB = "EPUB" | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,50 @@ | ||
| """Comprehensive test suite for EpubExtractor class.""" | ||
|
|
||
| from pathlib import Path | ||
|
|
||
| import pytest | ||
|
|
||
| from extractor_api_lib.impl.extractors.file_extractors.epub_extractor import ( | ||
| EpubExtractor, | ||
| ) | ||
| from extractor_api_lib.impl.mapper.langchain_document2information_piece import ( | ||
| LangchainDocument2InformationPiece, | ||
| ) | ||
| from extractor_api_lib.impl.types.file_type import FileType | ||
| from extractor_api_lib.models.content_type import ContentType | ||
|
|
||
|
|
||
| class TestEpubExtractor: | ||
| """Test class for EpubExtractor.""" | ||
|
|
||
| @pytest.fixture | ||
| def mapper(self) -> LangchainDocument2InformationPiece: | ||
| return LangchainDocument2InformationPiece() | ||
|
|
||
| @pytest.fixture | ||
| def epub_extractor(self, mock_file_service, mapper): | ||
| """Create an EpubExtractor instance for testing.""" | ||
| return EpubExtractor(file_service=mock_file_service, mapper=mapper) | ||
|
|
||
| def test_init(self, mock_file_service, mapper): | ||
| """Test EpubExtractor initialization.""" | ||
| extractor = EpubExtractor(file_service=mock_file_service, mapper=mapper) | ||
| assert extractor._mapper == mapper | ||
| assert extractor._file_service == mock_file_service | ||
|
|
||
| def test_file_type(self, epub_extractor): | ||
| """Test that compatible_file_types returns [FileType.EPUB].""" | ||
| assert epub_extractor.compatible_file_types == [FileType.EPUB] | ||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_extract_content_success(self, epub_extractor): | ||
| page_content = "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam" | ||
|
|
||
| test_data_dir = Path(__file__).parent / "test_data" | ||
|
|
||
| file_path = test_data_dir / "LoremIpsum.epub" | ||
| result = await epub_extractor.aextract_content(file_path, file_path.name) | ||
|
|
||
| assert len(result) == 1 | ||
| assert result[0].type == ContentType.TEXT | ||
| assert result[0].page_content == page_content |
Binary file not shown.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.