diff --git a/CHANGELOG.md b/CHANGELOG.md index aa47187bdc..2fb45d5385 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.17.3-dev0 + +### Enhancements + +### Features + +### Fixes +- **Fixes wrong detection of office files** Office files (.docx, .xlsx and .pptx) whose archives do not contain an entry named exactly word/document.xml, xl/workbook.xml or ppt/presentation.xml respectively were wrongly identified as ZIP. They are now identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml. + ## 0.17.2 * Fix Image in a
tag is "UncategorizedText" with no .text diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 8376e4440a..ec6c805f34 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -15,6 +15,7 @@ LogCaptureFixture, Mock, example_doc_path, + input_path, patch, property_mock, ) @@ -30,6 +31,7 @@ is_in_docker = os.path.exists("/.dockerenv") + # ================================================================================================ # STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES) # ================================================================================================ @@ -987,3 +989,11 @@ def test_json_content_type_is_disambiguated_for_ndjson(): file_buffer.name = "filename.pdf" predicted_type = detect_filetype(file=file_buffer, content_type="application/json") assert predicted_type == FileType.NDJSON + + +def test_office_files_when_document_archive_has_non_standard_prefix(): + + predicted_type = detect_filetype( + file_path=input_path("file_type/test_document_from_office365.docx") + ) + assert predicted_type == FileType.DOCX diff --git a/test_unstructured/testfiles/file_type/test_document_from_office365.docx b/test_unstructured/testfiles/file_type/test_document_from_office365.docx new file mode 100644 index 0000000000..fd9ca065eb Binary files /dev/null and b/test_unstructured/testfiles/file_type/test_document_from_office365.docx differ diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 3578637acd..433383a01d 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.2" # pragma: no cover +__version__ = "0.17.3-dev0" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index eeba899dfd..81557562f4 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ 
-747,13 +747,13 @@ def _file_type(self) -> FileType | None: filenames = zip.namelist() - if "word/document.xml" in filenames: + if any(re.match(r"word/document.*\.xml$", filename) for filename in filenames): return FileType.DOCX - if "xl/workbook.xml" in filenames: + if any(re.match(r"xl/workbook.*\.xml$", filename) for filename in filenames): return FileType.XLSX - if "ppt/presentation.xml" in filenames: + if any(re.match(r"ppt/presentation.*\.xml$", filename) for filename in filenames): return FileType.PPTX # -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --