Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
## 0.17.3-dev0

### Enhancements

### Features

### Fixes
- **Fix incorrect detection of Office files.** Office files (.docx, .xlsx, and .pptx) whose archives contain files other than word/document.xml, xl/workbook.xml, and ppt/presentation.xml, respectively, were wrongly identified as ZIP. They are now identified correctly by looking for word/document\*.xml, xl/workbook\*.xml, and ppt/presentation\*.xml.

## 0.17.2

* Fix Image in a <div> tag is "UncategorizedText" with no .text
Expand Down
10 changes: 10 additions & 0 deletions test_unstructured/file_utils/test_filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
LogCaptureFixture,
Mock,
example_doc_path,
input_path,
patch,
property_mock,
)
Expand All @@ -30,6 +31,7 @@

is_in_docker = os.path.exists("/.dockerenv")


# ================================================================================================
# STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES)
# ================================================================================================
Expand Down Expand Up @@ -987,3 +989,11 @@ def test_json_content_type_is_disambiguated_for_ndjson():
file_buffer.name = "filename.pdf"
predicted_type = detect_filetype(file=file_buffer, content_type="application/json")
assert predicted_type == FileType.NDJSON


def test_office_files_when_document_archive_has_non_standard_prefix():
    """The `test_document_from_office365.docx` fixture must be detected as `FileType.DOCX`."""
    docx_path = input_path("file_type/test_document_from_office365.docx")

    detected_type = detect_filetype(file_path=docx_path)

    assert detected_type == FileType.DOCX
Binary file not shown.
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.17.2" # pragma: no cover
__version__ = "0.17.3-dev0" # pragma: no cover
6 changes: 3 additions & 3 deletions unstructured/file_utils/filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -747,13 +747,13 @@ def _file_type(self) -> FileType | None:

filenames = zip.namelist()

if "word/document.xml" in filenames:
if any(re.match(r"word/document.*\.xml$", filename) for filename in filenames):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the final word on is_docx will be a package part with the ContentType you highlighted in the .rels file, but I'm fine with this as a step toward that :)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. I will raise a follow-up PR that incorporates the content type identified from [Content_Types].xml.

return FileType.DOCX

if "xl/workbook.xml" in filenames:
if any(re.match(r"xl/workbook.*\.xml$", filename) for filename in filenames):
return FileType.XLSX

if "ppt/presentation.xml" in filenames:
if any(re.match(r"ppt/presentation.*\.xml$", filename) for filename in filenames):
return FileType.PPTX

# -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --
Expand Down
Loading