Skip to content

Commit 3497281

Browse files
Matches filename prefixes to detect DOCX, PPTX and XLSX files instead of requiring the exact standard file names (#3959)
Instead of looking for the presence of `word/document.xml`, `ppt/presentation.xml` and `xl/workbook.xml` to identify DOCX, PPTX and XLSX files, we now look for entries matching `word/document*.xml`, `ppt/presentation*.xml` and `xl/workbook*.xml`, because certain files generated by Office 365 contain these files under different names. Fixes #3937 --------- Co-authored-by: Yao You <theyaoyou@gmail.com>
1 parent 0fa5174 commit 3497281

5 files changed

Lines changed: 23 additions & 4 deletions

File tree

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
## 0.17.3-dev0
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
### Fixes
8+
- **Fixes wrong detection of office files** Certain office files (.docx, .xlsx and .pptx) were wrongly identified as .ZIP when the archive contained differently named files instead of word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively. These files are now identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml
9+
110
## 0.17.2
211

312
* Fix Image in a <div> tag is "UncategorizedText" with no .text

test_unstructured/file_utils/test_filetype.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
LogCaptureFixture,
1616
Mock,
1717
example_doc_path,
18+
input_path,
1819
patch,
1920
property_mock,
2021
)
@@ -30,6 +31,7 @@
3031

3132
is_in_docker = os.path.exists("/.dockerenv")
3233

34+
3335
# ================================================================================================
3436
# STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES)
3537
# ================================================================================================
@@ -987,3 +989,11 @@ def test_json_content_type_is_disambiguated_for_ndjson():
987989
file_buffer.name = "filename.pdf"
988990
predicted_type = detect_filetype(file=file_buffer, content_type="application/json")
989991
assert predicted_type == FileType.NDJSON
992+
993+
994+
def test_office_files_when_document_archive_has_non_standard_prefix():
995+
996+
predicted_type = detect_filetype(
997+
file_path=input_path("file_type/test_document_from_office365.docx")
998+
)
999+
assert predicted_type == FileType.DOCX
Binary file not shown.

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.17.2" # pragma: no cover
1+
__version__ = "0.17.3-dev0" # pragma: no cover

unstructured/file_utils/filetype.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -747,13 +747,13 @@ def _file_type(self) -> FileType | None:
747747

748748
filenames = zip.namelist()
749749

750-
if "word/document.xml" in filenames:
750+
if any(re.match(r"word/document.*\.xml$", filename) for filename in filenames):
751751
return FileType.DOCX
752752

753-
if "xl/workbook.xml" in filenames:
753+
if any(re.match(r"xl/workbook.*\.xml$", filename) for filename in filenames):
754754
return FileType.XLSX
755755

756-
if "ppt/presentation.xml" in filenames:
756+
if any(re.match(r"ppt/presentation.*\.xml$", filename) for filename in filenames):
757757
return FileType.PPTX
758758

759759
# -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --

0 commit comments

Comments
 (0)