Match file name prefixes to detect DOCX, PPTX and XLSX files instead of exact standard file names (#3959)

srisudarsan · badGarnet · web-flow · commit 349728162e59 · 2025-03-21T16:27:13.000Z
Instead of looking for the presence of `word/document.xml`, `ppt/presentation.xml` and `xl/workbook.xml` to identify DOCX, PPTX and XLSX files, we now look for archive members matching `word/document*.xml`, `ppt/presentation*.xml` and `xl/workbook*.xml`, because certain files generated by Office 365 contain these parts under different names. Fixes #3937 --------- Co-authored-by: Yao You <theyaoyou@gmail.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,12 @@
+## 0.17.3-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+- **Fixes wrong detection of office files** Certain Office files (.docx, .xlsx and .pptx) were wrongly identified as .ZIP when the archive contained differently named parts instead of word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively. They are now identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml.
+
 ## 0.17.2
 
 * Fix Image in a <div> tag is "UncategorizedText" with no .text
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
@@ -15,6 +15,7 @@
     LogCaptureFixture,
     Mock,
     example_doc_path,
+    input_path,
     patch,
     property_mock,
 )
@@ -30,6 +31,7 @@
 
 is_in_docker = os.path.exists("/.dockerenv")
 
+
 # ================================================================================================
 # STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES)
 # ================================================================================================
@@ -987,3 +989,11 @@ def test_json_content_type_is_disambiguated_for_ndjson():
     file_buffer.name = "filename.pdf"
     predicted_type = detect_filetype(file=file_buffer, content_type="application/json")
     assert predicted_type == FileType.NDJSON
+
+
+def test_office_files_when_document_archive_has_non_standard_prefix():
+
+    predicted_type = detect_filetype(
+        file_path=input_path("file_type/test_document_from_office365.docx")
+    )
+    assert predicted_type == FileType.DOCX
diff --git a/test_unstructured/testfiles/file_type/test_document_from_office365.docx b/test_unstructured/testfiles/file_type/test_document_from_office365.docx
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.17.2"  # pragma: no cover
+__version__ = "0.17.3-dev0"  # pragma: no cover
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
@@ -747,13 +747,13 @@ def _file_type(self) -> FileType | None:
 
             filenames = zip.namelist()
 
-            if "word/document.xml" in filenames:
+            if any(re.match(r"word/document.*\.xml$", filename) for filename in filenames):
                 return FileType.DOCX
 
-            if "xl/workbook.xml" in filenames:
+            if any(re.match(r"xl/workbook.*\.xml$", filename) for filename in filenames):
                 return FileType.XLSX
 
-            if "ppt/presentation.xml" in filenames:
+            if any(re.match(r"ppt/presentation.*\.xml$", filename) for filename in filenames):
                 return FileType.PPTX
 
             # -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.17.2" # pragma: no cover`
	`1`	`+__version__ = "0.17.3-dev0" # pragma: no cover`