diff --git a/CHANGELOG.md b/CHANGELOG.md
index aa47187bdc..2fb45d5385 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,12 @@
+## 0.17.3-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+- **Fixes wrong detection of office files** Certain Office files (.docx, .xlsx and .pptx) were wrongly identified as ZIP when the archive contained files other than word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively. These files are now identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml.
+
## 0.17.2
* Fix Image in a
tag is "UncategorizedText" with no .text
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
index 8376e4440a..ec6c805f34 100644
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@@ -15,6 +15,7 @@
LogCaptureFixture,
Mock,
example_doc_path,
+ input_path,
patch,
property_mock,
)
@@ -30,6 +31,7 @@
is_in_docker = os.path.exists("/.dockerenv")
+
# ================================================================================================
# STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES)
# ================================================================================================
@@ -987,3 +989,11 @@ def test_json_content_type_is_disambiguated_for_ndjson():
file_buffer.name = "filename.pdf"
predicted_type = detect_filetype(file=file_buffer, content_type="application/json")
assert predicted_type == FileType.NDJSON
+
+
+def test_office_files_when_document_archive_has_non_standard_prefix():
+
+ predicted_type = detect_filetype(
+ file_path=input_path("file_type/test_document_from_office365.docx")
+ )
+ assert predicted_type == FileType.DOCX
diff --git a/test_unstructured/testfiles/file_type/test_document_from_office365.docx b/test_unstructured/testfiles/file_type/test_document_from_office365.docx
new file mode 100644
index 0000000000..fd9ca065eb
Binary files /dev/null and b/test_unstructured/testfiles/file_type/test_document_from_office365.docx differ
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 3578637acd..433383a01d 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.17.2" # pragma: no cover
+__version__ = "0.17.3-dev0" # pragma: no cover
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
index eeba899dfd..81557562f4 100644
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@@ -747,13 +747,13 @@ def _file_type(self) -> FileType | None:
filenames = zip.namelist()
- if "word/document.xml" in filenames:
+ if any(re.match(r"word/document.*\.xml$", filename) for filename in filenames):
return FileType.DOCX
- if "xl/workbook.xml" in filenames:
+ if any(re.match(r"xl/workbook.*\.xml$", filename) for filename in filenames):
return FileType.XLSX
- if "ppt/presentation.xml" in filenames:
+ if any(re.match(r"ppt/presentation.*\.xml$", filename) for filename in filenames):
return FileType.PPTX
# -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --