Skip to content

Commit cad83a6

Browse files
committed
Match archive member-name prefixes to detect DOCX, PPTX, and XLSX files instead of requiring the exact standard file names
1 parent 66bf4b0 commit cad83a6

3 files changed

Lines changed: 10 additions & 3 deletions

File tree

18.3 KB
Binary file not shown.

test_unstructured/file_utils/test_filetype.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
is_in_docker = os.path.exists("/.dockerenv")
3232

33+
3334
# ================================================================================================
3435
# STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES)
3536
# ================================================================================================
@@ -987,3 +988,9 @@ def test_json_content_type_is_disambiguated_for_ndjson():
987988
file_buffer.name = "filename.pdf"
988989
predicted_type = detect_filetype(file=file_buffer, content_type="application/json")
989990
assert predicted_type == FileType.NDJSON
991+
992+
993+
def test_office_files_when_document_archive_has_non_standard_prefix():
994+
995+
predicted_type = detect_filetype(file_path="test_document_from_office365.docx")
996+
assert predicted_type == FileType.DOCX

unstructured/file_utils/filetype.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -747,13 +747,13 @@ def _file_type(self) -> FileType | None:
747747

748748
filenames = zip.namelist()
749749

750-
if "word/document.xml" in filenames:
750+
if any(re.match(r"word/document.*\.xml$", filename) for filename in filenames):
751751
return FileType.DOCX
752752

753-
if "xl/workbook.xml" in filenames:
753+
if any(re.match(r"xl/workbook.*\.xml$", filename) for filename in filenames):
754754
return FileType.XLSX
755755

756-
if "ppt/presentation.xml" in filenames:
756+
if any(re.match(r"ppt/presentation.*\.xml$", filename) for filename in filenames):
757757
return FileType.PPTX
758758

759759
# -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --

0 commit comments

Comments
 (0)