File tree Expand file tree Collapse file tree
test_unstructured/file_utils Expand file tree Collapse file tree Original file line number Diff line number Diff line change 3030
3131is_in_docker = os .path .exists ("/.dockerenv" )
3232
33+
3334# ================================================================================================
3435# STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES)
3536# ================================================================================================
@@ -987,3 +988,9 @@ def test_json_content_type_is_disambiguated_for_ndjson():
987988 file_buffer .name = "filename.pdf"
988989 predicted_type = detect_filetype (file = file_buffer , content_type = "application/json" )
989990 assert predicted_type == FileType .NDJSON
991+
992+
993+ def test_office_files_when_document_archive_has_non_standard_prefix ():
994+
995+ predicted_type = detect_filetype (file_path = "test_document_from_office365.docx" )
996+ assert predicted_type == FileType .DOCX
Original file line number Diff line number Diff line change @@ -747,13 +747,13 @@ def _file_type(self) -> FileType | None:
747747
748748 filenames = zip .namelist ()
749749
750- if "word/document.xml" in filenames :
750+ if any(re.match(r"word/document.*\.xml$", filename) for filename in filenames):
751751 return FileType .DOCX
752752
753- if "xl/workbook.xml" in filenames :
753+ if any(re.match(r"xl/workbook.*\.xml$", filename) for filename in filenames):
754754 return FileType .XLSX
755755
756- if "ppt/presentation.xml" in filenames :
756+ if any(re.match(r"ppt/presentation.*\.xml$", filename) for filename in filenames):
757757 return FileType .PPTX
758758
759759 # -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --
You can’t perform that action at this time.
0 commit comments