fix: Use content-based file type detection instead of client headers

CyMule · CyMule · commit 45965c3dea1c · 2025-06-23T09:28:39.000-04:00
diff --git a/prepline_general/api/filetypes.py b/prepline_general/api/filetypes.py
@@ -17,34 +17,18 @@ def _remove_optional_info_from_mime_type(content_type: str | None) -> str | None
     return content_type.split(";")[0]
 
 
-def get_validated_mimetype(file: UploadFile, content_type_hint: str | None = None) -> Optional[str]:
+def get_validated_mimetype(file: UploadFile) -> Optional[str]:
     """Given the incoming file, identify and return the correct mimetype.
 
-    Order of operations:
-    - If user passed content_type as a form param, take it as truth.
-    - Otherwise, use file.content_type (as set by the Content-Type header)
-    - If no content_type was passed and the header wasn't useful, call the library's detect_filetype
+    Always inspects the actual file bytes to determine the true file type,
+    ignoring any client-supplied content type (header or form field), which may be wrong.
 
     Once we have a filteype, check is_partitionable and return 400 if we don't support this file.
     """
-    content_type: str | None = None
-
-    if content_type_hint is not None:
-        content_type = content_type_hint
-    else:
-        content_type = _remove_optional_info_from_mime_type(file.content_type)
-
-    filetype = FileType.from_mime_type(content_type)
-
-    # If content_type was not specified, use the library to identify the file
-    # We inspect the bytes to do this, so we need to buffer the file
-    if not filetype or filetype == FileType.UNK:
-        file_buffer = BytesIO(file.file.read())
-        file.file.seek(0)
-
-        file_buffer.name = file.filename
-
-        filetype = detect_filetype(file=file_buffer)
+    file_buffer = BytesIO(file.file.read())
+    file.file.seek(0)
+    file_buffer.name = file.filename
+    filetype = detect_filetype(file=file_buffer)
 
     if not filetype.is_partitionable:
         raise HTTPException(
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
@@ -684,9 +684,7 @@ def general_partition(
 
     def response_generator(is_multipart: bool):
         for file in files:
-            file_content_type = get_validated_mimetype(
-                file, content_type_hint=form_params.content_type
-            )
+            file_content_type = get_validated_mimetype(file)
 
             _file = file.file
 
diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py
@@ -1201,3 +1201,37 @@ def test_include_slide_notes(monkeypatch, test_default, include_slide_notes, tes
         assert "Here are important notes" == df["text"][0]
     else:
         assert "Here are important notes" != df["text"][0]
+
+
+def test_text_file_with_pdf_extension_detected_correctly():
+    """
+    Verify that a text file with a .pdf extension is correctly detected as text/plain
+    instead of failing as a malformed PDF.
+
+    This test validates that the API inspects actual file content rather than
+    trusting client-provided Content-Type headers based on file extensions.
+    """
+    client = TestClient(app)
+
+    with tempfile.NamedTemporaryFile(suffix=".pdf", mode="w", delete=False) as temp_file:
+        temp_file.write("This is simple text content, not a PDF file.")
+        temp_file_path = temp_file.name
+
+    try:
+        # Upload the file without explicitly setting a content type.
+        # The test client then infers Content-Type application/pdf from the .pdf extension.
+        with open(temp_file_path, "rb") as f:
+            response = client.post(
+                MAIN_API_ROUTE, files=[("files", (temp_file_path, f))], data={"strategy": "fast"}
+            )
+
+        assert response.status_code == 200
+
+        elements = response.json()
+        assert len(elements) > 0
+        assert any("This is simple text content" in elem["text"] for elem in elements)
+
+        assert all(elem["metadata"]["filetype"] == "text/plain" for elem in elements)
+
+    finally:
+        os.unlink(temp_file_path)