Skip to content

Commit 45965c3

Browse files
committed
fix: Use content-based file type detection instead of client headers
1 parent 1d799b9 commit 45965c3

3 files changed

Lines changed: 42 additions & 26 deletions

File tree

prepline_general/api/filetypes.py

Lines changed: 7 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -17,34 +17,18 @@ def _remove_optional_info_from_mime_type(content_type: str | None) -> str | None
1717
return content_type.split(";")[0]
1818

1919

20-
def get_validated_mimetype(file: UploadFile, content_type_hint: str | None = None) -> Optional[str]:
20+
def get_validated_mimetype(file: UploadFile) -> Optional[str]:
2121
"""Given the incoming file, identify and return the correct mimetype.
2222
23-
Order of operations:
24-
- If user passed content_type as a form param, take it as truth.
25-
- Otherwise, use file.content_type (as set by the Content-Type header)
26-
- If no content_type was passed and the header wasn't useful, call the library's detect_filetype
23+
Always inspects the actual file bytes to determine the true file type,
24+
ignoring client-provided Content-Type headers, which can be misleading.
2725
2826
Once we have a filetype, check is_partitionable and return 400 if we don't support this file.
2927
"""
30-
content_type: str | None = None
31-
32-
if content_type_hint is not None:
33-
content_type = content_type_hint
34-
else:
35-
content_type = _remove_optional_info_from_mime_type(file.content_type)
36-
37-
filetype = FileType.from_mime_type(content_type)
38-
39-
# If content_type was not specified, use the library to identify the file
40-
# We inspect the bytes to do this, so we need to buffer the file
41-
if not filetype or filetype == FileType.UNK:
42-
file_buffer = BytesIO(file.file.read())
43-
file.file.seek(0)
44-
45-
file_buffer.name = file.filename
46-
47-
filetype = detect_filetype(file=file_buffer)
28+
file_buffer = BytesIO(file.file.read())
29+
file.file.seek(0)
30+
file_buffer.name = file.filename
31+
filetype = detect_filetype(file=file_buffer)
4832

4933
if not filetype.is_partitionable:
5034
raise HTTPException(

prepline_general/api/general.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -684,9 +684,7 @@ def general_partition(
684684

685685
def response_generator(is_multipart: bool):
686686
for file in files:
687-
file_content_type = get_validated_mimetype(
688-
file, content_type_hint=form_params.content_type
689-
)
687+
file_content_type = get_validated_mimetype(file)
690688

691689
_file = file.file
692690

test_general/api/test_app.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1201,3 +1201,37 @@ def test_include_slide_notes(monkeypatch, test_default, include_slide_notes, tes
12011201
assert "Here are important notes" == df["text"][0]
12021202
else:
12031203
assert "Here are important notes" != df["text"][0]
1204+
1205+
1206+
def test_text_file_with_pdf_extension_detected_correctly():
1207+
"""
1208+
Verify that a text file with a .pdf extension is correctly detected as text/plain
1209+
instead of failing as a malformed PDF.
1210+
1211+
This test validates that the API inspects actual file content rather than
1212+
trusting client-provided Content-Type headers based on file extensions.
1213+
"""
1214+
client = TestClient(app)
1215+
1216+
with tempfile.NamedTemporaryFile(suffix=".pdf", mode="w", delete=False) as temp_file:
1217+
temp_file.write("This is simple text content, not a PDF file.")
1218+
temp_file_path = temp_file.name
1219+
1220+
try:
1221+
# Upload the file without explicitly setting content type
1222+
# The client will auto-detect Content-Type as application/pdf based on .pdf extension
1223+
with open(temp_file_path, "rb") as f:
1224+
response = client.post(
1225+
MAIN_API_ROUTE, files=[("files", (temp_file_path, f))], data={"strategy": "fast"}
1226+
)
1227+
1228+
assert response.status_code == 200
1229+
1230+
elements = response.json()
1231+
assert len(elements) > 0
1232+
assert any("This is simple text content" in elem["text"] for elem in elements)
1233+
1234+
assert all(elem["metadata"]["filetype"] == "text/plain" for elem in elements)
1235+
1236+
finally:
1237+
os.unlink(temp_file_path)

0 commit comments

Comments
 (0)