1616import logging
1717import mimetypes
1818import os
19+ import re
1920from typing import List
2021
2122from fastapi import HTTPException , UploadFile
@@ -575,14 +576,14 @@ async def _get_document_preview(session):
575576
576577 converted_pdf_object_path = None
577578 index_data = json .loads (doc_index .index_data ) if doc_index and doc_index .index_data else {}
578- if index_data .get ("has_pdf_source_map" ) and not document . name . lower (). endswith ( ".pdf" ) :
579+ if index_data .get ("has_pdf_source_map" ):
579580 # If the parsing result contains pdf_source_map metadata,
580581 # it means it is a PDF or has been converted to a PDF.
581- # But only converted documents have a converted .pdf file.
582- pdf_path = f"{ document .object_store_base_path ()} /converted.pdf "
582+ converted_pdf_name = " converted.pdf"
583+ pdf_path = f"{ document .object_store_base_path ()} /{ converted_pdf_name } "
583584 exists = await async_obj_store .obj_exists (pdf_path )
584585 if exists :
585- converted_pdf_object_path = "converted.pdf"
586+ converted_pdf_object_path = converted_pdf_name
586587
587588 # 5. Construct and return response
588589 return DocumentPreview (
@@ -596,9 +597,12 @@ async def _get_document_preview(session):
596597 # Execute query with proper session management
597598 return await self .db_ops ._execute_query (_get_document_preview )
598599
599- async def get_document_object (self , user_id : str , collection_id : str , document_id : str , path : str ):
600+ async def get_document_object (
601+ self , user_id : str , collection_id : str , document_id : str , path : str , range_header : str = None
602+ ):
600603 """
601604 Get a file object associated with a document from the object store.
605+ Supports HTTP Range requests.
602606 """
603607
604608 # Use database operations with proper session management
@@ -622,19 +626,49 @@ async def _get_document_object(session):
622626 # 2. Get the object from object store
623627 try :
624628 async_obj_store = get_async_object_store ()
625- get_obj_result = await async_obj_store .get (full_path )
629+ headers = {"Accept-Ranges" : "bytes" }
630+ content_type , _ = mimetypes .guess_type (full_path )
631+ if content_type is None :
632+ content_type = "application/octet-stream"
633+ headers ["Content-Type" ] = content_type
634+
635+ if range_header :
636+ # For range requests, we need the total size first.
637+ total_size = await async_obj_store .get_obj_size (full_path )
638+ if total_size is None :
639+ raise HTTPException (status_code = 404 , detail = "Object not found at specified path" )
640+
641+ range_match = re .match (r"bytes=(\d+)-(\d*)" , range_header )
642+ if not range_match :
643+ raise HTTPException (status_code = 400 , detail = "Invalid range header format" )
644+
645+ start_byte = int (range_match .group (1 ))
646+ end_byte_str = range_match .group (2 )
647+ end_byte = int (end_byte_str ) if end_byte_str else total_size - 1
648+
649+ if start_byte >= total_size or end_byte >= total_size or start_byte > end_byte :
650+ headers ["Content-Range" ] = f"bytes */{ total_size } "
651+ raise HTTPException (status_code = 416 , headers = headers , detail = "Requested range not satisfiable" )
626652
653+ # Use stream_range to get the partial content
654+ range_result = await async_obj_store .stream_range (full_path , start = start_byte , end = end_byte )
655+ if not range_result :
656+ raise HTTPException (status_code = 404 , detail = "Object not found at specified path" )
657+
658+ data_stream , content_length = range_result
659+ headers ["Content-Range" ] = f"bytes { start_byte } -{ end_byte } /{ total_size } "
660+ headers ["Content-Length" ] = str (content_length )
661+ return StreamingResponse (data_stream , status_code = 206 , headers = headers )
662+
663+ # Full content response - optimized to use size from get()
664+ get_obj_result = await async_obj_store .get (full_path )
627665 if not get_obj_result :
628666 raise HTTPException (status_code = 404 , detail = "Object not found at specified path" )
629667
630- data_stream , _ = get_obj_result
631-
632- # 3. Stream the response
633- content_type , _ = mimetypes .guess_type (full_path )
634- if content_type is None :
635- content_type = "application/octet-stream"
668+ data_stream , file_size = get_obj_result
669+ headers ["Content-Length" ] = str (file_size )
670+ return StreamingResponse (data_stream , headers = headers )
636671
637- return StreamingResponse (data_stream , media_type = content_type )
638672 except Exception as e :
639673 logger .error (f"Failed to get object for document { document_id } at path { full_path } : { e } " , exc_info = True )
640674 raise HTTPException (status_code = 500 , detail = "Failed to get object from store" )
0 commit comments