99from fastapi .exceptions import HTTPException
1010from fastapi .middleware .cors import CORSMiddleware
1111
12- from services import identify_file_type , merge_pdfs , save_file , extract_pdf_text_searchable , get_file_size , extract_image_text
12+ from services import identify_file_type , merge_pdfs , save_file
1313from service_wrappers import extract_image_text_and_set_db , extract_pdf_text_and_set_db
1414from textract_wrapper import detect_text_and_set_db
15- from text_analysis import analyze
1615from tasks import enqueue_extraction
1716from db import set_object , get_object
1817
@@ -38,18 +37,6 @@ def root():
3837 return "Document Processing"
3938
4039
41- @app .post ("/content-type" )
42- def identify_content_type (attachment : UploadFile ):
43- # Identify the file mime type.
44- filename = f"/media/content-type-identification/{ attachment .filename } "
45- save_file (attachment .file , filename )
46- # We read through the file in the last step, i.e save_file().
47- # We must seek(0), and go to the beginning before trying to identify the file type.
48- attachment .file .seek (0 )
49- file_type = identify_file_type (attachment .file )
50- return {"content-type" : file_type .mime_type }
51-
52-
5340@app .post ("/pdfs-merge" )
5441def pdfs_merge (attachments : List [UploadFile ]):
5542 """
@@ -75,48 +62,6 @@ def pdfs_merge(attachments: List[UploadFile]):
7562 return {"status" : "processed" , "filename" : merged_filename }
7663
7764
78- @app .post ("/extract-pdf-text" )
79- def extract_text (attachment : UploadFile ):
80- """
81- Extracts text from an attachment uploaded through multipart/form-data.
82- """
83- type_details = identify_file_type (attachment .file )
84- if type_details .mime_type != 'application/pdf' :
85- raise HTTPException (status_code = 400 , detail = "A non-pdf file found." )
86- attachment_name = attachment .filename
87- output_filename = f"/media/extraction-pdfs/{ attachment_name } "
88- save_file (attachment .file , output_filename )
89- attachment .file .seek (0 )
90- is_success , content = extract_pdf_text_searchable (attachment .file )
91- if is_success is False :
92- raise HTTPException (status_code = 400 , detail = content )
93- analysis_result = analyze (content )
94- return {"content" : content , "analysis_result" : analysis_result }
95-
96-
97- @app .post ("/extract-image-text" )
98- def extract_img_text (attachment : UploadFile ):
99- """
100- Perform OCR on the uploaded attachment.
101- Currently works with images having text.
102- Later add support for PDFs and Docx as well.
103- """
104- type_details = identify_file_type (attachment .file )
105- if not type_details .mime_type .startswith ('image' ):
106- raise HTTPException (status_code = 400 , detail = "A non image file found." )
107- file_size = get_file_size (attachment .file )
108- # 100 MB
109- if file_size > (10 * 1024 * 1024 ):
110- raise HTTPException (status_code = 400 , detail = "Only supports upto 10MB files." )
111- output_filename = f"/media/extraction-images/{ attachment .filename } "
112- attachment .file .seek (0 )
113- save_file (attachment .file , output_filename )
114- is_success , content = extract_image_text (output_filename )
115- if is_success is False :
116- raise HTTPException (status_code = 400 , detail = content )
117- return {"content" : content }
118-
119-
12065@app .post ("/ocr" )
12166def ocr (attachment : UploadFile , gray : bool = Form (True ), denoise : bool = Form (True ), binarize : bool = Form (True )):
12267 """
0 commit comments