-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
167 lines (146 loc) · 6.41 KB
/
main.py
File metadata and controls
167 lines (146 loc) · 6.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import os
import logging
import hashlib
import json
from typing import List
from fastapi import FastAPI
from fastapi import UploadFile, Form
from fastapi.exceptions import HTTPException
from fastapi.middleware.cors import CORSMiddleware
from models import ConverseModel
from services import identify_file_type, merge_pdfs, save_file
from service_wrappers import extract_image_text_and_set_db, extract_pdf_text_and_set_db
from textract_wrapper import detect_text_and_set_db
from language_processing import converse
from tasks import enqueue_extraction
from db import set_object, get_object
# Frontend origins that are permitted to make cross-origin requests.
_CORS_ALLOWED_ORIGINS = [
    "http://localhost:8000",
    "http://localhost:8001",
    "http://ocr.petprojects.in",
    "http://nlp.petprojects.in",
]

app = FastAPI()

# Configure logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# CORS: only the origins listed above are allowed; every method and header
# is accepted, and credentials are permitted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=_CORS_ALLOWED_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.get("/")
def root():
    """
    Landing endpoint: logs the call and returns the service name.
    """
    logger.info("Root invoked")
    service_name = "Document Processing"
    return service_name
@app.post("/pdfs-merge")
def pdfs_merge(attachments: List[UploadFile]):
    """
    Merge several uploaded PDFs into one stored PDF.

    The PDFs are uploaded as multipart/form-data. Each attachment is handed
    to save_file (target: /media/original-pdfs/<filename>) and then checked
    to be a PDF. Once every attachment has passed, the whole list is passed
    to merge_pdfs.

    Args:
        attachments: The uploaded files; at most 10 are accepted.

    Returns:
        A dict with "status" set to "processed" and "filename" set to the
        value returned by merge_pdfs.

    Raises:
        HTTPException: 400 when more than 10 attachments are posted, when an
            attachment's filename is empty or contains path components, or
            when an attachment is not detected as application/pdf.
    """
    if len(attachments) > 10:
        raise HTTPException(status_code=400, detail="A maximum of 10 attachments are allowed.")

    logger.info("Performing mime type validation on attachments.")
    for attachment in attachments:
        # The filename is supplied by the client. Refuse anything that is not
        # a plain file name so it cannot steer the write outside of
        # /media/original-pdfs/ (e.g. "../../etc/passwd").
        upload_name = attachment.filename or ""
        if upload_name in ("", ".", "..") or os.path.basename(upload_name) != upload_name:
            raise HTTPException(status_code=400, detail="Invalid attachment filename.")

        # Save the attachment for further analysis
        filename = f"/media/original-pdfs/{upload_name}"
        save_file(attachment.file, filename)
        attachment.file.seek(0)  # Reset the pointer to the file beginning

        mime_type = identify_file_type(attachment.file).mime_type
        if mime_type != 'application/pdf':
            logger.info(f"Validation on {filename} failed.")
            raise HTTPException(status_code=400, detail=f"A {mime_type} file posted.")

        # Type detection may have advanced the stream (not visible from
        # here); rewind so the merge step starts from the first byte.
        attachment.file.seek(0)

    # Validation passed
    merged_filename = merge_pdfs(attachments)
    return {"status": "processed", "filename": merged_filename}
@app.post("/ocr")
def ocr(attachment: UploadFile, gray: bool = Form(True), denoise: bool = Form(True), binarize: bool = Form(True)):
    """
    Queue text extraction for an uploaded image or PDF and return a result link.

    TODO: Support multiple attachments

    It could pass a PDF or an image.
    A PDF could be searchable or non-searchable.
    Image Case:
        We can run the image file through tesseract and extract the text.
    PDF Case:
        Searchable: We can use pdfminer.six as being used.
        Non-Searchable: Convert the PDF to an image and then extract the text
    In all of the above cases, the processing would happen asynchronously.
    The task would be queued and a link would be returned to the user.

    Args:
        attachment: The uploaded image or PDF.
        gray, denoise, binarize: Form flags forwarded as "options" to the
            image extraction task only; the PDF task receives no options.

    Returns:
        A dict with a single "link" entry pointing at /ocr-result/<key>,
        where <key> is the SHA-256 hex digest of the saved file path.

    Raises:
        HTTPException: 400 when the upload is neither an image nor a PDF, or
            when its filename is empty or contains path components.
    """
    options = {
        "gray": gray,
        "denoise": denoise,
        "binarize": binarize,
    }

    mime_type = identify_file_type(attachment.file).mime_type
    is_image = mime_type.startswith('image')
    is_pdf = mime_type.startswith('application/pdf')
    if not (is_image or is_pdf):
        raise HTTPException(status_code=400, detail="Provide either an image or a PDF")

    # The filename is supplied by the client. Refuse anything that is not a
    # plain file name so it cannot steer the write outside /media/ocr-files/.
    upload_name = attachment.filename or ""
    if upload_name in ("", ".", "..") or os.path.basename(upload_name) != upload_name:
        raise HTTPException(status_code=400, detail="Invalid attachment filename.")

    # 1. Save the attachment, for later auditing
    # Type detection may have advanced the stream (not visible from here);
    # rewind first so the saved copy is complete.
    attachment.file.seek(0)
    output_filename = f"/media/ocr-files/{upload_name}"
    save_file(attachment.file, output_filename)
    attachment.file.seek(0)

    # NOTE: the key depends only on the path, so two uploads with the same
    # filename share one key (and one result link).
    path_hash = hashlib.sha256(output_filename.encode('utf-8')).hexdigest()

    if is_image:
        # Attempt extraction through Tesseract
        set_object(key=path_hash, field="type", value="image")
        enqueue_extraction(extraction_function=extract_image_text_and_set_db, file_path=output_filename, key=path_hash, options=options)
    else:
        # Attempt extracting text using pdfminer.six or else through the image conversion -> OCR pipeline.
        set_object(key=path_hash, field="type", value="pdf")
        enqueue_extraction(extraction_function=extract_pdf_text_and_set_db, file_path=output_filename, key=path_hash)

    base_url = os.environ.get("BASE_URL", "http://localhost:8000")
    link = f"{base_url}/ocr-result/{path_hash}"
    return {"link": link}
@app.get("/ocr-result/{key}")
def ocr_result(key: str):
    """
    Return what has been stored for an extraction key.

    Args:
        key: The key taken from the URL path (the last segment of the link
            handed out by the OCR endpoints).

    Returns:
        {"content": None} when no "content" field is stored under the key.
        Otherwise a dict with:
          - "category": included only when a category is stored;
          - "passport_data" / "pan_data": the JSON-decoded structured data,
            included only when the category is 'passport' / 'pan' and the
            corresponding field is actually stored;
          - "content": the stored text with blank lines removed.
    """
    content = get_object(key, "content")
    if content is None:
        # Nothing stored under "content": report it as null.
        return {"content": content}

    response_data = {}
    category = get_object(key, "category")
    if category is not None:
        # Only if category is not None, then include it in the response
        response_data["category"] = category

        # Categories that carry an extra JSON-encoded field of structured data.
        structured_field = {"passport": "passport_data", "pan": "pan_data"}.get(category)
        if structured_field is not None:
            raw_data = get_object(key, structured_field)
            # The field may be absent even though the category is set;
            # json.loads(None) would raise TypeError, so skip it instead.
            if raw_data is not None:
                response_data[structured_field] = json.loads(raw_data)

    # Remove empty lines
    non_blank_lines = [line for line in content.splitlines() if line.strip()]
    response_data["content"] = '\n'.join(non_blank_lines)
    return response_data
@app.post("/textract-ocr")
def textract_ocr(attachment: UploadFile):
    """
    Queue text detection for an uploaded image and return a result link.

    The upload is handed to save_file (target:
    /media/textract-ocr-files/<filename>), its type is recorded under a key
    derived from that path, and detect_text_and_set_db is enqueued for it.

    Args:
        attachment: The uploaded image.

    Returns:
        A dict with a single "link" entry pointing at /ocr-result/<key>,
        where <key> is the SHA-256 hex digest of the saved file path.

    Raises:
        HTTPException: 400 when the upload is not detected as an image, or
            when its filename is empty or contains path components.
    """
    type_details = identify_file_type(attachment.file)
    if not type_details.mime_type.startswith('image'):
        raise HTTPException(status_code=400, detail="Provide an image")

    # The filename is supplied by the client. Refuse anything that is not a
    # plain file name so it cannot steer the write outside the target folder.
    upload_name = attachment.filename or ""
    if upload_name in ("", ".", "..") or os.path.basename(upload_name) != upload_name:
        raise HTTPException(status_code=400, detail="Invalid attachment filename.")

    # Type detection may have advanced the stream (not visible from here);
    # rewind first so the saved copy is complete.
    attachment.file.seek(0)
    output_filename = f"/media/textract-ocr-files/{upload_name}"
    save_file(attachment.file, output_filename)
    attachment.file.seek(0)

    path_hash = hashlib.sha256(output_filename.encode('utf-8')).hexdigest()
    # Only images get past the check above, so record the type as "image"
    # (this was previously stored as "pdf").
    set_object(key=path_hash, field="type", value="image")

    # Add it to a queue.
    enqueue_extraction(extraction_function=detect_text_and_set_db, file_path=output_filename, key=path_hash)

    base_url = os.environ.get("BASE_URL", "http://localhost:8000")
    link = f"{base_url}/ocr-result/{path_hash}"
    return {"link": link}
@app.post("/converse")
def conversation(body: ConverseModel):
    """
    Answer a question about a piece of text.

    Passes body.text and body.question to converse(), logs the outcome and
    wraps it in the response. According to the original notes, the
    processing involves things like:
        - Tokenization
        - Parts of Speech tagging
        - Named Entity Recognition

    Returns a dict with a single "answer" entry; when converse() yields
    None the answer is the fixed text "Failed to parse".
    """
    reply = converse(body.text, body.question)
    logger.info(f"Answer: {reply}")
    return {"answer": "Failed to parse" if reply is None else reply}