1717import os
1818import tempfile
1919from pathlib import Path
20- from langdetect import detect , LangDetectException
21- from docling .document_converter import DocumentConverter
22- from docling .datamodel .base_models import InputFormat
2320
2421from comps import MegaServiceEndpoint , MicroService , ServiceOrchestrator , ServiceRoleType , ServiceType
2522from comps .cores .proto .api_protocol import (
2926 ChatMessage ,
3027 UsageInfo ,
3128)
32- from fastapi import Request , UploadFile , File , Form , HTTPException
29+ from docling .datamodel .base_models import InputFormat
30+ from docling .document_converter import DocumentConverter
31+ from fastapi import File , Form , HTTPException , Request , UploadFile
3332from fastapi .responses import StreamingResponse
33+ from langdetect import LangDetectException , detect
3434
3535MEGA_SERVICE_PORT = int (os .getenv ("MEGA_SERVICE_PORT" , 8888 ))
3636LLM_SERVICE_HOST_IP = os .getenv ("LLM_SERVICE_HOST_IP" , "0.0.0.0" )
@@ -79,8 +79,7 @@ def __init__(self):
7979 self .converter = DocumentConverter ()
8080
8181 async def process_file (self , file : UploadFile ) -> list [str ]:
82- """
83- Process an uploaded file and extract text content in chunks.
82+ """Process an uploaded file and extract text content in chunks.
8483
8584 Args:
8685 file: The uploaded file
@@ -100,8 +99,7 @@ async def process_file(self, file: UploadFile) -> list[str]:
10099 file_ext = Path (file .filename ).suffix .lower ()
101100 if file_ext not in SUPPORTED_EXTENSIONS :
102101 raise ValueError (
103- f"Unsupported file type: { file_ext } . "
104- f"Supported types: { ', ' .join (sorted (SUPPORTED_EXTENSIONS ))} "
102+ f"Unsupported file type: { file_ext } . " f"Supported types: { ', ' .join (sorted (SUPPORTED_EXTENSIONS ))} "
105103 )
106104
107105 page_texts = []
@@ -112,19 +110,19 @@ async def process_file(self, file: UploadFile) -> list[str]:
112110 print (f"Reading text file { file .filename } ..." )
113111 try :
114112 # Try UTF-8 first
115- text_content = contents .decode (' utf-8' )
113+ text_content = contents .decode (" utf-8" )
116114 except UnicodeDecodeError :
117115 # Fallback to latin-1 for other encodings
118116 print ("UTF-8 decode failed, trying latin-1..." )
119- text_content = contents .decode (' latin-1' )
117+ text_content = contents .decode (" latin-1" )
120118
121119 print (f"Read { len (text_content )} characters from text file" )
122120
123121 # Split into chunks if needed
124122 if len (text_content ) > CHUNK_SIZE :
125123 print (f"Splitting into chunks of { CHUNK_SIZE } chars" )
126124 for i in range (0 , len (text_content ), CHUNK_SIZE ):
127- chunk = text_content [i : i + CHUNK_SIZE ]
125+ chunk = text_content [i : i + CHUNK_SIZE ]
128126 page_texts .append (chunk )
129127 print (f"Chunk { len (page_texts )} : { len (chunk )} chars" )
130128 else :
@@ -145,7 +143,7 @@ async def process_file(self, file: UploadFile) -> list[str]:
145143 # Convert document using docling
146144 print (f"Converting document { file .filename } ..." )
147145 result = self .converter .convert (tmp_path )
148- print (f "Conversion completed" )
146+ print ("Conversion completed" )
149147
150148 # Export entire document to markdown
151149 full_markdown = result .document .export_to_markdown ()
@@ -156,7 +154,7 @@ async def process_file(self, file: UploadFile) -> list[str]:
156154 print (f"Splitting into chunks of { CHUNK_SIZE } chars" )
157155 # Split into manageable chunks
158156 for i in range (0 , len (full_markdown ), CHUNK_SIZE ):
159- chunk = full_markdown [i : i + CHUNK_SIZE ]
157+ chunk = full_markdown [i : i + CHUNK_SIZE ]
160158 page_texts .append (chunk )
161159 print (f"Chunk { len (page_texts )} : { len (chunk )} chars" )
162160 else :
@@ -209,16 +207,14 @@ async def translate_page(self, page_text: str, language_from: str, language_to:
209207 {source_language}
210208
211209 """
212- prompt = prompt_template .format (
213- language_from = language_from , language_to = language_to , source_language = page_text
214- )
210+ prompt = prompt_template .format (language_from = language_from , language_to = language_to , source_language = page_text )
215211
216212 # Create chat completion request with streaming
217213 chat_request_dict = {
218214 "model" : LLM_MODEL_ID ,
219215 "messages" : [{"role" : "user" , "content" : prompt }],
220216 "max_tokens" : 4096 ,
221- "stream" : True
217+ "stream" : True ,
222218 }
223219
224220 result_dict , runtime_graph = await self .megaservice .schedule (initial_inputs = chat_request_dict )
@@ -235,21 +231,21 @@ async def translate_page(self, page_text: str, language_from: str, language_to:
235231
236232 # Get the response body iterator
237233 async for chunk in response .body_iterator :
238- chunk_str = chunk .decode (' utf-8' ) if isinstance (chunk , bytes ) else chunk
234+ chunk_str = chunk .decode (" utf-8" ) if isinstance (chunk , bytes ) else chunk
239235
240236 # Parse SSE format
241- lines = chunk_str .split (' \n ' )
237+ lines = chunk_str .split (" \n " )
242238 for line in lines :
243- if line .startswith (' data: ' ):
239+ if line .startswith (" data: " ):
244240 data = line [6 :] # Remove "data: " prefix
245241
246- if data == ' [DONE]' :
242+ if data == " [DONE]" :
247243 continue
248244
249245 try :
250246 parsed = json .loads (data )
251247 # Extract content from chat completion format
252- text = parsed .get (' choices' , [{}])[0 ].get (' delta' , {}).get (' content' , '' )
248+ text = parsed .get (" choices" , [{}])[0 ].get (" delta" , {}).get (" content" , "" )
253249 if text :
254250 accumulated_text += text
255251 except :
@@ -274,7 +270,7 @@ async def handle_request(self, request: Request):
274270 language_to = form_data .get ("language_to" )
275271 file = form_data .get ("file" )
276272
277- if not file or not hasattr (file , ' filename' ):
273+ if not file or not hasattr (file , " filename" ):
278274 raise HTTPException (status_code = 400 , detail = "No file uploaded" )
279275
280276 if not language_to :
@@ -368,7 +364,7 @@ async def handle_request(self, request: Request):
368364 chat_request_dict = {
369365 "model" : LLM_MODEL_ID ,
370366 "messages" : [{"role" : "user" , "content" : prompt }],
371- "stream" : True
367+ "stream" : True ,
372368 }
373369
374370 result_dict , runtime_graph = await self .megaservice .schedule (initial_inputs = chat_request_dict )
0 commit comments