|
| 1 | +import shutil |
| 2 | +import time |
| 3 | +import uuid |
| 4 | +import os |
| 5 | + |
| 6 | +import click |
| 7 | +import uvicorn |
| 8 | +from pydantic import BaseModel |
| 9 | +from pathlib import Path |
| 10 | +from fastapi import FastAPI |
| 11 | +from fastapi.responses import JSONResponse |
| 12 | +from loguru import logger |
| 13 | +from mineru.cli.common import aio_do_parse, read_fn |
| 14 | +from mineru.cli.fast_api import get_infer_result |
| 15 | + |
# Logging configuration: create the log directory at import time and attach
# a file sink to loguru so that every request handled below is recorded.
LOG_DIR = "/var/log/datamate/mineru"
os.makedirs(LOG_DIR, exist_ok=True)

# Record layout: timestamp | level | module:function:line - message.
_LOG_FORMAT = "{time:YYYY-MM-DD HH:mm:ss} | {level} | {name}:{function}:{line} - {message}"

# enqueue=True asks loguru to pass records to the sink through a queue.
logger.add(f"{LOG_DIR}/mineru.log", format=_LOG_FORMAT, level="DEBUG", enqueue=True)

# ASGI application object; route handlers are registered on it below.
app = FastAPI()
class PDFParseRequest(BaseModel):
    """Request body accepted by the ``/api/pdf-extract`` endpoint."""

    # Path of the document to parse, as seen by this service's filesystem.
    source_path: str

    # Directory under which a per-request scratch directory is created.
    export_path: str
@app.post(path="/api/pdf-extract")
async def parse_pdf(request: PDFParseRequest):
    """Parse a PDF with MinerU and return the extracted Markdown.

    The file at ``request.source_path`` is read, parsed into a uniquely named
    scratch directory below ``request.export_path``, and the generated
    Markdown is returned. The scratch directory is always removed afterwards,
    whether parsing succeeded or failed.

    Responses:
        200: ``{"result": <content>}``; content is ``""`` when the expected
            output directory does not exist.
        400: ``{"error": ...}`` when the file is not a ``.pdf`` or cannot be
            read.
        500: ``{"error": ...}`` for any failure while parsing or collecting
            the result.
    """
    # NOTE(review): both paths come straight from the request body; this
    # assumes callers are trusted internal services -- confirm.
    start = time.time()
    file_path = Path(request.source_path)
    file_suffix = file_path.suffix.lower()

    # Validate the input before touching the filesystem, so a request that
    # can never succeed does not leave an empty scratch directory behind.
    if file_suffix != ".pdf":
        return JSONResponse(
            status_code=400,
            content={"error": f"Unsupported file type: {file_suffix}"}
        )

    try:
        pdf_bytes = read_fn(file_path)
    except Exception as e:
        return JSONResponse(
            status_code=400,
            content={"error": f"Failed to load file: {str(e)}"}
        )

    pdf_name = file_path.stem

    # Unique per-request output directory, so concurrent requests for files
    # with the same name cannot overwrite each other's output.
    unique_id = str(uuid.uuid4())
    unique_dir = os.path.join(request.export_path, unique_id)

    try:
        os.makedirs(unique_dir, exist_ok=True)

        # Run the asynchronous MinerU parser.
        await aio_do_parse(
            output_dir=unique_dir,
            pdf_file_names=[pdf_name],
            pdf_bytes_list=[pdf_bytes],
            p_lang_list=["ch"],
            f_draw_layout_bbox=False,
            f_draw_span_bbox=False,
            f_dump_orig_pdf=False,
        )

        # os.getenv returns None for an unset variable; default to "" so the
        # lookup cannot raise AttributeError after the parse has already run.
        # An unset variable is treated as a non-pipeline backend.
        backend_mode = os.getenv("MINERU_BACKEND_MODE", "")
        result_subdir = "auto" if backend_mode.startswith("pipeline") else "vlm"
        parse_dir = os.path.join(unique_dir, pdf_name, result_subdir)

        content = ""
        if os.path.exists(parse_dir):
            content = get_infer_result(".md", pdf_name, parse_dir)
        else:
            logger.warning(f"Parse output dir not found for {unique_id}: {parse_dir}")
    except Exception as e:
        logger.exception(e)
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to process file: {str(e)}"}
        )
    finally:
        # Best-effort cleanup on success and failure alike; a cleanup error
        # is logged but must not change the response.
        if os.path.exists(unique_dir):
            try:
                shutil.rmtree(unique_dir)
            except Exception as e:
                logger.error(f"Failed to remove unique dir for {unique_id}: {str(e)}")

    logger.info(f"fileName: {file_path.name} costs {time.time() - start:.6f} s")

    return JSONResponse(status_code=200, content={"result": content})
| 95 | + |
| 96 | + |
# The --port help text previously advertised 8082 while the real default is
# 9001; the help string now matches the actual default.
@click.command()
@click.option('--ip', default='0.0.0.0', help='Service ip for this API, default to use 0.0.0.0.')
@click.option('--port', default=9001, type=int, help='Service port for this API, default to use 9001.')
def main(ip, port):
    """Create API for Submitting Job to MinerU"""
    # The docstring above is shown by click as the command's --help text, so
    # it is kept short; ip/port are the bind address and port for uvicorn.
    logger.info(f"Start MinerU FastAPI Service: http://{ip}:{port}")
    uvicorn.run(
        app,
        host=ip,
        port=port
    )
| 108 | + |
| 109 | + |
# Start the service only when this file is executed as a script; importing
# the module (for example to reuse `app`) must not launch the server.
if __name__ == "__main__":
    main()
| 112 | + |
0 commit comments