Skip to content

Commit 155603b

Browse files
Startalkersyc366
andauthored
feature: add external pdf extract operator by using mineru (#36)
* feature: add UnstructuredFormatter * feature: add UnstructuredFormatter in db * feature: add unstructured[docx]==0.18.15 * feature: support doc * feature: add mineru * feature: add external pdf extract operator by using mineru * feature: mineru docker install bugfix --------- Co-authored-by: Startalker <438747480@qq.com>
1 parent 2f7341d commit 155603b

12 files changed

Lines changed: 370 additions & 3 deletions

File tree

Makefile

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
MAKEFLAGS += --no-print-directory
22

3+
# MinerU is not built by default; run `make build WITH_MINERU=true` to include it.
# The comment lives on its own line because an inline comment after the value
# would leave trailing whitespace inside the variable.
WITH_MINERU ?= false
34
VERSION ?= latest
45
NAMESPACE ?= datamate
56

@@ -8,7 +9,7 @@ build-%:
89
$(MAKE) $*-docker-build
910

1011
.PHONY: build
11-
build: backend-docker-build frontend-docker-build runtime-docker-build
12+
# $(if ...) treats any non-empty string (including "false") as true, so test
# explicitly for the word "true" before adding the optional MinerU image build.
build: backend-docker-build frontend-docker-build runtime-docker-build $(if $(filter true,$(WITH_MINERU)),mineru-docker-build)
1213

1314
.PHONY: create-namespace
1415
create-namespace:
@@ -85,6 +86,9 @@ deer-flow-docker-build:
8586
cp deployment/docker/deer-flow/conf.yaml.example ../deer-flow/conf.yaml
8687
cd ../deer-flow && docker compose build
8788

89+
# Build the datamate-mineru image, tagged with $(VERSION), from
# scripts/images/mineru/Dockerfile using the current directory as build context.
.PHONY: mineru-docker-build
90+
mineru-docker-build:
91+
docker build -t datamate-mineru:$(VERSION) . -f scripts/images/mineru/Dockerfile
8892
.PHONY: backend-docker-install
8993
backend-docker-install:
9094
cd deployment/docker/datamate && docker compose up -d backend
@@ -109,6 +113,22 @@ runtime-docker-install:
109113
runtime-docker-uninstall:
110114
cd deployment/docker/datamate && docker compose down runtime
111115

116+
# Start the datamate-mineru compose service in detached mode.
# Note: `cp .env.example .env` overwrites any existing .env in deployment/docker/datamate.
.PHONY: mineru-docker-install
117+
mineru-docker-install:
118+
cd deployment/docker/datamate && cp .env.example .env && docker compose up -d datamate-mineru
119+
120+
# Take down the datamate-mineru compose service.
.PHONY: mineru-docker-uninstall
121+
mineru-docker-uninstall:
122+
cd deployment/docker/datamate && docker compose down datamate-mineru
123+
124+
# Apply the MinerU Kubernetes manifest into $(NAMESPACE); create-namespace runs first.
.PHONY: mineru-k8s-install
125+
mineru-k8s-install: create-namespace
126+
kubectl apply -f deployment/kubernetes/mineru/deploy.yaml -n $(NAMESPACE)
127+
128+
# Delete the resources defined by the MinerU Kubernetes manifest from $(NAMESPACE).
.PHONY: mineru-k8s-uninstall
129+
mineru-k8s-uninstall:
130+
kubectl delete -f deployment/kubernetes/mineru/deploy.yaml -n $(NAMESPACE)
131+
112132
.PHONY: datamate-docker-install
113133
datamate-docker-install:
114134
cd deployment/docker/datamate && cp .env.example .env && docker compose -f docker-compose.yml up -d

deployment/docker/datamate/docker-compose.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ services:
6060
MYSQL_USER: "root"
6161
MYSQL_PASSWORD: "password"
6262
MYSQL_DATABASE: "datamate"
63+
PDF_FORMATTER_BASE_URL: "http://datamate-mineru:9001"
6364
command:
6465
- python
6566
- /opt/runtime/datamate/operator_runtime.py
@@ -72,6 +73,27 @@ services:
7273
- flow_volume:/flow
7374
networks: [ datamate ]
7475

76+
# 4) mineru: optional PDF-extraction service, only started when the "mineru"
# profile is active or the service is named explicitly on the command line.
# It listens on port 9001, the port used in the runtime's PDF_FORMATTER_BASE_URL.
# NOTE(review): the image has no tag, so it resolves to datamate-mineru:latest;
# confirm this matches the tag produced by the Makefile when VERSION is overridden.
# NOTE(review): `privileged: true` grants broad host access; it looks unnecessary
# for the default cpu device mode. Confirm whether it is only needed for npu/cuda.
77+
datamate-mineru:
78+
container_name: datamate-mineru
79+
image: datamate-mineru
80+
restart: on-failure
81+
environment:
82+
MINERU_MODEL_SOURCE: local
83+
MINERU_DEVICE_MODE: cpu # cpu|cuda|npu|mps
84+
MINERU_BACKEND_MODE: pipeline
85+
privileged: true
86+
command:
87+
- python
88+
- /opt/runtime/datamate/mineru/mineru_api.py
89+
- --port
90+
- "9001"
91+
volumes:
92+
- dataset_volume:/dataset
93+
- mineru_log_volume:/var/log/datamate/mineru
94+
networks: [ datamate ]
95+
profiles: [ mineru ]
96+
7597
volumes:
7698
dataset_volume:
7799
name: datamate-dataset-volume

deployment/helm/datamate/charts/ray-cluster/values.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ head:
7777
value: "password"
7878
- name: MYSQL_DATABASE
7979
value: "datamate"
80+
- name: PDF_FORMATTER_BASE_URL
81+
value: "http://datamate-mineru:9001"
8082
# - name: EXAMPLE_ENV
8183
# value: "1"
8284
envFrom: []
@@ -154,6 +156,8 @@ head:
154156
value: "password"
155157
- name: MYSQL_DATABASE
156158
value: "datamate"
159+
- name: PDF_FORMATTER_BASE_URL
160+
value: "http://datamate-mineru:9001"
157161
ports:
158162
- containerPort: 8081
159163
volumeMounts:
@@ -221,6 +225,8 @@ worker:
221225
value: "password"
222226
- name: MYSQL_DATABASE
223227
value: "datamate"
228+
- name: PDF_FORMATTER_BASE_URL
229+
value: "http://datamate-mineru:9001"
224230
# - name: EXAMPLE_ENV
225231
# value: "1"
226232
envFrom: []
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
apiVersion: apps/v1
2+
kind: Deployment
3+
metadata:
4+
name: datamate-mineru
5+
labels:
6+
app: datamate
7+
tier: mineru
8+
spec:
9+
replicas: 1
10+
selector:
11+
matchLabels:
12+
app: datamate
13+
tier: mineru
14+
template:
15+
metadata:
16+
labels:
17+
app: datamate
18+
tier: mineru
19+
spec:
20+
containers:
21+
- name: mineru
22+
image: datamate-mineru
23+
imagePullPolicy: IfNotPresent
24+
command:
25+
- python
26+
- /opt/runtime/datamate/mineru/mineru_api.py
27+
- --port
28+
- "9001"
29+
env:
30+
- name: MINERU_MODEL_SOURCE
31+
value: local
32+
- name: MINERU_DEVICE_MODE
33+
value: cpu
34+
- name: MINERU_BACKEND_MODE
35+
value: pipeline
36+
ports:
37+
- containerPort: 9001
38+
volumeMounts:
39+
- name: dataset-volume
40+
mountPath: /dataset
41+
- name: log-volume
42+
mountPath: /var/log/datamate/mineru
43+
subPath: mineru
44+
volumes:
45+
- name: dataset-volume
46+
hostPath:
47+
path: /opt/datamate/data/dataset
48+
type: DirectoryOrCreate
49+
- name: log-volume
50+
hostPath:
51+
path: /opt/datamate/data/log
52+
type: DirectoryOrCreate
53+
54+
---
55+
apiVersion: v1
56+
kind: Service
57+
metadata:
58+
name: datamate-mineru
59+
labels:
60+
app: datamate
61+
tier: mineru
62+
spec:
63+
type: ClusterIP
64+
ports:
65+
- port: 9001
66+
targetPort: 9001
67+
protocol: TCP
68+
selector:
69+
app: datamate
70+
tier: mineru

runtime/mineru/mineru_api.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import shutil
2+
import time
3+
import uuid
4+
import os
5+
6+
import click
7+
import uvicorn
8+
from pydantic import BaseModel
9+
from pathlib import Path
10+
from fastapi import FastAPI
11+
from fastapi.responses import JSONResponse
12+
from loguru import logger
13+
from mineru.cli.common import aio_do_parse, read_fn
14+
from mineru.cli.fast_api import get_infer_result
15+
16+
# Logging configuration: create the log directory at import time and add a
# DEBUG-level loguru sink writing to mineru.log inside it.
17+
LOG_DIR = "/var/log/datamate/mineru"
18+
os.makedirs(LOG_DIR, exist_ok=True)
19+
logger.add(
20+
f"{LOG_DIR}/mineru.log",
21+
format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {name}:{function}:{line} - {message}",
22+
level="DEBUG",
23+
enqueue=True
24+
)
25+
26+
# FastAPI application object that the route below is registered on.
app = FastAPI()
27+
# Request body for POST /api/pdf-extract.
# source_path: path of the PDF file to read.
# export_path: parent directory under which a per-request scratch directory is created.
class PDFParseRequest(BaseModel):
28+
source_path: str
29+
export_path: str
30+
31+
@app.post(path="/api/pdf-extract")
async def parse_pdf(request: PDFParseRequest):
    """Parse a PDF with MinerU and return the extracted markdown.

    Args:
        request: Carries ``source_path`` (the PDF to read) and ``export_path``
            (parent directory for a per-request scratch directory).

    Returns:
        JSONResponse: 200 with ``{"result": <content>}`` on success, 400 with
        ``{"error": ...}`` when the file is not a PDF or cannot be loaded, and
        500 with ``{"error": ...}`` when parsing fails.
    """
    start = time.time()
    file_path = Path(request.source_path)
    file_suffix = file_path.suffix.lower()

    # Validate before touching the filesystem so a rejected request does not
    # leave an empty scratch directory behind.
    if file_suffix != ".pdf":
        return JSONResponse(
            status_code=400,
            content={"error": f"Unsupported file type: {file_suffix}"}
        )

    try:
        pdf_bytes = read_fn(file_path)
    except Exception as e:
        return JSONResponse(
            status_code=400,
            content={"error": f"Failed to load file: {str(e)}"}
        )
    pdf_name = file_path.stem

    # Unique output directory for this request.
    unique_id = str(uuid.uuid4())
    unique_dir = os.path.join(request.export_path, unique_id)
    try:
        os.makedirs(unique_dir, exist_ok=True)

        # Run the asynchronous parser.
        await aio_do_parse(
            output_dir=unique_dir,
            pdf_file_names=[pdf_name],
            pdf_bytes_list=[pdf_bytes],
            p_lang_list=["ch"],
            f_draw_layout_bbox=False,
            f_draw_span_bbox=False,
            f_dump_orig_pdf=False,
        )

        # Default to "pipeline" so an unset variable no longer raises
        # AttributeError on None after the parse has already run.
        # NOTE(review): the backend mode is not forwarded to aio_do_parse here;
        # confirm the parser's own default matches MINERU_BACKEND_MODE.
        backend_mode = os.getenv("MINERU_BACKEND_MODE", "pipeline")
        result_subdir = "auto" if backend_mode.startswith("pipeline") else "vlm"
        parse_dir = os.path.join(unique_dir, pdf_name, result_subdir)

        content = ""
        if os.path.exists(parse_dir):
            content = get_infer_result(".md", pdf_name, parse_dir)

        logger.info(f"fileName: {file_path.name} costs {time.time() - start:.6f} s")

        return JSONResponse(status_code=200, content={"result": content})
    except Exception as e:
        logger.exception(e)
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to process file: {str(e)}"}
        )
    finally:
        # Always remove the scratch directory, including on failure paths;
        # cleanup errors are logged but never mask the response.
        if os.path.exists(unique_dir):
            try:
                shutil.rmtree(unique_dir)
            except Exception as e:
                logger.error(f"Failed to remove unique dir for {unique_id}: {str(e)}")
95+
96+
97+
# CLI entry point: starts the FastAPI app under uvicorn on the given host/port.
# The --port help text now states the real default (9001) instead of 8082.
@click.command()
@click.option('--ip', default='0.0.0.0', help='Service ip for this API, default to use 0.0.0.0.')
@click.option('--port', default=9001, type=int, help='Service port for this API, default to use 9001.')
def main(ip, port):
    """Create API for Submitting Job to MinerU"""
    logger.info(f"Start MinerU FastAPI Service: http://{ip}:{port}")
    uvicorn.run(
        app,
        host=ip,
        port=port
    )


if __name__ == "__main__":
    main()
112+

runtime/ops/formatter/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def _import_operators():
2121
from . import file_exporter
2222
from . import slide_formatter
2323
from . import unstructured_formatter
24+
from . import external_pdf_formatter
2425

2526

2627
_import_operators()
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# -*- coding: utf-8 -*-
2+
3+
from datamate.core.base_op import OPERATORS
4+
5+
# Register the ExternalPDFFormatter operator under its module name, pointing
# the registry at the module path that implements it.
OPERATORS.register_module(module_name='ExternalPDFFormatter',
6+
module_path="ops.formatter.external_pdf_formatter.process")
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
name: '外部PDF文本抽取'
2+
name_en: 'External PDF Text Extraction'
3+
description: '基于外部API,抽取PDF中的文本。'
4+
description_en: 'Extracts text from PDF files based on external APIs.'
5+
language: 'python'
6+
vendor: 'huawei'
7+
raw_id: 'ExternalPDFFormatter'
8+
version: '1.0.0'
9+
types:
10+
- 'collect'
11+
modal: 'text'
12+
effect:
13+
before: ''
14+
after: ''
15+
inputs: 'text'
16+
outputs: 'text'
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/usr/bin/python
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
Description: 外部PDF文本抽取
6+
Create: 2025/10/29 17:24
7+
"""
8+
import json
9+
import os
10+
import time
11+
from loguru import logger
12+
from typing import Dict, Any
13+
14+
from datamate.core.base_op import Mapper
15+
from datamate.common.utils.rest_client import http_request
16+
17+
18+
class ExternalPDFFormatter(Mapper):
    """Extract text from PDF files by delegating to an external HTTP API."""

    def __init__(self, *args, **kwargs):
        super(ExternalPDFFormatter, self).__init__(*args, **kwargs)
        # The deployment manifests export PDF_FORMATTER_BASE_URL, which the
        # original code never read. EXTERNAL_PDF_BASE_URL is still checked
        # first so any existing explicit override keeps working.
        self.base_url = (
            os.getenv("EXTERNAL_PDF_BASE_URL")
            or os.getenv("PDF_FORMATTER_BASE_URL")
            or "http://datamate-mineru:9001"
        )
        self.pdf_extract_url = f"{self.base_url.rstrip('/')}/api/pdf-extract"

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Send the sample's file to the extraction service and store the text.

        Args:
            sample: Must contain the filename, filepath and export-path keys
                defined on the operator.

        Returns:
            The same sample with ``self.text_key`` set to the service's
            ``result`` field (``None`` when the payload has no such field).

        Raises:
            UnicodeDecodeError: Logged and re-raised when decoding fails.
        """
        start = time.time()
        filename = sample[self.filename_key]
        try:
            data = {"source_path": sample[self.filepath_key], "export_path": sample[self.export_path_key]}
            response = http_request(method="POST", url=self.pdf_extract_url, data=data)
            payload = json.loads(response.text)
            if "result" not in payload:
                # The service answers {"error": ...} on failure; surface it
                # instead of silently storing None.
                logger.error(
                    f"fileName: {filename}, method: ExternalPDFFormatter got no result: {payload.get('error')}")
            sample[self.text_key] = payload.get("result")
            logger.info(
                f"fileName: {filename}, method: ExternalPDFFormatter costs {(time.time() - start):.6f} s")
        except UnicodeDecodeError as err:
            logger.exception(f"fileName: {filename}, method: ExternalPDFFormatter causes decode error: {err}")
            raise
        return sample

0 commit comments

Comments
 (0)