Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions runtime/ops/formatter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def _import_operators():
from . import img_formatter
from . import file_exporter
from . import slide_formatter
from . import unstructured_formatter


_import_operators()
6 changes: 6 additions & 0 deletions runtime/ops/formatter/unstructured_formatter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

# Register this operator with the OPERATORS registry, pairing the operator
# name with the dotted path of the module that implements it.
OPERATORS.register_module(
    module_name='UnstructuredFormatter',
    module_path="ops.formatter.unstructured_formatter.process",
)
16 changes: 16 additions & 0 deletions runtime/ops/formatter/unstructured_formatter/metadata.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
name: '非结构化文本抽取'
name_en: 'Unstructured Text Extraction'
description: '抽取非结构化文件的文本,目前支持word文档'
description_en: 'Extracts text from unstructured files; currently supports Word documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'UnstructuredFormatter'
version: '1.0.0'
types:
- 'collect'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'
35 changes: 35 additions & 0 deletions runtime/ops/formatter/unstructured_formatter/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@

#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: Unstructured text extraction
Create: 2025/10/22 15:15
"""
import time
from typing import Dict, Any

from loguru import logger
from unstructured.partition.auto import partition

from datamate.core.base_op import Mapper


class UnstructuredFormatter(Mapper):
    """Extract the text content of an unstructured input file as plain text."""

    def __init__(self, *args, **kwargs):
        """Forward all positional and keyword arguments to ``Mapper``."""
        super().__init__(*args, **kwargs)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Partition the sample's file and store the joined text on the sample.

        The file path is read from ``sample[self.filepath_key]`` and handed to
        ``partition``; the string form of every returned element is joined
        with a blank line and written to ``sample[self.text_key]``.

        Args:
            sample: Record describing the file to process. Only the entries
                under ``self.filepath_key`` and ``self.filename_key`` are read.

        Returns:
            The same ``sample`` mapping, updated in place with the extracted
            text under ``self.text_key``.

        Raises:
            UnicodeDecodeError: Logged with its traceback and re-raised when
                raised during extraction. Any other exception propagates
                without being logged here.
        """
        began_at = time.time()
        source_path = sample.get(self.filepath_key)
        # The file name is only used to label the log lines below.
        display_name = sample.get(self.filename_key)
        try:
            parts = partition(filename=source_path)
            sample[self.text_key] = "\n\n".join(str(part) for part in parts)
            elapsed = time.time() - began_at
            logger.info(f"fileName: {display_name}, method: UnstructuredFormatter costs {elapsed:6f} s")
        except UnicodeDecodeError as err:
            logger.exception(f"fileName: {display_name}, method: UnstructuredFormatter causes decode error: {err}")
            raise
        return sample
1 change: 1 addition & 0 deletions runtime/ops/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ xmltodict==1.0.2
zhconv==1.4.3
sqlalchemy==2.0.40
pymysql==1.1.1
unstructured[docx]==0.18.15
5 changes: 3 additions & 2 deletions scripts/db/data-operator-init.sql
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ VALUES (1, '模态', 'predefined', 0),
INSERT IGNORE INTO t_operator
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
('UnstructuredFormatter', '非结构化文本抽取', '抽取非结构化文件的文本,目前支持word文档。', '1.0.0', 'text', 'text', null, null, '', false),
('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false),
('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
Expand Down Expand Up @@ -121,7 +122,7 @@ AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRe
'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner');
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'UnstructuredFormatter');

INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT c.id, o.id
Expand All @@ -137,4 +138,4 @@ SELECT c.id, o.id
FROM t_operator_category c
CROSS JOIN t_operator o
WHERE c.id IN (7, 8, 11)
AND o.id IN ('FileExporter');
AND o.id IN ('FileExporter', 'UnstructuredFormatter');
2 changes: 1 addition & 1 deletion scripts/images/runtime/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ ENV PYTHONPATH=/opt/runtime/datamate/

RUN sed -i 's/deb.debian.org/mirrors.huaweicloud.com/g' /etc/apt/sources.list.d/debian.sources \
&& apt update \
&& apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 \
&& apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 libreoffice\
&& apt clean \
&& rm -rf /var/lib/apt/lists/*

Expand Down