Skip to content

Commit 76ee4f2

Browse files
feat: add epub support for knowledge base document upload (#7594)
* feat: add EPUB parsing support for knowledge base and file reader * feat: update supported file formats for document upload in knowledge base * feat: enhance EPUB parser to support spine order and generic containers * makeitdown parse epub * update parser * fix
1 parent 4398947 commit 76ee4f2

File tree

16 files changed

+566
-24
lines changed

16 files changed

+566
-24
lines changed

astrbot/core/computer/file_read_utils.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ class FileProbe:
7373

7474
@dataclass(frozen=True)
7575
class ParsedDocument:
76-
kind: Literal["docx", "pdf"]
76+
kind: Literal["docx", "epub", "pdf"]
7777
file_bytes: bytes
7878
text: str
7979

@@ -371,6 +371,18 @@ def _is_docx_bytes(file_bytes: bytes) -> bool:
371371
return any(name.startswith("word/") for name in names)
372372

373373

374+
def _is_epub_bytes(file_bytes: bytes) -> bool:
375+
try:
376+
with zipfile.ZipFile(io.BytesIO(file_bytes)) as archive:
377+
names = set(archive.namelist())
378+
with archive.open("mimetype") as mimetype_file:
379+
mimetype = mimetype_file.read(64).decode("utf-8").strip()
380+
except (KeyError, OSError, UnicodeDecodeError, zipfile.BadZipFile):
381+
return False
382+
383+
return mimetype == "application/epub+zip" and "META-INF/container.xml" in names
384+
385+
374386
async def _parse_local_docx_text(file_bytes: bytes, file_name: str) -> str:
375387
from astrbot.core.knowledge_base.parsers.markitdown_parser import (
376388
MarkitdownParser,
@@ -387,23 +399,48 @@ async def _parse_local_pdf_text(file_bytes: bytes, file_name: str) -> str:
387399
return result.text
388400

389401

402+
async def _parse_local_epub_text(file_bytes: bytes, file_name: str) -> str:
    """Extract cleaned plain text from raw EPUB bytes via ``EpubParser``."""
    # Imported lazily so this module stays cheap to import when EPUB
    # support is never exercised.
    from astrbot.core.knowledge_base.parsers.epub_parser import EpubParser

    parsed = await EpubParser().parse(file_bytes, file_name)
    return parsed.text
407+
408+
390409
async def _parse_local_supported_document(
    path: str,
    sample: bytes,
) -> ParsedDocument | None:
    """Parse a local file into a ParsedDocument when it is a supported type.

    Detection order: PDF sniffing first, then explicit ``.epub`` / ``.docx``
    suffixes (validated against the actual bytes), and finally a generic
    ZIP container probed as EPUB, then DOCX. Returns ``None`` when the file
    is not a supported document. File bytes are only read for the branch
    that matched, so unsupported files cost no extra I/O.
    """
    p = Path(path)
    file_name = p.name
    suffix = p.suffix.lower()

    if _looks_like_pdf(path, sample):
        data = await _read_local_file_bytes(path)
        return ParsedDocument(
            kind="pdf",
            file_bytes=data,
            text=await _parse_local_pdf_text(data, file_name),
        )

    if suffix == ".epub":
        data = await _read_local_file_bytes(path)
        # A .epub name with non-EPUB contents is rejected outright.
        if not _is_epub_bytes(data):
            return None
        return ParsedDocument(
            kind="epub",
            file_bytes=data,
            text=await _parse_local_epub_text(data, file_name),
        )

    if suffix == ".docx":
        data = await _read_local_file_bytes(path)
        if not _is_docx_bytes(data):
            return None
        return ParsedDocument(
            kind="docx",
            file_bytes=data,
            text=await _parse_local_docx_text(data, file_name),
        )

    if _looks_like_zip_container(sample):
        # Extension gave no hint; sniff the archive contents. EPUB is
        # checked before DOCX because its signature is more specific.
        data = await _read_local_file_bytes(path)
        if _is_epub_bytes(data):
            return ParsedDocument(
                kind="epub",
                file_bytes=data,
                text=await _parse_local_epub_text(data, file_name),
            )
        if _is_docx_bytes(data):
            return ParsedDocument(
                kind="docx",
                file_bytes=data,
                text=await _parse_local_docx_text(data, file_name),
            )

    return None
408445

409446

astrbot/core/knowledge_base/kb_helper.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@ async def _repair_and_translate_chunk_with_retry(
109109
return [chunk]
110110

111111

112+
def _compact_chunks(chunks: list[str]) -> list[str]:
113+
return [chunk.strip() for chunk in chunks if chunk and chunk.strip()]
114+
115+
112116
class KBHelper:
113117
vec_db: BaseVecDB
114118
kb: KnowledgeBase
@@ -249,7 +253,7 @@ async def upload_document(
249253

250254
if pre_chunked_text is not None:
251255
# 如果提供了预分块文本,直接使用
252-
chunks_text = pre_chunked_text
256+
chunks_text = _compact_chunks(pre_chunked_text)
253257
file_size = sum(len(chunk) for chunk in chunks_text)
254258
logger.info(f"使用预分块文本进行上传,共 {len(chunks_text)} 个块。")
255259
else:
@@ -316,6 +320,7 @@ async def upload_document(
316320
chunk_size=chunk_size,
317321
chunk_overlap=chunk_overlap,
318322
)
323+
chunks_text = _compact_chunks(chunks_text)
319324
except KnowledgeBaseUploadError:
320325
raise
321326
except Exception as exc:
@@ -728,6 +733,8 @@ async def _clean_and_rechunk_content(
728733
elif isinstance(result, list):
729734
final_chunks.extend(result)
730735

736+
final_chunks = _compact_chunks(final_chunks)
737+
731738
logger.info(
732739
f"文本修复完成: {len(initial_chunks)} 个原始块 -> {len(final_chunks)} 个最终块。"
733740
)

astrbot/core/knowledge_base/parsers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
"""文档解析器模块"""
22

33
from .base import BaseParser, MediaItem, ParseResult
4+
from .epub_parser import EpubParser
45
from .pdf_parser import PDFParser
56
from .text_parser import TextParser
67

78
__all__ = [
89
"BaseParser",
10+
"EpubParser",
911
"MediaItem",
1012
"PDFParser",
1113
"ParseResult",
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
"""EPUB document parser."""
2+
3+
import html
4+
import re
5+
6+
from astrbot.core.knowledge_base.parsers.base import BaseParser, ParseResult
7+
8+
_KEYS = (
9+
"Title|Author|Creator|Language|Publisher|Date|Modified|Identifier|ISBN|Description|"
10+
"Subject|Rights|Source|Series|标题|书名|作者|语言|出版社|日期|出版日期|标识符|简介|描述|"
11+
"主题|版权|来源|系列|タイトル|書名|著者|言語|出版社|日付|識別子|説明|件名|権利|ソース|シリーズ"
12+
)
13+
_META_RE = re.compile(rf"^\s*(?:[-*]\s*)?\*\*(?:{_KEYS})\s*[::]\*\*\s+\S")
14+
_TOC_HEAD_RE = re.compile(
15+
r"^\s{0,3}(?:#{1,6}\s*)?(?:table of contents|contents|toc|目录|目次|もくじ)\s*$",
16+
re.I,
17+
)
18+
_LINK_RE = re.compile(r"(?<!!)\[([^\]]+)\]\(([^)]+)\)")
19+
_IMG_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
20+
_EMPTY_IMG_LINK_RE = re.compile(
21+
r"\[\s*\]\([^)]+\.(?:png|jpe?g|gif|webp|svg)(?:#[^)]+)?\)", re.I
22+
)
23+
_FOOTNOTE_LABEL_RE = re.compile(
24+
r"^(?:\d{1,3}|[ivxlcdm]{1,8}|[*†‡§¶]|↩|↑|back|return|返回|回到正文)$", re.I
25+
)
26+
_FOOTNOTE_HREF_RE = re.compile(
27+
r"(?:^#|[#/_-](?:fn|footnote|note|noteref|backlink|return|filepos)\b)", re.I
28+
)
29+
_DOTTED_TOC_RE = re.compile(r"^\s*.+?\.{2,}\s*(?:\d+|[ivxlcdm]+)\s*$", re.I)
30+
_SEP_RE = re.compile(r"^\s*(?:[-=*_]){3,}\s*$")
31+
_NOISE_RE = re.compile(
32+
r"^\s*(?:\[\s*)?(?:\d{1,3}|[ivxlcdm]{1,8}|[*†‡§¶]|↩|↑)(?:\s*\])?\s*$", re.I
33+
)
34+
_GENERIC_ALT_RE = re.compile(
35+
r"^(?:image|img|picture|photo|illustration|figure|fig|cover|插图|图片|图像|封面)\s*[\d._-]*$",
36+
re.I,
37+
)
38+
_FILENAME_ALT_RE = re.compile(r"^[\w.\- ]+\.(?:png|jpe?g|gif|webp|svg)$", re.I)
39+
40+
41+
def _n(s: str) -> str:
42+
return (
43+
html.unescape(s)
44+
.replace("\r\n", "\n")
45+
.replace("\r", "\n")
46+
.replace("\ufeff", "")
47+
.replace("\u00a0", " ")
48+
.replace("\u200b", "")
49+
)
50+
51+
52+
def _is_internal(href: str) -> bool:
53+
href = html.unescape(href).strip().lower()
54+
return (
55+
href.startswith("#")
56+
or href.endswith(".html")
57+
or href.endswith(".xhtml")
58+
or ".html#" in href
59+
or ".xhtml#" in href
60+
)
61+
62+
63+
def _is_toc_line(s: str) -> bool:
64+
s = s.strip()
65+
if not s:
66+
return False
67+
s = re.sub(r"^\s*(?:[-*+]|\d+\.)\s+", "", s)
68+
m = re.fullmatch(r"\[([^\]]+)\]\(([^)]+)\)", s)
69+
return bool((m and _is_internal(m.group(2))) or _DOTTED_TOC_RE.match(s))
70+
71+
72+
def _strip_head(text: str) -> str:
    """Strip book front matter (metadata list and table of contents) from
    the start of *text*, returning the remaining body.

    The scan is conservative: the metadata block is only removed when at
    least two consecutive metadata lines are found, and the ToC is only
    removed when it is long enough to be unambiguous (see below).
    """
    lines = _n(text).split("\n")
    i = 0
    # Skip leading blank lines.
    while i < len(lines) and not lines[i].strip():
        i += 1
    start = i
    # Consume a run of "**Key:** value" metadata lines.
    while i < len(lines) and _META_RE.match(lines[i].strip()):
        i += 1
    if i - start >= 2:
        # Confident it was a metadata block: also swallow trailing blanks.
        while i < len(lines) and not lines[i].strip():
            i += 1
    else:
        # A single match is too weak a signal — rewind, keep the line.
        i = start
    # toc0 marks where a possible ToC begins; had_head records whether an
    # explicit "Contents"-style heading introduced it.
    toc0, had_head = i, False
    if i < len(lines) and _TOC_HEAD_RE.match(lines[i].strip()):
        had_head = True
        i += 1
        while i < len(lines) and not lines[i].strip():
            i += 1
    toc = 0
    # Scan at most 120 lines past toc0 for ToC-shaped entries.
    while i < len(lines) and i - toc0 < 120:
        s = lines[i].strip()
        if not s:
            # A single blank inside the ToC is tolerated when the next
            # line continues the ToC; otherwise the ToC has ended.
            if toc and i + 1 < len(lines) and _is_toc_line(lines[i + 1]):
                i += 1
                continue
            break
        if not _is_toc_line(s):
            break
        toc += 1
        i += 1
    # Strip the ToC only when confident: >=2 entries under an explicit
    # heading, or >=3 entries without one.
    if toc >= 2 and (had_head or toc >= 3):
        while i < len(lines) and not lines[i].strip():
            i += 1
        return "\n".join(lines[i:]).strip()
    # Not confident — keep everything from toc0 (metadata, if any, is
    # still removed).
    return "\n".join(lines[toc0:]).strip()
108+
109+
110+
def _strip_links(text: str) -> str:
    """Flatten internal markdown links to their label text.

    Footnote markers and backlinks are deleted entirely; links to external
    URLs are kept untouched.
    """

    def _replace(match: re.Match[str]) -> str:
        label = html.unescape(match.group(1)).strip()
        href = html.unescape(match.group(2)).strip().lower()
        if not _is_internal(href):
            # External target — preserve the full markdown link.
            return match.group(0)
        looks_like_footnote = bool(_FOOTNOTE_HREF_RE.search(href)) or (
            href.startswith("#") and bool(_FOOTNOTE_LABEL_RE.fullmatch(label))
        )
        return "" if looks_like_footnote else label

    return _LINK_RE.sub(_replace, _n(text))
123+
124+
125+
def _img_alt(m: re.Match[str]) -> str:
126+
alt = re.sub(r"\s+", " ", html.unescape(m.group(1)).strip())
127+
if not alt or _GENERIC_ALT_RE.fullmatch(alt) or _FILENAME_ALT_RE.fullmatch(alt):
128+
return ""
129+
return alt
130+
131+
132+
def _sanitize(text: str) -> str:
    """Final line-level cleanup of the converted markdown.

    Per line: replaces images with their (useful) alt text, deletes empty
    image links, drops separator and noise-only lines, collapses runs of
    blank lines to one, and removes an immediately repeated heading/line.
    """
    # prev_blank: whether the last emitted line was blank (starts True so
    # leading blanks are swallowed). prev: normalized form of the last
    # emitted non-blank line, used for duplicate suppression.
    out, prev_blank, prev = [], True, ""
    for raw in _n(text).split("\n"):
        line = _IMG_RE.sub(_img_alt, raw)
        line = _EMPTY_IMG_LINK_RE.sub("", line).rstrip()
        s = line.strip()
        if not s:
            # Collapse consecutive blank lines into a single one.
            if not prev_blank:
                out.append("")
            prev_blank = True
            continue
        if _SEP_RE.match(s) or _NOISE_RE.match(s):
            continue
        # Normalize for duplicate detection: strip markdown heading
        # markers and emphasis, then casefold.
        norm = re.sub(r"^\s{0,3}#{1,6}\s*", "", s).strip("*_ ").casefold()
        # Skip a short line that repeats the previous one verbatim (e.g. a
        # chapter title emitted both as heading and as plain text).
        if norm and norm == prev and len(norm) <= 120:
            continue
        out.append(line)
        prev_blank = False
        prev = norm
    return "\n".join(out).strip()
152+
153+
154+
class EpubParser(BaseParser):
    """Parse EPUB files by converting them with MarkItDown and cleaning
    the resulting markdown (front matter, internal links, noise lines)."""

    async def parse(self, file_content: bytes, file_name: str) -> ParseResult:
        """Convert *file_content* to cleaned plain text.

        Returns a ParseResult whose text has front matter stripped,
        internal links flattened, and per-line noise removed; media items
        are passed through from the underlying converter.
        """
        # Lazy import keeps MarkItDown out of module import time.
        from .markitdown_parser import MarkitdownParser

        converted = await MarkitdownParser().parse(file_content, file_name)
        cleaned = _sanitize(_strip_links(_strip_head(converted.text)))
        return ParseResult(text=cleaned, media=converted.media)

astrbot/core/knowledge_base/parsers/util.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ async def select_parser(ext: str) -> BaseParser:
66
from .markitdown_parser import MarkitdownParser
77

88
return MarkitdownParser()
9+
if ext == ".epub":
10+
from .epub_parser import EpubParser
11+
12+
return EpubParser()
913
if ext == ".pdf":
1014
from .pdf_parser import PDFParser
1115

dashboard/src/i18n/locales/en-US/features/alkaid/knowledge-base.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@
7070
},
7171
"upload": {
7272
"title": "Upload Files to Knowledge Base",
73-
"subtitle": "Supports txt, pdf, word, excel and other formats",
73+
"subtitle": "Supports txt, pdf, epub, word, excel and other formats",
7474
"dropzone": "Drag and drop files here or click to upload",
7575
"chunkSettings": {
7676
"title": "Chunk Settings",
@@ -152,4 +152,4 @@
152152
"preRequisite": "Hint: Please go to the plugin market to install astrbot_plugin_url_2_knowledge_base and follow the instructions in the plugin documentation to complete the playwright installation before using this feature.",
153153
"allChunksUploaded": "All chunks uploaded successfully"
154154
}
155-
}
155+
}

dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
"title": "Upload Document",
4747
"selectFile": "Select File",
4848
"dropzone": "Drop files here or click to select",
49-
"supportedFormats": "Supported formats: ",
49+
"supportedFormats": "Supported formats: .txt, .md, .pdf, .docx, .epub, .xls, .xlsx",
5050
"maxSize": "Max file size: 128MB",
5151
"chunkSettings": "Chunk Settings",
5252
"batchSettings": "Batch Settings",

dashboard/src/i18n/locales/ru-RU/features/knowledge-base/detail.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
"title": "Добавление контента",
4747
"selectFile": "Файл",
4848
"dropzone": "Нажмите или перетащите файл сюда",
49-
"supportedFormats": "Форматы: ",
49+
"supportedFormats": "Форматы: .txt, .md, .pdf, .docx, .epub, .xls, .xlsx",
5050
"maxSize": "Максимум: 128MB",
5151
"chunkSettings": "Фрагментация",
5252
"batchSettings": "Пакетная обработка",
@@ -115,4 +115,4 @@
115115
"saveFailed": "Ошибка сохранения",
116116
"tips": "Внимание! Изменение этих параметров повлияет на будущую выдачу базы знаний."
117117
}
118-
}
118+
}

dashboard/src/i18n/locales/zh-CN/features/alkaid/knowledge-base.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@
7070
},
7171
"upload": {
7272
"title": "上传文件到知识库",
73-
"subtitle": "支持 txt、pdf、word、excel 等多种格式",
73+
"subtitle": "支持 txt、pdf、epub、word、excel 等多种格式",
7474
"dropzone": "拖放文件到这里或点击上传",
7575
"chunkSettings": {
7676
"title": "分片设置",
@@ -152,4 +152,4 @@
152152
"preRequisite": "提示:请先前往插件市场安装 astrbot_plugin_url_2_knowledge_base 并根据插件文档内的指示完成 playwright 安装后才可使用本功能",
153153
"allChunksUploaded": "所有分片上传成功"
154154
}
155-
}
155+
}

dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
"title": "上传文档",
4747
"selectFile": "选择文件",
4848
"dropzone": "拖放文件到这里或点击选择",
49-
"supportedFormats": "支持的格式: ",
49+
"supportedFormats": "支持的格式: .txt, .md, .pdf, .docx, .epub, .xls, .xlsx",
5050
"maxSize": "最大文件大小: 128MB",
5151
"chunkSettings": "分块设置",
5252
"batchSettings": "批处理设置",

0 commit comments

Comments
 (0)