Skip to content

Commit 76ee4f2

Browse files
feat: add epub support for knowledge base document upload (#7594)
* feat: add EPUB parsing support for knowledge base and file reader * feat: update supported file formats for document upload in knowledge base * feat: enhance EPUB parser to support spine order and generic containers * makeitdown parse epub * update parser * fix
1 parent 4398947 commit 76ee4f2

File tree

16 files changed

+566
-24
lines changed

16 files changed

+566
-24
lines changed

astrbot/core/computer/file_read_utils.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ class FileProbe:
7373

7474
@dataclass(frozen=True)
7575
class ParsedDocument:
76-
kind: Literal["docx", "pdf"]
76+
kind: Literal["docx", "epub", "pdf"]
7777
file_bytes: bytes
7878
text: str
7979

@@ -371,6 +371,18 @@ def _is_docx_bytes(file_bytes: bytes) -> bool:
371371
return any(name.startswith("word/") for name in names)
372372

373373

374+
def _is_epub_bytes(file_bytes: bytes) -> bool:
375+
try:
376+
with zipfile.ZipFile(io.BytesIO(file_bytes)) as archive:
377+
names = set(archive.namelist())
378+
with archive.open("mimetype") as mimetype_file:
379+
mimetype = mimetype_file.read(64).decode("utf-8").strip()
380+
except (KeyError, OSError, UnicodeDecodeError, zipfile.BadZipFile):
381+
return False
382+
383+
return mimetype == "application/epub+zip" and "META-INF/container.xml" in names
384+
385+
374386
async def _parse_local_docx_text(file_bytes: bytes, file_name: str) -> str:
375387
from astrbot.core.knowledge_base.parsers.markitdown_parser import (
376388
MarkitdownParser,
@@ -387,23 +399,48 @@ async def _parse_local_pdf_text(file_bytes: bytes, file_name: str) -> str:
387399
return result.text
388400

389401

402+
async def _parse_local_epub_text(file_bytes: bytes, file_name: str) -> str:
    """Extract cleaned plain text from raw EPUB bytes via ``EpubParser``."""
    # Imported lazily so this module stays cheap to import when EPUB
    # support is never exercised.
    from astrbot.core.knowledge_base.parsers.epub_parser import EpubParser

    parsed = await EpubParser().parse(file_bytes, file_name)
    return parsed.text
407+
408+
390409
async def _parse_local_supported_document(
    path: str,
    sample: bytes,
) -> ParsedDocument | None:
    """Parse a local file into a ParsedDocument when it is a supported type.

    Detection order: PDF sniffing first, then explicit ``.epub`` / ``.docx``
    suffixes (validated against the actual bytes), and finally a generic
    ZIP container probed as EPUB, then DOCX. Returns ``None`` when the file
    is not a supported document. File bytes are only read for the branch
    that matched, so unsupported files cost no extra I/O.
    """
    p = Path(path)
    file_name = p.name
    suffix = p.suffix.lower()

    if _looks_like_pdf(path, sample):
        data = await _read_local_file_bytes(path)
        return ParsedDocument(
            kind="pdf",
            file_bytes=data,
            text=await _parse_local_pdf_text(data, file_name),
        )

    if suffix == ".epub":
        data = await _read_local_file_bytes(path)
        # A .epub name with non-EPUB contents is rejected outright.
        if not _is_epub_bytes(data):
            return None
        return ParsedDocument(
            kind="epub",
            file_bytes=data,
            text=await _parse_local_epub_text(data, file_name),
        )

    if suffix == ".docx":
        data = await _read_local_file_bytes(path)
        if not _is_docx_bytes(data):
            return None
        return ParsedDocument(
            kind="docx",
            file_bytes=data,
            text=await _parse_local_docx_text(data, file_name),
        )

    if _looks_like_zip_container(sample):
        # Extension gave no hint; sniff the archive contents. EPUB is
        # checked before DOCX because its signature is more specific.
        data = await _read_local_file_bytes(path)
        if _is_epub_bytes(data):
            return ParsedDocument(
                kind="epub",
                file_bytes=data,
                text=await _parse_local_epub_text(data, file_name),
            )
        if _is_docx_bytes(data):
            return ParsedDocument(
                kind="docx",
                file_bytes=data,
                text=await _parse_local_docx_text(data, file_name),
            )

    return None
408445

409446

astrbot/core/knowledge_base/kb_helper.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@ async def _repair_and_translate_chunk_with_retry(
109109
return [chunk]
110110

111111

112+
def _compact_chunks(chunks: list[str]) -> list[str]:
113+
return [chunk.strip() for chunk in chunks if chunk and chunk.strip()]
114+
115+
112116
class KBHelper:
113117
vec_db: BaseVecDB
114118
kb: KnowledgeBase
@@ -249,7 +253,7 @@ async def upload_document(
249253

250254
if pre_chunked_text is not None:
251255
# 如果提供了预分块文本,直接使用
252-
chunks_text = pre_chunked_text
256+
chunks_text = _compact_chunks(pre_chunked_text)
253257
file_size = sum(len(chunk) for chunk in chunks_text)
254258
logger.info(f"使用预分块文本进行上传,共 {len(chunks_text)} 个块。")
255259
else:
@@ -316,6 +320,7 @@ async def upload_document(
316320
chunk_size=chunk_size,
317321
chunk_overlap=chunk_overlap,
318322
)
323+
chunks_text = _compact_chunks(chunks_text)
319324
except KnowledgeBaseUploadError:
320325
raise
321326
except Exception as exc:
@@ -728,6 +733,8 @@ async def _clean_and_rechunk_content(
728733
elif isinstance(result, list):
729734
final_chunks.extend(result)
730735

736+
final_chunks = _compact_chunks(final_chunks)
737+
731738
logger.info(
732739
f"文本修复完成: {len(initial_chunks)} 个原始块 -> {len(final_chunks)} 个最终块。"
733740
)

astrbot/core/knowledge_base/parsers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
"""文档解析器模块"""
22

33
from .base import BaseParser, MediaItem, ParseResult
4+
from .epub_parser import EpubParser
45
from .pdf_parser import PDFParser
56
from .text_parser import TextParser
67

78
__all__ = [
89
"BaseParser",
10+
"EpubParser",
911
"MediaItem",
1012
"PDFParser",
1113
"ParseResult",
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
"""EPUB document parser."""
2+
3+
import html
4+
import re
5+
6+
from astrbot.core.knowledge_base.parsers.base import BaseParser, ParseResult
7+
8+
_KEYS = (
9+
"Title|Author|Creator|Language|Publisher|Date|Modified|Identifier|ISBN|Description|"
10+
"Subject|Rights|Source|Series|标题|书名|作者|语言|出版社|日期|出版日期|标识符|简介|描述|"
11+
"主题|版权|来源|系列|タイトル|書名|著者|言語|出版社|日付|識別子|説明|件名|権利|ソース|シリーズ"
12+
)
13+
_META_RE = re.compile(rf"^\s*(?:[-*]\s*)?\*\*(?:{_KEYS})\s*[::]\*\*\s+\S")
14+
_TOC_HEAD_RE = re.compile(
15+
r"^\s{0,3}(?:#{1,6}\s*)?(?:table of contents|contents|toc|目录|目次|もくじ)\s*$",
16+
re.I,
17+
)
18+
_LINK_RE = re.compile(r"(?<!!)\[([^\]]+)\]\(([^)]+)\)")
19+
_IMG_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
20+
_EMPTY_IMG_LINK_RE = re.compile(
21+
r"\[\s*\]\([^)]+\.(?:png|jpe?g|gif|webp|svg)(?:#[^)]+)?\)", re.I
22+
)
23+
_FOOTNOTE_LABEL_RE = re.compile(
24+
r"^(?:\d{1,3}|[ivxlcdm]{1,8}|[*†‡§¶]|↩|↑|back|return|返回|回到正文)$", re.I
25+
)
26+
_FOOTNOTE_HREF_RE = re.compile(
27+
r"(?:^#|[#/_-](?:fn|footnote|note|noteref|backlink|return|filepos)\b)", re.I
28+
)
29+
_DOTTED_TOC_RE = re.compile(r"^\s*.+?\.{2,}\s*(?:\d+|[ivxlcdm]+)\s*$", re.I)
30+
_SEP_RE = re.compile(r"^\s*(?:[-=*_]){3,}\s*$")
31+
_NOISE_RE = re.compile(
32+
r"^\s*(?:\[\s*)?(?:\d{1,3}|[ivxlcdm]{1,8}|[*†‡§¶]|↩|↑)(?:\s*\])?\s*$", re.I
33+
)
34+
_GENERIC_ALT_RE = re.compile(
35+
r"^(?:image|img|picture|photo|illustration|figure|fig|cover|插图|图片|图像|封面)\s*[\d._-]*$",
36+
re.I,
37+
)
38+
_FILENAME_ALT_RE = re.compile(r"^[\w.\- ]+\.(?:png|jpe?g|gif|webp|svg)$", re.I)
39+
40+
41+
def _n(s: str) -> str:
42+
return (
43+
html.unescape(s)
44+
.replace("\r\n", "\n")
45+
.replace("\r", "\n")
46+
.replace("\ufeff", "")
47+
.replace("\u00a0", " ")
48+
.replace("\u200b", "")
49+
)
50+
51+
52+
def _is_internal(href: str) -> bool:
53+
href = html.unescape(href).strip().lower()
54+
return (
55+
href.startswith("#")
56+
or href.endswith(".html")
57+
or href.endswith(".xhtml")
58+
or ".html#" in href
59+
or ".xhtml#" in href
60+
)
61+
62+
63+
def _is_toc_line(s: str) -> bool:
64+
s = s.strip()
65+
if not s:
66+
return False
67+
s = re.sub(r"^\s*(?:[-*+]|\d+\.)\s+", "", s)
68+
m = re.fullmatch(r"\[([^\]]+)\]\(([^)]+)\)", s)
69+
return bool((m and _is_internal(m.group(2))) or _DOTTED_TOC_RE.match(s))
70+
71+
72+
def _strip_head(text: str) -> str:
    """Strip book front matter (metadata list and table of contents) from
    the start of *text*, returning the remaining body.

    The scan is conservative: the metadata block is only removed when at
    least two consecutive metadata lines are found, and the ToC is only
    removed when it is long enough to be unambiguous (see below).
    """
    lines = _n(text).split("\n")
    i = 0
    # Skip leading blank lines.
    while i < len(lines) and not lines[i].strip():
        i += 1
    start = i
    # Consume a run of "**Key:** value" metadata lines.
    while i < len(lines) and _META_RE.match(lines[i].strip()):
        i += 1
    if i - start >= 2:
        # Confident it was a metadata block: also swallow trailing blanks.
        while i < len(lines) and not lines[i].strip():
            i += 1
    else:
        # A single match is too weak a signal — rewind, keep the line.
        i = start
    # toc0 marks where a possible ToC begins; had_head records whether an
    # explicit "Contents"-style heading introduced it.
    toc0, had_head = i, False
    if i < len(lines) and _TOC_HEAD_RE.match(lines[i].strip()):
        had_head = True
        i += 1
        while i < len(lines) and not lines[i].strip():
            i += 1
    toc = 0
    # Scan at most 120 lines past toc0 for ToC-shaped entries.
    while i < len(lines) and i - toc0 < 120:
        s = lines[i].strip()
        if not s:
            # A single blank inside the ToC is tolerated when the next
            # line continues the ToC; otherwise the ToC has ended.
            if toc and i + 1 < len(lines) and _is_toc_line(lines[i + 1]):
                i += 1
                continue
            break
        if not _is_toc_line(s):
            break
        toc += 1
        i += 1
    # Strip the ToC only when confident: >=2 entries under an explicit
    # heading, or >=3 entries without one.
    if toc >= 2 and (had_head or toc >= 3):
        while i < len(lines) and not lines[i].strip():
            i += 1
        return "\n".join(lines[i:]).strip()
    # Not confident — keep everything from toc0 (metadata, if any, is
    # still removed).
    return "\n".join(lines[toc0:]).strip()
108+
109+
110+
def _strip_links(text: str) -> str:
    """Flatten internal markdown links to their label text.

    Footnote markers and backlinks are deleted entirely; links to external
    URLs are kept untouched.
    """

    def _replace(match: re.Match[str]) -> str:
        label = html.unescape(match.group(1)).strip()
        href = html.unescape(match.group(2)).strip().lower()
        if not _is_internal(href):
            # External target — preserve the full markdown link.
            return match.group(0)
        looks_like_footnote = bool(_FOOTNOTE_HREF_RE.search(href)) or (
            href.startswith("#") and bool(_FOOTNOTE_LABEL_RE.fullmatch(label))
        )
        return "" if looks_like_footnote else label

    return _LINK_RE.sub(_replace, _n(text))
123+
124+
125+
def _img_alt(m: re.Match[str]) -> str:
126+
alt = re.sub(r"\s+", " ", html.unescape(m.group(1)).strip())
127+
if not alt or _GENERIC_ALT_RE.fullmatch(alt) or _FILENAME_ALT_RE.fullmatch(alt):
128+
return ""
129+
return alt
130+
131+
132+
def _sanitize(text: str) -> str:
    """Final line-level cleanup of the converted markdown.

    Per line: replaces images with their (useful) alt text, deletes empty
    image links, drops separator and noise-only lines, collapses runs of
    blank lines to one, and removes an immediately repeated heading/line.
    """
    # prev_blank: whether the last emitted line was blank (starts True so
    # leading blanks are swallowed). prev: normalized form of the last
    # emitted non-blank line, used for duplicate suppression.
    out, prev_blank, prev = [], True, ""
    for raw in _n(text).split("\n"):
        line = _IMG_RE.sub(_img_alt, raw)
        line = _EMPTY_IMG_LINK_RE.sub("", line).rstrip()
        s = line.strip()
        if not s:
            # Collapse consecutive blank lines into a single one.
            if not prev_blank:
                out.append("")
            prev_blank = True
            continue
        if _SEP_RE.match(s) or _NOISE_RE.match(s):
            continue
        # Normalize for duplicate detection: strip markdown heading
        # markers and emphasis, then casefold.
        norm = re.sub(r"^\s{0,3}#{1,6}\s*", "", s).strip("*_ ").casefold()
        # Skip a short line that repeats the previous one verbatim (e.g. a
        # chapter title emitted both as heading and as plain text).
        if norm and norm == prev and len(norm) <= 120:
            continue
        out.append(line)
        prev_blank = False
        prev = norm
    return "\n".join(out).strip()
152+
153+
154+
class EpubParser(BaseParser):
    """Parse EPUB files by converting them with MarkItDown and cleaning
    the resulting markdown (front matter, internal links, noise lines)."""

    async def parse(self, file_content: bytes, file_name: str) -> ParseResult:
        """Convert *file_content* to cleaned plain text.

        Returns a ParseResult whose text has front matter stripped,
        internal links flattened, and per-line noise removed; media items
        are passed through from the underlying converter.
        """
        # Lazy import keeps MarkItDown out of module import time.
        from .markitdown_parser import MarkitdownParser

        converted = await MarkitdownParser().parse(file_content, file_name)
        cleaned = _sanitize(_strip_links(_strip_head(converted.text)))
        return ParseResult(text=cleaned, media=converted.media)

astrbot/core/knowledge_base/parsers/util.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ async def select_parser(ext: str) -> BaseParser:
66
from .markitdown_parser import MarkitdownParser
77

88
return MarkitdownParser()
9+
if ext == ".epub":
10+
from .epub_parser import EpubParser
11+
12+
return EpubParser()
913
if ext == ".pdf":
1014
from .pdf_parser import PDFParser
1115

dashboard/src/i18n/locales/en-US/features/alkaid/knowledge-base.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@
7070
},
7171
"upload": {
7272
"title": "Upload Files to Knowledge Base",
73-
"subtitle": "Supports txt, pdf, word, excel and other formats",
73+
"subtitle": "Supports txt, pdf, epub, word, excel and other formats",
7474
"dropzone": "Drag and drop files here or click to upload",
7575
"chunkSettings": {
7676
"title": "Chunk Settings",
@@ -152,4 +152,4 @@
152152
"preRequisite": "Hint: Please go to the plugin market to install astrbot_plugin_url_2_knowledge_base and follow the instructions in the plugin documentation to complete the playwright installation before using this feature.",
153153
"allChunksUploaded": "All chunks uploaded successfully"
154154
}
155-
}
155+
}

dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
"title": "Upload Document",
4747
"selectFile": "Select File",
4848
"dropzone": "Drop files here or click to select",
49-
"supportedFormats": "Supported formats: ",
49+
"supportedFormats": "Supported formats: .txt, .md, .pdf, .docx, .epub, .xls, .xlsx",
5050
"maxSize": "Max file size: 128MB",
5151
"chunkSettings": "Chunk Settings",
5252
"batchSettings": "Batch Settings",

dashboard/src/i18n/locales/ru-RU/features/knowledge-base/detail.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
"title": "Добавление контента",
4747
"selectFile": "Файл",
4848
"dropzone": "Нажмите или перетащите файл сюда",
49-
"supportedFormats": "Форматы: ",
49+
"supportedFormats": "Форматы: .txt, .md, .pdf, .docx, .epub, .xls, .xlsx",
5050
"maxSize": "Максимум: 128MB",
5151
"chunkSettings": "Фрагментация",
5252
"batchSettings": "Пакетная обработка",
@@ -115,4 +115,4 @@
115115
"saveFailed": "Ошибка сохранения",
116116
"tips": "Внимание! Изменение этих параметров повлияет на будущую выдачу базы знаний."
117117
}
118-
}
118+
}

dashboard/src/i18n/locales/zh-CN/features/alkaid/knowledge-base.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@
7070
},
7171
"upload": {
7272
"title": "上传文件到知识库",
73-
"subtitle": "支持 txt、pdf、word、excel 等多种格式",
73+
"subtitle": "支持 txt、pdf、epub、word、excel 等多种格式",
7474
"dropzone": "拖放文件到这里或点击上传",
7575
"chunkSettings": {
7676
"title": "分片设置",
@@ -152,4 +152,4 @@
152152
"preRequisite": "提示:请先前往插件市场安装 astrbot_plugin_url_2_knowledge_base 并根据插件文档内的指示完成 playwright 安装后才可使用本功能",
153153
"allChunksUploaded": "所有分片上传成功"
154154
}
155-
}
155+
}

dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
"title": "上传文档",
4747
"selectFile": "选择文件",
4848
"dropzone": "拖放文件到这里或点击选择",
49-
"supportedFormats": "支持的格式: ",
49+
"supportedFormats": "支持的格式: .txt, .md, .pdf, .docx, .epub, .xls, .xlsx",
5050
"maxSize": "最大文件大小: 128MB",
5151
"chunkSettings": "分块设置",
5252
"batchSettings": "批处理设置",

0 commit comments

Comments
 (0)