Skip to content

Commit cc72c01

Browse files
authored
fix: improve knowledge base upload error messages (AstrBotDevs#7534)
* fix: improve knowledge base upload error messages
* fix: deduplicate knowledge base upload logs
* fix: handle type errors in kb embedding validation
1 parent 11dedf3 commit cc72c01

6 files changed

Lines changed: 297 additions & 31 deletions

File tree

astrbot/core/db/vec_db/faiss_impl/vec_db.py

Lines changed: 85 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55

66
from astrbot import logger
7+
from astrbot.core.exceptions import KnowledgeBaseUploadError
78
from astrbot.core.provider.provider import EmbeddingProvider, RerankProvider
89

910
from ..base import BaseVecDB, Result
@@ -80,6 +81,32 @@ async def insert_batch(
8081
)
8182
return []
8283

84+
content_count = len(contents)
85+
if len(metadatas) != content_count:
86+
raise KnowledgeBaseUploadError(
87+
stage="storage",
88+
user_message=(
89+
f"存储失败:文本分块数量与元数据数量不一致(期望 {content_count},"
90+
f"实际 {len(metadatas)})。"
91+
),
92+
details={
93+
"expected_contents": content_count,
94+
"actual_metadatas": len(metadatas),
95+
},
96+
)
97+
if len(ids) != content_count:
98+
raise KnowledgeBaseUploadError(
99+
stage="storage",
100+
user_message=(
101+
f"存储失败:文本分块数量与文档 ID 数量不一致(期望 {content_count},"
102+
f"实际 {len(ids)})。"
103+
),
104+
details={
105+
"expected_contents": content_count,
106+
"actual_ids": len(ids),
107+
},
108+
)
109+
83110
start = time.time()
84111
logger.debug(f"Generating embeddings for {len(contents)} contents...")
85112
vectors = await self.embedding_provider.get_embeddings_batch(
@@ -93,16 +120,73 @@ async def insert_batch(
93120
logger.debug(
94121
f"Generated embeddings for {len(contents)} contents in {end - start:.2f} seconds.",
95122
)
123+
if len(vectors) != content_count:
124+
raise KnowledgeBaseUploadError(
125+
stage="embedding",
126+
user_message=(
127+
"向量化失败:嵌入模型返回的向量数量与文本分块数量不一致"
128+
f"(期望 {content_count},实际 {len(vectors)})。"
129+
"这通常说明当前 Embedding 接口未完整返回批量结果,"
130+
"或该服务不兼容当前批量请求格式。"
131+
),
132+
details={
133+
"expected_contents": content_count,
134+
"actual_vectors": len(vectors),
135+
},
136+
)
96137

97138
# 使用 DocumentStorage 的批量插入方法
98139
int_ids = await self.document_storage.insert_documents_batch(
99140
ids,
100141
contents,
101142
metadatas,
102143
)
144+
if len(int_ids) != content_count:
145+
raise KnowledgeBaseUploadError(
146+
stage="storage",
147+
user_message=(
148+
f"存储失败:写入文档索引后返回的内部 ID 数量与文本分块数量不一致"
149+
f"(期望 {content_count},实际 {len(int_ids)})。"
150+
),
151+
details={
152+
"expected_contents": content_count,
153+
"actual_int_ids": len(int_ids),
154+
},
155+
)
103156

104157
# 批量插入向量到 FAISS
105-
vectors_array = np.array(vectors).astype("float32")
158+
try:
159+
vectors_array = np.asarray(vectors, dtype=np.float32)
160+
except (TypeError, ValueError) as exc:
161+
raise KnowledgeBaseUploadError(
162+
stage="embedding",
163+
user_message=(
164+
"向量化失败:嵌入模型返回的向量格式不正确,"
165+
"无法转换为统一的浮点向量矩阵。"
166+
),
167+
details={"vector_count": len(vectors)},
168+
) from exc
169+
if vectors_array.ndim != 2:
170+
raise KnowledgeBaseUploadError(
171+
stage="embedding",
172+
user_message=(
173+
"向量化失败:嵌入模型返回的向量格式不正确,无法构造成二维向量矩阵。"
174+
),
175+
details={"actual_ndim": int(vectors_array.ndim)},
176+
)
177+
if vectors_array.shape[1] != self.embedding_storage.dimension:
178+
raise KnowledgeBaseUploadError(
179+
stage="embedding",
180+
user_message=(
181+
"向量化失败:返回向量维度与当前知识库索引维度不一致"
182+
f"(期望 {self.embedding_storage.dimension},"
183+
f"实际 {vectors_array.shape[1]})。"
184+
),
185+
details={
186+
"expected_dimension": self.embedding_storage.dimension,
187+
"actual_dimension": int(vectors_array.shape[1]),
188+
},
189+
)
106190
await self.embedding_storage.insert_batch(vectors_array, int_ids)
107191
return int_ids
108192

astrbot/core/exceptions.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,3 +11,22 @@ class ProviderNotFoundError(AstrBotError):
1111

1212
class EmptyModelOutputError(AstrBotError):
1313
"""Raised when the model response contains no usable assistant output."""
14+
15+
16+
class KnowledgeBaseUploadError(AstrBotError):
17+
"""Raised when knowledge base upload fails with a user-facing message."""
18+
19+
def __init__(
20+
self,
21+
*,
22+
stage: str,
23+
user_message: str,
24+
details: dict | None = None,
25+
) -> None:
26+
super().__init__(user_message)
27+
self.stage = stage
28+
self.user_message = user_message
29+
self.details = details or {}
30+
31+
def __str__(self) -> str:
32+
return self.user_message

astrbot/core/knowledge_base/kb_helper.py

Lines changed: 103 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010

1111
from astrbot.core import logger
1212
from astrbot.core.db.vec_db.base import BaseVecDB
13+
from astrbot.core.exceptions import KnowledgeBaseUploadError
1314
from astrbot.core.provider.manager import ProviderManager
1415
from astrbot.core.provider.provider import (
1516
EmbeddingProvider,
@@ -264,10 +265,31 @@ async def upload_document(
264265
if progress_callback:
265266
await progress_callback("parsing", 0, 100)
266267

267-
parser = await select_parser(f".{file_type}")
268-
parse_result = await parser.parse(file_content, file_name)
268+
try:
269+
parser = await select_parser(f".{file_type}")
270+
parse_result = await parser.parse(file_content, file_name)
271+
except KnowledgeBaseUploadError:
272+
raise
273+
except Exception as exc:
274+
raise KnowledgeBaseUploadError(
275+
stage="parsing",
276+
user_message=(
277+
"文档解析失败:无法读取或解析上传文件。"
278+
"请确认文件格式受支持且文件内容未损坏。"
279+
),
280+
details={"file_name": file_name},
281+
) from exc
269282
text_content = parse_result.text
270283
media_items = parse_result.media
284+
if not text_content or not text_content.strip():
285+
raise KnowledgeBaseUploadError(
286+
stage="parsing",
287+
user_message=(
288+
"文档解析失败:未能从文件中提取可索引文本。"
289+
"该文件可能是扫描件、纯图片 PDF,或格式暂不受支持。"
290+
),
291+
details={"file_name": file_name},
292+
)
271293

272294
if progress_callback:
273295
await progress_callback("parsing", 100, 100)
@@ -288,11 +310,31 @@ async def upload_document(
288310
if progress_callback:
289311
await progress_callback("chunking", 0, 100)
290312

291-
chunks_text = await self.chunker.chunk(
292-
text_content,
293-
chunk_size=chunk_size,
294-
chunk_overlap=chunk_overlap,
313+
try:
314+
chunks_text = await self.chunker.chunk(
315+
text_content,
316+
chunk_size=chunk_size,
317+
chunk_overlap=chunk_overlap,
318+
)
319+
except KnowledgeBaseUploadError:
320+
raise
321+
except Exception as exc:
322+
raise KnowledgeBaseUploadError(
323+
stage="chunking",
324+
user_message=(
325+
"分块失败:文档内容在切分文本块时发生错误。"
326+
"请稍后重试,或调整分块参数后再次上传。"
327+
),
328+
details={"file_name": file_name},
329+
) from exc
330+
331+
if not chunks_text or not any(chunk.strip() for chunk in chunks_text):
332+
raise KnowledgeBaseUploadError(
333+
stage="chunking",
334+
user_message=("分块失败:文档内容为空,未生成任何可索引文本块。"),
335+
details={"file_name": file_name},
295336
)
337+
296338
contents = []
297339
metadatas = []
298340
for idx, chunk_text in enumerate(chunks_text):
@@ -313,14 +355,23 @@ async def embedding_progress_callback(current, total) -> None:
313355
if progress_callback:
314356
await progress_callback("embedding", current, total)
315357

316-
await self.vec_db.insert_batch(
317-
contents=contents,
318-
metadatas=metadatas,
319-
batch_size=batch_size,
320-
tasks_limit=tasks_limit,
321-
max_retries=max_retries,
322-
progress_callback=embedding_progress_callback,
323-
)
358+
try:
359+
await self.vec_db.insert_batch(
360+
contents=contents,
361+
metadatas=metadatas,
362+
batch_size=batch_size,
363+
tasks_limit=tasks_limit,
364+
max_retries=max_retries,
365+
progress_callback=embedding_progress_callback,
366+
)
367+
except KnowledgeBaseUploadError:
368+
raise
369+
except Exception as exc:
370+
raise KnowledgeBaseUploadError(
371+
stage="storage",
372+
user_message=("存储失败:文本块已生成,但写入知识库索引时出错。"),
373+
details={"file_name": file_name},
374+
) from exc
324375

325376
# 保存文档的元数据
326377
doc = KBDocument(
@@ -334,22 +385,47 @@ async def embedding_progress_callback(current, total) -> None:
334385
chunk_count=len(chunks_text),
335386
media_count=0,
336387
)
337-
async with self.kb_db.get_db() as session:
338-
async with session.begin():
339-
session.add(doc)
340-
for media in saved_media:
341-
session.add(media)
342-
await session.commit()
343-
344-
await session.refresh(doc)
388+
try:
389+
async with self.kb_db.get_db() as session:
390+
async with session.begin():
391+
session.add(doc)
392+
for media in saved_media:
393+
session.add(media)
394+
await session.commit()
395+
396+
await session.refresh(doc)
397+
except KnowledgeBaseUploadError:
398+
raise
399+
except Exception as exc:
400+
raise KnowledgeBaseUploadError(
401+
stage="metadata",
402+
user_message=(
403+
"元数据保存失败:文本块已写入知识库,但文档记录保存失败。"
404+
),
405+
details={"file_name": file_name, "doc_id": doc_id},
406+
) from exc
345407

346408
vec_db: FaissVecDB = self.vec_db # type: ignore
347-
await self.kb_db.update_kb_stats(kb_id=self.kb.kb_id, vec_db=vec_db)
348-
await self.refresh_kb()
349-
await self.refresh_document(doc_id)
409+
try:
410+
await self.kb_db.update_kb_stats(kb_id=self.kb.kb_id, vec_db=vec_db)
411+
await self.refresh_kb()
412+
await self.refresh_document(doc_id)
413+
except KnowledgeBaseUploadError:
414+
raise
415+
except Exception as exc:
416+
raise KnowledgeBaseUploadError(
417+
stage="metadata",
418+
user_message=(
419+
"元数据更新失败:文档已上传,但知识库统计信息刷新失败。"
420+
),
421+
details={"file_name": file_name, "doc_id": doc_id},
422+
) from exc
350423
return doc
351424
except Exception as e:
352-
logger.error(f"上传文档失败: {e}")
425+
if isinstance(e, KnowledgeBaseUploadError):
426+
logger.warning(f"上传文档失败: {e}")
427+
else:
428+
logger.error(f"上传文档失败: {e}", exc_info=True)
353429
# if file_path.exists():
354430
# file_path.unlink()
355431

@@ -360,7 +436,7 @@ async def embedding_progress_callback(current, total) -> None:
360436
except Exception as me:
361437
logger.warning(f"清理多媒体文件失败 {media_path}: {me}")
362438

363-
raise e
439+
raise
364440

365441
async def list_documents(
366442
self,

astrbot/dashboard/routes/knowledge_base.py

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -128,6 +128,13 @@ async def _callback(stage: str, current: int, total: int) -> None:
128128

129129
return _callback
130130

131+
@staticmethod
132+
def _format_failed_doc_error(file_name: str, error: Exception) -> str:
133+
message = str(error).strip() or "上传失败:发生未知错误。"
134+
if message.startswith(file_name):
135+
return message
136+
return f"{file_name}: {message}"
137+
131138
async def _background_upload_task(
132139
self,
133140
task_id: str,
@@ -189,7 +196,12 @@ async def _background_upload_task(
189196
except Exception as e:
190197
logger.error(f"上传文档 {file_info['file_name']} 失败: {e}")
191198
failed_docs.append(
192-
{"file_name": file_info["file_name"], "error": str(e)},
199+
{
200+
"file_name": file_info["file_name"],
201+
"error": self._format_failed_doc_error(
202+
file_info["file_name"], e
203+
),
204+
},
193205
)
194206

195207
# 更新任务完成状态
@@ -276,7 +288,10 @@ async def _background_import_task(
276288
except Exception as e:
277289
logger.error(f"导入文档 {file_name} 失败: {e}")
278290
failed_docs.append(
279-
{"file_name": file_name, "error": str(e)},
291+
{
292+
"file_name": file_name,
293+
"error": self._format_failed_doc_error(file_name, e),
294+
},
280295
)
281296

282297
# 更新任务完成状态

0 commit comments

Comments
 (0)