Skip to content

Commit 7e0e08b

Browse files
committed
fix: add normalize_for_embedding function to clean text input
--bug=1065973@tapd-62980211 --user=刘瑞斌 【知识库】关联问题带特殊符号时,命中测试结果显示错误 https://www.tapd.cn/62980211/s/1841893
1 parent a72697a commit 7e0e08b

File tree

2 files changed

+26
-2
lines changed

2 files changed

+26
-2
lines changed

apps/knowledge/vector/base_vector.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
@date:2023/10/18 19:16
77
@desc:
88
"""
9+
import re
910
import threading
1011
from abc import ABC, abstractmethod
1112
from functools import reduce
@@ -33,6 +34,26 @@ def chunk_data_list(data_list: List[Dict]):
3334
return reduce(lambda x, y: [*x, *y], result, [])
3435

3536

37+
# Pre-compiled regexes — compiled once at import time so per-call cost is low.
# Strips emoji and related invisible characters that make visually-identical
# questions produce different embeddings.
RE_EMOJI = re.compile(
    r"[\U0001F1E6-\U0001F1FF]"   # regional indicators (flag emoji) — not covered by the range below
    r"|[\U0001F300-\U0001FAFF]"  # emoji & pictographs (includes skin-tone modifiers U+1F3FB–U+1F3FF)
    r"|[\u2600-\u27BF]"          # dingbats / misc. symbols (e.g. the anchor ⚓)
    r"|[\uFE0E\uFE0F]"           # variation selectors (text/emoji presentation)
    r"|\u200D",                  # zero-width joiner used in emoji sequences (e.g. family emoji)
    flags=re.UNICODE,
)

# Collapse any run of whitespace (spaces, tabs, newlines) into a single space.
RE_WHITESPACE = re.compile(r"\s+")


def normalize_for_embedding(text: str) -> str:
    """Normalize *text* before computing its embedding.

    Removes emoji (including flags and zero-width-joiner sequences) and
    variation selectors, collapses runs of whitespace to a single space,
    and trims leading/trailing whitespace, so visually-equivalent inputs
    map to the same embedding.

    :param text: raw input text; falsy values (``None``, ``""``) are accepted
    :return: the cleaned text, or ``""`` for falsy input
    """
    if not text:
        return ""

    text = RE_EMOJI.sub("", text)
    text = RE_WHITESPACE.sub(" ", text)
    return text.strip()
55+
56+
3657
class BaseVectorStore(ABC):
3758
vector_exists = False
3859

@@ -121,6 +142,7 @@ def search(self, query_text, knowledge_id_list: list[str], exclude_document_id_l
121142
embedding: Embeddings):
122143
if knowledge_id_list is None or len(knowledge_id_list) == 0:
123144
return []
145+
query_text = normalize_for_embedding(query_text)
124146
embedding_query = embedding.embed_query(query_text)
125147
result = self.query(embedding_query, knowledge_id_list, exclude_document_id_list, exclude_paragraph_list,
126148
is_active, 1, 3, 0.65)

apps/knowledge/vector/pg_vector.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from common.utils.common import get_file_content
2222
from common.utils.ts_vecto_util import to_ts_vector, to_query
2323
from knowledge.models import Embedding, SearchMode, SourceType
24-
from knowledge.vector.base_vector import BaseVectorStore
24+
from knowledge.vector.base_vector import BaseVectorStore, normalize_for_embedding
2525
from maxkb.conf import PROJECT_DIR
2626

2727

@@ -46,6 +46,7 @@ def _save(self, text, source_type: SourceType, knowledge_id: str, document_id: s
4646
source_id: str,
4747
is_active: bool,
4848
embedding: Embeddings):
49+
text = normalize_for_embedding(text)
4950
text_embedding = [float(x) for x in embedding.embed_query(text)]
5051
embedding = Embedding(
5152
id=uuid.uuid7(),
@@ -62,7 +63,7 @@ def _save(self, text, source_type: SourceType, knowledge_id: str, document_id: s
6263
return True
6364

6465
def _batch_save(self, text_list: List[Dict], embedding: Embeddings, is_the_task_interrupted):
65-
texts = [row.get('text') for row in text_list]
66+
texts = [normalize_for_embedding(row.get('text')) for row in text_list]
6667
embeddings = embedding.embed_documents(texts)
6768
embedding_list = [
6869
Embedding(
@@ -87,6 +88,7 @@ def hit_test(self, query_text, knowledge_id_list: list[str], exclude_document_id
8788
if knowledge_id_list is None or len(knowledge_id_list) == 0:
8889
return []
8990
exclude_dict = {}
91+
query_text = normalize_for_embedding(query_text)
9092
embedding_query = embedding.embed_query(query_text)
9193
query_set = QuerySet(Embedding).filter(knowledge_id__in=knowledge_id_list, is_active=True)
9294
if exclude_document_id_list is not None and len(exclude_document_id_list) > 0:

0 commit comments

Comments
 (0)