Skip to content

Commit f202e23

Browse files
authored
fix: Intelligent segmentation of the knowledge base does not recognize blank-line and carriage-return separators #4791 (#4793)
1 parent 60576cb commit f202e23

File tree

1 file changed

+3
-4
lines changed

1 file changed

+3
-4
lines changed

apps/common/utils/split_model.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -167,11 +167,10 @@ def parse_level(text, pattern: str):
167167
level_content_list = list(map(to_tree_obj, [r[0:255] for r in re_findall(pattern, text) if r is not None]))
168168
# 过滤掉空标题或只包含#和空白字符的标题
169169
filtered_list = [item for item in level_content_list
170-
if item['content'].strip() and item['content'].replace('#', '').strip()]
170+
if item['content'].strip(' ') and item['content'].replace('#', '').strip(' ')]
171171
return list(map(filter_special_symbol, filtered_list))
172172

173173

174-
175174
def re_findall(pattern, text):
176175
# 检查 pattern 是否为空或无效
177176
if pattern is None:
@@ -305,8 +304,8 @@ def smart_split_paragraph(content: str, limit: int):
305304
# 优先级:句号 > 感叹号/问号 > 回车
306305
split_chars = [
307306
('。', 0), ('.', 0), # 中英文句号
308-
('!', 0), ('!', 0), # 中英文感叹号
309-
('?', 0), ('?', 0), # 中英文问号
307+
('!', 0), ('!', 0), # 中英文感叹号
308+
('?', 0), ('?', 0), # 中英文问号
310309
]
311310

312311
# 从后往前找分割点

0 commit comments

Comments
 (0)