Skip to content

Commit c54e61d

Browse files
authored
🐛Bugfix: knowledge base summary render error to avoid <p> </p>
🐛Bugfix: knowledge base summary render error to avoid <p> </p>
2 parents 6444d6c + c9dda63 commit c54e61d

15 files changed

Lines changed: 3 additions & 911 deletions

backend/prompts/analyze_file_en.yaml

Lines changed: 0 additions & 28 deletions
This file was deleted.

backend/prompts/analyze_file_zh.yaml

Lines changed: 0 additions & 28 deletions
This file was deleted.

backend/prompts/cluster_summary_agent_en.yaml

Lines changed: 0 additions & 24 deletions
This file was deleted.

backend/prompts/cluster_summary_agent_zh.yaml

Lines changed: 0 additions & 24 deletions
This file was deleted.

backend/prompts/knowledge_summary_agent_en.yaml

Lines changed: 0 additions & 17 deletions
This file was deleted.

backend/prompts/knowledge_summary_agent_zh.yaml

Lines changed: 0 additions & 17 deletions
This file was deleted.

backend/services/vectordatabase_service.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1228,7 +1228,6 @@ async def summary_index_name(self,
12281228
summarize_clusters_map_reduce,
12291229
merge_cluster_summaries
12301230
)
1231-
12321231
# Use new Map-Reduce approach
12331232
# Sample reasonable number of documents
12341233
sample_count = min(batch_size // 5, 200)

backend/utils/attachment_utils.py

Lines changed: 0 additions & 82 deletions
This file was deleted.

backend/utils/document_vector_utils.py

Lines changed: 1 addition & 114 deletions
Original file line number | Diff line number | Diff line change
@@ -26,8 +26,7 @@
2626
from utils.llm_utils import call_llm_for_system_prompt
2727
from utils.prompt_template_utils import (
2828
get_document_summary_prompt_template,
29-
get_cluster_summary_reduce_prompt_template,
30-
get_cluster_summary_agent_prompt_template
29+
get_cluster_summary_reduce_prompt_template
3130
)
3231

3332
logger = logging.getLogger("document_vector_utils")
@@ -492,54 +491,6 @@ def process_documents_for_clustering(index_name: str, vdb_core, sample_doc_count
492491
raise Exception(f"Failed to process documents: {str(e)}")
493492

494493

495-
def extract_cluster_content(document_samples: Dict[str, Dict], cluster_doc_ids: List[str], max_chunks_per_doc: int = 3) -> str:
    """
    Extract representative content from a cluster for summarization

    For every document ID that is present in ``document_samples``, at most
    ``max_chunks_per_doc`` chunks are selected. When a document has more chunks
    than the limit, the chunks are sampled at evenly spaced positions that
    always include the first and the last chunk (for the default limit of 3
    this is the first, middle, and last chunk). Each selected chunk is
    truncated to 500 characters followed by "...".

    Args:
        document_samples: Dictionary mapping doc_id to document info; the
            'chunks' and 'filename' keys of each entry are read
        cluster_doc_ids: List of document IDs in the cluster; IDs missing from
            document_samples are skipped
        max_chunks_per_doc: Maximum number of chunks to include per document;
            a value of 0 or less includes no chunks (only the document header)

    Returns:
        Formatted string containing cluster content, one section per document,
        sections joined by a newline
    """
    max_chunk_chars = 500
    cluster_content_parts = []

    for doc_id in cluster_doc_ids:
        if doc_id not in document_samples:
            continue

        doc_info = document_samples[doc_id]
        # Tolerate an explicit None stored under 'chunks'
        chunks = doc_info.get('chunks') or []
        filename = doc_info.get('filename', 'unknown')

        total = len(chunks)
        if total <= max_chunks_per_doc:
            representative_chunks = chunks
        elif max_chunks_per_doc <= 0:
            representative_chunks = []
        elif max_chunks_per_doc == 1:
            representative_chunks = chunks[:1]
        else:
            # Evenly spaced positions from the first to the last chunk.
            # total > max_chunks_per_doc guarantees the positions are distinct;
            # the final position (== total) is clamped onto the last chunk.
            steps = max_chunks_per_doc - 1
            last_index = total - 1
            representative_chunks = [
                chunks[min(position * total // steps, last_index)]
                for position in range(max_chunks_per_doc)
            ]

        # Format document content: header line followed by one line per chunk
        doc_lines = [f"\n--- Document: {filename} ---"]
        for chunk in representative_chunks:
            # A missing or None 'content' is rendered as an empty line
            content = chunk.get('content') or ''
            # Limit chunk content length
            if len(content) > max_chunk_chars:
                content = content[:max_chunk_chars] + "..."
            doc_lines.append(content)

        cluster_content_parts.append("\n".join(doc_lines) + "\n")

    return "\n".join(cluster_content_parts)
541-
542-
543494
def summarize_document(document_content: str, filename: str, language: str = LANGUAGE["ZH"], max_words: int = 100, model_id: Optional[int] = None, tenant_id: Optional[str] = None) -> str:
544495
"""
545496
Summarize a single document using LLM (Map stage)
@@ -861,68 +812,4 @@ def summarize_clusters_map_reduce(document_samples: Dict[str, Dict], clusters: D
861812
return cluster_summaries
862813

863814

864-
def summarize_clusters(document_samples: Dict[str, Dict], clusters: Dict[int, List[str]],
                       language: str = LANGUAGE["ZH"], max_words: int = 150) -> Dict[int, str]:
    """
    Produce one summary per cluster using the legacy single-stage path
    (kept for backward compatibility)

    Note: summarize_clusters_map_reduce is the recommended replacement and gives better results.

    Args:
        document_samples: Dictionary mapping doc_id to document info
        clusters: Dictionary mapping cluster_id to list of doc_ids
        language: Language code ('zh' or 'en')
        max_words: Maximum words per cluster summary

    Returns:
        Dictionary mapping cluster_id to summary text
    """
    summaries_by_cluster = {}

    for cid, member_ids in clusters.items():
        logger.info(f"Summarizing cluster {cid} with {len(member_ids)} documents")

        # Gather up to three representative chunks per member document
        cluster_text = extract_cluster_content(document_samples, member_ids, max_chunks_per_doc=3)

        # Delegate to the legacy single-stage summarizer
        summaries_by_cluster[cid] = summarize_cluster_legacy(cluster_text, language, max_words)

    return summaries_by_cluster
893-
894-
895-
def summarize_cluster_legacy(cluster_content: str, language: str = LANGUAGE["ZH"], max_words: int = 150) -> str:
    """
    Legacy single-stage cluster summarization

    The prompt templates for the given language are fetched and the user
    prompt is rendered (undefined template variables raise because of
    StrictUndefined), but the rendered prompt is not sent anywhere by this
    function: the return value is a placeholder string built from the first
    200 characters of the cluster content.

    Args:
        cluster_content: Formatted content from the cluster
        language: Language code ('zh' or 'en')
        max_words: Maximum words in the summary

    Returns:
        Placeholder cluster summary text, or a "Failed to generate summary"
        message if any step raises
    """
    try:
        # Prompt templates come from prompt_template_utils
        template_bundle = get_cluster_summary_agent_prompt_template(language)

        # NOTE(review): system_prompt and the rendered user_prompt are never used
        # below; rendering still acts as validation of the template variables.
        system_prompt = template_bundle.get('system_prompt', '')
        raw_user_prompt = template_bundle.get('user_prompt', '')

        user_template = Template(raw_user_prompt, undefined=StrictUndefined)
        user_prompt = user_template.render(cluster_content=cluster_content, max_words=max_words)

        logger.info(f"Cluster summary prompt generated (language: {language}, max_words: {max_words})")

        # Legacy function: returns a placeholder instead of an LLM-generated summary
        preview = cluster_content[:200]
        return f"[Cluster Summary] (max {max_words} words) - Content preview: {preview}..."

    except Exception as e:
        logger.error(f"Error generating cluster summary: {str(e)}", exc_info=True)
        return f"Failed to generate summary: {str(e)}"
928815

0 commit comments

Comments
 (0)