|
26 | 26 | from utils.llm_utils import call_llm_for_system_prompt |
27 | 27 | from utils.prompt_template_utils import ( |
28 | 28 | get_document_summary_prompt_template, |
29 | | - get_cluster_summary_reduce_prompt_template, |
30 | | - get_cluster_summary_agent_prompt_template |
| 29 | + get_cluster_summary_reduce_prompt_template |
31 | 30 | ) |
32 | 31 |
|
33 | 32 | logger = logging.getLogger("document_vector_utils") |
@@ -492,54 +491,6 @@ def process_documents_for_clustering(index_name: str, vdb_core, sample_doc_count |
492 | 491 | raise Exception(f"Failed to process documents: {str(e)}") |
493 | 492 |
|
494 | 493 |
|
495 | | -def extract_cluster_content(document_samples: Dict[str, Dict], cluster_doc_ids: List[str], max_chunks_per_doc: int = 3) -> str: |
496 | | - """ |
497 | | - Extract representative content from a cluster for summarization |
498 | | - |
499 | | - Args: |
500 | | - document_samples: Dictionary mapping doc_id to document info |
501 | | - cluster_doc_ids: List of document IDs in the cluster |
502 | | - max_chunks_per_doc: Maximum number of chunks to include per document |
503 | | - |
504 | | - Returns: |
505 | | - Formatted string containing cluster content |
506 | | - """ |
507 | | - cluster_content_parts = [] |
508 | | - |
509 | | - for doc_id in cluster_doc_ids: |
510 | | - if doc_id not in document_samples: |
511 | | - continue |
512 | | - |
513 | | - doc_info = document_samples[doc_id] |
514 | | - chunks = doc_info.get('chunks', []) |
515 | | - filename = doc_info.get('filename', 'unknown') |
516 | | - |
517 | | - # Extract representative chunks |
518 | | - representative_chunks = [] |
519 | | - if len(chunks) <= max_chunks_per_doc: |
520 | | - representative_chunks = chunks |
521 | | - else: |
522 | | - # Take first, middle, and last chunks |
523 | | - representative_chunks = ( |
524 | | - chunks[:1] + |
525 | | - chunks[len(chunks)//2:len(chunks)//2+1] + |
526 | | - chunks[-1:] |
527 | | - ) |
528 | | - |
529 | | - # Format document content |
530 | | - doc_content = f"\n--- Document: {filename} ---\n" |
531 | | - for chunk in representative_chunks: |
532 | | - content = chunk.get('content', '') |
533 | | - # Limit chunk content length |
534 | | - if len(content) > 500: |
535 | | - content = content[:500] + "..." |
536 | | - doc_content += f"{content}\n" |
537 | | - |
538 | | - cluster_content_parts.append(doc_content) |
539 | | - |
540 | | - return "\n".join(cluster_content_parts) |
541 | | - |
542 | | - |
543 | 494 | def summarize_document(document_content: str, filename: str, language: str = LANGUAGE["ZH"], max_words: int = 100, model_id: Optional[int] = None, tenant_id: Optional[str] = None) -> str: |
544 | 495 | """ |
545 | 496 | Summarize a single document using LLM (Map stage) |
@@ -861,68 +812,4 @@ def summarize_clusters_map_reduce(document_samples: Dict[str, Dict], clusters: D |
861 | 812 | return cluster_summaries |
862 | 813 |
|
863 | 814 |
|
864 | | -def summarize_clusters(document_samples: Dict[str, Dict], clusters: Dict[int, List[str]], |
865 | | - language: str = LANGUAGE["ZH"], max_words: int = 150) -> Dict[int, str]: |
866 | | - """ |
867 | | - Summarize all clusters (legacy method - kept for backward compatibility) |
868 | | - |
869 | | - Note: This method uses the old approach. Use summarize_clusters_map_reduce for better results. |
870 | | - |
871 | | - Args: |
872 | | - document_samples: Dictionary mapping doc_id to document info |
873 | | - clusters: Dictionary mapping cluster_id to list of doc_ids |
874 | | - language: Language code ('zh' or 'en') |
875 | | - max_words: Maximum words per cluster summary |
876 | | - |
877 | | - Returns: |
878 | | - Dictionary mapping cluster_id to summary text |
879 | | - """ |
880 | | - cluster_summaries = {} |
881 | | - |
882 | | - for cluster_id, doc_ids in clusters.items(): |
883 | | - logger.info(f"Summarizing cluster {cluster_id} with {len(doc_ids)} documents") |
884 | | - |
885 | | - # Extract cluster content |
886 | | - cluster_content = extract_cluster_content(document_samples, doc_ids, max_chunks_per_doc=3) |
887 | | - |
888 | | - # Generate summary using old method |
889 | | - summary = summarize_cluster_legacy(cluster_content, language, max_words) |
890 | | - cluster_summaries[cluster_id] = summary |
891 | | - |
892 | | - return cluster_summaries |
893 | | - |
894 | | - |
895 | | -def summarize_cluster_legacy(cluster_content: str, language: str = LANGUAGE["ZH"], max_words: int = 150) -> str: |
896 | | - """ |
897 | | - Legacy cluster summarization method (single-stage) |
898 | | - |
899 | | - Args: |
900 | | - cluster_content: Formatted content from the cluster |
901 | | - language: Language code ('zh' or 'en') |
902 | | - max_words: Maximum words in the summary |
903 | | - |
904 | | - Returns: |
905 | | - Cluster summary text |
906 | | - """ |
907 | | - try: |
908 | | - # Get prompt template from prompt_template_utils |
909 | | - prompts = get_cluster_summary_agent_prompt_template(language) |
910 | | - |
911 | | - system_prompt = prompts.get('system_prompt', '') |
912 | | - user_prompt_template = prompts.get('user_prompt', '') |
913 | | - |
914 | | - user_prompt = Template(user_prompt_template, undefined=StrictUndefined).render( |
915 | | - cluster_content=cluster_content, |
916 | | - max_words=max_words |
917 | | - ) |
918 | | - |
919 | | - logger.info(f"Cluster summary prompt generated (language: {language}, max_words: {max_words})") |
920 | | - |
921 | | - # Note: This is a legacy function, using placeholder summary |
922 | | - # The main summarization uses summarize_cluster() with LLM integration |
923 | | - return f"[Cluster Summary] (max {max_words} words) - Content preview: {cluster_content[:200]}..." |
924 | | - |
925 | | - except Exception as e: |
926 | | - logger.error(f"Error generating cluster summary: {str(e)}", exc_info=True) |
927 | | - return f"Failed to generate summary: {str(e)}" |
928 | 815 |
|
0 commit comments