diff --git a/evaluation/.env-example b/evaluation/.env-example index bda935442..0e94e9caa 100644 --- a/evaluation/.env-example +++ b/evaluation/.env-example @@ -22,13 +22,8 @@ SUPERMEMORY_API_KEY="sm_xxx" MEMOBASE_API_KEY="xxx" MEMOBASE_PROJECT_URL="http://***.***.***.***:8019" -# pref -PRE_SPLIT_CHUNK=false # pre split chunk in client end, for personamem and prefeval -# 1. text_mem + pref_mem + instruction_completion: set INSTRUCT_COMPLETE=true, ABLATION_PREF=false -# 2. text_mem + pref_mem: set INSTRUCT_COMPLETE=false, ABLATION_PREF=false -# 3. text_mem: set INSTRUCT_COMPLETE=false, ABLATION_PREF=true -INSTRUCT_COMPLETE=true # use instruct complete format or not -ABLATION_PREF=false # remove pref mem, only text mem +# eval settings +PRE_SPLIT_CHUNK=false # Configuration Only For Scheduler # RabbitMQ Configuration diff --git a/evaluation/scripts/PrefEval/pref_memos.py b/evaluation/scripts/PrefEval/pref_memos.py index 753a77d99..7336d4612 100644 --- a/evaluation/scripts/PrefEval/pref_memos.py +++ b/evaluation/scripts/PrefEval/pref_memos.py @@ -72,7 +72,6 @@ def search_memory_for_line(line_data: tuple, mem_client, top_k_value: int) -> di """ Processes a single line of data, searching memory based on the question. """ - from utils.pref_mem_utils import create_mem_string i, line = line_data try: @@ -94,7 +93,13 @@ def search_memory_for_line(line_data: tuple, mem_client, top_k_value: int) -> di start_time_search = time.monotonic() relevant_memories = mem_client.search(query=question, user_id=user_id, top_k=top_k_value) search_memories_duration = time.monotonic() - start_time_search - memories_str = create_mem_string(relevant_memories) + memories_str = ( + "\n".join( + f"- {entry.get('memory', '')}" + for entry in relevant_memories["text_mem"][0]["memories"] + ) + + f"\n{relevant_memories['pref_mem']}" + ) memory_tokens_used = len(tokenizer.encode(memories_str)) @@ -119,7 +124,6 @@ def generate_response_for_line(line_data: tuple, openai_client: OpenAI, lib: str """ Generates a response for a single line of data using pre-fetched memories. """ - from utils.pref_mem_utils import add_pref_instruction, remove_pref_mem_from_mem_string from utils.prompts import PREFEVAL_ANSWER_PROMPT i, line = line_data @@ -146,10 +150,7 @@ def generate_response_for_line(line_data: tuple, openai_client: OpenAI, lib: str ) return original_data - memories_str = remove_pref_mem_from_mem_string(memories_str, frame=lib) - - template = add_pref_instruction(PREFEVAL_ANSWER_PROMPT, frame=lib) - system_prompt = template.format(context=memories_str) + system_prompt = PREFEVAL_ANSWER_PROMPT.format(context=memories_str) messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": question}, diff --git a/evaluation/scripts/locomo/locomo_responses.py b/evaluation/scripts/locomo/locomo_responses.py index 2ae4dcb6e..35a444b7d 100644 --- a/evaluation/scripts/locomo/locomo_responses.py +++ b/evaluation/scripts/locomo/locomo_responses.py @@ -35,10 +35,7 @@ async def locomo_response(frame, llm_client, context: str, question: str) -> str question=question, ) else: - from utils.pref_mem_utils import add_pref_instruction - - template = add_pref_instruction(ANSWER_PROMPT_MEMOS, frame=frame) - prompt = template.format( + prompt = ANSWER_PROMPT_MEMOS.format( context=context, question=question, ) @@ -55,8 +52,6 @@ async def locomo_response(frame, llm_client, context: str, question: str) -> str async def process_qa(frame, qa, search_result, oai_client): - from utils.pref_mem_utils import remove_pref_mem_from_mem_string - start = time() query = qa.get("question") gold_answer = qa.get("answer") @@ -64,7 +59,6 @@ async def process_qa(frame, qa, search_result, oai_client): context = search_result.get("context") - context = remove_pref_mem_from_mem_string(context, frame) answer = await locomo_response(frame, oai_client, context, query) response_duration_ms = (time() - start) * 1000 diff --git a/evaluation/scripts/locomo/locomo_search.py b/evaluation/scripts/locomo/locomo_search.py index 19efb5b92..c629124dd 100644 --- a/evaluation/scripts/locomo/locomo_search.py +++ b/evaluation/scripts/locomo/locomo_search.py @@ -100,14 +100,19 @@ def memos_api_search( client, query, speaker_a_user_id, speaker_b_user_id, top_k, speaker_a, speaker_b ): from prompts import TEMPLATE_MEMOS - from utils.pref_mem_utils import create_mem_string start = time() search_a_results = client.search(query=query, user_id=speaker_a_user_id, top_k=top_k) search_b_results = client.search(query=query, user_id=speaker_b_user_id, top_k=top_k) - speaker_a_context = create_mem_string(search_a_results) - speaker_b_context = create_mem_string(search_b_results) + speaker_a_context = ( + "\n".join([i["memory"] for i in search_a_results["text_mem"][0]["memories"]]) + + f"\n{search_a_results['pref_mem']}" + ) + speaker_b_context = ( + "\n".join([i["memory"] for i in search_b_results["text_mem"][0]["memories"]]) + + f"\n{search_b_results['pref_mem']}" + ) context = TEMPLATE_MEMOS.format( speaker_1=speaker_a, diff --git a/evaluation/scripts/locomo/prompts.py b/evaluation/scripts/locomo/prompts.py index caf462f6a..152e5b87f 100644 --- a/evaluation/scripts/locomo/prompts.py +++ b/evaluation/scripts/locomo/prompts.py @@ -1,14 +1,3 @@ -import os - - -PREF_INSTRUCTIONS = """ - # Note: - Plaintext memory are summaries of facts, while preference memories are summaries of user preferences. - Your response must not violate any of the user's preferences, whether explicit or implicit, and briefly explain why you answer this way to avoid conflicts. - When encountering preference conflicts, the priority is: explicit preference > implicit preference > plaintext memory. -""" - - ANSWER_PROMPT_MEM0 = """ You are an intelligent memory assistant tasked with retrieving accurate information from conversation memories. @@ -114,7 +103,7 @@ 5. Formulate a precise, concise answer based on the evidence from the memories (and allowed world knowledge). 6. Double-check that your answer directly addresses the question asked and adheres to all instructions. 7. Ensure your final answer is specific and avoids vague time references. - {pref_instructions} + {context} Question: {question} @@ -122,10 +111,6 @@ Answer: """ -if os.getenv("INSTRUCT_COMPLETE") == "true": - ANSWER_PROMPT_MEMOS = ANSWER_PROMPT_MEMOS.replace("{pref_instructions}", PREF_INSTRUCTIONS) -else: - ANSWER_PROMPT_MEMOS = ANSWER_PROMPT_MEMOS.replace("{pref_instructions}", "") custom_instructions = """ Generate personal memories that follow these guidelines: diff --git a/evaluation/scripts/longmemeval/lme_responses.py b/evaluation/scripts/longmemeval/lme_responses.py index 22f17c304..a4adf90b5 100644 --- a/evaluation/scripts/longmemeval/lme_responses.py +++ b/evaluation/scripts/longmemeval/lme_responses.py @@ -12,13 +12,11 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from utils.pref_mem_utils import add_pref_instruction, remove_pref_mem_from_mem_string from utils.prompts import LME_ANSWER_PROMPT -def lme_response(llm_client, context, question, question_date, frame): - template = add_pref_instruction(LME_ANSWER_PROMPT, frame=frame) - prompt = template.format( +def lme_response(llm_client, context, question, question_date): + prompt = LME_ANSWER_PROMPT.format( question=question, question_date=question_date, context=context, @@ -35,14 +33,13 @@ def lme_response(llm_client, context, question, question_date, frame): return result -def process_qa(user_id, search_result, llm_client, frame): +def process_qa(user_id, search_result, llm_client): start = time() search_result = search_result[0] question = search_result.get("question") question_date = search_result.get("date") context = search_result.get("search_context", "") - context = remove_pref_mem_from_mem_string(context, frame=frame) - anwer = lme_response(llm_client, context, question, question_date, frame) + anwer = lme_response(llm_client, context, question, question_date) response_duration_ms = (time() - start) * 1000 @@ -97,7 +94,7 @@ def main(frame, version, num_workers=4): future_to_user_id = {} for user_id, search_results in lme_search_results.items(): - future = executor.submit(process_qa, user_id, search_results, oai_client, frame) + future = executor.submit(process_qa, user_id, search_results, oai_client) future_to_user_id[future] = user_id for future in tqdm( diff --git a/evaluation/scripts/longmemeval/lme_search.py b/evaluation/scripts/longmemeval/lme_search.py index d21795eef..c02518083 100644 --- a/evaluation/scripts/longmemeval/lme_search.py +++ b/evaluation/scripts/longmemeval/lme_search.py @@ -13,7 +13,6 @@ import pandas as pd from tqdm import tqdm -from utils.pref_mem_utils import create_mem_string from utils.prompts import ( MEM0_CONTEXT_TEMPLATE, MEM0_GRAPH_CONTEXT_TEMPLATE, @@ -45,7 +44,10 @@ def mem0_search(client, query, user_id, top_k): def memos_search(client, query, user_id, top_k): start = time() results = client.search(query=query, user_id=user_id, top_k=top_k) - context = create_mem_string(results) + context = ( + "\n".join([i["memory"] for i in results["text_mem"][0]["memories"]]) + + f"\n{results['pref_mem']}" + ) context = MEMOS_CONTEXT_TEMPLATE.format(user_id=user_id, memories=context) duration_ms = (time() - start) * 1000 return context, duration_ms diff --git a/evaluation/scripts/personamem/pm_responses.py b/evaluation/scripts/personamem/pm_responses.py index 5b54f9bb8..ff561f8d8 100644 --- a/evaluation/scripts/personamem/pm_responses.py +++ b/evaluation/scripts/personamem/pm_responses.py @@ -14,7 +14,6 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import re -from utils.pref_mem_utils import add_pref_instruction, remove_pref_mem_from_mem_string from utils.prompts import PM_ANSWER_PROMPT @@ -49,9 +48,8 @@ def _extract_only_options(text): return False, predicted_answer -def pm_response(llm_client, context, question, options, frame): - template = add_pref_instruction(PM_ANSWER_PROMPT, frame=frame) - prompt = template.format( +def pm_response(llm_client, context, question, options): + prompt = PM_ANSWER_PROMPT.format( question=question, context=context, options=options, @@ -68,19 +66,17 @@ def pm_response(llm_client, context, question, options, frame): return result -def process_qa(user_id, search_result, num_runs, llm_client, frame): +def process_qa(user_id, search_result, num_runs, llm_client): search_result = search_result[0] question = search_result.get("question") context = search_result.get("search_context", "") options = search_result.get("all_options", []) - context = remove_pref_mem_from_mem_string(context, frame=frame) - run_results = [] for idx in range(num_runs): start = time() - answer = pm_response(llm_client, context, question, options, frame) + answer = pm_response(llm_client, context, question, options) is_correct, answer = extract_choice_answer(answer, search_result.get("golden_answer", "")) response_duration_ms = (time() - start) * 1000 @@ -154,9 +150,7 @@ def main(frame, version, num_runs=3, num_workers=4): future_to_user_id = {} for user_id, search_results in pm_search_results.items(): - future = executor.submit( - process_qa, user_id, search_results, num_runs, oai_client, frame - ) + future = executor.submit(process_qa, user_id, search_results, num_runs, oai_client) future_to_user_id[future] = user_id for future in tqdm( diff --git a/evaluation/scripts/personamem/pm_search.py b/evaluation/scripts/personamem/pm_search.py index 243c64589..c18e05623 100644 --- a/evaluation/scripts/personamem/pm_search.py +++ b/evaluation/scripts/personamem/pm_search.py @@ -14,7 +14,6 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from utils.pref_mem_utils import create_mem_string from utils.prompts import ( MEM0_CONTEXT_TEMPLATE, MEM0_GRAPH_CONTEXT_TEMPLATE, @@ -83,7 +82,10 @@ def memobase_search(client, query, user_id, top_k): def memos_search(client, user_id, query, top_k): start = time() results = client.search(query=query, user_id=user_id, top_k=top_k) - search_memories = create_mem_string(results) + search_memories = ( + "\n".join(item["memory"] for cube in results["text_mem"] for item in cube["memories"]) + + f"\n{results['pref_mem']}" + ) context = MEMOS_CONTEXT_TEMPLATE.format(user_id=user_id, memories=search_memories) duration_ms = (time() - start) * 1000 diff --git a/evaluation/scripts/utils/pref_mem_utils.py b/evaluation/scripts/utils/pref_mem_utils.py deleted file mode 100644 index 22a5bb86c..000000000 --- a/evaluation/scripts/utils/pref_mem_utils.py +++ /dev/null @@ -1,43 +0,0 @@ -import os -import sys - - -sys.path.append(os.path.dirname(os.path.abspath(__file__))) -from prompts import PREF_INSTRUCTIONS - - -def create_mem_string(relevant_memories) -> str: - text_memories = [] - explicit = [] - implicit = [] - for item in relevant_memories["text_mem"]: - for mem in item["memories"]: - text_memories.append(mem["memory"]) - text_memories_text = "\n".join(f"{i + 1}. {mem}" for i, mem in enumerate(text_memories)).strip() - text_context = f"Plaintext Memory:\n{text_memories_text}\n" if text_memories_text else "" - - for item in relevant_memories.get("prefs", []): - for mem in item["memories"]: - if mem["metadata"]["preference_type"] == "explicit_preference": - explicit.append(mem["metadata"]["explicit_preference"]) - elif mem["metadata"]["preference_type"] == "implicit_preference": - implicit.append(mem["metadata"]["implicit_preference"]) - explicit_text = "\n".join(f"{i + 1}. {pref}" for i, pref in enumerate(explicit)).strip() - explicit_context = f"Explicit Preference:\n{explicit_text}\n" if explicit_text else "" - implicit_text = "\n".join(f"{i + 1}. {pref}" for i, pref in enumerate(implicit)).strip() - implicit_context = f"Implicit Preference:\n{implicit_text}\n" if implicit_text else "" - return text_context + explicit_context + implicit_context - - -def remove_pref_mem_from_mem_string(mem_string: str, frame: str) -> str: - if os.getenv("ABLATION_PREF", "false").lower() == "true" and frame == "memos-api": - tmp_list = mem_string.split("Plaintext Memory:") - if len(tmp_list) > 1: - return tmp_list[1].split("Explicit Preference:")[0] - return mem_string - - -def add_pref_instruction(template: str, frame: str): - if os.getenv("INSTRUCT_COMPLETE", "false").lower() == "true" and frame == "memos-api": - return template.replace("{pref_instructions}", PREF_INSTRUCTIONS) - return template.replace("{pref_instructions}", "") diff --git a/evaluation/scripts/utils/prompts.py b/evaluation/scripts/utils/prompts.py index 902bbb1be..32e6d6729 100644 --- a/evaluation/scripts/utils/prompts.py +++ b/evaluation/scripts/utils/prompts.py @@ -1,11 +1,3 @@ -PREF_INSTRUCTIONS = """ - # Note: - Plaintext memory are summaries of facts, while preference memories are summaries of user preferences. - Your response must not violate any of the user's preferences, whether explicit or implicit, and briefly explain why you answer this way to avoid conflicts. - When encountering preference conflicts, the priority is: explicit preference > implicit preference > plaintext memory. -""" - - LME_ANSWER_PROMPT = """ You are an intelligent memory assistant tasked with retrieving accurate information from conversation memories. @@ -25,7 +17,7 @@ 5. Formulate a precise, concise answer based solely on the evidence in the memories. 6. Double-check that your answer directly addresses the question asked. 7. Ensure your final answer is specific and avoids vague time references. - {pref_instructions} + {context} Current Date: {question_date} @@ -55,7 +47,7 @@ - Your final answer **must use parentheses**, like (a) or (b). - Do NOT list multiple choices. Choose only one. - Do NOT include extra text after . Just output the answer. - {pref_instructions} + # QUESTION: {question} @@ -71,7 +63,6 @@ You are a helpful AI. Answer the question based on the query and the following memories: User Memories: {context} - {pref_instructions} """ diff --git a/src/memos/vec_dbs/milvus.py b/src/memos/vec_dbs/milvus.py index fb19fd6ff..c1cb26362 100644 --- a/src/memos/vec_dbs/milvus.py +++ b/src/memos/vec_dbs/milvus.py @@ -138,7 +138,7 @@ def search( items.append( MilvusVecDBItem( - id=str(hit["id"]), + id=str(entity.get("id")), memory=entity.get("memory"), vector=entity.get("vector"), payload=entity.get("payload", {}),