|
"""Evaluate saved RAG results with Ragas (faithfulness, answer relevancy).

Loads the question/answer/contexts records produced by the generation
script, scores them using a local llama.cpp model as the judge LLM, and
writes the aggregate metric scores to ragas_evaluation_results.json.
"""

import json
import traceback

import numpy as np
from datasets import Dataset
from langchain_community.llms import LlamaCpp  # LangChain's LlamaCpp for evaluation
from llama_index.embeddings.huggingface import (
    HuggingFaceEmbedding,
)  # Needed for embedding-based metrics (answer_relevancy)
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import answer_relevancy, faithfulness

# --- 1. Configuration ---
MODEL_PATH = "D:/Mistral7B/mistral-7b-instruct-v0.2.Q4_K_M.gguf"  # Same model used for generation
INPUT_FILE = "D:/Mistral7B/rag_results.json"  # The file saved by the previous script
OUTPUT_FILE = "ragas_evaluation_results.json"  # Where the aggregate scores are written

# --- 2. Load the Saved Results ---
print(f"Loading results from {INPUT_FILE}...")
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    loaded_data = json.load(f)

# Ragas requires a Dataset with columns named 'question', 'answer', 'contexts'.
# NOTE(review): only the FIRST record is evaluated (keeps the run short on CPU);
# use Dataset.from_list(loaded_data) to score every record.
eval_dataset = Dataset.from_list([loaded_data[0]])
print(f"Loaded {len(eval_dataset)} results.")

# --- 3. Initialize Evaluator Model and Embeddings ---
print("Initializing evaluator models...")

# NOTE(review): n_ctx=1024 is deliberately small to save memory, but the
# faithfulness judge prompt embeds the full contexts and may exceed it —
# confirm the prompts fit, or raise n_ctx.
eval_llm = LlamaCpp(
    model_path=MODEL_PATH,
    n_ctx=1024,  # Keep reduced context
)
ragas_llm = LangchainLLMWrapper(eval_llm)

# --- Smoke-test the judge LLM before committing to the (long) Ragas run ---
print("\n--- Testing eval_llm directly ---")
try:
    test_prompt = (
        "Explain the importance of testing in software development in one sentence."
    )
    print(f"Sending test prompt: {test_prompt}")
    response = eval_llm.invoke(test_prompt)
    print(f"Test response received: {response}")
    print("--- eval_llm test successful ---\n")
except Exception as e:
    # Best-effort diagnostic: report the failure but continue, so the
    # Ragas run below still gets a chance to execute.
    print("--- eval_llm test FAILED ---")
    print(f"Error during direct invocation: {e}")
    traceback.print_exc()

# answer_relevancy needs an embedding model.
# NOTE(review): this is a llama_index embedding object; ragas.evaluate()
# generally expects a LangChain Embeddings or a BaseRagasEmbeddings wrapper
# (ragas provides LlamaIndexEmbeddingsWrapper) — verify the installed ragas
# version accepts it as-is.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# --- 4. Run Ragas Evaluation ---
print("\n--- Running Ragas Accuracy Evaluation ---")

try:
    print("Starting Ragas evaluate()...")
    result = evaluate(
        eval_dataset,
        metrics=[
            faithfulness,
            answer_relevancy,
        ],
        llm=ragas_llm,
        embeddings=embed_model,
        # raise_exceptions=False  # Optional: record per-row failures as NaN instead of crashing
    )
    print("Ragas evaluate() finished.")

    print("\n--- Ragas Accuracy Results ---")
    print(result)

    # Save the aggregate scores. Depending on the ragas version, `result` is
    # either a dict subclass or a result object exposing a scores mapping —
    # try dict() first so newer versions are not silently skipped (the old
    # `isinstance(result, dict)` check dropped them without saving).
    print("Preparing to save results to JSON...")
    try:
        scores = dict(result)
    except (TypeError, ValueError):
        scores = None

    if scores:
        # Convert numpy scalars to native Python floats; NaN is not valid
        # JSON, so failed metrics are stored as null. Plain ints are kept
        # too (the original check excluded them).
        result_dict = {
            k: float(v)
            if isinstance(v, (int, float, np.number)) and not np.isnan(v)
            else None
            for k, v in scores.items()
        }
        print(f"Result dictionary prepared: {result_dict}")
        # Open the file only once there is something to write, so a skipped
        # save no longer truncates/creates an empty results file.
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            json.dump(result_dict, f, indent=4)
        print("Results saved to ragas_evaluation_results.json")
    else:
        print("Evaluation result was None or not a dictionary, skipping save.")

except Exception as e:
    print("\n--- Evaluation Error ---")
    print(f"Error during evaluation or saving: {e}")
    traceback.print_exc()

# Explicitly drop the llama.cpp model objects so their memory is released
# promptly. ragas_llm is created immediately after eval_llm, so if eval_llm
# exists, ragas_llm does too.
if "eval_llm" in locals():
    print("Deleting LLM objects...")
    del ragas_llm
    del eval_llm
    print("LLM objects deleted.")

print("\n--- Evaluation Script Finished ---")
0 commit comments