# This adds an answer evaluation that assesses the quality of the answer generated by the entire RAG pipeline.
# In this example, we evaluate the answer's relevance to the question and expected output,
# and its faithfulness to the retrieved source documents.

from typing import Annotated, TypedDict

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langfuse import Evaluation, get_client
from langfuse.experiment import ExperimentItem
from main import rag_bot

load_dotenv()
langfuse = get_client()


def rag_task(*, item: ExperimentItem, **kwargs):
    """Task function that runs the full RAG pipeline."""
    question = item.input["question"]  # type: ignore
    result = rag_bot(question)

    return {"answer": result["answer"], "documents": result["documents"]}
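
# NOTE: this task assumes each dataset item's input is a dict with a "question" key and that
# rag_bot (imported from main) returns {"answer": str, "documents": [Document, ...]};
# the evaluators below rely on this shape via output["answer"] and doc.page_content.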


# Answer Relevance Evaluation
class AnswerRelevanceGrade(TypedDict):
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    score: Annotated[int, ..., "Rate the relevance of the answer to the question as 0 or 1"]


answer_relevance_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
    AnswerRelevanceGrade, method="json_schema", strict=True
)
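
# with_structured_output constrains the grader model to return a dict matching the TypedDict
# schema above, so result["score"] and result["explanation"] can be read directly in the evaluator.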

answer_relevance_instructions = """You are evaluating the relevance of an answer to a question.
You will be given a QUESTION, an ANSWER, and an EXPECTED OUTPUT.

Here are the grading criteria to follow:
(1) The ANSWER should directly address the QUESTION
(2) The ANSWER should be similar in scope to the EXPECTED OUTPUT
(3) The ANSWER should not contain significant irrelevant information
(4) It's acceptable if the ANSWER provides additional helpful context as long as it addresses the core question

You should return a score of 0 or 1, where:
- 0: The answer is irrelevant or doesn't address the question
- 1: The answer is relevant and addresses the question
"""


def answer_relevance_evaluator(*, input, output, expected_output, metadata, **kwargs):
    """Evaluates how relevant the generated answer is to the question."""
    result = answer_relevance_llm.invoke(
        answer_relevance_instructions
        + "\n\nQUESTION: "
        + input["question"]
        + "\n\nANSWER: "
        + output["answer"]
        + "\n\nEXPECTED OUTPUT: "
        + expected_output["answer"]
    )

    return Evaluation(name="answer_relevance", value=result["score"], comment=result.get("explanation", ""))
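
# The returned Evaluation (name, value, optional comment) is recorded by Langfuse as a score
# on the corresponding experiment item.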


# Faithfulness Evaluation
class FaithfulnessGrade(TypedDict):
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    score: Annotated[int, ..., "Rate the faithfulness of the answer to the source documents as 0 or 1"]


faithfulness_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
    FaithfulnessGrade, method="json_schema", strict=True
)

faithfulness_instructions = """You are evaluating the faithfulness of an answer to the source documents.
You will be given an ANSWER and the FACTS (source documents) that were used to generate it.

Here are the grading criteria to follow:
(1) The ANSWER should only contain information that can be verified from the FACTS
(2) The ANSWER should not hallucinate or make up information not present in the FACTS
(3) The ANSWER should not contradict information in the FACTS
(4) It's acceptable for the ANSWER to say "I don't know" if the FACTS don't contain the information

You should return a score of 0 or 1, where:
- 1: The answer is fully grounded in the source facts
- 0: The answer contains hallucinations or unverified claims

Explain your reasoning for the score."""


def faithfulness_evaluator(*, input, output, expected_output, metadata, **kwargs):
    """Evaluates how faithful the generated answer is to the source facts."""
    result = faithfulness_llm.invoke(
        faithfulness_instructions
        + "\n\nANSWER: "
        + output["answer"]
        + "\n\nFACTS: "
        + "\n\n".join(doc.page_content for doc in output["documents"])
    )

    return Evaluation(name="faithfulness", value=result["score"], comment=result.get("explanation", ""))


if __name__ == "__main__":
    print("Fetching dataset")
    dataset = langfuse.get_dataset(name="rag_bot_evals")

    print("Running answer evaluation experiment")
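    # run_experiment executes rag_task once per dataset item, applies both evaluators to each
    # result, and records the outputs and scores as a dataset run in Langfuse.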
    dataset.run_experiment(
        name="Answer Quality: Relevance and Faithfulness",
        task=rag_task,
        evaluators=[answer_relevance_evaluator, faithfulness_evaluator],
    )

    print("Experiment run successfully")
    langfuse.flush()
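
# The experiment assumes a Langfuse dataset named "rag_bot_evals" already exists.
# Below is a minimal, hypothetical sketch of how such a dataset could be seeded (run once,
# separately); the example item is a placeholder, not part of the real evaluation set:
#
#     langfuse.create_dataset(name="rag_bot_evals")
#     langfuse.create_dataset_item(
#         dataset_name="rag_bot_evals",
#         input={"question": "What is retrieval-augmented generation?"},
#         expected_output={"answer": "A technique that grounds LLM answers in retrieved documents."},
#     )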