diff --git a/.gitignore b/.gitignore index 23d6b5655..98dd604cb 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,6 @@ __pycache__ .env* .venv/ logs/ +trivy_report.txt +.sonar/.sonar_lock +.sonar/report-task.txt \ No newline at end of file diff --git a/cookbook/agentic_retrieval.ipynb b/cookbook/agentic_retrieval.ipynb index dacaf01ea..fb175ce1e 100644 --- a/cookbook/agentic_retrieval.ipynb +++ b/cookbook/agentic_retrieval.ipynb @@ -1,899 +1,902 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "XTboY7brzyp2" - }, - "source": [ - "![pageindex_banner](https://pageindex.ai/static/images/pageindex_banner.jpg)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EtjMbl9Pz3S-" - }, - "source": [ - "

Reasoning-based RAG  ◦  No Vector DB  ◦  No Chunking  ◦  Human-like Retrieval

\n", - "\n", - "

\n", - " 🏠 Homepage  •  \n", - " 🖥️ Platform  •  \n", - " 📚 API Docs  •  \n", - " 📦 GitHub  •  \n", - " 💬 Discord  •  \n", - " ✉️ Contact \n", - "

\n", - "\n", - "
\n", - "\n", - "[![Star us on GitHub](https://img.shields.io/github/stars/VectifyAI/PageIndex?style=for-the-badge&logo=github&label=⭐️%20Star%20Us)](https://github.com/VectifyAI/PageIndex)    [![Follow us on X](https://img.shields.io/badge/Follow%20Us-000000?style=for-the-badge&logo=x&logoColor=white)](https://twitter.com/VectifyAI)\n", - "\n", - "
\n", - "\n", - "---\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bbC9uLWCz8zl" - }, - "source": [ - "# Agentic Retrieval with PageIndex Chat API\n", - "\n", - "Similarity-based RAG based on Vector-DB has shown big limitations in recent AI applications, reasoning-based or agentic retrieval has become important in current developments. However, unlike classic RAG pipeine with embedding input, top-K chunks returns, re-rank, what should a agentic-native retreival API looks like?\n", - "\n", - "For an agentic-native retrieval system, we need the ability to prompt for retrieval just as naturally as you interact with ChatGPT. Below, we provide an example of how the PageIndex Chat API enables this style of prompt-driven retrieval.\n", - "\n", - "\n", - "## PageIndex Chat API\n", - "[PageIndex Chat](https://chat.pageindex.ai/) is a AI assistant that allow you chat with multiple super-long documents without worrying about limited context or context rot problem. It is based on [PageIndex](https://pageindex.ai/blog/pageindex-intro), a vectorless reasoning-based RAG framework which gives more transparent and reliable results like a human expert.\n", - "
\n", - " \n", - "
\n", - "\n", - "You can now access PageIndex Chat with API or SDK.\n", - "\n", - "## 📝 Notebook Overview\n", - "\n", - "This notebook demonstrates a simple, minimal example of agentic retrieval with PageIndex. You will learn:\n", - "- [x] How to use PageIndex Chat API.\n", - "- [x] How to prompt the PageIndex Chat to make it a retrieval system" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "77SQbPoe-LTN" - }, - "source": [ - "### Install PageIndex SDK" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "id": "6Eiv_cHf0OXz" - }, - "outputs": [], - "source": [ - "%pip install -q --upgrade pageindex" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UR9-qkdD-Om7" - }, - "source": [ - "### Setup PageIndex" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": { - "id": "AFzsW4gq0fjh" - }, - "outputs": [], - "source": [ - "from pageindex import PageIndexClient\n", - "\n", - "# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n", - "PAGEINDEX_API_KEY = \"YOUR_PAGEINDEX_API_KEY\"\n", - "pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uvzf9oWL-Ts9" - }, - "source": [ - "### Upload a document" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qf7sNRoL0hGw", - "outputId": "529f53c1-c827-45a7-cf01-41f567d4feaa" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloaded https://arxiv.org/pdf/2507.13334.pdf\n", - "Document Submitted: pi-cmi34m6jy01sg0bqzofch62n8\n" - ] - } - ], - "source": [ - "import os, requests\n", - "\n", - "pdf_url = \"https://arxiv.org/pdf/2507.13334.pdf\"\n", - "pdf_path = os.path.join(\"../data\", pdf_url.split('/')[-1])\n", - "os.makedirs(os.path.dirname(pdf_path), exist_ok=True)\n", - "\n", - "response = requests.get(pdf_url)\n", - "with open(pdf_path, 
\"wb\") as f:\n", - " f.write(response.content)\n", - "print(f\"Downloaded {pdf_url}\")\n", - "\n", - "doc_id = pi_client.submit_document(pdf_path)[\"doc_id\"]\n", - "print('Document Submitted:', doc_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "U4hpLB4T-fCt" - }, - "source": [ - "### Check the processing status" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "PB1S_CWd2n87", - "outputId": "472a64ab-747d-469c-9e46-3329456df212" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'createdAt': '2025-11-16T08:36:41.177000',\n", - " 'description': 'This survey provides a comprehensive overview and taxonomy of '\n", - " 'Context Engineering for Large Language Models, covering '\n", - " 'foundational components, system implementations, evaluation '\n", - " 'methods, and future research directions.',\n", - " 'id': 'pi-cmi1gp1hg01t20do2l3bgzwz1',\n", - " 'name': '2507.13334_19.pdf',\n", - " 'pageNum': 166,\n", - " 'status': 'completed'}\n", - "\n", - " Document ready! (166 pages)\n" - ] - } - ], - "source": [ - "from pprint import pprint\n", - "\n", - "doc_info = pi_client.get_document(doc_id)\n", - "pprint(doc_info)\n", - "\n", - "if doc_info['status'] == 'completed':\n", - " print(f\"\\n Document ready! ({doc_info['pageNum']} pages)\")\n", - "elif doc_info['status'] == 'processing':\n", - " print(\"\\n Document is still processing. 
Please wait and check again.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z1C9FOvO-p1m" - }, - "source": [ - "### Ask a question about this document" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "X3RbQvy_0nt7", - "outputId": "9bfb314b-24ad-4eb2-d26c-01be5728d3cc" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "I'll help you find the evaluation methods used in this paper. Let me start by examining the document structure to locate the relevant sections.{\"doc_name\": \"2507.13334_19.pdf\"}Perfect! I can see there's a dedicated section on \"Evaluation\" (node_id: 0015) that covers pages 45-50. Let me extract the content from those pages to get detailed information about the evaluation methods.{\"doc_name\": \"2507.13334_19.pdf\", \"pages\": \"45-50\"}Based on the comprehensive evaluation section of the paper, here are the **evaluation methods** used:\n", - "\n", - "## Evaluation Framework Overview\n", - "\n", - "The paper presents a comprehensive evaluation framework organized into **Component-Level Assessment** and **System-Level Integration Assessment**.\n", - "\n", - "### 1. 
**Component-Level Assessment (Intrinsic Evaluation)**\n", - "\n", - "#### Prompt Engineering Evaluation:\n", - "- **Semantic similarity metrics**\n", - "- **Response quality assessment**\n", - "- **Robustness testing** across diverse input variations\n", - "- **Contextual calibration** assessment\n", - "\n", - "#### Long Context Processing Evaluation:\n", - "- **\"Needle in a haystack\"** evaluation paradigm - tests models' ability to retrieve specific information embedded within long contexts\n", - "- **Multi-document reasoning tasks** - assess synthesis capabilities\n", - "- **Position interpolation techniques** evaluation\n", - "- **Information retention, positional bias, and reasoning coherence** metrics\n", - "\n", - "#### Self-Contextualization Evaluation:\n", - "- **Meta-learning assessments**\n", - "- **Adaptation speed measurements**\n", - "- **Consistency analysis** across multiple iterations\n", - "- Self-refinement frameworks: **Self-Refine, Reflexion, N-CRITICS**\n", - "- Performance improvements measured (~20% improvement with GPT-4)\n", - "\n", - "#### Structured/Relational Data Integration:\n", - "- **Knowledge graph traversal accuracy**\n", - "- **Table comprehension assessment**\n", - "- **Database query generation evaluation**\n", - "\n", - "### 2. 
**System-Level Integration Assessment (Extrinsic Evaluation)**\n", - "\n", - "#### Retrieval-Augmented Generation (RAG):\n", - "- **Precision, recall, relevance metrics**\n", - "- **Factual accuracy assessment**\n", - "- **Task decomposition accuracy**\n", - "- **Multi-plan selection effectiveness**\n", - "- Memory-augmented planning evaluation\n", - "\n", - "#### Memory Systems Evaluation:\n", - "- **LongMemEval benchmark** (500 curated questions covering):\n", - " - Information extraction\n", - " - Temporal reasoning\n", - " - Multi-session reasoning\n", - " - Knowledge updates\n", - "- Dedicated benchmarks: **NarrativeQA, QMSum, QuALITY, MEMENTO**\n", - "- Accuracy degradation tracking (~30% degradation in extended interactions)\n", - "\n", - "#### Tool-Integrated Reasoning:\n", - "- **MCP-RADAR framework** for standardized evaluation\n", - "- **Berkeley Function Calling Leaderboard (BFCL)** - 2,000 test cases\n", - "- **T-Eval** - 553 tool-use cases\n", - "- **API-Bank** - 73 APIs, 314 dialogues\n", - "- **ToolHop** - 995 queries, 3,912 tools\n", - "- **StableToolBench** for API instability\n", - "- **WebArena** and **Mind2Web** for web agents\n", - "- **VideoWebArena** for multimodal agents\n", - "- Metrics: tool selection accuracy, parameter extraction precision, execution success rates, error recovery\n", - "\n", - "#### Multi-Agent Systems:\n", - "- **Communication effectiveness metrics**\n", - "- **Coordination efficiency assessment**\n", - "- **Protocol adherence evaluation**\n", - "- **Task decomposition accuracy**\n", - "- **Emergent collaborative behaviors** assessment\n", - "- Context handling and transaction support evaluation\n", - "\n", - "### 3. 
**Emerging Evaluation Paradigms**\n", - "\n", - "#### Self-Refinement Evaluation:\n", - "- Iterative improvement assessment across multiple cycles\n", - "- Multi-dimensional feedback mechanisms\n", - "- Ensemble-based evaluation approaches\n", - "\n", - "#### Multi-Aspect Feedback:\n", - "- Correctness, relevance, clarity, and robustness dimensions\n", - "- Self-rewarding mechanisms for autonomous evolution\n", - "\n", - "#### Criticism-Guided Evaluation:\n", - "- Specialized critic models providing detailed feedback\n", - "- Fine-grained assessment of reasoning quality, factual accuracy, logical consistency\n", - "\n", - "### 4. **Safety and Robustness Assessment**\n", - "\n", - "- **Adversarial attack resistance testing**\n", - "- **Distribution shift evaluation**\n", - "- **Input perturbation testing**\n", - "- **Alignment assessment** (adherence to intended behaviors)\n", - "- **Graceful degradation strategies**\n", - "- **Error recovery protocols**\n", - "- **Long-term behavior consistency** evaluation\n", - "\n", - "### Key Benchmarks Mentioned:\n", - "- GAIA (general assistant tasks - 92% human vs 15% GPT-4 accuracy)\n", - "- GTA benchmark (GPT-4 <50% task completion vs 92% human)\n", - "- WebArena Leaderboard (with success rates ranging from 23.5% to 61.7%)\n", - "\n", - "### Challenges Identified:\n", - "- Traditional metrics (BLEU, ROUGE, perplexity) inadequate for complex systems\n", - "- Need for \"living\" benchmarks that co-evolve with AI capabilities\n", - "- Longitudinal evaluation frameworks for tracking memory fidelity over time\n", - "- Compositional generalization assessment\n", - "- Evaluation of \"unknown unknowns\" in multi-agent systems\n", - "\n", - "The paper emphasizes a **paradigm shift from static benchmarks to dynamic, holistic assessments** that evaluate not just task success but reasoning quality, robustness, and long-term autonomy." 
- ] - } - ], - "source": [ - "query = \"What are the evaluation methods used in this paper?\"\n", - "\n", - "for chunk in pi_client.chat_completions(\n", - " messages=[{\"role\": \"user\", \"content\": query}],\n", - " doc_id=doc_id,\n", - " stream=True\n", - "):\n", - " print(chunk, end='', flush=True)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "XTboY7brzyp2" + }, + "source": [ + "![pageindex_banner](https://pageindex.ai/static/images/pageindex_banner.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EtjMbl9Pz3S-" + }, + "source": [ + "

Reasoning-based RAG  ◦  No Vector DB  ◦  No Chunking  ◦  Human-like Retrieval

\n", + "\n", + "

\n", + " 🏠 Homepage  •  \n", + " 🖥️ Platform  •  \n", + " 📚 API Docs  •  \n", + " 📦 GitHub  •  \n", + " 💬 Discord  •  \n", + " ✉️ Contact \n", + "

\n", + "\n", + "
\n", + "\n", + "[![Star us on GitHub](https://img.shields.io/github/stars/VectifyAI/PageIndex?style=for-the-badge&logo=github&label=⭐️%20Star%20Us)](https://github.com/VectifyAI/PageIndex)    [![Follow us on X](https://img.shields.io/badge/Follow%20Us-000000?style=for-the-badge&logo=x&logoColor=white)](https://twitter.com/VectifyAI)\n", + "\n", + "
\n", + "\n", + "---\n" ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "bbC9uLWCz8zl" + }, + "source": [ + "# Agentic Retrieval with PageIndex Chat API\n", + "\n", + "Similarity-based RAG based on Vector-DB has shown big limitations in recent AI applications, so reasoning-based or agentic retrieval has become important in current developments. However, unlike the classic RAG pipeline with embedding input, top-K chunk returns, and re-ranking, what should an agentic-native retrieval API look like?\n", + "\n", + "For an agentic-native retrieval system, we need the ability to prompt for retrieval just as naturally as you interact with ChatGPT. Below, we provide an example of how the PageIndex Chat API enables this style of prompt-driven retrieval.\n", + "\n", + "\n", + "## PageIndex Chat API\n", + "[PageIndex Chat](https://chat.pageindex.ai/) is an AI assistant that allows you to chat with multiple super-long documents without worrying about limited context or context rot problems. It is based on [PageIndex](https://pageindex.ai/blog/pageindex-intro), a vectorless reasoning-based RAG framework that gives more transparent and reliable results like a human expert.\n", + "
\n", + " \n", + "
\n", + "\n", + "You can now access PageIndex Chat with API or SDK.\n", + "\n", + "## 📝 Notebook Overview\n", + "\n", + "This notebook demonstrates a simple, minimal example of agentic retrieval with PageIndex. You will learn:\n", + "- [x] How to use PageIndex Chat API.\n", + "- [x] How to prompt the PageIndex Chat to make it a retrieval system" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "77SQbPoe-LTN" + }, + "source": [ + "### Install PageIndex SDK" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "id": "6Eiv_cHf0OXz" + }, + "outputs": [], + "source": [ + "%pip install -q --upgrade pageindex" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UR9-qkdD-Om7" + }, + "source": [ + "### Setup PageIndex" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "id": "AFzsW4gq0fjh" + }, + "outputs": [], + "source": [ + "from pageindex import PageIndexClient\n", + "\n", + "# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n", + "PAGEINDEX_API_KEY = \"YOUR_PAGEINDEX_API_KEY\"\n", + "pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uvzf9oWL-Ts9" + }, + "source": [ + "### Upload a document" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "qf7sNRoL0hGw", + "outputId": "529f53c1-c827-45a7-cf01-41f567d4feaa" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "RXrdnxv92LY1" - }, - "source": [ - "## Agentic Retrieval with PageIndex Chat API\n", - "\n", - "You can now easily prompt the PageIndex Chat API to be a retrieval assistant." 
- ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloaded https://arxiv.org/pdf/2507.13334.pdf\n", + "Document Submitted: pi-cmi34m6jy01sg0bqzofch62n8\n" + ] + } + ], + "source": [ + "import os\n", + "import requests\n", + "\n", + "pdf_url = \"https://arxiv.org/pdf/2507.13334.pdf\"\n", + "pdf_path = os.path.join(\"../data\", pdf_url.split('/')[-1])\n", + "os.makedirs(os.path.dirname(pdf_path), exist_ok=True)\n", + "\n", + "response = requests.get(pdf_url)\n", + "with open(pdf_path, \"wb\") as f:\n", + " f.write(response.content)\n", + "print(f\"Downloaded {pdf_url}\")\n", + "\n", + "doc_id = pi_client.submit_document(pdf_path)[\"doc_id\"]\n", + "print('Document Submitted:', doc_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U4hpLB4T-fCt" + }, + "source": [ + "### Check the processing status" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "PB1S_CWd2n87", + "outputId": "472a64ab-747d-469c-9e46-3329456df212" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 62, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "puyOGkAi0wrH", - "outputId": "b08c682f-abfb-4381-8bb0-726d210120ad" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "I'll help you find the evaluation methods discussed in this paper. Let me start by examining the document structure to locate relevant sections.{\"doc_name\": \"2507.13334_22.pdf\"}Now I can see the document structure. Section 6 is dedicated to \"Evaluation\" and spans pages 45-50. Let me retrieve the content from those pages to get the detailed evaluation methods.{\"doc_name\": \"2507.13334_22.pdf\", \"pages\": \"45-50\"}Perfect! I have retrieved the comprehensive evaluation section. 
Now let me compile the evaluation methods in the requested JSON format.\n", - "\n", - "```json\n", - "[\n", - " {\n", - " \"page\": 45,\n", - " \"content\": \"## 6. Evaluation\\n\\nThe evaluation of context-engineered systems presents unprecedented challenges that transcend traditional language model assessment paradigms. These systems exhibit complex, multi-component architectures with dynamic, context-dependent behaviors requiring comprehensive evaluation frameworks that assess component-level diagnostics, task-based performance, and overall system robustness [841, 1141].\\n\\nThe heterogeneous nature of context engineering components-spanning retrieval mechanisms, memory systems, reasoning chains, and multi-agent coordination-demands evaluation methodologies that can capture both individual component effectiveness and emergent system-level behaviors [314, 939].\\n\\n### 6.1. Evaluation Frameworks and Methodologies\\n\\nThis subsection presents comprehensive approaches for evaluating both individual components and integrated systems in context engineering.\\n\\n#### 6.1.1. Component-Level Assessment\\n\\nIntrinsic evaluation focuses on the performance of individual components in isolation, providing foundational insights into system capabilities and failure modes.\\n\\nFor prompt engineering components, evaluation encompasses prompt effectiveness measurement through semantic similarity metrics, response quality assessment, and robustness testing across diverse input variations. Current approaches reveal brittleness and robustness challenges in prompt design, necessitating more sophisticated evaluation frameworks that can assess contextual calibration and adaptive prompt optimization $[1141,669]$.\"\n", - " },\n", - " {\n", - " \"page\": 46,\n", - " \"content\": \"Long context processing evaluation requires specialized metrics addressing information retention, positional bias, and reasoning coherence across extended sequences. 
The \\\"needle in a haystack\\\" evaluation paradigm tests models' ability to retrieve specific information embedded within long contexts, while multi-document reasoning tasks assess synthesis capabilities across multiple information sources. Position interpolation techniques and ultra-long sequence processing methods face significant computational challenges that limit practical evaluation scenarios [737, 299].\\n\\nSelf-contextualization mechanisms undergo evaluation through meta-learning assessments, adaptation speed measurements, and consistency analysis across multiple iterations. Self-refinement frameworks including Self-Refine, Reflexion, and N-CRITICS demonstrate substantial performance improvements, with GPT-4 achieving approximately 20\\\\% improvement through iterative self-refinement processes [741, 964, 795]. Multi-dimensional feedback mechanisms and ensemble-based evaluation approaches provide comprehensive assessment of autonomous evolution capabilities [583, 710].\\n\\nStructured and relational data integration evaluation examines accuracy in knowledge graph traversal, table comprehension, and database query generation. However, current evaluation frameworks face significant limitations in assessing structural reasoning capabilities, with high-quality structured training data development presenting ongoing challenges. LSTM-based models demonstrate increased errors when sequential and structural information conflict, highlighting the need for more sophisticated benchmarks testing structural understanding $[769,674,167]$.\\n\\n#### 6.1.2. 
System-Level Integration Assessment\\n\\nExtrinsic evaluation measures end-to-end performance on downstream tasks, providing holistic assessments of system utility through comprehensive benchmarks spanning question answering, reasoning, and real-world applications.\\n\\nSystem-level evaluation must capture emergent behaviors arising from component interactions, including synergistic effects where combined components exceed individual performance and potential interference patterns where component integration degrades overall effectiveness [841, 1141].\\n\\nRetrieval-Augmented Generation evaluation encompasses both retrieval quality and generation effectiveness through comprehensive metrics addressing precision, recall, relevance, and factual accuracy. Agentic RAG systems introduce additional complexity requiring evaluation of task decomposition accuracy, multi-plan selection effectiveness, and memory-augmented planning capabilities. Self-reflection mechanisms demonstrate iterative improvement through feedback loops, with MemoryBank implementations incorporating Ebbinghaus Forgetting Curve principles for enhanced memory evaluation [444, 166, 1372, 1192, 41].\\n\\nMemory systems evaluation encounters substantial difficulties stemming from the absence of standardized assessment frameworks and the inherently stateless characteristics of contemporary LLMs. LongMemEval offers 500 carefully curated questions that evaluate fundamental capabilities encompassing information extraction, temporal reasoning, multi-session reasoning, and knowledge updates. Commercial AI assistants exhibit $30 \\\\%$ accuracy degradation throughout extended interactions, underscoring significant deficiencies in memory persistence and retrieval effectiveness [1340, 1180, 463, 847, 390]. 
Dedicated benchmarks such as NarrativeQA, QMSum, QuALITY, and MEMENTO tackle episodic memory evaluation challenges [556, 572].\\n\\nTool-integrated reasoning systems require comprehensive evaluation covering the entire interaction trajectory, including tool selection accuracy, parameter extraction precision, execution success rates, and error recovery capabilities. The MCP-RADAR framework provides standardized evaluation employing objective metrics for software engineering and mathematical reasoning domains. Real-world evaluation reveals\"\n", - " },\n", - " {\n", - " \"page\": 47,\n", - " \"content\": \"significant performance gaps, with GPT-4 completing less than 50\\\\% of tasks in the GTA benchmark, compared to human performance of $92 \\\\%$ [314, 1098, 126, 939]. Advanced benchmarks including BFCL (2,000 testing cases), T-Eval (553 tool-use cases), API-Bank (73 APIs, 314 dialogues), and ToolHop ( 995 queries, 3,912 tools) address multi-turn interactions and nested tool calling scenarios [263, 363, 377, 1264, 160, 835].\\n\\nMulti-agent systems evaluation captures communication effectiveness, coordination efficiency, and collective outcome quality through specialized metrics addressing protocol adherence, task decomposition accuracy, and emergent collaborative behaviors. Contemporary orchestration frameworks including LangGraph, AutoGen, and CAMEL demonstrate insufficient transaction support, with validation limitations emerging as systems rely exclusively on LLM self-validation capabilities without independent validation procedures. Context handling failures compound challenges as agents struggle with long-term context maintenance encompassing both episodic and semantic information [128, 394, 901].\\n\\n### 6.2. Benchmark Datasets and Evaluation Paradigms\\n\\nThis subsection reviews specialized benchmarks and evaluation paradigms designed for assessing context engineering system performance.\\n\\n#### 6.2.1. 
Foundational Component Benchmarks\\n\\nLong context processing evaluation employs specialized benchmark suites designed to test information retention, reasoning, and synthesis across extended sequences. Current benchmarks face significant computational complexity challenges, with $\\\\mathrm{O}\\\\left(\\\\mathrm{n}^{2}\\\\right)$ scaling limitations in attention mechanisms creating substantial memory constraints for ultra-long sequences. Position interpolation and extension techniques require sophisticated evaluation frameworks that can assess both computational efficiency and reasoning quality across varying sequence lengths [737, 299, 1236].\\n\\nAdvanced architectures including LongMamba and specialized position encoding methods demonstrate promising directions for long context processing, though evaluation reveals persistent challenges in maintaining coherence across extended sequences. The development of sliding attention mechanisms and memory-efficient implementations requires comprehensive benchmarks that can assess both computational tractability and task performance [1267, 351].\\n\\nStructured and relational data integration benchmarks encompass diverse knowledge representation formats and reasoning patterns. However, current evaluation frameworks face limitations in assessing structural reasoning capabilities, with the development of high-quality structured training data presenting ongoing challenges. Evaluation must address the fundamental tension between sequential and structural information processing, particularly in scenarios where these information types conflict [769, 674, 167].\\n\\n#### 6.2.2. System Implementation Benchmarks\\n\\nRetrieval-Augmented Generation evaluation leverages comprehensive benchmark suites addressing diverse retrieval and generation challenges. 
Modular RAG architectures demonstrate enhanced flexibility through specialized modules for retrieval, augmentation, and generation, enabling fine-grained evaluation of individual components and their interactions. Graph-enhanced RAG systems incorporating GraphRAG and LightRAG demonstrate improved performance in complex reasoning scenarios, though evaluation frameworks must address the additional complexity of graph traversal and multi-hop reasoning assessment [316, 973, 364].\\n\\nAgentic RAG systems introduce sophisticated planning and reflection mechanisms requiring evaluation\"\n", - " },\n", - " {\n", - " \"page\": 48,\n", - " \"content\": \"of task decomposition accuracy, multi-plan selection effectiveness, and iterative refinement capabilities. Real-time and streaming RAG applications present unique evaluation challenges in assessing both latency and accuracy under dynamic information conditions [444, 166, 1192].\\n\\nTool-integrated reasoning system evaluation employs comprehensive benchmarks spanning diverse tool usage scenarios and complexity levels. The Berkeley Function Calling Leaderboard (BFCL) provides 2,000 testing cases with step-by-step and end-to-end assessments measuring call accuracy, pass rates, and win rates across increasingly complex scenarios. T-Eval contributes 553 tool-use cases testing multi-turn interactions and nested tool calling capabilities [263, 1390, 835]. Advanced benchmarks including StableToolBench address API instability challenges, while NesTools evaluates nested tool scenarios and ToolHop assesses multi-hop tool usage across 995 queries and 3,912 tools [363, 377, 1264].\\n\\nWeb agent evaluation frameworks including WebArena and Mind2Web provide comprehensive assessment across thousands of tasks spanning 137 websites, revealing significant performance gaps in current LLM capabilities for complex web interactions. 
VideoWebArena extends evaluation to multimodal agents, while Deep Research Bench and DeepShop address specialized evaluation for research and shopping agents respectively $[1378,206,87,482]$.\\n\\nMulti-agent system evaluation employs specialized frameworks addressing coordination, communication, and collective intelligence. However, current frameworks face significant challenges in transactional integrity across complex workflows, with many systems lacking adequate compensation mechanisms for partial failures. Orchestration evaluation must address context management, coordination strategy effectiveness, and the ability to maintain system coherence under varying operational conditions [128, 901].\\n\\n| Release Date | Open Source | Method / Model | Success Rate (\\\\%) | Source |\\n| :-- | :--: | :-- | :--: | :-- |\\n| $2025-02$ | $\\\\times$ | IBM CUGA | 61.7 | $[753]$ |\\n| $2025-01$ | $\\\\times$ | OpenAI Operator | 58.1 | $[813]$ |\\n| $2024-08$ | $\\\\times$ | Jace.AI | 57.1 | $[476]$ |\\n| $2024-12$ | $\\\\times$ | ScribeAgent + GPT-4o | 53.0 | $[950]$ |\\n| $2025-01$ | $\\\\checkmark$ | AgentSymbiotic | 52.1 | $[1323]$ |\\n| $2025-01$ | $\\\\checkmark$ | Learn-by-Interact | 48.0 | $[998]$ |\\n| $2024-10$ | $\\\\checkmark$ | AgentOccam-Judge | 45.7 | $[1231]$ |\\n| $2024-08$ | $\\\\times$ | WebPilot | 37.2 | $[1331]$ |\\n| $2024-10$ | $\\\\checkmark$ | GUI-API Hybrid Agent | 35.8 | $[988]$ |\\n| $2024-09$ | $\\\\checkmark$ | Agent Workflow Memory | 35.5 | $[1144]$ |\\n| $2024-04$ | $\\\\checkmark$ | SteP | 33.5 | $[979]$ |\\n| $2025-06$ | $\\\\checkmark$ | TTI | 26.1 | $[951]$ |\\n| $2024-04$ | $\\\\checkmark$ | BrowserGym + GPT-4 | 23.5 | $[238]$ |\\n\\nTable 8: WebArena [1378] Leaderboard: Top performing models with their success rates and availability status.\\n\\n### 6.3. 
Evaluation Challenges and Emerging Paradigms\\n\\nThis subsection identifies current limitations in evaluation methodologies and explores emerging approaches for more effective assessment.\"\n", - " },\n", - " {\n", - " \"page\": 49,\n", - " \"content\": \"#### 6.3.1. Methodological Limitations and Biases\\n\\nTraditional evaluation metrics prove fundamentally inadequate for capturing the nuanced, dynamic behaviors exhibited by context-engineered systems. Static metrics like BLEU, ROUGE, and perplexity, originally designed for simpler text generation tasks, fail to assess complex reasoning chains, multi-step interactions, and emergent system behaviors. The inherent complexity and interdependencies of multi-component systems create attribution challenges where isolating failures and identifying root causes becomes computationally and methodologically intractable. Future metrics must evolve to capture not just task success, but the quality and robustness of the underlying reasoning process, especially in scenarios requiring compositional generalization and creative problem-solving [841, 1141].\\n\\nMemory system evaluation faces particular challenges due to the lack of standardized benchmarks and the stateless nature of current LLMs. Automated memory testing frameworks must address the isolation problem where different memory testing stages cannot be effectively separated, leading to unreliable assessment results. Commercial AI assistants demonstrate significant performance degradation during sustained interactions, with accuracy drops of up to $30 \\\\%$ highlighting critical gaps in current evaluation methodologies and pointing to the need for longitudinal evaluation frameworks that track memory fidelity over time $[1340,1180,463]$.\\n\\nTool-integrated reasoning system evaluation reveals substantial performance gaps between current systems and human-level capabilities. 
The GAIA benchmark demonstrates that while humans achieve $92 \\\\%$ accuracy on general assistant tasks, advanced models like GPT-4 achieve only $15 \\\\%$ accuracy, indicating fundamental limitations in current evaluation frameworks and system capabilities [778, 1098, 126]. Evaluation frameworks must address the complexity of multi-tool coordination, error recovery, and adaptive tool selection across diverse operational contexts [314, 939].\\n\\n#### 6.3.2. Emerging Evaluation Paradigms\\n\\nSelf-refinement evaluation paradigms leverage iterative improvement mechanisms to assess system capabilities across multiple refinement cycles. Frameworks including Self-Refine, Reflexion, and N-CRITICS demonstrate substantial performance improvements through multi-dimensional feedback and ensemblebased evaluation approaches. GPT-4 achieves approximately 20\\\\% improvement through self-refinement processes, highlighting the importance of evaluating systems across multiple iteration cycles rather than single-shot assessments. However, a key future challenge lies in evaluating the meta-learning capability itself—not just whether the system improves, but how efficiently and robustly it learns to refine its strategies over time $[741,964,795,583]$.\\n\\nMulti-aspect feedback evaluation incorporates diverse feedback dimensions including correctness, relevance, clarity, and robustness, providing comprehensive assessment of system outputs. Self-rewarding mechanisms enable autonomous evolution and meta-learning assessment, allowing systems to develop increasingly sophisticated evaluation criteria through iterative refinement [710].\\n\\nCriticism-guided evaluation employs specialized critic models to provide detailed feedback on system outputs, enabling fine-grained assessment of reasoning quality, factual accuracy, and logical consistency. 
These approaches address the limitations of traditional metrics by providing contextual, content-aware evaluation that can adapt to diverse task requirements and output formats [795, 583].\\n\\nOrchestration evaluation frameworks address the unique challenges of multi-agent coordination by incorporating transactional integrity assessment, context management evaluation, and coordination strategy effectiveness measurement. Advanced frameworks including SagaLLM provide transaction support and\"\n", - " },\n", - " {\n", - " \"page\": 50,\n", - " \"content\": \"independent validation procedures to address the limitations of systems that rely exclusively on LLM selfvalidation capabilities $[128,394]$.\\n\\n#### 6.3.3. Safety and Robustness Assessment\\n\\nSafety-oriented evaluation incorporates comprehensive robustness testing, adversarial attack resistance, and alignment assessment to ensure responsible development of context-engineered systems. Particular attention must be paid to the evaluation of agentic systems that can operate autonomously across extended periods, as these systems present unique safety challenges that traditional evaluation frameworks cannot adequately address $[973,364]$.\\n\\nRobustness evaluation must assess system performance under distribution shifts, input perturbations, and adversarial conditions through comprehensive stress testing protocols. Multi-agent systems face additional challenges in coordination failure scenarios, where partial system failures can cascade through the entire agent network. Evaluation frameworks must address graceful degradation strategies, error recovery protocols, and the ability to maintain system functionality under adverse conditions. 
Beyond predefined failure modes, future evaluation must grapple with assessing resilience to \\\"unknown unknowns\\\"-emergent and unpredictable failure cascades in highly complex, autonomous multi-agent systems [128, 394].\\n\\nAlignment evaluation measures system adherence to intended behaviors, value consistency, and beneficial outcome optimization through specialized assessment frameworks. Context engineering systems present unique alignment challenges due to their dynamic adaptation capabilities and complex interaction patterns across multiple components. Long-term evaluation must assess whether systems maintain beneficial behaviors as they adapt and evolve through extended operational periods [901].\\n\\nLooking ahead, the evaluation of context-engineered systems requires a paradigm shift from static benchmarks to dynamic, holistic assessments. Future frameworks must move beyond measuring task success to evaluating compositional generalization for novel problems and tracking long-term autonomy in interactive environments. The development of 'living' benchmarks that co-evolve with AI capabilities, alongside the integration of socio-technical and economic metrics, will be critical for ensuring these advanced systems are not only powerful but also reliable, efficient, and aligned with human values in real-world applications $[314,1378,1340]$.\\n\\nThe evaluation landscape for context-engineered systems continues evolving rapidly as new architectures, capabilities, and applications emerge. Future evaluation paradigms must address increasing system complexity while providing reliable, comprehensive, and actionable insights for system improvement and deployment decisions. 
The integration of multiple evaluation approaches-from component-level assessment to systemwide robustness testing-represents a critical research priority for ensuring the reliable deployment of context-engineered systems in real-world applications [841, 1141].\"\n", - " }\n", - "]\n", - "```" - ] - } - ], - "source": [ - "retrieval_prompt = f\"\"\"\n", - "Your job is to retrieve the raw relevant content from the document based on the user's query.\n", - "\n", - "Query: {query}\n", - "\n", - "Return in JSON format:\n", - "```json\n", - "[\n", - " {{\n", - " \"page\": ,\n", - " \"content\": \"\"\n", - " }},\n", - " ...\n", - "]\n", - "```\n", - "\"\"\"\n", - "\n", - "full_response = \"\"\n", - "\n", - "for chunk in pi_client.chat_completions(\n", - " messages=[{\"role\": \"user\", \"content\": retrieval_prompt}],\n", - " doc_id=doc_id,\n", - " stream=True\n", - "):\n", - " print(chunk, end='', flush=True)\n", - " full_response += chunk" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "{'createdAt': '2025-11-16T08:36:41.177000',\n", + " 'description': 'This survey provides a comprehensive overview and taxonomy of '\n", + " 'Context Engineering for Large Language Models, covering '\n", + " 'foundational components, system implementations, evaluation '\n", + " 'methods, and future research directions.',\n", + " 'id': 'pi-cmi1gp1hg01t20do2l3bgzwz1',\n", + " 'name': '2507.13334_19.pdf',\n", + " 'pageNum': 166,\n", + " 'status': 'completed'}\n", + "\n", + " Document ready! (166 pages)\n" + ] + } + ], + "source": [ + "from pprint import pprint\n", + "\n", + "doc_info = pi_client.get_document(doc_id)\n", + "pprint(doc_info)\n", + "\n", + "if doc_info['status'] == 'completed':\n", + " print(f\"\\n Document ready! ({doc_info['pageNum']} pages)\")\n", + "elif doc_info['status'] == 'processing':\n", + " print(\"\\n Document is still processing. 
Please wait and check again.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1C9FOvO-p1m" + }, + "source": [ + "### Ask a question about this document" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "X3RbQvy_0nt7", + "outputId": "9bfb314b-24ad-4eb2-d26c-01be5728d3cc" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "d-Y9towQ_CiF" - }, - "source": [ - "### Extract the JSON retreived results" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "I'll help you find the evaluation methods used in this paper. Let me start by examining the document structure to locate the relevant sections.{\"doc_name\": \"2507.13334_19.pdf\"}Perfect! I can see there's a dedicated section on \"Evaluation\" (node_id: 0015) that covers pages 45-50. Let me extract the content from those pages to get detailed information about the evaluation methods.{\"doc_name\": \"2507.13334_19.pdf\", \"pages\": \"45-50\"}Based on the comprehensive evaluation section of the paper, here are the **evaluation methods** used:\n", + "\n", + "## Evaluation Framework Overview\n", + "\n", + "The paper presents a comprehensive evaluation framework organized into **Component-Level Assessment** and **System-Level Integration Assessment**.\n", + "\n", + "### 1. 
**Component-Level Assessment (Intrinsic Evaluation)**\n", + "\n", + "#### Prompt Engineering Evaluation:\n", + "- **Semantic similarity metrics**\n", + "- **Response quality assessment**\n", + "- **Robustness testing** across diverse input variations\n", + "- **Contextual calibration** assessment\n", + "\n", + "#### Long Context Processing Evaluation:\n", + "- **\"Needle in a haystack\"** evaluation paradigm - tests models' ability to retrieve specific information embedded within long contexts\n", + "- **Multi-document reasoning tasks** - assess synthesis capabilities\n", + "- **Position interpolation techniques** evaluation\n", + "- **Information retention, positional bias, and reasoning coherence** metrics\n", + "\n", + "#### Self-Contextualization Evaluation:\n", + "- **Meta-learning assessments**\n", + "- **Adaptation speed measurements**\n", + "- **Consistency analysis** across multiple iterations\n", + "- Self-refinement frameworks: **Self-Refine, Reflexion, N-CRITICS**\n", + "- Performance improvements measured (~20% improvement with GPT-4)\n", + "\n", + "#### Structured/Relational Data Integration:\n", + "- **Knowledge graph traversal accuracy**\n", + "- **Table comprehension assessment**\n", + "- **Database query generation evaluation**\n", + "\n", + "### 2. 
**System-Level Integration Assessment (Extrinsic Evaluation)**\n", + "\n", + "#### Retrieval-Augmented Generation (RAG):\n", + "- **Precision, recall, relevance metrics**\n", + "- **Factual accuracy assessment**\n", + "- **Task decomposition accuracy**\n", + "- **Multi-plan selection effectiveness**\n", + "- Memory-augmented planning evaluation\n", + "\n", + "#### Memory Systems Evaluation:\n", + "- **LongMemEval benchmark** (500 curated questions covering):\n", + " - Information extraction\n", + " - Temporal reasoning\n", + " - Multi-session reasoning\n", + " - Knowledge updates\n", + "- Dedicated benchmarks: **NarrativeQA, QMSum, QuALITY, MEMENTO**\n", + "- Accuracy degradation tracking (~30% degradation in extended interactions)\n", + "\n", + "#### Tool-Integrated Reasoning:\n", + "- **MCP-RADAR framework** for standardized evaluation\n", + "- **Berkeley Function Calling Leaderboard (BFCL)** - 2,000 test cases\n", + "- **T-Eval** - 553 tool-use cases\n", + "- **API-Bank** - 73 APIs, 314 dialogues\n", + "- **ToolHop** - 995 queries, 3,912 tools\n", + "- **StableToolBench** for API instability\n", + "- **WebArena** and **Mind2Web** for web agents\n", + "- **VideoWebArena** for multimodal agents\n", + "- Metrics: tool selection accuracy, parameter extraction precision, execution success rates, error recovery\n", + "\n", + "#### Multi-Agent Systems:\n", + "- **Communication effectiveness metrics**\n", + "- **Coordination efficiency assessment**\n", + "- **Protocol adherence evaluation**\n", + "- **Task decomposition accuracy**\n", + "- **Emergent collaborative behaviors** assessment\n", + "- Context handling and transaction support evaluation\n", + "\n", + "### 3. 
**Emerging Evaluation Paradigms**\n", + "\n", + "#### Self-Refinement Evaluation:\n", + "- Iterative improvement assessment across multiple cycles\n", + "- Multi-dimensional feedback mechanisms\n", + "- Ensemble-based evaluation approaches\n", + "\n", + "#### Multi-Aspect Feedback:\n", + "- Correctness, relevance, clarity, and robustness dimensions\n", + "- Self-rewarding mechanisms for autonomous evolution\n", + "\n", + "#### Criticism-Guided Evaluation:\n", + "- Specialized critic models providing detailed feedback\n", + "- Fine-grained assessment of reasoning quality, factual accuracy, logical consistency\n", + "\n", + "### 4. **Safety and Robustness Assessment**\n", + "\n", + "- **Adversarial attack resistance testing**\n", + "- **Distribution shift evaluation**\n", + "- **Input perturbation testing**\n", + "- **Alignment assessment** (adherence to intended behaviors)\n", + "- **Graceful degradation strategies**\n", + "- **Error recovery protocols**\n", + "- **Long-term behavior consistency** evaluation\n", + "\n", + "### Key Benchmarks Mentioned:\n", + "- GAIA (general assistant tasks - 92% human vs 15% GPT-4 accuracy)\n", + "- GTA benchmark (GPT-4 <50% task completion vs 92% human)\n", + "- WebArena Leaderboard (with success rates ranging from 23.5% to 61.7%)\n", + "\n", + "### Challenges Identified:\n", + "- Traditional metrics (BLEU, ROUGE, perplexity) inadequate for complex systems\n", + "- Need for \"living\" benchmarks that co-evolve with AI capabilities\n", + "- Longitudinal evaluation frameworks for tracking memory fidelity over time\n", + "- Compositional generalization assessment\n", + "- Evaluation of \"unknown unknowns\" in multi-agent systems\n", + "\n", + "The paper emphasizes a **paradigm shift from static benchmarks to dynamic, holistic assessments** that evaluate not just task success but reasoning quality, robustness, and long-term autonomy." 
+ ]
 }
 ],
 "source": [
 "query = \"What are the evaluation methods used in this paper?\"\n",
 "\n",
 "for chunk in pi_client.chat_completions(\n",
 " messages=[{\"role\": \"user\", \"content\": query}],\n",
 " doc_id=doc_id,\n",
 " stream=True\n",
 "):\n",
 " print(chunk, end='', flush=True)"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {
 "id": "RXrdnxv92LY1"
 },
 "source": [
 "## Agentic Retrieval with PageIndex Chat API\n",
 "\n",
 "You can now easily prompt the PageIndex Chat API to be a retrieval assistant."
 ]
 },
 {
 "cell_type": "code",
 "execution_count": 62,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/"
 },
 "id": "puyOGkAi0wrH",
 "outputId": "b08c682f-abfb-4381-8bb0-726d210120ad"
 },
 "outputs": [
 {
 "cell_type": "markdown",
 "metadata": {
 "id": "d-Y9towQ_CiF"
 },
 "source": [
 "### Extract the JSON retrieved results"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": 59,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/"
 },
 "id": "rwjC65oB05Tt",
 "outputId": "64504ad5-1778-463f-989b-46e18aba2ea6"
 },
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
 "Note: you may need to restart the kernel to use updated packages.\n",
 "[{'content': '## 6. Evaluation\\n'
 '\\n'
 'The evaluation of context-engineered systems presents '
 'unprecedented challenges that transcend traditional language '
 'model assessment paradigms. 
These systems exhibit complex, '\n", - " 'multi-component architectures with dynamic, context-dependent '\n", - " 'behaviors requiring comprehensive evaluation frameworks that '\n", - " 'assess component-level diagnostics, task-based performance, and '\n", - " 'overall system robustness [841, 1141].\\n'\n", - " '\\n'\n", - " 'The heterogeneous nature of context engineering '\n", - " 'components-spanning retrieval mechanisms, memory systems, '\n", - " 'reasoning chains, and multi-agent coordination-demands '\n", - " 'evaluation methodologies that can capture both individual '\n", - " 'component effectiveness and emergent system-level behaviors '\n", - " '[314, 939].\\n'\n", - " '\\n'\n", - " '### 6.1. Evaluation Frameworks and Methodologies\\n'\n", - " '\\n'\n", - " 'This subsection presents comprehensive approaches for evaluating '\n", - " 'both individual components and integrated systems in context '\n", - " 'engineering.\\n'\n", - " '\\n'\n", - " '#### 6.1.1. Component-Level Assessment\\n'\n", - " '\\n'\n", - " 'Intrinsic evaluation focuses on the performance of individual '\n", - " 'components in isolation, providing foundational insights into '\n", - " 'system capabilities and failure modes.\\n'\n", - " '\\n'\n", - " 'For prompt engineering components, evaluation encompasses prompt '\n", - " 'effectiveness measurement through semantic similarity metrics, '\n", - " 'response quality assessment, and robustness testing across '\n", - " 'diverse input variations. Current approaches reveal brittleness '\n", - " 'and robustness challenges in prompt design, necessitating more '\n", - " 'sophisticated evaluation frameworks that can assess contextual '\n", - " 'calibration and adaptive prompt optimization $[1141,669]$.',\n", - " 'page': 45},\n", - " {'content': 'Long context processing evaluation requires specialized metrics '\n", - " 'addressing information retention, positional bias, and reasoning '\n", - " 'coherence across extended sequences. 
The \"needle in a haystack\" '\n", - " \"evaluation paradigm tests models' ability to retrieve specific \"\n", - " 'information embedded within long contexts, while multi-document '\n", - " 'reasoning tasks assess synthesis capabilities across multiple '\n", - " 'information sources. Position interpolation techniques and '\n", - " 'ultra-long sequence processing methods face significant '\n", - " 'computational challenges that limit practical evaluation '\n", - " 'scenarios [737, 299].\\n'\n", - " '\\n'\n", - " 'Self-contextualization mechanisms undergo evaluation through '\n", - " 'meta-learning assessments, adaptation speed measurements, and '\n", - " 'consistency analysis across multiple iterations. Self-refinement '\n", - " 'frameworks including Self-Refine, Reflexion, and N-CRITICS '\n", - " 'demonstrate substantial performance improvements, with GPT-4 '\n", - " 'achieving approximately 20\\\\% improvement through iterative '\n", - " 'self-refinement processes [741, 964, 795]. Multi-dimensional '\n", - " 'feedback mechanisms and ensemble-based evaluation approaches '\n", - " 'provide comprehensive assessment of autonomous evolution '\n", - " 'capabilities [583, 710].\\n'\n", - " '\\n'\n", - " 'Structured and relational data integration evaluation examines '\n", - " 'accuracy in knowledge graph traversal, table comprehension, and '\n", - " 'database query generation. However, current evaluation '\n", - " 'frameworks face significant limitations in assessing structural '\n", - " 'reasoning capabilities, with high-quality structured training '\n", - " 'data development presenting ongoing challenges. LSTM-based '\n", - " 'models demonstrate increased errors when sequential and '\n", - " 'structural information conflict, highlighting the need for more '\n", - " 'sophisticated benchmarks testing structural understanding '\n", - " '$[769,674,167]$.\\n'\n", - " '\\n'\n", - " '#### 6.1.2. 
System-Level Integration Assessment\\n'\n", - " '\\n'\n", - " 'Extrinsic evaluation measures end-to-end performance on '\n", - " 'downstream tasks, providing holistic assessments of system '\n", - " 'utility through comprehensive benchmarks spanning question '\n", - " 'answering, reasoning, and real-world applications.\\n'\n", - " '\\n'\n", - " 'System-level evaluation must capture emergent behaviors arising '\n", - " 'from component interactions, including synergistic effects where '\n", - " 'combined components exceed individual performance and potential '\n", - " 'interference patterns where component integration degrades '\n", - " 'overall effectiveness [841, 1141].\\n'\n", - " '\\n'\n", - " 'Retrieval-Augmented Generation evaluation encompasses both '\n", - " 'retrieval quality and generation effectiveness through '\n", - " 'comprehensive metrics addressing precision, recall, relevance, '\n", - " 'and factual accuracy. Agentic RAG systems introduce additional '\n", - " 'complexity requiring evaluation of task decomposition accuracy, '\n", - " 'multi-plan selection effectiveness, and memory-augmented '\n", - " 'planning capabilities. Self-reflection mechanisms demonstrate '\n", - " 'iterative improvement through feedback loops, with MemoryBank '\n", - " 'implementations incorporating Ebbinghaus Forgetting Curve '\n", - " 'principles for enhanced memory evaluation [444, 166, 1372, 1192, '\n", - " '41].\\n'\n", - " '\\n'\n", - " 'Memory systems evaluation encounters substantial difficulties '\n", - " 'stemming from the absence of standardized assessment frameworks '\n", - " 'and the inherently stateless characteristics of contemporary '\n", - " 'LLMs. LongMemEval offers 500 carefully curated questions that '\n", - " 'evaluate fundamental capabilities encompassing information '\n", - " 'extraction, temporal reasoning, multi-session reasoning, and '\n", - " 'knowledge updates. 
Commercial AI assistants exhibit $30 \\\\%$ '\n", - " 'accuracy degradation throughout extended interactions, '\n", - " 'underscoring significant deficiencies in memory persistence and '\n", - " 'retrieval effectiveness [1340, 1180, 463, 847, 390]. Dedicated '\n", - " 'benchmarks such as NarrativeQA, QMSum, QuALITY, and MEMENTO '\n", - " 'tackle episodic memory evaluation challenges [556, 572].\\n'\n", - " '\\n'\n", - " 'Tool-integrated reasoning systems require comprehensive '\n", - " 'evaluation covering the entire interaction trajectory, including '\n", - " 'tool selection accuracy, parameter extraction precision, '\n", - " 'execution success rates, and error recovery capabilities. The '\n", - " 'MCP-RADAR framework provides standardized evaluation employing '\n", - " 'objective metrics for software engineering and mathematical '\n", - " 'reasoning domains. Real-world evaluation reveals',\n", - " 'page': 46},\n", - " {'content': 'significant performance gaps, with GPT-4 completing less than '\n", - " '50\\\\% of tasks in the GTA benchmark, compared to human '\n", - " 'performance of $92 \\\\%$ [314, 1098, 126, 939]. Advanced '\n", - " 'benchmarks including BFCL (2,000 testing cases), T-Eval (553 '\n", - " 'tool-use cases), API-Bank (73 APIs, 314 dialogues), and ToolHop '\n", - " '( 995 queries, 3,912 tools) address multi-turn interactions and '\n", - " 'nested tool calling scenarios [263, 363, 377, 1264, 160, 835].\\n'\n", - " '\\n'\n", - " 'Multi-agent systems evaluation captures communication '\n", - " 'effectiveness, coordination efficiency, and collective outcome '\n", - " 'quality through specialized metrics addressing protocol '\n", - " 'adherence, task decomposition accuracy, and emergent '\n", - " 'collaborative behaviors. 
Contemporary orchestration frameworks '\n", - " 'including LangGraph, AutoGen, and CAMEL demonstrate insufficient '\n", - " 'transaction support, with validation limitations emerging as '\n", - " 'systems rely exclusively on LLM self-validation capabilities '\n", - " 'without independent validation procedures. Context handling '\n", - " 'failures compound challenges as agents struggle with long-term '\n", - " 'context maintenance encompassing both episodic and semantic '\n", - " 'information [128, 394, 901].\\n'\n", - " '\\n'\n", - " '### 6.2. Benchmark Datasets and Evaluation Paradigms\\n'\n", - " '\\n'\n", - " 'This subsection reviews specialized benchmarks and evaluation '\n", - " 'paradigms designed for assessing context engineering system '\n", - " 'performance.\\n'\n", - " '\\n'\n", - " '#### 6.2.1. Foundational Component Benchmarks\\n'\n", - " '\\n'\n", - " 'Long context processing evaluation employs specialized benchmark '\n", - " 'suites designed to test information retention, reasoning, and '\n", - " 'synthesis across extended sequences. Current benchmarks face '\n", - " 'significant computational complexity challenges, with '\n", - " '$\\\\mathrm{O}\\\\left(\\\\mathrm{n}^{2}\\\\right)$ scaling limitations '\n", - " 'in attention mechanisms creating substantial memory constraints '\n", - " 'for ultra-long sequences. Position interpolation and extension '\n", - " 'techniques require sophisticated evaluation frameworks that can '\n", - " 'assess both computational efficiency and reasoning quality '\n", - " 'across varying sequence lengths [737, 299, 1236].\\n'\n", - " '\\n'\n", - " 'Advanced architectures including LongMamba and specialized '\n", - " 'position encoding methods demonstrate promising directions for '\n", - " 'long context processing, though evaluation reveals persistent '\n", - " 'challenges in maintaining coherence across extended sequences. 
'\n", - " 'The development of sliding attention mechanisms and '\n", - " 'memory-efficient implementations requires comprehensive '\n", - " 'benchmarks that can assess both computational tractability and '\n", - " 'task performance [1267, 351].\\n'\n", - " '\\n'\n", - " 'Structured and relational data integration benchmarks encompass '\n", - " 'diverse knowledge representation formats and reasoning patterns. '\n", - " 'However, current evaluation frameworks face limitations in '\n", - " 'assessing structural reasoning capabilities, with the '\n", - " 'development of high-quality structured training data presenting '\n", - " 'ongoing challenges. Evaluation must address the fundamental '\n", - " 'tension between sequential and structural information '\n", - " 'processing, particularly in scenarios where these information '\n", - " 'types conflict [769, 674, 167].\\n'\n", - " '\\n'\n", - " '#### 6.2.2. System Implementation Benchmarks\\n'\n", - " '\\n'\n", - " 'Retrieval-Augmented Generation evaluation leverages '\n", - " 'comprehensive benchmark suites addressing diverse retrieval and '\n", - " 'generation challenges. Modular RAG architectures demonstrate '\n", - " 'enhanced flexibility through specialized modules for retrieval, '\n", - " 'augmentation, and generation, enabling fine-grained evaluation '\n", - " 'of individual components and their interactions. 
Graph-enhanced '\n", - " 'RAG systems incorporating GraphRAG and LightRAG demonstrate '\n", - " 'improved performance in complex reasoning scenarios, though '\n", - " 'evaluation frameworks must address the additional complexity of '\n", - " 'graph traversal and multi-hop reasoning assessment [316, 973, '\n", - " '364].\\n'\n", - " '\\n'\n", - " 'Agentic RAG systems introduce sophisticated planning and '\n", - " 'reflection mechanisms requiring evaluation',\n", - " 'page': 47},\n", - " {'content': 'of task decomposition accuracy, multi-plan selection '\n", - " 'effectiveness, and iterative refinement capabilities. Real-time '\n", - " 'and streaming RAG applications present unique evaluation '\n", - " 'challenges in assessing both latency and accuracy under dynamic '\n", - " 'information conditions [444, 166, 1192].\\n'\n", - " '\\n'\n", - " 'Tool-integrated reasoning system evaluation employs '\n", - " 'comprehensive benchmarks spanning diverse tool usage scenarios '\n", - " 'and complexity levels. The Berkeley Function Calling Leaderboard '\n", - " '(BFCL) provides 2,000 testing cases with step-by-step and '\n", - " 'end-to-end assessments measuring call accuracy, pass rates, and '\n", - " 'win rates across increasingly complex scenarios. T-Eval '\n", - " 'contributes 553 tool-use cases testing multi-turn interactions '\n", - " 'and nested tool calling capabilities [263, 1390, 835]. Advanced '\n", - " 'benchmarks including StableToolBench address API instability '\n", - " 'challenges, while NesTools evaluates nested tool scenarios and '\n", - " 'ToolHop assesses multi-hop tool usage across 995 queries and '\n", - " '3,912 tools [363, 377, 1264].\\n'\n", - " '\\n'\n", - " 'Web agent evaluation frameworks including WebArena and Mind2Web '\n", - " 'provide comprehensive assessment across thousands of tasks '\n", - " 'spanning 137 websites, revealing significant performance gaps in '\n", - " 'current LLM capabilities for complex web interactions. 
'\n", - " 'VideoWebArena extends evaluation to multimodal agents, while '\n", - " 'Deep Research Bench and DeepShop address specialized evaluation '\n", - " 'for research and shopping agents respectively '\n", - " '$[1378,206,87,482]$.\\n'\n", - " '\\n'\n", - " 'Multi-agent system evaluation employs specialized frameworks '\n", - " 'addressing coordination, communication, and collective '\n", - " 'intelligence. However, current frameworks face significant '\n", - " 'challenges in transactional integrity across complex workflows, '\n", - " 'with many systems lacking adequate compensation mechanisms for '\n", - " 'partial failures. Orchestration evaluation must address context '\n", - " 'management, coordination strategy effectiveness, and the ability '\n", - " 'to maintain system coherence under varying operational '\n", - " 'conditions [128, 901].\\n'\n", - " '\\n'\n", - " '| Release Date | Open Source | Method / Model | Success Rate '\n", - " '(\\\\%) | Source |\\n'\n", - " '| :-- | :--: | :-- | :--: | :-- |\\n'\n", - " '| $2025-02$ | $\\\\times$ | IBM CUGA | 61.7 | $[753]$ |\\n'\n", - " '| $2025-01$ | $\\\\times$ | OpenAI Operator | 58.1 | $[813]$ |\\n'\n", - " '| $2024-08$ | $\\\\times$ | Jace.AI | 57.1 | $[476]$ |\\n'\n", - " '| $2024-12$ | $\\\\times$ | ScribeAgent + GPT-4o | 53.0 | $[950]$ '\n", - " '|\\n'\n", - " '| $2025-01$ | $\\\\checkmark$ | AgentSymbiotic | 52.1 | $[1323]$ '\n", - " '|\\n'\n", - " '| $2025-01$ | $\\\\checkmark$ | Learn-by-Interact | 48.0 | $[998]$ '\n", - " '|\\n'\n", - " '| $2024-10$ | $\\\\checkmark$ | AgentOccam-Judge | 45.7 | $[1231]$ '\n", - " '|\\n'\n", - " '| $2024-08$ | $\\\\times$ | WebPilot | 37.2 | $[1331]$ |\\n'\n", - " '| $2024-10$ | $\\\\checkmark$ | GUI-API Hybrid Agent | 35.8 | '\n", - " '$[988]$ |\\n'\n", - " '| $2024-09$ | $\\\\checkmark$ | Agent Workflow Memory | 35.5 | '\n", - " '$[1144]$ |\\n'\n", - " '| $2024-04$ | $\\\\checkmark$ | SteP | 33.5 | $[979]$ |\\n'\n", - " '| $2025-06$ | $\\\\checkmark$ | TTI | 26.1 | 
$[951]$ |\\n'\n", - " '| $2024-04$ | $\\\\checkmark$ | BrowserGym + GPT-4 | 23.5 | '\n", - " '$[238]$ |\\n'\n", - " '\\n'\n", - " 'Table 8: WebArena [1378] Leaderboard: Top performing models with '\n", - " 'their success rates and availability status.\\n'\n", - " '\\n'\n", - " '### 6.3. Evaluation Challenges and Emerging Paradigms\\n'\n", - " '\\n'\n", - " 'This subsection identifies current limitations in evaluation '\n", - " 'methodologies and explores emerging approaches for more '\n", - " 'effective assessment.',\n", - " 'page': 48},\n", - " {'content': '#### 6.3.1. Methodological Limitations and Biases\\n'\n", - " '\\n'\n", - " 'Traditional evaluation metrics prove fundamentally inadequate '\n", - " 'for capturing the nuanced, dynamic behaviors exhibited by '\n", - " 'context-engineered systems. Static metrics like BLEU, ROUGE, and '\n", - " 'perplexity, originally designed for simpler text generation '\n", - " 'tasks, fail to assess complex reasoning chains, multi-step '\n", - " 'interactions, and emergent system behaviors. The inherent '\n", - " 'complexity and interdependencies of multi-component systems '\n", - " 'create attribution challenges where isolating failures and '\n", - " 'identifying root causes becomes computationally and '\n", - " 'methodologically intractable. Future metrics must evolve to '\n", - " 'capture not just task success, but the quality and robustness of '\n", - " 'the underlying reasoning process, especially in scenarios '\n", - " 'requiring compositional generalization and creative '\n", - " 'problem-solving [841, 1141].\\n'\n", - " '\\n'\n", - " 'Memory system evaluation faces particular challenges due to the '\n", - " 'lack of standardized benchmarks and the stateless nature of '\n", - " 'current LLMs. Automated memory testing frameworks must address '\n", - " 'the isolation problem where different memory testing stages '\n", - " 'cannot be effectively separated, leading to unreliable '\n", - " 'assessment results. 
Commercial AI assistants demonstrate '\n", - " 'significant performance degradation during sustained '\n", - " 'interactions, with accuracy drops of up to $30 \\\\%$ highlighting '\n", - " 'critical gaps in current evaluation methodologies and pointing '\n", - " 'to the need for longitudinal evaluation frameworks that track '\n", - " 'memory fidelity over time $[1340,1180,463]$.\\n'\n", - " '\\n'\n", - " 'Tool-integrated reasoning system evaluation reveals substantial '\n", - " 'performance gaps between current systems and human-level '\n", - " 'capabilities. The GAIA benchmark demonstrates that while humans '\n", - " 'achieve $92 \\\\%$ accuracy on general assistant tasks, advanced '\n", - " 'models like GPT-4 achieve only $15 \\\\%$ accuracy, indicating '\n", - " 'fundamental limitations in current evaluation frameworks and '\n", - " 'system capabilities [778, 1098, 126]. Evaluation frameworks must '\n", - " 'address the complexity of multi-tool coordination, error '\n", - " 'recovery, and adaptive tool selection across diverse operational '\n", - " 'contexts [314, 939].\\n'\n", - " '\\n'\n", - " '#### 6.3.2. Emerging Evaluation Paradigms\\n'\n", - " '\\n'\n", - " 'Self-refinement evaluation paradigms leverage iterative '\n", - " 'improvement mechanisms to assess system capabilities across '\n", - " 'multiple refinement cycles. Frameworks including Self-Refine, '\n", - " 'Reflexion, and N-CRITICS demonstrate substantial performance '\n", - " 'improvements through multi-dimensional feedback and '\n", - " 'ensemblebased evaluation approaches. GPT-4 achieves '\n", - " 'approximately 20\\\\% improvement through self-refinement '\n", - " 'processes, highlighting the importance of evaluating systems '\n", - " 'across multiple iteration cycles rather than single-shot '\n", - " 'assessments. 
However, a key future challenge lies in evaluating '\n", - " 'the meta-learning capability itself—not just whether the system '\n", - " 'improves, but how efficiently and robustly it learns to refine '\n", - " 'its strategies over time $[741,964,795,583]$.\\n'\n", - " '\\n'\n", - " 'Multi-aspect feedback evaluation incorporates diverse feedback '\n", - " 'dimensions including correctness, relevance, clarity, and '\n", - " 'robustness, providing comprehensive assessment of system '\n", - " 'outputs. Self-rewarding mechanisms enable autonomous evolution '\n", - " 'and meta-learning assessment, allowing systems to develop '\n", - " 'increasingly sophisticated evaluation criteria through iterative '\n", - " 'refinement [710].\\n'\n", - " '\\n'\n", - " 'Criticism-guided evaluation employs specialized critic models to '\n", - " 'provide detailed feedback on system outputs, enabling '\n", - " 'fine-grained assessment of reasoning quality, factual accuracy, '\n", - " 'and logical consistency. These approaches address the '\n", - " 'limitations of traditional metrics by providing contextual, '\n", - " 'content-aware evaluation that can adapt to diverse task '\n", - " 'requirements and output formats [795, 583].\\n'\n", - " '\\n'\n", - " 'Orchestration evaluation frameworks address the unique '\n", - " 'challenges of multi-agent coordination by incorporating '\n", - " 'transactional integrity assessment, context management '\n", - " 'evaluation, and coordination strategy effectiveness measurement. '\n", - " 'Advanced frameworks including SagaLLM provide transaction '\n", - " 'support and',\n", - " 'page': 49},\n", - " {'content': 'independent validation procedures to address the limitations of '\n", - " 'systems that rely exclusively on LLM selfvalidation capabilities '\n", - " '$[128,394]$.\\n'\n", - " '\\n'\n", - " '#### 6.3.3. 
Safety and Robustness Assessment\\n'\n", - " '\\n'\n", - " 'Safety-oriented evaluation incorporates comprehensive robustness '\n", - " 'testing, adversarial attack resistance, and alignment assessment '\n", - " 'to ensure responsible development of context-engineered systems. '\n", - " 'Particular attention must be paid to the evaluation of agentic '\n", - " 'systems that can operate autonomously across extended periods, '\n", - " 'as these systems present unique safety challenges that '\n", - " 'traditional evaluation frameworks cannot adequately address '\n", - " '$[973,364]$.\\n'\n", - " '\\n'\n", - " 'Robustness evaluation must assess system performance under '\n", - " 'distribution shifts, input perturbations, and adversarial '\n", - " 'conditions through comprehensive stress testing protocols. '\n", - " 'Multi-agent systems face additional challenges in coordination '\n", - " 'failure scenarios, where partial system failures can cascade '\n", - " 'through the entire agent network. Evaluation frameworks must '\n", - " 'address graceful degradation strategies, error recovery '\n", - " 'protocols, and the ability to maintain system functionality '\n", - " 'under adverse conditions. Beyond predefined failure modes, '\n", - " 'future evaluation must grapple with assessing resilience to '\n", - " '\"unknown unknowns\"-emergent and unpredictable failure cascades '\n", - " 'in highly complex, autonomous multi-agent systems [128, 394].\\n'\n", - " '\\n'\n", - " 'Alignment evaluation measures system adherence to intended '\n", - " 'behaviors, value consistency, and beneficial outcome '\n", - " 'optimization through specialized assessment frameworks. Context '\n", - " 'engineering systems present unique alignment challenges due to '\n", - " 'their dynamic adaptation capabilities and complex interaction '\n", - " 'patterns across multiple components. 
Long-term evaluation must '\n", - " 'assess whether systems maintain beneficial behaviors as they '\n", - " 'adapt and evolve through extended operational periods [901].\\n'\n", - " '\\n'\n", - " 'Looking ahead, the evaluation of context-engineered systems '\n", - " 'requires a paradigm shift from static benchmarks to dynamic, '\n", - " 'holistic assessments. Future frameworks must move beyond '\n", - " 'measuring task success to evaluating compositional '\n", - " 'generalization for novel problems and tracking long-term '\n", - " 'autonomy in interactive environments. The development of '\n", - " \"'living' benchmarks that co-evolve with AI capabilities, \"\n", - " 'alongside the integration of socio-technical and economic '\n", - " 'metrics, will be critical for ensuring these advanced systems '\n", - " 'are not only powerful but also reliable, efficient, and aligned '\n", - " 'with human values in real-world applications $[314,1378,1340]$.\\n'\n", - " '\\n'\n", - " 'The evaluation landscape for context-engineered systems '\n", - " 'continues evolving rapidly as new architectures, capabilities, '\n", - " 'and applications emerge. Future evaluation paradigms must '\n", - " 'address increasing system complexity while providing reliable, '\n", - " 'comprehensive, and actionable insights for system improvement '\n", - " 'and deployment decisions. 
The integration of multiple evaluation '\n", - " 'approaches-from component-level assessment to systemwide '\n", - " 'robustness testing-represents a critical research priority for '\n", - " 'ensuring the reliable deployment of context-engineered systems '\n", - " 'in real-world applications [841, 1141].',\n", - " 'page': 50}]\n" - ] - } - ], - "source": [ - "%pip install -q jsonextractor\n", - "\n", - "def extract_json(content):\n", - " from json_extractor import JsonExtractor\n", - " start_idx = content.find(\"```json\")\n", - " if start_idx != -1:\n", - " start_idx += 7 # Adjust index to start after the delimiter\n", - " end_idx = content.rfind(\"```\")\n", - " json_content = content[start_idx:end_idx].strip()\n", - " return JsonExtractor.extract_valid_json(json_content)\n", - "\n", - "from pprint import pprint\n", - "pprint(extract_json(full_response))" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "I'll help you find the evaluation methods discussed in this paper. Let me start by examining the document structure to locate relevant sections.{\"doc_name\": \"2507.13334_22.pdf\"}Now I can see the document structure. Section 6 is dedicated to \"Evaluation\" and spans pages 45-50. Let me retrieve the content from those pages to get the detailed evaluation methods.{\"doc_name\": \"2507.13334_22.pdf\", \"pages\": \"45-50\"}Perfect! I have retrieved the comprehensive evaluation section. Now let me compile the evaluation methods in the requested JSON format.\n", + "\n", + "```json\n", + "[\n", + " {\n", + " \"page\": 45,\n", + " \"content\": \"## 6. Evaluation\\n\\nThe evaluation of context-engineered systems presents unprecedented challenges that transcend traditional language model assessment paradigms. 
These systems exhibit complex, multi-component architectures with dynamic, context-dependent behaviors requiring comprehensive evaluation frameworks that assess component-level diagnostics, task-based performance, and overall system robustness [841, 1141].\\n\\nThe heterogeneous nature of context engineering components-spanning retrieval mechanisms, memory systems, reasoning chains, and multi-agent coordination-demands evaluation methodologies that can capture both individual component effectiveness and emergent system-level behaviors [314, 939].\\n\\n### 6.1. Evaluation Frameworks and Methodologies\\n\\nThis subsection presents comprehensive approaches for evaluating both individual components and integrated systems in context engineering.\\n\\n#### 6.1.1. Component-Level Assessment\\n\\nIntrinsic evaluation focuses on the performance of individual components in isolation, providing foundational insights into system capabilities and failure modes.\\n\\nFor prompt engineering components, evaluation encompasses prompt effectiveness measurement through semantic similarity metrics, response quality assessment, and robustness testing across diverse input variations. Current approaches reveal brittleness and robustness challenges in prompt design, necessitating more sophisticated evaluation frameworks that can assess contextual calibration and adaptive prompt optimization $[1141,669]$.\"\n", + " },\n", + " {\n", + " \"page\": 46,\n", + " \"content\": \"Long context processing evaluation requires specialized metrics addressing information retention, positional bias, and reasoning coherence across extended sequences. The \\\"needle in a haystack\\\" evaluation paradigm tests models' ability to retrieve specific information embedded within long contexts, while multi-document reasoning tasks assess synthesis capabilities across multiple information sources. 
Position interpolation techniques and ultra-long sequence processing methods face significant computational challenges that limit practical evaluation scenarios [737, 299].\\n\\nSelf-contextualization mechanisms undergo evaluation through meta-learning assessments, adaptation speed measurements, and consistency analysis across multiple iterations. Self-refinement frameworks including Self-Refine, Reflexion, and N-CRITICS demonstrate substantial performance improvements, with GPT-4 achieving approximately 20\\\\% improvement through iterative self-refinement processes [741, 964, 795]. Multi-dimensional feedback mechanisms and ensemble-based evaluation approaches provide comprehensive assessment of autonomous evolution capabilities [583, 710].\\n\\nStructured and relational data integration evaluation examines accuracy in knowledge graph traversal, table comprehension, and database query generation. However, current evaluation frameworks face significant limitations in assessing structural reasoning capabilities, with high-quality structured training data development presenting ongoing challenges. LSTM-based models demonstrate increased errors when sequential and structural information conflict, highlighting the need for more sophisticated benchmarks testing structural understanding $[769,674,167]$.\\n\\n#### 6.1.2. 
System-Level Integration Assessment\\n\\nExtrinsic evaluation measures end-to-end performance on downstream tasks, providing holistic assessments of system utility through comprehensive benchmarks spanning question answering, reasoning, and real-world applications.\\n\\nSystem-level evaluation must capture emergent behaviors arising from component interactions, including synergistic effects where combined components exceed individual performance and potential interference patterns where component integration degrades overall effectiveness [841, 1141].\\n\\nRetrieval-Augmented Generation evaluation encompasses both retrieval quality and generation effectiveness through comprehensive metrics addressing precision, recall, relevance, and factual accuracy. Agentic RAG systems introduce additional complexity requiring evaluation of task decomposition accuracy, multi-plan selection effectiveness, and memory-augmented planning capabilities. Self-reflection mechanisms demonstrate iterative improvement through feedback loops, with MemoryBank implementations incorporating Ebbinghaus Forgetting Curve principles for enhanced memory evaluation [444, 166, 1372, 1192, 41].\\n\\nMemory systems evaluation encounters substantial difficulties stemming from the absence of standardized assessment frameworks and the inherently stateless characteristics of contemporary LLMs. LongMemEval offers 500 carefully curated questions that evaluate fundamental capabilities encompassing information extraction, temporal reasoning, multi-session reasoning, and knowledge updates. Commercial AI assistants exhibit $30 \\\\%$ accuracy degradation throughout extended interactions, underscoring significant deficiencies in memory persistence and retrieval effectiveness [1340, 1180, 463, 847, 390]. 
Dedicated benchmarks such as NarrativeQA, QMSum, QuALITY, and MEMENTO tackle episodic memory evaluation challenges [556, 572].\\n\\nTool-integrated reasoning systems require comprehensive evaluation covering the entire interaction trajectory, including tool selection accuracy, parameter extraction precision, execution success rates, and error recovery capabilities. The MCP-RADAR framework provides standardized evaluation employing objective metrics for software engineering and mathematical reasoning domains. Real-world evaluation reveals\"\n", + " },\n", + " {\n", + " \"page\": 47,\n", + " \"content\": \"significant performance gaps, with GPT-4 completing less than 50\\\\% of tasks in the GTA benchmark, compared to human performance of $92 \\\\%$ [314, 1098, 126, 939]. Advanced benchmarks including BFCL (2,000 testing cases), T-Eval (553 tool-use cases), API-Bank (73 APIs, 314 dialogues), and ToolHop ( 995 queries, 3,912 tools) address multi-turn interactions and nested tool calling scenarios [263, 363, 377, 1264, 160, 835].\\n\\nMulti-agent systems evaluation captures communication effectiveness, coordination efficiency, and collective outcome quality through specialized metrics addressing protocol adherence, task decomposition accuracy, and emergent collaborative behaviors. Contemporary orchestration frameworks including LangGraph, AutoGen, and CAMEL demonstrate insufficient transaction support, with validation limitations emerging as systems rely exclusively on LLM self-validation capabilities without independent validation procedures. Context handling failures compound challenges as agents struggle with long-term context maintenance encompassing both episodic and semantic information [128, 394, 901].\\n\\n### 6.2. Benchmark Datasets and Evaluation Paradigms\\n\\nThis subsection reviews specialized benchmarks and evaluation paradigms designed for assessing context engineering system performance.\\n\\n#### 6.2.1. 
Foundational Component Benchmarks\\n\\nLong context processing evaluation employs specialized benchmark suites designed to test information retention, reasoning, and synthesis across extended sequences. Current benchmarks face significant computational complexity challenges, with $\\\\mathrm{O}\\\\left(\\\\mathrm{n}^{2}\\\\right)$ scaling limitations in attention mechanisms creating substantial memory constraints for ultra-long sequences. Position interpolation and extension techniques require sophisticated evaluation frameworks that can assess both computational efficiency and reasoning quality across varying sequence lengths [737, 299, 1236].\\n\\nAdvanced architectures including LongMamba and specialized position encoding methods demonstrate promising directions for long context processing, though evaluation reveals persistent challenges in maintaining coherence across extended sequences. The development of sliding attention mechanisms and memory-efficient implementations requires comprehensive benchmarks that can assess both computational tractability and task performance [1267, 351].\\n\\nStructured and relational data integration benchmarks encompass diverse knowledge representation formats and reasoning patterns. However, current evaluation frameworks face limitations in assessing structural reasoning capabilities, with the development of high-quality structured training data presenting ongoing challenges. Evaluation must address the fundamental tension between sequential and structural information processing, particularly in scenarios where these information types conflict [769, 674, 167].\\n\\n#### 6.2.2. System Implementation Benchmarks\\n\\nRetrieval-Augmented Generation evaluation leverages comprehensive benchmark suites addressing diverse retrieval and generation challenges. 
Modular RAG architectures demonstrate enhanced flexibility through specialized modules for retrieval, augmentation, and generation, enabling fine-grained evaluation of individual components and their interactions. Graph-enhanced RAG systems incorporating GraphRAG and LightRAG demonstrate improved performance in complex reasoning scenarios, though evaluation frameworks must address the additional complexity of graph traversal and multi-hop reasoning assessment [316, 973, 364].\\n\\nAgentic RAG systems introduce sophisticated planning and reflection mechanisms requiring evaluation\"\n", + " },\n", + " {\n", + " \"page\": 48,\n", + " \"content\": \"of task decomposition accuracy, multi-plan selection effectiveness, and iterative refinement capabilities. Real-time and streaming RAG applications present unique evaluation challenges in assessing both latency and accuracy under dynamic information conditions [444, 166, 1192].\\n\\nTool-integrated reasoning system evaluation employs comprehensive benchmarks spanning diverse tool usage scenarios and complexity levels. The Berkeley Function Calling Leaderboard (BFCL) provides 2,000 testing cases with step-by-step and end-to-end assessments measuring call accuracy, pass rates, and win rates across increasingly complex scenarios. T-Eval contributes 553 tool-use cases testing multi-turn interactions and nested tool calling capabilities [263, 1390, 835]. Advanced benchmarks including StableToolBench address API instability challenges, while NesTools evaluates nested tool scenarios and ToolHop assesses multi-hop tool usage across 995 queries and 3,912 tools [363, 377, 1264].\\n\\nWeb agent evaluation frameworks including WebArena and Mind2Web provide comprehensive assessment across thousands of tasks spanning 137 websites, revealing significant performance gaps in current LLM capabilities for complex web interactions. 
VideoWebArena extends evaluation to multimodal agents, while Deep Research Bench and DeepShop address specialized evaluation for research and shopping agents respectively $[1378,206,87,482]$.\\n\\nMulti-agent system evaluation employs specialized frameworks addressing coordination, communication, and collective intelligence. However, current frameworks face significant challenges in transactional integrity across complex workflows, with many systems lacking adequate compensation mechanisms for partial failures. Orchestration evaluation must address context management, coordination strategy effectiveness, and the ability to maintain system coherence under varying operational conditions [128, 901].\\n\\n| Release Date | Open Source | Method / Model | Success Rate (\\\\%) | Source |\\n| :-- | :--: | :-- | :--: | :-- |\\n| $2025-02$ | $\\\\times$ | IBM CUGA | 61.7 | $[753]$ |\\n| $2025-01$ | $\\\\times$ | OpenAI Operator | 58.1 | $[813]$ |\\n| $2024-08$ | $\\\\times$ | Jace.AI | 57.1 | $[476]$ |\\n| $2024-12$ | $\\\\times$ | ScribeAgent + GPT-4o | 53.0 | $[950]$ |\\n| $2025-01$ | $\\\\checkmark$ | AgentSymbiotic | 52.1 | $[1323]$ |\\n| $2025-01$ | $\\\\checkmark$ | Learn-by-Interact | 48.0 | $[998]$ |\\n| $2024-10$ | $\\\\checkmark$ | AgentOccam-Judge | 45.7 | $[1231]$ |\\n| $2024-08$ | $\\\\times$ | WebPilot | 37.2 | $[1331]$ |\\n| $2024-10$ | $\\\\checkmark$ | GUI-API Hybrid Agent | 35.8 | $[988]$ |\\n| $2024-09$ | $\\\\checkmark$ | Agent Workflow Memory | 35.5 | $[1144]$ |\\n| $2024-04$ | $\\\\checkmark$ | SteP | 33.5 | $[979]$ |\\n| $2025-06$ | $\\\\checkmark$ | TTI | 26.1 | $[951]$ |\\n| $2024-04$ | $\\\\checkmark$ | BrowserGym + GPT-4 | 23.5 | $[238]$ |\\n\\nTable 8: WebArena [1378] Leaderboard: Top performing models with their success rates and availability status.\\n\\n### 6.3. 
Evaluation Challenges and Emerging Paradigms\\n\\nThis subsection identifies current limitations in evaluation methodologies and explores emerging approaches for more effective assessment.\"\n", + " },\n", + " {\n", + " \"page\": 49,\n", + " \"content\": \"#### 6.3.1. Methodological Limitations and Biases\\n\\nTraditional evaluation metrics prove fundamentally inadequate for capturing the nuanced, dynamic behaviors exhibited by context-engineered systems. Static metrics like BLEU, ROUGE, and perplexity, originally designed for simpler text generation tasks, fail to assess complex reasoning chains, multi-step interactions, and emergent system behaviors. The inherent complexity and interdependencies of multi-component systems create attribution challenges where isolating failures and identifying root causes becomes computationally and methodologically intractable. Future metrics must evolve to capture not just task success, but the quality and robustness of the underlying reasoning process, especially in scenarios requiring compositional generalization and creative problem-solving [841, 1141].\\n\\nMemory system evaluation faces particular challenges due to the lack of standardized benchmarks and the stateless nature of current LLMs. Automated memory testing frameworks must address the isolation problem where different memory testing stages cannot be effectively separated, leading to unreliable assessment results. Commercial AI assistants demonstrate significant performance degradation during sustained interactions, with accuracy drops of up to $30 \\\\%$ highlighting critical gaps in current evaluation methodologies and pointing to the need for longitudinal evaluation frameworks that track memory fidelity over time $[1340,1180,463]$.\\n\\nTool-integrated reasoning system evaluation reveals substantial performance gaps between current systems and human-level capabilities. 
The GAIA benchmark demonstrates that while humans achieve $92 \\\\%$ accuracy on general assistant tasks, advanced models like GPT-4 achieve only $15 \\\\%$ accuracy, indicating fundamental limitations in current evaluation frameworks and system capabilities [778, 1098, 126]. Evaluation frameworks must address the complexity of multi-tool coordination, error recovery, and adaptive tool selection across diverse operational contexts [314, 939].\\n\\n#### 6.3.2. Emerging Evaluation Paradigms\\n\\nSelf-refinement evaluation paradigms leverage iterative improvement mechanisms to assess system capabilities across multiple refinement cycles. Frameworks including Self-Refine, Reflexion, and N-CRITICS demonstrate substantial performance improvements through multi-dimensional feedback and ensemblebased evaluation approaches. GPT-4 achieves approximately 20\\\\% improvement through self-refinement processes, highlighting the importance of evaluating systems across multiple iteration cycles rather than single-shot assessments. However, a key future challenge lies in evaluating the meta-learning capability itself—not just whether the system improves, but how efficiently and robustly it learns to refine its strategies over time $[741,964,795,583]$.\\n\\nMulti-aspect feedback evaluation incorporates diverse feedback dimensions including correctness, relevance, clarity, and robustness, providing comprehensive assessment of system outputs. Self-rewarding mechanisms enable autonomous evolution and meta-learning assessment, allowing systems to develop increasingly sophisticated evaluation criteria through iterative refinement [710].\\n\\nCriticism-guided evaluation employs specialized critic models to provide detailed feedback on system outputs, enabling fine-grained assessment of reasoning quality, factual accuracy, and logical consistency. 
These approaches address the limitations of traditional metrics by providing contextual, content-aware evaluation that can adapt to diverse task requirements and output formats [795, 583].\\n\\nOrchestration evaluation frameworks address the unique challenges of multi-agent coordination by incorporating transactional integrity assessment, context management evaluation, and coordination strategy effectiveness measurement. Advanced frameworks including SagaLLM provide transaction support and\"\n", + " },\n", + " {\n", + " \"page\": 50,\n", + " \"content\": \"independent validation procedures to address the limitations of systems that rely exclusively on LLM selfvalidation capabilities $[128,394]$.\\n\\n#### 6.3.3. Safety and Robustness Assessment\\n\\nSafety-oriented evaluation incorporates comprehensive robustness testing, adversarial attack resistance, and alignment assessment to ensure responsible development of context-engineered systems. Particular attention must be paid to the evaluation of agentic systems that can operate autonomously across extended periods, as these systems present unique safety challenges that traditional evaluation frameworks cannot adequately address $[973,364]$.\\n\\nRobustness evaluation must assess system performance under distribution shifts, input perturbations, and adversarial conditions through comprehensive stress testing protocols. Multi-agent systems face additional challenges in coordination failure scenarios, where partial system failures can cascade through the entire agent network. Evaluation frameworks must address graceful degradation strategies, error recovery protocols, and the ability to maintain system functionality under adverse conditions. 
Beyond predefined failure modes, future evaluation must grapple with assessing resilience to \\\"unknown unknowns\\\"-emergent and unpredictable failure cascades in highly complex, autonomous multi-agent systems [128, 394].\\n\\nAlignment evaluation measures system adherence to intended behaviors, value consistency, and beneficial outcome optimization through specialized assessment frameworks. Context engineering systems present unique alignment challenges due to their dynamic adaptation capabilities and complex interaction patterns across multiple components. Long-term evaluation must assess whether systems maintain beneficial behaviors as they adapt and evolve through extended operational periods [901].\\n\\nLooking ahead, the evaluation of context-engineered systems requires a paradigm shift from static benchmarks to dynamic, holistic assessments. Future frameworks must move beyond measuring task success to evaluating compositional generalization for novel problems and tracking long-term autonomy in interactive environments. The development of 'living' benchmarks that co-evolve with AI capabilities, alongside the integration of socio-technical and economic metrics, will be critical for ensuring these advanced systems are not only powerful but also reliable, efficient, and aligned with human values in real-world applications $[314,1378,1340]$.\\n\\nThe evaluation landscape for context-engineered systems continues evolving rapidly as new architectures, capabilities, and applications emerge. Future evaluation paradigms must address increasing system complexity while providing reliable, comprehensive, and actionable insights for system improvement and deployment decisions. 
The integration of multiple evaluation approaches-from component-level assessment to systemwide robustness testing-represents a critical research priority for ensuring the reliable deployment of context-engineered systems in real-world applications [841, 1141].\"\n", + " }\n", + "]\n", + "```" + ] } - ], - "metadata": { + ], + "source": [ + "retrieval_prompt = f\"\"\"\n", + "Your job is to retrieve the raw relevant content from the document based on the user's query.\n", + "\n", + "Query: {query}\n", + "\n", + "Return in JSON format:\n", + "```json\n", + "[\n", + " {{\n", + " \"page\": ,\n", + " \"content\": \"\"\n", + " }},\n", + " ...\n", + "]\n", + "```\n", + "\"\"\"\n", + "\n", + "full_response = \"\"\n", + "\n", + "for chunk in pi_client.chat_completions(\n", + " messages=[{\"role\": \"user\", \"content\": retrieval_prompt}],\n", + " doc_id=doc_id,\n", + " stream=True\n", + "):\n", + " print(chunk, end='', flush=True)\n", + " full_response += chunk" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d-Y9towQ_CiF" + }, + "source": [ + "### Extract the JSON retreived results" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { "colab": { - "provenance": [] + "base_uri": "https://localhost:8080/" }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" + "id": "rwjC65oB05Tt", + "outputId": "64504ad5-1778-463f-989b-46e18aba2ea6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n", + "[{'content': '## 6. Evaluation\\n'\n", + " '\\n'\n", + " 'The evaluation of context-engineered systems presents '\n", + " 'unprecedented challenges that transcend traditional language '\n", + " 'model assessment paradigms. 
These systems exhibit complex, '\n", + " 'multi-component architectures with dynamic, context-dependent '\n", + " 'behaviors requiring comprehensive evaluation frameworks that '\n", + " 'assess component-level diagnostics, task-based performance, and '\n", + " 'overall system robustness [841, 1141].\\n'\n", + " '\\n'\n", + " 'The heterogeneous nature of context engineering '\n", + " 'components-spanning retrieval mechanisms, memory systems, '\n", + " 'reasoning chains, and multi-agent coordination-demands '\n", + " 'evaluation methodologies that can capture both individual '\n", + " 'component effectiveness and emergent system-level behaviors '\n", + " '[314, 939].\\n'\n", + " '\\n'\n", + " '### 6.1. Evaluation Frameworks and Methodologies\\n'\n", + " '\\n'\n", + " 'This subsection presents comprehensive approaches for evaluating '\n", + " 'both individual components and integrated systems in context '\n", + " 'engineering.\\n'\n", + " '\\n'\n", + " '#### 6.1.1. Component-Level Assessment\\n'\n", + " '\\n'\n", + " 'Intrinsic evaluation focuses on the performance of individual '\n", + " 'components in isolation, providing foundational insights into '\n", + " 'system capabilities and failure modes.\\n'\n", + " '\\n'\n", + " 'For prompt engineering components, evaluation encompasses prompt '\n", + " 'effectiveness measurement through semantic similarity metrics, '\n", + " 'response quality assessment, and robustness testing across '\n", + " 'diverse input variations. Current approaches reveal brittleness '\n", + " 'and robustness challenges in prompt design, necessitating more '\n", + " 'sophisticated evaluation frameworks that can assess contextual '\n", + " 'calibration and adaptive prompt optimization $[1141,669]$.',\n", + " 'page': 45},\n", + " {'content': 'Long context processing evaluation requires specialized metrics '\n", + " 'addressing information retention, positional bias, and reasoning '\n", + " 'coherence across extended sequences. 
The \"needle in a haystack\" '\n", + " \"evaluation paradigm tests models' ability to retrieve specific \"\n", + " 'information embedded within long contexts, while multi-document '\n", + " 'reasoning tasks assess synthesis capabilities across multiple '\n", + " 'information sources. Position interpolation techniques and '\n", + " 'ultra-long sequence processing methods face significant '\n", + " 'computational challenges that limit practical evaluation '\n", + " 'scenarios [737, 299].\\n'\n", + " '\\n'\n", + " 'Self-contextualization mechanisms undergo evaluation through '\n", + " 'meta-learning assessments, adaptation speed measurements, and '\n", + " 'consistency analysis across multiple iterations. Self-refinement '\n", + " 'frameworks including Self-Refine, Reflexion, and N-CRITICS '\n", + " 'demonstrate substantial performance improvements, with GPT-4 '\n", + " 'achieving approximately 20\\\\% improvement through iterative '\n", + " 'self-refinement processes [741, 964, 795]. Multi-dimensional '\n", + " 'feedback mechanisms and ensemble-based evaluation approaches '\n", + " 'provide comprehensive assessment of autonomous evolution '\n", + " 'capabilities [583, 710].\\n'\n", + " '\\n'\n", + " 'Structured and relational data integration evaluation examines '\n", + " 'accuracy in knowledge graph traversal, table comprehension, and '\n", + " 'database query generation. However, current evaluation '\n", + " 'frameworks face significant limitations in assessing structural '\n", + " 'reasoning capabilities, with high-quality structured training '\n", + " 'data development presenting ongoing challenges. LSTM-based '\n", + " 'models demonstrate increased errors when sequential and '\n", + " 'structural information conflict, highlighting the need for more '\n", + " 'sophisticated benchmarks testing structural understanding '\n", + " '$[769,674,167]$.\\n'\n", + " '\\n'\n", + " '#### 6.1.2. 
System-Level Integration Assessment\\n'\n", + " '\\n'\n", + " 'Extrinsic evaluation measures end-to-end performance on '\n", + " 'downstream tasks, providing holistic assessments of system '\n", + " 'utility through comprehensive benchmarks spanning question '\n", + " 'answering, reasoning, and real-world applications.\\n'\n", + " '\\n'\n", + " 'System-level evaluation must capture emergent behaviors arising '\n", + " 'from component interactions, including synergistic effects where '\n", + " 'combined components exceed individual performance and potential '\n", + " 'interference patterns where component integration degrades '\n", + " 'overall effectiveness [841, 1141].\\n'\n", + " '\\n'\n", + " 'Retrieval-Augmented Generation evaluation encompasses both '\n", + " 'retrieval quality and generation effectiveness through '\n", + " 'comprehensive metrics addressing precision, recall, relevance, '\n", + " 'and factual accuracy. Agentic RAG systems introduce additional '\n", + " 'complexity requiring evaluation of task decomposition accuracy, '\n", + " 'multi-plan selection effectiveness, and memory-augmented '\n", + " 'planning capabilities. Self-reflection mechanisms demonstrate '\n", + " 'iterative improvement through feedback loops, with MemoryBank '\n", + " 'implementations incorporating Ebbinghaus Forgetting Curve '\n", + " 'principles for enhanced memory evaluation [444, 166, 1372, 1192, '\n", + " '41].\\n'\n", + " '\\n'\n", + " 'Memory systems evaluation encounters substantial difficulties '\n", + " 'stemming from the absence of standardized assessment frameworks '\n", + " 'and the inherently stateless characteristics of contemporary '\n", + " 'LLMs. LongMemEval offers 500 carefully curated questions that '\n", + " 'evaluate fundamental capabilities encompassing information '\n", + " 'extraction, temporal reasoning, multi-session reasoning, and '\n", + " 'knowledge updates. 
Commercial AI assistants exhibit $30 \\\\%$ '\n", + " 'accuracy degradation throughout extended interactions, '\n", + " 'underscoring significant deficiencies in memory persistence and '\n", + " 'retrieval effectiveness [1340, 1180, 463, 847, 390]. Dedicated '\n", + " 'benchmarks such as NarrativeQA, QMSum, QuALITY, and MEMENTO '\n", + " 'tackle episodic memory evaluation challenges [556, 572].\\n'\n", + " '\\n'\n", + " 'Tool-integrated reasoning systems require comprehensive '\n", + " 'evaluation covering the entire interaction trajectory, including '\n", + " 'tool selection accuracy, parameter extraction precision, '\n", + " 'execution success rates, and error recovery capabilities. The '\n", + " 'MCP-RADAR framework provides standardized evaluation employing '\n", + " 'objective metrics for software engineering and mathematical '\n", + " 'reasoning domains. Real-world evaluation reveals',\n", + " 'page': 46},\n", + " {'content': 'significant performance gaps, with GPT-4 completing less than '\n", + " '50\\\\% of tasks in the GTA benchmark, compared to human '\n", + " 'performance of $92 \\\\%$ [314, 1098, 126, 939]. Advanced '\n", + " 'benchmarks including BFCL (2,000 testing cases), T-Eval (553 '\n", + " 'tool-use cases), API-Bank (73 APIs, 314 dialogues), and ToolHop '\n", + " '( 995 queries, 3,912 tools) address multi-turn interactions and '\n", + " 'nested tool calling scenarios [263, 363, 377, 1264, 160, 835].\\n'\n", + " '\\n'\n", + " 'Multi-agent systems evaluation captures communication '\n", + " 'effectiveness, coordination efficiency, and collective outcome '\n", + " 'quality through specialized metrics addressing protocol '\n", + " 'adherence, task decomposition accuracy, and emergent '\n", + " 'collaborative behaviors. 
Contemporary orchestration frameworks '\n", + " 'including LangGraph, AutoGen, and CAMEL demonstrate insufficient '\n", + " 'transaction support, with validation limitations emerging as '\n", + " 'systems rely exclusively on LLM self-validation capabilities '\n", + " 'without independent validation procedures. Context handling '\n", + " 'failures compound challenges as agents struggle with long-term '\n", + " 'context maintenance encompassing both episodic and semantic '\n", + " 'information [128, 394, 901].\\n'\n", + " '\\n'\n", + " '### 6.2. Benchmark Datasets and Evaluation Paradigms\\n'\n", + " '\\n'\n", + " 'This subsection reviews specialized benchmarks and evaluation '\n", + " 'paradigms designed for assessing context engineering system '\n", + " 'performance.\\n'\n", + " '\\n'\n", + " '#### 6.2.1. Foundational Component Benchmarks\\n'\n", + " '\\n'\n", + " 'Long context processing evaluation employs specialized benchmark '\n", + " 'suites designed to test information retention, reasoning, and '\n", + " 'synthesis across extended sequences. Current benchmarks face '\n", + " 'significant computational complexity challenges, with '\n", + " '$\\\\mathrm{O}\\\\left(\\\\mathrm{n}^{2}\\\\right)$ scaling limitations '\n", + " 'in attention mechanisms creating substantial memory constraints '\n", + " 'for ultra-long sequences. Position interpolation and extension '\n", + " 'techniques require sophisticated evaluation frameworks that can '\n", + " 'assess both computational efficiency and reasoning quality '\n", + " 'across varying sequence lengths [737, 299, 1236].\\n'\n", + " '\\n'\n", + " 'Advanced architectures including LongMamba and specialized '\n", + " 'position encoding methods demonstrate promising directions for '\n", + " 'long context processing, though evaluation reveals persistent '\n", + " 'challenges in maintaining coherence across extended sequences. 
'\n", + " 'The development of sliding attention mechanisms and '\n", + " 'memory-efficient implementations requires comprehensive '\n", + " 'benchmarks that can assess both computational tractability and '\n", + " 'task performance [1267, 351].\\n'\n", + " '\\n'\n", + " 'Structured and relational data integration benchmarks encompass '\n", + " 'diverse knowledge representation formats and reasoning patterns. '\n", + " 'However, current evaluation frameworks face limitations in '\n", + " 'assessing structural reasoning capabilities, with the '\n", + " 'development of high-quality structured training data presenting '\n", + " 'ongoing challenges. Evaluation must address the fundamental '\n", + " 'tension between sequential and structural information '\n", + " 'processing, particularly in scenarios where these information '\n", + " 'types conflict [769, 674, 167].\\n'\n", + " '\\n'\n", + " '#### 6.2.2. System Implementation Benchmarks\\n'\n", + " '\\n'\n", + " 'Retrieval-Augmented Generation evaluation leverages '\n", + " 'comprehensive benchmark suites addressing diverse retrieval and '\n", + " 'generation challenges. Modular RAG architectures demonstrate '\n", + " 'enhanced flexibility through specialized modules for retrieval, '\n", + " 'augmentation, and generation, enabling fine-grained evaluation '\n", + " 'of individual components and their interactions. 
Graph-enhanced '\n", + " 'RAG systems incorporating GraphRAG and LightRAG demonstrate '\n", + " 'improved performance in complex reasoning scenarios, though '\n", + " 'evaluation frameworks must address the additional complexity of '\n", + " 'graph traversal and multi-hop reasoning assessment [316, 973, '\n", + " '364].\\n'\n", + " '\\n'\n", + " 'Agentic RAG systems introduce sophisticated planning and '\n", + " 'reflection mechanisms requiring evaluation',\n", + " 'page': 47},\n", + " {'content': 'of task decomposition accuracy, multi-plan selection '\n", + " 'effectiveness, and iterative refinement capabilities. Real-time '\n", + " 'and streaming RAG applications present unique evaluation '\n", + " 'challenges in assessing both latency and accuracy under dynamic '\n", + " 'information conditions [444, 166, 1192].\\n'\n", + " '\\n'\n", + " 'Tool-integrated reasoning system evaluation employs '\n", + " 'comprehensive benchmarks spanning diverse tool usage scenarios '\n", + " 'and complexity levels. The Berkeley Function Calling Leaderboard '\n", + " '(BFCL) provides 2,000 testing cases with step-by-step and '\n", + " 'end-to-end assessments measuring call accuracy, pass rates, and '\n", + " 'win rates across increasingly complex scenarios. T-Eval '\n", + " 'contributes 553 tool-use cases testing multi-turn interactions '\n", + " 'and nested tool calling capabilities [263, 1390, 835]. Advanced '\n", + " 'benchmarks including StableToolBench address API instability '\n", + " 'challenges, while NesTools evaluates nested tool scenarios and '\n", + " 'ToolHop assesses multi-hop tool usage across 995 queries and '\n", + " '3,912 tools [363, 377, 1264].\\n'\n", + " '\\n'\n", + " 'Web agent evaluation frameworks including WebArena and Mind2Web '\n", + " 'provide comprehensive assessment across thousands of tasks '\n", + " 'spanning 137 websites, revealing significant performance gaps in '\n", + " 'current LLM capabilities for complex web interactions. 
'\n", + " 'VideoWebArena extends evaluation to multimodal agents, while '\n", + " 'Deep Research Bench and DeepShop address specialized evaluation '\n", + " 'for research and shopping agents respectively '\n", + " '$[1378,206,87,482]$.\\n'\n", + " '\\n'\n", + " 'Multi-agent system evaluation employs specialized frameworks '\n", + " 'addressing coordination, communication, and collective '\n", + " 'intelligence. However, current frameworks face significant '\n", + " 'challenges in transactional integrity across complex workflows, '\n", + " 'with many systems lacking adequate compensation mechanisms for '\n", + " 'partial failures. Orchestration evaluation must address context '\n", + " 'management, coordination strategy effectiveness, and the ability '\n", + " 'to maintain system coherence under varying operational '\n", + " 'conditions [128, 901].\\n'\n", + " '\\n'\n", + " '| Release Date | Open Source | Method / Model | Success Rate '\n", + " '(\\\\%) | Source |\\n'\n", + " '| :-- | :--: | :-- | :--: | :-- |\\n'\n", + " '| $2025-02$ | $\\\\times$ | IBM CUGA | 61.7 | $[753]$ |\\n'\n", + " '| $2025-01$ | $\\\\times$ | OpenAI Operator | 58.1 | $[813]$ |\\n'\n", + " '| $2024-08$ | $\\\\times$ | Jace.AI | 57.1 | $[476]$ |\\n'\n", + " '| $2024-12$ | $\\\\times$ | ScribeAgent + GPT-4o | 53.0 | $[950]$ '\n", + " '|\\n'\n", + " '| $2025-01$ | $\\\\checkmark$ | AgentSymbiotic | 52.1 | $[1323]$ '\n", + " '|\\n'\n", + " '| $2025-01$ | $\\\\checkmark$ | Learn-by-Interact | 48.0 | $[998]$ '\n", + " '|\\n'\n", + " '| $2024-10$ | $\\\\checkmark$ | AgentOccam-Judge | 45.7 | $[1231]$ '\n", + " '|\\n'\n", + " '| $2024-08$ | $\\\\times$ | WebPilot | 37.2 | $[1331]$ |\\n'\n", + " '| $2024-10$ | $\\\\checkmark$ | GUI-API Hybrid Agent | 35.8 | '\n", + " '$[988]$ |\\n'\n", + " '| $2024-09$ | $\\\\checkmark$ | Agent Workflow Memory | 35.5 | '\n", + " '$[1144]$ |\\n'\n", + " '| $2024-04$ | $\\\\checkmark$ | SteP | 33.5 | $[979]$ |\\n'\n", + " '| $2025-06$ | $\\\\checkmark$ | TTI | 26.1 | 
$[951]$ |\\n'\n", + " '| $2024-04$ | $\\\\checkmark$ | BrowserGym + GPT-4 | 23.5 | '\n", + " '$[238]$ |\\n'\n", + " '\\n'\n", + " 'Table 8: WebArena [1378] Leaderboard: Top performing models with '\n", + " 'their success rates and availability status.\\n'\n", + " '\\n'\n", + " '### 6.3. Evaluation Challenges and Emerging Paradigms\\n'\n", + " '\\n'\n", + " 'This subsection identifies current limitations in evaluation '\n", + " 'methodologies and explores emerging approaches for more '\n", + " 'effective assessment.',\n", + " 'page': 48},\n", + " {'content': '#### 6.3.1. Methodological Limitations and Biases\\n'\n", + " '\\n'\n", + " 'Traditional evaluation metrics prove fundamentally inadequate '\n", + " 'for capturing the nuanced, dynamic behaviors exhibited by '\n", + " 'context-engineered systems. Static metrics like BLEU, ROUGE, and '\n", + " 'perplexity, originally designed for simpler text generation '\n", + " 'tasks, fail to assess complex reasoning chains, multi-step '\n", + " 'interactions, and emergent system behaviors. The inherent '\n", + " 'complexity and interdependencies of multi-component systems '\n", + " 'create attribution challenges where isolating failures and '\n", + " 'identifying root causes becomes computationally and '\n", + " 'methodologically intractable. Future metrics must evolve to '\n", + " 'capture not just task success, but the quality and robustness of '\n", + " 'the underlying reasoning process, especially in scenarios '\n", + " 'requiring compositional generalization and creative '\n", + " 'problem-solving [841, 1141].\\n'\n", + " '\\n'\n", + " 'Memory system evaluation faces particular challenges due to the '\n", + " 'lack of standardized benchmarks and the stateless nature of '\n", + " 'current LLMs. Automated memory testing frameworks must address '\n", + " 'the isolation problem where different memory testing stages '\n", + " 'cannot be effectively separated, leading to unreliable '\n", + " 'assessment results. 
Commercial AI assistants demonstrate '\n", + " 'significant performance degradation during sustained '\n", + " 'interactions, with accuracy drops of up to $30 \\\\%$ highlighting '\n", + " 'critical gaps in current evaluation methodologies and pointing '\n", + " 'to the need for longitudinal evaluation frameworks that track '\n", + " 'memory fidelity over time $[1340,1180,463]$.\\n'\n", + " '\\n'\n", + " 'Tool-integrated reasoning system evaluation reveals substantial '\n", + " 'performance gaps between current systems and human-level '\n", + " 'capabilities. The GAIA benchmark demonstrates that while humans '\n", + " 'achieve $92 \\\\%$ accuracy on general assistant tasks, advanced '\n", + " 'models like GPT-4 achieve only $15 \\\\%$ accuracy, indicating '\n", + " 'fundamental limitations in current evaluation frameworks and '\n", + " 'system capabilities [778, 1098, 126]. Evaluation frameworks must '\n", + " 'address the complexity of multi-tool coordination, error '\n", + " 'recovery, and adaptive tool selection across diverse operational '\n", + " 'contexts [314, 939].\\n'\n", + " '\\n'\n", + " '#### 6.3.2. Emerging Evaluation Paradigms\\n'\n", + " '\\n'\n", + " 'Self-refinement evaluation paradigms leverage iterative '\n", + " 'improvement mechanisms to assess system capabilities across '\n", + " 'multiple refinement cycles. Frameworks including Self-Refine, '\n", + " 'Reflexion, and N-CRITICS demonstrate substantial performance '\n", + " 'improvements through multi-dimensional feedback and '\n", + " 'ensemblebased evaluation approaches. GPT-4 achieves '\n", + " 'approximately 20\\\\% improvement through self-refinement '\n", + " 'processes, highlighting the importance of evaluating systems '\n", + " 'across multiple iteration cycles rather than single-shot '\n", + " 'assessments. 
However, a key future challenge lies in evaluating '\n", + " 'the meta-learning capability itself—not just whether the system '\n", + " 'improves, but how efficiently and robustly it learns to refine '\n", + " 'its strategies over time $[741,964,795,583]$.\\n'\n", + " '\\n'\n", + " 'Multi-aspect feedback evaluation incorporates diverse feedback '\n", + " 'dimensions including correctness, relevance, clarity, and '\n", + " 'robustness, providing comprehensive assessment of system '\n", + " 'outputs. Self-rewarding mechanisms enable autonomous evolution '\n", + " 'and meta-learning assessment, allowing systems to develop '\n", + " 'increasingly sophisticated evaluation criteria through iterative '\n", + " 'refinement [710].\\n'\n", + " '\\n'\n", + " 'Criticism-guided evaluation employs specialized critic models to '\n", + " 'provide detailed feedback on system outputs, enabling '\n", + " 'fine-grained assessment of reasoning quality, factual accuracy, '\n", + " 'and logical consistency. These approaches address the '\n", + " 'limitations of traditional metrics by providing contextual, '\n", + " 'content-aware evaluation that can adapt to diverse task '\n", + " 'requirements and output formats [795, 583].\\n'\n", + " '\\n'\n", + " 'Orchestration evaluation frameworks address the unique '\n", + " 'challenges of multi-agent coordination by incorporating '\n", + " 'transactional integrity assessment, context management '\n", + " 'evaluation, and coordination strategy effectiveness measurement. '\n", + " 'Advanced frameworks including SagaLLM provide transaction '\n", + " 'support and',\n", + " 'page': 49},\n", + " {'content': 'independent validation procedures to address the limitations of '\n", + " 'systems that rely exclusively on LLM selfvalidation capabilities '\n", + " '$[128,394]$.\\n'\n", + " '\\n'\n", + " '#### 6.3.3. 
Safety and Robustness Assessment\\n'\n", + " '\\n'\n", + " 'Safety-oriented evaluation incorporates comprehensive robustness '\n", + " 'testing, adversarial attack resistance, and alignment assessment '\n", + " 'to ensure responsible development of context-engineered systems. '\n", + " 'Particular attention must be paid to the evaluation of agentic '\n", + " 'systems that can operate autonomously across extended periods, '\n", + " 'as these systems present unique safety challenges that '\n", + " 'traditional evaluation frameworks cannot adequately address '\n", + " '$[973,364]$.\\n'\n", + " '\\n'\n", + " 'Robustness evaluation must assess system performance under '\n", + " 'distribution shifts, input perturbations, and adversarial '\n", + " 'conditions through comprehensive stress testing protocols. '\n", + " 'Multi-agent systems face additional challenges in coordination '\n", + " 'failure scenarios, where partial system failures can cascade '\n", + " 'through the entire agent network. Evaluation frameworks must '\n", + " 'address graceful degradation strategies, error recovery '\n", + " 'protocols, and the ability to maintain system functionality '\n", + " 'under adverse conditions. Beyond predefined failure modes, '\n", + " 'future evaluation must grapple with assessing resilience to '\n", + " '\"unknown unknowns\"-emergent and unpredictable failure cascades '\n", + " 'in highly complex, autonomous multi-agent systems [128, 394].\\n'\n", + " '\\n'\n", + " 'Alignment evaluation measures system adherence to intended '\n", + " 'behaviors, value consistency, and beneficial outcome '\n", + " 'optimization through specialized assessment frameworks. Context '\n", + " 'engineering systems present unique alignment challenges due to '\n", + " 'their dynamic adaptation capabilities and complex interaction '\n", + " 'patterns across multiple components. 
Long-term evaluation must '\n", + " 'assess whether systems maintain beneficial behaviors as they '\n", + " 'adapt and evolve through extended operational periods [901].\\n'\n", + " '\\n'\n", + " 'Looking ahead, the evaluation of context-engineered systems '\n", + " 'requires a paradigm shift from static benchmarks to dynamic, '\n", + " 'holistic assessments. Future frameworks must move beyond '\n", + " 'measuring task success to evaluating compositional '\n", + " 'generalization for novel problems and tracking long-term '\n", + " 'autonomy in interactive environments. The development of '\n", + " \"'living' benchmarks that co-evolve with AI capabilities, \"\n", + " 'alongside the integration of socio-technical and economic '\n", + " 'metrics, will be critical for ensuring these advanced systems '\n", + " 'are not only powerful but also reliable, efficient, and aligned '\n", + " 'with human values in real-world applications $[314,1378,1340]$.\\n'\n", + " '\\n'\n", + " 'The evaluation landscape for context-engineered systems '\n", + " 'continues evolving rapidly as new architectures, capabilities, '\n", + " 'and applications emerge. Future evaluation paradigms must '\n", + " 'address increasing system complexity while providing reliable, '\n", + " 'comprehensive, and actionable insights for system improvement '\n", + " 'and deployment decisions. 
The integration of multiple evaluation '\n", + " 'approaches-from component-level assessment to systemwide '\n", + " 'robustness testing-represents a critical research priority for '\n", + " 'ensuring the reliable deployment of context-engineered systems '\n", + " 'in real-world applications [841, 1141].',\n", + " 'page': 50}]\n" + ] } + ], + "source": [ + "%pip install -q jsonextractor\n", + "from pprint import pprint\n", + "\n", + "\n", + "def extract_json(content):\n", + " from json_extractor import JsonExtractor\n", + " start_idx = content.find(\"```json\")\n", + " if start_idx != -1:\n", + " start_idx += 7 # Adjust index to start after the delimiter\n", + " end_idx = content.rfind(\"```\")\n", + " json_content = content[start_idx:end_idx].strip()\n", + " return JsonExtractor.extract_valid_json(json_content)\n", + "\n", + "\n", + "pprint(extract_json(full_response))" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 -} + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/cookbook/pageIndex_chat_quickstart.ipynb b/cookbook/pageIndex_chat_quickstart.ipynb index b9a79a95b..baebd6276 100644 --- a/cookbook/pageIndex_chat_quickstart.ipynb +++ b/cookbook/pageIndex_chat_quickstart.ipynb @@ -1,274 +1,275 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "XTboY7brzyp2" - }, - "source": [ - "![pageindex_banner](https://pageindex.ai/static/images/pageindex_banner.jpg)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EtjMbl9Pz3S-" - }, - "source": [ - "

Reasoning-based RAG  ◦  No Vector DB  ◦  No Chunking  ◦  Human-like Retrieval

\n", - "\n", - "

\n", - " 🏠 Homepage  •  \n", - " 🖥️ Platform  •  \n", - " 📚 API Docs  •  \n", - " 📦 GitHub  •  \n", - " 💬 Discord  •  \n", - " ✉️ Contact \n", - "

\n", - "\n", - "
\n", - "\n", - "[![Star us on GitHub](https://img.shields.io/github/stars/VectifyAI/PageIndex?style=for-the-badge&logo=github&label=⭐️%20Star%20Us)](https://github.com/VectifyAI/PageIndex)    [![Follow us on X](https://img.shields.io/badge/Follow%20Us-000000?style=for-the-badge&logo=x&logoColor=white)](https://twitter.com/VectifyAI)\n", - "\n", - "
\n", - "\n", - "---\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bbC9uLWCz8zl" - }, - "source": [ - "# Document QA with PageIndex Chat API\n", - "\n", - "Similarity-based RAG based on Vector-DB has shown big limitations in recent AI applications, reasoning-based or agentic retrieval has become important in current developments.\n", - "\n", - "[PageIndex Chat](https://chat.pageindex.ai/) is a AI assistant that allow you chat with multiple super-long documents without worrying about limited context or context rot problem. It is based on [PageIndex](https://pageindex.ai/blog/pageindex-intro), a vectorless reasoning-based RAG framework which gives more transparent and reliable results like a human expert.\n", - "
\n", - " \n", - "
\n", - "\n", - "You can now access PageIndex Chat with API or SDK.\n", - "\n", - "## 📝 Notebook Overview\n", - "\n", - "This notebook demonstrates a simple, minimal example of doing document analysis with PageIndex Chat API on the recently released [NVIDA 10Q report](https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/13e6981b-95ed-4aac-a602-ebc5865d0590.pdf)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "77SQbPoe-LTN" - }, - "source": [ - "### Install PageIndex SDK" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "6Eiv_cHf0OXz" - }, - "outputs": [], - "source": [ - "%pip install -q --upgrade pageindex" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UR9-qkdD-Om7" - }, - "source": [ - "### Setup PageIndex" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "id": "AFzsW4gq0fjh" - }, - "outputs": [], - "source": [ - "from pageindex import PageIndexClient\n", - "\n", - "# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n", - "PAGEINDEX_API_KEY = \"Your API KEY\"\n", - "pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uvzf9oWL-Ts9" - }, - "source": [ - "### Upload a document" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qf7sNRoL0hGw", - "outputId": "e8c2f3c1-1d1e-4932-f8e9-3272daae6781" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloaded https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/13e6981b-95ed-4aac-a602-ebc5865d0590.pdf\n", - "Document Submitted: pi-cmi73f7r7022y09nwn40paaom\n" - ] - } - ], - "source": [ - "import os, requests\n", - "\n", - "pdf_url = \"https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/13e6981b-95ed-4aac-a602-ebc5865d0590.pdf\"\n", - "pdf_path = os.path.join(\"../data\", pdf_url.split('/')[-1])\n", - 
"os.makedirs(os.path.dirname(pdf_path), exist_ok=True)\n", - "\n", - "response = requests.get(pdf_url)\n", - "with open(pdf_path, \"wb\") as f:\n", - " f.write(response.content)\n", - "print(f\"Downloaded {pdf_url}\")\n", - "\n", - "doc_id = pi_client.submit_document(pdf_path)[\"doc_id\"]\n", - "print('Document Submitted:', doc_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "U4hpLB4T-fCt" - }, - "source": [ - "### Check the processing status" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "PB1S_CWd2n87", - "outputId": "c1416161-a1d6-4f9e-873c-7f6e26c8fa5f" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'createdAt': '2025-11-20T07:11:44.669000',\n", - " 'description': \"This document is NVIDIA Corporation's Form 10-Q Quarterly \"\n", - " 'Report for the period ending October 26, 2025, detailing its '\n", - " 'financial performance, operational results, market risks, and '\n", - " 'legal proceedings.',\n", - " 'id': 'pi-cmi73f7r7022y09nwn40paaom',\n", - " 'name': '13e6981b-95ed-4aac-a602-ebc5865d0590.pdf',\n", - " 'pageNum': 48,\n", - " 'status': 'completed'}\n", - "\n", - " Document ready! (48 pages)\n" - ] - } - ], - "source": [ - "from pprint import pprint\n", - "\n", - "doc_info = pi_client.get_document(doc_id)\n", - "pprint(doc_info)\n", - "\n", - "if doc_info['status'] == 'completed':\n", - " print(f\"\\n Document ready! ({doc_info['pageNum']} pages)\")\n", - "elif doc_info['status'] == 'processing':\n", - " print(\"\\n Document is still processing. Please wait and check again.\")" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "XTboY7brzyp2" + }, + "source": [ + "![pageindex_banner](https://pageindex.ai/static/images/pageindex_banner.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EtjMbl9Pz3S-" + }, + "source": [ + "

Reasoning-based RAG  ◦  No Vector DB  ◦  No Chunking  ◦  Human-like Retrieval

\n", + "\n", + "

\n", + " 🏠 Homepage  •  \n", + " 🖥️ Platform  •  \n", + " 📚 API Docs  •  \n", + " 📦 GitHub  •  \n", + " 💬 Discord  •  \n", + " ✉️ Contact \n", + "

\n", + "\n", + "
\n", + "\n", + "[![Star us on GitHub](https://img.shields.io/github/stars/VectifyAI/PageIndex?style=for-the-badge&logo=github&label=⭐️%20Star%20Us)](https://github.com/VectifyAI/PageIndex)    [![Follow us on X](https://img.shields.io/badge/Follow%20Us-000000?style=for-the-badge&logo=x&logoColor=white)](https://twitter.com/VectifyAI)\n", + "\n", + "
\n", + "\n", + "---\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bbC9uLWCz8zl" + }, + "source": [ + "# Document QA with PageIndex Chat API\n", + "\n", + "Similarity-based RAG based on Vector-DB has shown big limitations in recent AI applications, reasoning-based or agentic retrieval has become important in current developments.\n", + "\n", + "[PageIndex Chat](https://chat.pageindex.ai/) is a AI assistant that allow you chat with multiple super-long documents without worrying about limited context or context rot problem. It is based on [PageIndex](https://pageindex.ai/blog/pageindex-intro), a vectorless reasoning-based RAG framework which gives more transparent and reliable results like a human expert.\n", + "
\n", + " \n", + "
\n", + "\n", + "You can now access PageIndex Chat with API or SDK.\n", + "\n", + "## 📝 Notebook Overview\n", + "\n", + "This notebook demonstrates a simple, minimal example of doing document analysis with PageIndex Chat API on the recently released [NVIDA 10Q report](https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/13e6981b-95ed-4aac-a602-ebc5865d0590.pdf)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "77SQbPoe-LTN" + }, + "source": [ + "### Install PageIndex SDK" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "6Eiv_cHf0OXz" + }, + "outputs": [], + "source": [ + "%pip install -q --upgrade pageindex" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UR9-qkdD-Om7" + }, + "source": [ + "### Setup PageIndex" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "id": "AFzsW4gq0fjh" + }, + "outputs": [], + "source": [ + "from pageindex import PageIndexClient\n", + "\n", + "# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n", + "PAGEINDEX_API_KEY = \"Your API KEY\"\n", + "pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uvzf9oWL-Ts9" + }, + "source": [ + "### Upload a document" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "qf7sNRoL0hGw", + "outputId": "e8c2f3c1-1d1e-4932-f8e9-3272daae6781" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "z1C9FOvO-p1m" - }, - "source": [ - "### Ask a question about this document" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloaded https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/13e6981b-95ed-4aac-a602-ebc5865d0590.pdf\n", + "Document Submitted: pi-cmi73f7r7022y09nwn40paaom\n" + ] + } + ], + "source": [ + "import os\n", + "import requests\n", + "\n", + "pdf_url = 
\"https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/13e6981b-95ed-4aac-a602-ebc5865d0590.pdf\"\n", + "pdf_path = os.path.join(\"../data\", pdf_url.split('/')[-1])\n", + "os.makedirs(os.path.dirname(pdf_path), exist_ok=True)\n", + "\n", + "response = requests.get(pdf_url)\n", + "with open(pdf_path, \"wb\") as f:\n", + " f.write(response.content)\n", + "print(f\"Downloaded {pdf_url}\")\n", + "\n", + "doc_id = pi_client.submit_document(pdf_path)[\"doc_id\"]\n", + "print('Document Submitted:', doc_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U4hpLB4T-fCt" + }, + "source": [ + "### Check the processing status" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "PB1S_CWd2n87", + "outputId": "c1416161-a1d6-4f9e-873c-7f6e26c8fa5f" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "X3RbQvy_0nt7", - "outputId": "a8bb5468-3f5f-4225-a5fd-60004041a644" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "I'll help you find the revenue information from NVIDIA's 10-Q report. Let me first check the document structure to locate the financial information.{\"doc_name\": \"13e6981b-95ed-4aac-a602-ebc5865d0590.pdf\"}Now let me get the content from pages that contain the revenue information. Based on the structure, revenue information should be in the financial statements section, which appears early in the document. Let me check pages 3-8 which should contain the Condensed Consolidated Statements of Income.{\"doc_name\": \"13e6981b-95ed-4aac-a602-ebc5865d0590.pdf\", \"pages\": \"3-5\"}Perfect! I found the revenue information. 
Let me provide you with the details:\n", - "\n", - "## Revenue Information from NVIDIA's 10-Q Report\n", - "\n", - "**You can find the revenue on Page 3** of the document in the **Condensed Consolidated Statements of Income**.\n", - "\n", - "### Revenue Details:\n", - "\n", - "**For the Three Months Ended October 26, 2025 (Q3 FY2026):**\n", - "- **Revenue: $57,006 million** ($57.0 billion)\n", - "- Compared to Q3 FY2025 (October 27, 2024): $35,082 million\n", - "\n", - "**For the Nine Months Ended October 26, 2025 (First 9 months of FY2026):**\n", - "- **Revenue: $147,811 million** ($147.8 billion)\n", - "- Compared to same period FY2025: $91,166 million\n", - "\n", - "### Key Highlights:\n", - "- Q3 revenue increased by **62.5%** year-over-year ($21.9 billion increase)\n", - "- Nine-month revenue increased by **62.1%** year-over-year ($56.6 billion increase)\n", - "- This represents strong growth driven primarily by Data Center compute and networking platforms for AI and accelerated computing, with Blackwell architectures being a major contributor\n", - "\n", - "The revenue figures are clearly displayed at the top of the Condensed Consolidated Statements of Income on **Page 3** of the 10-Q report." - ] - } - ], - "source": [ - "query = \"what is the revenue? 
Also show me which page I can find it.\"\n", - "\n", - "for chunk in pi_client.chat_completions(\n", - " messages=[{\"role\": \"user\", \"content\": query}],\n", - " doc_id=doc_id,\n", - " stream=True\n", - "):\n", - " print(chunk, end='', flush=True)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "{'createdAt': '2025-11-20T07:11:44.669000',\n", + " 'description': \"This document is NVIDIA Corporation's Form 10-Q Quarterly \"\n", + " 'Report for the period ending October 26, 2025, detailing its '\n", + " 'financial performance, operational results, market risks, and '\n", + " 'legal proceedings.',\n", + " 'id': 'pi-cmi73f7r7022y09nwn40paaom',\n", + " 'name': '13e6981b-95ed-4aac-a602-ebc5865d0590.pdf',\n", + " 'pageNum': 48,\n", + " 'status': 'completed'}\n", + "\n", + " Document ready! (48 pages)\n" + ] } - ], - "metadata": { + ], + "source": [ + "from pprint import pprint\n", + "\n", + "doc_info = pi_client.get_document(doc_id)\n", + "pprint(doc_info)\n", + "\n", + "if doc_info['status'] == 'completed':\n", + " print(f\"\\n Document ready! ({doc_info['pageNum']} pages)\")\n", + "elif doc_info['status'] == 'processing':\n", + " print(\"\\n Document is still processing. Please wait and check again.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1C9FOvO-p1m" + }, + "source": [ + "### Ask a question about this document" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" + "base_uri": "https://localhost:8080/" }, - "language_info": { - "name": "python" + "id": "X3RbQvy_0nt7", + "outputId": "a8bb5468-3f5f-4225-a5fd-60004041a644" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I'll help you find the revenue information from NVIDIA's 10-Q report. 
Let me first check the document structure to locate the financial information.{\"doc_name\": \"13e6981b-95ed-4aac-a602-ebc5865d0590.pdf\"}Now let me get the content from pages that contain the revenue information. Based on the structure, revenue information should be in the financial statements section, which appears early in the document. Let me check pages 3-8 which should contain the Condensed Consolidated Statements of Income.{\"doc_name\": \"13e6981b-95ed-4aac-a602-ebc5865d0590.pdf\", \"pages\": \"3-5\"}Perfect! I found the revenue information. Let me provide you with the details:\n", + "\n", + "## Revenue Information from NVIDIA's 10-Q Report\n", + "\n", + "**You can find the revenue on Page 3** of the document in the **Condensed Consolidated Statements of Income**.\n", + "\n", + "### Revenue Details:\n", + "\n", + "**For the Three Months Ended October 26, 2025 (Q3 FY2026):**\n", + "- **Revenue: $57,006 million** ($57.0 billion)\n", + "- Compared to Q3 FY2025 (October 27, 2024): $35,082 million\n", + "\n", + "**For the Nine Months Ended October 26, 2025 (First 9 months of FY2026):**\n", + "- **Revenue: $147,811 million** ($147.8 billion)\n", + "- Compared to same period FY2025: $91,166 million\n", + "\n", + "### Key Highlights:\n", + "- Q3 revenue increased by **62.5%** year-over-year ($21.9 billion increase)\n", + "- Nine-month revenue increased by **62.1%** year-over-year ($56.6 billion increase)\n", + "- This represents strong growth driven primarily by Data Center compute and networking platforms for AI and accelerated computing, with Blackwell architectures being a major contributor\n", + "\n", + "The revenue figures are clearly displayed at the top of the Condensed Consolidated Statements of Income on **Page 3** of the 10-Q report." + ] } + ], + "source": [ + "query = \"what is the revenue? 
Also show me which page I can find it.\"\n", + "\n", + "for chunk in pi_client.chat_completions(\n", + " messages=[{\"role\": \"user\", \"content\": query}],\n", + " doc_id=doc_id,\n", + " stream=True\n", + "):\n", + " print(chunk, end='', flush=True)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/cookbook/pageindex_RAG_simple.ipynb b/cookbook/pageindex_RAG_simple.ipynb index 1ad9bd194..ce99dd012 100644 --- a/cookbook/pageindex_RAG_simple.ipynb +++ b/cookbook/pageindex_RAG_simple.ipynb @@ -1,609 +1,610 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "TCh9BTedHJK1" - }, - "source": [ - "![pageindex_banner](https://pageindex.ai/static/images/pageindex_banner.jpg)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nD0hb4TFHWTt" - }, - "source": [ - "

Reasoning-based RAG  ✧  No Vector DB  ✧  No Chunking  ✧  Human-like Retrieval

\n", - "\n", - "

\n", - " 🏠 Homepage  •  \n", - " 🖥️ Dashboard  •  \n", - " 📚 API Docs  •  \n", - " 📦 GitHub  •  \n", - " 💬 Discord  •  \n", - " ✉️ Contact \n", - "

\n", - "\n", - "
\n", - "\n", - "[![Star us on GitHub](https://img.shields.io/github/stars/VectifyAI/PageIndex?style=for-the-badge&logo=github&label=⭐️%20Star%20Us)](https://github.com/VectifyAI/PageIndex)    [![Follow us on X](https://img.shields.io/badge/Follow%20Us-000000?style=for-the-badge&logo=x&logoColor=white)](https://twitter.com/VectifyAI)\n", - "\n", - "
\n", - "\n", - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ebvn5qfpcG1K" - }, - "source": [ - "# Simple Vectorless RAG with PageIndex" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## PageIndex Introduction\n", - "PageIndex is a new **reasoning-based**, **vectorless RAG** framework that performs retrieval in two steps: \n", - "1. Generate a tree structure index of documents \n", - "2. Perform reasoning-based retrieval through tree search \n", - "\n", - "
\n", - " \n", - "
\n", - "\n", - "Compared to traditional vector-based RAG, PageIndex features:\n", - "- **No Vectors Needed**: Uses document structure and LLM reasoning for retrieval.\n", - "- **No Chunking Needed**: Documents are organized into natural sections rather than artificial chunks.\n", - "- **Human-like Retrieval**: Simulates how human experts navigate and extract knowledge from complex documents. \n", - "- **Transparent Retrieval Process**: Retrieval based on reasoning — say goodbye to approximate semantic search (\"vibe retrieval\")." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 📝 Notebook Overview\n", - "\n", - "This notebook demonstrates a simple, minimal example of **vectorless RAG** with PageIndex. You will learn how to:\n", - "- [x] Build a PageIndex tree structure of a document\n", - "- [x] Perform reasoning-based retrieval with tree search\n", - "- [x] Generate answers based on the retrieved context\n", - "\n", - "> ⚡ Note: This is a **minimal example** to illustrate PageIndex's core philosophy and idea, not its full capabilities. 
More advanced examples are coming soon.\n", - "\n", - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7ziuTbbWcG1L" - }, - "source": [ - "## Step 0: Preparation\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "edTfrizMFK4c" - }, - "source": [ - "#### 0.1 Install PageIndex" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "id": "LaoB58wQFNDh" - }, - "outputs": [], - "source": [ - "%pip install -q --upgrade pageindex" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WVEWzPKGcG1M" - }, - "source": [ - "#### 0.2 Setup PageIndex" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "StvqfcK4cG1M" - }, - "outputs": [], - "source": [ - "from pageindex import PageIndexClient\n", - "import pageindex.utils as utils\n", - "\n", - "# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n", - "PAGEINDEX_API_KEY = \"YOUR_PAGEINDEX_API_KEY\"\n", - "pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 0.3 Setup LLM\n", - "\n", - "Choose your preferred LLM for reasoning-based retrieval. In this example, we use OpenAI’s GPT-4.1." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import openai\n", - "OPENAI_API_KEY = \"YOUR_OPENAI_API_KEY\"\n", - "\n", - "async def call_llm(prompt, model=\"gpt-4.1\", temperature=0):\n", - " client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)\n", - " response = await client.chat.completions.create(\n", - " model=model,\n", - " messages=[{\"role\": \"user\", \"content\": prompt}],\n", - " temperature=temperature\n", - " )\n", - " return response.choices[0].message.content.strip()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "heGtIMOVcG1N" - }, - "source": [ - "## Step 1: PageIndex Tree Generation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mzd1VWjwMUJL" - }, - "source": [ - "#### 1.1 Submit a document for generating PageIndex tree" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "f6--eZPLcG1N", - "outputId": "ca688cfd-6c4b-4a57-dac2-f3c2604c4112" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloaded https://arxiv.org/pdf/2501.12948.pdf\n", - "Document Submitted: pi-cmeseq08w00vt0bo3u6tr244g\n" - ] - } - ], - "source": [ - "import os, requests\n", - "\n", - "# You can also use our GitHub repo to generate PageIndex tree\n", - "# https://github.com/VectifyAI/PageIndex\n", - "\n", - "pdf_url = \"https://arxiv.org/pdf/2501.12948.pdf\"\n", - "pdf_path = os.path.join(\"../data\", pdf_url.split('/')[-1])\n", - "os.makedirs(os.path.dirname(pdf_path), exist_ok=True)\n", - "\n", - "response = requests.get(pdf_url)\n", - "with open(pdf_path, \"wb\") as f:\n", - " f.write(response.content)\n", - "print(f\"Downloaded {pdf_url}\")\n", - "\n", - "doc_id = pi_client.submit_document(pdf_path)[\"doc_id\"]\n", - "print('Document Submitted:', doc_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4-Hrh0azcG1N" - }, - 
"source": [ - "#### 1.2 Get the generated PageIndex tree structure" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "b1Q1g6vrcG1O", - "outputId": "dc944660-38ad-47ea-d358-be422edbae53" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Simplified Tree Structure of the Document:\n", - "[{'title': 'DeepSeek-R1: Incentivizing Reasoning Cap...',\n", - " 'node_id': '0000',\n", - " 'prefix_summary': '# DeepSeek-R1: Incentivizing Reasoning C...',\n", - " 'nodes': [{'title': 'Abstract',\n", - " 'node_id': '0001',\n", - " 'summary': 'The partial document introduces two reas...'},\n", - " {'title': 'Contents',\n", - " 'node_id': '0002',\n", - " 'summary': 'This partial document provides a detaile...'},\n", - " {'title': '1. Introduction',\n", - " 'node_id': '0003',\n", - " 'prefix_summary': 'The partial document introduces recent a...',\n", - " 'nodes': [{'title': '1.1. Contributions',\n", - " 'node_id': '0004',\n", - " 'summary': 'This partial document outlines the main ...'},\n", - " {'title': '1.2. Summary of Evaluation Results',\n", - " 'node_id': '0005',\n", - " 'summary': 'The partial document provides a summary ...'}]},\n", - " {'title': '2. Approach',\n", - " 'node_id': '0006',\n", - " 'prefix_summary': '## 2. Approach\\n',\n", - " 'nodes': [{'title': '2.1. Overview',\n", - " 'node_id': '0007',\n", - " 'summary': '### 2.1. Overview\\n\\nPrevious work has hea...'},\n", - " {'title': '2.2. DeepSeek-R1-Zero: Reinforcement Lea...',\n", - " 'node_id': '0008',\n", - " 'prefix_summary': '### 2.2. DeepSeek-R1-Zero: Reinforcement...',\n", - " 'nodes': [{'title': '2.2.1. Reinforcement Learning Algorithm',\n", - " 'node_id': '0009',\n", - " 'summary': 'The partial document describes the Group...'},\n", - " {'title': '2.2.2. 
Reward Modeling',\n", - " 'node_id': '0010',\n", - " 'summary': 'This partial document discusses the rewa...'},\n", - " {'title': '2.2.3. Training Template',\n", - " 'node_id': '0011',\n", - " 'summary': '#### 2.2.3. Training Template\\n\\nTo train ...'},\n", - " {'title': '2.2.4. Performance, Self-evolution Proce...',\n", - " 'node_id': '0012',\n", - " 'summary': 'This partial document discusses the perf...'}]},\n", - " {'title': '2.3. DeepSeek-R1: Reinforcement Learning...',\n", - " 'node_id': '0013',\n", - " 'summary': 'This partial document describes the trai...'},\n", - " {'title': '2.4. Distillation: Empower Small Models ...',\n", - " 'node_id': '0014',\n", - " 'summary': 'This partial document discusses the proc...'}]},\n", - " {'title': '3. Experiment',\n", - " 'node_id': '0015',\n", - " 'prefix_summary': 'The partial document describes the exper...',\n", - " 'nodes': [{'title': '3.1. DeepSeek-R1 Evaluation',\n", - " 'node_id': '0016',\n", - " 'summary': 'This partial document presents a compreh...'},\n", - " {'title': '3.2. Distilled Model Evaluation',\n", - " 'node_id': '0017',\n", - " 'summary': 'This partial document presents an evalua...'}]},\n", - " {'title': '4. Discussion',\n", - " 'node_id': '0018',\n", - " 'summary': 'This partial document discusses the comp...'},\n", - " {'title': '5. Conclusion, Limitations, and Future W...',\n", - " 'node_id': '0019',\n", - " 'summary': 'This partial document presents the concl...'},\n", - " {'title': 'References',\n", - " 'node_id': '0020',\n", - " 'summary': 'This partial document consists of the re...'},\n", - " {'title': 'Appendix', 'node_id': '0021', 'summary': '## Appendix\\n'},\n", - " {'title': 'A. 
Contributions and Acknowledgments',\n", - " 'node_id': '0022',\n", - " 'summary': 'This partial document section details th...'}]}]\n" - ] - } - ], - "source": [ - "if pi_client.is_retrieval_ready(doc_id):\n", - " tree = pi_client.get_tree(doc_id, node_summary=True)['result']\n", - " print('Simplified Tree Structure of the Document:')\n", - " utils.print_tree(tree)\n", - "else:\n", - " print(\"Processing document, please try again later...\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "USoCLOiQcG1O" - }, - "source": [ - "## Step 2: Reasoning-Based Retrieval with Tree Search" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2.1 Use LLM for tree search and identify nodes that might contain relevant context" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "id": "LLHNJAtTcG1O" - }, - "outputs": [], - "source": [ - "import json\n", - "\n", - "query = \"What are the conclusions in this document?\"\n", - "\n", - "tree_without_text = utils.remove_fields(tree.copy(), fields=['text'])\n", - "\n", - "search_prompt = f\"\"\"\n", - "You are given a question and a tree structure of a document.\n", - "Each node contains a node id, node title, and a corresponding summary.\n", - "Your task is to find all nodes that are likely to contain the answer to the question.\n", - "\n", - "Question: {query}\n", - "\n", - "Document tree structure:\n", - "{json.dumps(tree_without_text, indent=2)}\n", - "\n", - "Please reply in the following JSON format:\n", - "{{\n", - " \"thinking\": \"\",\n", - " \"node_list\": [\"node_id_1\", \"node_id_2\", ..., \"node_id_n\"]\n", - "}}\n", - "Directly return the final JSON structure. 
Do not output anything else.\n", - "\"\"\"\n", - "\n", - "tree_search_result = await call_llm(search_prompt)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2.2 Print retrieved nodes and reasoning process" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - }, - "id": "P8DVUOuAen5u", - "outputId": "6bb6d052-ef30-4716-f88e-be98bcb7ebdb" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reasoning Process:\n", - "The question asks for the conclusions in the document. Typically, conclusions are found in sections\n", - "explicitly titled 'Conclusion' or in sections summarizing the findings and implications of the work.\n", - "In this document tree, node 0019 ('5. Conclusion, Limitations, and Future Work') is the most\n", - "directly relevant, as it is dedicated to the conclusion and related topics. Additionally, the\n", - "'Abstract' (node 0001) may contain a high-level summary that sometimes includes concluding remarks,\n", - "but it is less likely to contain the full conclusions. Other sections like 'Discussion' (node 0018)\n", - "may discuss implications but are not explicitly conclusions. Therefore, the primary node is 0019.\n", - "\n", - "Retrieved Nodes:\n", - "Node ID: 0019\t Page: 16\t Title: 5. 
Conclusion, Limitations, and Future Work\n" - ] - } - ], - "source": [ - "node_map = utils.create_node_mapping(tree)\n", - "tree_search_result_json = json.loads(tree_search_result)\n", - "\n", - "print('Reasoning Process:')\n", - "utils.print_wrapped(tree_search_result_json['thinking'])\n", - "\n", - "print('\\nRetrieved Nodes:')\n", - "for node_id in tree_search_result_json[\"node_list\"]:\n", - " node = node_map[node_id]\n", - " print(f\"Node ID: {node['node_id']}\\t Page: {node['page_index']}\\t Title: {node['title']}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "10wOZDG_cG1O" - }, - "source": [ - "## Step 3: Answer Generation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 3.1 Extract relevant context from retrieved nodes" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 279 - }, - "id": "a7UCBnXlcG1O", - "outputId": "8a026ea3-4ef3-473a-a57b-b4565409749e" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Retrieved Context:\n", - "\n", - "## 5. Conclusion, Limitations, and Future Work\n", - "\n", - "In this work, we share our journey in enhancing model reasoning abilities through reinforcement\n", - "learning. DeepSeek-R1-Zero represents a pure RL approach without relying on cold-start data,\n", - "achieving strong performance across various tasks. DeepSeek-R1 is more powerful, leveraging cold-\n", - "start data alongside iterative RL fine-tuning. Ultimately, DeepSeek-R1 achieves performance\n", - "comparable to OpenAI-o1-1217 on a range of tasks.\n", - "\n", - "We further explore distillation the reasoning capability to small dense models. 
We use DeepSeek-R1\n", - "as the teacher model to generate 800K training samples, and fine-tune several small dense models.\n", - "The results are promising: DeepSeek-R1-Distill-Qwen-1.5B outperforms GPT-4o and Claude-3.5-Sonnet on\n", - "math benchmarks with $28.9 \\%$ on AIME and $83.9 \\%$ on MATH. Other dense models also achieve\n", - "impressive results, significantly outperforming other instructiontuned models based on the same\n", - "underlying checkpoints.\n", - "\n", - "In the fut...\n" - ] - } - ], - "source": [ - "node_list = json.loads(tree_search_result)[\"node_list\"]\n", - "relevant_content = \"\\n\\n\".join(node_map[node_id][\"text\"] for node_id in node_list)\n", - "\n", - "print('Retrieved Context:\\n')\n", - "utils.print_wrapped(relevant_content[:1000] + '...')" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "TCh9BTedHJK1" + }, + "source": [ + "![pageindex_banner](https://pageindex.ai/static/images/pageindex_banner.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nD0hb4TFHWTt" + }, + "source": [ + "

Reasoning-based RAG  ✧  No Vector DB  ✧  No Chunking  ✧  Human-like Retrieval

\n", + "\n", + "

\n", + " 🏠 Homepage  •  \n", + " 🖥️ Dashboard  •  \n", + " 📚 API Docs  •  \n", + " 📦 GitHub  •  \n", + " 💬 Discord  •  \n", + " ✉️ Contact \n", + "

\n", + "\n", + "
\n", + "\n", + "[![Star us on GitHub](https://img.shields.io/github/stars/VectifyAI/PageIndex?style=for-the-badge&logo=github&label=⭐️%20Star%20Us)](https://github.com/VectifyAI/PageIndex)    [![Follow us on X](https://img.shields.io/badge/Follow%20Us-000000?style=for-the-badge&logo=x&logoColor=white)](https://twitter.com/VectifyAI)\n", + "\n", + "
\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ebvn5qfpcG1K" + }, + "source": [ + "# Simple Vectorless RAG with PageIndex" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PageIndex Introduction\n", + "PageIndex is a new **reasoning-based**, **vectorless RAG** framework that performs retrieval in two steps: \n", + "1. Generate a tree structure index of documents \n", + "2. Perform reasoning-based retrieval through tree search \n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "Compared to traditional vector-based RAG, PageIndex features:\n", + "- **No Vectors Needed**: Uses document structure and LLM reasoning for retrieval.\n", + "- **No Chunking Needed**: Documents are organized into natural sections rather than artificial chunks.\n", + "- **Human-like Retrieval**: Simulates how human experts navigate and extract knowledge from complex documents. \n", + "- **Transparent Retrieval Process**: Retrieval based on reasoning — say goodbye to approximate semantic search (\"vibe retrieval\")." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 📝 Notebook Overview\n", + "\n", + "This notebook demonstrates a simple, minimal example of **vectorless RAG** with PageIndex. You will learn how to:\n", + "- [x] Build a PageIndex tree structure of a document\n", + "- [x] Perform reasoning-based retrieval with tree search\n", + "- [x] Generate answers based on the retrieved context\n", + "\n", + "> ⚡ Note: This is a **minimal example** to illustrate PageIndex's core philosophy and idea, not its full capabilities. 
More advanced examples are coming soon.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7ziuTbbWcG1L" + }, + "source": [ + "## Step 0: Preparation\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "edTfrizMFK4c" + }, + "source": [ + "#### 0.1 Install PageIndex" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "LaoB58wQFNDh" + }, + "outputs": [], + "source": [ + "%pip install -q --upgrade pageindex" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WVEWzPKGcG1M" + }, + "source": [ + "#### 0.2 Setup PageIndex" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "StvqfcK4cG1M" + }, + "outputs": [], + "source": [ + "from pageindex import PageIndexClient\n", + "import pageindex.utils as utils\n", + "\n", + "# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n", + "PAGEINDEX_API_KEY = \"YOUR_PAGEINDEX_API_KEY\"\n", + "pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 0.3 Setup LLM\n", + "\n", + "Choose your preferred LLM for reasoning-based retrieval. In this example, we use OpenAI’s GPT-4.1." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import openai\n", + "OPENAI_API_KEY = \"YOUR_OPENAI_API_KEY\"\n", + "\n", + "async def call_llm(prompt, model=\"gpt-4.1\", temperature=0):\n", + " client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)\n", + " response = await client.chat.completions.create(\n", + " model=model,\n", + " messages=[{\"role\": \"user\", \"content\": prompt}],\n", + " temperature=temperature\n", + " )\n", + " return response.choices[0].message.content.strip()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "heGtIMOVcG1N" + }, + "source": [ + "## Step 1: PageIndex Tree Generation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mzd1VWjwMUJL" + }, + "source": [ + "#### 1.1 Submit a document for generating PageIndex tree" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "f6--eZPLcG1N", + "outputId": "ca688cfd-6c4b-4a57-dac2-f3c2604c4112" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 3.2 Generate answer based on retrieved context" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloaded https://arxiv.org/pdf/2501.12948.pdf\n", + "Document Submitted: pi-cmeseq08w00vt0bo3u6tr244g\n" + ] + } + ], + "source": [ + "import os\n", + "import requests\n", + "\n", + "# You can also use our GitHub repo to generate PageIndex tree\n", + "# https://github.com/VectifyAI/PageIndex\n", + "\n", + "pdf_url = \"https://arxiv.org/pdf/2501.12948.pdf\"\n", + "pdf_path = os.path.join(\"../data\", pdf_url.split('/')[-1])\n", + "os.makedirs(os.path.dirname(pdf_path), exist_ok=True)\n", + "\n", + "response = requests.get(pdf_url)\n", + "with open(pdf_path, \"wb\") as f:\n", + " f.write(response.content)\n", + "print(f\"Downloaded {pdf_url}\")\n", + "\n", + "doc_id = pi_client.submit_document(pdf_path)[\"doc_id\"]\n", + 
"print('Document Submitted:', doc_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4-Hrh0azcG1N" + }, + "source": [ + "#### 1.2 Get the generated PageIndex tree structure" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, + "id": "b1Q1g6vrcG1O", + "outputId": "dc944660-38ad-47ea-d358-be422edbae53" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 59, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 210 - }, - "id": "tcp_PhHzcG1O", - "outputId": "187ff116-9bb0-4ab4-bacb-13944460b5ff" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generated Answer:\n", - "\n", - "The conclusions in this document are:\n", - "\n", - "- DeepSeek-R1-Zero, a pure reinforcement learning (RL) approach without cold-start data, achieves\n", - "strong performance across various tasks.\n", - "- DeepSeek-R1, which combines cold-start data with iterative RL fine-tuning, is more powerful and\n", - "achieves performance comparable to OpenAI-o1-1217 on a range of tasks.\n", - "- Distilling DeepSeek-R1’s reasoning capabilities into smaller dense models is promising; for\n", - "example, DeepSeek-R1-Distill-Qwen-1.5B outperforms GPT-4o and Claude-3.5-Sonnet on math benchmarks,\n", - "and other dense models also show significant improvements over similar instruction-tuned models.\n", - "\n", - "These results demonstrate the effectiveness of the RL-based approach and the potential for\n", - "distilling reasoning abilities into smaller models.\n" - ] - } - ], - "source": [ - "answer_prompt = f\"\"\"\n", - "Answer the question based on the context:\n", - "\n", - "Question: {query}\n", - "Context: {relevant_content}\n", - "\n", - "Provide a clear, concise answer based only on the context provided.\n", - "\"\"\"\n", - "\n", - "print('Generated Answer:\\n')\n", - "answer = await 
call_llm(answer_prompt)\n", - "utils.print_wrapped(answer)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Simplified Tree Structure of the Document:\n", + "[{'title': 'DeepSeek-R1: Incentivizing Reasoning Cap...',\n", + " 'node_id': '0000',\n", + " 'prefix_summary': '# DeepSeek-R1: Incentivizing Reasoning C...',\n", + " 'nodes': [{'title': 'Abstract',\n", + " 'node_id': '0001',\n", + " 'summary': 'The partial document introduces two reas...'},\n", + " {'title': 'Contents',\n", + " 'node_id': '0002',\n", + " 'summary': 'This partial document provides a detaile...'},\n", + " {'title': '1. Introduction',\n", + " 'node_id': '0003',\n", + " 'prefix_summary': 'The partial document introduces recent a...',\n", + " 'nodes': [{'title': '1.1. Contributions',\n", + " 'node_id': '0004',\n", + " 'summary': 'This partial document outlines the main ...'},\n", + " {'title': '1.2. Summary of Evaluation Results',\n", + " 'node_id': '0005',\n", + " 'summary': 'The partial document provides a summary ...'}]},\n", + " {'title': '2. Approach',\n", + " 'node_id': '0006',\n", + " 'prefix_summary': '## 2. Approach\\n',\n", + " 'nodes': [{'title': '2.1. Overview',\n", + " 'node_id': '0007',\n", + " 'summary': '### 2.1. Overview\\n\\nPrevious work has hea...'},\n", + " {'title': '2.2. DeepSeek-R1-Zero: Reinforcement Lea...',\n", + " 'node_id': '0008',\n", + " 'prefix_summary': '### 2.2. DeepSeek-R1-Zero: Reinforcement...',\n", + " 'nodes': [{'title': '2.2.1. Reinforcement Learning Algorithm',\n", + " 'node_id': '0009',\n", + " 'summary': 'The partial document describes the Group...'},\n", + " {'title': '2.2.2. Reward Modeling',\n", + " 'node_id': '0010',\n", + " 'summary': 'This partial document discusses the rewa...'},\n", + " {'title': '2.2.3. Training Template',\n", + " 'node_id': '0011',\n", + " 'summary': '#### 2.2.3. Training Template\\n\\nTo train ...'},\n", + " {'title': '2.2.4. 
Performance, Self-evolution Proce...',\n", + " 'node_id': '0012',\n", + " 'summary': 'This partial document discusses the perf...'}]},\n", + " {'title': '2.3. DeepSeek-R1: Reinforcement Learning...',\n", + " 'node_id': '0013',\n", + " 'summary': 'This partial document describes the trai...'},\n", + " {'title': '2.4. Distillation: Empower Small Models ...',\n", + " 'node_id': '0014',\n", + " 'summary': 'This partial document discusses the proc...'}]},\n", + " {'title': '3. Experiment',\n", + " 'node_id': '0015',\n", + " 'prefix_summary': 'The partial document describes the exper...',\n", + " 'nodes': [{'title': '3.1. DeepSeek-R1 Evaluation',\n", + " 'node_id': '0016',\n", + " 'summary': 'This partial document presents a compreh...'},\n", + " {'title': '3.2. Distilled Model Evaluation',\n", + " 'node_id': '0017',\n", + " 'summary': 'This partial document presents an evalua...'}]},\n", + " {'title': '4. Discussion',\n", + " 'node_id': '0018',\n", + " 'summary': 'This partial document discusses the comp...'},\n", + " {'title': '5. Conclusion, Limitations, and Future W...',\n", + " 'node_id': '0019',\n", + " 'summary': 'This partial document presents the concl...'},\n", + " {'title': 'References',\n", + " 'node_id': '0020',\n", + " 'summary': 'This partial document consists of the re...'},\n", + " {'title': 'Appendix', 'node_id': '0021', 'summary': '## Appendix\\n'},\n", + " {'title': 'A. 
Contributions and Acknowledgments',\n", + " 'node_id': '0022',\n", + " 'summary': 'This partial document section details th...'}]}]\n" + ] + } + ], + "source": [ + "if pi_client.is_retrieval_ready(doc_id):\n", + " tree = pi_client.get_tree(doc_id, node_summary=True)['result']\n", + " print('Simplified Tree Structure of the Document:')\n", + " utils.print_tree(tree)\n", + "else:\n", + " print(\"Processing document, please try again later...\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "USoCLOiQcG1O" + }, + "source": [ + "## Step 2: Reasoning-Based Retrieval with Tree Search" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.1 Use LLM for tree search and identify nodes that might contain relevant context" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "id": "LLHNJAtTcG1O" + }, + "outputs": [], + "source": [ + "import json\n", + "\n", + "query = \"What are the conclusions in this document?\"\n", + "\n", + "tree_without_text = utils.remove_fields(tree.copy(), fields=['text'])\n", + "\n", + "search_prompt = f\"\"\"\n", + "You are given a question and a tree structure of a document.\n", + "Each node contains a node id, node title, and a corresponding summary.\n", + "Your task is to find all nodes that are likely to contain the answer to the question.\n", + "\n", + "Question: {query}\n", + "\n", + "Document tree structure:\n", + "{json.dumps(tree_without_text, indent=2)}\n", + "\n", + "Please reply in the following JSON format:\n", + "{{\n", + " \"thinking\": \"\",\n", + " \"node_list\": [\"node_id_1\", \"node_id_2\", ..., \"node_id_n\"]\n", + "}}\n", + "Directly return the final JSON structure. 
Do not output anything else.\n", + "\"\"\"\n", + "\n", + "tree_search_result = await call_llm(search_prompt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.2 Print retrieved nodes and reasoning process" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 }, + "id": "P8DVUOuAen5u", + "outputId": "6bb6d052-ef30-4716-f88e-be98bcb7ebdb" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "_1kaGD3GcG1O" - }, - "source": [ - "---\n", - "\n", - "## 🎯 What's Next\n", - "\n", - "This notebook has demonstrated a **basic**, **minimal** example of **reasoning-based**, **vectorless** RAG with PageIndex. The workflow illustrates the core idea:\n", - "> *Generating a hierarchical tree structure from a document, reasoning over that tree structure, and extracting relevant context, without relying on a vector database or top-k similarity search*.\n", - "\n", - "While this notebook highlights a minimal workflow, the PageIndex framework is built to support **far more advanced** use cases. In upcoming tutorials, we will introduce:\n", - "* **Multi-Node Reasoning with Content Extraction** — Scale tree search to extract and select relevant content from multiple nodes.\n", - "* **Multi-Document Search** — Enable reasoning-based navigation across large document collections, extending beyond a single file.\n", - "* **Efficient Tree Search** — Improve tree search efficiency for long documents with a large number of nodes.\n", - "* **Expert Knowledge Integration and Preference Alignment** — Incorporate user preferences or expert insights by adding knowledge directly into the LLM tree search, without the need for fine-tuning.\n", - "\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Reasoning Process:\n", + "The question asks for the conclusions in the document. 
Typically, conclusions are found in sections\n", + "explicitly titled 'Conclusion' or in sections summarizing the findings and implications of the work.\n", + "In this document tree, node 0019 ('5. Conclusion, Limitations, and Future Work') is the most\n", + "directly relevant, as it is dedicated to the conclusion and related topics. Additionally, the\n", + "'Abstract' (node 0001) may contain a high-level summary that sometimes includes concluding remarks,\n", + "but it is less likely to contain the full conclusions. Other sections like 'Discussion' (node 0018)\n", + "may discuss implications but are not explicitly conclusions. Therefore, the primary node is 0019.\n", + "\n", + "Retrieved Nodes:\n", + "Node ID: 0019\t Page: 16\t Title: 5. Conclusion, Limitations, and Future Work\n" + ] + } + ], + "source": [ + "node_map = utils.create_node_mapping(tree)\n", + "tree_search_result_json = json.loads(tree_search_result)\n", + "\n", + "print('Reasoning Process:')\n", + "utils.print_wrapped(tree_search_result_json['thinking'])\n", + "\n", + "print('\\nRetrieved Nodes:')\n", + "for node_id in tree_search_result_json[\"node_list\"]:\n", + " node = node_map[node_id]\n", + " print(f\"Node ID: {node['node_id']}\\t Page: {node['page_index']}\\t Title: {node['title']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "10wOZDG_cG1O" + }, + "source": [ + "## Step 3: Answer Generation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.1 Extract relevant context from retrieved nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 279 }, + "id": "a7UCBnXlcG1O", + "outputId": "8a026ea3-4ef3-473a-a57b-b4565409749e" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 🔎 Learn More About PageIndex\n", - " 🏠 Homepage  •  \n", - " 🖥️ Dashboard  •  \n", - " 📚 API Docs  •  \n", - " 📦 GitHub  •  \n", - " 💬 Discord  
•  \n", - " ✉️ Contact\n", - "\n", - "
\n", - "\n", - "© 2025 [Vectify AI](https://vectify.ai)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Retrieved Context:\n", + "\n", + "## 5. Conclusion, Limitations, and Future Work\n", + "\n", + "In this work, we share our journey in enhancing model reasoning abilities through reinforcement\n", + "learning. DeepSeek-R1-Zero represents a pure RL approach without relying on cold-start data,\n", + "achieving strong performance across various tasks. DeepSeek-R1 is more powerful, leveraging cold-\n", + "start data alongside iterative RL fine-tuning. Ultimately, DeepSeek-R1 achieves performance\n", + "comparable to OpenAI-o1-1217 on a range of tasks.\n", + "\n", + "We further explore distillation the reasoning capability to small dense models. We use DeepSeek-R1\n", + "as the teacher model to generate 800K training samples, and fine-tune several small dense models.\n", + "The results are promising: DeepSeek-R1-Distill-Qwen-1.5B outperforms GPT-4o and Claude-3.5-Sonnet on\n", + "math benchmarks with $28.9 \\%$ on AIME and $83.9 \\%$ on MATH. 
Other dense models also achieve\n", + "impressive results, significantly outperforming other instructiontuned models based on the same\n", + "underlying checkpoints.\n", + "\n", + "In the fut...\n" + ] } - ], - "metadata": { + ], + "source": [ + "node_list = json.loads(tree_search_result)[\"node_list\"]\n", + "relevant_content = \"\\n\\n\".join(node_map[node_id][\"text\"] for node_id in node_list)\n", + "\n", + "print('Retrieved Context:\\n')\n", + "utils.print_wrapped(relevant_content[:1000] + '...')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.2 Generate answer based on retrieved context" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { "colab": { - "provenance": [] + "base_uri": "https://localhost:8080/", + "height": 210 }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" + "id": "tcp_PhHzcG1O", + "outputId": "187ff116-9bb0-4ab4-bacb-13944460b5ff" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated Answer:\n", + "\n", + "The conclusions in this document are:\n", + "\n", + "- DeepSeek-R1-Zero, a pure reinforcement learning (RL) approach without cold-start data, achieves\n", + "strong performance across various tasks.\n", + "- DeepSeek-R1, which combines cold-start data with iterative RL fine-tuning, is more powerful and\n", + "achieves performance comparable to OpenAI-o1-1217 on a range of tasks.\n", + "- Distilling DeepSeek-R1’s reasoning capabilities into smaller dense models is promising; for\n", + "example, DeepSeek-R1-Distill-Qwen-1.5B outperforms GPT-4o and Claude-3.5-Sonnet on math benchmarks,\n", + "and other dense models also show significant 
improvements over similar instruction-tuned models.\n", + "\n", + "These results demonstrate the effectiveness of the RL-based approach and the potential for\n", + "distilling reasoning abilities into smaller models.\n" + ] } + ], + "source": [ + "answer_prompt = f\"\"\"\n", + "Answer the question based on the context:\n", + "\n", + "Question: {query}\n", + "Context: {relevant_content}\n", + "\n", + "Provide a clear, concise answer based only on the context provided.\n", + "\"\"\"\n", + "\n", + "print('Generated Answer:\\n')\n", + "answer = await call_llm(answer_prompt)\n", + "utils.print_wrapped(answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_1kaGD3GcG1O" + }, + "source": [ + "---\n", + "\n", + "## 🎯 What's Next\n", + "\n", + "This notebook has demonstrated a **basic**, **minimal** example of **reasoning-based**, **vectorless** RAG with PageIndex. The workflow illustrates the core idea:\n", + "> *Generating a hierarchical tree structure from a document, reasoning over that tree structure, and extracting relevant context, without relying on a vector database or top-k similarity search*.\n", + "\n", + "While this notebook highlights a minimal workflow, the PageIndex framework is built to support **far more advanced** use cases. 
In upcoming tutorials, we will introduce:\n", + "* **Multi-Node Reasoning with Content Extraction** — Scale tree search to extract and select relevant content from multiple nodes.\n", + "* **Multi-Document Search** — Enable reasoning-based navigation across large document collections, extending beyond a single file.\n", + "* **Efficient Tree Search** — Improve tree search efficiency for long documents with a large number of nodes.\n", + "* **Expert Knowledge Integration and Preference Alignment** — Incorporate user preferences or expert insights by adding knowledge directly into the LLM tree search, without the need for fine-tuning.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 🔎 Learn More About PageIndex\n", + " 🏠 Homepage  •  \n", + " 🖥️ Dashboard  •  \n", + " 📚 API Docs  •  \n", + " 📦 GitHub  •  \n", + " 💬 Discord  •  \n", + " ✉️ Contact\n", + "\n", + "
\n", + "\n", + "© 2025 [Vectify AI](https://vectify.ai)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/cookbook/vision_RAG_pageindex.ipynb b/cookbook/vision_RAG_pageindex.ipynb index d39a27614..7774fb95f 100644 --- a/cookbook/vision_RAG_pageindex.ipynb +++ b/cookbook/vision_RAG_pageindex.ipynb @@ -1,667 +1,670 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "TCh9BTedHJK1" - }, - "source": [ - "![pageindex_banner](https://pageindex.ai/static/images/pageindex_banner.jpg)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nD0hb4TFHWTt" - }, - "source": [ - "
\n", - "

Reasoning-based RAG  ◦  No Vector DB  ◦  No Chunking  ◦  Human-like Retrieval

\n", - "
\n", - "\n", - "
\n", - "

\n", - " 🏠 Homepage  •  \n", - " 💻 Chat  •  \n", - " 🔌 MCP  •  \n", - " 📚 API  •  \n", - " 📦 GitHub  •  \n", - " 💬 Discord  •  \n", - " ✉️ Contact \n", - "

\n", - "
\n", - "\n", - "
\n", - "\n", - "[![Star us on GitHub](https://img.shields.io/github/stars/VectifyAI/PageIndex?style=for-the-badge&logo=github&label=⭐️%20Star%20Us)](https://github.com/VectifyAI/PageIndex)    [![Follow us on X](https://img.shields.io/badge/Follow%20Us-000000?style=for-the-badge&logo=x&logoColor=white)](https://twitter.com/VectifyAI)\n", - "\n", - "
\n", - "\n", - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Check out our blog post, \"[Do We Still Need OCR?](https://pageindex.ai/blog/do-we-need-ocr)\", for a more detailed discussion." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ebvn5qfpcG1K" - }, - "source": [ - "# A Vision-based, Vectorless RAG System for Long Documents\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In modern document question answering (QA) systems, Optical Character Recognition (OCR) serves an important role by converting PDF pages into text that can be processed by Large Language Models (LLMs). The resulting text can provide contextual input that enables LLMs to perform question answering over document content.\n", - "\n", - "Traditional OCR systems typically use a two-stage process that first detects the layout of a PDF — dividing it into text, tables, and images — and then recognizes and converts these elements into plain text. With the rise of vision-language models (VLMs) (such as [Qwen-VL](https://github.com/QwenLM/Qwen3-VL) and [GPT-4.1](https://openai.com/index/gpt-4-1/)), new end-to-end OCR models like [DeepSeek-OCR](https://github.com/deepseek-ai/DeepSeek-OCR) have emerged. These models jointly understand visual and textual information, enabling direct interpretation of PDFs without an explicit layout detection step.\n", - "\n", - "However, this paradigm shift raises an important question: \n", - "\n", - "\n", - "> **If a VLM can already process both the document images and the query to produce an answer directly, do we still need the intermediate OCR step?**\n", - "\n", - "In this notebook, we give a practical implementation of a vision-based question-answering system for long documents, without relying on OCR. 
Specifically, we use PageIndex as a reasoning-based retrieval layer and OpenAI's multimodal GPT-4.1 as the VLM for visual reasoning and answer generation.\n", - "\n", - "See the original [blog post](https://pageindex.ai/blog/do-we-need-ocr) for a more detailed discussion on how VLMs can replace traditional OCR pipelines in document question-answering." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 📝 Notebook Overview\n", - "\n", - "This notebook demonstrates a *minimal*, **vision-based vectorless RAG** pipeline for long documents with PageIndex, using only visual context from PDF pages. You will learn how to:\n", - "- [x] Build a PageIndex tree structure of a document\n", - "- [x] Perform reasoning-based retrieval with tree search\n", - "- [x] Extract PDF page images of retrieved tree nodes for visual context\n", - "- [x] Generate answers using VLM with PDF image inputs only (no OCR required)\n", - "\n", - "> ⚡ Note: This example uses PageIndex's reasoning-based retrieval with OpenAI's multimodal GPT-4.1 model for both tree search and visual context reasoning.\n", - "\n", - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7ziuTbbWcG1L" - }, - "source": [ - "## Step 0: Preparation\n", - "\n", - "This notebook demonstrates **Vision-based RAG** with PageIndex, using PDF page images as visual context for retrieval and answer generation.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "edTfrizMFK4c" - }, - "source": [ - "#### 0.1 Install PageIndex" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "id": "LaoB58wQFNDh" - }, - "outputs": [], - "source": [ - "%pip install -q --upgrade pageindex requests openai PyMuPDF" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WVEWzPKGcG1M" - }, - "source": [ - "#### 0.2 Setup PageIndex" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": 
"StvqfcK4cG1M" - }, - "outputs": [], - "source": [ - "from pageindex import PageIndexClient\n", - "import pageindex.utils as utils\n", - "\n", - "# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n", - "PAGEINDEX_API_KEY = \"YOUR_PAGEINDEX_API_KEY\"\n", - "pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 0.3 Setup VLM\n", - "\n", - "Choose your preferred VLM — in this notebook, we use OpenAI's multimodal GPT-4.1 as the VLM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import openai, fitz, base64, os\n", - "\n", - "# Setup OpenAI client\n", - "OPENAI_API_KEY = \"YOUR_OPENAI_API_KEY\"\n", - "\n", - "async def call_vlm(prompt, image_paths=None, model=\"gpt-4.1\"):\n", - " client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)\n", - " messages = [{\"role\": \"user\", \"content\": prompt}]\n", - " if image_paths:\n", - " content = [{\"type\": \"text\", \"text\": prompt}]\n", - " for image in image_paths:\n", - " if os.path.exists(image):\n", - " with open(image, \"rb\") as image_file:\n", - " image_data = base64.b64encode(image_file.read()).decode('utf-8')\n", - " content.append({\n", - " \"type\": \"image_url\",\n", - " \"image_url\": {\n", - " \"url\": f\"data:image/jpeg;base64,{image_data}\"\n", - " }\n", - " })\n", - " messages[0][\"content\"] = content\n", - " response = await client.chat.completions.create(model=model, messages=messages, temperature=0)\n", - " return response.choices[0].message.content.strip()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 0.4 PDF Image Extraction Helper Functions\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def extract_pdf_page_images(pdf_path, output_dir=\"pdf_images\"):\n", - " os.makedirs(output_dir, exist_ok=True)\n", - " pdf_document = 
fitz.open(pdf_path)\n", - " page_images = {}\n", - " total_pages = len(pdf_document)\n", - " for page_number in range(len(pdf_document)):\n", - " page = pdf_document.load_page(page_number)\n", - " # Convert page to image\n", - " mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better quality\n", - " pix = page.get_pixmap(matrix=mat)\n", - " img_data = pix.tobytes(\"jpeg\")\n", - " image_path = os.path.join(output_dir, f\"page_{page_number + 1}.jpg\")\n", - " with open(image_path, \"wb\") as image_file:\n", - " image_file.write(img_data)\n", - " page_images[page_number + 1] = image_path\n", - " print(f\"Saved page {page_number + 1} image: {image_path}\")\n", - " pdf_document.close()\n", - " return page_images, total_pages\n", - "\n", - "def get_page_images_for_nodes(node_list, node_map, page_images):\n", - " # Get PDF page images for retrieved nodes\n", - " image_paths = []\n", - " seen_pages = set()\n", - " for node_id in node_list:\n", - " node_info = node_map[node_id]\n", - " for page_num in range(node_info['start_index'], node_info['end_index'] + 1):\n", - " if page_num not in seen_pages:\n", - " image_paths.append(page_images[page_num])\n", - " seen_pages.add(page_num)\n", - " return image_paths\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "heGtIMOVcG1N" - }, - "source": [ - "## Step 1: PageIndex Tree Generation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mzd1VWjwMUJL" - }, - "source": [ - "#### 1.1 Submit a document for generating PageIndex tree" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "f6--eZPLcG1N", - "outputId": "ca688cfd-6c4b-4a57-dac2-f3c2604c4112" - }, - "outputs": [], - "source": [ - "import os, requests\n", - "\n", - "# You can also use our GitHub repo to generate PageIndex tree\n", - "# https://github.com/VectifyAI/PageIndex\n", - "\n", - "pdf_url = \"https://arxiv.org/pdf/1706.03762.pdf\" # the \"Attention Is 
All You Need\" paper\n", - "pdf_path = os.path.join(\"../data\", pdf_url.split('/')[-1])\n", - "os.makedirs(os.path.dirname(pdf_path), exist_ok=True)\n", - "\n", - "response = requests.get(pdf_url)\n", - "with open(pdf_path, \"wb\") as f:\n", - " f.write(response.content)\n", - "print(f\"Downloaded {pdf_url}\\n\")\n", - "\n", - "# Extract page images from PDF\n", - "print(\"Extracting page images...\")\n", - "page_images, total_pages = extract_pdf_page_images(pdf_path)\n", - "print(f\"Extracted {len(page_images)} page images from {total_pages} total pages.\\n\")\n", - "\n", - "doc_id = pi_client.submit_document(pdf_path)[\"doc_id\"]\n", - "print('Document Submitted:', doc_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4-Hrh0azcG1N" - }, - "source": [ - "#### 1.2 Get the generated PageIndex tree structure" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "b1Q1g6vrcG1O", - "outputId": "dc944660-38ad-47ea-d358-be422edbae53" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Simplified Tree Structure of the Document:\n", - "[{'title': 'Attention Is All You Need',\n", - " 'node_id': '0000',\n", - " 'page_index': 1,\n", - " 'prefix_summary': '# Attention Is All You Need\\n\\nAshish Vasw...',\n", - " 'nodes': [{'title': 'Abstract',\n", - " 'node_id': '0001',\n", - " 'page_index': 1,\n", - " 'summary': 'The text introduces the Transformer, a n...'},\n", - " {'title': '1 Introduction',\n", - " 'node_id': '0002',\n", - " 'page_index': 2,\n", - " 'summary': 'The text introduces the Transformer, a n...'},\n", - " {'title': '2 Background',\n", - " 'node_id': '0003',\n", - " 'page_index': 2,\n", - " 'summary': 'This section discusses the Transformer m...'},\n", - " {'title': '3 Model Architecture',\n", - " 'node_id': '0004',\n", - " 'page_index': 2,\n", - " 'prefix_summary': 'The text describes the 
encoder-decoder a...',\n", - " 'nodes': [{'title': '3.1 Encoder and Decoder Stacks',\n", - " 'node_id': '0005',\n", - " 'page_index': 3,\n", - " 'summary': 'The text describes the encoder and decod...'},\n", - " {'title': '3.2 Attention',\n", - " 'node_id': '0006',\n", - " 'page_index': 3,\n", - " 'prefix_summary': '### 3.2 Attention\\n\\nAn attention function...',\n", - " 'nodes': [{'title': '3.2.1 Scaled Dot-Product Attention',\n", - " 'node_id': '0007',\n", - " 'page_index': 4,\n", - " 'summary': 'The text describes Scaled Dot-Product At...'},\n", - " {'title': '3.2.2 Multi-Head Attention',\n", - " 'node_id': '0008',\n", - " 'page_index': 4,\n", - " 'summary': 'The text describes Multi-Head Attention,...'},\n", - " {'title': '3.2.3 Applications of Attention in our M...',\n", - " 'node_id': '0009',\n", - " 'page_index': 5,\n", - " 'summary': 'The text describes the three application...'}]},\n", - " {'title': '3.3 Position-wise Feed-Forward Networks',\n", - " 'node_id': '0010',\n", - " 'page_index': 5,\n", - " 'summary': '### 3.3 Position-wise Feed-Forward Netwo...'},\n", - " {'title': '3.4 Embeddings and Softmax',\n", - " 'node_id': '0011',\n", - " 'page_index': 5,\n", - " 'summary': 'The text describes the use of learned em...'},\n", - " {'title': '3.5 Positional Encoding',\n", - " 'node_id': '0012',\n", - " 'page_index': 6,\n", - " 'summary': 'This section explains the necessity of p...'}]},\n", - " {'title': '4 Why Self-Attention',\n", - " 'node_id': '0013',\n", - " 'page_index': 6,\n", - " 'summary': 'This text compares self-attention layers...'},\n", - " {'title': '5 Training',\n", - " 'node_id': '0014',\n", - " 'page_index': 7,\n", - " 'prefix_summary': '## 5 Training\\n\\nThis section describes th...',\n", - " 'nodes': [{'title': '5.1 Training Data and Batching',\n", - " 'node_id': '0015',\n", - " 'page_index': 7,\n", - " 'summary': '### 5.1 Training Data and Batching\\n\\nWe t...'},\n", - " {'title': '5.2 Hardware and Schedule',\n", - " 'node_id': 
'0016',\n", - " 'page_index': 7,\n", - " 'summary': '### 5.2 Hardware and Schedule\\n\\nWe traine...'},\n", - " {'title': '5.3 Optimizer',\n", - " 'node_id': '0017',\n", - " 'page_index': 7,\n", - " 'summary': '### 5.3 Optimizer\\n\\nWe used the Adam opti...'},\n", - " {'title': '5.4 Regularization',\n", - " 'node_id': '0018',\n", - " 'page_index': 7,\n", - " 'summary': 'The text details three regularization te...'}]},\n", - " {'title': '6 Results',\n", - " 'node_id': '0019',\n", - " 'page_index': 8,\n", - " 'prefix_summary': '## 6 Results\\n',\n", - " 'nodes': [{'title': '6.1 Machine Translation',\n", - " 'node_id': '0020',\n", - " 'page_index': 8,\n", - " 'summary': 'The text details the performance of a Tr...'},\n", - " {'title': '6.2 Model Variations',\n", - " 'node_id': '0021',\n", - " 'page_index': 8,\n", - " 'summary': 'This text details experiments varying co...'},\n", - " {'title': '6.3 English Constituency Parsing',\n", - " 'node_id': '0022',\n", - " 'page_index': 9,\n", - " 'summary': 'The text describes experiments evaluatin...'}]},\n", - " {'title': '7 Conclusion',\n", - " 'node_id': '0023',\n", - " 'page_index': 10,\n", - " 'summary': 'This text concludes by presenting the Tr...'},\n", - " {'title': 'References',\n", - " 'node_id': '0024',\n", - " 'page_index': 10,\n", - " 'summary': 'The provided text is a collection of ref...'},\n", - " {'title': 'Attention Visualizations',\n", - " 'node_id': '0025',\n", - " 'page_index': 13,\n", - " 'summary': 'The text provides examples of attention ...'}]}]\n" - ] - } - ], - "source": [ - "if pi_client.is_retrieval_ready(doc_id):\n", - " tree = pi_client.get_tree(doc_id, node_summary=True)['result']\n", - " print('Simplified Tree Structure of the Document:')\n", - " utils.print_tree(tree, exclude_fields=['text'])\n", - "else:\n", - " print(\"Processing document, please try again later...\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "USoCLOiQcG1O" - }, - "source": [ - "## Step 2: 
Reasoning-Based Retrieval with Tree Search" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2.1 Reasoning-based retrieval with PageIndex to identify nodes that might contain relevant context" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "LLHNJAtTcG1O" - }, - "outputs": [], - "source": [ - "import json\n", - "\n", - "query = \"What is the last operation in the Scaled Dot-Product Attention figure?\"\n", - "\n", - "tree_without_text = utils.remove_fields(tree.copy(), fields=['text'])\n", - "\n", - "search_prompt = f\"\"\"\n", - "You are given a question and a tree structure of a document.\n", - "Each node contains a node id, node title, and a corresponding summary.\n", - "Your task is to find all tree nodes that are likely to contain the answer to the question.\n", - "\n", - "Question: {query}\n", - "\n", - "Document tree structure:\n", - "{json.dumps(tree_without_text, indent=2)}\n", - "\n", - "Please reply in the following JSON format:\n", - "{{\n", - " \"thinking\": \"\",\n", - " \"node_list\": [\"node_id_1\", \"node_id_2\", ..., \"node_id_n\"]\n", - "}}\n", - "Directly return the final JSON structure. Do not output anything else.\n", - "\"\"\"\n", - "\n", - "tree_search_result = await call_vlm(search_prompt)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2.2 Print retrieved nodes and reasoning process" - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - }, - "id": "P8DVUOuAen5u", - "outputId": "6bb6d052-ef30-4716-f88e-be98bcb7ebdb" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reasoning Process:\n", - "\n", - "The question asks about the last operation in the Scaled Dot-Product Attention figure. 
The most\n", - "relevant section is the one that describes Scaled Dot-Product Attention in detail, including its\n", - "computation and the figure itself. This is likely found in section 3.2.1 'Scaled Dot-Product\n", - "Attention' (node_id: 0007), which is a subsection of 3.2 'Attention' (node_id: 0006). The parent\n", - "section 3.2 may also contain the figure and its caption, as the summary mentions Figure 2 (which is\n", - "the Scaled Dot-Product Attention figure). Therefore, both node 0006 and node 0007 are likely to\n", - "contain the answer.\n", - "\n", - "Retrieved Nodes:\n", - "\n", - "Node ID: 0006\t Pages: 3-4\t Title: 3.2 Attention\n", - "Node ID: 0007\t Pages: 4\t Title: 3.2.1 Scaled Dot-Product Attention\n" - ] - } - ], - "source": [ - "node_map = utils.create_node_mapping(tree, include_page_ranges=True, max_page=total_pages)\n", - "tree_search_result_json = json.loads(tree_search_result)\n", - "\n", - "print('Reasoning Process:\\n')\n", - "utils.print_wrapped(tree_search_result_json['thinking'])\n", - "\n", - "print('\\nRetrieved Nodes:\\n')\n", - "for node_id in tree_search_result_json[\"node_list\"]:\n", - " node_info = node_map[node_id]\n", - " node = node_info['node']\n", - " start_page = node_info['start_index']\n", - " end_page = node_info['end_index']\n", - " page_range = start_page if start_page == end_page else f\"{start_page}-{end_page}\"\n", - " print(f\"Node ID: {node['node_id']}\\t Pages: {page_range}\\t Title: {node['title']}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2.3 Get corresponding PDF page images of retrieved nodes" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Retrieved 2 PDF page image(s) for visual context.\n" - ] - } - ], - "source": [ - "retrieved_nodes = tree_search_result_json[\"node_list\"]\n", - "retrieved_page_images = 
get_page_images_for_nodes(retrieved_nodes, node_map, page_images)\n", - "print(f'\\nRetrieved {len(retrieved_page_images)} PDF page image(s) for visual context.')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "10wOZDG_cG1O" - }, - "source": [ - "## Step 3: Answer Generation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 3.1 Generate answer using VLM with visual context" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 210 - }, - "id": "tcp_PhHzcG1O", - "outputId": "187ff116-9bb0-4ab4-bacb-13944460b5ff" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generated answer using VLM with retrieved PDF page images as visual context:\n", - "\n", - "The last operation in the **Scaled Dot-Product Attention** figure is a **MatMul** (matrix\n", - "multiplication). This operation multiplies the attention weights (after softmax) by the value matrix\n", - "\\( V \\).\n" - ] - } - ], - "source": [ - "# Generate answer using VLM with only PDF page images as visual context\n", - "answer_prompt = f\"\"\"\n", - "Answer the question based on the images of the document pages as context.\n", - "\n", - "Question: {query}\n", - "\n", - "Provide a clear, concise answer based only on the context provided.\n", - "\"\"\"\n", - "\n", - "print('Generated answer using VLM with retrieved PDF page images as visual context:\\n')\n", - "answer = await call_vlm(answer_prompt, retrieved_page_images)\n", - "utils.print_wrapped(answer)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Conclusion\n", - "\n", - "In this notebook, we demonstrated a *minimal* **vision-based, vectorless RAG pipeline** using PageIndex and a VLM. 
The system retrieves relevant pages by reasoning over the document’s hierarchical tree index and answers questions directly from PDF images — no OCR required.\n", - "\n", - "If you’re interested in building your own **reasoning-based document QA system**, try [PageIndex Chat](https://chat.pageindex.ai), or integrate via [PageIndex MCP](https://pageindex.ai/mcp) and the [API](https://docs.pageindex.ai/quickstart). You can also explore the [GitHub repo](https://github.com/VectifyAI/PageIndex) for open-source implementations and additional examples." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "© 2025 [Vectify AI](https://vectify.ai)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "TCh9BTedHJK1" + }, + "source": [ + "![pageindex_banner](https://pageindex.ai/static/images/pageindex_banner.jpg)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nD0hb4TFHWTt" + }, + "source": [ + "
\n", + "

Reasoning-based RAG  ◦  No Vector DB  ◦  No Chunking  ◦  Human-like Retrieval

\n", + "
\n", + "\n", + "
\n", + "

\n", + " 🏠 Homepage  •  \n", + " 💻 Chat  •  \n", + " 🔌 MCP  •  \n", + " 📚 API  •  \n", + " 📦 GitHub  •  \n", + " 💬 Discord  •  \n", + " ✉️ Contact \n", + "

\n", + "
\n", + "\n", + "
\n", + "\n", + "[![Star us on GitHub](https://img.shields.io/github/stars/VectifyAI/PageIndex?style=for-the-badge&logo=github&label=⭐️%20Star%20Us)](https://github.com/VectifyAI/PageIndex)    [![Follow us on X](https://img.shields.io/badge/Follow%20Us-000000?style=for-the-badge&logo=x&logoColor=white)](https://twitter.com/VectifyAI)\n", + "\n", + "
\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> Check out our blog post, \"[Do We Still Need OCR?](https://pageindex.ai/blog/do-we-need-ocr)\", for a more detailed discussion." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ebvn5qfpcG1K" + }, + "source": [ + "# A Vision-based, Vectorless RAG System for Long Documents\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In modern document question answering (QA) systems, Optical Character Recognition (OCR) serves an important role by converting PDF pages into text that can be processed by Large Language Models (LLMs). The resulting text can provide contextual input that enables LLMs to perform question answering over document content.\n", + "\n", + "Traditional OCR systems typically use a two-stage process that first detects the layout of a PDF — dividing it into text, tables, and images — and then recognizes and converts these elements into plain text. With the rise of vision-language models (VLMs) (such as [Qwen-VL](https://github.com/QwenLM/Qwen3-VL) and [GPT-4.1](https://openai.com/index/gpt-4-1/)), new end-to-end OCR models like [DeepSeek-OCR](https://github.com/deepseek-ai/DeepSeek-OCR) have emerged. These models jointly understand visual and textual information, enabling direct interpretation of PDFs without an explicit layout detection step.\n", + "\n", + "However, this paradigm shift raises an important question: \n", + "\n", + "\n", + "> **If a VLM can already process both the document images and the query to produce an answer directly, do we still need the intermediate OCR step?**\n", + "\n", + "In this notebook, we give a practical implementation of a vision-based question-answering system for long documents, without relying on OCR. 
Specifically, we use PageIndex as a reasoning-based retrieval layer and OpenAI's multimodal GPT-4.1 as the VLM for visual reasoning and answer generation.\n", + "\n", + "See the original [blog post](https://pageindex.ai/blog/do-we-need-ocr) for a more detailed discussion on how VLMs can replace traditional OCR pipelines in document question-answering." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 📝 Notebook Overview\n", + "\n", + "This notebook demonstrates a *minimal*, **vision-based vectorless RAG** pipeline for long documents with PageIndex, using only visual context from PDF pages. You will learn how to:\n", + "- [x] Build a PageIndex tree structure of a document\n", + "- [x] Perform reasoning-based retrieval with tree search\n", + "- [x] Extract PDF page images of retrieved tree nodes for visual context\n", + "- [x] Generate answers using VLM with PDF image inputs only (no OCR required)\n", + "\n", + "> ⚡ Note: This example uses PageIndex's reasoning-based retrieval with OpenAI's multimodal GPT-4.1 model for both tree search and visual context reasoning.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7ziuTbbWcG1L" + }, + "source": [ + "## Step 0: Preparation\n", + "\n", + "This notebook demonstrates **Vision-based RAG** with PageIndex, using PDF page images as visual context for retrieval and answer generation.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "edTfrizMFK4c" + }, + "source": [ + "#### 0.1 Install PageIndex" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "LaoB58wQFNDh" + }, + "outputs": [], + "source": [ + "%pip install -q --upgrade pageindex requests openai PyMuPDF" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WVEWzPKGcG1M" + }, + "source": [ + "#### 0.2 Setup PageIndex" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": 
"StvqfcK4cG1M" + }, + "outputs": [], + "source": [ + "from pageindex import PageIndexClient\n", + "import pageindex.utils as utils\n", + "\n", + "# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n", + "PAGEINDEX_API_KEY = \"YOUR_PAGEINDEX_API_KEY\"\n", + "pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 0.3 Setup VLM\n", + "\n", + "Choose your preferred VLM — in this notebook, we use OpenAI's multimodal GPT-4.1 as the VLM." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import openai\n", + "import fitz\n", + "import base64\n", + "import os\n", + "\n", + "# Setup OpenAI client\n", + "OPENAI_API_KEY = \"YOUR_OPENAI_API_KEY\"\n", + "\n", + "async def call_vlm(prompt, image_paths=None, model=\"gpt-4.1\"):\n", + " client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)\n", + " messages = [{\"role\": \"user\", \"content\": prompt}]\n", + " if image_paths:\n", + " content = [{\"type\": \"text\", \"text\": prompt}]\n", + " for image in image_paths:\n", + " if os.path.exists(image):\n", + " with open(image, \"rb\") as image_file:\n", + " image_data = base64.b64encode(image_file.read()).decode('utf-8')\n", + " content.append({\n", + " \"type\": \"image_url\",\n", + " \"image_url\": {\n", + " \"url\": f\"data:image/jpeg;base64,{image_data}\"\n", + " }\n", + " })\n", + " messages[0][\"content\"] = content\n", + " response = await client.chat.completions.create(model=model, messages=messages, temperature=0)\n", + " return response.choices[0].message.content.strip()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 0.4 PDF Image Extraction Helper Functions\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_pdf_page_images(pdf_path, output_dir=\"pdf_images\"):\n", + " os.makedirs(output_dir, 
exist_ok=True)\n", + " pdf_document = fitz.open(pdf_path)\n", + " page_images = {}\n", + " total_pages = len(pdf_document)\n", + " for page_number in range(len(pdf_document)):\n", + " page = pdf_document.load_page(page_number)\n", + " # Convert page to image\n", + " mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better quality\n", + " pix = page.get_pixmap(matrix=mat)\n", + " img_data = pix.tobytes(\"jpeg\")\n", + " image_path = os.path.join(output_dir, f\"page_{page_number + 1}.jpg\")\n", + " with open(image_path, \"wb\") as image_file:\n", + " image_file.write(img_data)\n", + " page_images[page_number + 1] = image_path\n", + " print(f\"Saved page {page_number + 1} image: {image_path}\")\n", + " pdf_document.close()\n", + " return page_images, total_pages\n", + "\n", + "def get_page_images_for_nodes(node_list, node_map, page_images):\n", + " # Get PDF page images for retrieved nodes\n", + " image_paths = []\n", + " seen_pages = set()\n", + " for node_id in node_list:\n", + " node_info = node_map[node_id]\n", + " for page_num in range(node_info['start_index'], node_info['end_index'] + 1):\n", + " if page_num not in seen_pages:\n", + " image_paths.append(page_images[page_num])\n", + " seen_pages.add(page_num)\n", + " return image_paths\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "heGtIMOVcG1N" + }, + "source": [ + "## Step 1: PageIndex Tree Generation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mzd1VWjwMUJL" + }, + "source": [ + "#### 1.1 Submit a document for generating PageIndex tree" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "f6--eZPLcG1N", + "outputId": "ca688cfd-6c4b-4a57-dac2-f3c2604c4112" + }, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "# You can also use our GitHub repo to generate PageIndex tree\n", + "# https://github.com/VectifyAI/PageIndex\n", + "\n", + "pdf_url = 
\"https://arxiv.org/pdf/1706.03762.pdf\" # the \"Attention Is All You Need\" paper\n", + "pdf_path = os.path.join(\"../data\", pdf_url.split('/')[-1])\n", + "os.makedirs(os.path.dirname(pdf_path), exist_ok=True)\n", + "\n", + "response = requests.get(pdf_url)\n", + "with open(pdf_path, \"wb\") as f:\n", + " f.write(response.content)\n", + "print(f\"Downloaded {pdf_url}\\n\")\n", + "\n", + "# Extract page images from PDF\n", + "print(\"Extracting page images...\")\n", + "page_images, total_pages = extract_pdf_page_images(pdf_path)\n", + "print(f\"Extracted {len(page_images)} page images from {total_pages} total pages.\\n\")\n", + "\n", + "doc_id = pi_client.submit_document(pdf_path)[\"doc_id\"]\n", + "print('Document Submitted:', doc_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4-Hrh0azcG1N" + }, + "source": [ + "#### 1.2 Get the generated PageIndex tree structure" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "b1Q1g6vrcG1O", + "outputId": "dc944660-38ad-47ea-d358-be422edbae53" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Simplified Tree Structure of the Document:\n", + "[{'title': 'Attention Is All You Need',\n", + " 'node_id': '0000',\n", + " 'page_index': 1,\n", + " 'prefix_summary': '# Attention Is All You Need\\n\\nAshish Vasw...',\n", + " 'nodes': [{'title': 'Abstract',\n", + " 'node_id': '0001',\n", + " 'page_index': 1,\n", + " 'summary': 'The text introduces the Transformer, a n...'},\n", + " {'title': '1 Introduction',\n", + " 'node_id': '0002',\n", + " 'page_index': 2,\n", + " 'summary': 'The text introduces the Transformer, a n...'},\n", + " {'title': '2 Background',\n", + " 'node_id': '0003',\n", + " 'page_index': 2,\n", + " 'summary': 'This section discusses the Transformer m...'},\n", + " {'title': '3 Model Architecture',\n", + " 'node_id': '0004',\n", + " 'page_index': 
2,\n", + " 'prefix_summary': 'The text describes the encoder-decoder a...',\n", + " 'nodes': [{'title': '3.1 Encoder and Decoder Stacks',\n", + " 'node_id': '0005',\n", + " 'page_index': 3,\n", + " 'summary': 'The text describes the encoder and decod...'},\n", + " {'title': '3.2 Attention',\n", + " 'node_id': '0006',\n", + " 'page_index': 3,\n", + " 'prefix_summary': '### 3.2 Attention\\n\\nAn attention function...',\n", + " 'nodes': [{'title': '3.2.1 Scaled Dot-Product Attention',\n", + " 'node_id': '0007',\n", + " 'page_index': 4,\n", + " 'summary': 'The text describes Scaled Dot-Product At...'},\n", + " {'title': '3.2.2 Multi-Head Attention',\n", + " 'node_id': '0008',\n", + " 'page_index': 4,\n", + " 'summary': 'The text describes Multi-Head Attention,...'},\n", + " {'title': '3.2.3 Applications of Attention in our M...',\n", + " 'node_id': '0009',\n", + " 'page_index': 5,\n", + " 'summary': 'The text describes the three application...'}]},\n", + " {'title': '3.3 Position-wise Feed-Forward Networks',\n", + " 'node_id': '0010',\n", + " 'page_index': 5,\n", + " 'summary': '### 3.3 Position-wise Feed-Forward Netwo...'},\n", + " {'title': '3.4 Embeddings and Softmax',\n", + " 'node_id': '0011',\n", + " 'page_index': 5,\n", + " 'summary': 'The text describes the use of learned em...'},\n", + " {'title': '3.5 Positional Encoding',\n", + " 'node_id': '0012',\n", + " 'page_index': 6,\n", + " 'summary': 'This section explains the necessity of p...'}]},\n", + " {'title': '4 Why Self-Attention',\n", + " 'node_id': '0013',\n", + " 'page_index': 6,\n", + " 'summary': 'This text compares self-attention layers...'},\n", + " {'title': '5 Training',\n", + " 'node_id': '0014',\n", + " 'page_index': 7,\n", + " 'prefix_summary': '## 5 Training\\n\\nThis section describes th...',\n", + " 'nodes': [{'title': '5.1 Training Data and Batching',\n", + " 'node_id': '0015',\n", + " 'page_index': 7,\n", + " 'summary': '### 5.1 Training Data and Batching\\n\\nWe t...'},\n", + " {'title': 
'5.2 Hardware and Schedule',\n", + " 'node_id': '0016',\n", + " 'page_index': 7,\n", + " 'summary': '### 5.2 Hardware and Schedule\\n\\nWe traine...'},\n", + " {'title': '5.3 Optimizer',\n", + " 'node_id': '0017',\n", + " 'page_index': 7,\n", + " 'summary': '### 5.3 Optimizer\\n\\nWe used the Adam opti...'},\n", + " {'title': '5.4 Regularization',\n", + " 'node_id': '0018',\n", + " 'page_index': 7,\n", + " 'summary': 'The text details three regularization te...'}]},\n", + " {'title': '6 Results',\n", + " 'node_id': '0019',\n", + " 'page_index': 8,\n", + " 'prefix_summary': '## 6 Results\\n',\n", + " 'nodes': [{'title': '6.1 Machine Translation',\n", + " 'node_id': '0020',\n", + " 'page_index': 8,\n", + " 'summary': 'The text details the performance of a Tr...'},\n", + " {'title': '6.2 Model Variations',\n", + " 'node_id': '0021',\n", + " 'page_index': 8,\n", + " 'summary': 'This text details experiments varying co...'},\n", + " {'title': '6.3 English Constituency Parsing',\n", + " 'node_id': '0022',\n", + " 'page_index': 9,\n", + " 'summary': 'The text describes experiments evaluatin...'}]},\n", + " {'title': '7 Conclusion',\n", + " 'node_id': '0023',\n", + " 'page_index': 10,\n", + " 'summary': 'This text concludes by presenting the Tr...'},\n", + " {'title': 'References',\n", + " 'node_id': '0024',\n", + " 'page_index': 10,\n", + " 'summary': 'The provided text is a collection of ref...'},\n", + " {'title': 'Attention Visualizations',\n", + " 'node_id': '0025',\n", + " 'page_index': 13,\n", + " 'summary': 'The text provides examples of attention ...'}]}]\n" + ] } - ], - "metadata": { + ], + "source": [ + "if pi_client.is_retrieval_ready(doc_id):\n", + " tree = pi_client.get_tree(doc_id, node_summary=True)['result']\n", + " print('Simplified Tree Structure of the Document:')\n", + " utils.print_tree(tree, exclude_fields=['text'])\n", + "else:\n", + " print(\"Processing document, please try again later...\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { 
+ "id": "USoCLOiQcG1O" + }, + "source": [ + "## Step 2: Reasoning-Based Retrieval with Tree Search" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.1 Reasoning-based retrieval with PageIndex to identify nodes that might contain relevant context" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LLHNJAtTcG1O" + }, + "outputs": [], + "source": [ + "import json\n", + "\n", + "query = \"What is the last operation in the Scaled Dot-Product Attention figure?\"\n", + "\n", + "tree_without_text = utils.remove_fields(tree.copy(), fields=['text'])\n", + "\n", + "search_prompt = f\"\"\"\n", + "You are given a question and a tree structure of a document.\n", + "Each node contains a node id, node title, and a corresponding summary.\n", + "Your task is to find all tree nodes that are likely to contain the answer to the question.\n", + "\n", + "Question: {query}\n", + "\n", + "Document tree structure:\n", + "{json.dumps(tree_without_text, indent=2)}\n", + "\n", + "Please reply in the following JSON format:\n", + "{{\n", + " \"thinking\": \"\",\n", + " \"node_list\": [\"node_id_1\", \"node_id_2\", ..., \"node_id_n\"]\n", + "}}\n", + "Directly return the final JSON structure. 
Do not output anything else.\n", + "\"\"\"\n", + "\n", + "tree_search_result = await call_vlm(search_prompt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.2 Print retrieved nodes and reasoning process" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": { "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "P8DVUOuAen5u", + "outputId": "6bb6d052-ef30-4716-f88e-be98bcb7ebdb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reasoning Process:\n", + "\n", + "The question asks about the last operation in the Scaled Dot-Product Attention figure. The most\n", + "relevant section is the one that describes Scaled Dot-Product Attention in detail, including its\n", + "computation and the figure itself. This is likely found in section 3.2.1 'Scaled Dot-Product\n", + "Attention' (node_id: 0007), which is a subsection of 3.2 'Attention' (node_id: 0006). The parent\n", + "section 3.2 may also contain the figure and its caption, as the summary mentions Figure 2 (which is\n", + "the Scaled Dot-Product Attention figure). 
Therefore, both node 0006 and node 0007 are likely to\n", + "contain the answer.\n", + "\n", + "Retrieved Nodes:\n", + "\n", + "Node ID: 0006\t Pages: 3-4\t Title: 3.2 Attention\n", + "Node ID: 0007\t Pages: 4\t Title: 3.2.1 Scaled Dot-Product Attention\n" + ] + } + ], + "source": [ + "node_map = utils.create_node_mapping(tree, include_page_ranges=True, max_page=total_pages)\n", + "tree_search_result_json = json.loads(tree_search_result)\n", + "\n", + "print('Reasoning Process:\\n')\n", + "utils.print_wrapped(tree_search_result_json['thinking'])\n", + "\n", + "print('\\nRetrieved Nodes:\\n')\n", + "for node_id in tree_search_result_json[\"node_list\"]:\n", + " node_info = node_map[node_id]\n", + " node = node_info['node']\n", + " start_page = node_info['start_index']\n", + " end_page = node_info['end_index']\n", + " page_range = start_page if start_page == end_page else f\"{start_page}-{end_page}\"\n", + " print(f\"Node ID: {node['node_id']}\\t Pages: {page_range}\\t Title: {node['title']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.3 Get corresponding PDF page images of retrieved nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Retrieved 2 PDF page image(s) for visual context.\n" + ] + } + ], + "source": [ + "retrieved_nodes = tree_search_result_json[\"node_list\"]\n", + "retrieved_page_images = get_page_images_for_nodes(retrieved_nodes, node_map, page_images)\n", + "print(f'\\nRetrieved {len(retrieved_page_images)} PDF page image(s) for visual context.')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "10wOZDG_cG1O" + }, + "source": [ + "## Step 3: Answer Generation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.1 Generate answer using VLM with visual context" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"colab": { + "base_uri": "https://localhost:8080/", + "height": 210 + }, + "id": "tcp_PhHzcG1O", + "outputId": "187ff116-9bb0-4ab4-bacb-13944460b5ff" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated answer using VLM with retrieved PDF page images as visual context:\n", + "\n", + "The last operation in the **Scaled Dot-Product Attention** figure is a **MatMul** (matrix\n", + "multiplication). This operation multiplies the attention weights (after softmax) by the value matrix\n", + "\\( V \\).\n" + ] } + ], + "source": [ + "# Generate answer using VLM with only PDF page images as visual context\n", + "answer_prompt = f\"\"\"\n", + "Answer the question based on the images of the document pages as context.\n", + "\n", + "Question: {query}\n", + "\n", + "Provide a clear, concise answer based only on the context provided.\n", + "\"\"\"\n", + "\n", + "print('Generated answer using VLM with retrieved PDF page images as visual context:\\n')\n", + "answer = await call_vlm(answer_prompt, retrieved_page_images)\n", + "utils.print_wrapped(answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "In this notebook, we demonstrated a *minimal* **vision-based, vectorless RAG pipeline** using PageIndex and a VLM. The system retrieves relevant pages by reasoning over the document’s hierarchical tree index and answers questions directly from PDF images — no OCR required.\n", + "\n", + "If you’re interested in building your own **reasoning-based document QA system**, try [PageIndex Chat](https://chat.pageindex.ai), or integrate via [PageIndex MCP](https://pageindex.ai/mcp) and the [API](https://docs.pageindex.ai/quickstart). You can also explore the [GitHub repo](https://github.com/VectifyAI/PageIndex) for open-source implementations and additional examples." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "© 2025 [Vectify AI](https://vectify.ai)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/examples/agentic_vectorless_rag_demo.py b/examples/agentic_vectorless_rag_demo.py index b4ed9c2f8..0722b574b 100644 --- a/examples/agentic_vectorless_rag_demo.py +++ b/examples/agentic_vectorless_rag_demo.py @@ -28,7 +28,6 @@ sys.path.insert(0, str(Path(__file__).parent.parent)) from agents import Agent, Runner, function_tool, set_tracing_disabled -from agents.model_settings import ModelSettings from agents.stream_events import RawResponsesStreamEvent, RunItemStreamEvent from openai.types.responses import ResponseTextDeltaEvent, ResponseReasoningSummaryTextDeltaEvent @@ -83,7 +82,6 @@ def get_page_content(pages: str) -> str: instructions=AGENT_SYSTEM_PROMPT, tools=[get_document, get_document_structure, get_page_content], model=client.retrieve_model, - # model_settings=ModelSettings(reasoning={"effort": "low", "summary": "auto"}), # Uncomment to enable reasoning ) async def _run(): diff --git a/pageindex/__init__.py b/pageindex/__init__.py index 658003bf5..59cf6704d 100644 --- a/pageindex/__init__.py +++ b/pageindex/__init__.py @@ -1,4 +1,14 @@ -from .page_index import * +from .page_index import page_index, page_index_main from .page_index_md import md_to_tree from .retrieve import get_document, get_document_structure, get_page_content from .client import PageIndexClient + +__all__ = [ + "page_index", + "page_index_main", + 
"md_to_tree", + "get_document", + "get_document_structure", + "get_page_content", + "PageIndexClient", +] diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 9004309fb..77ddfcbc2 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -1,12 +1,34 @@ -import os -import json +import asyncio import copy +import json import math +import os import random import re -from .utils import * -import os -from concurrent.futures import ThreadPoolExecutor, as_completed +from io import BytesIO + +from .utils import ( + ConfigLoader, + JsonLogger, + add_node_text, + add_preface_if_needed, + convert_page_to_int, + convert_physical_index_to_int, + count_tokens, + create_clean_structure_for_description, + extract_json, + format_structure, + generate_doc_description, + generate_summaries_for_structure, + get_json_content, + get_page_tokens, + get_pdf_name, + llm_acompletion, + llm_completion, + post_processing, + remove_structure_text, + write_node_id, +) ################### check title in page ######################################################### @@ -123,15 +145,15 @@ def toc_detector_single_page(content, model=None): def check_if_toc_extraction_is_complete(content, toc, model=None): - prompt = f""" + prompt = """ You are given a partial document and a table of contents. Your job is to check if the table of contents is complete, which it contains all the main sections in the partial document. Reply format: - {{ + { "thinking": "completed": "yes" or "no" - }} + } Directly return the final JSON structure. Do not output anything else.""" prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc @@ -141,15 +163,15 @@ def check_if_toc_extraction_is_complete(content, toc, model=None): def check_if_toc_transformation_is_complete(content, toc, model=None): - prompt = f""" + prompt = """ You are given a raw table of contents and a table of contents. Your job is to check if the table of contents is complete. 
Reply format: - {{ + { "thinking": "completed": "yes" or "no" - }} + } Directly return the final JSON structure. Do not output anything else.""" prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc @@ -175,7 +197,7 @@ def extract_toc_content(content, model=None): {"role": "user", "content": prompt}, {"role": "assistant", "content": response}, ] - prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure""" + prompt = """please continue the generation of table of contents , directly output the remaining part of the structure""" new_response, finish_reason = llm_completion(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True) response = response + new_response if_complete = check_if_toc_transformation_is_complete(content, response, model) @@ -186,13 +208,13 @@ def extract_toc_content(content, model=None): while not (if_complete == "yes" and finish_reason == "finished"): attempt += 1 if attempt > max_attempts: - raise Exception('Failed to complete table of contents after maximum retries') + raise RuntimeError('Failed to complete table of contents after maximum retries') chat_history = [ {"role": "user", "content": prompt}, {"role": "assistant", "content": response}, ] - prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure""" + prompt = """please continue the generation of table of contents , directly output the remaining part of the structure""" new_response, finish_reason = llm_completion(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True) response = response + new_response if_complete = check_if_toc_transformation_is_complete(content, response, model) @@ -305,7 +327,7 @@ def toc_transformer(toc_content, model=None): while not (if_complete == "yes" and finish_reason == "finished"): attempt += 1 if attempt > max_attempts: - raise 
Exception('Failed to complete toc transformation after maximum retries') + raise RuntimeError('Failed to complete toc transformation after maximum retries') position = last_complete.rfind('}') if position != -1: last_complete = last_complete[:position+2] @@ -368,7 +390,7 @@ def find_toc_pages(start_page_index, page_list, opt, logger=None): def remove_page_number(data): if isinstance(data, dict): data.pop('page_number', None) - for key in list(data.keys()): + for key in data.keys(): if 'nodes' in key: remove_page_number(data[key]) elif isinstance(data, list): @@ -536,7 +558,7 @@ def generate_toc_continue(toc_content, part, model=None): if finish_reason == 'finished': return extract_json(response) else: - raise Exception(f'finish reason: {finish_reason}') + raise RuntimeError(f'finish reason: {finish_reason}') ### add verify completeness def generate_toc_init(part, model=None): @@ -571,7 +593,7 @@ def generate_toc_init(part, model=None): if finish_reason == 'finished': return extract_json(response) else: - raise Exception(f'finish reason: {finish_reason}') + raise RuntimeError(f'finish reason: {finish_reason}') def process_no_toc(page_list, start_index=1, model=None, logger=None): page_contents=[] @@ -594,7 +616,7 @@ def process_no_toc(page_list, start_index=1, model=None, logger=None): return toc_with_page_number -def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_index=1, model=None, logger=None): +def process_toc_no_page_numbers(toc_content, _toc_page_list, page_list, start_index=1, model=None, logger=None): page_contents=[] token_lengths=[] toc_content = toc_transformer(toc_content, model) @@ -843,7 +865,6 @@ async def process_and_check_item(incorrect_item): for item, result in zip(incorrect_results, results): if isinstance(result, Exception): print(f"Processing item {item} generated an exception: {result}") - continue results = [result for result in results if not isinstance(result, Exception)] # Update the toc_with_page_number with the 
fixed indices and check for any invalid results @@ -897,7 +918,7 @@ async def fix_incorrect_toc_with_retries(toc_with_page_number, page_list, incorr ################### verify toc ######################################################### -async def verify_toc(page_list, list_result, start_index=1, N=None, model=None): +async def verify_toc(page_list, list_result, start_index=1, n=None, model=None): print('start verify_toc') # Find the last non-None physical_index last_physical_index = None @@ -905,19 +926,19 @@ async def verify_toc(page_list, list_result, start_index=1, N=None, model=None): if item.get('physical_index') is not None: last_physical_index = item['physical_index'] break - + # Early return if we don't have valid physical indices if last_physical_index is None or last_physical_index < len(page_list)/2: return 0, [] - + # Determine which items to check - if N is None: + if n is None: print('check all items') sample_indices = range(0, len(list_result)) else: - N = min(N, len(list_result)) - print(f'check {N} items') - sample_indices = random.sample(range(0, len(list_result)), N) + n = min(n, len(list_result)) + print(f'check {n} items') + sample_indices = random.sample(range(0, len(list_result)), n) # Prepare items with their list indices indexed_sample_list = [] @@ -983,7 +1004,7 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N 'accuracy': accuracy, 'incorrect_results': incorrect_results }) - if accuracy == 1.0 and len(incorrect_results) == 0: + if accuracy >= 1.0 and len(incorrect_results) == 0: return toc_with_page_number if accuracy > 0.6 and len(incorrect_results) > 0: toc_with_page_number, incorrect_results = await fix_incorrect_toc_with_retries(toc_with_page_number, page_list, incorrect_results,start_index=start_index, max_attempts=3, model=opt.model, logger=logger) @@ -994,7 +1015,7 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N elif mode == 'process_toc_no_page_numbers': 
return await meta_processor(page_list, mode='process_no_toc', start_index=start_index, opt=opt, logger=logger) else: - raise Exception('Processing failed') + raise RuntimeError('Processing failed') async def process_large_node_recursively(node, page_list, opt=None, logger=None): @@ -1026,7 +1047,7 @@ async def process_large_node_recursively(node, page_list, opt=None, logger=None) return node -async def tree_parser(page_list, opt, doc=None, logger=None): +async def tree_parser(page_list, opt, _doc=None, logger=None): check_toc_result = check_toc(page_list, opt) logger.info(check_toc_result) @@ -1080,7 +1101,7 @@ def page_index_main(doc, opt=None): logger.info({'total_token': sum([page[1] for page in page_list])}) async def page_index_builder(): - structure = await tree_parser(page_list, opt, doc=doc, logger=logger) + structure = await tree_parser(page_list, opt, _doc=doc, logger=logger) if opt.if_add_node_id == 'yes': write_node_id(structure) if opt.if_add_node_text == 'yes': diff --git a/pageindex/page_index_md.py b/pageindex/page_index_md.py index 5a5971690..47da307ef 100644 --- a/pageindex/page_index_md.py +++ b/pageindex/page_index_md.py @@ -1,11 +1,33 @@ import asyncio import json -import re import os +import re +from pathlib import Path + try: - from .utils import * -except: - from utils import * + from .utils import ( + count_tokens, + create_clean_structure_for_description, + format_structure, + generate_doc_description, + generate_node_summary, + print_json, + print_toc, + structure_to_list, + write_node_id, + ) +except ImportError: + from utils import ( + count_tokens, + create_clean_structure_for_description, + format_structure, + generate_doc_description, + generate_node_summary, + print_json, + print_toc, + structure_to_list, + write_node_id, + ) async def get_node_summary(node, summary_token_threshold=200, model=None): node_text = node.get('text') @@ -241,34 +263,33 @@ def clean_tree_for_output(tree_nodes): async def md_to_tree(md_path, 
if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'): - with open(md_path, 'r', encoding='utf-8') as f: - markdown_content = f.read() + markdown_content = await asyncio.to_thread(Path(md_path).read_text, encoding='utf-8') line_count = markdown_content.count('\n') + 1 - print(f"Extracting nodes from markdown...") + print("Extracting nodes from markdown...") node_list, markdown_lines = extract_nodes_from_markdown(markdown_content) - print(f"Extracting text content from nodes...") + print("Extracting text content from nodes...") nodes_with_content = extract_node_text_content(node_list, markdown_lines) if if_thinning: nodes_with_content = update_node_list_with_text_token_count(nodes_with_content, model=model) - print(f"Thinning nodes...") + print("Thinning nodes...") nodes_with_content = tree_thinning_for_index(nodes_with_content, min_token_threshold, model=model) - print(f"Building tree from nodes...") + print("Building tree from nodes...") tree_structure = build_tree_from_nodes(nodes_with_content) if if_add_node_id == 'yes': write_node_id(tree_structure) - print(f"Formatting tree structure...") + print("Formatting tree structure...") if if_add_node_summary == 'yes': # Always include text for summary generation tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes']) - print(f"Generating summaries for each node...") + print("Generating summaries for each node...") tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model) if if_add_node_text == 'no': @@ -276,7 +297,7 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes']) if 
if_add_doc_description == 'yes': - print(f"Generating document description...") + print("Generating document description...") # Create a clean structure without unnecessary fields for description generation clean_structure = create_clean_structure_for_description(tree_structure) doc_description = generate_doc_description(clean_structure, model=model) diff --git a/pageindex/utils.py b/pageindex/utils.py index f00ccf3a7..4adddea0c 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -1,62 +1,71 @@ -import litellm +import asyncio +import copy +import json import logging import os +import re import textwrap -from datetime import datetime import time -import json -import PyPDF2 -import copy -import asyncio -import pymupdf +from datetime import datetime from io import BytesIO -from dotenv import load_dotenv -load_dotenv() -import logging -import yaml from pathlib import Path from types import SimpleNamespace as config +import litellm +import pymupdf +import pypdf +import yaml +from dotenv import load_dotenv + +load_dotenv() + # Backward compatibility: support CHATGPT_API_KEY as alias for OPENAI_API_KEY if not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"): os.environ["OPENAI_API_KEY"] = os.getenv("CHATGPT_API_KEY") litellm.drop_params = True + def count_tokens(text, model=None): if not text: return 0 return litellm.token_counter(model=model, text=text) +def _build_messages(prompt, chat_history=None): + base = [{"role": "user", "content": prompt}] + return list(chat_history) + base if chat_history else base + + +def _extract_completion_result(response, return_finish_reason): + content = response.choices[0].message.content + if return_finish_reason: + finish_reason = ( + "max_output_reached" + if response.choices[0].finish_reason == "length" + else "finished" + ) + return content, finish_reason + return content + + def llm_completion(model, prompt, chat_history=None, return_finish_reason=False): if model: model = model.removeprefix("litellm/") + 
messages = _build_messages(prompt, chat_history) max_retries = 10 - messages = list(chat_history) + [{"role": "user", "content": prompt}] if chat_history else [{"role": "user", "content": prompt}] for i in range(max_retries): try: - response = litellm.completion( - model=model, - messages=messages, - temperature=0, - ) - content = response.choices[0].message.content - if return_finish_reason: - finish_reason = "max_output_reached" if response.choices[0].finish_reason == "length" else "finished" - return content, finish_reason - return content + response = litellm.completion(model=model, messages=messages, temperature=0) + return _extract_completion_result(response, return_finish_reason) except Exception as e: - print('************* Retrying *************') + print("************* Retrying *************") logging.error(f"Error: {e}") if i < max_retries - 1: time.sleep(1) else: - logging.error('Max retries reached for prompt: ' + prompt) - if return_finish_reason: - return "", "error" - return "" - + logging.error("Max retries reached for prompt: " + prompt) + return ("", "error") if return_finish_reason else "" async def llm_acompletion(model, prompt): @@ -73,81 +82,77 @@ async def llm_acompletion(model, prompt): ) return response.choices[0].message.content except Exception as e: - print('************* Retrying *************') + print("************* Retrying *************") logging.error(f"Error: {e}") if i < max_retries - 1: await asyncio.sleep(1) else: - logging.error('Max retries reached for prompt: ' + prompt) + logging.error("Max retries reached for prompt: " + prompt) return "" - - + + def get_json_content(response): start_idx = response.find("```json") if start_idx != -1: start_idx += 7 response = response[start_idx:] - + end_idx = response.rfind("```") if end_idx != -1: response = response[:end_idx] - + json_content = response.strip() return json_content - + def extract_json(content): try: - # First, try to extract JSON enclosed within ```json and ``` 
start_idx = content.find("```json") if start_idx != -1: - start_idx += 7 # Adjust index to start after the delimiter + start_idx += 7 end_idx = content.rfind("```") json_content = content[start_idx:end_idx].strip() else: - # If no delimiters, assume entire content could be JSON json_content = content.strip() - # Clean up common issues that might cause parsing errors - json_content = json_content.replace('None', 'null') # Replace Python None with JSON null - json_content = json_content.replace('\n', ' ').replace('\r', ' ') # Remove newlines - json_content = ' '.join(json_content.split()) # Normalize whitespace + json_content = json_content.replace("None", "null") + json_content = json_content.replace("\n", " ").replace("\r", " ") + json_content = " ".join(json_content.split()) - # Attempt to parse and return the JSON object return json.loads(json_content) except json.JSONDecodeError as e: logging.error(f"Failed to extract JSON: {e}") - # Try to clean up the content further if initial parsing fails try: - # Remove any trailing commas before closing brackets/braces - json_content = json_content.replace(',]', ']').replace(',}', '}') + json_content = json_content.replace(",]", "]").replace(",}", "}") return json.loads(json_content) - except: + except Exception: logging.error("Failed to parse JSON even after cleanup") return {} except Exception as e: logging.error(f"Unexpected error while extracting JSON: {e}") return {} + def write_node_id(data, node_id=0): if isinstance(data, dict): - data['node_id'] = str(node_id).zfill(4) + data["node_id"] = str(node_id).zfill(4) node_id += 1 - for key in list(data.keys()): - if 'nodes' in key: + for key in data.keys(): + if "nodes" in key: node_id = write_node_id(data[key], node_id) elif isinstance(data, list): for index in range(len(data)): node_id = write_node_id(data[index], node_id) return node_id + def get_nodes(structure): if isinstance(structure, dict): structure_node = copy.deepcopy(structure) - structure_node.pop('nodes', 
None) + structure_node.pop("nodes", None) nodes = [structure_node] - for key in list(structure.keys()): - if 'nodes' in key: + for key in structure.keys(): + if "nodes" in key: nodes.extend(get_nodes(structure[key])) return nodes elif isinstance(structure, list): @@ -155,13 +160,14 @@ def get_nodes(structure): for item in structure: nodes.extend(get_nodes(item)) return nodes - + + def structure_to_list(structure): if isinstance(structure, dict): nodes = [] nodes.append(structure) - if 'nodes' in structure: - nodes.extend(structure_to_list(structure['nodes'])) + if "nodes" in structure: + nodes.extend(structure_to_list(structure["nodes"])) return nodes elif isinstance(structure, list): nodes = [] @@ -169,17 +175,17 @@ def structure_to_list(structure): nodes.extend(structure_to_list(item)) return nodes - + def get_leaf_nodes(structure): if isinstance(structure, dict): - if not structure['nodes']: + if not structure["nodes"]: structure_node = copy.deepcopy(structure) - structure_node.pop('nodes', None) + structure_node.pop("nodes", None) return [structure_node] else: leaf_nodes = [] - for key in list(structure.keys()): - if 'nodes' in key: + for key in structure.keys(): + if "nodes" in key: leaf_nodes.extend(get_leaf_nodes(structure[key])) return leaf_nodes elif isinstance(structure, list): @@ -188,117 +194,108 @@ def get_leaf_nodes(structure): leaf_nodes.extend(get_leaf_nodes(item)) return leaf_nodes -def is_leaf_node(data, node_id): - # Helper function to find the node by its node_id - def find_node(data, node_id): - if isinstance(data, dict): - if data.get('node_id') == node_id: - return data - for key in data.keys(): - if 'nodes' in key: - result = find_node(data[key], node_id) - if result: - return result - elif isinstance(data, list): - for item in data: - result = find_node(item, node_id) + +def _find_node_by_id(data, node_id): + """Recursively search for a node with the given node_id.""" + if isinstance(data, dict): + if data.get("node_id") == node_id: + 
return data + for key in data.keys(): + if "nodes" in key: + result = _find_node_by_id(data[key], node_id) if result: return result - return None + elif isinstance(data, list): + for item in data: + result = _find_node_by_id(item, node_id) + if result: + return result + return None - # Find the node with the given node_id - node = find_node(data, node_id) - # Check if the node is a leaf node - if node and not node.get('nodes'): - return True - return False +def is_leaf_node(data, node_id): + node = _find_node_by_id(data, node_id) + return bool(node and not node.get("nodes")) + def get_last_node(structure): return structure[-1] def extract_text_from_pdf(pdf_path): - pdf_reader = PyPDF2.PdfReader(pdf_path) - ###return text not list - text="" + pdf_reader = pypdf.PdfReader(pdf_path) + text = "" for page_num in range(len(pdf_reader.pages)): page = pdf_reader.pages[page_num] - text+=page.extract_text() + text += page.extract_text() return text + def get_pdf_title(pdf_path): - pdf_reader = PyPDF2.PdfReader(pdf_path) + pdf_reader = pypdf.PdfReader(pdf_path) meta = pdf_reader.metadata - title = meta.title if meta and meta.title else 'Untitled' + title = meta.title if meta and meta.title else "Untitled" return title + def get_text_of_pages(pdf_path, start_page, end_page, tag=True): - pdf_reader = PyPDF2.PdfReader(pdf_path) + pdf_reader = pypdf.PdfReader(pdf_path) text = "" - for page_num in range(start_page-1, end_page): + for page_num in range(start_page - 1, end_page): page = pdf_reader.pages[page_num] page_text = page.extract_text() if tag: - text += f"\n{page_text}\n\n" + text += f"\n{page_text}\n\n" else: text += page_text return text + def get_first_start_page_from_text(text): start_page = -1 - start_page_match = re.search(r'', text) + start_page_match = re.search(r"", text) if start_page_match: start_page = int(start_page_match.group(1)) return start_page + def get_last_start_page_from_text(text): start_page = -1 - # Find all matches of start_index tags - 
start_page_matches = re.finditer(r'', text) - # Convert iterator to list and get the last match if any exist + start_page_matches = re.finditer(r"", text) matches_list = list(start_page_matches) if matches_list: start_page = int(matches_list[-1].group(1)) return start_page -def sanitize_filename(filename, replacement='-'): - # In Linux, only '/' and '\0' (null) are invalid in filenames. - # Null can't be represented in strings, so we only handle '/'. - return filename.replace('/', replacement) +def sanitize_filename(filename, replacement="-"): + return filename.replace("/", replacement) + def get_pdf_name(pdf_path): - # Extract PDF name if isinstance(pdf_path, str): pdf_name = os.path.basename(pdf_path) elif isinstance(pdf_path, BytesIO): - pdf_reader = PyPDF2.PdfReader(pdf_path) + pdf_reader = pypdf.PdfReader(pdf_path) meta = pdf_reader.metadata - pdf_name = meta.title if meta and meta.title else 'Untitled' + pdf_name = meta.title if meta and meta.title else "Untitled" pdf_name = sanitize_filename(pdf_name) return pdf_name class JsonLogger: def __init__(self, file_path): - # Extract PDF name for logger name pdf_name = get_pdf_name(file_path) - current_time = datetime.now().strftime("%Y%m%d_%H%M%S") self.filename = f"{pdf_name}_{current_time}.json" os.makedirs("./logs", exist_ok=True) - # Initialize empty list to store all messages self.log_data = [] - def log(self, level, message, **kwargs): + def log(self, _level, message, **kwargs): if isinstance(message, dict): self.log_data.append(message) else: - self.log_data.append({'message': message}) - # Add new message to the log data - - # Write entire log data to file + self.log_data.append({"message": message}) with open(self._filepath(), "w") as f: json.dump(self.log_data, f, indent=2) @@ -317,63 +314,53 @@ def exception(self, message, **kwargs): def _filepath(self): return os.path.join("logs", self.filename) - +def _clean_tree_node(node): + """Remove empty 'nodes' arrays from a tree node recursively.""" + if not 
node["nodes"]: + del node["nodes"] + else: + for child in node["nodes"]: + _clean_tree_node(child) + return node + def list_to_tree(data): def get_parent_structure(structure): - """Helper function to get the parent structure code""" if not structure: return None - parts = str(structure).split('.') - return '.'.join(parts[:-1]) if len(parts) > 1 else None - - # First pass: Create nodes and track parent-child relationships + parts = str(structure).split(".") + return ".".join(parts[:-1]) if len(parts) > 1 else None + nodes = {} root_nodes = [] - + for item in data: - structure = item.get('structure') + structure = item.get("structure") node = { - 'title': item.get('title'), - 'start_index': item.get('start_index'), - 'end_index': item.get('end_index'), - 'nodes': [] + "title": item.get("title"), + "start_index": item.get("start_index"), + "end_index": item.get("end_index"), + "nodes": [], } - nodes[structure] = node - - # Find parent parent_structure = get_parent_structure(structure) - if parent_structure: - # Add as child to parent if parent exists if parent_structure in nodes: - nodes[parent_structure]['nodes'].append(node) + nodes[parent_structure]["nodes"].append(node) else: root_nodes.append(node) else: - # No parent, this is a root node root_nodes.append(node) - - # Helper function to clean empty children arrays - def clean_node(node): - if not node['nodes']: - del node['nodes'] - else: - for child in node['nodes']: - clean_node(child) - return node - - # Clean and return the tree - return [clean_node(node) for node in root_nodes] + + return [_clean_tree_node(node) for node in root_nodes] + def add_preface_if_needed(data): if not isinstance(data, list) or not data: return data - - if data[0]['physical_index'] is not None and data[0]['physical_index'] > 1: + if data[0]["physical_index"] is not None and data[0]["physical_index"] > 1: preface_node = { "structure": "0", "title": "Preface", @@ -383,10 +370,9 @@ def add_preface_if_needed(data): return data - def 
get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"): if pdf_parser == "PyPDF2": - pdf_reader = PyPDF2.PdfReader(pdf_path) + pdf_reader = pypdf.PdfReader(pdf_path) page_list = [] for page_num in range(len(pdf_reader.pages)): page = pdf_reader.pages[page_num] @@ -396,9 +382,12 @@ def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"): return page_list elif pdf_parser == "PyMuPDF": if isinstance(pdf_path, BytesIO): - pdf_stream = pdf_path - doc = pymupdf.open(stream=pdf_stream, filetype="pdf") - elif isinstance(pdf_path, str) and os.path.isfile(pdf_path) and pdf_path.lower().endswith(".pdf"): + doc = pymupdf.open(stream=pdf_path, filetype="pdf") + elif ( + isinstance(pdf_path, str) + and os.path.isfile(pdf_path) + and pdf_path.lower().endswith(".pdf") + ): doc = pymupdf.open(pdf_path) page_list = [] for page in doc: @@ -409,73 +398,76 @@ def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"): else: raise ValueError(f"Unsupported PDF parser: {pdf_parser}") - def get_text_of_pdf_pages(pdf_pages, start_page, end_page): text = "" - for page_num in range(start_page-1, end_page): + for page_num in range(start_page - 1, end_page): text += pdf_pages[page_num][0] return text + def get_text_of_pdf_pages_with_labels(pdf_pages, start_page, end_page): text = "" - for page_num in range(start_page-1, end_page): - text += f"\n{pdf_pages[page_num][0]}\n\n" + for page_num in range(start_page - 1, end_page): + text += f"\n{pdf_pages[page_num][0]}\n\n" return text + def get_number_of_pages(pdf_path): - pdf_reader = PyPDF2.PdfReader(pdf_path) + pdf_reader = pypdf.PdfReader(pdf_path) num = len(pdf_reader.pages) return num - def post_processing(structure, end_physical_index): - # First convert page_number to start_index in flat list for i, item in enumerate(structure): - item['start_index'] = item.get('physical_index') + item["start_index"] = item.get("physical_index") if i < len(structure) - 1: - if structure[i + 1].get('appear_start') == 'yes': - item['end_index'] = 
structure[i + 1]['physical_index']-1 + if structure[i + 1].get("appear_start") == "yes": + item["end_index"] = structure[i + 1]["physical_index"] - 1 else: - item['end_index'] = structure[i + 1]['physical_index'] + item["end_index"] = structure[i + 1]["physical_index"] else: - item['end_index'] = end_physical_index + item["end_index"] = end_physical_index tree = list_to_tree(structure) - if len(tree)!=0: + if len(tree) != 0: return tree else: - ### remove appear_start for node in structure: - node.pop('appear_start', None) - node.pop('physical_index', None) + node.pop("appear_start", None) + node.pop("physical_index", None) return structure + def clean_structure_post(data): if isinstance(data, dict): - data.pop('page_number', None) - data.pop('start_index', None) - data.pop('end_index', None) - if 'nodes' in data: - clean_structure_post(data['nodes']) + data.pop("page_number", None) + data.pop("start_index", None) + data.pop("end_index", None) + if "nodes" in data: + clean_structure_post(data["nodes"]) elif isinstance(data, list): for section in data: clean_structure_post(section) return data -def remove_fields(data, fields=['text']): + +def remove_fields(data, fields=None): + if fields is None: + fields = ["text"] if isinstance(data, dict): - return {k: remove_fields(v, fields) - for k, v in data.items() if k not in fields} + return {k: remove_fields(v, fields) for k, v in data.items() if k not in fields} elif isinstance(data, list): return [remove_fields(item, fields) for item in data] return data + def print_toc(tree, indent=0): for node in tree: - print(' ' * indent + node['title']) - if node.get('nodes'): - print_toc(node['nodes'], indent + 1) + print(" " * indent + node["title"]) + if node.get("nodes"): + print_toc(node["nodes"], indent + 1) + def print_json(data, max_len=40, indent=2): def simplify_data(obj): @@ -484,19 +476,19 @@ def simplify_data(obj): elif isinstance(obj, list): return [simplify_data(item) for item in obj] elif isinstance(obj, str) and 
len(obj) > max_len: - return obj[:max_len] + '...' + return obj[:max_len] + "..." else: return obj - + simplified = simplify_data(data) print(json.dumps(simplified, indent=indent, ensure_ascii=False)) def remove_structure_text(data): if isinstance(data, dict): - data.pop('text', None) - if 'nodes' in data: - remove_structure_text(data['nodes']) + data.pop("text", None) + if "nodes" in data: + remove_structure_text(data["nodes"]) elif isinstance(data, list): for item in data: remove_structure_text(item) @@ -504,82 +496,79 @@ def remove_structure_text(data): def check_token_limit(structure, limit=110000): - list = structure_to_list(structure) - for node in list: - num_tokens = count_tokens(node['text'], model=None) + nodes = structure_to_list(structure) + for node in nodes: + num_tokens = count_tokens(node["text"], model=None) if num_tokens > limit: print(f"Node ID: {node['node_id']} has {num_tokens} tokens") - print("Start Index:", node['start_index']) - print("End Index:", node['end_index']) - print("Title:", node['title']) + print("Start Index:", node["start_index"]) + print("End Index:", node["end_index"]) + print("Title:", node["title"]) print("\n") +def _parse_physical_index_str(value): + """Parse a physical_index string tag into an integer, or return None.""" + if value.startswith("").strip()) + if value.startswith("physical_index_"): + return int(value.split("_")[-1].strip()) + return None + + def convert_physical_index_to_int(data): if isinstance(data, list): for i in range(len(data)): - # Check if item is a dictionary and has 'physical_index' key - if isinstance(data[i], dict) and 'physical_index' in data[i]: - if isinstance(data[i]['physical_index'], str): - if data[i]['physical_index'].startswith('').strip()) - elif data[i]['physical_index'].startswith('physical_index_'): - data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].strip()) + if isinstance(data[i], dict) and "physical_index" in data[i]: + if 
isinstance(data[i]["physical_index"], str): + parsed = _parse_physical_index_str(data[i]["physical_index"]) + if parsed is not None: + data[i]["physical_index"] = parsed elif isinstance(data, str): - if data.startswith('').strip()) - elif data.startswith('physical_index_'): - data = int(data.split('_')[-1].strip()) - # Check data is int - if isinstance(data, int): - return data - else: - return None + parsed = _parse_physical_index_str(data) + return parsed if parsed is not None else (data if isinstance(data, int) else None) return data def convert_page_to_int(data): for item in data: - if 'page' in item and isinstance(item['page'], str): + if "page" in item and isinstance(item["page"], str): try: - item['page'] = int(item['page']) + item["page"] = int(item["page"]) except ValueError: - # Keep original value if conversion fails pass return data def add_node_text(node, pdf_pages): if isinstance(node, dict): - start_page = node.get('start_index') - end_page = node.get('end_index') - node['text'] = get_text_of_pdf_pages(pdf_pages, start_page, end_page) - if 'nodes' in node: - add_node_text(node['nodes'], pdf_pages) + start_page = node.get("start_index") + end_page = node.get("end_index") + node["text"] = get_text_of_pdf_pages(pdf_pages, start_page, end_page) + if "nodes" in node: + add_node_text(node["nodes"], pdf_pages) elif isinstance(node, list): for index in range(len(node)): add_node_text(node[index], pdf_pages) - return def add_node_text_with_labels(node, pdf_pages): if isinstance(node, dict): - start_page = node.get('start_index') - end_page = node.get('end_index') - node['text'] = get_text_of_pdf_pages_with_labels(pdf_pages, start_page, end_page) - if 'nodes' in node: - add_node_text_with_labels(node['nodes'], pdf_pages) + start_page = node.get("start_index") + end_page = node.get("end_index") + node["text"] = get_text_of_pdf_pages_with_labels(pdf_pages, start_page, end_page) + if "nodes" in node: + add_node_text_with_labels(node["nodes"], pdf_pages) elif 
isinstance(node, list): for index in range(len(node)): add_node_text_with_labels(node[index], pdf_pages) - return async def generate_node_summary(node, model=None): prompt = f"""You are given a part of a document, your task is to generate a description of the partial document about what are main points covered in the partial document. Partial Document Text: {node['text']} - + Directly return the description, do not include any other text. """ response = await llm_acompletion(model, prompt) @@ -590,9 +579,9 @@ async def generate_summaries_for_structure(structure, model=None): nodes = structure_to_list(structure) tasks = [generate_node_summary(node, model=model) for node in nodes] summaries = await asyncio.gather(*tasks) - + for node, summary in zip(nodes, summaries): - node['summary'] = summary + node["summary"] = summary return structure @@ -603,15 +592,11 @@ def create_clean_structure_for_description(structure): """ if isinstance(structure, dict): clean_node = {} - # Only include essential fields for description - for key in ['title', 'node_id', 'summary', 'prefix_summary']: + for key in ["title", "node_id", "summary", "prefix_summary"]: if key in structure: clean_node[key] = structure[key] - - # Recursively process child nodes - if 'nodes' in structure and structure['nodes']: - clean_node['nodes'] = create_clean_structure_for_description(structure['nodes']) - + if "nodes" in structure and structure["nodes"]: + clean_node["nodes"] = create_clean_structure_for_description(structure["nodes"]) return clean_node elif isinstance(structure, list): return [create_clean_structure_for_description(item) for item in structure] @@ -622,9 +607,9 @@ def create_clean_structure_for_description(structure): def generate_doc_description(structure, model=None): prompt = f"""Your are an expert in generating descriptions for a document. You are given a structure of a document. 
Your task is to generate a one-sentence description for the document, which makes it easy to distinguish the document from other documents. - + Document Structure: {structure} - + Directly return the description, do not include any other text. """ response = llm_completion(model, prompt) @@ -641,10 +626,10 @@ def format_structure(structure, order=None): if not order: return structure if isinstance(structure, dict): - if 'nodes' in structure: - structure['nodes'] = format_structure(structure['nodes'], order) - if not structure.get('nodes'): - structure.pop('nodes', None) + if "nodes" in structure: + structure["nodes"] = format_structure(structure["nodes"], order) + if not structure.get("nodes"): + structure.pop("nodes", None) structure = reorder_dict(structure, order) elif isinstance(structure, list): structure = [format_structure(item, order) for item in structure] @@ -668,9 +653,7 @@ def _validate_keys(self, user_dict): raise ValueError(f"Unknown config keys: {unknown_keys}") def load(self, user_opt=None) -> config: - """ - Load the configuration, merging user options with default values. 
- """ + """Load configuration, merging user options with default values.""" if user_opt is None: user_dict = {} elif isinstance(user_opt, config): @@ -684,27 +667,34 @@ def load(self, user_opt=None) -> config: merged = {**self._default_dict, **user_dict} return config(**merged) + def create_node_mapping(tree): """Create a flat dict mapping node_id to node for quick lookup.""" mapping = {} + def _traverse(nodes): for node in nodes: - if node.get('node_id'): - mapping[node['node_id']] = node - if node.get('nodes'): - _traverse(node['nodes']) + if node.get("node_id"): + mapping[node["node_id"]] = node + if node.get("nodes"): + _traverse(node["nodes"]) + _traverse(tree) return mapping + def print_tree(tree, indent=0): for node in tree: - summary = node.get('summary') or node.get('prefix_summary', '') + summary = node.get("summary") or node.get("prefix_summary", "") summary_str = f" — {summary[:60]}..." if summary else "" - print(' ' * indent + f"[{node.get('node_id', '?')}] {node.get('title', '')}{summary_str}") - if node.get('nodes'): - print_tree(node['nodes'], indent + 1) + print( + " " * indent + + f"[{node.get('node_id', '?')}] {node.get('title', '')}{summary_str}" + ) + if node.get("nodes"): + print_tree(node["nodes"], indent + 1) + def print_wrapped(text, width=100): for line in text.splitlines(): print(textwrap.fill(line, width=width)) - diff --git a/requirements.txt b/requirements.txt index 613e92161..67690014e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ litellm==1.82.0 # openai-agents # optional: required for examples/agentic_vectorless_rag_demo.py pymupdf==1.26.4 -PyPDF2==3.0.1 +pypdf>=4.0.0 python-dotenv==1.1.0 pyyaml==6.0.2 diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 000000000..bbd135bc1 --- /dev/null +++ b/ruff.toml @@ -0,0 +1 @@ +exclude = [".claude"] diff --git a/run_pageindex.py b/run_pageindex.py index 673439d89..661c58303 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -1,7 +1,8 @@ import argparse 
-import os import json -from pageindex import * +import os + +from pageindex import page_index_main from pageindex.page_index_md import md_to_tree from pageindex.utils import ConfigLoader