Skip to content

Commit b63fd97

Browse files
feat: add PageIndex SDK with local/cloud dual-mode support (#207)
1 parent 8f1ed77 commit b63fd97

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+4227
-276
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,7 @@ __pycache__
44
.env*
55
.venv/
66
logs/
7+
pageindex.egg-info/
8+
*.db
9+
venv/
10+
uv.lock

examples/cloud_demo.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
"""
2+
Agentic Vectorless RAG with PageIndex SDK - Cloud Demo
3+
4+
Uses CloudClient for fully-managed document indexing and QA.
5+
No LLM API key needed — the cloud service handles everything.
6+
7+
Steps:
8+
1 — Upload and index a PDF via PageIndex cloud
9+
2 — Stream a question with tool call visibility
10+
11+
Requirements:
12+
pip install pageindex
13+
export PAGEINDEX_API_KEY=your-api-key
14+
"""
15+
import asyncio
16+
import os
17+
from pathlib import Path
18+
import requests
19+
from pageindex import CloudClient
20+
21+
_EXAMPLES_DIR = Path(__file__).parent
22+
PDF_URL = "https://arxiv.org/pdf/1706.03762.pdf"
23+
PDF_PATH = _EXAMPLES_DIR / "documents" / "attention.pdf"
24+
25+
# Download PDF if needed
if not PDF_PATH.exists():
    print(f"Downloading {PDF_URL} ...")
    PDF_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Stream into a sibling ".part" file and rename only after the transfer
    # finished. Writing straight to PDF_PATH would leave a truncated PDF
    # behind on a network error, and the exists() check above would then
    # accept that broken file on every later run.
    partial_path = PDF_PATH.with_name(PDF_PATH.name + ".part")
    try:
        with requests.get(PDF_URL, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        partial_path.replace(PDF_PATH)
    finally:
        # Nothing to do after a successful rename; on failure this removes
        # the incomplete temp file.
        if partial_path.exists():
            partial_path.unlink()
    print("Download complete.\n")
36+
37+
client = CloudClient(api_key=os.environ["PAGEINDEX_API_KEY"])
38+
col = client.collection()
39+
40+
doc_id = col.add(str(PDF_PATH))
41+
print(f"Indexed: {doc_id}\n")
42+
43+
# Streaming query
44+
stream = col.query("What is the main contribution of this paper?", stream=True)
45+
46+
async def main():
    """Consume the module-level ``stream`` and echo its events to stdout.

    Answer deltas are printed inline without a newline. Each tool call is
    printed on its own line as ``[tool call] name(args)``; if answer text is
    currently mid-line, a newline is emitted first.
    """
    # True while answer text has been printed without a terminating newline.
    mid_line = False
    async for item in stream:
        kind = item.type
        if kind == "tool_call":
            if mid_line:
                print()
                mid_line = False
            call_args = item.data.get("args", "")
            print(f"[tool call] {item.data['name']}({call_args})")
        elif kind == "answer_done":
            # Terminate the line that the answer deltas were written on.
            print()
            mid_line = False
        elif kind == "answer_delta":
            print(item.data, end="", flush=True)
            mid_line = True
61+
62+
asyncio.run(main())

examples/local_demo.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""
2+
Agentic Vectorless RAG with PageIndex SDK - Local Demo
3+
4+
A simple example of using LocalClient for self-hosted document indexing
5+
and agent-based QA. The agent uses OpenAI Agents SDK to reason over
6+
the document's tree structure index.
7+
8+
Steps:
9+
1 — Download and index a PDF
10+
2 — Stream a question with tool call visibility
11+
12+
Requirements:
13+
pip install pageindex
14+
export OPENAI_API_KEY=your-api-key # or any LiteLLM-supported provider
15+
"""
16+
import asyncio
17+
from pathlib import Path
18+
import requests
19+
from pageindex import LocalClient
20+
21+
_EXAMPLES_DIR = Path(__file__).parent
22+
PDF_URL = "https://arxiv.org/pdf/1706.03762.pdf"
23+
PDF_PATH = _EXAMPLES_DIR / "documents" / "attention.pdf"
24+
WORKSPACE = _EXAMPLES_DIR / "workspace"
25+
MODEL = "gpt-4o-2024-11-20" # any LiteLLM-supported model
26+
27+
# Download PDF if needed
if not PDF_PATH.exists():
    print(f"Downloading {PDF_URL} ...")
    PDF_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Stream into a sibling ".part" file and rename only after the transfer
    # finished. Writing straight to PDF_PATH would leave a truncated PDF
    # behind on a network error, and the exists() check above would then
    # accept that broken file on every later run.
    partial_path = PDF_PATH.with_name(PDF_PATH.name + ".part")
    try:
        with requests.get(PDF_URL, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        partial_path.replace(PDF_PATH)
    finally:
        # Nothing to do after a successful rename; on failure this removes
        # the incomplete temp file.
        if partial_path.exists():
            partial_path.unlink()
    print("Download complete.\n")
38+
39+
client = LocalClient(model=MODEL, storage_path=str(WORKSPACE))
40+
col = client.collection()
41+
42+
doc_id = col.add(str(PDF_PATH))
43+
print(f"Indexed: {doc_id}\n")
44+
45+
# Streaming query
46+
stream = col.query(
47+
"What is the main architecture proposed in this paper and how does self-attention work?",
48+
stream=True,
49+
)
50+
51+
async def main():
    """Consume the module-level ``stream`` and echo its events to stdout.

    Answer deltas are printed inline without a newline. Tool calls and tool
    outputs are printed on bracketed lines; a tool output whose string form
    exceeds 200 characters is shown as its first 200 characters plus "...".
    """
    # True while answer text has been printed without a terminating newline.
    mid_line = False
    async for item in stream:
        kind = item.type
        if kind == "answer_delta":
            print(item.data, end="", flush=True)
            mid_line = True
        elif kind == "tool_call":
            if mid_line:
                print()
                mid_line = False
            print(f"[tool call] {item.data['name']}")
        elif kind == "tool_result":
            rendered = str(item.data)
            if len(rendered) > 200:
                preview = rendered[:200] + "..."
            else:
                preview = item.data
            print(f"[tool output] {preview}")
        elif kind == "answer_done":
            # Terminate the line that the answer deltas were written on.
            print()
            mid_line = False
68+
69+
asyncio.run(main())

pageindex/__init__.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,40 @@
1+
# pageindex/__init__.py
2+
# Upstream exports (backward compatibility)
13
from .page_index import *
24
from .page_index_md import md_to_tree
35
from .retrieve import get_document, get_document_structure, get_page_content
4-
from .client import PageIndexClient
6+
7+
# SDK exports
8+
from .client import PageIndexClient, LocalClient, CloudClient
9+
from .config import IndexConfig
10+
from .collection import Collection
11+
from .parser.protocol import ContentNode, ParsedDocument, DocumentParser
12+
from .storage.protocol import StorageEngine
13+
from .events import QueryEvent
14+
from .errors import (
15+
PageIndexError,
16+
CollectionNotFoundError,
17+
DocumentNotFoundError,
18+
IndexingError,
19+
CloudAPIError,
20+
FileTypeError,
21+
)
22+
23+
# Public names for `from pageindex import *`. Defining `__all__` restricts a
# star-import to exactly this list, so the pre-SDK helpers imported above are
# listed as well; otherwise existing star-import callers would lose them.
# NOTE(review): names that arrive via `from .page_index import *` are not
# enumerated here — confirm whether they must stay star-importable too.
__all__ = [
    # Upstream exports (backward compatibility)
    "md_to_tree",
    "get_document",
    "get_document_structure",
    "get_page_content",
    # SDK exports
    "PageIndexClient",
    "LocalClient",
    "CloudClient",
    "IndexConfig",
    "Collection",
    "ContentNode",
    "ParsedDocument",
    "DocumentParser",
    "StorageEngine",
    "QueryEvent",
    "PageIndexError",
    "CollectionNotFoundError",
    "DocumentNotFoundError",
    "IndexingError",
    "CloudAPIError",
    "FileTypeError",
]

pageindex/agent.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# pageindex/agent.py
2+
from __future__ import annotations
3+
from typing import AsyncIterator
4+
from .events import QueryEvent
5+
from .backend.protocol import AgentTools
6+
7+
8+
SYSTEM_PROMPT = """
9+
You are PageIndex, a document QA assistant.
10+
TOOL USE:
11+
- Call list_documents() to see available documents.
12+
- Call get_document(doc_id) to confirm status and page/line count.
13+
- Call get_document_structure(doc_id) to identify relevant page ranges.
14+
- Call get_page_content(doc_id, pages="5-7") with tight ranges; never fetch the whole document.
15+
- Before each tool call, output one short sentence explaining the reason.
16+
IMAGES:
17+
- Page content may contain image references like ![image](path). Always preserve these in your answer so the downstream UI can render them.
18+
- Place images near the relevant context in your answer.
19+
Answer based only on tool output. Be concise.
20+
"""
21+
22+
23+
class QueryStream:
    """Streaming query result, similar to OpenAI's RunResultStreaming.

    The agent is built once in the constructor (with parallel tool calls
    disabled). Every ``async for`` over the instance calls
    ``stream_events()``, which starts a new streamed run of that agent.

    Usage:
        stream = col.query("question", stream=True)
        async for event in stream:
            if event.type == "answer_delta":
                print(event.data, end="", flush=True)
    """

    def __init__(self, tools: AgentTools, question: str, model: str = None):
        # Imported lazily so the agents SDK is only needed when a stream is built.
        from agents import Agent
        from agents.model_settings import ModelSettings

        settings = ModelSettings(parallel_tool_calls=False)
        self._agent = Agent(
            name="PageIndex",
            instructions=SYSTEM_PROMPT,
            tools=tools.function_tools,
            mcp_servers=tools.mcp_servers,
            model=model,
            model_settings=settings,
        )
        self._question = question

    async def stream_events(self) -> AsyncIterator[QueryEvent]:
        """Run the agent on the stored question and yield QueryEvent objects.

        Emitted event types:
            "answer_delta" -- data is one text delta of the answer.
            "tool_call"    -- data is {"name": ..., "args": ...}.
            "tool_result"  -- data is the tool output converted with str().
            "answer_done"  -- data is the text of a completed message item
                              (only yielded when that text is non-empty).
        """
        from agents import Runner, ItemHelpers
        from agents.stream_events import RawResponsesStreamEvent, RunItemStreamEvent
        from openai.types.responses import ResponseTextDeltaEvent

        run = Runner.run_streamed(self._agent, self._question)
        async for ev in run.stream_events():
            if isinstance(ev, RawResponsesStreamEvent):
                payload = ev.data
                if isinstance(payload, ResponseTextDeltaEvent):
                    yield QueryEvent(type="answer_delta", data=payload.delta)
                continue

            if not isinstance(ev, RunItemStreamEvent):
                continue

            run_item = ev.item
            kind = run_item.type
            if kind == "tool_call_item":
                call = run_item.raw_item
                # Raw items without an "arguments" attribute fall back to "{}".
                call_info = {
                    "name": call.name, "args": getattr(call, "arguments", "{}"),
                }
                yield QueryEvent(type="tool_call", data=call_info)
            elif kind == "tool_call_output_item":
                yield QueryEvent(type="tool_result", data=str(run_item.output))
            elif kind == "message_output_item":
                answer = ItemHelpers.text_message_output(run_item)
                if answer:
                    yield QueryEvent(type="answer_done", data=answer)

    def __aiter__(self):
        # Delegates to a fresh async generator on every call.
        return self.stream_events()
73+
74+
75+
class AgentRunner:
    """Synchronous, non-streaming way to ask the PageIndex agent a question.

    Stores the tool set and model name; a new agent is built on each
    ``run()`` call.
    """

    def __init__(self, tools: AgentTools, model: str = None):
        self._model = model
        self._tools = tools

    def run(self, question: str) -> str:
        """Sync non-streaming query. Returns answer string.

        NOTE(review): this goes through ``Runner.run_sync``; check the agents
        SDK documentation for its restrictions when an event loop is already
        running (e.g. inside async code or a notebook).
        """
        from agents import Agent, Runner
        from agents.model_settings import ModelSettings

        toolset = self._tools
        settings = ModelSettings(parallel_tool_calls=False)
        pageindex_agent = Agent(
            name="PageIndex",
            instructions=SYSTEM_PROMPT,
            tools=toolset.function_tools,
            mcp_servers=toolset.mcp_servers,
            model=self._model,
            model_settings=settings,
        )
        return Runner.run_sync(pageindex_agent, question).final_output

pageindex/backend/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)