Skip to content

Commit 2ab26a6

Browse files
Implement agent functionality with Morphik and OpenAI integration; add tools for document retrieval and processing. Update .gitignore to include response.md and remove hello.py.
1 parent 39fb2b9 commit 2ab26a6

4 files changed

Lines changed: 350 additions & 13 deletions

File tree

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,4 +11,6 @@ wheels/
1111

1212
.env
1313

14-
files/*
14+
files/*
15+
16+
response.md

agent.py

Lines changed: 96 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,96 @@
1+
import json
2+
import os
3+
4+
from dotenv import load_dotenv
5+
from morphik import Morphik
6+
from openai import OpenAI
7+
8+
from tools import build_tools, run_tool_call
9+
10+
# Load environment variables (MORPHIK_URI, OPENAI_API_KEY, OPENAI_MODEL are
# read below) before any client is constructed.
load_dotenv()

# Module-level clients shared by every run_agent() call.
morphik = Morphik(uri=os.environ.get("MORPHIK_URI"))
openai = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Model name sent with each Responses API request; OPENAI_MODEL overrides it.
MODEL = os.environ.get("OPENAI_MODEL", "gpt-4.1")

# Instructions sent with every request; they name each tool exposed by
# tools.build_tools so the model knows when to use which one.
SYSTEM_INSTRUCTIONS = (
    "You are a helpful assistant with access to Morphik retrieval tools and the "
    "built-in code interpreter. Use retrieve_chunks for semantic search, "
    "get_page_range for page/chunk ranges, list_documents to browse documents, "
    "and load_file_for_execution to load files for Python analysis. After loading "
    "a file, use its returned filename in the code interpreter."
)
23+
24+
25+
def _json_dumps(value: object) -> str:
26+
return json.dumps(value, default=str, ensure_ascii=True)
27+
28+
29+
def _collect_function_calls(response) -> list:
30+
return [item for item in response.output if item.type == "function_call"]
31+
32+
33+
def run_agent(query: str, max_turns: int = 50) -> str:
    """Answer *query*, running tool calls until the model stops requesting them.

    Each turn sends the model's pending function calls to ``run_tool_call``
    and feeds the JSON-encoded results back as ``function_call_output``
    items chained onto the previous response.

    Args:
        query: The user's question.
        max_turns: Maximum number of tool round-trips before giving up, so a
            model that keeps requesting tools cannot loop (and bill) forever.

    Returns:
        The ``output_text`` of the first response that contains no function
        calls.

    Raises:
        RuntimeError: If the model is still requesting tools after
            ``max_turns`` round-trips.
    """
    # Per-run state shared with the tools: uploaded file ids (handed to the
    # code interpreter container) and a document-id -> file-info cache.
    state = {"file_ids": set(), "loaded_files": {}}

    response = openai.responses.create(
        model=MODEL,
        instructions=SYSTEM_INSTRUCTIONS,
        input=[{"role": "user", "content": query}],
        tools=build_tools(state["file_ids"]),
    )

    turns = 0
    while True:
        tool_calls = _collect_function_calls(response)
        if not tool_calls:
            return response.output_text

        if turns >= max_turns:
            raise RuntimeError(
                f"Agent did not finish within {max_turns} tool round-trips"
            )
        turns += 1

        tool_outputs = []
        for call in tool_calls:
            try:
                args = json.loads(call.arguments or "{}")
            except json.JSONDecodeError as exc:
                # Tell the model its arguments were unparseable rather than
                # running the tool with empty input and a misleading error.
                output = _json_dumps(
                    {"error": f"Invalid JSON arguments for {call.name}: {exc}"}
                )
            else:
                try:
                    if not isinstance(args, dict):
                        raise TypeError("Tool arguments must be a JSON object")
                    result = run_tool_call(
                        call.name,
                        args,
                        morphik=morphik,
                        openai_client=openai,
                        state=state,
                    )
                    output = _json_dumps(result)
                except Exception as exc:
                    # Deliberately broad: any tool failure is reported back to
                    # the model so it can retry or explain, not crash the run.
                    output = _json_dumps({"error": str(exc)})

            tool_outputs.append(
                {
                    "type": "function_call_output",
                    "call_id": call.call_id,
                    "output": output,
                }
            )

        # Tools are rebuilt every turn because load_file_for_execution may
        # have added file ids to state during this turn.
        response = openai.responses.create(
            model=MODEL,
            instructions=SYSTEM_INSTRUCTIONS,
            input=tool_outputs,
            previous_response_id=response.id,
            tools=build_tools(state["file_ids"]),
        )
81+
82+
83+
def main() -> None:
    """Prompt for a query, run the agent, and save the answer to response.md.

    A blank (whitespace-only) query prints a notice and exits without
    calling the agent.
    """
    user_query = input("Query: ").strip()
    if user_query:
        answer = run_agent(user_query)
        with open("response.md", "w", encoding="utf-8") as out_file:
            out_file.write(answer)
        print("Response saved to response.md")
    else:
        print("No query provided.")


if __name__ == "__main__":
    main()

hello.py

Lines changed: 0 additions & 12 deletions
This file was deleted.

tools.py

Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
from __future__ import annotations
2+
3+
import io
4+
from typing import Any, Dict, List, Sequence
5+
6+
from morphik import Morphik
7+
from openai import OpenAI
8+
9+
# Output format requested from Morphik when extracting document pages.
DEFAULT_PAGE_OUTPUT_FORMAT = "url"
10+
# Output format requested from Morphik when retrieving or batch-fetching chunks.
DEFAULT_CHUNK_OUTPUT_FORMAT = "url"
11+
12+
13+
def build_tools(file_ids: Sequence[str]) -> List[Dict[str, Any]]:
    """Build the tool definitions passed to the model.

    Returns four function tools (retrieve_chunks, get_page_range,
    list_documents, load_file_for_execution) followed by a code_interpreter
    tool whose container is given a copy of *file_ids*.
    """

    def prop(kind: str, description: str, **extra: Any) -> Dict[str, Any]:
        # One JSON-schema property; extras (minimum, enum) follow the basics.
        return {"type": kind, "description": description, **extra}

    def function_tool(
        name: str,
        description: str,
        properties: Dict[str, Any],
        required: Any = None,
    ) -> Dict[str, Any]:
        # One function-tool entry; "required" is omitted when nothing is.
        schema: Dict[str, Any] = {"type": "object", "properties": properties}
        if required is not None:
            schema["required"] = required
        return {
            "type": "function",
            "name": name,
            "description": description,
            "parameters": schema,
        }

    retrieve = function_tool(
        "retrieve_chunks",
        "Retrieve relevant chunks from Morphik using ColPali mode. "
        "Only provide a search query and the number of chunks to fetch.",
        {
            "query": prop("string", "Search query text."),
            "k": prop("integer", "Number of chunks to retrieve.", minimum=1),
        },
        ["query"],
    )

    page_range = function_tool(
        "get_page_range",
        "Get pages or chunks within a specific range. Provide document_id and either "
        "start_page/end_page for page images, or start_chunk/end_chunk for chunk text.",
        {
            "document_id": prop("string", "Morphik document external ID."),
            "start_page": prop("integer", "Start page number (1-indexed)."),
            "end_page": prop("integer", "End page number (1-indexed)."),
            "start_chunk": prop("integer", "Start chunk number (1-indexed)."),
            "end_chunk": prop("integer", "End chunk number (1-indexed)."),
        },
        ["document_id"],
    )

    listing = function_tool(
        "list_documents",
        "List documents available in Morphik.",
        {
            "skip": prop("integer", "Number of documents to skip.", minimum=0),
            "limit": prop("integer", "Maximum number of documents to return.", minimum=1),
            "completed_only": prop("boolean", "Only return completed documents."),
            "sort_by": prop(
                "string",
                "Field to sort by.",
                enum=["created_at", "updated_at", "filename", "external_id"],
            ),
            "sort_direction": prop("string", "Sort direction.", enum=["asc", "desc"]),
        },
    )

    loader = function_tool(
        "load_file_for_execution",
        "Load a Morphik document into the code execution environment. "
        "Provide the document external ID.",
        {
            "document_external_id": prop("string", "Morphik document external ID."),
        },
        ["document_external_id"],
    )

    interpreter: Dict[str, Any] = {
        "type": "code_interpreter",
        "container": {
            "type": "auto",
            "memory_limit": "4g",
            "file_ids": list(file_ids),
        },
    }

    return [retrieve, page_range, listing, loader, interpreter]
97+
98+
99+
def run_tool_call(
    name: str,
    arguments: Dict[str, Any],
    *,
    morphik: Morphik,
    openai_client: OpenAI,
    state: Dict[str, Any],
) -> Dict[str, Any]:
    """Dispatch a model-requested tool call to its implementation.

    Args:
        name: Tool name as declared in ``build_tools``.
        arguments: Decoded arguments for the tool.
        morphik: Morphik client handed to every tool.
        openai_client: OpenAI client, used only by load_file_for_execution.
        state: Per-run state, used only by load_file_for_execution.

    Returns:
        The dictionary produced by the selected tool.

    Raises:
        ValueError: If *name* is not one of the known tools.
    """
    # Lambdas defer both the call and the lookup of the handler function
    # until the matching tool has actually been selected.
    handlers = {
        "retrieve_chunks": lambda: _retrieve_chunks(morphik, arguments),
        "get_page_range": lambda: _get_page_range(morphik, arguments),
        "list_documents": lambda: _list_documents(morphik, arguments),
        "load_file_for_execution": lambda: _load_file_for_execution(
            morphik, openai_client, arguments, state
        ),
    }
    handler = handlers.get(name)
    if handler is None:
        raise ValueError(f"Unknown tool: {name}")
    return handler()
116+
117+
118+
def _retrieve_chunks(morphik: Morphik, arguments: Dict[str, Any]) -> Dict[str, Any]:
119+
query = arguments.get("query")
120+
if not query:
121+
raise ValueError("query is required")
122+
k = int(arguments.get("k") or 4)
123+
chunks = morphik.retrieve_chunks(
124+
query=query,
125+
k=k,
126+
use_colpali=True,
127+
output_format=DEFAULT_CHUNK_OUTPUT_FORMAT,
128+
)
129+
return {"query": query, "k": k, "chunks": [_serialize_chunk(chunk) for chunk in chunks]}
130+
131+
132+
def _get_page_range(morphik: Morphik, arguments: Dict[str, Any]) -> Dict[str, Any]:
133+
document_id = arguments.get("document_id")
134+
if not document_id:
135+
raise ValueError("document_id is required")
136+
start_page = arguments.get("start_page")
137+
end_page = arguments.get("end_page")
138+
start_chunk = arguments.get("start_chunk")
139+
end_chunk = arguments.get("end_chunk")
140+
141+
if start_page is not None and end_page is not None:
142+
pages = morphik.extract_document_pages(
143+
document_id=document_id,
144+
start_page=int(start_page),
145+
end_page=int(end_page),
146+
output_format=DEFAULT_PAGE_OUTPUT_FORMAT,
147+
)
148+
return {"type": "pages", **pages.model_dump()}
149+
150+
if start_chunk is not None and end_chunk is not None:
151+
start_chunk = int(start_chunk)
152+
end_chunk = int(end_chunk)
153+
if end_chunk < start_chunk:
154+
raise ValueError("end_chunk must be >= start_chunk")
155+
sources = [
156+
{"document_id": document_id, "chunk_number": chunk_number}
157+
for chunk_number in range(start_chunk, end_chunk + 1)
158+
]
159+
chunks = morphik.batch_get_chunks(
160+
sources=sources,
161+
use_colpali=True,
162+
output_format=DEFAULT_CHUNK_OUTPUT_FORMAT,
163+
)
164+
return {
165+
"type": "chunks",
166+
"document_id": document_id,
167+
"start_chunk": start_chunk,
168+
"end_chunk": end_chunk,
169+
"chunks": [_serialize_chunk(chunk) for chunk in chunks],
170+
}
171+
172+
raise ValueError("Provide start_page/end_page or start_chunk/end_chunk")
173+
174+
175+
def _list_documents(morphik: Morphik, arguments: Dict[str, Any]) -> Dict[str, Any]:
176+
skip = int(arguments.get("skip") or 0)
177+
limit = int(arguments.get("limit") or 100)
178+
completed_only = arguments.get("completed_only", False)
179+
if isinstance(completed_only, str):
180+
completed_only = completed_only.lower() == "true"
181+
sort_by = arguments.get("sort_by", "updated_at")
182+
sort_direction = arguments.get("sort_direction", "desc")
183+
response = morphik.list_documents(
184+
skip=skip,
185+
limit=limit,
186+
completed_only=completed_only,
187+
sort_by=sort_by,
188+
sort_direction=sort_direction,
189+
)
190+
return response.model_dump()
191+
192+
193+
def _load_file_for_execution(
194+
morphik: Morphik,
195+
openai_client: OpenAI,
196+
arguments: Dict[str, Any],
197+
state: Dict[str, Any],
198+
) -> Dict[str, Any]:
199+
document_id = arguments.get("document_external_id")
200+
if not document_id:
201+
raise ValueError("document_external_id is required")
202+
203+
loaded_files = state.setdefault("loaded_files", {})
204+
if document_id in loaded_files:
205+
return {
206+
"document_id": document_id,
207+
"file_id": loaded_files[document_id]["file_id"],
208+
"filename": loaded_files[document_id]["filename"],
209+
"status": "already_loaded",
210+
}
211+
212+
document = morphik.get_document(document_id)
213+
filename = document.filename or f"{document_id}"
214+
file_bytes = morphik.get_document_file(document_id)
215+
216+
file_buffer = io.BytesIO(file_bytes)
217+
file_buffer.seek(0)
218+
file_obj = openai_client.files.create(
219+
file=(filename, file_buffer),
220+
purpose="assistants",
221+
)
222+
223+
state.setdefault("file_ids", set()).add(file_obj.id)
224+
loaded_files[document_id] = {"file_id": file_obj.id, "filename": filename}
225+
226+
return {
227+
"document_id": document_id,
228+
"file_id": file_obj.id,
229+
"filename": filename,
230+
"status": "loaded",
231+
}
232+
233+
234+
def _serialize_chunk(chunk: Any) -> Dict[str, Any]:
235+
content = chunk.content
236+
if not isinstance(content, str):
237+
if hasattr(content, "size"):
238+
content = f"<image size={getattr(content, 'size', '')}>"
239+
else:
240+
content = str(content)
241+
242+
return {
243+
"document_id": chunk.document_id,
244+
"chunk_number": chunk.chunk_number,
245+
"score": chunk.score,
246+
"content": content,
247+
"metadata": chunk.metadata,
248+
"content_type": chunk.content_type,
249+
"filename": chunk.filename,
250+
"download_url": chunk.download_url,
251+
}

0 commit comments

Comments
 (0)