Skip to content

Commit f218675

Browse files
authored
Add new SDK methods and enhance documentation (#17)
1 parent f483a4f commit f218675

16 files changed

Lines changed: 1020 additions & 5 deletions

docs.json

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,9 +162,11 @@
162162
"group": "Document Retrieval",
163163
"pages": [
164164
"python-sdk/retrieve_chunks",
165+
"python-sdk/retrieve_chunks_grouped",
165166
"python-sdk/retrieve_docs",
166167
"python-sdk/query",
167168
"python-sdk/list_documents",
169+
"python-sdk/search_documents",
168170
"python-sdk/get_document",
169171
"python-sdk/get_document_by_filename"
170172
]
@@ -176,6 +178,8 @@
176178
"python-sdk/create_folder",
177179
"python-sdk/list_folders",
178180
"python-sdk/get_folder",
181+
"python-sdk/get_folders_summary",
182+
"python-sdk/get_folders_details",
179183
"python-sdk/add_document_to_folder",
180184
"python-sdk/remove_document_from_folder",
181185
"python-sdk/delete_folder",
@@ -204,9 +208,12 @@
204208
"group": "Knowledge Graph Operations",
205209
"pages": [
206210
"python-sdk/create_graph",
211+
"python-sdk/update_graph",
207212
"python-sdk/get_graph",
208213
"python-sdk/list_graphs",
209214
"python-sdk/get_graph_visualization",
215+
"python-sdk/get_graph_status",
216+
"python-sdk/wait_for_graph_completion",
210217
"python-sdk/check_workflow_status"
211218
]
212219
},
@@ -220,7 +227,12 @@
220227
{
221228
"group": "Document Management",
222229
"pages": [
223-
"python-sdk/get_document_download_url"
230+
"python-sdk/get_document_file",
231+
"python-sdk/extract_document_pages",
232+
"python-sdk/get_document_download_url",
233+
"python-sdk/get_document_status",
234+
"python-sdk/delete_document",
235+
"python-sdk/delete_document_by_filename"
224236
]
225237
},
226238
{

python-sdk/batch_get_chunks.mdx

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,32 @@ description: "Retrieve specific chunks by their document ID and chunk number"
66
<Tabs>
77
<Tab title="Sync">
88
```python
9-
def batch_get_chunks(sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]
9+
def batch_get_chunks(
10+
sources: List[Union[ChunkSource, Dict[str, Any]]],
11+
folder_name: Optional[Union[str, List[str]]] = None,
12+
use_colpali: bool = True,
13+
output_format: Optional[str] = None,
14+
) -> List[FinalChunkResult]
1015
```
1116
</Tab>
1217
<Tab title="Async">
1318
```python
14-
async def batch_get_chunks(sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]
19+
async def batch_get_chunks(
20+
sources: List[Union[ChunkSource, Dict[str, Any]]],
21+
folder_name: Optional[Union[str, List[str]]] = None,
22+
use_colpali: bool = True,
23+
output_format: Optional[str] = None,
24+
) -> List[FinalChunkResult]
1525
```
1626
</Tab>
1727
</Tabs>
1828

1929
## Parameters
2030

2131
- `sources` (List[Union[ChunkSource, Dict[str, Any]]]): List of `ChunkSource` objects or dictionaries with `document_id` and `chunk_number` keys
32+
- `folder_name` (str | List[str], optional): Optional folder scope. Accepts a single folder name or a list of folder names.
33+
- `use_colpali` (bool, optional): Whether to request multimodal chunks when available. Defaults to True.
34+
- `output_format` (str, optional): Controls how image chunks are returned. Set to `"url"` to receive presigned URLs; omit or set to `"base64"` (default) to receive base64 content.
2235

2336
## Returns
2437

python-sdk/batch_get_documents.mdx

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,26 @@ description: "Retrieve multiple documents by their IDs in a single batch operati
66
<Tabs>
77
<Tab title="Sync">
88
```python
9-
def batch_get_documents(document_ids: List[str]) -> List[Document]
9+
def batch_get_documents(
10+
document_ids: List[str],
11+
folder_name: Optional[Union[str, List[str]]] = None,
12+
) -> List[Document]
1013
```
1114
</Tab>
1215
<Tab title="Async">
1316
```python
14-
async def batch_get_documents(document_ids: List[str]) -> List[Document]
17+
async def batch_get_documents(
18+
document_ids: List[str],
19+
folder_name: Optional[Union[str, List[str]]] = None,
20+
) -> List[Document]
1521
```
1622
</Tab>
1723
</Tabs>
1824

1925
## Parameters
2026

2127
- `document_ids` (List[str]): List of document IDs to retrieve
28+
- `folder_name` (str | List[str], optional): Optional folder scope. Accepts a single folder name or a list of folder names.
2229

2330
## Returns
2431

python-sdk/extract_document_pages.mdx

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
---
2+
title: "extract_document_pages"
3+
description: "Extract specific pages from a document"
4+
---
5+
6+
<Tabs>
7+
<Tab title="Sync">
8+
```python
9+
def extract_document_pages(
10+
document_id: str,
11+
start_page: int,
12+
end_page: int,
13+
) -> DocumentPagesResponse
14+
```
15+
</Tab>
16+
<Tab title="Async">
17+
```python
18+
async def extract_document_pages(
19+
document_id: str,
20+
start_page: int,
21+
end_page: int,
22+
) -> DocumentPagesResponse
23+
```
24+
</Tab>
25+
</Tabs>
26+
27+
## Parameters
28+
29+
- `document_id` (str): ID of the document to extract pages from
30+
- `start_page` (int): Starting page number (1-indexed)
31+
- `end_page` (int): Ending page number (1-indexed)
32+
33+
## Returns
34+
35+
- `DocumentPagesResponse`: Object containing extracted pages with metadata
36+
37+
## Examples
38+
39+
<Tabs>
40+
<Tab title="Sync">
41+
```python
42+
from morphik import Morphik
43+
44+
db = Morphik()
45+
46+
# Extract pages 1-3 from a document
47+
response = db.extract_document_pages(
48+
document_id="doc_123abc",
49+
start_page=1,
50+
end_page=3,
51+
)
52+
53+
print(f"Document ID: {response.document_id}")
54+
print(f"Extracted pages {response.start_page}-{response.end_page}")
55+
print(f"Total pages in document: {response.total_pages}")
56+
print(f"Number of pages extracted: {len(response.pages)}")
57+
58+
# Pages are base64 encoded
59+
for i, page_content in enumerate(response.pages):
60+
print(f"Page {response.start_page + i}: {len(page_content)} chars")
61+
```
62+
</Tab>
63+
<Tab title="Async">
64+
```python
65+
from morphik import AsyncMorphik
66+
67+
async with AsyncMorphik() as db:
68+
# Extract pages 1-3 from a document
69+
response = await db.extract_document_pages(
70+
document_id="doc_123abc",
71+
start_page=1,
72+
end_page=3,
73+
)
74+
75+
print(f"Document ID: {response.document_id}")
76+
print(f"Extracted pages {response.start_page}-{response.end_page}")
77+
print(f"Total pages in document: {response.total_pages}")
78+
print(f"Number of pages extracted: {len(response.pages)}")
79+
80+
# Pages are base64 encoded
81+
for i, page_content in enumerate(response.pages):
82+
print(f"Page {response.start_page + i}: {len(page_content)} chars")
83+
```
84+
</Tab>
85+
</Tabs>
86+
87+
## DocumentPagesResponse Properties
88+
89+
The `DocumentPagesResponse` object has the following properties:
90+
91+
- `document_id` (str): ID of the document
92+
- `pages` (List[str]): List of page contents as base64-encoded strings
93+
- `start_page` (int): Start page number (1-indexed)
94+
- `end_page` (int): End page number (1-indexed)
95+
- `total_pages` (int): Total number of pages in the document
96+
97+
## Notes
98+
99+
- Page numbers are 1-indexed (first page is 1, not 0).
100+
- The `pages` list contains a base64-encoded representation of each page.
101+
- Useful for extracting specific sections of large documents.

python-sdk/get_document_file.mdx

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
---
2+
title: "get_document_file"
3+
description: "Download the raw file content of a document"
4+
---
5+
6+
<Tabs>
7+
<Tab title="Sync">
8+
```python
9+
def get_document_file(
10+
document_id: str,
11+
) -> bytes
12+
```
13+
</Tab>
14+
<Tab title="Async">
15+
```python
16+
async def get_document_file(
17+
document_id: str,
18+
) -> bytes
19+
```
20+
</Tab>
21+
</Tabs>
22+
23+
## Parameters
24+
25+
- `document_id` (str): ID of the document to download
26+
27+
## Returns
28+
29+
- `bytes`: Raw file content as bytes
30+
31+
## Examples
32+
33+
<Tabs>
34+
<Tab title="Sync">
35+
```python
36+
from morphik import Morphik
37+
38+
db = Morphik()
39+
40+
# Download a document's raw file
41+
doc_id = "doc_123abc"
42+
file_content = db.get_document_file(doc_id)
43+
44+
# Save to local file
45+
with open("downloaded_file.pdf", "wb") as f:
46+
f.write(file_content)
47+
48+
print(f"Downloaded {len(file_content)} bytes")
49+
```
50+
</Tab>
51+
<Tab title="Async">
52+
```python
53+
from morphik import AsyncMorphik
54+
import aiofiles
55+
56+
async with AsyncMorphik() as db:
57+
# Download a document's raw file
58+
doc_id = "doc_123abc"
59+
file_content = await db.get_document_file(doc_id)
60+
61+
# Save to local file
62+
async with aiofiles.open("downloaded_file.pdf", "wb") as f:
63+
await f.write(file_content)
64+
65+
print(f"Downloaded {len(file_content)} bytes")
66+
```
67+
</Tab>
68+
</Tabs>
69+
70+
## Notes
71+
72+
- This method returns the raw file bytes, which you can save to disk or process in memory.
73+
- To get a downloadable URL instead of raw bytes, use [`get_document_download_url`](./get_document_download_url).
74+
- The returned bytes match the original file that was uploaded/ingested.

python-sdk/get_document_status.mdx

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
---
2+
title: "get_document_status"
3+
description: "Get the current processing status of a document"
4+
---
5+
6+
<Tabs>
7+
<Tab title="Sync">
8+
```python
9+
def get_document_status(
10+
document_id: str,
11+
) -> Dict[str, Any]
12+
```
13+
</Tab>
14+
<Tab title="Async">
15+
```python
16+
async def get_document_status(
17+
document_id: str,
18+
) -> Dict[str, Any]
19+
```
20+
</Tab>
21+
</Tabs>
22+
23+
## Parameters
24+
25+
- `document_id` (str): ID of the document to check
26+
27+
## Returns
28+
29+
- `Dict[str, Any]`: Status information including current status, potential errors, and other metadata
30+
31+
## Examples
32+
33+
<Tabs>
34+
<Tab title="Sync">
35+
```python
36+
from morphik import Morphik
37+
38+
db = Morphik()
39+
40+
# Check document processing status
41+
status = db.get_document_status("doc_123abc")
42+
43+
print(f"Status: {status.get('status')}")
44+
if status.get('error'):
45+
print(f"Error: {status.get('error')}")
46+
47+
# Use in a polling loop
48+
import time
49+
50+
while True:
51+
status = db.get_document_status("doc_123abc")
52+
if status.get('status') == 'completed':
53+
print("Document processing complete!")
54+
break
55+
elif status.get('status') == 'failed':
56+
print(f"Document processing failed: {status.get('error')}")
57+
break
58+
time.sleep(2)
59+
```
60+
</Tab>
61+
<Tab title="Async">
62+
```python
63+
from morphik import AsyncMorphik
64+
import asyncio
65+
66+
async with AsyncMorphik() as db:
67+
# Check document processing status
68+
status = await db.get_document_status("doc_123abc")
69+
70+
print(f"Status: {status.get('status')}")
71+
if status.get('error'):
72+
print(f"Error: {status.get('error')}")
73+
74+
# Use in a polling loop
75+
while True:
76+
status = await db.get_document_status("doc_123abc")
77+
if status.get('status') == 'completed':
78+
print("Document processing complete!")
79+
break
80+
elif status.get('status') == 'failed':
81+
print(f"Document processing failed: {status.get('error')}")
82+
break
83+
await asyncio.sleep(2)
84+
```
85+
</Tab>
86+
</Tabs>
87+
88+
## Notes
89+
90+
- Common status values include `"processing"`, `"completed"`, and `"failed"`.
91+
- This is a lightweight endpoint useful for checking progress without fetching the full document.
92+
- The SDK also provides helpers that poll for you: see the document ingestion methods, which can wait for completion automatically.

0 commit comments

Comments
 (0)