Skip to content

Commit a42bacf

Browse files
feat: add crawl pages support
1 parent 998c2fb commit a42bacf

8 files changed

Lines changed: 292 additions & 26 deletions

File tree

README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,14 +134,14 @@ res = sgai.search(
134134
Crawl a website and its linked pages.
135135

136136
```python
137-
from scrapegraph_py import ScrapeGraphAI, MarkdownFormatConfig
137+
from scrapegraph_py import ScrapeGraphAI, ScrapeMarkdownFormatEntry
138138

139139
sgai = ScrapeGraphAI()
140140

141141
# Start a crawl
142142
start = sgai.crawl.start(
143143
"https://example.com",
144-
formats=[MarkdownFormatConfig()],
144+
formats=[ScrapeMarkdownFormatEntry()],
145145
max_pages=50,
146146
max_depth=2,
147147
max_links_per_page=10,
@@ -151,6 +151,7 @@ start = sgai.crawl.start(
151151

152152
# Check status
153153
status = sgai.crawl.get(start.data["id"])
154+
pages = sgai.crawl.pages(start.data["id"], cursor=0, limit=50)
154155

155156
# Control
156157
sgai.crawl.stop(crawl_id)
@@ -259,6 +260,7 @@ async with AsyncScrapeGraphAI() as sgai:
259260
async with AsyncScrapeGraphAI() as sgai:
260261
start = await sgai.crawl.start("https://example.com", max_pages=50)
261262
status = await sgai.crawl.get(start.data["id"])
263+
pages = await sgai.crawl.pages(start.data["id"], cursor=0, limit=50)
262264
```
263265

264266
### Async Monitor
@@ -289,6 +291,7 @@ async with AsyncScrapeGraphAI() as sgai:
289291
| search | [`search_with_extraction.py`](examples/search/search_with_extraction.py) | Search + AI extraction |
290292
| crawl | [`crawl_basic.py`](examples/crawl/crawl_basic.py) | Start and monitor a crawl |
291293
| crawl | [`crawl_with_formats.py`](examples/crawl/crawl_with_formats.py) | Crawl with formats |
294+
| crawl | [`crawl_pages.py`](examples/crawl/crawl_pages.py) | Paginated crawl pages with scrape results |
292295
| monitor | [`monitor_basic.py`](examples/monitor/monitor_basic.py) | Create a page monitor |
293296
| monitor | [`monitor_with_webhook.py`](examples/monitor/monitor_with_webhook.py) | Monitor with webhook |
294297
| utilities | [`credits.py`](examples/utilities/credits.py) | Check credits and limits |
@@ -310,6 +313,7 @@ async with AsyncScrapeGraphAI() as sgai:
310313
| search | [`search_with_extraction_async.py`](examples/search/search_with_extraction_async.py) | Search + AI extraction |
311314
| crawl | [`crawl_basic_async.py`](examples/crawl/crawl_basic_async.py) | Start and monitor a crawl |
312315
| crawl | [`crawl_with_formats_async.py`](examples/crawl/crawl_with_formats_async.py) | Crawl with formats |
316+
| crawl | [`crawl_pages_async.py`](examples/crawl/crawl_pages_async.py) | Paginated crawl pages with scrape results |
313317
| monitor | [`monitor_basic_async.py`](examples/monitor/monitor_basic_async.py) | Create a page monitor |
314318
| monitor | [`monitor_with_webhook_async.py`](examples/monitor/monitor_with_webhook_async.py) | Monitor with webhook |
315319
| utilities | [`credits_async.py`](examples/utilities/credits_async.py) | Check credits and limits |

examples/crawl/crawl_pages.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import time
2+
3+
from dotenv import load_dotenv
4+
5+
from scrapegraph_py import ScrapeGraphAI, ScrapeMarkdownFormatEntry
6+
7+
# Example: start a small crawl, poll until it leaves the "running" state,
# then walk the paginated page listing and print a short markdown preview.

# NOTE(review): load_dotenv() presumably supplies the credentials that
# ScrapeGraphAI() picks up from the environment — confirm.
load_dotenv()

sgai = ScrapeGraphAI()

started = sgai.crawl.start(
    "https://scrapegraphai.com/",
    max_pages=5,
    max_depth=2,
    formats=[ScrapeMarkdownFormatEntry()],
)

# Bail out unless the call succeeded and carried a payload.
if not (started.status == "success" and started.data):
    print("Failed to start:", started.error)
    raise SystemExit(1)

crawl_id = started.data.id
print("Crawl started:", crawl_id)

# Poll every two seconds for as long as the crawl reports "running".
state = started.data.status
while state == "running":
    time.sleep(2)
    polled = sgai.crawl.get(crawl_id)
    if not (polled.status == "success" and polled.data):
        print("Failed to get status:", polled.error)
        raise SystemExit(1)
    snapshot = polled.data
    state = snapshot.status
    print(f"Progress: {snapshot.finished}/{snapshot.total} - {state}")

# Page through the results; a next_cursor of None marks the last batch.
offset = 0
while offset is not None:
    batch = sgai.crawl.pages(crawl_id, cursor=offset, limit=50)
    if not (batch.status == "success" and batch.data):
        print("Failed to get pages:", batch.error)
        raise SystemExit(1)

    listing = batch.data
    for entry in listing.data:
        print(f"\nPage: {entry.url}")
        print(f"Status: {entry.status}")
        print(f"Title: {entry.title}")

        # Pages without a scrape payload (or without a markdown result)
        # fall back to an empty mapping so the lookups below stay safe.
        md_result = None
        if entry.scrape:
            md_result = entry.scrape.results.get("markdown")
        md_result = md_result or {}

        excerpts = md_result.get("data") or []
        if excerpts:
            print(excerpts[0][:300])

    upcoming = listing.pagination.next_cursor
    offset = None if upcoming is None else int(upcoming)
examples/crawl/crawl_pages_async.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import asyncio
2+
3+
from dotenv import load_dotenv
4+
5+
from scrapegraph_py import AsyncScrapeGraphAI, ScrapeMarkdownFormatEntry
6+
7+
# NOTE(review): load_dotenv() presumably supplies the credentials that
# AsyncScrapeGraphAI() picks up from the environment — confirm.
load_dotenv()


async def main():
    """Start a crawl, wait for it to stop running, then print every crawled page.

    Exits the process with status 1 (via ``SystemExit``) as soon as any API
    call does not report ``"success"`` or comes back without data.
    """
    async with AsyncScrapeGraphAI() as sgai:
        started = await sgai.crawl.start(
            "https://scrapegraphai.com/",
            max_pages=5,
            max_depth=2,
            formats=[ScrapeMarkdownFormatEntry()],
        )

        # Bail out unless the call succeeded and carried a payload.
        if not (started.status == "success" and started.data):
            print("Failed to start:", started.error)
            raise SystemExit(1)

        crawl_id = started.data.id
        print("Crawl started:", crawl_id)

        # Poll every two seconds for as long as the crawl reports "running".
        state = started.data.status
        while state == "running":
            await asyncio.sleep(2)
            polled = await sgai.crawl.get(crawl_id)
            if not (polled.status == "success" and polled.data):
                print("Failed to get status:", polled.error)
                raise SystemExit(1)
            snapshot = polled.data
            state = snapshot.status
            print(f"Progress: {snapshot.finished}/{snapshot.total} - {state}")

        # Page through the results; a next_cursor of None marks the last batch.
        offset = 0
        while offset is not None:
            batch = await sgai.crawl.pages(crawl_id, cursor=offset, limit=50)
            if not (batch.status == "success" and batch.data):
                print("Failed to get pages:", batch.error)
                raise SystemExit(1)

            listing = batch.data
            for entry in listing.data:
                print(f"\nPage: {entry.url}")
                print(f"Status: {entry.status}")
                print(f"Title: {entry.title}")

                # Pages without a scrape payload (or without a markdown
                # result) fall back to an empty mapping.
                md_result = None
                if entry.scrape:
                    md_result = entry.scrape.results.get("markdown")
                md_result = md_result or {}

                excerpts = md_result.get("data") or []
                if excerpts:
                    print(excerpts[0][:300])

            upcoming = listing.pagination.next_cursor
            offset = None if upcoming is None else int(upcoming)


asyncio.run(main())

src/scrapegraph_py/__init__.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,15 @@
44
ApiResult,
55
BrandingFormatConfig,
66
CrawlPage,
7+
CrawlPagesQuery,
8+
CrawlPagesResponse,
79
CrawlPageStatus,
810
CrawlRequest,
911
CrawlResponse,
1012
CrawlStatus,
1113
CreditsResponse,
1214
ExtractRequest,
15+
ExtractRequestBase,
1316
ExtractResponse,
1417
FetchConfig,
1518
FetchContentType,
@@ -26,6 +29,7 @@
2629
JsonFormatConfig,
2730
LinksFormatConfig,
2831
MarkdownFormatConfig,
32+
MonitorActivityQuery,
2933
MonitorActivityRequest,
3034
MonitorActivityResponse,
3135
MonitorCreateRequest,
@@ -35,8 +39,22 @@
3539
MonitorTickEntry,
3640
MonitorTickStatus,
3741
MonitorUpdateRequest,
42+
ScrapeBrandingFormatEntry,
43+
ScrapeCaptureFormat,
44+
ScrapeContentFormat,
45+
ScrapeFormat,
46+
ScrapeFormatEntry,
47+
ScrapeFormatError,
48+
ScrapeHtmlFormatEntry,
49+
ScrapeImagesFormatEntry,
50+
ScrapeJsonFormatEntry,
51+
ScrapeLinksFormatEntry,
52+
ScrapeMarkdownFormatEntry,
3853
ScrapeRequest,
3954
ScrapeResponse,
55+
ScrapeScreenshotData,
56+
ScrapeScreenshotFormatEntry,
57+
ScrapeSummaryFormatEntry,
4058
ScreenshotFormatConfig,
4159
SearchRequest,
4260
SearchResponse,
@@ -54,13 +72,16 @@
5472
"ScrapeRequest",
5573
"ScrapeResponse",
5674
"ExtractRequest",
75+
"ExtractRequestBase",
5776
"ExtractResponse",
5877
"SearchRequest",
5978
"SearchResponse",
6079
"SearchResult",
6180
"CrawlRequest",
6281
"CrawlResponse",
6382
"CrawlPage",
83+
"CrawlPagesQuery",
84+
"CrawlPagesResponse",
6485
"CrawlPageStatus",
6586
"CrawlStatus",
6687
"MonitorCreateRequest",
@@ -69,6 +90,7 @@
6990
"MonitorResult",
7091
"MonitorDiffs",
7192
"MonitorActivityRequest",
93+
"MonitorActivityQuery",
7294
"MonitorActivityResponse",
7395
"MonitorTickEntry",
7496
"MonitorTickStatus",
@@ -83,6 +105,20 @@
83105
"FetchContentType",
84106
"FetchMode",
85107
"FormatConfig",
108+
"ScrapeFormat",
109+
"ScrapeContentFormat",
110+
"ScrapeCaptureFormat",
111+
"ScrapeFormatEntry",
112+
"ScrapeMarkdownFormatEntry",
113+
"ScrapeHtmlFormatEntry",
114+
"ScrapeScreenshotFormatEntry",
115+
"ScrapeJsonFormatEntry",
116+
"ScrapeLinksFormatEntry",
117+
"ScrapeImagesFormatEntry",
118+
"ScrapeSummaryFormatEntry",
119+
"ScrapeBrandingFormatEntry",
120+
"ScrapeFormatError",
121+
"ScrapeScreenshotData",
86122
"HtmlMode",
87123
"Service",
88124
"TimeRange",

src/scrapegraph_py/async_client.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
from .env import env
1515
from .schemas import (
1616
ApiResult,
17+
CrawlPagesQuery,
18+
CrawlPagesResponse,
1719
CrawlRequest,
1820
CrawlResponse,
1921
CreditsResponse,
@@ -116,6 +118,18 @@ async def start(
116118
async def get(self, id: str) -> ApiResult[CrawlResponse]:
117119
return await self._client._get(f"/crawl/{id}", CrawlResponse)
118120

121+
async def pages(
    self,
    id: str,
    *,
    cursor: int | None = None,
    limit: int | None = None,
) -> ApiResult[CrawlPagesResponse]:
    """Request the pages of crawl ``id`` from the ``/crawl/{id}/pages`` endpoint.

    Args:
        id: Crawl identifier interpolated into the request path.
        cursor: Optional pagination cursor, forwarded as a query parameter.
        limit: Optional limit, forwarded as a query parameter.

    Returns:
        Whatever ``self._client._get`` yields when called with
        ``CrawlPagesResponse`` as the response model.
    """
    # NOTE(review): `_compact` presumably drops arguments left as None —
    # confirm against its definition.
    supplied = _compact(cursor=cursor, limit=limit)

    params = None
    if supplied:
        # Pass the values through CrawlPagesQuery, then read back only the
        # fields the caller actually provided.
        validated = CrawlPagesQuery(**supplied)
        params = {name: getattr(validated, name) for name in supplied}

    return await self._client._get(f"/crawl/{id}/pages", CrawlPagesResponse, params=params)
132+
119133
async def stop(self, id: str) -> ApiResult[dict]:
120134
return await self._client._post_empty(f"/crawl/{id}/stop")
121135

src/scrapegraph_py/client.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
from .env import env
1515
from .schemas import (
1616
ApiResult,
17+
CrawlPagesQuery,
18+
CrawlPagesResponse,
1719
CrawlRequest,
1820
CrawlResponse,
1921
CreditsResponse,
@@ -116,6 +118,18 @@ def start(
116118
def get(self, id: str) -> ApiResult[CrawlResponse]:
117119
return self._client._get(f"/crawl/{id}", CrawlResponse)
118120

121+
def pages(
    self,
    id: str,
    *,
    cursor: int | None = None,
    limit: int | None = None,
) -> ApiResult[CrawlPagesResponse]:
    """Request the pages of crawl ``id`` from the ``/crawl/{id}/pages`` endpoint.

    Args:
        id: Crawl identifier interpolated into the request path.
        cursor: Optional pagination cursor, forwarded as a query parameter.
        limit: Optional limit, forwarded as a query parameter.

    Returns:
        Whatever ``self._client._get`` yields when called with
        ``CrawlPagesResponse`` as the response model.
    """
    # NOTE(review): `_compact` presumably drops arguments left as None —
    # confirm against its definition.
    supplied = _compact(cursor=cursor, limit=limit)

    params = None
    if supplied:
        # Pass the values through CrawlPagesQuery, then read back only the
        # fields the caller actually provided.
        validated = CrawlPagesQuery(**supplied)
        params = {name: getattr(validated, name) for name in supplied}

    return self._client._get(f"/crawl/{id}/pages", CrawlPagesResponse, params=params)
132+
119133
def stop(self, id: str) -> ApiResult[dict]:
120134
return self._client._post_empty(f"/crawl/{id}/stop")
121135

0 commit comments

Comments
 (0)