Skip to content

Commit 55f520c

Browse files
feat: add crawl pages support
1 parent 998c2fb commit 55f520c

6 files changed

Lines changed: 176 additions & 26 deletions

File tree

README.md

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -134,14 +134,14 @@ res = sgai.search(
134134
Crawl a website and its linked pages.
135135

136136
```python
137-
from scrapegraph_py import ScrapeGraphAI, MarkdownFormatConfig
137+
from scrapegraph_py import ScrapeGraphAI, ScrapeMarkdownFormatEntry
138138

139139
sgai = ScrapeGraphAI()
140140

141141
# Start a crawl
142142
start = sgai.crawl.start(
143143
"https://example.com",
144-
formats=[MarkdownFormatConfig()],
144+
formats=[ScrapeMarkdownFormatEntry()],
145145
max_pages=50,
146146
max_depth=2,
147147
max_links_per_page=10,
@@ -151,6 +151,7 @@ start = sgai.crawl.start(
151151

152152
# Check status
153153
status = sgai.crawl.get(start.data["id"])
154+
pages = sgai.crawl.pages(start.data["id"], cursor=0, limit=50)
154155

155156
# Control
156157
sgai.crawl.stop(crawl_id)
@@ -259,6 +260,7 @@ async with AsyncScrapeGraphAI() as sgai:
259260
async with AsyncScrapeGraphAI() as sgai:
260261
start = await sgai.crawl.start("https://example.com", max_pages=50)
261262
status = await sgai.crawl.get(start.data["id"])
263+
pages = await sgai.crawl.pages(start.data["id"], cursor=0, limit=50)
262264
```
263265

264266
### Async Monitor

src/scrapegraph_py/__init__.py

Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,12 +4,15 @@
44
ApiResult,
55
BrandingFormatConfig,
66
CrawlPage,
7+
CrawlPagesQuery,
8+
CrawlPagesResponse,
79
CrawlPageStatus,
810
CrawlRequest,
911
CrawlResponse,
1012
CrawlStatus,
1113
CreditsResponse,
1214
ExtractRequest,
15+
ExtractRequestBase,
1316
ExtractResponse,
1417
FetchConfig,
1518
FetchContentType,
@@ -26,6 +29,7 @@
2629
JsonFormatConfig,
2730
LinksFormatConfig,
2831
MarkdownFormatConfig,
32+
MonitorActivityQuery,
2933
MonitorActivityRequest,
3034
MonitorActivityResponse,
3135
MonitorCreateRequest,
@@ -35,8 +39,22 @@
3539
MonitorTickEntry,
3640
MonitorTickStatus,
3741
MonitorUpdateRequest,
42+
ScrapeBrandingFormatEntry,
43+
ScrapeCaptureFormat,
44+
ScrapeContentFormat,
45+
ScrapeFormat,
46+
ScrapeFormatEntry,
47+
ScrapeFormatError,
48+
ScrapeHtmlFormatEntry,
49+
ScrapeImagesFormatEntry,
50+
ScrapeJsonFormatEntry,
51+
ScrapeLinksFormatEntry,
52+
ScrapeMarkdownFormatEntry,
3853
ScrapeRequest,
3954
ScrapeResponse,
55+
ScrapeScreenshotData,
56+
ScrapeScreenshotFormatEntry,
57+
ScrapeSummaryFormatEntry,
4058
ScreenshotFormatConfig,
4159
SearchRequest,
4260
SearchResponse,
@@ -54,13 +72,16 @@
5472
"ScrapeRequest",
5573
"ScrapeResponse",
5674
"ExtractRequest",
75+
"ExtractRequestBase",
5776
"ExtractResponse",
5877
"SearchRequest",
5978
"SearchResponse",
6079
"SearchResult",
6180
"CrawlRequest",
6281
"CrawlResponse",
6382
"CrawlPage",
83+
"CrawlPagesQuery",
84+
"CrawlPagesResponse",
6485
"CrawlPageStatus",
6586
"CrawlStatus",
6687
"MonitorCreateRequest",
@@ -69,6 +90,7 @@
6990
"MonitorResult",
7091
"MonitorDiffs",
7192
"MonitorActivityRequest",
93+
"MonitorActivityQuery",
7294
"MonitorActivityResponse",
7395
"MonitorTickEntry",
7496
"MonitorTickStatus",
@@ -83,6 +105,20 @@
83105
"FetchContentType",
84106
"FetchMode",
85107
"FormatConfig",
108+
"ScrapeFormat",
109+
"ScrapeContentFormat",
110+
"ScrapeCaptureFormat",
111+
"ScrapeFormatEntry",
112+
"ScrapeMarkdownFormatEntry",
113+
"ScrapeHtmlFormatEntry",
114+
"ScrapeScreenshotFormatEntry",
115+
"ScrapeJsonFormatEntry",
116+
"ScrapeLinksFormatEntry",
117+
"ScrapeImagesFormatEntry",
118+
"ScrapeSummaryFormatEntry",
119+
"ScrapeBrandingFormatEntry",
120+
"ScrapeFormatError",
121+
"ScrapeScreenshotData",
86122
"HtmlMode",
87123
"Service",
88124
"TimeRange",

src/scrapegraph_py/async_client.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,8 @@
1414
from .env import env
1515
from .schemas import (
1616
ApiResult,
17+
CrawlPagesQuery,
18+
CrawlPagesResponse,
1719
CrawlRequest,
1820
CrawlResponse,
1921
CreditsResponse,
@@ -116,6 +118,18 @@ async def start(
116118
async def get(self, id: str) -> ApiResult[CrawlResponse]:
117119
return await self._client._get(f"/crawl/{id}", CrawlResponse)
118120

121+
async def pages(
122+
self,
123+
id: str,
124+
*,
125+
cursor: int | None = None,
126+
limit: int | None = None,
127+
) -> ApiResult[CrawlPagesResponse]:
128+
kwargs = _compact(cursor=cursor, limit=limit)
129+
query = CrawlPagesQuery(**kwargs) if kwargs else None
130+
qs = {key: getattr(query, key) for key in kwargs} if query else None
131+
return await self._client._get(f"/crawl/{id}/pages", CrawlPagesResponse, params=qs or None)
132+
119133
async def stop(self, id: str) -> ApiResult[dict]:
120134
return await self._client._post_empty(f"/crawl/{id}/stop")
121135

src/scrapegraph_py/client.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,8 @@
1414
from .env import env
1515
from .schemas import (
1616
ApiResult,
17+
CrawlPagesQuery,
18+
CrawlPagesResponse,
1719
CrawlRequest,
1820
CrawlResponse,
1921
CreditsResponse,
@@ -116,6 +118,18 @@ def start(
116118
def get(self, id: str) -> ApiResult[CrawlResponse]:
117119
return self._client._get(f"/crawl/{id}", CrawlResponse)
118120

121+
def pages(
122+
self,
123+
id: str,
124+
*,
125+
cursor: int | None = None,
126+
limit: int | None = None,
127+
) -> ApiResult[CrawlPagesResponse]:
128+
kwargs = _compact(cursor=cursor, limit=limit)
129+
query = CrawlPagesQuery(**kwargs) if kwargs else None
130+
qs = {key: getattr(query, key) for key in kwargs} if query else None
131+
return self._client._get(f"/crawl/{id}/pages", CrawlPagesResponse, params=qs or None)
132+
119133
def stop(self, id: str) -> ApiResult[dict]:
120134
return self._client._post_empty(f"/crawl/{id}/stop")
121135

0 commit comments

Comments
 (0)