feat: add crawl pages support

FrancescoSaverioZuppichini · FrancescoSaverioZuppichini · commit a42bacfc17c5 · 2026-05-28T20:57:05.000+02:00
diff --git a/README.md b/README.md
@@ -134,14 +134,14 @@ res = sgai.search(
 Crawl a website and its linked pages.
 
 ```python
-from scrapegraph_py import ScrapeGraphAI, MarkdownFormatConfig
+from scrapegraph_py import ScrapeGraphAI, ScrapeMarkdownFormatEntry
 
 sgai = ScrapeGraphAI()
 
 # Start a crawl
 start = sgai.crawl.start(
     "https://example.com",
-    formats=[MarkdownFormatConfig()],
+    formats=[ScrapeMarkdownFormatEntry()],
     max_pages=50,
     max_depth=2,
     max_links_per_page=10,
@@ -151,6 +151,7 @@ start = sgai.crawl.start(
 
 # Check status
 status = sgai.crawl.get(start.data["id"])
+pages = sgai.crawl.pages(start.data["id"], cursor=0, limit=50)
 
 # Control
 sgai.crawl.stop(crawl_id)
@@ -259,6 +260,7 @@ async with AsyncScrapeGraphAI() as sgai:
 async with AsyncScrapeGraphAI() as sgai:
     start = await sgai.crawl.start("https://example.com", max_pages=50)
     status = await sgai.crawl.get(start.data["id"])
+    pages = await sgai.crawl.pages(start.data["id"], cursor=0, limit=50)
 ```
 
 ### Async Monitor
@@ -289,6 +291,7 @@ async with AsyncScrapeGraphAI() as sgai:
 | search | [`search_with_extraction.py`](examples/search/search_with_extraction.py) | Search + AI extraction |
 | crawl | [`crawl_basic.py`](examples/crawl/crawl_basic.py) | Start and monitor a crawl |
 | crawl | [`crawl_with_formats.py`](examples/crawl/crawl_with_formats.py) | Crawl with formats |
+| crawl | [`crawl_pages.py`](examples/crawl/crawl_pages.py) | Paginate through crawled pages and their scrape results |
 | monitor | [`monitor_basic.py`](examples/monitor/monitor_basic.py) | Create a page monitor |
 | monitor | [`monitor_with_webhook.py`](examples/monitor/monitor_with_webhook.py) | Monitor with webhook |
 | utilities | [`credits.py`](examples/utilities/credits.py) | Check credits and limits |
@@ -310,6 +313,7 @@ async with AsyncScrapeGraphAI() as sgai:
 | search | [`search_with_extraction_async.py`](examples/search/search_with_extraction_async.py) | Search + AI extraction |
 | crawl | [`crawl_basic_async.py`](examples/crawl/crawl_basic_async.py) | Start and monitor a crawl |
 | crawl | [`crawl_with_formats_async.py`](examples/crawl/crawl_with_formats_async.py) | Crawl with formats |
+| crawl | [`crawl_pages_async.py`](examples/crawl/crawl_pages_async.py) | Paginate through crawled pages and their scrape results |
 | monitor | [`monitor_basic_async.py`](examples/monitor/monitor_basic_async.py) | Create a page monitor |
 | monitor | [`monitor_with_webhook_async.py`](examples/monitor/monitor_with_webhook_async.py) | Monitor with webhook |
 | utilities | [`credits_async.py`](examples/utilities/credits_async.py) | Check credits and limits |
diff --git a/examples/crawl/crawl_pages.py b/examples/crawl/crawl_pages.py
@@ -0,0 +1,55 @@
+import time
+
+from dotenv import load_dotenv
+
+from scrapegraph_py import ScrapeGraphAI, ScrapeMarkdownFormatEntry
+
+load_dotenv()  # python-dotenv: load variables from a .env file, if one is found
+
+sgai = ScrapeGraphAI()
+
+start_res = sgai.crawl.start(
+    "https://scrapegraphai.com/",
+    max_pages=5,
+    max_depth=2,
+    formats=[ScrapeMarkdownFormatEntry()],
+)
+
+if start_res.status != "success" or not start_res.data:  # the crawl did not start
+    print("Failed to start:", start_res.error)
+    raise SystemExit(1)
+
+crawl_id = start_res.data.id
+print("Crawl started:", crawl_id)
+
+status = start_res.data.status
+while status == "running":  # keep polling for as long as the crawl reports "running"
+    time.sleep(2)  # pause 2 seconds between status checks
+    get_res = sgai.crawl.get(crawl_id)
+    if get_res.status != "success" or not get_res.data:
+        print("Failed to get status:", get_res.error)
+        raise SystemExit(1)
+    status = get_res.data.status
+    print(f"Progress: {get_res.data.finished}/{get_res.data.total} - {status}")
+
+cursor = 0  # initial cursor passed to crawl.pages
+while True:  # exits through the break below once next_cursor is None
+    pages_res = sgai.crawl.pages(crawl_id, cursor=cursor, limit=50)
+    if pages_res.status != "success" or not pages_res.data:
+        print("Failed to get pages:", pages_res.error)
+        raise SystemExit(1)
+
+    for page in pages_res.data.data:
+        print(f"\nPage: {page.url}")
+        print(f"Status: {page.status}")
+        print(f"Title: {page.title}")
+
+        markdown = (page.scrape.results.get("markdown") if page.scrape else None) or {}
+        snippets = markdown.get("data") or []
+        if snippets:
+            print(snippets[0][:300])  # first entry only, cut to 300 (assumes a string)
+
+    next_cursor = pages_res.data.pagination.next_cursor
+    if next_cursor is None:  # no further page to request
+        break
+    cursor = int(next_cursor)  # cursor for the next crawl.pages request
diff --git a/examples/crawl/crawl_pages_async.py b/examples/crawl/crawl_pages_async.py
@@ -0,0 +1,67 @@
+"""Async example: start a crawl, wait while it is running, then page through its results."""
+
+import asyncio
+
+from dotenv import load_dotenv
+
+from scrapegraph_py import AsyncScrapeGraphAI, ScrapeMarkdownFormatEntry
+
+load_dotenv()
+
+
+async def main():
+    """Start the crawl, poll its status, and print every page returned by ``crawl.pages``."""
+    async with AsyncScrapeGraphAI() as sgai:
+        start_res = await sgai.crawl.start(
+            "https://scrapegraphai.com/",
+            max_pages=5,
+            max_depth=2,
+            formats=[ScrapeMarkdownFormatEntry()],
+        )
+
+        if start_res.status != "success" or not start_res.data:
+            print("Failed to start:", start_res.error)
+            raise SystemExit(1)
+
+        crawl_id = start_res.data.id
+        print("Crawl started:", crawl_id)
+
+        # Poll every 2 seconds for as long as the crawl reports "running".
+        status = start_res.data.status
+        while status == "running":
+            await asyncio.sleep(2)
+            get_res = await sgai.crawl.get(crawl_id)
+            if get_res.status != "success" or not get_res.data:
+                print("Failed to get status:", get_res.error)
+                raise SystemExit(1)
+            status = get_res.data.status
+            print(f"Progress: {get_res.data.finished}/{get_res.data.total} - {status}")
+
+        # Request page after page until the response carries no next cursor.
+        cursor = 0
+        while True:
+            pages_res = await sgai.crawl.pages(crawl_id, cursor=cursor, limit=50)
+            if pages_res.status != "success" or not pages_res.data:
+                print("Failed to get pages:", pages_res.error)
+                raise SystemExit(1)
+
+            for page in pages_res.data.data:
+                print(f"\nPage: {page.url}")
+                print(f"Status: {page.status}")
+                print(f"Title: {page.title}")
+
+                # Show the first markdown entry, cut to 300 (assumes entries are strings).
+                markdown = (page.scrape.results.get("markdown") if page.scrape else None) or {}
+                snippets = markdown.get("data") or []
+                if snippets:
+                    print(snippets[0][:300])
+
+            next_cursor = pages_res.data.pagination.next_cursor
+            if next_cursor is None:
+                break
+            cursor = int(next_cursor)
+
+
+# Run the example only when executed as a script, not when the module is imported.
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/src/scrapegraph_py/__init__.py b/src/scrapegraph_py/__init__.py
@@ -4,12 +4,15 @@
     ApiResult,
     BrandingFormatConfig,
     CrawlPage,
+    CrawlPagesQuery,
+    CrawlPagesResponse,
     CrawlPageStatus,
     CrawlRequest,
     CrawlResponse,
     CrawlStatus,
     CreditsResponse,
     ExtractRequest,
+    ExtractRequestBase,
     ExtractResponse,
     FetchConfig,
     FetchContentType,
@@ -26,6 +29,7 @@
     JsonFormatConfig,
     LinksFormatConfig,
     MarkdownFormatConfig,
+    MonitorActivityQuery,
     MonitorActivityRequest,
     MonitorActivityResponse,
     MonitorCreateRequest,
@@ -35,8 +39,22 @@
     MonitorTickEntry,
     MonitorTickStatus,
     MonitorUpdateRequest,
+    ScrapeBrandingFormatEntry,
+    ScrapeCaptureFormat,
+    ScrapeContentFormat,
+    ScrapeFormat,
+    ScrapeFormatEntry,
+    ScrapeFormatError,
+    ScrapeHtmlFormatEntry,
+    ScrapeImagesFormatEntry,
+    ScrapeJsonFormatEntry,
+    ScrapeLinksFormatEntry,
+    ScrapeMarkdownFormatEntry,
     ScrapeRequest,
     ScrapeResponse,
+    ScrapeScreenshotData,
+    ScrapeScreenshotFormatEntry,
+    ScrapeSummaryFormatEntry,
     ScreenshotFormatConfig,
     SearchRequest,
     SearchResponse,
@@ -54,13 +72,16 @@
     "ScrapeRequest",
     "ScrapeResponse",
     "ExtractRequest",
+    "ExtractRequestBase",
     "ExtractResponse",
     "SearchRequest",
     "SearchResponse",
     "SearchResult",
     "CrawlRequest",
     "CrawlResponse",
     "CrawlPage",
+    "CrawlPagesQuery",
+    "CrawlPagesResponse",
     "CrawlPageStatus",
     "CrawlStatus",
     "MonitorCreateRequest",
@@ -69,6 +90,7 @@
     "MonitorResult",
     "MonitorDiffs",
     "MonitorActivityRequest",
+    "MonitorActivityQuery",
     "MonitorActivityResponse",
     "MonitorTickEntry",
     "MonitorTickStatus",
@@ -83,6 +105,20 @@
     "FetchContentType",
     "FetchMode",
     "FormatConfig",
+    "ScrapeFormat",
+    "ScrapeContentFormat",
+    "ScrapeCaptureFormat",
+    "ScrapeFormatEntry",
+    "ScrapeMarkdownFormatEntry",
+    "ScrapeHtmlFormatEntry",
+    "ScrapeScreenshotFormatEntry",
+    "ScrapeJsonFormatEntry",
+    "ScrapeLinksFormatEntry",
+    "ScrapeImagesFormatEntry",
+    "ScrapeSummaryFormatEntry",
+    "ScrapeBrandingFormatEntry",
+    "ScrapeFormatError",
+    "ScrapeScreenshotData",
     "HtmlMode",
     "Service",
     "TimeRange",
diff --git a/src/scrapegraph_py/async_client.py b/src/scrapegraph_py/async_client.py
@@ -14,6 +14,8 @@
 from .env import env
 from .schemas import (
     ApiResult,
+    CrawlPagesQuery,
+    CrawlPagesResponse,
     CrawlRequest,
     CrawlResponse,
     CreditsResponse,
@@ -116,6 +118,27 @@ async def start(
     async def get(self, id: str) -> ApiResult[CrawlResponse]:
         return await self._client._get(f"/crawl/{id}", CrawlResponse)
 
+    async def pages(
+        self,
+        id: str,
+        *,
+        cursor: int | None = None,
+        limit: int | None = None,
+    ) -> ApiResult[CrawlPagesResponse]:
+        """Fetch one page of results for a crawl from ``/crawl/{id}/pages``.
+
+        ``cursor`` and ``limit`` go through ``_compact``; if anything is left, a
+        ``CrawlPagesQuery`` is built from it and the supplied fields are sent as
+        query parameters. Otherwise the request is issued with ``params=None``.
+        """
+        supplied = _compact(cursor=cursor, limit=limit)
+        query = CrawlPagesQuery(**supplied) if supplied else None
+        params = None
+        if query:
+            # Read the values back from the model, but only for the supplied fields.
+            params = {name: getattr(query, name) for name in supplied} or None
+        return await self._client._get(f"/crawl/{id}/pages", CrawlPagesResponse, params=params)
+
     async def stop(self, id: str) -> ApiResult[dict]:
         return await self._client._post_empty(f"/crawl/{id}/stop")
 
diff --git a/src/scrapegraph_py/client.py b/src/scrapegraph_py/client.py
@@ -14,6 +14,8 @@
 from .env import env
 from .schemas import (
     ApiResult,
+    CrawlPagesQuery,
+    CrawlPagesResponse,
     CrawlRequest,
     CrawlResponse,
     CreditsResponse,
@@ -116,6 +118,27 @@ def start(
     def get(self, id: str) -> ApiResult[CrawlResponse]:
         return self._client._get(f"/crawl/{id}", CrawlResponse)
 
+    def pages(
+        self,
+        id: str,
+        *,
+        cursor: int | None = None,
+        limit: int | None = None,
+    ) -> ApiResult[CrawlPagesResponse]:
+        """Fetch one page of results for a crawl from ``/crawl/{id}/pages``.
+
+        ``cursor`` and ``limit`` go through ``_compact``; if anything is left, a
+        ``CrawlPagesQuery`` is built from it and the supplied fields are sent as
+        query parameters. Otherwise the request is issued with ``params=None``.
+        """
+        supplied = _compact(cursor=cursor, limit=limit)
+        query = CrawlPagesQuery(**supplied) if supplied else None
+        params = None
+        if query:
+            # Read the values back from the model, but only for the supplied fields.
+            params = {name: getattr(query, name) for name in supplied} or None
+        return self._client._get(f"/crawl/{id}/pages", CrawlPagesResponse, params=params)
+
     def stop(self, id: str) -> ApiResult[dict]:
         return self._client._post_empty(f"/crawl/{id}/stop")
 
diff --git a/src/scrapegraph_py/schemas.py b/src/scrapegraph_py/schemas.py
diff --git a/tests/test_client.py b/tests/test_client.py