Skip to content

Commit 8dd26ac

Browse files
vdusek and claude committed
test: extract inline Python strings to dedicated files in crawlee crawler e2e tests
Move Actor source code from triple-quoted string constants into standalone files under actor_source/, so they benefit from syntax highlighting, linting, and type-checking. Load them at runtime via Path.read_text() helpers. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 6a28b53 commit 8dd26ac

15 files changed

+409
-375
lines changed
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
FROM apify/actor-python-playwright:PYTHON_VERSION_PLACEHOLDER

# Bring the Actor source (src/, server.py, requirements.txt) into the image.
COPY . ./

# git is installed before pip runs — presumably requirements.txt contains
# VCS-based requirements (TODO confirm). --no-install-recommends keeps the
# layer minimal and the apt lists are purged to shrink the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends git \
    && rm -rf /var/lib/apt/lists/*

RUN pip install --force-reinstall -r requirements.txt

# Run the local test web server in the background alongside the Actor entrypoint.
CMD ["sh", "-c", "python server.py & python -m src"]
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from __future__ import annotations
2+
3+
from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext
4+
5+
from apify import Actor
6+
7+
8+
async def main() -> None:
    """Crawl the local test store with an AdaptivePlaywrightCrawler.

    Visits every reachable page, pushes product details found on
    ``/products/`` pages, and records a summary under ``CRAWLER_RESULT``.
    """
    async with Actor:
        visited: list[str] = []
        crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()

        @crawler.router.default_handler
        async def handler(context: AdaptivePlaywrightCrawlingContext) -> None:
            visited.append(context.request.url)
            await context.enqueue_links()

            # Only product detail pages carry the fields we scrape.
            if '/products/' not in context.request.url:
                return

            selector = context.parsed_content
            name = selector.css('h1::text').get('').strip()
            price = selector.css('span.price::text').get('').strip()
            description = selector.css('p.description::text').get('').strip()
            if not name:
                return

            await context.push_data(
                {
                    'url': context.request.url,
                    'name': name,
                    'price': price,
                    'description': description,
                }
            )

        await crawler.run(['http://localhost:8080/'])

        # Summary consumed by the e2e test assertions.
        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(visited),
                'crawler_type': 'AdaptivePlaywrightCrawler',
            },
        )
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
from __future__ import annotations
2+
3+
from html.parser import HTMLParser
4+
from typing import TYPE_CHECKING
5+
6+
from crawlee.crawlers import BasicCrawler
7+
8+
from apify import Actor
9+
10+
if TYPE_CHECKING:
11+
from crawlee._types import BasicCrawlingContext
12+
13+
14+
class _PageParser(HTMLParser):
15+
def __init__(self) -> None:
16+
super().__init__()
17+
self.links: list[str] = []
18+
self.data: dict[str, str] = {}
19+
self._in_tag: str | None = None
20+
self._in_class: str = ''
21+
22+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
23+
attrs_dict = dict(attrs)
24+
if tag == 'a' and (href := attrs_dict.get('href')):
25+
self.links.append(href)
26+
self._in_tag = tag
27+
self._in_class = attrs_dict.get('class', '') or ''
28+
29+
def handle_endtag(self, tag: str) -> None: # noqa: ARG002
30+
self._in_tag = None
31+
self._in_class = ''
32+
33+
def handle_data(self, data: str) -> None:
34+
text = data.strip()
35+
if not text:
36+
return
37+
if self._in_tag == 'h1':
38+
self.data['name'] = text
39+
elif self._in_tag == 'span' and self._in_class == 'price':
40+
self.data['price'] = text
41+
elif self._in_tag == 'p' and self._in_class == 'description':
42+
self.data['description'] = text
43+
44+
45+
async def main() -> None:
    """Crawl the local test store with a BasicCrawler.

    Fetches each page manually, parses it with ``_PageParser``, enqueues
    site-relative links, pushes product details, and records a summary
    under ``CRAWLER_RESULT``.
    """
    async with Actor:
        visited: list[str] = []
        crawler = BasicCrawler()

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            visited.append(context.request.url)

            # BasicCrawler does not fetch for us — request the page explicitly.
            response = await context.send_request(context.request.url)
            page_html = (await response.read()).decode()

            parser = _PageParser()
            parser.feed(page_html)

            # Only follow site-relative links, resolved against the test host.
            base = 'http://localhost:8080'
            internal = [f'{base}{href}' for href in parser.links if href.startswith('/')]
            await context.add_requests(internal)

            if '/products/' in context.request.url and parser.data.get('name'):
                await context.push_data({'url': context.request.url, **parser.data})

        await crawler.run(['http://localhost:8080/'])

        # Summary consumed by the e2e test assertions.
        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(visited),
                'crawler_type': 'BasicCrawler',
            },
        )
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from __future__ import annotations
2+
3+
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
4+
5+
from apify import Actor
6+
7+
8+
async def main() -> None:
    """Crawl the local test store with a BeautifulSoupCrawler.

    Visits every reachable page, pushes product details found on
    ``/products/`` pages, and records a summary under ``CRAWLER_RESULT``.
    """
    async with Actor:
        visited: list[str] = []
        crawler = BeautifulSoupCrawler()

        @crawler.router.default_handler
        async def handler(context: BeautifulSoupCrawlingContext) -> None:
            visited.append(context.request.url)
            await context.enqueue_links()

            # Only product detail pages carry the fields we scrape.
            if '/products/' not in context.request.url:
                return

            soup = context.soup
            name_tag = soup.find('h1')
            price_tag = soup.find('span', class_='price')
            desc_tag = soup.find('p', class_='description')
            if not name_tag:
                return

            await context.push_data(
                {
                    'url': context.request.url,
                    'name': name_tag.get_text(strip=True),
                    'price': price_tag.get_text(strip=True) if price_tag else '',
                    'description': desc_tag.get_text(strip=True) if desc_tag else '',
                }
            )

        await crawler.run(['http://localhost:8080/'])

        # Summary consumed by the e2e test assertions.
        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(visited),
                'crawler_type': 'BeautifulSoupCrawler',
            },
        )
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from __future__ import annotations
2+
3+
import re
4+
5+
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
6+
7+
from apify import Actor
8+
9+
10+
async def main() -> None:
    """Crawl the local test store with a plain HttpCrawler.

    Parses raw HTML with regular expressions, enqueues site-relative links,
    pushes product details, and records a summary under ``CRAWLER_RESULT``.
    """
    async with Actor:
        visited: list[str] = []
        crawler = HttpCrawler()

        @crawler.router.default_handler
        async def handler(context: HttpCrawlingContext) -> None:
            visited.append(context.request.url)
            page_html = (await context.http_response.read()).decode()

            # Follow every site-relative href found in the raw HTML.
            base = 'http://localhost:8080'
            hrefs = re.findall(r'href="(/[^"]*)"', page_html)
            await context.add_requests([f'{base}{href}' for href in hrefs])

            # Only product detail pages carry the fields we scrape.
            if '/products/' not in context.request.url:
                return

            name = re.search(r'<h1>(.*?)</h1>', page_html)
            price = re.search(r'<span class="price">(.*?)</span>', page_html)
            description = re.search(r'<p class="description">(.*?)</p>', page_html)
            if not name:
                return

            await context.push_data(
                {
                    'url': context.request.url,
                    'name': name.group(1),
                    'price': price.group(1) if price else '',
                    'description': description.group(1) if description else '',
                }
            )

        await crawler.run(['http://localhost:8080/'])

        # Summary consumed by the e2e test assertions.
        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(visited),
                'crawler_type': 'HttpCrawler',
            },
        )
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from __future__ import annotations
2+
3+
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
4+
5+
from apify import Actor
6+
7+
8+
async def main() -> None:
    """Crawl the local test store with a ParselCrawler.

    Visits every reachable page, pushes product details found on
    ``/products/`` pages, and records a summary under ``CRAWLER_RESULT``.
    """
    async with Actor:
        visited: list[str] = []
        crawler = ParselCrawler()

        @crawler.router.default_handler
        async def handler(context: ParselCrawlingContext) -> None:
            visited.append(context.request.url)
            await context.enqueue_links()

            # Only product detail pages carry the fields we scrape.
            if '/products/' not in context.request.url:
                return

            sel = context.selector
            name = sel.css('h1::text').get('').strip()
            price = sel.css('span.price::text').get('').strip()
            description = sel.css('p.description::text').get('').strip()
            if not name:
                return

            await context.push_data(
                {
                    'url': context.request.url,
                    'name': name,
                    'price': price,
                    'description': description,
                }
            )

        await crawler.run(['http://localhost:8080/'])

        # Summary consumed by the e2e test assertions.
        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(visited),
                'crawler_type': 'ParselCrawler',
            },
        )
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from __future__ import annotations
2+
3+
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
4+
5+
from apify import Actor
6+
7+
8+
async def main() -> None:
    """Crawl the local test store with a PlaywrightCrawler.

    Visits every reachable page, pushes product details found on
    ``/products/`` pages, and records a summary under ``CRAWLER_RESULT``.
    """
    async with Actor:
        visited: list[str] = []
        crawler = PlaywrightCrawler()

        @crawler.router.default_handler
        async def handler(context: PlaywrightCrawlingContext) -> None:
            visited.append(context.request.url)
            await context.enqueue_links()

            # Only product detail pages carry the fields we scrape.
            if '/products/' not in context.request.url:
                return

            page = context.page
            # All three locators are queried up front, matching the page
            # structure served by the local test server.
            name = await page.locator('h1').text_content()
            price = await page.locator('span.price').text_content()
            description = await page.locator('p.description').text_content()
            if not name:
                return

            await context.push_data(
                {
                    'url': context.request.url,
                    'name': name.strip(),
                    'price': (price or '').strip(),
                    'description': (description or '').strip(),
                }
            )

        await crawler.run(['http://localhost:8080/'])

        # Summary consumed by the e2e test assertions.
        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(visited),
                'crawler_type': 'PlaywrightCrawler',
            },
        )
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import asyncio
2+
import logging
3+
from collections.abc import Awaitable, Callable, Coroutine
4+
from typing import Any
5+
6+
from uvicorn import Config
7+
from uvicorn.server import Server
8+
9+
# ASGI callable type aliases: `receive` yields incoming request events,
# `send` emits response events (dicts per the ASGI message protocol).
Receive = Callable[[], Awaitable[dict[str, Any]]]
Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]

# Static product catalog served by the test store, keyed by product id
# (matched against the last path segment of /products/<id>).
_PRODUCTS = {
    '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'},
    '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'},
    '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'},
}
17+
18+
19+
async def _send_html(send: Send, html: str, status: int = 200) -> None:
    """Emit a complete HTML response over the ASGI ``send`` channel."""
    start_event = {
        'type': 'http.response.start',
        'status': status,
        'headers': [[b'content-type', b'text/html; charset=utf-8']],
    }
    await send(start_event)
    await send({'type': 'http.response.body', 'body': html.encode()})
28+
29+
30+
async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Tiny ASGI app serving a fake e-commerce site for crawler e2e tests.

    Routes: ``/`` (home with product links), ``/products/<id>`` (detail
    pages from ``_PRODUCTS``), ``/about``; anything else is a 404.
    """
    assert scope['type'] == 'http'
    path = scope['path']

    not_found = '<html><body>Not Found</body></html>'

    if path == '/':
        home = (
            '<html><head><title>E-commerce Test Store</title></head><body>'
            '<h1>Welcome to Test Store</h1>'
            '<a href="/products/1">Widget A</a>'
            '<a href="/products/2">Widget B</a>'
            '<a href="/products/3">Widget C</a>'
            '<a href="/about">About Us</a>'
            '</body></html>'
        )
        await _send_html(send, home)
        return

    if path.startswith('/products/'):
        # The product id is the last path segment.
        product = _PRODUCTS.get(path.rsplit('/', 1)[-1])
        if product is None:
            await _send_html(send, not_found, 404)
            return
        detail = (
            f'<html><head><title>{product["name"]}</title></head><body>'
            f'<h1>{product["name"]}</h1>'
            f'<span class="price">{product["price"]}</span>'
            f'<p class="description">{product["description"]}</p>'
            f'<a href="/">Back to Home</a>'
            f'</body></html>'
        )
        await _send_html(send, detail)
        return

    if path == '/about':
        about = (
            '<html><head><title>About Us</title></head><body>'
            '<h1>About Test Store</h1>'
            '<p class="description">We sell the best widgets in the world.</p>'
            '<a href="/">Back to Home</a>'
            '</body></html>'
        )
        await _send_html(send, about)
        return

    await _send_html(send, not_found, 404)
70+
71+
72+
if __name__ == '__main__':
    # Run uvicorn without its lifespan protocol or logging noise; the crawler
    # e2e tests expect the store on port 8080.
    server_config = Config(
        app=app,
        lifespan='off',
        loop='asyncio',
        port=8080,
        log_config=None,
        log_level=logging.CRITICAL,
    )
    asyncio.run(Server(config=server_config).serve())

0 commit comments

Comments
 (0)