diff --git a/README.md b/README.md index e945edf4..5cb37ce3 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,9 @@ Below is a comprehensive table of all available tools, how to use them with an a | Tool | Agent Usage | Use Case | |------|-------------|----------| | a2a_client | `provider = A2AClientToolProvider(known_agent_urls=["http://localhost:9000"]); agent = Agent(tools=provider.tools)` | Discover and communicate with A2A-compliant agents, send messages between agents | +| apify_run_actor | `agent.tool.apify_run_actor(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run any Apify Actor with arbitrary input | +| apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as markdown | +| apify_google_search_scraper | `agent.tool.apify_google_search_scraper(search_query="best AI frameworks")` | Search Google and return structured results | | file_read | `agent.tool.file_read(path="path/to/file.txt")` | Reading configuration files, parsing code files, loading datasets | | file_write | `agent.tool.file_write(path="path/to/file.txt", content="file content")` | Writing results to files, creating new files, saving output data | | editor | `agent.tool.editor(command="view", path="path/to/file.py")` | Advanced file operations like syntax highlighting, pattern replacement, and multi-file edits | @@ -960,6 +963,54 @@ result = agent.tool.mongodb_memory( ) ``` +### Apify + +```python +from strands import Agent +from strands_tools.apify import APIFY_ALL_TOOLS + +agent = Agent(tools=APIFY_ALL_TOOLS) + +# Scrape a single URL and get markdown content +content = agent.tool.apify_scrape_url(url="https://example.com") + +# Run an Actor and get results in one step +result = agent.tool.apify_run_actor_and_get_dataset( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, + dataset_items_limit=50, +) + +# Run a saved task 
(pre-configured Actor with default inputs) +run_info = agent.tool.apify_run_task(task_id="user/my-task") + +# Run a task and get results in one step +result = agent.tool.apify_run_task_and_get_dataset( + task_id="user/my-task", + task_input={"query": "override default input"}, + dataset_items_limit=50, +) + +# Run an Actor (get metadata only) +run_info = agent.tool.apify_run_actor( + actor_id="apify/google-search-scraper", + run_input={"queries": "AI agent frameworks"}, +) + +# Fetch dataset items separately +items = agent.tool.apify_get_dataset_items( + dataset_id="abc123", + limit=100, +) + +# Search Google +results = agent.tool.apify_google_search_scraper( + search_query="best AI frameworks 2025", + results_limit=10, +) + +``` + ## 🌍 Environment Variables Configuration Agents Tools provides extensive customization through environment variables. This allows you to configure tool behavior without modifying code, making it ideal for different environments (development, testing, production). 
@@ -1068,6 +1119,12 @@ The Mem0 Memory Tool supports three different backend configurations: - If `NEPTUNE_ANALYTICS_GRAPH_IDENTIFIER` is set, the tool will configure Neptune Analytics as graph store to enhance memory search - LLM configuration applies to all backend modes and allows customization of the language model used for memory processing +#### Apify Tool + +| Environment Variable | Description | Default | +|----------------------|-------------|---------| +| APIFY_API_TOKEN | Apify API token for authentication (required) | None | + #### Bright Data Tool | Environment Variable | Description | Default | diff --git a/docs/apify_tool.md b/docs/apify_tool.md new file mode 100644 index 00000000..daf054e7 --- /dev/null +++ b/docs/apify_tool.md @@ -0,0 +1,364 @@ +# Apify + +The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform — running any [Actor](https://apify.com/store) or [task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching dataset results, and scraping individual URLs. + +## Installation + +```bash +pip install strands-agents-tools[apify] +``` + +## Configuration + +Set your Apify API token as an environment variable: + +```bash +export APIFY_API_TOKEN=apify_api_your_token_here +``` + +Get your token from [Apify Console](https://console.apify.com/account/integrations) → Settings → API & Integrations → Personal API tokens. + +## Usage + +Register all core tools at once: + +```python +from strands import Agent +from strands_tools.apify import APIFY_CORE_TOOLS + +agent = Agent(tools=APIFY_CORE_TOOLS) +``` + +Or pick individual tools: + +```python +from strands import Agent +from strands_tools import apify + +agent = Agent(tools=[ + apify.apify_run_actor, + apify.apify_scrape_url, +]) +``` + +### Scrape a URL + +The simplest way to extract content from any web page. 
Uses the [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor under the hood and returns the page content as Markdown: + +```python +content = agent.tool.apify_scrape_url(url="https://example.com") +``` + +### Run an Actor + +Execute any Actor from [Apify Store](https://apify.com/store) by its ID. The call blocks until the Actor run finishes or the timeout is reached: + +```python +result = agent.tool.apify_run_actor( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, + timeout_secs=300, +) +``` + +The result is a JSON string containing run metadata: `run_id`, `status`, `dataset_id`, `started_at`, and `finished_at`. + +### Run an Actor and Get Results + +Combine running an Actor and fetching its dataset results in a single call: + +```python +result = agent.tool.apify_run_actor_and_get_dataset( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, + dataset_items_limit=50, +) +``` + +### Run a task + +Execute a saved [Actor task](https://docs.apify.com/platform/actors/running/tasks) — a pre-configured Actor with preset inputs. Use this when a task has already been set up in Apify Console: + +```python +result = agent.tool.apify_run_task( + task_id="user~my-task", + task_input={"query": "override input"}, + timeout_secs=300, +) +``` + +The result is a JSON string containing run metadata: `run_id`, `status`, `dataset_id`, `started_at`, and `finished_at`. + +### Run a task and get results + +Combine running a task and fetching its dataset results in a single call: + +```python +result = agent.tool.apify_run_task_and_get_dataset( + task_id="user~my-task", + dataset_items_limit=50, +) +``` + +### Fetch dataset items + +Retrieve results from a dataset by its ID. 
Useful after running an Actor to get the structured results separately, or to access any existing dataset: + +```python +items = agent.tool.apify_get_dataset_items( + dataset_id="abc123", + limit=100, + offset=0, +) +``` + +## Tool Parameters + +### apify_scrape_url + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `url` | string | Yes | — | The URL to scrape | +| `timeout_secs` | int | No | 120 | Maximum time in seconds to wait for scraping to finish | +| `crawler_type` | string | No | `"cheerio"` | Crawler engine to use. One of `"cheerio"` (fastest, no JS rendering), `"playwright:adaptive"` (fast, renders JS if present), or `"playwright:firefox"` (reliable, renders JS, best at avoiding blocking but slower) | + +**Returns:** Markdown content of the scraped page as a plain string. + +### apify_run_actor + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `actor_id` | string | Yes | — | Actor identifier (e.g., `apify/website-content-crawler`) | +| `run_input` | dict | No | None | JSON-serializable input for the Actor | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor run to finish | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor run (uses Actor default if not set) | +| `build` | string | No | None | Actor build tag or number to run a specific version (uses latest build if not set) | + +**Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`. 
+
+### apify_run_task
+
+| Parameter | Type | Required | Default | Description |
+|-----------|------|----------|---------|-------------|
+| `task_id` | string | Yes | — | Task identifier (e.g., `user~my-task` or a task ID) |
+| `task_input` | dict | No | None | JSON-serializable input to override the task's default input |
+| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the task run to finish |
+| `memory_mbytes` | int | No | None | Memory allocation in MB for the task run (uses task default if not set) |
+
+**Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`.
+
+### apify_run_task_and_get_dataset
+
+| Parameter | Type | Required | Default | Description |
+|-----------|------|----------|---------|-------------|
+| `task_id` | string | Yes | — | Task identifier (e.g., `user~my-task` or a task ID) |
+| `task_input` | dict | No | None | JSON-serializable input to override the task's default input |
+| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the task run to finish |
+| `memory_mbytes` | int | No | None | Memory allocation in MB for the task run (uses task default if not set) |
+| `dataset_items_limit` | int | No | 100 | Maximum number of dataset items to return |
+| `dataset_items_offset` | int | No | 0 | Number of dataset items to skip for pagination |
+
+**Returns:** JSON string with run metadata plus an `items` array containing the dataset results.
+
+### apify_get_dataset_items
+
+| Parameter | Type | Required | Default | Description |
+|-----------|------|----------|---------|-------------|
+| `dataset_id` | string | Yes | — | The Apify dataset ID to fetch items from |
+| `limit` | int | No | 100 | Maximum number of items to return |
+| `offset` | int | No | 0 | Number of items to skip for pagination |
+
+**Returns:** JSON string containing an array of dataset items.
+
+### apify_run_actor_and_get_dataset
+
+| Parameter | Type | Required | Default | Description |
+|-----------|------|----------|---------|-------------|
+| `actor_id` | string | Yes | — | Actor identifier (e.g., `apify/website-content-crawler`) |
+| `run_input` | dict | No | None | JSON-serializable input for the Actor |
+| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor run to finish |
+| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor run (uses Actor default if not set) |
+| `build` | string | No | None | Actor build tag or number to run a specific version (uses latest build if not set) |
+| `dataset_items_limit` | int | No | 100 | Maximum number of dataset items to return |
+| `dataset_items_offset` | int | No | 0 | Number of dataset items to skip for pagination |
+
+**Returns:** JSON string with run metadata plus an `items` array containing the dataset results.
+
+## Search & Crawling
+
+Specialized tools for common search and crawling use cases.
Register all search tools at once: + +```python +from strands import Agent +from strands_tools.apify import APIFY_SEARCH_TOOLS + +agent = Agent(tools=APIFY_SEARCH_TOOLS) +``` + +Or register all Apify tools (core + search): + +```python +from strands_tools.apify import APIFY_ALL_TOOLS + +agent = Agent(tools=APIFY_ALL_TOOLS) +``` + +### Search Google + +Search Google and return structured results using the [Google Search Scraper](https://apify.com/apify/google-search-scraper) Actor: + +```python +result = agent.tool.apify_google_search_scraper( + search_query="best AI frameworks 2025", + results_limit=10, + country_code="us", +) +``` + +### Search Google Maps + +Search Google Maps for businesses and places using the [Google Maps Scraper](https://apify.com/compass/crawler-google-places) Actor: + +```python +result = agent.tool.apify_google_places_scraper( + search_query="restaurants in Prague", + results_limit=20, + include_reviews=True, + max_reviews=5, +) +``` + +### Scrape YouTube + +Scrape YouTube videos, channels, or search results using the [YouTube Scraper](https://apify.com/streamers/youtube-scraper) Actor: + +```python +# Search YouTube +result = agent.tool.apify_youtube_scraper( + search_query="python tutorial", + results_limit=10, +) + +# Scrape specific videos +result = agent.tool.apify_youtube_scraper( + urls=["https://www.youtube.com/watch?v=dQw4w9WgXcQ"], +) +``` + +### Crawl a website + +Crawl a website and extract content from multiple pages using the [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor. This is the multi-page version — distinct from `apify_scrape_url` which is limited to a single page: + +```python +result = agent.tool.apify_website_content_crawler( + start_url="https://docs.example.com", + max_pages=20, + max_depth=3, +) +``` + +### Scrape e-commerce products + +Scrape product data from e-commerce websites using the [E-commerce Scraping Tool](https://apify.com/apify/e-commerce-scraping-tool) Actor. 
Supports Amazon, eBay, Walmart, and other platforms: + +```python +# Scrape a single product page +result = agent.tool.apify_ecommerce_scraper( + url="https://www.amazon.com/dp/B0TEST", +) + +# Scrape a category or search results page +result = agent.tool.apify_ecommerce_scraper( + url="https://www.amazon.com/s?k=headphones", + url_type="listing", + results_limit=20, +) +``` + +## Search & Crawling Tool Parameters + +### apify_google_search_scraper + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `search_query` | string | Yes | — | The search query string. Supports advanced Google operators like `"site:example.com"` | +| `results_limit` | int | No | 10 | Maximum number of results to return. Google returns ~10 per page, so requesting more triggers additional page scraping | +| `country_code` | string | No | None | Two-letter country code for localized results (e.g., `"us"`, `"de"`) | +| `language_code` | string | No | None | Two-letter language code (e.g., `"en"`, `"de"`) | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait | + +**Returns:** JSON string with run metadata and an `items` array containing structured search results (organic results, ads, People Also Ask). 
+
+### apify_google_places_scraper
+
+| Parameter | Type | Required | Default | Description |
+|-----------|------|----------|---------|-------------|
+| `search_query` | string | Yes | — | Search query for Google Maps (e.g., `"restaurants in Prague"`) |
+| `results_limit` | int | No | 20 | Maximum number of places to return |
+| `language` | string | No | None | Language for results (e.g., `"en"`, `"de"`) |
+| `include_reviews` | bool | No | False | Whether to include user reviews |
+| `max_reviews` | int | No | 5 | Maximum reviews per place when `include_reviews` is True |
+| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait |
+
+**Returns:** JSON string with run metadata and an `items` array containing place data (name, address, rating, phone, website).
+
+### apify_youtube_scraper
+
+| Parameter | Type | Required | Default | Description |
+|-----------|------|----------|---------|-------------|
+| `search_query` | string | No | None | YouTube search query |
+| `urls` | list[str] | No | None | Specific YouTube video or channel URLs |
+| `results_limit` | int | No | 20 | Maximum number of results to return |
+| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait |
+
+At least one of `search_query` or `urls` must be provided.
+
+**Returns:** JSON string with run metadata and an `items` array containing video/channel data.
+
+### apify_website_content_crawler
+
+| Parameter | Type | Required | Default | Description |
+|-----------|------|----------|---------|-------------|
+| `start_url` | string | Yes | — | The starting URL to crawl |
+| `max_pages` | int | No | 10 | Maximum number of pages to crawl |
+| `max_depth` | int | No | 2 | Maximum crawl depth from the start URL |
+| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait |
+
+**Returns:** JSON string with run metadata and an `items` array containing crawled page data with markdown content.
+
+### apify_ecommerce_scraper
+
+| Parameter | Type | Required | Default | Description |
+|-----------|------|----------|---------|-------------|
+| `url` | string | Yes | — | The URL to scrape |
+| `url_type` | string | No | `"product"` | Type of URL: `"product"` for a product detail page, `"listing"` for a category or search results page |
+| `results_limit` | int | No | 20 | Maximum number of products to return |
+| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait |
+
+**Returns:** JSON string with run metadata and an `items` array containing structured product data.
+
+## Troubleshooting
+
+| Error | Cause | Fix |
+|-------|-------|-----|
+| `APIFY_API_TOKEN environment variable is not set` | Token not configured | Set the `APIFY_API_TOKEN` environment variable |
+| `apify-client package is required` | Optional dependency not installed | Run `pip install strands-agents-tools[apify]` |
+| `Actor ... finished with status FAILED` | Actor execution error | Check Actor input parameters and run logs in [Apify Console](https://console.apify.com) |
+| `Task ... finished with status FAILED` | Task execution error | Check task configuration and run logs in [Apify Console](https://console.apify.com) |
+| `Actor/task ... finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter; `apify_website_content_crawler` with large `max_pages` may need 600+ seconds |
+| `Task ...
returned no run data` | task `call()` returned `None` (wait timeout) | Increase the `timeout_secs` parameter | +| `No content returned for URL` | Website Content Crawler returned empty results | Verify the URL is accessible and returns content | +| `At least one of 'search_query' or 'urls' must be provided` | YouTube Scraper called without input | Provide a `search_query`, `urls`, or both | + +## References + +- [Strands Agents Tools](https://strandsagents.com/latest/user-guide/concepts/tools/tools_overview/) +- [Apify Platform](https://apify.com) +- [Apify API Documentation](https://docs.apify.com/api/v2) +- [Apify Store](https://apify.com/store) +- [Apify Python Client](https://docs.apify.com/api/client/python/docs) +- [Google Search Scraper Actor](https://apify.com/apify/google-search-scraper) +- [Google Maps Scraper Actor](https://apify.com/compass/crawler-google-places) +- [YouTube Scraper Actor](https://apify.com/streamers/youtube-scraper) +- [Website Content Crawler Actor](https://apify.com/apify/website-content-crawler) +- [E-commerce Scraping Tool Actor](https://apify.com/apify/e-commerce-scraping-tool) diff --git a/pyproject.toml b/pyproject.toml index bf00325f..93e05c6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,9 @@ Homepage = "https://github.com/strands-agents/tools" Documentation = "https://strandsagents.com/" [project.optional-dependencies] +apify = [ + "apify-client>=2.5.0,<3.0.0", +] build = [ "hatch>=1.16.5", ] @@ -122,7 +125,7 @@ mongodb-memory = [ ] [tool.hatch.envs.hatch-static-analysis] -features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", "mongodb-memory"] +features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", "mongodb-memory", "apify"] dependencies = 
[ "strands-agents>=1.0.0", "mypy>=0.981,<1.0.0", @@ -141,7 +144,7 @@ lint-check = [ lint-fix = ["ruff check --fix"] [tool.hatch.envs.hatch-test] -features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", "mongodb-memory"] +features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", "mongodb-memory", "apify"] extra-dependencies = [ "moto>=5.1.0,<6.0.0", "pytest>=8.0.0,<10.0.0", diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py new file mode 100644 index 00000000..0194ed64 --- /dev/null +++ b/src/strands_tools/apify.py @@ -0,0 +1,978 @@ +"""Apify platform tools for Strands Agents. + +This module provides web scraping, data extraction, and automation capabilities +using the Apify platform. It lets you run any Actor, task, fetch dataset +results, scrape individual URLs, and perform specialized search and crawling. 
+ +Available Tools: +--------------- +Core: +- apify_run_actor: Run any Apify Actor with custom input +- apify_get_dataset_items: Fetch items from an Apify dataset with pagination +- apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step +- apify_run_task: Run a saved Actor task with optional input overrides +- apify_run_task_and_get_dataset: Run a task and fetch results in one step +- apify_scrape_url: Scrape a single URL and return content as Markdown + +Search & Crawling: +- apify_google_search_scraper: Search Google and return structured results +- apify_google_places_scraper: Search Google Maps for businesses and places +- apify_youtube_scraper: Scrape YouTube videos, channels, or search results +- apify_website_content_crawler: Crawl a website and extract content from multiple pages +- apify_ecommerce_scraper: Scrape product data from e-commerce websites + +Setup Requirements: +------------------ +1. Create an Apify account at https://apify.com +2. Obtain your API token: Apify Console > Settings > API & Integrations > Personal API tokens +3. Install the optional dependency: pip install strands-agents-tools[apify] +4. 
Set the environment variable: + APIFY_API_TOKEN=your_api_token_here + +Usage Examples: +-------------- +Register all core tools at once via the preset list: + +```python +from strands import Agent +from strands_tools.apify import APIFY_CORE_TOOLS + +agent = Agent(tools=APIFY_CORE_TOOLS) +``` + +Register all search & crawling tools: + +```python +from strands import Agent +from strands_tools.apify import APIFY_SEARCH_TOOLS + +agent = Agent(tools=APIFY_SEARCH_TOOLS) +``` + +Register all Apify tools (core + search): + +```python +from strands import Agent +from strands_tools.apify import APIFY_ALL_TOOLS + +agent = Agent(tools=APIFY_ALL_TOOLS) +``` + +Or pick individual tools for a smaller LLM tool surface: + +```python +from strands import Agent +from strands_tools import apify + +agent = Agent(tools=[ + apify.apify_scrape_url, + apify.apify_run_actor, + apify.apify_google_search_scraper, +]) + +# Scrape a single URL +content = agent.tool.apify_scrape_url(url="https://example.com") + +# Run an Actor +result = agent.tool.apify_run_actor( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, +) + +# Search Google +results = agent.tool.apify_google_search_scraper( + search_query="best AI frameworks 2025", + results_limit=10, +) +``` +""" + +import json +import logging +import os +from typing import Any, Dict, List, Literal, Optional, get_args +from urllib.parse import urlparse + +from rich.panel import Panel +from rich.text import Text +from strands import tool + +from strands_tools.utils import console_util + +logger = logging.getLogger(__name__) +console = console_util.create() + +try: + from apify_client import ApifyClient + from apify_client.errors import ApifyApiError + + HAS_APIFY_CLIENT = True +except ImportError: + HAS_APIFY_CLIENT = False + +# Attribution header - lets Apify track usage originating from strands-agents (analytics only) +TRACKING_HEADER = {"x-apify-integration-platform": "strands-agents"} 
+ERROR_PANEL_TITLE = "[bold red]Apify Error[/bold red]" +DEFAULT_TIMEOUT_SECS = 300 +DEFAULT_SCRAPE_TIMEOUT_SECS = 120 +DEFAULT_DATASET_ITEMS_LIMIT = 100 + +WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" +CrawlerType = Literal["playwright:adaptive", "playwright:firefox", "cheerio"] +WEBSITE_CONTENT_CRAWLER_TYPES = get_args(CrawlerType) + + +# --- Helper functions --- + + +def _check_dependency() -> None: + """Raise ImportError if apify-client is not installed.""" + if not HAS_APIFY_CLIENT: + raise ImportError("apify-client package is required. Install with: pip install strands-agents-tools[apify]") + + +def _format_error(e: Exception) -> str: + """Map exceptions to user-friendly error messages, with special handling for ApifyApiError.""" + if HAS_APIFY_CLIENT and isinstance(e, ApifyApiError): + status_code = getattr(e, "status_code", None) + msg = getattr(e, "message", str(e)) + match status_code: + case 400: + return f"Invalid request: {msg}" + case 401: + return "Authentication failed. Verify your APIFY_API_TOKEN is valid." + case 402: + return "Insufficient Apify plan credits or subscription limits exceeded." + case 404: + return f"Resource not found: {msg}" + case 408: + return f"Actor run timed out: {msg}" + case 429: + return ( + "Rate limit exceeded. The Apify client retries automatically; " + "if this persists, reduce request frequency." 
+ ) + case None: + return f"Apify API error: {msg}" + case _: + return f"Apify API error ({status_code}): {msg}" + return str(e) + + +def _error_result(e: Exception, tool_name: str) -> Dict[str, Any]: + """Build a structured error response and display an error panel.""" + message = _format_error(e) + logger.error("%s failed: %s", tool_name, message) + console.print(Panel(Text(message, style="red"), title=ERROR_PANEL_TITLE, border_style="red")) + return {"status": "error", "content": [{"text": message}]} + + +def _success_result(text: str, panel_body: str, panel_title: str) -> Dict[str, Any]: + """Build a structured success response and display a success panel.""" + console.print(Panel(panel_body, title=f"[bold cyan]{panel_title}[/bold cyan]", border_style="green")) + return {"status": "success", "content": [{"text": text}]} + + +class ApifyToolClient: + """Helper class encapsulating Apify API interactions via apify-client.""" + + def __init__(self) -> None: + token = os.getenv("APIFY_API_TOKEN", "") + if not token: + raise ValueError( + "APIFY_API_TOKEN environment variable is not set. " + "Get your token at https://console.apify.com/account/integrations" + ) + self.client: "ApifyClient" = ApifyClient(token, headers=TRACKING_HEADER) + + @staticmethod + def _check_run_status(actor_run: Dict[str, Any], label: str) -> None: + """Raise RuntimeError if the Actor run did not succeed.""" + status = actor_run.get("status", "UNKNOWN") + if status != "SUCCEEDED": + run_id = actor_run.get("id", "N/A") + raise RuntimeError(f"{label} finished with status {status}. Run ID: {run_id}") + + @staticmethod + def _validate_url(url: str) -> None: + """Raise ValueError if the URL does not have a valid HTTP(S) scheme and domain.""" + parsed = urlparse(url) + if parsed.scheme not in ("http", "https"): + raise ValueError(f"Invalid URL scheme '{parsed.scheme}'. Only http and https URLs are supported.") + if not parsed.netloc: + raise ValueError(f"Invalid URL '{url}'. 
A domain is required.") + + @staticmethod + def _validate_identifier(value: str, name: str) -> None: + """Raise ValueError if a required string identifier is empty or whitespace-only.""" + if not value.strip(): + raise ValueError(f"'{name}' must be a non-empty string.") + + @staticmethod + def _validate_positive(value: int, name: str) -> None: + """Raise ValueError if the value is not a positive integer (> 0).""" + if value <= 0: + raise ValueError(f"'{name}' must be a positive integer, got {value}.") + + @staticmethod + def _validate_non_negative(value: int, name: str) -> None: + """Raise ValueError if the value is negative.""" + if value < 0: + raise ValueError(f"'{name}' must be a non-negative integer, got {value}.") + + def run_actor( + self, + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + build: Optional[str] = None, + ) -> Dict[str, Any]: + """Run an Apify Actor synchronously and return run metadata.""" + self._validate_identifier(actor_id, "actor_id") + self._validate_positive(timeout_secs, "timeout_secs") + if memory_mbytes is not None: + self._validate_positive(memory_mbytes, "memory_mbytes") + + call_kwargs: Dict[str, Any] = { + "run_input": run_input if run_input is not None else {}, + "timeout_secs": timeout_secs, + "logger": None, # Suppress verbose apify-client logging not useful to end users + } + if memory_mbytes is not None: + call_kwargs["memory_mbytes"] = memory_mbytes + if build is not None: + call_kwargs["build"] = build + + actor_run = self.client.actor(actor_id).call(**call_kwargs) + if actor_run is None: + raise RuntimeError(f"Actor {actor_id} returned no run data (possible wait timeout).") + self._check_run_status(actor_run, f"Actor {actor_id}") + + return { + "run_id": actor_run.get("id"), + "status": actor_run.get("status"), + "dataset_id": actor_run.get("defaultDatasetId"), + "started_at": actor_run.get("startedAt"), + "finished_at": 
actor_run.get("finishedAt"), + } + + def get_dataset_items( + self, + dataset_id: str, + limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + offset: int = 0, + ) -> List[Dict[str, Any]]: + """Fetch items from an Apify dataset.""" + self._validate_identifier(dataset_id, "dataset_id") + self._validate_positive(limit, "limit") + self._validate_non_negative(offset, "offset") + + result = self.client.dataset(dataset_id).list_items(limit=limit, offset=offset) + return list(result.items) + + def run_actor_and_get_dataset( + self, + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + build: Optional[str] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, + ) -> Dict[str, Any]: + """Run an Actor synchronously, then fetch its default dataset items.""" + self._validate_positive(dataset_items_limit, "dataset_items_limit") + self._validate_non_negative(dataset_items_offset, "dataset_items_offset") + + run_metadata = self.run_actor( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + build=build, + ) + dataset_id = run_metadata["dataset_id"] + if not dataset_id: + raise RuntimeError(f"Actor {actor_id} run has no default dataset.") + items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit, offset=dataset_items_offset) + return {**run_metadata, "items": items} + + def run_task( + self, + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + ) -> Dict[str, Any]: + """Run an Apify task synchronously and return run metadata.""" + self._validate_identifier(task_id, "task_id") + self._validate_positive(timeout_secs, "timeout_secs") + if memory_mbytes is not None: + self._validate_positive(memory_mbytes, "memory_mbytes") + + call_kwargs: Dict[str, Any] = {"timeout_secs": timeout_secs} + 
if task_input is not None: + call_kwargs["task_input"] = task_input + if memory_mbytes is not None: + call_kwargs["memory_mbytes"] = memory_mbytes + + task_run = self.client.task(task_id).call(**call_kwargs) + if task_run is None: + raise RuntimeError(f"Task {task_id} returned no run data (possible wait timeout).") + self._check_run_status(task_run, f"Task {task_id}") + + return { + "run_id": task_run.get("id"), + "status": task_run.get("status"), + "dataset_id": task_run.get("defaultDatasetId"), + "started_at": task_run.get("startedAt"), + "finished_at": task_run.get("finishedAt"), + } + + def run_task_and_get_dataset( + self, + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, + ) -> Dict[str, Any]: + """Run a task synchronously, then fetch its default dataset items.""" + self._validate_positive(dataset_items_limit, "dataset_items_limit") + self._validate_non_negative(dataset_items_offset, "dataset_items_offset") + + run_metadata = self.run_task( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + ) + dataset_id = run_metadata["dataset_id"] + if not dataset_id: + raise RuntimeError(f"Task {task_id} run has no default dataset.") + items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit, offset=dataset_items_offset) + return {**run_metadata, "items": items} + + def scrape_url( + self, + url: str, + timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, + crawler_type: CrawlerType = "cheerio", + ) -> str: + """Scrape a single URL using Website Content Crawler and return markdown.""" + self._validate_url(url) + self._validate_positive(timeout_secs, "timeout_secs") + if crawler_type not in WEBSITE_CONTENT_CRAWLER_TYPES: + raise ValueError( + f"Invalid crawler_type '{crawler_type}'. 
Must be one of: {', '.join(WEBSITE_CONTENT_CRAWLER_TYPES)}." + ) + + run_input: Dict[str, Any] = { + "startUrls": [{"url": url}], + "maxCrawlPages": 1, + "crawlerType": crawler_type, + } + actor_run = self.client.actor(WEBSITE_CONTENT_CRAWLER).call( + run_input=run_input, + timeout_secs=timeout_secs, + logger=None, # Suppress verbose apify-client logging not useful to end users + ) + if actor_run is None: + raise RuntimeError("Website Content Crawler returned no run data (possible wait timeout).") + self._check_run_status(actor_run, "Website Content Crawler") + + dataset_id = actor_run.get("defaultDatasetId") + if not dataset_id: + raise RuntimeError("Website Content Crawler run has no default dataset.") + result = self.client.dataset(dataset_id).list_items(limit=1) + items = list(result.items) + + if not items: + raise RuntimeError(f"No content returned for URL: {url}") + + return str(items[0].get("markdown") or items[0].get("text", "")) + + +# --- Tool functions --- + + +@tool +def apify_run_actor( + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + build: Optional[str] = None, +) -> Dict[str, Any]: + """Run any Apify Actor and return the run metadata as JSON. + + Executes the Actor synchronously - blocks until the Actor run finishes or the timeout + is reached. Use this when you need to run a specific Actor and then inspect or process + the results separately. + + Common Actors: + - "apify/website-content-crawler" - scrape websites and extract content + - "apify/web-scraper" - general-purpose web scraper + - "apify/google-search-scraper" - scrape Google search results + + Args: + actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". + run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. + timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. 
+ memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. + build: Actor build tag or number to run a specific version. Uses latest build if not set. + + Returns: + Dict with status and content containing run metadata: run_id, status, dataset_id, + started_at, finished_at. + """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_actor( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + build=build, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Actor run completed[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}" + ), + panel_title="Apify: Run Actor", + ) + except Exception as e: + return _error_result(e, "apify_run_actor") + + +@tool +def apify_get_dataset_items( + dataset_id: str, + limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + offset: int = 0, +) -> Dict[str, Any]: + """Fetch items from an existing Apify dataset and return them as JSON. + + Use this after running an Actor to retrieve the structured results from its + default dataset, or to access any dataset by ID. + + Args: + dataset_id: The Apify dataset ID to fetch items from. + limit: Maximum number of items to return. Defaults to 100. + offset: Number of items to skip for pagination. Defaults to 0. + + Returns: + Dict with status and content containing an array of dataset items. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + items = client.get_dataset_items(dataset_id=dataset_id, limit=limit, offset=offset) + return _success_result( + text=json.dumps(items, indent=2, default=str), + panel_body=( + f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}" + ), + panel_title="Apify: Dataset Items", + ) + except Exception as e: + return _error_result(e, "apify_get_dataset_items") + + +@tool +def apify_run_actor_and_get_dataset( + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + build: Optional[str] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, +) -> Dict[str, Any]: + """Run an Apify Actor and fetch its dataset results in one step. + + Convenience tool that combines running an Actor and fetching its default + dataset items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". + run_input: JSON-serializable input for the Actor. + timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor run. + build: Actor build tag or number to run a specific version. Uses latest build if not set. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. + + Returns: + Dict with status and content containing run metadata (run_id, status, dataset_id, + started_at, finished_at) plus an "items" array containing the dataset results. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_actor_and_get_dataset( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + build=build, + dataset_items_limit=dataset_items_limit, + dataset_items_offset=dataset_items_offset, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Actor run completed with dataset[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}" + ), + panel_title="Apify: Run Actor + Dataset", + ) + except Exception as e: + return _error_result(e, "apify_run_actor_and_get_dataset") + + +@tool +def apify_run_task( + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, +) -> Dict[str, Any]: + """Run an Apify task and return the run metadata as JSON. + + Tasks are saved Actor configurations with preset inputs. Use this when a task + has already been configured in Apify Console, so you don't need to specify + the full Actor input every time. + + Args: + task_id: Task identifier, e.g. "user/my-task" or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input. + timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. + + Returns: + Dict with status and content containing run metadata: run_id, status, dataset_id, + started_at, finished_at. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task run completed[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}" + ), + panel_title="Apify: Run Task", + ) + except Exception as e: + return _error_result(e, "apify_run_task") + + +@tool +def apify_run_task_and_get_dataset( + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, +) -> Dict[str, Any]: + """Run an Apify task and fetch its dataset results in one step. + + Convenience tool that combines running a task and fetching its default + dataset items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + task_id: Task identifier, e.g. "user/my-task" or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input. + timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the task run. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. + + Returns: + Dict with status and content containing run metadata (run_id, status, dataset_id, + started_at, finished_at) plus an "items" array containing the dataset results. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task_and_get_dataset( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + dataset_items_limit=dataset_items_limit, + dataset_items_offset=dataset_items_offset, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task run completed with dataset[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}" + ), + panel_title="Apify: Run Task + Dataset", + ) + except Exception as e: + return _error_result(e, "apify_run_task_and_get_dataset") + + +@tool +def apify_scrape_url( + url: str, + timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, + crawler_type: CrawlerType = "cheerio", +) -> Dict[str, Any]: + """Scrape a single URL and return its content as markdown. + + Uses the Website Content Crawler Actor under the hood, pre-configured for + fast single-page scraping. This is the simplest way to extract readable content + from any web page. + + Args: + url: The URL to scrape, e.g. "https://example.com". + timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. + crawler_type: Crawler engine to use. One of "cheerio" (fastest, no JS rendering, + default), "playwright:adaptive" (fast, renders JS if present), or + "playwright:firefox" (reliable, renders JS, best at avoiding blocking but slower). + + Returns: + Dict with status and content containing the markdown content of the scraped page. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + content = client.scrape_url(url=url, timeout_secs=timeout_secs, crawler_type=crawler_type) + return _success_result( + text=content, + panel_body=( + f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters" + ), + panel_title="Apify: Scrape URL", + ) + except Exception as e: + return _error_result(e, "apify_scrape_url") + + +APIFY_CORE_TOOLS = [ + apify_run_actor, + apify_get_dataset_items, + apify_run_actor_and_get_dataset, + apify_run_task, + apify_run_task_and_get_dataset, + apify_scrape_url, +] + + +# --- Search & crawling tool constants --- + +GOOGLE_SEARCH_SCRAPER_ID = "apify/google-search-scraper" +GOOGLE_PLACES_SCRAPER_ID = "compass/crawler-google-places" +YOUTUBE_SCRAPER_ID = "streamers/youtube-scraper" +ECOMMERCE_SCRAPER_ID = "apify/e-commerce-scraping-tool" +DEFAULT_SEARCH_RESULTS_LIMIT = 20 + + +# --- Search & crawling helpers --- + + +def _search_crawl_result( + actor_name: str, + client: ApifyToolClient, + run_input: Dict[str, Any], + actor_id: str, + timeout_secs: int, + results_limit: int, +) -> Dict[str, Any]: + """Run a search/crawling Actor and return formatted results.""" + result = client.run_actor_and_get_dataset( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=results_limit, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]{actor_name} completed[/green]\nRun ID: {result['run_id']}\nItems returned: {len(result['items'])}" + ), + panel_title=f"Apify: {actor_name}", + ) + + +# --- Search & crawling tool functions --- + + +@tool +def apify_google_search_scraper( + search_query: str, + results_limit: int = 10, + country_code: Optional[str] = None, + language_code: Optional[str] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, +) -> Dict[str, Any]: + """Search Google and return structured search results. 
+ + Uses the Google Search Scraper Actor to perform a Google search and return + organic results, ads, People Also Ask, and related queries in a structured format. + + Args: + search_query: The search query string, e.g. "best AI frameworks 2025". + Supports advanced Google operators like "site:example.com" or "AI OR ML". + results_limit: Maximum number of results to return. Google returns ~10 results + per page, so requesting more triggers additional page scraping. Defaults to 10. + country_code: Two-letter country code for localized results, e.g. "us", "de". + language_code: Two-letter language code for the interface, e.g. "en", "de". + timeout_secs: Maximum time in seconds to wait for the run to finish. Defaults to 300. + + Returns: + Dict with status and content containing structured Google search results including + organic results, ads, and People Also Ask data. + """ + try: + _check_dependency() + client = ApifyToolClient() + max_pages = max(1, (results_limit + 9) // 10) + run_input: Dict[str, Any] = { + "queries": search_query, + "maxPagesPerQuery": max_pages, + } + if country_code is not None: + run_input["countryCode"] = country_code + if language_code is not None: + run_input["languageCode"] = language_code + return _search_crawl_result( + actor_name="Google Search Scraper", + client=client, + run_input=run_input, + actor_id=GOOGLE_SEARCH_SCRAPER_ID, + timeout_secs=timeout_secs, + results_limit=results_limit, + ) + except Exception as e: + return _error_result(e, "apify_google_search_scraper") + + +@tool +def apify_google_places_scraper( + search_query: str, + results_limit: int = DEFAULT_SEARCH_RESULTS_LIMIT, + language: Optional[str] = None, + include_reviews: bool = False, + max_reviews: int = 5, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, +) -> Dict[str, Any]: + """Search Google Maps for businesses and places, optionally including reviews. 
+ + Uses the Google Maps Scraper Actor to find places matching a search query + and return structured data including name, address, rating, phone, and website. + + Args: + search_query: Search query for Google Maps, e.g. "restaurants in Prague". + results_limit: Maximum number of places to return. Defaults to 20. + language: Language for results, e.g. "en", "de". Defaults to English. + include_reviews: Whether to include user reviews for each place. Defaults to False. + max_reviews: Maximum reviews per place when include_reviews is True. Defaults to 5. + timeout_secs: Maximum time in seconds to wait for the run to finish. Defaults to 300. + + Returns: + Dict with status and content containing structured Google Maps place data. + """ + try: + _check_dependency() + client = ApifyToolClient() + run_input: Dict[str, Any] = { + "searchStringsArray": [search_query], + "maxCrawledPlacesPerSearch": results_limit, + "maxReviews": max_reviews if include_reviews else 0, + } + if language is not None: + run_input["language"] = language + return _search_crawl_result( + actor_name="Google Places Scraper", + client=client, + run_input=run_input, + actor_id=GOOGLE_PLACES_SCRAPER_ID, + timeout_secs=timeout_secs, + results_limit=results_limit, + ) + except Exception as e: + return _error_result(e, "apify_google_places_scraper") + + +@tool +def apify_youtube_scraper( + search_query: Optional[str] = None, + urls: Optional[List[str]] = None, + results_limit: int = DEFAULT_SEARCH_RESULTS_LIMIT, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, +) -> Dict[str, Any]: + """Scrape YouTube videos, channels, or search results. + + Uses the YouTube Scraper Actor to search YouTube or scrape specific video/channel + URLs. Provide either a search query, specific URLs, or both. + + Args: + search_query: YouTube search query, e.g. "python tutorial". + urls: Specific YouTube video or channel URLs to scrape. + results_limit: Maximum number of results to return. Defaults to 20. 
+ timeout_secs: Maximum time in seconds to wait for the run to finish. Defaults to 300. + + Returns: + Dict with status and content containing structured YouTube video/channel data. + """ + try: + _check_dependency() + if not search_query and not urls: + raise ValueError("At least one of 'search_query' or 'urls' must be provided.") + client = ApifyToolClient() + run_input: Dict[str, Any] = { + "maxResults": results_limit, + } + if search_query is not None: + run_input["searchQueries"] = [search_query] + if urls is not None: + run_input["startUrls"] = [{"url": u} for u in urls] + return _search_crawl_result( + actor_name="YouTube Scraper", + client=client, + run_input=run_input, + actor_id=YOUTUBE_SCRAPER_ID, + timeout_secs=timeout_secs, + results_limit=results_limit, + ) + except Exception as e: + return _error_result(e, "apify_youtube_scraper") + + +@tool +def apify_website_content_crawler( + start_url: str, + max_pages: int = 10, + max_depth: int = 2, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, +) -> Dict[str, Any]: + """Crawl a website and extract content from multiple pages. + + Uses the Website Content Crawler Actor to perform a multi-page crawl starting + from the given URL. Returns page content as markdown. This is the extended + multi-page version — distinct from apify_scrape_url which scrapes a single page. + + Args: + start_url: The starting URL to crawl, e.g. "https://docs.example.com". + max_pages: Maximum number of pages to crawl. Defaults to 10. + max_depth: Maximum crawl depth from the start URL. Defaults to 2. + timeout_secs: Maximum time in seconds to wait for the run to finish. Defaults to 300. + + Returns: + Dict with status and content containing crawled page data with markdown content. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + client._validate_url(start_url) + run_input: Dict[str, Any] = { + "startUrls": [{"url": start_url}], + "maxCrawlPages": max_pages, + "maxCrawlDepth": max_depth, + "proxyConfiguration": {"useApifyProxy": True}, + } + return _search_crawl_result( + actor_name="Website Content Crawler", + client=client, + run_input=run_input, + actor_id=WEBSITE_CONTENT_CRAWLER, + timeout_secs=timeout_secs, + results_limit=max_pages, + ) + except Exception as e: + return _error_result(e, "apify_website_content_crawler") + + +VALID_ECOMMERCE_URL_TYPES = ("product", "listing") + + +@tool +def apify_ecommerce_scraper( + url: str, + url_type: str = "product", + results_limit: int = DEFAULT_SEARCH_RESULTS_LIMIT, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, +) -> Dict[str, Any]: + """Scrape product data from e-commerce websites. + + Uses the E-commerce Scraping Tool Actor to extract structured product data + (title, price, description, images, etc.) from supported e-commerce platforms + including Amazon, eBay, Walmart, and others. The Actor auto-detects the site. + + Args: + url: The URL to scrape. + url_type: Type of URL being scraped. Use "product" (default) for a direct product + detail page, or "listing" for a category page or search results page containing + multiple products. + results_limit: Maximum number of products to return. Defaults to 20. + timeout_secs: Maximum time in seconds to wait for the run to finish. Defaults to 300. + + Returns: + Dict with status and content containing structured product data. + """ + try: + _check_dependency() + client = ApifyToolClient() + client._validate_url(url) + if url_type not in VALID_ECOMMERCE_URL_TYPES: + raise ValueError(f"Invalid url_type '{url_type}'. 
Must be one of: {', '.join(VALID_ECOMMERCE_URL_TYPES)}.") + url_field = "listingUrls" if url_type == "listing" else "detailsUrls" + run_input: Dict[str, Any] = { + url_field: [{"url": url}], + "maxProductResults": results_limit, + } + return _search_crawl_result( + actor_name="E-commerce Scraper", + client=client, + run_input=run_input, + actor_id=ECOMMERCE_SCRAPER_ID, + timeout_secs=timeout_secs, + results_limit=results_limit, + ) + except Exception as e: + return _error_result(e, "apify_ecommerce_scraper") + + +APIFY_SEARCH_TOOLS = [ + apify_google_search_scraper, + apify_google_places_scraper, + apify_youtube_scraper, + apify_website_content_crawler, + apify_ecommerce_scraper, +] + +APIFY_ALL_TOOLS = APIFY_CORE_TOOLS + APIFY_SEARCH_TOOLS diff --git a/tests/test_apify.py b/tests/test_apify.py new file mode 100644 index 00000000..34fc6537 --- /dev/null +++ b/tests/test_apify.py @@ -0,0 +1,1050 @@ +"""Tests for the Apify tools.""" + +import json +from unittest.mock import MagicMock, patch + +import pytest + +from strands_tools import apify +from strands_tools.apify import ( + ApifyToolClient, + apify_ecommerce_scraper, + apify_get_dataset_items, + apify_google_places_scraper, + apify_google_search_scraper, + apify_run_actor, + apify_run_actor_and_get_dataset, + apify_run_task, + apify_run_task_and_get_dataset, + apify_scrape_url, + apify_website_content_crawler, + apify_youtube_scraper, +) + +MOCK_ACTOR_RUN = { + "id": "run-HG7ml5fB1hCp8YEBA", + "actId": "actor~my-scraper", + "userId": "user-abc123", + "startedAt": "2026-03-15T14:30:00.000Z", + "finishedAt": "2026-03-15T14:35:22.000Z", + "status": "SUCCEEDED", + "statusMessage": "Actor finished successfully", + "defaultDatasetId": "dataset-WkC9gct8rq1uR5vDZ", + "defaultKeyValueStoreId": "kvs-Xb3A8gct8rq1uR5vD", + "buildNumber": "1.2.3", +} + +MOCK_FAILED_RUN = { + **MOCK_ACTOR_RUN, + "status": "FAILED", + "statusMessage": "Actor failed with an error", +} + +MOCK_TIMED_OUT_RUN = { + **MOCK_ACTOR_RUN, + "status": 
"TIMED-OUT", + "statusMessage": "Actor run timed out", +} + +MOCK_DATASET_ITEMS = [ + {"url": "https://example.com/product/1", "title": "Widget A", "price": 19.99, "currency": "USD"}, + {"url": "https://example.com/product/2", "title": "Widget B", "price": 29.99, "currency": "USD"}, + {"url": "https://example.com/product/3", "title": "Widget C", "price": 39.99, "currency": "EUR"}, +] + +MOCK_SCRAPED_ITEM = { + "url": "https://example.com", + "markdown": "# Example Domain\n\nThis domain is for use in illustrative examples.", + "text": "Example Domain. This domain is for use in illustrative examples.", +} + + +def _make_apify_api_error(status_code: int, message: str) -> Exception: + """Create an ApifyApiError instance for testing without calling its real __init__.""" + from apify_client.errors import ApifyApiError + + error = ApifyApiError.__new__(ApifyApiError) + Exception.__init__(error, message) + error.status_code = status_code + error.message = message + return error + + +@pytest.fixture +def mock_apify_client(): + """Create a mock ApifyClient with pre-configured responses.""" + client = MagicMock() + + mock_actor = MagicMock() + mock_actor.call.return_value = MOCK_ACTOR_RUN + client.actor.return_value = mock_actor + + mock_task = MagicMock() + mock_task.call.return_value = MOCK_ACTOR_RUN + client.task.return_value = mock_task + + mock_dataset = MagicMock() + mock_list_result = MagicMock() + mock_list_result.items = MOCK_DATASET_ITEMS + mock_dataset.list_items.return_value = mock_list_result + client.dataset.return_value = mock_dataset + + return client + + +@pytest.fixture +def mock_apify_env(monkeypatch): + """Set required Apify environment variables.""" + monkeypatch.setenv("APIFY_API_TOKEN", "test-token-12345") + + +# --- Module import --- + + +def test_apify_module_is_importable(): + """Verify that the apify module can be imported from strands_tools.""" + assert apify is not None + assert apify.__name__ == "strands_tools.apify" + + +# --- ApifyToolClient 
--- + + +def test_client_missing_token(monkeypatch): + """ApifyToolClient raises ValueError when APIFY_API_TOKEN is not set.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + with pytest.raises(ValueError, match="APIFY_API_TOKEN"): + ApifyToolClient() + + +def test_client_uses_env_token(mock_apify_env): + """ApifyToolClient passes the env token to ApifyClient.""" + with patch("strands_tools.apify.ApifyClient") as MockClient: + ApifyToolClient() + MockClient.assert_called_once_with( + "test-token-12345", + headers={"x-apify-integration-platform": "strands-agents"}, + ) + + +# --- apify_run_actor --- + + +def test_run_actor_success(mock_apify_env, mock_apify_client): + """Successful Actor run returns structured result with run metadata.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper", run_input={"url": "https://example.com"}) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert data["status"] == "SUCCEEDED" + assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" + assert "started_at" in data + assert "finished_at" in data + mock_apify_client.actor.assert_called_once_with("actor/my-scraper") + + +def test_run_actor_default_input(mock_apify_env, mock_apify_client): + """Actor run defaults run_input to empty dict when not provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper") + + assert result["status"] == "success" + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs["run_input"] == {} + + +def test_run_actor_explicit_empty_input(mock_apify_env, mock_apify_client): + """Actor run passes through an explicitly empty dict instead of treating it as falsy.""" + empty_input: dict = {} + with patch("strands_tools.apify.ApifyClient", 
return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper", run_input=empty_input) + + assert result["status"] == "success" + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs["run_input"] is empty_input + + +def test_run_actor_with_memory(mock_apify_env, mock_apify_client): + """Actor run passes memory_mbytes when provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_run_actor(actor_id="actor/my-scraper", memory_mbytes=512) + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs["memory_mbytes"] == 512 + + +def test_run_actor_failure(mock_apify_env, mock_apify_client): + """Actor run returns error dict when Actor fails.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] + + +def test_run_actor_timeout(mock_apify_env, mock_apify_client): + """Actor run returns error dict when Actor times out.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_TIMED_OUT_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "TIMED-OUT" in result["content"][0]["text"] + + +def test_run_actor_api_exception(mock_apify_env, mock_apify_client): + """Actor run returns error dict on API exceptions.""" + mock_apify_client.actor.return_value.call.side_effect = Exception("Connection failed") + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "Connection failed" in result["content"][0]["text"] + + 
+def test_run_actor_none_response(mock_apify_env, mock_apify_client): + """Actor run returns error dict when ActorClient.call() returns None.""" + mock_apify_client.actor.return_value.call.return_value = None + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "no run data" in result["content"][0]["text"] + + +def test_run_actor_apify_api_error_401(mock_apify_env, mock_apify_client): + """Actor run returns friendly message for 401 authentication errors.""" + error = _make_apify_api_error(401, "Unauthorized") + mock_apify_client.actor.return_value.call.side_effect = error + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "Authentication failed" in result["content"][0]["text"] + + +def test_run_actor_apify_api_error_404(mock_apify_env, mock_apify_client): + """Actor run returns friendly message for 404 not-found errors.""" + error = _make_apify_api_error(404, "Actor not found") + mock_apify_client.actor.return_value.call.side_effect = error + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/nonexistent") + + assert result["status"] == "error" + assert "Resource not found" in result["content"][0]["text"] + + +# --- apify_get_dataset_items --- + + +def test_get_dataset_items_success(mock_apify_env, mock_apify_client): + """Successful dataset retrieval returns structured result with items.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_get_dataset_items(dataset_id="dataset-WkC9gct8rq1uR5vDZ") + + assert result["status"] == "success" + items = json.loads(result["content"][0]["text"]) + assert len(items) == 3 + assert items[0]["title"] == "Widget A" + assert items[2]["currency"] 
== "EUR" + mock_apify_client.dataset.assert_called_once_with("dataset-WkC9gct8rq1uR5vDZ") + + +def test_get_dataset_items_with_pagination(mock_apify_env, mock_apify_client): + """dataset retrieval passes limit and offset.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_get_dataset_items(dataset_id="dataset-xyz", limit=50, offset=10) + + mock_apify_client.dataset.return_value.list_items.assert_called_once_with(limit=50, offset=10) + + +def test_get_dataset_items_empty(mock_apify_env, mock_apify_client): + """Empty dataset returns a structured result with empty JSON array.""" + mock_list_result = MagicMock() + mock_list_result.items = [] + mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_get_dataset_items(dataset_id="dataset-empty") + + assert result["status"] == "success" + items = json.loads(result["content"][0]["text"]) + assert items == [] + + +# --- apify_run_actor_and_get_dataset --- + + +def test_run_actor_and_get_dataset_success(mock_apify_env, mock_apify_client): + """Combined run + dataset fetch returns structured result with metadata and items.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor_and_get_dataset( + actor_id="actor/my-scraper", + run_input={"url": "https://example.com"}, + dataset_items_limit=50, + ) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert data["status"] == "SUCCEEDED" + assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" + assert len(data["items"]) == 3 + assert data["items"][0]["title"] == "Widget A" + + +def test_run_actor_and_get_dataset_no_dataset_id(mock_apify_env, mock_apify_client): + """Combined tool returns error when the Actor run has no default dataset.""" + run_no_dataset = 
{**MOCK_ACTOR_RUN, "defaultDatasetId": None} + mock_apify_client.actor.return_value.call.return_value = run_no_dataset + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor_and_get_dataset(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "no default dataset" in result["content"][0]["text"] + + +def test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_client): + """Combined tool returns error dict when the Actor fails.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor_and_get_dataset(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] + + +# --- apify_run_task --- + + +def test_run_task_success(mock_apify_env, mock_apify_client): + """Successful task run returns structured result with run metadata.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task(task_id="user~my-task", task_input={"query": "test"}) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert data["status"] == "SUCCEEDED" + assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" + mock_apify_client.task.assert_called_once_with("user~my-task") + + +def test_run_task_no_input(mock_apify_env, mock_apify_client): + """task run omits task_input kwarg when not provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task(task_id="user~my-task") + + assert result["status"] == "success" + call_kwargs = mock_apify_client.task.return_value.call.call_args.kwargs + assert "task_input" not in call_kwargs + + +def test_run_task_with_memory(mock_apify_env, mock_apify_client): + """task run passes 
memory_mbytes when provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_run_task(task_id="user~my-task", memory_mbytes=1024) + + call_kwargs = mock_apify_client.task.return_value.call.call_args.kwargs + assert call_kwargs["memory_mbytes"] == 1024 + + +def test_run_task_failure(mock_apify_env, mock_apify_client): + """task run returns error dict when task fails.""" + mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task(task_id="user~my-task") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] + + +def test_run_task_none_response(mock_apify_env, mock_apify_client): + """task run returns error dict when TaskClient.call() returns None.""" + mock_apify_client.task.return_value.call.return_value = None + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task(task_id="user~my-task") + + assert result["status"] == "error" + assert "no run data" in result["content"][0]["text"] + + +def test_run_task_apify_api_error_401(mock_apify_env, mock_apify_client): + """task run returns friendly message for 401 authentication errors.""" + error = _make_apify_api_error(401, "Unauthorized") + mock_apify_client.task.return_value.call.side_effect = error + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task(task_id="user~my-task") + + assert result["status"] == "error" + assert "Authentication failed" in result["content"][0]["text"] + + +# --- apify_run_task_and_get_dataset --- + + +def test_run_task_and_get_dataset_success(mock_apify_env, mock_apify_client): + """Combined task run + dataset fetch returns structured result with metadata and items.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task_and_get_dataset( 
+ task_id="user~my-task", + task_input={"query": "test"}, + dataset_items_limit=50, + ) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert len(data["items"]) == 3 + assert data["items"][0]["title"] == "Widget A" + + +def test_run_task_and_get_dataset_no_dataset_id(mock_apify_env, mock_apify_client): + """Combined task tool returns error when the task run has no default dataset.""" + run_no_dataset = {**MOCK_ACTOR_RUN, "defaultDatasetId": None} + mock_apify_client.task.return_value.call.return_value = run_no_dataset + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task_and_get_dataset(task_id="user~my-task") + + assert result["status"] == "error" + assert "no default dataset" in result["content"][0]["text"] + + +def test_run_task_and_get_dataset_task_failure(mock_apify_env, mock_apify_client): + """Combined task tool returns error dict when the task fails.""" + mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task_and_get_dataset(task_id="user~my-task") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] + + +# --- apify_scrape_url --- + + +def test_scrape_url_success(mock_apify_env, mock_apify_client): + """Scrape URL returns structured result with markdown content.""" + mock_list_result = MagicMock() + mock_list_result.items = [MOCK_SCRAPED_ITEM] + mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "success" + assert "Example Domain" in result["content"][0]["text"] + mock_apify_client.actor.assert_called_once_with("apify/website-content-crawler") + + +def 
test_scrape_url_none_response(mock_apify_env, mock_apify_client): + """Scrape URL returns error dict when ActorClient.call() returns None.""" + mock_apify_client.actor.return_value.call.return_value = None + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "no run data" in result["content"][0]["text"] + + +def test_scrape_url_no_dataset_id(mock_apify_env, mock_apify_client): + """Scrape URL returns error when the crawler run has no default dataset.""" + run_no_dataset = {**MOCK_ACTOR_RUN, "defaultDatasetId": None} + mock_apify_client.actor.return_value.call.return_value = run_no_dataset + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "no default dataset" in result["content"][0]["text"] + + +def test_scrape_url_no_content(mock_apify_env, mock_apify_client): + """Scrape URL returns error dict when no content is returned.""" + mock_list_result = MagicMock() + mock_list_result.items = [] + mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "No content returned" in result["content"][0]["text"] + + +def test_scrape_url_crawler_failure(mock_apify_env, mock_apify_client): + """Scrape URL returns error dict when the crawler Actor fails.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] + + +def 
test_scrape_url_falls_back_to_text(mock_apify_env, mock_apify_client): + """Scrape URL falls back to text field when markdown is absent.""" + item_without_markdown = {"url": "https://example.com", "text": "Plain text content"} + mock_list_result = MagicMock() + mock_list_result.items = [item_without_markdown] + mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "success" + assert result["content"][0]["text"] == "Plain text content" + + +def test_scrape_url_invalid_url_scheme(mock_apify_env): + """apify_scrape_url returns error for invalid URL scheme.""" + result = apify_scrape_url(url="ftp://example.com") + + assert result["status"] == "error" + assert "Invalid URL scheme" in result["content"][0]["text"] + + +def test_scrape_url_missing_scheme(mock_apify_env): + """apify_scrape_url returns error for URL without http/https scheme.""" + result = apify_scrape_url(url="example.com") + + assert result["status"] == "error" + assert "Invalid URL scheme" in result["content"][0]["text"] + + +# --- Parameter validation --- + + +def test_run_actor_empty_actor_id(mock_apify_env): + """apify_run_actor returns error for whitespace-only actor_id.""" + result = apify_run_actor(actor_id=" ") + + assert result["status"] == "error" + assert "actor_id" in result["content"][0]["text"] + + +def test_run_actor_zero_timeout(mock_apify_env): + """apify_run_actor returns error for non-positive timeout_secs.""" + result = apify_run_actor(actor_id="actor/valid", timeout_secs=0) + + assert result["status"] == "error" + assert "timeout_secs" in result["content"][0]["text"] + + +def test_run_actor_negative_timeout(mock_apify_env): + """apify_run_actor returns error for negative timeout_secs.""" + result = apify_run_actor(actor_id="actor/valid", timeout_secs=-5) + + assert result["status"] == "error" 
+ assert "timeout_secs" in result["content"][0]["text"] + + +def test_run_actor_zero_memory(mock_apify_env): + """apify_run_actor returns error for non-positive memory_mbytes.""" + result = apify_run_actor(actor_id="actor/valid", memory_mbytes=0) + + assert result["status"] == "error" + assert "memory_mbytes" in result["content"][0]["text"] + + +def test_run_task_empty_task_id(mock_apify_env): + """apify_run_task returns error for whitespace-only task_id.""" + result = apify_run_task(task_id=" ") + + assert result["status"] == "error" + assert "task_id" in result["content"][0]["text"] + + +def test_run_task_zero_timeout(mock_apify_env): + """apify_run_task returns error for non-positive timeout_secs.""" + result = apify_run_task(task_id="user~my-task", timeout_secs=0) + + assert result["status"] == "error" + assert "timeout_secs" in result["content"][0]["text"] + + +def test_run_task_zero_memory(mock_apify_env): + """apify_run_task returns error for non-positive memory_mbytes.""" + result = apify_run_task(task_id="user~my-task", memory_mbytes=0) + + assert result["status"] == "error" + assert "memory_mbytes" in result["content"][0]["text"] + + +def test_get_dataset_items_empty_dataset_id(mock_apify_env): + """apify_get_dataset_items returns error for whitespace-only dataset_id.""" + result = apify_get_dataset_items(dataset_id=" ") + + assert result["status"] == "error" + assert "dataset_id" in result["content"][0]["text"] + + +def test_get_dataset_items_zero_limit(mock_apify_env): + """apify_get_dataset_items returns error for non-positive limit.""" + result = apify_get_dataset_items(dataset_id="dataset-abc", limit=0) + + assert result["status"] == "error" + assert "limit" in result["content"][0]["text"] + + +def test_get_dataset_items_negative_offset(mock_apify_env): + """apify_get_dataset_items returns error for negative offset.""" + result = apify_get_dataset_items(dataset_id="dataset-abc", offset=-1) + + assert result["status"] == "error" + assert "offset" in 
result["content"][0]["text"] + + +def test_run_actor_and_get_dataset_zero_dataset_limit(mock_apify_env): + """apify_run_actor_and_get_dataset returns error for non-positive dataset_items_limit.""" + result = apify_run_actor_and_get_dataset(actor_id="actor/valid", dataset_items_limit=0) + + assert result["status"] == "error" + assert "dataset_items_limit" in result["content"][0]["text"] + + +def test_run_actor_and_get_dataset_negative_dataset_offset(mock_apify_env): + """apify_run_actor_and_get_dataset returns error for negative dataset_items_offset.""" + result = apify_run_actor_and_get_dataset(actor_id="actor/valid", dataset_items_offset=-1) + + assert result["status"] == "error" + assert "dataset_items_offset" in result["content"][0]["text"] + + +def test_run_task_and_get_dataset_zero_dataset_limit(mock_apify_env): + """apify_run_task_and_get_dataset returns error for non-positive dataset_items_limit.""" + result = apify_run_task_and_get_dataset(task_id="user~my-task", dataset_items_limit=0) + + assert result["status"] == "error" + assert "dataset_items_limit" in result["content"][0]["text"] + + +def test_run_task_and_get_dataset_negative_dataset_offset(mock_apify_env): + """apify_run_task_and_get_dataset returns error for negative dataset_items_offset.""" + result = apify_run_task_and_get_dataset(task_id="user~my-task", dataset_items_offset=-1) + + assert result["status"] == "error" + assert "dataset_items_offset" in result["content"][0]["text"] + + +def test_scrape_url_zero_timeout(mock_apify_env): + """apify_scrape_url returns error for non-positive timeout_secs.""" + result = apify_scrape_url(url="https://example.com", timeout_secs=0) + + assert result["status"] == "error" + assert "timeout_secs" in result["content"][0]["text"] + + +def test_scrape_url_invalid_crawler_type(mock_apify_env): + """apify_scrape_url returns error for unsupported crawler_type.""" + result = apify_scrape_url(url="https://example.com", crawler_type="invalid") + + assert 
result["status"] == "error" + assert "crawler_type" in result["content"][0]["text"] + + +def test_scrape_url_missing_domain(mock_apify_env): + """apify_scrape_url returns error for URL with no domain.""" + result = apify_scrape_url(url="https://") + + assert result["status"] == "error" + assert "domain" in result["content"][0]["text"].lower() + + +# --- Dependency guard --- + + +def test_missing_apify_client_run_actor(mock_apify_env): + """apify_run_actor returns error dict when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + result = apify_run_actor(actor_id="test/actor") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] + + +def test_missing_apify_client_get_dataset(mock_apify_env): + """apify_get_dataset_items returns error dict when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + result = apify_get_dataset_items(dataset_id="dataset-123") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] + + +def test_missing_apify_client_run_and_get(mock_apify_env): + """apify_run_actor_and_get_dataset returns error dict when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + result = apify_run_actor_and_get_dataset(actor_id="test/actor") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] + + +def test_missing_apify_client_run_task(mock_apify_env): + """apify_run_task returns error dict when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + result = apify_run_task(task_id="user~my-task") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] + + +def test_missing_apify_client_run_task_and_get(mock_apify_env): + """apify_run_task_and_get_dataset returns error dict when apify-client is not installed.""" + with 
patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + result = apify_run_task_and_get_dataset(task_id="user~my-task") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] + + +def test_missing_apify_client_scrape_url(mock_apify_env): + """apify_scrape_url returns error dict when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] + + +# --- Missing token from tool entry points --- + + +def test_run_actor_missing_token(monkeypatch): + """apify_run_actor returns error dict when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_run_actor(actor_id="test/actor") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + +def test_get_dataset_items_missing_token(monkeypatch): + """apify_get_dataset_items returns error dict when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_get_dataset_items(dataset_id="dataset-123") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + +def test_run_actor_and_get_dataset_missing_token(monkeypatch): + """apify_run_actor_and_get_dataset returns error dict when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_run_actor_and_get_dataset(actor_id="test/actor") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + +def test_run_task_missing_token(monkeypatch): + """apify_run_task returns error dict when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_run_task(task_id="user~my-task") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in 
result["content"][0]["text"] + + +def test_run_task_and_get_dataset_missing_token(monkeypatch): + """apify_run_task_and_get_dataset returns error dict when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_run_task_and_get_dataset(task_id="user~my-task") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + +def test_scrape_url_missing_token(monkeypatch): + """apify_scrape_url returns error dict when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + +# --- apify_google_search_scraper --- + + +def test_google_search_scraper_success(mock_apify_env, mock_apify_client): + """Google Search Scraper returns structured results with correct input mapping.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_google_search_scraper(search_query="best AI frameworks", results_limit=5) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert len(data["items"]) == 3 + + mock_apify_client.actor.assert_called_once_with("apify/google-search-scraper") + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["queries"] == "best AI frameworks" + assert run_input["maxPagesPerQuery"] == 1 + assert "resultsPerPage" not in run_input + + +def test_google_search_scraper_multi_page(mock_apify_env, mock_apify_client): + """Google Search Scraper calculates correct page count when results_limit exceeds 10.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_google_search_scraper(search_query="AI", results_limit=25) + + run_input = 
mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["maxPagesPerQuery"] == 3 + assert "resultsPerPage" not in run_input + + +def test_google_search_scraper_optional_params(mock_apify_env, mock_apify_client): + """Google Search Scraper includes optional country and language codes when provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_google_search_scraper(search_query="AI", results_limit=10, country_code="de", language_code="de") + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["countryCode"] == "de" + assert run_input["languageCode"] == "de" + + +def test_google_search_scraper_optional_params_omitted(mock_apify_env, mock_apify_client): + """Google Search Scraper omits optional fields when not provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_google_search_scraper(search_query="AI") + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert "countryCode" not in run_input + assert "languageCode" not in run_input + + +def test_google_search_scraper_missing_dependency(mock_apify_env): + """Google Search Scraper returns error when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + result = apify_google_search_scraper(search_query="test") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] + + +def test_google_search_scraper_missing_token(monkeypatch): + """Google Search Scraper returns error when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_google_search_scraper(search_query="test") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + +def test_google_search_scraper_actor_failure(mock_apify_env, mock_apify_client): + """Google Search Scraper 
returns error when Actor fails.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_google_search_scraper(search_query="test") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] + + +# --- apify_google_places_scraper --- + + +def test_google_places_scraper_success(mock_apify_env, mock_apify_client): + """Google Places Scraper returns structured results with correct input mapping.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_google_places_scraper(search_query="restaurants in Prague", results_limit=10) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + + mock_apify_client.actor.assert_called_once_with("compass/crawler-google-places") + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["searchStringsArray"] == ["restaurants in Prague"] + assert run_input["maxCrawledPlacesPerSearch"] == 10 + assert run_input["maxReviews"] == 0 + + +def test_google_places_scraper_with_reviews(mock_apify_env, mock_apify_client): + """Google Places Scraper sets maxReviews when include_reviews is True.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_google_places_scraper(search_query="hotels in Berlin", include_reviews=True, max_reviews=10) + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["maxReviews"] == 10 + + +def test_google_places_scraper_reviews_disabled(mock_apify_env, mock_apify_client): + """Google Places Scraper sets maxReviews to 0 when include_reviews is False.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_google_places_scraper(search_query="cafes", include_reviews=False, 
max_reviews=10) + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["maxReviews"] == 0 + + +def test_google_places_scraper_optional_language(mock_apify_env, mock_apify_client): + """Google Places Scraper includes language when provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_google_places_scraper(search_query="cafes", language="de") + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["language"] == "de" + + +# --- apify_youtube_scraper --- + + +def test_youtube_scraper_search_query(mock_apify_env, mock_apify_client): + """YouTube Scraper returns results when given a search query.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_youtube_scraper(search_query="python tutorial", results_limit=5) + + assert result["status"] == "success" + mock_apify_client.actor.assert_called_once_with("streamers/youtube-scraper") + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["searchQueries"] == ["python tutorial"] + assert run_input["maxResults"] == 5 + assert "startUrls" not in run_input + + +def test_youtube_scraper_urls(mock_apify_env, mock_apify_client): + """YouTube Scraper returns results when given specific URLs.""" + urls = ["https://www.youtube.com/watch?v=abc123"] + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_youtube_scraper(urls=urls) + + assert result["status"] == "success" + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["startUrls"] == [{"url": "https://www.youtube.com/watch?v=abc123"}] + assert "searchQueries" not in run_input + + +def test_youtube_scraper_both_query_and_urls(mock_apify_env, mock_apify_client): + """YouTube Scraper accepts both search_query and urls simultaneously.""" + urls = 
["https://www.youtube.com/watch?v=abc123"] + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_youtube_scraper(search_query="python", urls=urls) + + assert result["status"] == "success" + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["searchQueries"] == ["python"] + assert run_input["startUrls"] == [{"url": "https://www.youtube.com/watch?v=abc123"}] + + +def test_youtube_scraper_no_input(mock_apify_env): + """YouTube Scraper returns error when neither search_query nor urls is provided.""" + result = apify_youtube_scraper() + + assert result["status"] == "error" + assert "search_query" in result["content"][0]["text"] + + +# --- apify_website_content_crawler --- + + +def test_website_content_crawler_success(mock_apify_env, mock_apify_client): + """Website Content Crawler returns results with correct input mapping.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_website_content_crawler(start_url="https://docs.example.com", max_pages=5, max_depth=3) + + assert result["status"] == "success" + mock_apify_client.actor.assert_called_once_with("apify/website-content-crawler") + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["startUrls"] == [{"url": "https://docs.example.com"}] + assert run_input["maxCrawlPages"] == 5 + assert run_input["maxCrawlDepth"] == 3 + assert run_input["proxyConfiguration"] == {"useApifyProxy": True} + + +def test_website_content_crawler_defaults(mock_apify_env, mock_apify_client): + """Website Content Crawler uses correct defaults for max_pages and max_depth.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_website_content_crawler(start_url="https://example.com") + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["maxCrawlPages"] == 10 + assert 
run_input["maxCrawlDepth"] == 2 + + +def test_website_content_crawler_invalid_url(mock_apify_env): + """Website Content Crawler returns error for invalid URL.""" + result = apify_website_content_crawler(start_url="not-a-url") + + assert result["status"] == "error" + assert "Invalid URL" in result["content"][0]["text"] + + +# --- apify_ecommerce_scraper --- + + +def test_ecommerce_scraper_success(mock_apify_env, mock_apify_client): + """E-commerce Scraper returns results with correct input mapping for product URL.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_ecommerce_scraper(url="https://www.amazon.com/dp/B0TEST", results_limit=10) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + + mock_apify_client.actor.assert_called_once_with("apify/e-commerce-scraping-tool") + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["detailsUrls"] == [{"url": "https://www.amazon.com/dp/B0TEST"}] + assert "listingUrls" not in run_input + assert run_input["maxProductResults"] == 10 + + +def test_ecommerce_scraper_listing_url(mock_apify_env, mock_apify_client): + """E-commerce Scraper uses listingUrls when url_type is 'listing'.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_ecommerce_scraper( + url="https://www.amazon.com/s?k=headphones", url_type="listing", results_limit=10 + ) + + assert result["status"] == "success" + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["listingUrls"] == [{"url": "https://www.amazon.com/s?k=headphones"}] + assert "detailsUrls" not in run_input + + +def test_ecommerce_scraper_invalid_url_type(mock_apify_env): + """E-commerce Scraper returns error for invalid url_type.""" + result = apify_ecommerce_scraper(url="https://www.amazon.com/dp/B0TEST", 
url_type="invalid") + + assert result["status"] == "error" + assert "url_type" in result["content"][0]["text"] + + +def test_ecommerce_scraper_invalid_url(mock_apify_env): + """E-commerce Scraper returns error for invalid URL.""" + result = apify_ecommerce_scraper(url="not-a-url") + + assert result["status"] == "error" + assert "Invalid URL" in result["content"][0]["text"] + + +def test_ecommerce_scraper_actor_failure(mock_apify_env, mock_apify_client): + """E-commerce Scraper returns error when Actor fails.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_ecommerce_scraper(url="https://www.amazon.com/dp/B0TEST") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"]