sample: E-commerce Product Scraper Agent

cosmin-staicu · cosmin-staicu · commit bc83bd1e49fd · 2026-03-05T11:09:32.000+02:00
diff --git a/samples/ecommerce-scraper-agent/agent.mermaid b/samples/ecommerce-scraper-agent/agent.mermaid
@@ -0,0 +1,11 @@
+flowchart TB
+  __start__(__start__)
+  coordinator(coordinator)
+  scraper(scraper)
+  finalize(finalize)
+  __end__(__end__)
+  __start__ --> coordinator
+  coordinator --> finalize
+  coordinator --> scraper
+  scraper --> coordinator
+  finalize --> __end__
diff --git a/samples/ecommerce-scraper-agent/bindings.json b/samples/ecommerce-scraper-agent/bindings.json
@@ -0,0 +1,4 @@
+{
+    "version": "2.0",
+    "resources": []
+}
diff --git a/samples/ecommerce-scraper-agent/evaluations/eval-sets/edge-cases.json b/samples/ecommerce-scraper-agent/evaluations/eval-sets/edge-cases.json
@@ -0,0 +1,57 @@
+{
+  "version": "1.0",
+  "id": "edge-cases",
+  "name": "Edge Cases and Error Scenarios",
+  "description": "Tests for unusual inputs, single pages, and boundary conditions",
+  "evaluatorRefs": ["JsonSimilarityEvaluator", "LLMJudgeOutputEvaluator", "TrajectoryEvaluator"],
+  "evaluations": [
+    {
+      "id": "test-1-single-product-page",
+      "name": "Starting from a single product page extracts that product",
+      "inputs": {
+        "start_url": "https://sandbox.oxylabs.io/products/1"
+      },
+      "evaluationCriterias": {
+        "LLMJudgeOutputEvaluator": {
+          "expectedOutput": {
+            "products": "Should contain at least one product with name, price, currency, and url fields. Each product URL should be https://sandbox.oxylabs.io/products/1 or the URL of a related product discovered from that page.",
+            "total_products": "At least 1 product should be found"
+          }
+        },
+        "TrajectoryEvaluator": {
+          "expectedAgentBehavior": "The agent should: 1) Start with the single product URL. 2) Fetch the page and classify it as a 'product' page via LLM analysis. 3) Extract product data fields (name, price, description, etc.) using CSS selectors from the strategy. 4) Discover any related product links or navigation links on the product page. 5) Optionally follow discovered links to scrape more products. 6) Finalize with at least one product."
+        }
+      }
+    },
+    {
+      "id": "test-2-last-pagination-page",
+      "name": "Scraping the last page of pagination works correctly",
+      "inputs": {
+        "start_url": "https://sandbox.oxylabs.io/products?page=94"
+      },
+      "evaluationCriterias": {
+        "LLMJudgeOutputEvaluator": {
+          "expectedOutput": {
+            "products": "Should contain products from the last page. Each product should have valid name, price, and url fields.",
+            "total_products": "Should be a positive number, likely around 20-32 products from the last page and any discovered linked pages"
+          }
+        }
+      }
+    },
+    {
+      "id": "test-3-nonexistent-page",
+      "name": "Handling a page that returns no products",
+      "inputs": {
+        "start_url": "https://sandbox.oxylabs.io/products?page=9999"
+      },
+      "evaluationCriterias": {
+        "JsonSimilarityEvaluator": {
+          "expectedOutput": {
+            "total_products": 0,
+            "urls_scraped": 0
+          }
+        }
+      }
+    }
+  ]
+}
diff --git a/samples/ecommerce-scraper-agent/evaluations/eval-sets/happy-path.json b/samples/ecommerce-scraper-agent/evaluations/eval-sets/happy-path.json
@@ -0,0 +1,39 @@
+{
+  "version": "1.0",
+  "id": "happy-path",
+  "name": "Happy Path Scenarios",
+  "description": "Tests for normal scraping operations with the oxylabs sandbox site",
+  "evaluatorRefs": ["JsonSimilarityEvaluator", "TrajectoryEvaluator"],
+  "evaluations": [
+    {
+      "id": "test-1-default-url-products-found",
+      "name": "Scrape default URL returns products with expected fields",
+      "inputs": {
+        "start_url": "https://sandbox.oxylabs.io/products"
+      },
+      "evaluationCriterias": {
+        "JsonSimilarityEvaluator": {
+          "expectedOutput": {
+            "total_products": 2993,
+            "urls_scraped": 3301
+          }
+        },
+        "TrajectoryEvaluator": {
+          "expectedAgentBehavior": "The agent should: 1) Start at the coordinator node which seeds the start URL. 2) Dispatch URLs to parallel scraper sub-agents. 3) Scrapers fetch pages, call the LLM to analyze page type (listing vs product), and extract product links from listing pages and product data from product pages. 4) Return discovered URLs to coordinator for further rounds. 5) Continue until no new URLs remain. 6) Finalize by deduplicating products and resolving currency symbols. The agent should scrape approximately 3000 products across multiple rounds of coordinator-scraper cycles."
+        }
+      }
+    },
+    {
+      "id": "test-2-category-page",
+      "name": "Scrape a specific category page discovers products",
+      "inputs": {
+        "start_url": "https://sandbox.oxylabs.io/products/category/nintendo"
+      },
+      "evaluationCriterias": {
+        "TrajectoryEvaluator": {
+          "expectedAgentBehavior": "The agent should: 1) Start at the coordinator with the Nintendo category URL. 2) Dispatch to scrapers which classify the page as a listing page. 3) Extract product links and pagination links from the category listing. 4) Visit individual product pages to extract product data (name, price, description, etc.). 5) Follow pagination to discover all products in the category. 6) Finalize with deduplicated products, all having currency resolved to ISO codes. The total number of products should be lower than for the full site (roughly 1000 or fewer for a single category)."
+        }
+      }
+    }
+  ]
+}
diff --git a/samples/ecommerce-scraper-agent/evaluations/eval-sets/output-structure.json b/samples/ecommerce-scraper-agent/evaluations/eval-sets/output-structure.json
@@ -0,0 +1,54 @@
+{
+  "version": "1.0",
+  "id": "output-structure",
+  "name": "Output Structure Validation",
+  "description": "Validates that scraped products have the expected fields and data quality",
+  "evaluatorRefs": ["LLMJudgeOutputEvaluator"],
+  "evaluations": [
+    {
+      "id": "test-1-product-fields-present",
+      "name": "Products contain required fields (name, price, url, currency)",
+      "inputs": {
+        "start_url": "https://sandbox.oxylabs.io/products?page=1"
+      },
+      "evaluationCriterias": {
+        "LLMJudgeOutputEvaluator": {
+          "expectedOutput": {
+            "products": "A list of product objects where each product has at minimum: 'url' (a valid URL to the product page), 'name' (non-empty product name), 'price' (a numeric price value), and 'currency' (ISO 4217 code like EUR). Products may also have description, availability, developer, platform, and type fields.",
+            "total_products": "A positive integer greater than 0 representing the number of unique products scraped",
+            "urls_scraped": "A positive integer representing the total number of URLs visited during scraping"
+          }
+        }
+      }
+    },
+    {
+      "id": "test-2-currency-resolved",
+      "name": "Currency symbols are resolved to ISO 4217 codes",
+      "inputs": {
+        "start_url": "https://sandbox.oxylabs.io/products?page=2"
+      },
+      "evaluationCriterias": {
+        "LLMJudgeOutputEvaluator": {
+          "expectedOutput": {
+            "products": "Products should have a 'currency' field containing a valid ISO 4217 three-letter code (e.g. 'EUR', 'USD', 'GBP') rather than a raw currency symbol (e.g. not '€' or '$'). The price field should be a numeric string without currency symbols."
+          }
+        }
+      }
+    },
+    {
+      "id": "test-3-no-duplicate-products",
+      "name": "No duplicate products in output",
+      "inputs": {
+        "start_url": "https://sandbox.oxylabs.io/products?page=3"
+      },
+      "evaluationCriterias": {
+        "LLMJudgeOutputEvaluator": {
+          "expectedOutput": {
+            "products": "Each product in the list should have a unique URL. There should be no two products with the same 'url' field. The total_products count should match the length of the products list.",
+            "total_products": "Should exactly match the number of items in the products array"
+          }
+        }
+      }
+    }
+  ]
+}
diff --git a/samples/ecommerce-scraper-agent/evaluations/evaluators/json-similarity.json b/samples/ecommerce-scraper-agent/evaluations/evaluators/json-similarity.json
@@ -0,0 +1,9 @@
+{
+  "version": "1.0",
+  "id": "JsonSimilarityEvaluator",
+  "evaluatorTypeId": "uipath-json-similarity",
+  "evaluatorConfig": {
+    "name": "JsonSimilarityEvaluator",
+    "targetOutputKey": "*"
+  }
+}
diff --git a/samples/ecommerce-scraper-agent/evaluations/evaluators/llm-judge-output.json b/samples/ecommerce-scraper-agent/evaluations/evaluators/llm-judge-output.json
@@ -0,0 +1,10 @@
+{
+  "version": "1.0",
+  "id": "LLMJudgeOutputEvaluator",
+  "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity",
+  "evaluatorConfig": {
+    "name": "LLMJudgeOutputEvaluator",
+    "model": "gpt-4o-2024-11-20",
+    "temperature": 0.0
+  }
+}
diff --git a/samples/ecommerce-scraper-agent/evaluations/evaluators/trajectory.json b/samples/ecommerce-scraper-agent/evaluations/evaluators/trajectory.json
@@ -0,0 +1,10 @@
+{
+  "version": "1.0",
+  "id": "TrajectoryEvaluator",
+  "evaluatorTypeId": "uipath-llm-judge-trajectory-similarity",
+  "evaluatorConfig": {
+    "name": "TrajectoryEvaluator",
+    "model": "gpt-4o-2024-11-20",
+    "temperature": 0.0
+  }
+}
diff --git a/samples/ecommerce-scraper-agent/langgraph.json b/samples/ecommerce-scraper-agent/langgraph.json
@@ -0,0 +1,5 @@
+{
+    "graphs": {
+        "agent": "./main.py:graph"
+    }
+}
diff --git a/samples/ecommerce-scraper-agent/main.py b/samples/ecommerce-scraper-agent/main.py
diff --git a/samples/ecommerce-scraper-agent/pyproject.toml b/samples/ecommerce-scraper-agent/pyproject.toml
diff --git a/samples/ecommerce-scraper-agent/uipath.json b/samples/ecommerce-scraper-agent/uipath.json

@@ -0,0 +1,4 @@
+{
+    "version": "2.0",
+    "resources": []
+}