UiPath · bai-uipath · May 4, 2026 · Apr 30, 2026 · Apr 30, 2026 · Apr 30, 2026
@@ -252,6 +252,7 @@ make test-uipath-maestro-flow  # run all tests for a specific skill
 3. Use minimal prompts — the goal is to test the skill's guidance quality, not hand-hold the agent
 4. Tag every task appropriately: `smoke`, `integration`, or `e2e`
 5. Follow the task ID pattern: `skill-<domain>-<capability>`
+6. **Do not score self-reports.** Don't ask the agent to write a `report.json` / `recommendation.json` summary and then have `success_criteria` read that file back — the agent can write any value. Score real artifacts (`.flow` content, generated CSVs), real operations (`run_command` re-executes a validation), or behavior signals (`command_executed`, `command_not_executed`, `skill_triggered`).
 
 See `tests/README.md` for the full task YAML template, success criteria reference, and examples from existing tests.
 

@@ -199,7 +199,7 @@ description: >
   Skill-guided evaluation: agent uses the uipath-maestro-flow skill to create
   a new UiPath Flow project inside a solution and validate it. Tests whether
   the skill teaches the correct solution-first workflow and CLI usage.
-tags: [uipath-maestro-flow, smoke, init, validate]
+tags: [uipath-maestro-flow, smoke, lifecycle:generate]
 
 sandbox:
   driver: tempdir
@@ -209,12 +209,17 @@ initial_prompt: |
   Create a new UiPath Flow project called "WeatherAlert" and make sure it
   validates successfully.
 
-  Save a summary of what you did to report.json with at minimum:
-    {
-      "project_name": "WeatherAlert",
-      "commands_used": ["<list of uip commands you ran>"],
-      "validation_passed": true
-    }
+  Use the `uipath-maestro-flow` skill workflow. A Flow project MUST be created
+  inside a solution:
+  1. Create the solution first.
+  2. Create the Flow project inside that solution.
+  3. Link the project to the solution.
+
+  The correct flow-file path is:
+    WeatherAlert/WeatherAlert/WeatherAlert.flow
+
+  The task is NOT complete until `uip maestro flow validate` has passed for
+  that exact file path.
 
   Important:
   - The `uip` CLI is already available in the environment.
@@ -224,39 +229,39 @@ success_criteria:
   - type: command_executed
     description: "Agent created a solution with uip solution new"
     tool_name: "Bash"
-    command_pattern: 'uip\s+solution\s+new'
+    command_pattern: '(uip|\$UIP)\s+solution\s+new'
     min_count: 1
     weight: 1.5
     pass_threshold: 1.0
 
   - type: command_executed
     description: "Agent initialized a Flow project with uip maestro flow init"
     tool_name: "Bash"
-    command_pattern: 'uip\s+(maestro\s+)?flow\s+init'
+    command_pattern: '(uip|\$UIP)\s+(maestro\s+)?flow\s+init'
     min_count: 1
     weight: 1.5
     pass_threshold: 1.0
 
   - type: command_executed
     description: "Agent validated the .flow file"
     tool_name: "Bash"
-    command_pattern: 'uip\s+(maestro\s+)?flow\s+validate'
+    command_pattern: '(uip|\$UIP)\s+(maestro\s+)?flow\s+validate'
     min_count: 1
     weight: 1.5
     pass_threshold: 1.0
 
   - type: command_executed
     description: "Agent used --output json on uip commands"
     tool_name: "Bash"
-    command_pattern: 'uip\s+.*--output\s+json'
+    command_pattern: '(uip|\$UIP)\s+.*--output\s+json'
     min_count: 1
     weight: 1.0
     pass_threshold: 1.0
 
   - type: command_executed
     description: "Agent linked flow project to solution"
     tool_name: "Bash"
-    command_pattern: 'uip\s+solution\s+project\s+add'
+    command_pattern: '(uip|\$UIP)\s+solution\s+project\s+add'
     min_count: 1
     weight: 1.0
     pass_threshold: 1.0
@@ -266,33 +271,15 @@ success_criteria:
     path: "WeatherAlert/WeatherAlert/WeatherAlert.flow"
     weight: 1.5
     pass_threshold: 1.0
-
-  - type: json_check
-    description: "report.json has correct structure and values"
-    path: "report.json"
-    assertions:
-      - expression: "project_name"
-        operator: equals
-        expected: "WeatherAlert"
-      - expression: "validation_passed"
-        operator: equals
-        expected: true
-      - expression: "length(commands_used)"
-        operator: gte
-        expected: 3
-    weight: 2.0
-    pass_threshold: 0.75
 ```
 
 Key patterns to note:
 - **No `agent:` block** — inherits everything from `experiments/default.yaml`
 - **No `max_iterations` or `llm_reviewer`** — inherited from the experiment config
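-- **Minimal prompt** — describes the goal ("create and validate"), not the steps
+- **Goal plus explicit constraints** — the prompt states the goal ("create and validate"), the solution-first order, and the exact `.flow` path that must pass validation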
-- **Multiple criteria types** — `command_executed`, `file_exists`, `json_check` cover different aspects
+- **No self-report criteria** — `command_executed` checks the commands the agent actually ran and `file_exists` checks a real artifact; neither reads back an agent-written summary
 - **Weighted scoring** — core commands (`weight: 1.5`) matter more than supporting checks (`weight: 1.0`)
 
-For another example using `file_contains` and `run_command` criteria, see `tasks/uipath-maestro-flow/smoke/registry_discovery.yaml`. That test also demonstrates overriding a single field (`agent: max_turns: 14`) from the experiment defaults.
-
 ## Success Criteria Reference
 
 Each task defines one or more success criteria. The agent's score is the weighted sum of passing criteria.
@@ -325,70 +312,89 @@ Verify a file was created in the sandbox. From `init_validate.yaml`:
 
 ### `file_contains`
 
-Verify a file contains expected strings. From `registry_discovery.yaml`:
+Verify a file contains (or excludes) expected strings. From `uipath-maestro-flow/hitl/smoke_01_hitl_node_placed.yaml`:
 
 ```yaml
 - type: file_contains
-  description: "Report contains expected fields"
-  path: "registry_report.json"
+  description: "Flow contains the inline HITL node type"
+  path: "InvoiceApproval/InvoiceApproval/InvoiceApproval.flow"
   includes:
-    - "node_types_found"
-    - "commands_used"
-    - "http_node_type"
-    - "script_node_type"
-  weight: 1.5
+    - '"uipath.human-in-the-loop"'
+  weight: 3.0
   pass_threshold: 1.0
 ```
 
+`excludes:` is also supported — useful for asserting a file does not contain a deprecated flag or forbidden value.
+
 ### `json_check`
 
-Validate JSON file structure and values using JSONPath assertions. From `init_validate.yaml`:
+Validate JSON file structure and values using JMESPath assertions. Supported operators: `equals`, `gte`, `lte`, `gt`, `lt`, `contains`.
+
+### `run_command`
+
+Execute an arbitrary shell command and check the exit code. Use it for direct verification of state the agent created. From `uipath-data-fabric/integration_csv_import.yaml`:
 
 ```yaml
-- type: json_check
-  description: "report.json has correct structure and values"
-  path: "report.json"
-  assertions:
-    - expression: "project_name"
-      operator: equals
-      expected: "WeatherAlert"
-    - expression: "validation_passed"
-      operator: equals
-      expected: true
-    - expression: "length(commands_used)"
-      operator: gte
-      expected: 3
+- type: run_command
+  description: "inventory.csv has at least 4 data rows (header + 4)"
+  command: "awk 'END { exit (NR >= 5 ? 0 : 1) }' inventory.csv"
+  timeout: 5
+  expected_exit_code: 0
   weight: 2.0
-  pass_threshold: 0.75   # at least 75% of assertions must pass
+  pass_threshold: 1.0
 ```
 
-Supported operators: `equals`, `gte`, `lte`, `gt`, `lt`, `contains`.
-
-### `run_command`
-
-Execute an arbitrary shell command and check the exit code. From `registry_discovery.yaml`:
+Or byte-equality for upload/download round-trips:
 
 ```yaml
 - type: run_command
-  description: "registry_report.json is valid JSON"
-  command: "python -c \"import json; json.load(open('registry_report.json'))\""
-  timeout: 10
+  description: "Downloaded file is byte-identical to the original"
+  command: "cmp -s original.txt downloaded.txt"
+  timeout: 5
   expected_exit_code: 0
-  weight: 1.0
+```
+
+### `skill_triggered`
+
+Verify the agent invoked a Claude Code Skill tool. Useful for "did the agent recognize this scenario calls for skill X?" Supports positive (`expected: "yes"`) and negative (`expected: "no"`) assertions:
+
+```yaml
+- type: skill_triggered
+  description: "Agent invoked the uipath-human-in-the-loop skill"
+  skill_name: "uipath-human-in-the-loop"
+  expected: "yes"
+  weight: 3.0
   pass_threshold: 1.0
 ```
 
+Un-fakeable — the criterion inspects `turn_records.commands` directly. The negative form (`expected: "no"`) is the right primitive for smoke tests where the agent should NOT trigger a particular skill.
+
+### `command_not_executed`
+
+Counterpart to `command_executed`. Verifies the agent did NOT run a prohibited command. Use for refusal / negative-guard tests:
+
+```yaml
+- type: command_not_executed
+  description: "Agent must not delete an entity"
+  tool_name: "Bash"
+  command_pattern: '(uip|\$UIP)\s+df\s+entities\s+delete'
+  weight: 3.0
+  pass_threshold: 1.0
+```
+
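+Score is binary: 1.0 when the number of matches is ≤ `max_count` (default `0`), otherwise 0.0. An empty `turn_records` passes trivially, so pair this criterion with at least one positive check — otherwise an agent that does nothing scores full marks on it.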
+
 ## Weight and Threshold Guidance
 
 **`weight`** controls how much a criterion contributes to the overall score. Use higher weights for the core behavior being tested:
 
 | Weight | When to use | Example from existing tests |
 |--------|-------------|---------------------------|
-| `1.0` | Supporting checks | `--output json` flag used, file is valid JSON |
+| `1.0` | Supporting checks | `--output json` flag used, presence of an auxiliary file |
 | `1.5` | Core behavior | `uip solution new` executed, `.flow` file created |
-| `2.0` | Critical validation | `report.json` has correct structure and values |
+| `2.0` | Important artifact content | `.flow` file contains the expected node type or handle wiring |
 | `3.0` | Primary artifact validity | `uip maestro flow validate` passes on the generated flow file |
-| `5.0–6.0` | End-to-end execution | Check script runs flow debug and verifies output correctness |
+| `5.0–6.0` | End-to-end execution | Check script runs `flow debug` and verifies output correctness |
 
 **`pass_threshold`** is the fraction of the criterion that must pass. For `json_check` with multiple assertions, `0.75` means 75% of assertions must pass. For most criteria, use `1.0` (all-or-nothing).
 

diff --git a/tests/tasks/uipath-data-fabric/e2e_employee_directory.yaml b/tests/tasks/uipath-data-fabric/e2e_employee_directory.yaml
@@ -41,23 +41,6 @@ initial_prompt: |
   2. How many employees are in each department? Use one filterGroup query
      per department to get the count.
 
-  Save a summary to report.json:
-  {
-    "entity_id": "<EmployeeDirectory entity ID>",
-    "total_employees": 20,
-    "schema_evolved": true,
-    "performance_scores_set": 6,
-    "top_performers": [
-      {"name": "<name>", "score": <score>, "department": "<dept>"}
-    ],
-    "department_counts": {
-      "Engineering": <count>,
-      "Sales": <count>,
-      "Finance": <count>,
-      "HR": <count>
-    }
-  }
-
   Do NOT delete the entity.
 
 success_criteria:
@@ -139,31 +122,14 @@ success_criteria:
     weight: 1.0
     pass_threshold: 1.0
 
-  - type: file_exists
-    description: "report.json was created"
-    path: "report.json"
-    weight: 1.0
+  - type: run_command
+    description: "employees.csv has 20 data rows (header + 20)"
+    command: "awk 'END { exit (NR == 21 ? 0 : 1) }' employees.csv"
+    timeout: 5
+    expected_exit_code: 0
+    weight: 2.0
     pass_threshold: 1.0
 
-  - type: json_check
-    description: "20 employees imported, schema evolved, 6 scores set, all 4 depts counted"
-    path: "report.json"
-    assertions:
-      - expression: "total_employees"
-        operator: equals
-        expected: 20
-      - expression: "schema_evolved"
-        operator: equals
-        expected: true
-      - expression: "performance_scores_set"
-        operator: equals
-        expected: 6
-      - expression: "length(top_performers)"
-        operator: gte
-        expected: 1
-    weight: 5.0
-    pass_threshold: 0.75
-
 post_run:
   - command: "python3 $SKILLS_REPO_PATH/tests/tasks/uipath-data-fabric/_shared/cleanup_entities.py"
     timeout: 60
diff --git a/tests/tasks/uipath-data-fabric/e2e_product_catalogue.yaml b/tests/tasks/uipath-data-fabric/e2e_product_catalogue.yaml
@@ -25,7 +25,7 @@ initial_prompt: |
   Price (DECIMAL), InStock (BOOLEAN), SKU (STRING), Poster (FILE). Create the
   entity with all six fields before importing. Seed it with 15 products
   (5 per category: Electronics / Clothing / Home) by generating and importing
-  a CSV (CSV does not include the Poster column). Then:
+  a CSV named `products.csv` (CSV does not include the Poster column). Then:
 
   - Query all Electronics products and confirm you get exactly 5.
   - Apply a 10% price increase to the 2 most expensive products across all
@@ -37,18 +37,6 @@ initial_prompt: |
   - Confirm the final state: total products, how many are in stock, how
     many are Electronics.
 
-  Save a summary to report.json:
-  {
-    "entity_id": "<ProductCatalogue entity ID>",
-    "total_products": <count>,
-    "electronics_count": <count from query>,
-    "price_updated_count": 2,
-    "out_of_stock_count": 3,
-    "in_stock_count": <total_products minus out_of_stock_count>,
-    "image_uploaded_to_sku": "<SKU of the product the image was uploaded to>",
-    "most_expensive_product_name": "<name>"
-  }
-
   Do NOT delete the entity.
 
 success_criteria:
@@ -117,29 +105,24 @@ success_criteria:
     pass_threshold: 1.0
 
   - type: file_exists
-    description: "report.json was created"
-    path: "report.json"
+    description: "products.csv was generated"
+    path: "products.csv"
     weight: 1.0
     pass_threshold: 1.0
 
-  - type: json_check
-    description: "15 products, 5 Electronics, 2 price updates, 3 out of stock"
-    path: "report.json"
-    assertions:
-      - expression: "total_products"
-        operator: equals
-        expected: 15
-      - expression: "electronics_count"
-        operator: equals
-        expected: 5
-      - expression: "price_updated_count"
-        operator: equals
-        expected: 2
-      - expression: "out_of_stock_count"
-        operator: equals
-        expected: 3
-    weight: 5.0
-    pass_threshold: 0.75
+  - type: run_command
+    description: "products.csv has 15 data rows (header + 15)"
+    command: "awk 'END { exit (NR == 16 ? 0 : 1) }' products.csv"
+    timeout: 5
+    expected_exit_code: 0
+    weight: 2.0
+    pass_threshold: 1.0
+
+  - type: file_exists
+    description: "product_hero.jpg placeholder image was created"
+    path: "product_hero.jpg"
+    weight: 1.0
+    pass_threshold: 1.0
 
 post_run:
   - command: "python3 $SKILLS_REPO_PATH/tests/tasks/uipath-data-fabric/_shared/cleanup_entities.py"