UiPath
diff --git a/‎tests/tasks/uipath-audit/audit_events_basic_smoke.yaml‎
Lines changed: 101 additions & 0 deletions b/‎tests/tasks/uipath-audit/audit_events_basic_smoke.yaml‎
Lines changed: 101 additions & 0 deletions
diff --git a/‎tests/tasks/uipath-audit/audit_export_basic_smoke.yaml‎
Lines changed: 89 additions & 0 deletions b/‎tests/tasks/uipath-audit/audit_export_basic_smoke.yaml‎
Lines changed: 89 additions & 0 deletions
diff --git a/‎tests/tasks/uipath-audit/audit_login_history_e2e.yaml‎
Lines changed: 125 additions & 0 deletions b/‎tests/tasks/uipath-audit/audit_login_history_e2e.yaml‎
Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,101 @@
+task_id: skill-audit-events-basic-smoke  # identifier for this evaluation task
+description: >  # folded scalar: single line breaks fold to spaces; the blank line splits two paragraphs
+  Skill-guided evaluation: agent uses the uipath-audit skill to query
+  audit events with a small bounded time window and a small --limit
+  that does NOT trigger client-side pagination. Validates the simpler
+  events form most users will write — `--from-date`/`--to-date` ISO 8601
+  bounds, `--limit <= 200`, `--output json`.
+
+  Platform note: runs without an authenticated tenant — commands will
+  fail with auth errors. That is acceptable; what matters is correct
+  command invocation with correct flags.
+tags: [uipath-audit, smoke, events]  # labels; presumably used to select or group tasks — confirm
+
+sandbox:
+  driver: tempdir  # presumably a throwaway working directory — confirm against the harness
+  python: {}  # empty mapping: no Python-specific options are set here
+
+initial_prompt: |  # literal scalar: line breaks and relative indentation below are preserved
+  A reviewer wants the most recent 25 tenant audit events from the
+  last 24 hours. Use the uipath-audit skill.
+
+  Save a summary to report.json in the current working directory with
+  at minimum:
+    {
+      "events_command": "<exact uip admin audit tenant events command you ran>",
+      "from": "<the --from-date value>",
+      "to":   "<the --to-date value>",
+      "limit": <the --limit value, as a number>,
+      "commands_used": ["<list of uip commands you attempted>"]
+    }
+
+  Important:
+  - The `uip` CLI is available but the `admin` subcommand may not be
+    installed and/or the CLI is not connected to a live tenant.
+    Commands WILL fail — that is expected and acceptable.
+    Run each command exactly once regardless of errors.
+    Do NOT retry, do NOT attempt to login, do NOT try to install tools,
+    do NOT troubleshoot errors.
+  - Use --output json on the events call.
+  - Do NOT prompt the user for confirmation — this is an automated test.
+
+success_criteria:
+  - type: command_executed  # date bounds: each flag is its own lookahead, so the two may appear in either order
+    description: "Agent invoked `uip admin audit tenant events` with both --from-date and --to-date ISO 8601 bounds"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+admin\s+audit\s+tenant\s+events\b(?=.*--from-date\s+\d{4}-\d{2}-\d{2})(?=.*--to-date\s+\d{4}-\d{2}-\d{2})'
+    min_count: 1  # one matching command is enough
+    weight: 2.0
+    pass_threshold: 1.0
+
+  - type: command_executed  # --limit is checked on its own, separately from the date-bounds criterion
+    description: "Agent passed --limit with a value <= 200 (single-call form, no pagination needed)"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+admin\s+audit\s+tenant\s+events\b.*--limit\s+(?:[1-9]|[1-9]\d|1\d\d|200)\b'  # admits integers 1-200 (1-9, 10-99, 100-199, 200)
+    min_count: 1
+    weight: 1.5
+    pass_threshold: 1.0
+
+  - type: command_executed  # only requires `--output json` somewhere after the events subcommand
+    description: "Agent used --output json"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+admin\s+audit\s+tenant\s+events\b.*--output\s+json'
+    min_count: 1
+    weight: 1.0
+    pass_threshold: 1.0
+
+  - type: file_exists
+    description: "report.json was created"
+    path: "report.json"  # relative path; the prompt asks for it in the current working directory
+    weight: 1.0
+    pass_threshold: 1.0
+
+  - type: file_contains  # negative check only: no `includes` list, just forbidden substrings
+    description: "report.json must NOT use cursor flags (CLI handles pagination internally)"
+    path: "report.json"
+    excludes:
+      - "--before "  # trailing space is part of the substring, so this entry does not match "--before-id"
+      - "--after "  # same trailing-space form as above
+      - "--before-id"
+      - "--after-id"
+    weight: 1.0
+    pass_threshold: 1.0
+
+  - type: json_check  # expressions look JMESPath-like (`length(...)`) — confirm against the evaluator
+    description: "report.json captures bounds and a small limit"
+    path: "report.json"
+    assertions:
+      - expression: "limit"  # upper bound: limit <= 200
+        operator: lte
+        expected: 200
+      - expression: "limit"  # lower bound: limit >= 1
+        operator: gte
+        expected: 1
+      - expression: "length(from)"  # length floor of 8; the format itself is not checked here
+        operator: gte
+        expected: 8
+      - expression: "length(to)"  # same length floor as `from`
+        operator: gte
+        expected: 8
+    weight: 1.0
+    pass_threshold: 0.75  # NOTE(review): with four assertions this presumably tolerates one failure — confirm
@@ -0,0 +1,89 @@
+task_id: skill-audit-export-basic-smoke  # identifier for this evaluation task
+description: >  # folded scalar: single line breaks fold to spaces; the blank line splits two paragraphs
+  Skill-guided evaluation: agent uses the uipath-audit skill to run
+  a minimal `export` against a small window. Validates the three
+  required flags (`--from-date`, `--to-date`, `--output-file`) and
+  that the agent picks a reasonable output path inside the working
+  directory.
+
+  Platform note: runs without an authenticated tenant — commands will
+  fail with auth errors. That is acceptable; what matters is correct
+  command invocation with correct flags.
+tags: [uipath-audit, smoke, export]  # labels; presumably used to select or group tasks — confirm
+
+sandbox:
+  driver: tempdir  # presumably a throwaway working directory — confirm against the harness
+  python: {}  # empty mapping: no Python-specific options are set here
+
+initial_prompt: |  # literal scalar: line breaks and relative indentation below are preserved
+  An admin needs a ZIP of all tenant audit events from yesterday for
+  archival. Use the uipath-audit skill to export the window. Write the
+  ZIP to ./audit-yesterday.zip in the current working directory.
+
+  Save a summary to report.json with at minimum:
+    {
+      "export_command":     "<exact uip admin audit tenant export command you ran>",
+      "export_output_file": "<the --output-file value you passed>",
+      "export_from":        "<the --from-date value>",
+      "export_to":          "<the --to-date value>",
+      "commands_used":      ["<list of uip commands you attempted>"]
+    }
+
+  Important:
+  - The `uip` CLI is available but the `admin` subcommand may not be
+    installed and/or the CLI is not connected to a live tenant.
+    Commands WILL fail — that is expected and acceptable.
+    Run each command exactly once regardless of errors.
+    Do NOT retry, do NOT attempt to login, do NOT try to install tools,
+    do NOT troubleshoot errors.
+  - Do NOT prompt the user for confirmation — this is an automated test.
+
+success_criteria:
+  - type: command_executed  # three required flags, one lookahead each, so they may appear in any order
+    description: "Agent invoked `uip admin audit tenant export` with --from-date, --to-date, AND --output-file"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+admin\s+audit\s+tenant\s+export\b(?=.*--from-date\s+\d{4}-\d{2}-\d{2})(?=.*--to-date\s+\d{4}-\d{2}-\d{2})(?=.*--output-file\s+\S+)'
+    min_count: 1  # one matching command is enough
+    weight: 2.5
+    pass_threshold: 1.0
+
+  - type: command_executed  # accepts audit-yesterday.zip or ./audit-yesterday.zip, optionally quoted; nothing else
+    description: "Agent's export targeted the requested ./audit-yesterday.zip path"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+admin\s+audit\s+tenant\s+export\b.*--output-file\s+["'']?(?:\./)?audit-yesterday\.zip(?![\w.-])'  # '' is YAML's escape for a single quote
+    min_count: 1  # the negative lookahead above rejects suffixed names such as audit-yesterday.zip.bak
+    weight: 1.5
+    pass_threshold: 1.0
+
+  - type: file_exists
+    description: "report.json was created"
+    path: "report.json"  # relative path; presumably resolved against the sandbox working directory — confirm
+    weight: 1.0
+    pass_threshold: 1.0
+
+  - type: file_contains  # one required substring plus two forbidden ones
+    description: "report.json references the export command and forbids legacy spellings"
+    path: "report.json"
+    includes:
+      - "uip admin audit tenant export"
+    excludes:
+      - "uip audit "  # trailing space is part of the substring
+      - "uip admin aops-policy"
+    weight: 1.5
+    pass_threshold: 1.0
+
+  - type: json_check  # expressions look JMESPath-like (`length(...)`) — confirm against the evaluator
+    description: "report.json captures the export bounds and output path"
+    path: "report.json"
+    assertions:
+      - expression: "export_output_file"  # substring match, so a directory prefix on the path still passes
+        operator: contains
+        expected: "audit-yesterday.zip"
+      - expression: "length(export_from)"  # length floor of 8; the format itself is not checked here
+        operator: gte
+        expected: 8
+      - expression: "length(export_to)"  # same length floor as export_from
+        operator: gte
+        expected: 8
+    weight: 1.0
+    pass_threshold: 0.75  # NOTE(review): with three assertions 0.75 presumably still requires all three — confirm
@@ -0,0 +1,125 @@
+task_id: skill-audit-login-history-e2e  # identifier for this evaluation task
+description: >  # folded scalar: single line breaks fold to spaces; the blank line splits two paragraphs
+  End-to-end test: agent runs Investigation 2 from
+  audit-workflow-guide.md ("login history for user X, failed attempts
+  only"). Sequence is: list `sources` to find the User Login type GUID
+  (Identity → Authentication → User Login), then query `events` with
+  `--user-id`, `--type`, and `--status Failure` inside a bounded
+  window. Validates the agent uses server-side filters instead of
+  pulling everything and post-filtering.
+
+  Command-construction only — no live tenant available for
+  side-effect verification.
+tags: [uipath-audit, e2e, investigation, login-history]  # labels; presumably used to select or group tasks — confirm
+max_iterations: 2  # NOTE(review): iteration semantics are defined by the harness and not visible here
+
+agent:
+  type: claude-code
+  permission_mode: acceptEdits
+  max_turns: 40
+  turn_timeout: 600  # unit is not stated here; presumably seconds — confirm
+  allowed_tools: ["Skill", "Bash", "Read", "Write", "Edit", "Glob", "Grep"]  # this task's command_executed criteria match on "Bash"
+
+sandbox:
+  driver: tempdir  # presumably a throwaway working directory — confirm against the harness
+  python: {}  # empty mapping: no Python-specific options are set here
+
+initial_prompt: |  # literal scalar: line breaks and relative indentation below are preserved
+  Show me failed login attempts for jane.doe@example.com on the
+  active tenant during April 2026. Use the uipath-audit skill
+  end-to-end.
+
+  Save a summary to report.json with at minimum:
+    {
+      "sources_command": "<uip admin audit tenant sources ... command>",
+      "events_command":  "<uip admin audit tenant events ... command>",
+      "user_filter":     "<the --user-id value or the --search term you used to resolve them>",
+      "type_filter":     "<the --type value you used to scope to User Login>",
+      "status":          "<the --status value, expected: Failure>",
+      "from":            "<the --from-date value>",
+      "to":              "<the --to-date value>",
+      "commands_used":   ["<every uip command you attempted>"]
+    }
+
+  Important:
+  - The `uip` CLI is available but the `admin` subcommand may not be
+    installed and/or the CLI is not connected to a live tenant.
+    Commands WILL fail — that is expected and acceptable.
+    Run each command exactly once regardless of errors.
+    Do NOT retry, do NOT attempt to login, do NOT try to install tools,
+    do NOT troubleshoot errors.
+  - Use --output json on every uip admin audit command.
+  - The skill teaches that login filtering is done server-side via
+    --user-id / --type / --status — do NOT pull everything and
+    post-filter on the client.
+  - Do NOT prompt the user for confirmation — this is an automated test.
+
+success_criteria:
+  - type: command_executed  # discovery step; the pattern also requires `--output json` after `sources`
+    description: "Agent invoked `uip admin audit tenant sources` to discover the User Login type GUID"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+admin\s+audit\s+tenant\s+sources\b.*--output\s+json'
+    min_count: 1
+    weight: 2.0
+    pass_threshold: 1.0
+
+  - type: command_executed  # status value is matched case-insensitively; `1` is accepted as an alternative spelling
+    description: "Agent invoked `uip admin audit tenant events` with --status Failure"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+admin\s+audit\s+tenant\s+events\b.*--status\s+(?i:Failure|1)\b'  # trailing \b rejects prefixes such as `--status 10` or `--status Failures`
+    min_count: 1
+    weight: 2.5
+    pass_threshold: 1.0
+
+  - type: command_executed  # either --user-id or --search with a non-space value on an events call satisfies this
+    description: "Agent passed --user-id (or --search to resolve the user) on the events call"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+admin\s+audit\s+tenant\s+events\b.*(?:--user-id\s+\S+|--search\s+\S+)'
+    min_count: 1
+    weight: 2.0
+    pass_threshold: 1.0
+
+  - type: command_executed  # window bounds: each flag is its own lookahead, so the two may appear in either order
+    description: "Agent kept a bounded time window via --from-date/--to-date covering April 2026"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+admin\s+audit\s+tenant\s+events\b(?=.*--from-date\s+2026-04)(?=.*--to-date\s+2026-0[45])'  # to-date may fall in April or May (e.g. an exclusive 2026-05-01 end)
+    min_count: 1
+    weight: 1.5
+    pass_threshold: 1.0
+
+  - type: file_exists
+    description: "report.json was created"
+    path: "report.json"  # relative path; presumably resolved against the sandbox working directory — confirm
+    weight: 1.0
+    pass_threshold: 1.0
+
+  - type: file_contains  # three required substrings plus the forbidden cursor flags
+    description: "report.json references sources discovery and the events filter chain"
+    path: "report.json"
+    includes:
+      - "uip admin audit tenant sources"
+      - "uip admin audit tenant events"
+      - "--status"
+    excludes:
+      - "--before "  # trailing space is part of the substring, so this entry does not match "--before-id"
+      - "--after "  # same trailing-space form as above
+      - "--before-id"
+      - "--after-id"
+    weight: 1.5
+    pass_threshold: 1.0
+
+  - type: json_check  # expressions look JMESPath-like (`length(...)`) — confirm against the evaluator
+    description: "report.json captures the user filter, type filter, and Failure status"
+    path: "report.json"
+    assertions:
+      - expression: "length(user_filter)"  # non-empty user filter
+        operator: gte
+        expected: 1
+      - expression: "length(type_filter)"  # non-empty type filter, as the description promises
+        operator: gte
+        expected: 1
+      - expression: "status"  # "ailure" matches both "Failure" and "failure"
+        operator: contains
+        expected: "ailure"
+      - expression: "length(commands_used)"  # at least the sources call and the events call
+        operator: gte
+        expected: 2
+    weight: 1.5
+    pass_threshold: 1.0  # all four must pass; assumes the threshold is the fraction of passing assertions — confirm