diff --git a/tests/tasks/uipath-agents/deploy_my_workspace/check_deploy_my_workspace.py b/tests/tasks/uipath-agents/deploy_my_workspace/check_deploy_my_workspace.py
new file mode 100644
index 000000000..ce6db5312
--- /dev/null
+++ b/tests/tasks/uipath-agents/deploy_my_workspace/check_deploy_my_workspace.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+"""Deploy-lifecycle artifact + metadata check.
+
+Asserts the artifacts `uip codedagent pack` / `deploy` produce in
+`.uipath/`, and that `pyproject.toml` carries the four fields the
+deployment guide flags as required (`name`, `version`, `description`,
+`authors`). Without `authors`, packaging fails with `Project authors
+cannot be empty`.
+
+Checks:
+  1. `deploy-smoke/pyproject.toml` has non-empty `name`, `version`,
+     `description`, and `authors`. No `[build-system]`.
+  2. `deploy-smoke/.uipath/` exists and contains a `*.nupkg` file
+     (proof that `pack` ran successfully).
+  3. `deploy-smoke/invoke-output.txt` exists, is non-empty, and
+     contains an `https://` substring (the monitoring URL that
+     `uip codedagent invoke` prints).
+
+Every failure exits via `sys.exit("FAIL: ...")`, i.e. with a non-zero
+status and the reason on stderr.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+from pathlib import Path
+
+try:
+    import tomllib  # stdlib since Python 3.11
+except ModuleNotFoundError:  # older interpreter: use the substring fallback
+    tomllib = None
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from _shared.project_root import find_project_root  # noqa: E402
+
+ROOT = find_project_root("deploy-smoke")
+
+# Metadata keys the deployment guide requires for packaging.
+REQUIRED_FIELDS = ("name", "version", "description", "authors")
+
+
+def _read_text(path: Path) -> str:
+    """Return the UTF-8 text of `path`; exit with FAIL if it cannot be read."""
+    if not path.is_file():
+        sys.exit(f"FAIL: Missing {path}")
+    try:
+        return path.read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError) as e:
+        sys.exit(f"FAIL: could not read {path}: {e}")
+
+
+def _missing_fields(text: str) -> list[str]:
+    """Return the required fields that `text` (a pyproject.toml) lacks.
+
+    With `tomllib` available the file is parsed and each field must be
+    present and non-empty in the `[project]` table, so an empty
+    `authors = []` or a key that only appears inside a comment or another
+    value is reported as missing. Without `tomllib` this falls back to a
+    plain substring test, which can only detect a field name that is
+    absent from the file altogether.
+    """
+    if tomllib is None:
+        if "[build-system]" in text:
+            sys.exit("FAIL: pyproject.toml contains a [build-system] section")
+        return [field for field in REQUIRED_FIELDS if field not in text]
+    try:
+        doc = tomllib.loads(text)
+    except tomllib.TOMLDecodeError as e:
+        sys.exit(f"FAIL: pyproject.toml is not valid TOML: {e}")
+    if "build-system" in doc:
+        sys.exit("FAIL: pyproject.toml contains a [build-system] section")
+    project = doc.get("project")
+    if not isinstance(project, dict):
+        project = {}
+    return [field for field in REQUIRED_FIELDS if not project.get(field)]
+
+
+def check_pyproject() -> None:
+    """Fail unless pyproject.toml has all required fields and no [build-system]."""
+    missing = _missing_fields(_read_text(ROOT / "pyproject.toml"))
+    if missing:
+        names = ", ".join(f"`{field}`" for field in missing)
+        sys.exit(
+            f"FAIL: pyproject.toml is missing (or has empty) {names} — "
+            "deployment guide requires all four fields."
+        )
+    print("OK: pyproject.toml has name, version, description, authors")
+
+
+def check_pack_artifacts() -> None:
+    """Fail unless `.uipath/` exists and holds at least one `*.nupkg` file."""
+    uipath_dir = ROOT / ".uipath"
+    if not uipath_dir.is_dir():
+        sys.exit(
+            f"FAIL: {uipath_dir} does not exist — `uip codedagent pack` "
+            "did not run."
+        )
+    nupkgs = sorted(p for p in uipath_dir.glob("*.nupkg") if p.is_file())
+    if not nupkgs:
+        sys.exit(
+            f"FAIL: no .nupkg file in {uipath_dir} — pack did not produce "
+            "the expected package artifact."
+        )
+    print(f"OK: {uipath_dir.name}/{nupkgs[0].name} exists ({len(nupkgs)} package(s) total)")
+
+
+def check_invoke_output() -> None:
+    """Fail unless invoke-output.txt is non-empty and contains `https://`."""
+    path = ROOT / "invoke-output.txt"
+    text = _read_text(path)
+    if not text.strip():
+        sys.exit(f"FAIL: {path.name} is empty — `uip codedagent invoke` produced no output")
+    if "https://" not in text:
+        sys.exit(f"FAIL: {path.name} does not contain a monitoring URL (no `https://` substring)")
+    # len(text) counts decoded characters, not bytes on disk.
+    print(f"OK: {path.name} captured {len(text)} characters of invoke stdout (with monitoring URL)")
+
+
+def main() -> None:
+    """Run the three checks in order; the first failure exits the process."""
+    if not ROOT.is_dir():
+        sys.exit(f"FAIL: project directory {ROOT} does not exist")
+    check_pyproject()
+    check_pack_artifacts()
+    check_invoke_output()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/tasks/uipath-agents/deploy_my_workspace/deploy_my_workspace.yaml b/tests/tasks/uipath-agents/deploy_my_workspace/deploy_my_workspace.yaml
new file mode 100644
index 000000000..258c94249
--- /dev/null
+++ b/tests/tasks/uipath-agents/deploy_my_workspace/deploy_my_workspace.yaml
@@ -0,0 +1,65 @@
+task_id: skill-agent-coded-deploy-my-workspace
+description: >
+  Coded-agent deploy lifecycle. Verifies the skill guides the agent
+  through `uip codedagent pack` → `uip codedagent publish
+  --my-workspace` (or the combined `uip codedagent deploy
+  --my-workspace`), then `uip codedagent invoke` to start a cloud
+  job and surface the monitoring URL. Exercises the entire production
+  packaging path the existing test suite never reaches.
+tags: [uipath-agents, e2e, coded, lifecycle:deploy, feature:deploy]
+max_iterations: 1
+
+agent:
+  type: claude-code
+  permission_mode: acceptEdits
+  allowed_tools: ["Skill", "Bash", "Read", "Write", "Edit", "Glob", "Grep"]
+  turn_timeout: 1200  # unit is not stated in this file; presumably seconds (TODO confirm)
+
+sandbox:
+  driver: tempdir
+  python: {}  # empty mapping: no options are set for the Python sandbox here
+
+initial_prompt: |  # literal block scalar: the prompt's line breaks are kept as written
+  Build a minimal Simple Function UiPath coded agent named
+  `deploy-smoke` whose `main(input)` returns `{"echoed":
+  input.message}`. No LLM. The point is the deployment lifecycle,
+  not the agent logic.
+
+  Take the agent through scaffold → init → run, then deploy it to
+  the user's personal workspace and invoke the published version
+  with `{"message": "deployed"}`.
+
+  After invoke succeeds, write its full stdout (which includes the
+  monitoring URL) to `deploy-smoke/invoke-output.txt` (inside the
+  scaffolded project directory, alongside `pyproject.toml`) so the
+  test harness can verify the URL was surfaced.
+
+  The test harness has UiPath auth pre-configured.
+
+  Do NOT pause between planning and implementation. Complete
+  end-to-end in a single pass.
+
+success_criteria:
+  - type: command_executed
+    description: "Agent packed and published to my-workspace (deploy or pack+publish)"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+codedagent\s+(deploy|publish)\s+.*--my-workspace'
+    min_count: 1
+    weight: 2.5
+    pass_threshold: 1.0
+
+  - type: command_executed
+    description: "Agent invoked the published agent"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+codedagent\s+invoke\s+main'
+    min_count: 1
+    weight: 2.5
+    pass_threshold: 1.0
+
+  - type: run_command
+    description: "Deploy artifacts present and pyproject.toml is well-formed for packaging"
+    command: "python3 $TASK_DIR/check_deploy_my_workspace.py"
+    timeout: 30
+    expected_exit_code: 0
+    weight: 4.0
+    pass_threshold: 1.0
diff --git a/tests/tasks/uipath-agents/eval_exact_match/check_eval_exact_match.py b/tests/tasks/uipath-agents/eval_exact_match/check_eval_exact_match.py
new file mode 100644
index 000000000..200bd859d
--- /dev/null
+++ b/tests/tasks/uipath-agents/eval_exact_match/check_eval_exact_match.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+"""Eval-lifecycle check for the deterministic ExactMatch path.
+
+Validates that the agent authored both halves of the evaluation
+harness — the evaluator config under `evaluations/evaluators/` AND
+the evaluation set under `evaluations/eval-sets/` whose `evaluatorRefs`
+match the evaluator `id` — and that `uip codedagent eval --no-report`
+produced an output file in which every test case was scored exactly
+1.0 by that evaluator (deterministic agent + deterministic evaluator
+means anything else is a bug).
+
+Checks:
+  1. The single `*.json` file in `adder/evaluations/evaluators/` has
+     `evaluatorTypeId` == "uipath-exact-match" and a non-empty string
+     `id`.
+  2. The single `*.json` file in `adder/evaluations/eval-sets/` has
+     version "1.0", `evaluatorRefs` referencing the evaluator id, at
+     least 2 test cases, and each test case's `evaluationCriterias`
+     keys the evaluator id.
+  3. `eval-results.json` exists with a non-empty
+     `evaluationSetResults` list holding one entry per eval-set test
+     case; every entry has at least one `evaluationRunResults[]`
+     item for the configured evaluator, and every such item scored
+     exactly 1.0.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+from pathlib import Path
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from _shared.project_root import find_project_root  # noqa: E402
+
+ROOT = find_project_root("adder")
+
+
+def _load_json(path: Path) -> dict:
+    """Return the JSON object stored at `path`.
+
+    Exits with FAIL when the file is missing, cannot be decoded as
+    UTF-8 JSON, or holds a top-level value that is not an object;
+    callers rely on `.get()` being available on the result.
+    """
+    if not path.is_file():
+        sys.exit(f"FAIL: Missing {path}")
+    try:
+        doc = json.loads(path.read_text(encoding="utf-8"))
+    except (json.JSONDecodeError, UnicodeDecodeError) as e:
+        sys.exit(f"FAIL: {path} is not valid JSON: {e}")
+    if not isinstance(doc, dict):
+        sys.exit(f"FAIL: {path.name} top-level should be an object, got {type(doc).__name__}")
+    return doc
+
+
+def find_single_json(directory: Path) -> Path:
+    """Return the only `*.json` file in `directory`; exit with FAIL otherwise."""
+    if not directory.is_dir():
+        sys.exit(f"FAIL: {directory} does not exist")
+    files = sorted(p for p in directory.glob("*.json") if p.is_file())
+    if not files:
+        sys.exit(f"FAIL: {directory} contains no .json files")
+    if len(files) > 1:
+        sys.exit(f"FAIL: {directory} should contain exactly one .json file, got {len(files)}")
+    return files[0]
+
+
+def check_evaluator() -> str:
+    """Validate the evaluator config and return its `id`."""
+    path = find_single_json(ROOT / "evaluations" / "evaluators")
+    doc = _load_json(path)
+    type_id = doc.get("evaluatorTypeId")
+    if type_id != "uipath-exact-match":
+        sys.exit(
+            f'FAIL: {path.name} evaluatorTypeId should be "uipath-exact-match", '
+            f'got {type_id!r}'
+        )
+    eval_id = doc.get("id")
+    # The id is later used as a list member and a dict key, so it must be a string.
+    if not isinstance(eval_id, str) or not eval_id:
+        sys.exit(f"FAIL: {path.name} needs a non-empty string `id` field, got {eval_id!r}")
+    print(f'OK: evaluator config {path.name} has evaluatorTypeId={type_id!r} id={eval_id!r}')
+    return eval_id
+
+
+def check_eval_set(evaluator_id: str) -> int:
+    """Validate the eval set against `evaluator_id` and return its test-case count."""
+    path = find_single_json(ROOT / "evaluations" / "eval-sets")
+    doc = _load_json(path)
+    if doc.get("version") != "1.0":
+        sys.exit(f'FAIL: eval set version should be "1.0", got {doc.get("version")!r}')
+    refs = doc.get("evaluatorRefs") or []
+    # On a bare string `in` would be a substring test, so require a real list.
+    if not isinstance(refs, list) or evaluator_id not in refs:
+        sys.exit(
+            f'FAIL: eval set `evaluatorRefs` does not include the evaluator '
+            f'id {evaluator_id!r}. Got: {refs}'
+        )
+    cases = doc.get("evaluations") or []
+    if not isinstance(cases, list):
+        sys.exit(f"FAIL: eval set `evaluations` should be a list, got {type(cases).__name__}")
+    if len(cases) < 2:
+        sys.exit(f"FAIL: eval set must have at least 2 test cases, got {len(cases)}")
+    for i, case in enumerate(cases):
+        if not isinstance(case, dict):
+            sys.exit(f"FAIL: eval set test case {i} should be an object, got {type(case).__name__}")
+        crit = case.get("evaluationCriterias")
+        keys = list(crit) if isinstance(crit, dict) else []
+        if evaluator_id not in keys:
+            sys.exit(
+                f'FAIL: eval set test case {i} (`{case.get("id", "?")}`) does '
+                f'not key its evaluationCriterias on the evaluator id '
+                f'{evaluator_id!r}. Got keys: {keys}'
+            )
+    print(f'OK: eval set {path.name} references {evaluator_id!r} across {len(cases)} test cases')
+    return len(cases)
+
+
+def check_results(evaluator_id: str, expected_case_count: int) -> None:
+    """Fail unless every result entry has runs of `evaluator_id`, all scoring 1.0.
+
+    Problems are collected across all entries and reported together so a
+    single run of the checker shows every failing test case.
+    """
+    path = ROOT / "eval-results.json"
+    doc = _load_json(path)
+    cases = doc.get("evaluationSetResults")
+    if not isinstance(cases, list) or not cases:
+        sys.exit(
+            f"FAIL: {path.name} is missing a non-empty `evaluationSetResults` "
+            f"list. Top-level keys: {list(doc.keys())}"
+        )
+    if len(cases) != expected_case_count:
+        sys.exit(
+            f"FAIL: expected {expected_case_count} entries in "
+            f"`evaluationSetResults` (one per eval-set test case), got {len(cases)}"
+        )
+    print(f'OK: eval-results.json carries {len(cases)} entries in evaluationSetResults')
+    bad_cases = []
+    matching_run_count = 0
+    for i, case in enumerate(cases):
+        if not isinstance(case, dict):
+            # A malformed entry must fail the check rather than be skipped.
+            bad_cases.append(f'entry {i}: expected an object, got {type(case).__name__}')
+            continue
+        case_name = case.get("evaluationName") or "?"
+        runs = case.get("evaluationRunResults")
+        if not isinstance(runs, list):
+            runs = []
+        matching = [
+            r for r in runs
+            if isinstance(r, dict) and r.get("evaluatorId") == evaluator_id
+        ]
+        if not matching:
+            bad_cases.append(
+                f'{case_name!r}: no evaluationRunResults entry references '
+                f'evaluatorId={evaluator_id!r}'
+            )
+            continue
+        for r in matching:
+            result = r.get("result")
+            score = result.get("score") if isinstance(result, dict) else None
+            # JSON `true` loads as True and True == 1.0 in Python, so
+            # booleans are rejected explicitly.
+            if isinstance(score, bool) or score != 1.0:
+                bad_cases.append(
+                    f'{case_name!r}: evaluator {evaluator_id!r} scored '
+                    f'{score!r}, expected 1.0 (deterministic agent + '
+                    f'deterministic evaluator)'
+                )
+        matching_run_count += len(matching)
+    if bad_cases:
+        sys.exit("FAIL: " + " | ".join(bad_cases))
+    print(
+        f'OK: every test case has an ExactMatchEvaluator run scoring 1.0 '
+        f'({matching_run_count} run(s) across {len(cases)} case(s))'
+    )
+
+
+def main() -> None:
+    """Run the evaluator, eval-set and results checks in that order."""
+    if not ROOT.is_dir():
+        sys.exit(f"FAIL: project directory {ROOT} does not exist")
+    evaluator_id = check_evaluator()
+    case_count = check_eval_set(evaluator_id)
+    check_results(evaluator_id, case_count)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/tasks/uipath-agents/eval_exact_match/eval_exact_match.yaml b/tests/tasks/uipath-agents/eval_exact_match/eval_exact_match.yaml
new file mode 100644
index 000000000..e63a7d144
--- /dev/null
+++ b/tests/tasks/uipath-agents/eval_exact_match/eval_exact_match.yaml
@@ -0,0 +1,66 @@
+task_id: skill-agent-coded-eval-exact-match
+description: >
+  Coded-agent eval lifecycle, deterministic path. Verifies the skill
+  guides the agent to author an evaluator config under
+  `evaluations/evaluators/`, an evaluation set under
+  `evaluations/eval-sets/` whose `evaluatorRefs` match the evaluator
+  `id`, and run `uip codedagent eval --no-report` against a
+  deterministic Simple Function agent. All test cases must score 1.0
+  in the output file.
+tags: [uipath-agents, e2e, coded, lifecycle:validate, feature:eval]
+max_iterations: 1
+
+agent:
+  type: claude-code
+  permission_mode: acceptEdits
+  allowed_tools: ["Skill", "Bash", "Read", "Write", "Edit", "Glob", "Grep"]
+  turn_timeout: 1200  # unit is not stated in this file; presumably seconds (TODO confirm)
+
+sandbox:
+  driver: tempdir
+  python: {}  # empty mapping: no options are set for the Python sandbox here
+
+initial_prompt: |  # literal block scalar: the prompt's line breaks are kept as written
+  Build a Simple Function UiPath coded agent named `adder` whose
+  `main(input)` returns the sum of two integers. No LLM — purely
+  deterministic.
+
+  Input fields: `a` (int), `b` (int). Output field: `result` (int).
+
+  Take the agent through scaffold → init → run, then add an
+  evaluation harness using the **ExactMatch evaluator** with three
+  test cases (e.g. 2+3=5, 0+0=0, 7+1=8). Run the eval set locally
+  (no Studio Web reporting) and save the output to
+  `eval-results.json` in the project root.
+
+  Every test case must score 1.0 — deterministic agent +
+  deterministic evaluator means anything below 1.0 is a regression.
+
+  Do NOT publish, upload, or deploy. Do NOT call `uip login`. Do NOT
+  pause between planning and implementation. Complete end-to-end in
+  a single pass.
+
+success_criteria:
+  - type: command_executed
+    description: "Agent scaffolded the project with uip codedagent new"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+codedagent\s+new'
+    min_count: 1
+    weight: 1.0
+    pass_threshold: 1.0
+
+  - type: command_executed
+    description: "Agent ran the eval suite with --no-report"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+codedagent\s+eval\s+.*--no-report'
+    min_count: 1
+    weight: 2.0
+    pass_threshold: 1.0
+
+  - type: run_command
+    description: "Evaluator + eval-set + results shape, all test cases scored 1.0"
+    command: "python3 $TASK_DIR/check_eval_exact_match.py"
+    timeout: 30
+    expected_exit_code: 0
+    weight: 5.0
+    pass_threshold: 1.0
diff --git a/tests/tasks/uipath-agents/eval_llm_judges/check_eval_llm_judges.py b/tests/tasks/uipath-agents/eval_llm_judges/check_eval_llm_judges.py
new file mode 100644
index 000000000..5c90e6206
--- /dev/null
+++ b/tests/tasks/uipath-agents/eval_llm_judges/check_eval_llm_judges.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python3
+"""Eval-lifecycle check for the LLM-judge path (two evaluators).
+
+Validates the dual-evaluator harness:
+  - `LLMJudgeOutputEvaluator` config with the
+    `uipath-llm-judge-output-semantic-similarity` typeId.
+  - `LLMJudgeTrajectoryEvaluator` config with the
+    `uipath-llm-judge-trajectory-similarity` typeId.
+  - One eval set whose `evaluatorRefs` lists BOTH ids and whose test
+    cases key `evaluationCriterias` on BOTH ids — the output judge
+    gets an `expectedOutput` block, the trajectory judge gets an
+    `expectedAgentBehavior` string.
+  - `eval-results.json` exists and is an object with a non-empty
+    `evaluationSetResults` list, one entry per eval-set test case,
+    each carrying `evaluationRunResults[]` items for BOTH ids.
+    LLM-judge scores are continuous (0.0-1.0) so we don't assert an
+    exact score — only that the results file is well-formed and
+    references the expected evaluator ids.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+from pathlib import Path
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from _shared.project_root import find_project_root  # noqa: E402
+
+ROOT = find_project_root("intent-classifier")
+
+OUTPUT_JUDGE_ID = "LLMJudgeOutputEvaluator"
+TRAJECTORY_JUDGE_ID = "LLMJudgeTrajectoryEvaluator"
+
+# Evaluator id -> the `evaluatorTypeId` its config file must declare.
+EXPECTED_EVALUATORS = {
+    OUTPUT_JUDGE_ID: "uipath-llm-judge-output-semantic-similarity",
+    TRAJECTORY_JUDGE_ID: "uipath-llm-judge-trajectory-similarity",
+}
+
+
+def _load_json(path: Path) -> dict:
+    """Return the JSON object stored at `path`.
+
+    Exits with FAIL when the file is missing, cannot be decoded as
+    UTF-8 JSON, or holds a top-level value that is not an object;
+    callers rely on `.get()` being available on the result.
+    """
+    if not path.is_file():
+        sys.exit(f"FAIL: Missing {path}")
+    try:
+        doc = json.loads(path.read_text(encoding="utf-8"))
+    except (json.JSONDecodeError, UnicodeDecodeError) as e:
+        sys.exit(f"FAIL: {path} is not valid JSON: {e}")
+    if not isinstance(doc, dict):
+        sys.exit(f"FAIL: {path.name} top-level should be an object, got {type(doc).__name__}")
+    return doc
+
+
+def check_evaluator_configs() -> None:
+    """Fail unless both expected evaluator configs exist with the right typeId."""
+    evaluators_dir = ROOT / "evaluations" / "evaluators"
+    if not evaluators_dir.is_dir():
+        sys.exit(f"FAIL: {evaluators_dir} does not exist")
+    found_by_id: dict[str, Path] = {}
+    for json_file in sorted(evaluators_dir.glob("*.json")):
+        doc = _load_json(json_file)
+        eval_id = doc.get("id")
+        type_id = doc.get("evaluatorTypeId")
+        # Non-string ids (possibly unhashable) can never be an expected id.
+        if not isinstance(eval_id, str) or eval_id not in EXPECTED_EVALUATORS:
+            continue
+        expected_type = EXPECTED_EVALUATORS[eval_id]
+        if type_id != expected_type:
+            sys.exit(
+                f'FAIL: evaluator {eval_id!r} should have evaluatorTypeId='
+                f'{expected_type!r}, got {type_id!r}'
+            )
+        found_by_id[eval_id] = json_file
+        print(f'OK: evaluator config {json_file.name} has id={eval_id!r} typeId={type_id!r}')
+    missing = set(EXPECTED_EVALUATORS) - set(found_by_id)
+    if missing:
+        sys.exit(
+            f'FAIL: missing evaluator configs for ids {sorted(missing)}. '
+            f'Found ids: {sorted(found_by_id)}'
+        )
+
+
+def check_eval_set() -> int:
+    """Validate the single eval set file and return its test-case count."""
+    eval_sets_dir = ROOT / "evaluations" / "eval-sets"
+    if not eval_sets_dir.is_dir():
+        sys.exit(f"FAIL: {eval_sets_dir} does not exist")
+    files = sorted(eval_sets_dir.glob("*.json"))
+    if not files:
+        sys.exit(f"FAIL: no eval set files in {eval_sets_dir}")
+    if len(files) > 1:
+        sys.exit(f"FAIL: expected exactly one eval set file, got {len(files)}")
+    path = files[0]
+    doc = _load_json(path)
+    if doc.get("version") != "1.0":
+        sys.exit(f'FAIL: eval set version should be "1.0", got {doc.get("version")!r}')
+    refs = doc.get("evaluatorRefs") or []
+    # List membership avoids the substring match a bare string would give
+    # and the TypeError `set(refs)` raises on unhashable members.
+    ref_list = refs if isinstance(refs, list) else []
+    missing_refs = sorted(e for e in EXPECTED_EVALUATORS if e not in ref_list)
+    if missing_refs:
+        sys.exit(
+            f'FAIL: eval set `evaluatorRefs` is missing {missing_refs}. '
+            f'Got: {refs}'
+        )
+    cases = doc.get("evaluations") or []
+    if not isinstance(cases, list):
+        sys.exit(f"FAIL: eval set `evaluations` should be a list, got {type(cases).__name__}")
+    if len(cases) < 2:
+        sys.exit(f"FAIL: eval set must have at least 2 test cases, got {len(cases)}")
+    for i, case in enumerate(cases):
+        if not isinstance(case, dict):
+            sys.exit(f"FAIL: test case {i} should be an object, got {type(case).__name__}")
+        crit = case.get("evaluationCriterias")
+        if not isinstance(crit, dict):
+            crit = {}
+        for evaluator_id in EXPECTED_EVALUATORS:
+            if evaluator_id not in crit:
+                sys.exit(
+                    f'FAIL: test case {i} (`{case.get("id", "?")}`) does not '
+                    f'key evaluationCriterias on {evaluator_id!r}. Got keys: '
+                    f'{list(crit.keys())}'
+                )
+        # Trajectory judge requires `expectedAgentBehavior`.
+        traj = crit[TRAJECTORY_JUDGE_ID]
+        if not isinstance(traj, dict) or not traj.get("expectedAgentBehavior"):
+            sys.exit(
+                f'FAIL: test case {i} LLMJudgeTrajectoryEvaluator entry is '
+                f'missing the required `expectedAgentBehavior` field. Got: {traj}'
+            )
+        # Output judge requires `expectedOutput`.
+        out = crit[OUTPUT_JUDGE_ID]
+        if not isinstance(out, dict) or "expectedOutput" not in out:
+            sys.exit(
+                f'FAIL: test case {i} LLMJudgeOutputEvaluator entry is '
+                f'missing the required `expectedOutput` field. Got: {out}'
+            )
+    print(
+        f"OK: eval set {path.name} references both judges across {len(cases)} "
+        "test cases with the right per-judge criteria"
+    )
+    return len(cases)
+
+
+def check_results(expected_case_count: int | None = None) -> None:
+    """Fail unless every result entry carries runs for both judges.
+
+    `expected_case_count`, when given, is the number of test cases in the
+    eval set; the results list must have exactly that many entries.
+    Scores are not inspected because LLM-judge scores are continuous.
+    """
+    path = ROOT / "eval-results.json"
+    doc = _load_json(path)
+    cases = doc.get("evaluationSetResults")
+    if not isinstance(cases, list) or not cases:
+        sys.exit(
+            f"FAIL: {path.name} is missing a non-empty `evaluationSetResults` "
+            f"list. Top-level keys: {list(doc.keys())}"
+        )
+    if expected_case_count is not None and len(cases) != expected_case_count:
+        sys.exit(
+            f"FAIL: expected {expected_case_count} entries in "
+            f"`evaluationSetResults` (one per eval-set test case), got {len(cases)}"
+        )
+    seen_ids: set[str] = set()
+    bad_cases = []
+    for i, c in enumerate(cases):
+        if not isinstance(c, dict):
+            bad_cases.append(f'entry {i}: expected an object, got {type(c).__name__}')
+            continue
+        runs = c.get("evaluationRunResults")
+        if not isinstance(runs, list):
+            runs = []
+        case_ids = {
+            r["evaluatorId"] for r in runs
+            if isinstance(r, dict) and isinstance(r.get("evaluatorId"), str)
+        }
+        seen_ids |= case_ids
+        missing = sorted(set(EXPECTED_EVALUATORS) - case_ids)
+        if missing:
+            case_name = c.get("evaluationName") or "?"
+            bad_cases.append(f'{case_name!r}: no evaluationRunResults entry for {missing}')
+    if bad_cases:
+        sys.exit(
+            'FAIL: results file does not surface evaluatorId entries for '
+            'both judges on every test case: ' + ' | '.join(bad_cases)
+        )
+    print(
+        f"OK: results file references both evaluator ids ({sorted(seen_ids)}) "
+        f"across {len(cases)} test case(s)"
+    )
+
+
+def main() -> None:
+    """Run the evaluator-config, eval-set and results checks in that order."""
+    if not ROOT.is_dir():
+        sys.exit(f"FAIL: project directory {ROOT} does not exist")
+    check_evaluator_configs()
+    case_count = check_eval_set()
+    check_results(case_count)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/tasks/uipath-agents/eval_llm_judges/eval_llm_judges.yaml b/tests/tasks/uipath-agents/eval_llm_judges/eval_llm_judges.yaml
new file mode 100644
index 000000000..f94c9803e
--- /dev/null
+++ b/tests/tasks/uipath-agents/eval_llm_judges/eval_llm_judges.yaml
@@ -0,0 +1,67 @@
+task_id: skill-agent-coded-eval-llm-judges
+description: >
+  Coded-agent eval lifecycle, LLM-judge path. Verifies the agent
+  authors two LLM-judge evaluator configs in one eval set —
+  `LLMJudgeOutputEvaluator`
+  (`uipath-llm-judge-output-semantic-similarity`) and
+  `LLMJudgeTrajectoryEvaluator`
+  (`uipath-llm-judge-trajectory-similarity`) — and runs them against
+  a LangGraph classifier with `--no-report` and `--mocker-cache` so
+  the judges are reproducible.
+tags: [uipath-agents, e2e, coded, lifecycle:validate, feature:eval]
+max_iterations: 1
+
+agent:
+  type: claude-code
+  permission_mode: acceptEdits
+  allowed_tools: ["Skill", "Bash", "Read", "Write", "Edit", "Glob", "Grep"]
+  turn_timeout: 1200
+
+sandbox:
+  driver: tempdir
+  python: {}
+
+initial_prompt: |
+  Build a LangGraph UiPath coded agent named `intent-classifier` that
+  classifies a free-text user request into one of: `weather`,
+  `news`, `joke`.
+
+  Input: `text` (str). Output: `category` (str), `text` (str).
+
+  For deterministic test runs, pin the LLM to a low-cost gateway
+  model (e.g. `gpt-4o-mini-2024-07-18`) with `temperature=0`.
+
+  Take the agent through scaffold → init → run, then add an
+  evaluation harness with **two LLM judges in one eval set**:
+
+  - The semantic-output LLM judge (id `LLMJudgeOutputEvaluator`).
+    Each test case's criteria block carries an `expectedOutput`.
+  - The trajectory LLM judge (id `LLMJudgeTrajectoryEvaluator`).
+    Each test case's criteria block carries an
+    `expectedAgentBehavior` string — e.g. "Agent classifies the
+    input into exactly one of weather/news/joke and returns that
+    label."
+
+  Three test cases. Run the eval set locally (no Studio Web
+  reporting) and cache LLM responses for reproducibility. Save the
+  results to `eval-results.json` in the project root.
+
+  Do NOT publish, upload, or deploy. Do NOT pause between planning
+  and implementation. Complete end-to-end in a single pass.
+
+success_criteria:
+  - type: command_executed
+    description: "Agent ran the eval suite with --no-report"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+codedagent\s+eval\s+.*--no-report'
+    min_count: 1
+    weight: 2.0
+    pass_threshold: 1.0
+
+  - type: run_command
+    description: "Both LLM-judge evaluator configs + eval-set + results shape"
+    command: "python3 $TASK_DIR/check_eval_llm_judges.py"
+    timeout: 30
+    expected_exit_code: 0
+    weight: 5.0
+    pass_threshold: 1.0